pytorch_lightning: In Depth, Made Simple

2023-10-07 16:04:38

  • No need to write a custom training loop; you only specify how the loss is computed.
  • Checkpoint saving, early stopping, and similar features are added conveniently through callbacks.
  • Models train with minimal changes on a single CPU, multiple CPUs, a single GPU, multiple GPUs, or even multiple TPUs.
  • Common evaluation metrics such as Accuracy, AUC, and Precision are added easily via the torchmetrics library.
  • Training-speed tricks such as multi-batch gradient accumulation, half-precision (mixed-precision) training, and automatic search for the maximum batch_size are easy to apply.
  • Accuracy-boosting tricks such as SWA (stochastic weight averaging), CyclicLR (a cyclic learning-rate schedule), and auto_lr_find (automatic learning-rate search) are easy to use; see the sketch after this list.
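
As an illustration of the last two bullets, here is a sketch of a Trainer wired up with several of these tricks (flag names follow the pytorch-lightning 1.x API used throughout this article; model stands for any LightningModule). The auto_* searches only take effect when trainer.tune(model) is called, and CyclicLR would instead be returned as an lr_scheduler from configure_optimizers.

trainer = pl.Trainer(
    accumulate_grad_batches=4,          # multi-batch gradient accumulation
    precision=16,                       # half-precision mixed training
    auto_scale_batch_size="binsearch",  # search for the largest workable batch_size
    auto_lr_find=True,                  # search for a good initial learning rate
    callbacks=[pl.callbacks.StochasticWeightAveraging(swa_lrs=1e-2)],  # SWA
)
trainer.tune(model)   # runs the auto_* searches before trainer.fit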

The pytorch-lightning library is typically installed and imported as follows.
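
For example (the PyPI package name is pytorch-lightning, while the import name is pytorch_lightning):

#pip install pytorch-lightning
import pytorch_lightning as pl
print(pl.__version__)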

I. The Design Philosophy of pytorch-lightning

The core design philosophy of pytorch-lightning is to separate a deep learning project's research code (which defines the model) from its engineering code (which trains the model).

The user only needs to implement the research code (pl.LightningModule); the engineering code is handled uniformly by the trainer class (pl.Trainer).

More specifically, deep learning project code can be divided into the following four parts, which the skeleton sketched after this list ties together:

  • Research code: implemented by the user by subclassing LightningModule.
  • Engineering code: handled by the Trainer; the user does not need to worry about it.
  • Non-essential research code (logging, etc.): implemented by the user through Callbacks.
  • Data: provided through torch.utils.data.DataLoader, optionally wrapped in a pl.LightningDataModule.
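
A minimal skeleton of how the four parts fit together (all names below are illustrative placeholders, not a fixed API):

class MyModel(pl.LightningModule):      # 1. research code
    ...                                 # training_step, configure_optimizers, etc.

model = MyModel()
trainer = pl.Trainer(                   # 2. engineering code
    callbacks=[pl.callbacks.EarlyStopping(monitor="val_loss")],  # 3. non-essential code
)
trainer.fit(model, train_loader)        # 4. data: a DataLoader or LightningDataModule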

II. A pytorch-lightning Usage Example

Below we use the MNIST image-classification problem as an example to demonstrate best practices with pytorch-lightning.

1. Prepare the data
import torch 
from torch import nn 
from torchvision import transforms as T
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader,random_split
import pytorch_lightning as pl 
from torchmetrics import Accuracy 


class MNISTDataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str = "./mnist/", 
                 batch_size: int = 32,
                 num_workers: int =4):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage = None):
        transform = T.Compose([T.ToTensor()])
        self.ds_test = MNIST(self.data_dir, train=False, transform=transform, download=True)
        self.ds_predict = MNIST(self.data_dir, train=False, transform=transform, download=True)
        ds_full = MNIST(self.data_dir, train=True, transform=transform, download=True)
        # split the 60k training images into 55k train / 5k validation
        self.ds_train, self.ds_val = random_split(ds_full, [55000, 5000])

    def train_dataloader(self):
        return DataLoader(self.ds_train, batch_size=self.batch_size,
                          shuffle=True, num_workers=self.num_workers,
                          pin_memory=True)

    def val_dataloader(self):
        return DataLoader(self.ds_val, batch_size=self.batch_size,
                          shuffle=False, num_workers=self.num_workers,
                          pin_memory=True)

    def test_dataloader(self):
        return DataLoader(self.ds_test, batch_size=self.batch_size,
                          shuffle=False, num_workers=self.num_workers,
                          pin_memory=True)

    def predict_dataloader(self):
        return DataLoader(self.ds_predict, batch_size=self.batch_size,
                          shuffle=False, num_workers=self.num_workers,
                          pin_memory=True)
    

data_mnist = MNISTDataModule()
data_mnist.setup()

for features,labels in data_mnist.train_dataloader():
    print(features.shape)
    print(labels.shape)
    break 

torch.Size([32, 1, 28, 28])
torch.Size([32])
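
One optional refinement: the download step can be moved out of setup() into a prepare_data() method, which pytorch-lightning guarantees to run on a single process only (setup() runs in every process under distributed training). A sketch of what this would add to MNISTDataModule:

# inside MNISTDataModule (a sketch)
def prepare_data(self):
    # download only; do not assign any state to self here
    MNIST(self.data_dir, train=True, download=True)
    MNIST(self.data_dir, train=False, download=True)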
2. Define the model
net = nn.Sequential(
    nn.Conv2d(in_channels=1,out_channels=32,kernel_size = 3),
    nn.MaxPool2d(kernel_size = 2,stride = 2),
    nn.Conv2d(in_channels=32,out_channels=64,kernel_size = 5),
    nn.MaxPool2d(kernel_size = 2,stride = 2),
    nn.Dropout2d(p = 0.1),
    nn.AdaptiveMaxPool2d((1,1)),
    nn.Flatten(),
    nn.Linear(64,32),
    nn.ReLU(),
    nn.Linear(32,10)
)
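
Before wrapping the network in a LightningModule, a quick sanity check on a dummy batch confirms the output shape (it matches the [32, 10] row in the model summary printed later):

x = torch.randn(2, 1, 28, 28)
print(net(x).shape)   # torch.Size([2, 10])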

class Model(pl.LightningModule):
    
    def __init__(self,net,learning_rate=1e-3):
        super().__init__()
        self.save_hyperparameters()
        self.net = net
        # note: torchmetrics >= 0.11 requires explicit task arguments,
        # e.g. Accuracy(task="multiclass", num_classes=10)
        self.train_acc = Accuracy()
        self.val_acc = Accuracy()
        self.test_acc = Accuracy() 
        
        
    def forward(self,x):
        x = self.net(x)
        return x
    
    
    # define the loss
    def training_step(self, batch, batch_idx):
        x, y = batch
        preds = self(x)
        loss = nn.CrossEntropyLoss()(preds,y)
        return {"loss":loss,"preds":preds.detach(),"y":y.detach()}
    
    # compute and log metrics
    def training_step_end(self,outputs):
        train_acc = self.train_acc(outputs['preds'], outputs['y']).item()    
        self.log("train_acc",train_acc,prog_bar=True)
        return {"loss":outputs["loss"].mean()}
    
    # define the optimizer (and, optionally, an lr_scheduler)
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
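    # An optional variant that also returns a scheduler (a sketch using one of
    # the return formats accepted by pytorch-lightning 1.x):
    # def configure_optimizers(self):
    #     optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
    #     scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
    #     return {"optimizer": optimizer, "lr_scheduler": scheduler}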
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        preds = self(x)
        loss = nn.CrossEntropyLoss()(preds,y)
        return {"loss":loss,"preds":preds.detach(),"y":y.detach()}

    def validation_step_end(self,outputs):
        val_acc = self.val_acc(outputs['preds'], outputs['y']).item()    
        self.log("val_loss",outputs["loss"].mean(),on_epoch=True,on_step=False)
        self.log("val_acc",val_acc,prog_bar=True,on_epoch=True,on_step=False)
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        preds = self(x)
        loss = nn.CrossEntropyLoss()(preds,y)
        return {"loss":loss,"preds":preds.detach(),"y":y.detach()}
    
    def test_step_end(self,outputs):
        test_acc = self.test_acc(outputs['preds'], outputs['y']).item()    
        self.log("test_acc",test_acc,on_epoch=True,on_step=False)
        self.log("test_loss",outputs["loss"].mean(),on_epoch=True,on_step=False)
    
model = Model(net)

# inspect the model size
model_size = pl.utilities.memory.get_model_size_mb(model)
print("model_size = {} M \n".format(model_size))
model.example_input_array = [features]
summary = pl.utilities.model_summary.ModelSummary(model,max_depth=-1)
print(summary) 


model_size = 0.218447 M 

   | Name      | Type              | Params | In sizes         | Out sizes       
---------------------------------------------------------------------------------------
0  | net       | Sequential        | 54.0 K | [32, 1, 28, 28]  | [32, 10]        
1  | net.0     | Conv2d            | 320    | [32, 1, 28, 28]  | [32, 32, 26, 26]
2  | net.1     | MaxPool2d         | 0      | [32, 32, 26, 26] | [32, 32, 13, 13]
3  | net.2     | Conv2d            | 51.3 K | [32, 32, 13, 13] | [32, 64, 9, 9]  
4  | net.3     | MaxPool2d         | 0      | [32, 64, 9, 9]   | [32, 64, 4, 4]  
5  | net.4     | Dropout2d         | 0      | [32, 64, 4, 4]   | [32, 64, 4, 4]  
6  | net.5     | AdaptiveMaxPool2d | 0      | [32, 64, 4, 4]   | [32, 64, 1, 1]  
7  | net.6     | Flatten           | 0      | [32, 64, 1, 1]   | [32, 64]        
8  | net.7     | Linear            | 2.1 K  | [32, 64]         | [32, 32]        
9  | net.8     | ReLU              | 0      | [32, 32]         | [32, 32]        
10 | net.9     | Linear            | 330    | [32, 32]         | [32, 10]        
11 | train_acc | Accuracy          | 0      | ?                | ?               
12 | val_acc   | Accuracy          | 0      | ?                | ?               
13 | test_acc  | Accuracy          | 0      | ?                | ?               
---------------------------------------------------------------------------------------
54.0 K    Trainable params
0         Non-trainable params
54.0 K    Total params
0.216     Total estimated model params size (MB)
3. Train the model
pl.seed_everything(1234)

ckpt_callback = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    save_top_k=1,
    mode='min'
)
early_stopping = pl.callbacks.EarlyStopping(monitor = 'val_loss',
               patience=3,
               mode = 'min')

# gpus=0 trains on the CPU; gpus=1 uses 1 GPU; gpus=2 uses 2 GPUs; gpus=-1 uses all available GPUs;
# gpus=[0,1] uses GPUs 0 and 1; gpus="0,1,2,3" uses GPUs 0, 1, 2 and 3
# tpu_cores=1 trains on 1 TPU core

trainer = pl.Trainer(max_epochs=20,   
     #gpus=0, # CPU mode
     gpus=1, # single-GPU mode
     #num_processes=4,strategy="ddp_find_unused_parameters_false", # multi-CPU (multi-process) mode
     #gpus=[0,1,2,3],strategy="dp", # multi-GPU DataParallel (modest speedup)
     #gpus=[0,1,2,3],strategy="ddp_find_unused_parameters_false", # multi-GPU DistributedDataParallel (better speedup)
     callbacks = [ckpt_callback,early_stopping],
     profiler="simple") 

# resume training from a checkpoint
#trainer = pl.Trainer(resume_from_checkpoint='./lightning_logs/version_31/checkpoints/epoch=02-val_loss=0.05.ckpt')

# train the model
trainer.fit(model,data_mnist)


Epoch 8: 100%
1876/1876 [01:44<00:00, 17.93it/s, loss=0.0603, v_num=0, train_acc=1.000, val_acc=0.985]
4. Evaluate the model
result = trainer.test(model,data_mnist.train_dataloader(),ckpt_path='best')

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9966545701026917, 'test_loss': 0.010617421939969063}
--------------------------------------------------------------------------------

result = trainer.test(model,data_mnist.val_dataloader(),ckpt_path='best')

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9865999817848206, 'test_loss': 0.042671505361795425}
--------------------------------------------------------------------------------

result = trainer.test(model,data_mnist.test_dataloader(),ckpt_path='best')

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.987500011920929, 'test_loss': 0.047178059816360474}
--------------------------------------------------------------------------------
5. Use the model
data,label = next(iter(data_mnist.test_dataloader()))
model.eval()
prediction = model(data)
print(prediction)


tensor([[-13.0112,  -2.8257,  -1.8588,  -3.6137,  -0.3307,  -5.4953, -19.7282,
          15.9651,  -8.0379,  -2.2925],
        [ -6.0261,  -2.5480,  13.4140,  -5.5701, -10.2049,  -6.4469,  -3.7119,
          -6.0732,  -6.0826,  -7.7339],
          ...
        [-16.7028,  -4.9060,   0.4400,  24.4337, -12.8793,   1.5085, -17.9232,
          -3.0839,   0.5491,   1.9846],
        [ -5.0909,  10.1805,  -8.2528,  -9.2240,  -1.8044,  -4.0296,  -8.2297,
          -3.1828,  -5.9361,  -4.8410]], grad_fn=<AddmmBackward0>)
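
The numbers above are raw logits; to turn them into predicted digit labels, take the argmax over the class dimension:

pred_labels = prediction.argmax(dim=-1)
print(pred_labels)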
6. Save the model

By default the best model is saved under the path in trainer.checkpoint_callback.best_model_path, from which it can be loaded directly.

print(trainer.checkpoint_callback.best_model_path)
print(trainer.checkpoint_callback.best_model_score)

lightning_logs/version_10/checkpoints/epoch=8-step=15470.ckpt
tensor(0.0376, device='cuda:0')

model_clone = Model.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
trainer_clone = pl.Trainer(max_epochs=3,gpus=1) 
result = trainer_clone.test(model_clone,data_mnist.test_dataloader())
print(result)


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9887999892234802, 'test_loss': 0.03627564385533333}
--------------------------------------------------------------------------------
[{'test_acc': 0.9887999892234802, 'test_loss': 0.03627564385533333}]
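
The predict_dataloader defined in the DataModule has gone unused so far. To run batch inference through the Trainer, the model also needs a predict_step; a sketch, assuming batches of (x, y) pairs as produced by our dataloaders:

def predict_step(self, batch, batch_idx):
    x, y = batch
    return self(x)

Model.predict_step = predict_step   # patched onto the class just for this sketch
predictions = trainer_clone.predict(model_clone, datamodule=data_mnist)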
