- 可以不必编写自定义循环,只要指定loss计算方法即可。
- 可以通过callbacks非常方便地添加CheckPoint参数保存、early_stopping 等功能。
- 可以非常方便地在单CPU、多CPU、单GPU、多GPU乃至多TPU上训练模型。
- 可以通过调用torchmetrics库,非常方便地添加Accuracy,AUC,Precision等各种常用评估指标。
- 可以非常方便地实施多批次梯度累加、半精度混合精度训练、最大batch_size自动搜索等技巧,加快训练过程。
- 可以非常方便地使用SWA(随机参数平均)、CyclicLR(学习率周期性调度策略)与auto_lr_find(最优学习率发现)等技巧 实现模型涨点。
一般按照如下方式安装和引入 pytorch-lightning 库。
一,pytorch-lightning的设计哲学
pytorch-lightning 的核心设计哲学是将 深度学习项目中的 研究代码(定义模型) 和 工程代码 (训练模型) 相互分离。
用户只需专注于研究代码(pl.LightningModule)的实现,而工程代码借助训练工具类(pl.Trainer)统一实现。
更详细地说,深度学习项目代码可以分成如下4部分:
- 研究代码 (Research code),用户继承LightningModule实现。
- 工程代码 (Engineering code),用户无需关注通过调用Trainer实现。
- 非必要代码 (Non-essential research code,logging, etc...),用户通过调用Callbacks实现。
- 数据 (Data),用户通过torch.utils.data.DataLoader实现,也可以封装成pl.LightningDataModule。
二,pytorch-lightning使用范例
下面我们以MNIST图片分类问题为例,演示pytorch-lightning的最佳实践。
1,准备数据
import torch
from torch import nn
from torchvision import transforms as T
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader,random_split
import pytorch_lightning as pl
from torchmetrics import Accuracy
class MNISTDataModule(pl.LightningDataModule):
    """LightningDataModule wrapping the MNIST dataset.

    Downloads MNIST into ``data_dir`` and exposes train/val/test/predict
    dataloaders. The official 60k training split is further divided into
    55000 training and 5000 validation samples.
    """

    def __init__(self, data_dir: str = "./minist/",
                 batch_size: int = 32,
                 num_workers: int = 4):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage=None):
        # ToTensor converts PIL images to float tensors in [0, 1].
        transform = T.Compose([T.ToTensor()])
        # Test and predict stages both use the official MNIST test split.
        self.ds_test = MNIST(self.data_dir, train=False, transform=transform, download=True)
        self.ds_predict = MNIST(self.data_dir, train=False, transform=transform, download=True)
        ds_full = MNIST(self.data_dir, train=True, transform=transform, download=True)
        self.ds_train, self.ds_val = random_split(ds_full, [55000, 5000])

    def _make_loader(self, dataset, shuffle):
        # Shared DataLoader construction; only shuffling differs per split.
        return DataLoader(dataset, batch_size=self.batch_size,
                          shuffle=shuffle, num_workers=self.num_workers,
                          pin_memory=True)

    def train_dataloader(self):
        return self._make_loader(self.ds_train, True)

    def val_dataloader(self):
        return self._make_loader(self.ds_val, False)

    def test_dataloader(self):
        return self._make_loader(self.ds_test, False)

    def predict_dataloader(self):
        return self._make_loader(self.ds_predict, False)
# Instantiate the DataModule and peek at the shape of one training batch.
data_mnist = MNISTDataModule()
data_mnist.setup()

features, labels = next(iter(data_mnist.train_dataloader()))
print(features.shape)
print(labels.shape)
torch.Size([32, 1, 28, 28])
torch.Size([32])
2,定义模型
# A small CNN for 10-class MNIST classification:
# two conv/max-pool stages, dropout, global max pooling, then an MLP head.
net = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Dropout2d(p=0.1),
    nn.AdaptiveMaxPool2d((1, 1)),   # collapse spatial dims -> (N, 64, 1, 1)
    nn.Flatten(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 10),
)
class Model(pl.LightningModule):
    """LightningModule wrapping an arbitrary classification network.

    Loss is cross-entropy; accuracy is tracked with separate torchmetrics
    Accuracy objects for the train/val/test stages.
    """

    def __init__(self, net, learning_rate=1e-3):
        super().__init__()
        # Records init args (including the net itself) so that
        # load_from_checkpoint() can rebuild the model without arguments.
        self.save_hyperparameters()
        self.net = net
        self.train_acc = Accuracy()
        self.val_acc = Accuracy()
        self.test_acc = Accuracy()

    def forward(self, x):
        return self.net(x)

    def _shared_step(self, batch):
        # Common forward + loss used by the train/val/test steps.
        x, y = batch
        preds = self(x)
        loss = nn.CrossEntropyLoss()(preds, y)
        return {"loss": loss, "preds": preds.detach(), "y": y.detach()}

    # ----- training -----
    def training_step(self, batch, batch_idx):
        return self._shared_step(batch)

    def training_step_end(self, outputs):
        # Metrics are updated in *_step_end so outputs gathered from
        # multiple devices (DP mode) are handled in one place.
        acc = self.train_acc(outputs["preds"], outputs["y"]).item()
        self.log("train_acc", acc, prog_bar=True)
        return {"loss": outputs["loss"].mean()}

    # ----- optimization -----
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)

    # ----- validation -----
    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch)

    def validation_step_end(self, outputs):
        acc = self.val_acc(outputs["preds"], outputs["y"]).item()
        self.log("val_loss", outputs["loss"].mean(), on_epoch=True, on_step=False)
        self.log("val_acc", acc, prog_bar=True, on_epoch=True, on_step=False)

    # ----- test -----
    def test_step(self, batch, batch_idx):
        return self._shared_step(batch)

    def test_step_end(self, outputs):
        acc = self.test_acc(outputs["preds"], outputs["y"]).item()
        self.log("test_acc", acc, on_epoch=True, on_step=False)
        self.log("test_loss", outputs["loss"].mean(), on_epoch=True, on_step=False)
model = Model(net)

# Report the model size in megabytes.
model_size = pl.utilities.memory.get_model_size_mb(model)
# NOTE(fix): the newline escape was garbled to a literal "n" in the
# original format string; restored to "\n".
print("model_size = {} M \n".format(model_size))

# Attaching an example input lets ModelSummary report per-layer
# input/output sizes (otherwise the In/Out columns show "?").
model.example_input_array = [features]
summary = pl.utilities.model_summary.ModelSummary(model, max_depth=-1)
print(summary)
model_size = 0.218447 M
| Name | Type | Params | In sizes | Out sizes
---------------------------------------------------------------------------------------
0 | net | Sequential | 54.0 K | [32, 1, 28, 28] | [32, 10]
1 | net.0 | Conv2d | 320 | [32, 1, 28, 28] | [32, 32, 26, 26]
2 | net.1 | MaxPool2d | 0 | [32, 32, 26, 26] | [32, 32, 13, 13]
3 | net.2 | Conv2d | 51.3 K | [32, 32, 13, 13] | [32, 64, 9, 9]
4 | net.3 | MaxPool2d | 0 | [32, 64, 9, 9] | [32, 64, 4, 4]
5 | net.4 | Dropout2d | 0 | [32, 64, 4, 4] | [32, 64, 4, 4]
6 | net.5 | AdaptiveMaxPool2d | 0 | [32, 64, 4, 4] | [32, 64, 1, 1]
7 | net.6 | Flatten | 0 | [32, 64, 1, 1] | [32, 64]
8 | net.7 | Linear | 2.1 K | [32, 64] | [32, 32]
9 | net.8 | ReLU | 0 | [32, 32] | [32, 32]
10 | net.9 | Linear | 330 | [32, 32] | [32, 10]
11 | train_acc | Accuracy | 0 | ? | ?
12 | val_acc | Accuracy | 0 | ? | ?
13 | test_acc | Accuracy | 0 | ? | ?
---------------------------------------------------------------------------------------
54.0 K Trainable params
0 Non-trainable params
54.0 K Total params
0.216 Total estimated model params size (MB)
3,训练模型
pl.seed_everything(1234)

# Keep only the single best checkpoint, as measured by validation loss.
ckpt_callback = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    save_top_k=1,
    mode='min'
)
# Stop early if val_loss has not improved for 3 consecutive checks.
early_stopping = pl.callbacks.EarlyStopping(monitor='val_loss',
                                            patience=3,
                                            mode='min')

# gpus=0 trains on CPU; gpus=1 uses one GPU; gpus=2 uses two GPUs;
# gpus=-1 uses all GPUs; gpus=[0,1] uses GPUs #0 and #1;
# gpus="0,1,2,3" uses GPUs #0-#3; tpu_cores=1 uses one TPU core.
trainer = pl.Trainer(max_epochs=20,
                     gpus=0,  # CPU mode (NOTE(fix): was mislabeled "single GPU"; use gpus=1 for that)
                     #num_processes=4, strategy="ddp_find_unused_parameters_false",  # multi-process CPU mode
                     #gpus=[0,1,2,3], strategy="dp",  # multi-GPU DataParallel (modest speedup)
                     #gpus=[0,1,2,3], strategy="ddp_find_unused_parameters_false",  # DistributedDataParallel (better speedup) -- fixed curly quote
                     callbacks=[ckpt_callback, early_stopping],
                     profiler="simple")

# Resume training from a checkpoint:
# trainer = pl.Trainer(resume_from_checkpoint='./lightning_logs/version_31/checkpoints/epoch=02-val_loss=0.05.ckpt')

# Train the model.
trainer.fit(model, data_mnist)
Epoch 8: 100%
1876/1876 [01:44<00:00, 17.93it/s, loss=0.0603, v_num=0, train_acc=1.000, val_acc=0.985]
4,评估模型
代码语言:javascript复制result = trainer.test(model,data_mnist.train_dataloader(),ckpt_path='best')
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9966545701026917, 'test_loss': 0.010617421939969063}
--------------------------------------------------------------------------------
result = trainer.test(model,data_mnist.val_dataloader(),ckpt_path='best')
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9865999817848206, 'test_loss': 0.042671505361795425}
--------------------------------------------------------------------------------
result = trainer.test(model,data_mnist.test_dataloader(),ckpt_path='best')
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.987500011920929, 'test_loss': 0.047178059816360474}
--------------------------------------------------------------------------------
5,使用模型
# Grab one batch from the test set and run inference.
# NOTE(fix): the original referenced an undefined name `data_module`;
# the DataModule instance defined earlier is `data_mnist`.
data, label = next(iter(data_mnist.test_dataloader()))
model.eval()
prediction = model(data)  # raw logits; wrap in torch.no_grad() if grads are not needed
print(prediction)
tensor([[-13.0112, -2.8257, -1.8588, -3.6137, -0.3307, -5.4953, -19.7282,
15.9651, -8.0379, -2.2925],
[ -6.0261, -2.5480, 13.4140, -5.5701, -10.2049, -6.4469, -3.7119,
-6.0732, -6.0826, -7.7339],
...
[-16.7028, -4.9060, 0.4400, 24.4337, -12.8793, 1.5085, -17.9232,
-3.0839, 0.5491, 1.9846],
[ -5.0909, 10.1805, -8.2528, -9.2240, -1.8044, -4.0296, -8.2297,
-3.1828, -5.9361, -4.8410]], grad_fn=<AddmmBackward0>)
6,保存模型
最优模型默认保存在 trainer.checkpoint_callback.best_model_path 的目录下,可以直接加载。
# Path and monitored score (val_loss) of the best checkpoint.
# NOTE(fix): stripped article markup that was fused onto the first statement.
print(trainer.checkpoint_callback.best_model_path)
print(trainer.checkpoint_callback.best_model_score)
lightning_logs/version_10/checkpoints/epoch=8-step=15470.ckpt
tensor(0.0376, device='cuda:0')
# Rebuild the model from the best checkpoint (the net was captured by
# save_hyperparameters(), so no constructor args are needed).
model_clone = Model.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
trainer_clone = pl.Trainer(max_epochs=3, gpus=1)
# NOTE(fix): the original referenced an undefined name `data_module`;
# the DataModule instance defined earlier is `data_mnist`.
result = trainer_clone.test(model_clone, data_mnist.test_dataloader())
print(result)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9887999892234802, 'test_loss': 0.03627564385533333}
--------------------------------------------------------------------------------
[{'test_acc': 0.9887999892234802, 'test_loss': 0.03627564385533333}]