使用预训练模型进行句对分类(Paddle、PyTorch)

2021-09-06 11:16:16 浏览数 (1)

文章目录

    • 1. Paddle
    • 2. PyTorch
    • 3. 提交结果

分别使用两种框架,加载预训练模型,对句对进行分类

数据下载:千言数据集:文本相似度

1. Paddle

可以使用 paddlenlp 直接加载预训练模型,比较方便

代码语言:python
# %%
# 比赛地址
# https://aistudio.baidu.com/aistudio/competition/detail/45
import time
import os
import numpy as np
import paddle
import paddlenlp
import paddle.nn.functional as F
import paddle.distributed as dist  # 并行
from functools import partial
from paddlenlp.data import Stack, Pad, Tuple
import paddle.nn as nn
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import LinearDecayWithWarmup


def read(data, datasetname, predict=False):
    """Yield normalized sentence-pair examples from raw dataset records.

    The ``lcqmc`` dataset keeps its pair under ``query``/``title``; every
    other dataset name is read from ``sentence1``/``sentence2``.

    Args:
        data: Iterable of raw example mappings.
        datasetname: Dataset name; only ``"lcqmc"`` selects the alternate keys.
        predict: When true, no ``label`` is read or emitted.

    Yields:
        dict: ``text1`` and ``text2``, preceded by ``label`` unless
        ``predict`` is true.
    """
    if datasetname == "lcqmc":
        first_key, second_key = "query", "title"
    else:
        first_key, second_key = "sentence1", "sentence2"

    for record in data:
        example = {} if predict else {"label": record["label"]}
        example["text1"] = record[first_key]
        example["text2"] = record[second_key]
        yield example


def convert_data(data, tokenizer, datasetname, max_seq_len=512, is_test=False):
    """Encode one sentence-pair example into model inputs.

    Args:
        data: Mapping with ``text1`` and ``text2`` (and ``label`` unless
            ``is_test`` is true).
        tokenizer: Callable invoked as
            ``tokenizer(text=..., text_pair=..., max_seq_len=...)``; its result
            must provide ``input_ids`` and ``token_type_ids``.
        datasetname: Accepted for signature compatibility; not used here.
        max_seq_len: Forwarded to the tokenizer.
        is_test: When true, no label is returned.

    Returns:
        tuple: ``(input_ids, token_type_ids)`` for test data, otherwise
        ``(input_ids, token_type_ids, label)`` where ``label`` is a
        one-element ``int64`` array.
    """
    encoding = tokenizer(text=data["text1"], text_pair=data["text2"], max_seq_len=max_seq_len)
    features = (encoding["input_ids"], encoding["token_type_ids"])
    if is_test:
        return features
    return features + (np.array([data["label"]], dtype="int64"),)


class PretrainedModel(nn.Layer):
    """Pretrained encoder followed by dropout and a 2-way linear classifier."""

    def __init__(self, pretrained_model, dropout=None):
        """Wrap ``pretrained_model`` with a classification head.

        Args:
            pretrained_model: Encoder layer; its ``config["hidden_size"]``
                sets the input width of the classifier.
            dropout: Dropout probability; ``None`` selects 0.1.
        """
        super().__init__()
        self.ptm = pretrained_model
        drop_rate = 0.1 if dropout is None else dropout
        self.dropout = nn.Dropout(drop_rate)
        hidden_size = self.ptm.config["hidden_size"]
        self.clf = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
        """Return softmax class probabilities for a batch of encoded pairs.

        The encoder is expected to return exactly two outputs; the second one
        is used as the sentence-level embedding fed to the classifier.
        """
        _sequence_output, pooled = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
        logits = self.clf(self.dropout(pooled))
        return F.softmax(logits)


class Recongnizer():
    """Fine-tune ERNIE-Gram on one sentence-pair dataset and predict its test set.

    Construction loads the dataset splits, the tokenizer and the pretrained
    encoder. ``fit`` trains the model, keeps the checkpoint with the best dev
    F1 and writes test predictions to ``<datasetname>.tsv``.
    """

    def __init__(self, datasetname, state_dict_path=None):
        """Load data, tokenizer and model.

        Args:
            datasetname: Name passed to ``paddlenlp.datasets.load_dataset``;
                also used as the stem of the prediction file.
            state_dict_path: Optional path to saved model parameters. Loading
                is best effort: a failure is reported and training starts from
                the pretrained weights.
        """
        self.seed = 100
        # paddle.seed is a function: assigning to it replaced the function and
        # seeded nothing, so call it instead.
        paddle.seed(self.seed)
        self.batch_size = 128
        self.epochs = 20
        self.max_seq_len = 512
        self.datasetname = datasetname
        # One-call dataset loading provided by paddlenlp.
        self.train_ds, self.dev_ds, self.test_ds = load_dataset(datasetname, splits=["train", "dev", "test"])
        # Tokenizer that matches the pretrained model.
        self.tokenizer = paddlenlp.transformers.ErnieGramTokenizer.from_pretrained("ernie-gram-zh")
        # https://gitee.com/paddlepaddle/PaddleNLP/blob/develop/docs/model_zoo/transformers.rst
        self.pretrained_model = paddlenlp.transformers.ErnieGramModel.from_pretrained("ernie-gram-zh")
        self.model = PretrainedModel(self.pretrained_model)
        if state_dict_path:
            try:
                state_dict = paddle.load(state_dict_path)
                self.model.set_dict(state_dict)
            except Exception as err:  # best effort, but let KeyboardInterrupt/SystemExit through
                print("加载模型参数失败!", err)
        self.pathname = "checkpoint"
        self.global_step = 0
        # exist_ok avoids a check-then-create race when several processes start together.
        os.makedirs(self.pathname, exist_ok=True)
        self.save_dir = ""
        self.save_param_path = ""

    def fit(self):
        """Train, evaluate on dev every 100 steps, then predict the test split."""
        train_ds = load_dataset(read, data=self.train_ds, datasetname=self.datasetname, lazy=False)
        dev_ds = load_dataset(read, data=self.dev_ds, datasetname=self.datasetname, lazy=False)
        test_ds = load_dataset(read, data=self.test_ds, datasetname=self.datasetname, predict=True, lazy=False)
        # Show the first few examples.
        for i, example in enumerate(train_ds):
            if i >= 5:
                break
            print(example)
        input_ids, token_type_ids, label = convert_data(train_ds[0], self.tokenizer, self.datasetname)
        print(input_ids)
        # e.g. [1, 692, 811, 445, 2001, 497, 5, 654, 21, 692, 811, 614, 356, 314, 5, 291, 21, 2, 329, 445, 2001, 497, 5, 654, 21, 692, 811, 614, 356, 314, 5, 291, 21, 2]
        print(token_type_ids)
        # e.g. [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
        print(label)
        # e.g. [1]

        # Per-example conversion to model inputs.
        trans_func = partial(convert_data, tokenizer=self.tokenizer, datasetname=self.datasetname,
                             max_seq_len=self.max_seq_len)
        # Batch collation: pad ids and type ids, stack labels. Padding is to
        # the longest sequence in the batch rather than a global maximum.
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=self.tokenizer.pad_token_id),
            Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id),
            Stack(dtype="int64")
        ): [d for d in fn(samples)]

        batch_sampler = paddle.io.DistributedBatchSampler(train_ds, batch_size=self.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_ds.map(trans_func),
            batch_sampler=batch_sampler,
            collate_fn=batchify_fn,
            return_list=True
        )

        batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=self.batch_size, shuffle=False)
        dev_data_loader = paddle.io.DataLoader(
            dataset=dev_ds.map(trans_func),
            batch_sampler=batch_sampler,
            collate_fn=batchify_fn,
            return_list=True
        )
        num_training_steps = len(train_data_loader) * self.epochs
        # Learning-rate schedule.
        lr_scheduler = LinearDecayWithWarmup(5e-5, num_training_steps, 0.0)
        # Names of parameters subject to weight decay (a set: looked up per parameter).
        decay_params = {
            p.name for n, p in self.model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        }
        # Gradient clipping.
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=self.model.parameters(),
            weight_decay=0.0,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=clip
        )
        # The model already returns softmax probabilities, so the loss must
        # not apply softmax a second time.
        criterion = paddle.nn.loss.CrossEntropyLoss(use_softmax=False)
        # NOTE: this is precision; it is what the log messages label "acc".
        metric = paddle.metric.Precision()

        t_start = time.time()
        F1 = 0  # best dev F1 so far
        for epoch in range(1, self.epochs + 1):
            for step, batch in enumerate(train_data_loader, start=1):
                input_ids, token_type_ids, labels = batch
                probs = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
                loss = criterion(probs, labels)
                metric.update(np.argmax(probs.numpy(), axis=1), labels)
                acc = metric.accumulate()
                self.global_step += 1
                if self.global_step % 10 == 0:
                    print("训练步数 %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f, speed: %.2f step/s"
                          % (self.global_step, epoch, step, loss, acc,
                             10 / (time.time() - t_start)))
                    t_start = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                if self.global_step % 100 == 0:
                    _, F1, improve = self.evaluate(criterion, metric, dev_data_loader, F1, "dev")
                    if improve:
                        print("评估结果 F1值 : {:.3f} , 模型保存于:".format(F1) + self.save_param_path)
                    else:
                        print("最好结果 F1值 : {:.3f} , 当前评估没有提升!".format(F1))
        print("-----训练完成------")
        # Predict with the best checkpoint; if none was ever saved (no
        # evaluation improved F1), keep the current weights.
        if self.save_param_path:
            state_dict = paddle.load(self.save_param_path)
            self.model.set_dict(state_dict)
        self.writeToFile(test_ds)

    def evaluate(self, criterion, metric, data_loader, F1, phase="dev"):
        """Evaluate on ``data_loader`` and checkpoint the model if F1 improved.

        Args:
            criterion: Loss callable applied to ``(probs, labels)``.
            metric: Precision metric; it is reset before and after use.
            data_loader: Iterable of ``(input_ids, token_type_ids, labels)``.
            F1: Best F1 seen so far.
            phase: Label used in the log line.

        Returns:
            tuple: ``(prob_list, best_f1, improve)`` where ``improve`` tells
            whether this evaluation beat ``F1`` and saved a checkpoint.
        """
        self.model.eval()
        metric.reset()
        recall = paddle.metric.Recall()
        losses = []
        prob_list = []
        with paddle.no_grad():
            for batch in data_loader:
                input_ids, token_type_ids, labels = batch
                probs = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
                prob_list.extend(probs)
                losses.append(criterion(probs, labels).numpy())
                preds = np.argmax(probs.numpy(), axis=1)
                metric.update(preds, labels)
                recall.update(preds, labels)
        # Read the totals after the loop so both names exist even when the
        # loader yields no batch.
        acc = metric.accumulate()
        rec = recall.accumulate()
        f1 = 0 if (acc + rec) == 0.0 else 2 * acc * rec / (acc + rec)
        improve = False
        if f1 > F1:  # keep the parameters with the highest F1
            F1 = f1
            improve = True
            self.save_dir = os.path.join(self.pathname, "best_model_state")
            os.makedirs(self.save_dir, exist_ok=True)
            self.save_param_path = os.path.join(self.save_dir, "model_state_pdparams_F1_" + str(round(F1, 4)))
            paddle.save(self.model.state_dict(), self.save_param_path)
            self.tokenizer.save_pretrained(self.save_dir)

        print("评估 {} loss: {:.5}, acc: {:.5}, recall: {:.5}".format(phase, np.mean(losses), acc, rec))
        self.model.train()
        metric.reset()
        return prob_list, F1, improve

    def predict(self, text1, text2):
        """Return class probabilities for a single sentence pair.

        Returns:
            numpy.ndarray: Array of shape ``(1, 2)`` holding the model output.
        """
        encoded_inputs = self.tokenizer(text=text1, text_pair=text2, max_seq_len=self.max_seq_len)
        input_ids = paddle.to_tensor([encoded_inputs["input_ids"]])
        token_type_ids = paddle.to_tensor([encoded_inputs["token_type_ids"]])
        self.model.eval()
        with paddle.no_grad():
            # The model output is already softmax-normalized; applying softmax
            # again would distort the probabilities.
            probs = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
        return probs.numpy()

    def writeToFile(self, test_ds):
        """Predict every test example and write ``<datasetname>.tsv``.

        The file has an ``index<TAB>prediction`` header followed by one
        tab-separated row per example; the label is 1 when the probability of
        class 1 is at least 0.5.
        """
        with open(self.datasetname + ".tsv", "w", encoding="utf-8") as f:
            f.write("index\tprediction\n")
            for i, d in enumerate(test_ds):
                prob = self.predict(d["text1"], d["text2"])
                label = 1 if prob[0][1] >= 0.5 else 0
                f.write(str(i) + "\t" + str(label) + "\n")


if __name__ == "__main__":
    # Launch with: python -m paddle.distributed.launch --gpus '0,1' xxx.py &
    # Parallel-training references:
    # https://aistudio.baidu.com/aistudio/projectdetail/1222066
    # https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html
    dist.init_parallel_env()  # set up the parallel environment first
    # Train and predict each dataset in turn.
    for dataset_name in ("lcqmc", "bq_corpus", "paws-x"):
        recognizer = Recongnizer(dataset_name)
        recognizer.fit()

2. PyTorch

预训练模型下载:https://huggingface.co/nghuyong/ernie-1.0

代码语言:python
# %%
# 比赛地址
# https://aistudio.baidu.com/aistudio/competition/detail/45
import time
import os
import numpy as np
import torch
from datetime import timedelta
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from sklearn import metrics

# Fix the random seeds and ask cuDNN for deterministic kernels; the intent is
# that repeated runs give the same results.
torch.backends.cudnn.deterministic = True
np.random.seed(1)
torch.manual_seed(1)
# Must stay after torch.manual_seed so the CUDA generators end up seeded with 4.
torch.cuda.manual_seed_all(4)


def get_time_dif(start_time):
    """Return the time elapsed since ``start_time``.

    Args:
        start_time: A ``time.time()`` timestamp in seconds.

    Returns:
        datetime.timedelta: Elapsed wall-clock time rounded to whole seconds.
    """
    elapsed_seconds = round(time.time() - start_time)
    return timedelta(seconds=int(elapsed_seconds))


def load(file, test=False):
    """Read a tab-separated sentence-pair file into a list of examples.

    Each non-empty line holds ``text1<TAB>text2`` followed, for labelled
    data, by ``<TAB>label``.

    Args:
        file: Path of the UTF-8 encoded TSV file.
        test: When true the file has no label column and none is read.

    Returns:
        list[dict]: One mapping per line with ``text1``, ``text2`` and, for
        labelled data, an integer ``label``.

    Raises:
        IndexError: If a non-empty line has fewer columns than required.
        ValueError: If a label is not an integer.
    """
    examples = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would eat real data when the final line has no newline.
            line = line.rstrip('\r\n')
            if not line:
                continue  # tolerate blank lines such as a trailing one
            fields = line.split('\t')
            example = {"text1": fields[0], "text2": fields[1]}
            if not test:
                example["label"] = int(fields[2])
            examples.append(example)
    return examples


def load_dataset(datasetname):
    """Load the train, dev and test splits stored under ``datasetname``.

    Args:
        datasetname: Directory containing ``train.tsv``, ``dev.tsv`` and
            ``test.tsv``.

    Returns:
        tuple: ``(train_ds, dev_ds, test_ds)``; the test split is loaded
        without labels.
    """
    def split_path(filename):
        return os.path.join(datasetname, filename)

    train_ds = load(split_path("train.tsv"))
    dev_ds = load(split_path("dev.tsv"))
    return train_ds, dev_ds, load(split_path("test.tsv"), True)


def convert_data(data, tokenizer, max_seq_len=512, is_test=False):
    """Tokenize one example or a batch of examples into NumPy model inputs.

    Args:
        data: Either a single mapping or an iterable of mappings, each with
            ``text1``/``text2`` (and ``label`` unless ``is_test`` is true).
        tokenizer: Callable accepting the keyword arguments used below; its
            result must provide ``input_ids``, ``token_type_ids`` and
            ``attention_mask``.
        max_seq_len: Passed to the tokenizer as ``max_length``.
        is_test: When true, labels are neither read nor returned.

    Returns:
        tuple: ``(input_ids, token_type_ids, mask)`` for test data, otherwise
        the same three items followed by an ``int64`` label array.
    """
    if isinstance(data, dict):
        first, second = data["text1"], data["text2"]
        if not is_test:
            labels = [data["label"]]
    else:
        first, second, labels = [], [], []
        for record in data:
            if not is_test:
                labels.append(record["label"])
            first.append(record["text1"])
            second.append(record["text2"])

    # Tokenizer arguments are described at
    # https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=tokenizer
    encoding = tokenizer(
        text=first,
        text_pair=second,
        add_special_tokens=True,
        padding='longest',
        max_length=max_seq_len,
        return_tensors='np',
        return_token_type_ids=True,
        return_attention_mask=True,
        truncation=True,
    )
    features = tuple(encoding[key] for key in ("input_ids", "token_type_ids", "attention_mask"))
    if is_test:
        return features
    return features + (np.array(labels, dtype="int64"),)


class DatasetIterator(object):
    """Re-iterable batch iterator that tokenizes each batch on demand.

    The examples are grouped into lists of ``batch_size`` at construction
    time; tokenization and tensor conversion happen when a batch is fetched.
    """

    def __init__(self, tokenizer, data, batch_size, device, max_seq_len=512, pred=False):
        """Group ``data`` into batches.

        Args:
            tokenizer: Tokenizer forwarded to ``convert_data``.
            data: Iterable of example mappings.
            batch_size: Number of examples per batch; the last batch may be
                smaller.
            device: Device the produced tensors are moved to.
            max_seq_len: Maximum sequence length forwarded to ``convert_data``.
            pred: When true the examples carry no labels.
        """
        self.batch_size = batch_size
        self.pred = pred
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.device = device
        self.dataset = []
        batch = []
        for example in data:
            batch.append(example)
            if len(batch) == self.batch_size:
                self.dataset.append(batch)
                batch = []
        if batch:  # trailing partial batch
            self.dataset.append(batch)
        self.n_batches = len(self.dataset)
        self.index = 0

    def _as_long(self, values):
        """Convert an array-like to an int64 tensor on ``self.device``."""
        # as_tensor with an explicit dtype accepts any integer width.
        return torch.as_tensor(np.asarray(values), dtype=torch.long).to(self.device)

    def _to_tensor(self, datas):
        """Turn the arrays from ``convert_data`` into tensors.

        Returns:
            ``(input_ids, token_type_ids, mask)`` in prediction mode,
            otherwise ``((input_ids, token_type_ids, mask), label)``.
        """
        features = (self._as_long(datas[0]), self._as_long(datas[1]), self._as_long(datas[2]))
        if self.pred:
            return features
        return features, self._as_long(datas[3])

    def __next__(self):
        if self.index >= self.n_batches:
            self.index = 0  # rewind so the next epoch starts from the first batch
            raise StopIteration
        raw_batch = self.dataset[self.index]
        self.index += 1
        encoded = convert_data(raw_batch, self.tokenizer, self.max_seq_len, self.pred)
        return self._to_tensor(encoded)

    def __iter__(self):
        # Every new loop starts from the first batch, even if a previous loop
        # was abandoned part-way through.
        self.index = 0
        return self

    def __len__(self):
        return self.n_batches


def bulid_iterator(tokenizer, dataset, batch_size, device, max_seq_len=512, pred=False):
    """Create a ``DatasetIterator`` over ``dataset`` with the given settings."""
    return DatasetIterator(
        tokenizer=tokenizer,
        data=dataset,
        batch_size=batch_size,
        device=device,
        max_seq_len=max_seq_len,
        pred=pred,
    )


class PretrainedModel(nn.Module):
    """Pretrained encoder followed by dropout and a 2-way linear classifier."""

    def __init__(self, pretrained_model, dropout=None):
        """Wrap ``pretrained_model`` with a classification head.

        Args:
            pretrained_model: Encoder module whose output provides
                ``pooler_output``.
            dropout: Dropout probability; ``None`` selects 0.1.
        """
        super(PretrainedModel, self).__init__()
        self.ptm = pretrained_model
        for param in self.ptm.parameters():
            param.requires_grad = True  # fine-tune the encoder as well
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
        # Size the head from the encoder's config when it exposes a hidden
        # size; fall back to the previously hard-coded 768 otherwise.
        config = getattr(pretrained_model, "config", None)
        hidden_size = getattr(config, "hidden_size", 768)
        self.clf = nn.Linear(hidden_size, 2)

    def forward(self, x):
        """Return softmax class probabilities.

        Args:
            x: Sequence ``(input_ids, token_type_ids, attention_mask)``.

        Returns:
            Tensor with one row of two class probabilities per input row.
        """
        out = self.ptm(input_ids=x[0], attention_mask=x[2], token_type_ids=x[1])
        cls_embedding = self.dropout(out['pooler_output'])
        logits = self.clf(cls_embedding)
        probs = F.softmax(logits, dim=1)
        return probs


class Recongnizer():
    """Fine-tune a local ERNIE checkpoint on one sentence-pair dataset.

    Construction loads the TSV splits, the tokenizer and the pretrained
    encoder from ``../ERNIE_pretrain``. ``fit`` trains the model, keeps the
    parameters with the lowest dev loss and writes test predictions to
    ``<datasetname>.tsv``.
    """

    def __init__(self, datasetname, state_dict_path=None):
        """Load data, tokenizer and model.

        Args:
            datasetname: Directory holding the TSV splits; also the stem of
                the prediction file.
            state_dict_path: Optional path to saved model parameters. Loading
                is best effort: a failure is reported and training starts from
                the pretrained weights.
        """
        self.seed = torch.initial_seed()
        self.datasetname = datasetname
        self.batch_size = 64
        self.epochs = 3
        self.max_seq_len = 512
        self.lr = 1e-5
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.train_ds, self.dev_ds, self.test_ds = load_dataset(self.datasetname)
        # Tokenizer matching the pretrained model, see https://huggingface.co/nghuyong/ernie-1.0
        self.tokenizer = AutoTokenizer.from_pretrained("../ERNIE_pretrain")
        self.pretrained_model = AutoModel.from_pretrained("../ERNIE_pretrain")
        self.model = PretrainedModel(self.pretrained_model).to(self.device)
        if state_dict_path:
            try:
                # map_location lets a GPU-saved checkpoint load on a CPU-only machine.
                state_dict = torch.load(state_dict_path, map_location=self.device)
                self.model.load_state_dict(state_dict)
            except Exception as err:  # best effort, but let KeyboardInterrupt/SystemExit through
                print("加载模型参数失败!", err)
        self.pathname = "checkpoint_pt"
        self.global_step = 0
        os.makedirs(self.pathname, exist_ok=True)
        self.save_param_path = os.path.join(self.pathname, "best_params_pt")

    @staticmethod
    def _loss_from_probs(probs, labels):
        """Negative log-likelihood of ``labels`` under probabilities ``probs``.

        The model returns softmax probabilities, so ``F.cross_entropy`` (which
        expects unnormalized logits) would apply softmax a second time. The
        clamp keeps the logarithm finite when a probability underflows to 0.
        """
        return F.nll_loss(torch.log(probs.clamp_min(1e-12)), labels)

    def fit(self):
        """Train, evaluate on dev every 10 steps, then predict the test split."""
        train_ds = bulid_iterator(self.tokenizer, self.train_ds, self.batch_size, self.device, self.max_seq_len)
        dev_ds = bulid_iterator(self.tokenizer, self.dev_ds, self.batch_size, self.device, self.max_seq_len)
        # Show one batch. Only the first batch is tokenized; the iterator is
        # rewound afterwards so training starts from the first batch.
        first_batch = next(iter(train_ds), None)
        train_ds.index = 0
        if first_batch is not None:
            (input_ids, token_type_ids, mask), label = first_batch
            print(input_ids)
            print(token_type_ids)
            print(label)

        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # Biases and LayerNorm parameters are excluded from weight decay. The
        # key must be spelled 'weight_decay'; a misspelled key is ignored and
        # the group falls back to the optimizer's default decay.
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(params=optimizer_grouped_parameters, lr=self.lr)

        t_start = time.time()
        dev_best_loss = float('inf')
        saved = False  # whether this run wrote a checkpoint
        self.model.train()
        for epoch in range(1, self.epochs + 1):
            for step, batch in enumerate(train_ds, start=1):
                x, labels = batch
                optimizer.zero_grad()
                probs = self.model(x)
                loss = self._loss_from_probs(probs, labels)
                loss.backward()
                optimizer.step()

                self.global_step += 1
                if self.global_step % 10 == 0:
                    true = labels.detach().cpu().numpy()
                    predit = probs.detach().argmax(dim=1).cpu().numpy()
                    train_acc = metrics.accuracy_score(true, predit)
                    dev_acc, dev_loss = self.evaluate(dev_ds)
                    if dev_loss < dev_best_loss:  # keep the model with the lowest dev loss
                        dev_best_loss = dev_loss
                        torch.save(self.model.state_dict(), self.save_param_path)
                        saved = True
                        improve = '*'
                    else:
                        improve = ''
                    time_dif = get_time_dif(t_start)
                    msg = 'Iter:{0:>6}, Train Loss:{1:>5.2}, Train Acc:{2:>6.2}, Val Loss:{3:>5.2}, Val Acc:{4:>6.2%}, Time:{5} {6}'
                    print(msg.format(self.global_step, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                    self.model.train()

        print("-----训练完成------")
        # Predict with the best parameters of this run. The checkpoint path is
        # the same for every dataset, so a file left by an earlier run must
        # not be loaded when this run never saved one.
        if saved:
            state_dict = torch.load(self.save_param_path, map_location=self.device)
            self.model.load_state_dict(state_dict)
        self.model.eval()
        self.writeToFile(self.test_ds)

    def evaluate(self, data_loader, test=False):
        """Compute accuracy and mean batch loss over ``data_loader``.

        Args:
            data_loader: Iterable of ``(x, labels)`` batches supporting ``len``.
            test: When true the predicted labels are returned as well.

        Returns:
            tuple: ``(acc, mean_loss)`` or, when ``test`` is true,
            ``(acc, mean_loss, predict_all)``. ``mean_loss`` is a Python float.
        """
        self.model.eval()
        loss_total = 0.0
        predictions = []
        targets = []
        with torch.no_grad():
            for x, labels in data_loader:
                outputs = self.model(x)
                loss_total += self._loss_from_probs(outputs, labels).item()
                targets.append(labels.cpu().numpy())
                predictions.append(outputs.argmax(dim=1).cpu().numpy())

        # Concatenate once instead of growing an array batch by batch.
        labels_all = np.concatenate(targets) if targets else np.array([], dtype=int)
        predict_all = np.concatenate(predictions) if predictions else np.array([], dtype=int)
        acc = metrics.accuracy_score(labels_all, predict_all)
        mean_loss = loss_total / max(len(data_loader), 1)
        if test:
            return acc, mean_loss, predict_all
        return acc, mean_loss

    def predict(self, text1, text2):
        """Return class probabilities for a single sentence pair.

        Returns:
            numpy.ndarray: The model output for the pair, moved to the CPU.
        """
        d = {"text1": text1, "text2": text2}
        input_ids, token_type_ids, mask = convert_data(d, self.tokenizer, max_seq_len=self.max_seq_len, is_test=True)
        x = tuple(
            torch.as_tensor(np.asarray(arr), dtype=torch.long).to(self.device)
            for arr in (input_ids, token_type_ids, mask)
        )
        self.model.eval()
        with torch.no_grad():
            probs = self.model(x)
        return probs.cpu().numpy()

    def writeToFile(self, test_ds):
        """Predict every test example and write ``<datasetname>.tsv``.

        The file has an ``index<TAB>prediction`` header followed by one
        tab-separated row per example; the label is 1 when the probability of
        class 1 is at least 0.5.
        """
        with open(self.datasetname + ".tsv", "w", encoding="utf-8") as f:
            f.write("index\tprediction\n")
            for i, d in enumerate(test_ds):
                prob = self.predict(d["text1"], d["text2"])
                label = 1 if prob[0][1] >= 0.5 else 0
                f.write(str(i) + "\t" + str(label) + "\n")


if __name__ == "__main__":
    # Train and predict each dataset in turn.
    for dataset_name in ("lcqmc", "bq_corpus", "paws-x"):
        recognizer = Recongnizer(dataset_name)
        recognizer.fit()

3. 提交结果

0 人点赞