Code repository: https://github.com/china-ai-law-challenge/CAIL2021/
# /*
# * @Author: Yue.Fan
# * @Date: 2022-03-23 11:35:28
# * @Last Modified by: Yue.Fan
# * @Last Modified time: 2022-03-23 11:35:28
# */
import logging
from dataclasses import dataclass
from typing import List, Dict
import json
from tqdm import tqdm
from transformers import PreTrainedTokenizer, BasicTokenizer, BertTokenizer
from transformers.tokenization_utils import _is_whitespace, _is_punctuation, _is_control
import numpy as np
import torch
from torch.utils.data import Dataset, TensorDataset
YES_TOKEN = "[unused1]"
NO_TOKEN = "[unused2]"
class CAILExample:
def __init__(self,
qas_id: str,
question_text: str,
context_text: str,
answer_texts: List[str],
answer_start_indexes: List[int],
is_impossible: bool,
is_yes_no: bool,
is_multi_span: bool,
answers: List,
case_id: str,
case_name: str):
self.qas_id = qas_id  # unique id of each question
self.question_text = question_text  # question text
self.context_text = context_text  # context text
self.answer_texts = answer_texts  # list of answer texts
self.answer_start_indexes = answer_start_indexes  # list of answer start positions
self.is_impossible = is_impossible  # whether the question is unanswerable
self.is_yes_no = is_yes_no  # whether it is a yes/no question
self.is_multi_span = is_multi_span  # whether the answer consists of multiple spans
self.answers = answers  # raw, unprocessed answer list
self.case_id = case_id  # unique case id of each context
self.case_name = case_name  # case type
self.doc_tokens = []
self.char_to_word_offset = []
raw_doc_tokens = customize_tokenizer(context_text, True)  # preliminary tokenization
k = 0
temp_word = ""
# Some contexts contain spaces, line breaks, etc., which would shift the answer offsets after BERT tokenization.
# char_to_word_offset compensates for this. For example:
"""
我\n\t爱北京\n\t天安门
['我', '爱', '北', '京', '天', '安', '门']
[0, 0, 0, 1, 2, 3, 3, 3, 4, 5, 6]
The two extra 0s between 0 and 1 indicate two whitespace characters between 我 and 爱.
"""
for char in self.context_text:
if _is_whitespace(char):
self.char_to_word_offset.append(k - 1)
continue
else:
temp_word += char
self.char_to_word_offset.append(k)
if temp_word.lower() == raw_doc_tokens[k]:
self.doc_tokens.append(temp_word)
temp_word = ""
k += 1
assert k == len(raw_doc_tokens)
if answer_texts is not None: # if for training
start_positions = []
end_positions = []
if not is_impossible and not is_yes_no:
for i in range(len(answer_texts)):
# Using the example above again:
# "北京" starts at position 4 in the raw text
answer_offset = context_text.index(answer_texts[i])  # calling index() directly here is debatable
# answer_offset = answer_start_indexes[i]
answer_length = len(answer_texts[i])
start_position = self.char_to_word_offset[answer_offset]  # corresponding position within doc_tokens
end_position = self.char_to_word_offset[answer_offset + answer_length - 1]
start_positions.append(start_position)  # actual start position
end_positions.append(end_position)  # actual end position
else:
start_positions.append(-1)  # no answer: set to -1
end_positions.append(-1)  # no answer: set to -1
self.start_positions = start_positions
self.end_positions = end_positions
def __repr__(self):
string = ""
for key, value in self.__dict__.items():
string = f"{key}: {value}n"
# return f"<{self.__class__}>"
return string
@dataclass
class CAILFeature:
input_ids: List[int]
attention_mask: List[int]
token_type_ids: List[int]
cls_index: int
p_mask: List
example_index: int
unique_id: int
paragraph_len: int
token_is_max_context: object
tokens: List
token_to_orig_map: Dict
start_positions: List[int]
end_positions: List[int]
is_impossible: bool
@dataclass
class CAILResult:
unique_id: int
start_logits: torch.Tensor
end_logits: torch.Tensor
def read_examples(file: str, is_training: bool) -> List[CAILExample]:
example_list = []
with open(file, "r", encoding="utf-8") as file:
original_data = json.load(file)["data"]
for entry in tqdm(original_data):
case_id = entry["caseid"]
for paragraph in entry["paragraphs"]:
context = paragraph["context"]
case_name = paragraph["casename"]
for qa in paragraph["qas"]:
question = qa["question"]
qas_id = qa["id"]
answer_texts = None
answer_starts = None
is_impossible = None
is_yes_no = None
is_multi_span = None
all_answers = None
# CAIL2021 covers the following answer types: single-span, yes/no, and unanswerable; compared with
# earlier editions it additionally introduces the multi-span type, i.e. an answer composed of several spans.
if is_training:
all_answers = qa["answers"]
# all_answers == [] means there is no answer
if len(all_answers) == 0:
answer = []
else:
# otherwise take the first one
answer = all_answers[0]
# a little difference between 19 and 21 data.
# if it is a dict, wrap it in a list
if type(answer) == dict:
answer = [answer]
# no answer: initialize the answer text as "" and the start position as -1
if len(answer) == 0: # NO Answer
answer_texts = [""]
answer_starts = [-1]
else:
# otherwise collect the answers here
answer_texts = []
answer_starts = []
# a single span yields one entry;
# multiple spans are iterated over
for a in answer:
answer_texts.append(a["text"])
answer_starts.append(a["answer_start"])
# Judge YES or NO
# determine whether this is a yes/no question and set the flag
if len(answer_texts) == 1 and answer_starts[0] == -1 and (
answer_texts[0] == "YES" or answer_texts[0] == "NO"):
is_yes_no = True
else:
is_yes_no = False
# Judge Multi Span
# determine whether the answer consists of multiple spans
if len(answer_texts) > 1:
is_multi_span = True
else:
is_multi_span = False
# Judge No Answer
# mark unanswerable questions as follows
if len(answer_texts) == 1 and answer_texts[0] == "":
is_impossible = True
else:
is_impossible = False
example = CAILExample(
qas_id=qas_id,
question_text=question,
context_text=context,
answer_texts=answer_texts,
answer_start_indexes=answer_starts,
is_impossible=is_impossible,
is_yes_no=is_yes_no,
is_multi_span=is_multi_span,
answers=all_answers,
case_id=case_id,
case_name=case_name
)
# Discard possible bad example
if is_training and example.answer_start_indexes[0] >= 0:
for i in range(len(example.answer_texts)):
actual_text = "".join(
example.doc_tokens[example.start_positions[i]: (example.end_positions[i] + 1)])
cleaned_answer_text = "".join(whitespace_tokenize(example.answer_texts[i]))
if actual_text.find(cleaned_answer_text) == -1:
logging.info(f"Could not find answer: {actual_text} vs. {cleaned_answer_text}")
continue
example_list.append(example)
return example_list
def convert_examples_to_features(example_list: List[CAILExample], tokenizer: PreTrainedTokenizer, args,
is_training: bool) -> List[CAILFeature]:
# Validate there are no duplicate ids in example_list
qas_id_set = set()
for example in example_list:
if example.qas_id in qas_id_set:
raise Exception("Duplicate qas_id!")
else:
qas_id_set.add(example.qas_id)
feature_list = []
unique_id = 0
example_index = 0
i = 0
for example in tqdm(example_list):
i += 1
# if i % 100 == 0:
# print(i)
current_example_features = convert_single_example_to_features(example, tokenizer, args.max_seq_length,
args.max_query_length, args.doc_stride,
is_training)
for feature in current_example_features:
feature.example_index = example_index
feature.unique_id = unique_id
unique_id += 1
example_index += 1
feature_list.extend(current_example_features)
return feature_list
def convert_single_example_to_features(example: CAILExample, tokenizer: PreTrainedTokenizer,
max_seq_length, max_query_length, doc_stride, is_training) -> List[CAILFeature]:
"""
Convert the original text into a sequence that the model (ELECTRA/BERT-style) can accept.
Format: [CLS] YES_TOKEN NO_TOKEN question [SEP] context [SEP]
"""
features = []
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
"""
['我', '爱', '北', '京', '15826458891', '天', '安', '门']
orig_to_tok_index:[0, 1, 2, 3, 4, 9, 10, 11]
tok_to_orig_index:[0, 1, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7]
all_doc_tokens:['我', '爱', '北', '京', '158', '##26', '##45', '##88', '##91', '天', '安', '门']
"""
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)  # further split each token with WordPiece
for sub_token in sub_tokens:
tok_to_orig_index.append(i)  # every sub_token maps back to the same original index i
all_doc_tokens.append(sub_token)
if is_training:
if example.is_impossible or example.answer_start_indexes[0] == -1:
start_positions = [-1]
end_positions = [-1]
else:
start_positions = []
end_positions = []
# recalibrate the positions after tokenization
for i in range(len(example.start_positions)):
start_position = orig_to_tok_index[example.start_positions[i]]
if example.end_positions[i] < len(example.doc_tokens) - 1:
end_position = orig_to_tok_index[example.end_positions[i] + 1] - 1
else:
end_position = len(all_doc_tokens) - 1
(start_position, end_position) = _improve_answer_span(
all_doc_tokens, start_position, end_position, tokenizer, example.answer_texts[i]
)
start_positions.append(start_position)
end_positions.append(end_position)
else:
start_positions = None
end_positions = None
query_tokens = tokenizer.tokenize(example.question_text)
query_tokens = [YES_TOKEN, NO_TOKEN] + query_tokens  # prepend the yes/no tokens to the question
truncated_query = tokenizer.encode(query_tokens, add_special_tokens=False, max_length=max_query_length,
truncation=True)
sequence_pair_added_tokens = tokenizer.num_special_tokens_to_add(pair=True)
assert sequence_pair_added_tokens == 3
added_tokens_num_before_second_sequence = tokenizer.num_special_tokens_to_add(pair=False)
assert added_tokens_num_before_second_sequence == 2
span_doc_tokens = all_doc_tokens
spans = []
# print("query_tokens:", query_tokens)
# print("all_doc_tokens:", all_doc_tokens)
# print("".join(all_doc_tokens))
# print("start_positions:", start_positions)
# print("end_positions:", end_positions)
# sliding-window approach
while len(spans) * doc_stride < len(all_doc_tokens):
# print(max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,)
# slide the window with a step size of doc_stride
encoded_dict = tokenizer.encode_plus(
truncated_query,
span_doc_tokens,
max_length=max_seq_length,
return_overflowing_tokens=True,
padding="max_length",
stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
truncation="only_second",
return_token_type_ids=True
)
# print(span_doc_tokens)
# print("stride:", max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens)
# print(tokenizer.convert_ids_to_tokens(encoded_dict['input_ids']))
# print(len(encoded_dict['input_ids']))
# print(tokenizer.convert_ids_to_tokens(encoded_dict['overflowing_tokens']))
# actual length of the context portion in this span
paragraph_len = min(
len(all_doc_tokens) - len(spans) * doc_stride,
max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
)
# token ids without [PAD]
if tokenizer.pad_token_id in encoded_dict["input_ids"]:
non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
else:
non_padded_ids = encoded_dict["input_ids"]
# convert the ids back to tokens
tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
token_to_orig_map = {}
token_to_orig_map[0] = -1
token_to_orig_map[1] = -1
token_to_orig_map[2] = -1
token_is_max_context = {0: True, 1: True, 2: True}
for i in range(paragraph_len):
# index of the token within the input [CLS]query[SEP]context[SEP]
index = len(truncated_query) + added_tokens_num_before_second_sequence + i
# tok_to_orig_index is the token's index within the context
# len(spans) tells how many spans have been produced so far
# token_to_orig_map maps index back to the original position i
token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
# print(token_to_orig_map)
encoded_dict["paragraph_len"] = paragraph_len
encoded_dict["tokens"] = tokens
encoded_dict["token_to_orig_map"] = token_to_orig_map
encoded_dict["truncated_query_with_special_tokens_length"] = len(
truncated_query) added_tokens_num_before_second_sequence
encoded_dict["token_is_max_context"] = token_is_max_context
encoded_dict["start"] = len(spans) * doc_stride # 文本的起始索引
encoded_dict["length"] = paragraph_len
# here the token_type_ids of the yes/no marker tokens are set to 1 -- why?
encoded_dict["token_type_ids"][1] = 1
encoded_dict["token_type_ids"][2] = 1
# print(encoded_dict["token_type_ids"])
spans.append(encoded_dict)
if "overflowing_tokens" not in encoded_dict or len(encoded_dict["overflowing_tokens"]) == 0:
break
else:
span_doc_tokens = encoded_dict["overflowing_tokens"]
for doc_span_index in range(len(spans)):
for j in range(spans[doc_span_index]["paragraph_len"]):
is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
index = spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
spans[doc_span_index]["token_is_max_context"][index] = is_max_context
for span in spans:
cls_index = span["input_ids"].index(tokenizer.cls_token_id)
# p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens that can be in an answer)
# p_mask sets the question and [SEP] positions to 1 and everything else to 0
p_mask = np.array(span["token_type_ids"])
p_mask = np.minimum(p_mask, 1)
p_mask = 1 - p_mask
p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1
p_mask[cls_index] = 0
p_mask[1] = 0
p_mask[2] = 0
current_start_positions = None
current_end_positions = None
span_is_impossible = None
if is_training:
current_start_positions = [0 for i in range(max_seq_length)]
current_end_positions = [0 for i in range(max_seq_length)]
doc_start = span["start"]
doc_end = span["start"] span["length"] - 1 # 文本的截止索引
doc_offset = len(truncated_query) added_tokens_num_before_second_sequence # 偏移量
for i in range(len(start_positions)):
start_position = start_positions[i]
end_position = end_positions[i]
# remap start_position and end_position into this span
if start_position >= doc_start and end_position <= doc_end:
span_is_impossible = False
current_start_positions[start_position - doc_start + doc_offset] = 1
current_end_positions[end_position - doc_start + doc_offset] = 1
# print(current_start_positions)
# print(current_end_positions)
# handle yes/no questions: set index 1 or 2 to 1
if example.is_yes_no:
assert len(example.answer_start_indexes) == 1
assert 1 not in current_start_positions and 1 not in current_end_positions
if example.answer_texts[0] == "YES" and example.answer_start_indexes[0] == -1:
current_start_positions[1] = 1
current_end_positions[1] = 1
elif example.answer_texts[0] == "NO" and example.answer_start_indexes[0] == -1:
current_start_positions[2] = 1
current_end_positions[2] = 1
else:
raise Exception("example构造出错,请检查")
span_is_impossible = False
# handle unanswerable questions: set the cls index (0) to 1
if 1 not in current_start_positions: # Current Feature does not contain answer span
span_is_impossible = True
current_start_positions[cls_index] = 1
current_end_positions[cls_index] = 1
assert span_is_impossible is not None
features.append(
CAILFeature(
input_ids=span["input_ids"],
attention_mask=span["attention_mask"],
token_type_ids=span["token_type_ids"],
cls_index=cls_index,
p_mask=p_mask.tolist(),
example_index=0,
unique_id=0,
paragraph_len=span["paragraph_len"],
token_is_max_context=span["token_is_max_context"],
tokens=span["tokens"],
token_to_orig_map=span["token_to_orig_map"],
start_positions=current_start_positions,
end_positions=current_end_positions,
is_impossible=span_is_impossible
)
)
return features
def convert_features_to_dataset(features: List[CAILFeature], is_training: bool) -> Dataset:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
all_example_indexes = torch.tensor([f.example_index for f in features], dtype=torch.long)
all_feature_indexes = torch.arange(all_input_ids.size(0), dtype=torch.long)
if is_training:
all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float)
all_start_labels = torch.tensor([f.start_positions for f in features], dtype=torch.float)
all_end_labels = torch.tensor([f.end_positions for f in features], dtype=torch.float)
dataset = TensorDataset(
all_input_ids,
all_attention_masks,
all_token_type_ids,
all_start_labels,
all_end_labels,
all_cls_index,
all_p_mask,
all_is_impossible,
all_example_indexes,
all_feature_indexes
)
else:
dataset = TensorDataset(
all_input_ids,
all_attention_masks,
all_token_type_ids,
all_cls_index,
all_p_mask,
all_example_indexes,
all_feature_indexes
)
return dataset
def _is_whitespace(c):
if c == " " or c == "t" or c == "r" or c == "n" or ord(c) == 0x202F:
return True
return False
def _new_check_is_max_context(doc_spans, cur_span_index, position):
"""
Check if this is the 'max context' doc span for the token.
"""
# if len(doc_spans) == 1:
# return True
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span["start"] doc_span["length"] - 1
if position < doc_span["start"]:
continue
if position > end:
continue
num_left_context = position - doc_span["start"]
num_right_context = end - position
score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
"""
Returns tokenized answer spans that better match the annotated answer.
"""
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start: (new_end 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def customize_tokenizer(text: str, do_lower_case=True) -> List[str]:
temp_x = ""
for char in text:
# insert a space on both sides of special characters
if _is_chinese_char(ord(char)) or _is_punctuation(char) or _is_whitespace(char) or _is_control(char):
temp_x += " " + char + " "
else:
temp_x += char
# optionally convert uppercase letters to lowercase
if do_lower_case:
temp_x = temp_x.lower()
return temp_x.split()  # split on whitespace here
def _is_chinese_char(cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
def whitespace_tokenize(text: str):
if text is None:
return []
text = text.strip()
tokens = text.split()
return tokens
def write_example_orig_file(examples: List[CAILExample], file: str):
"""
convert examples to original json file
"""
data_list = []
for example in examples:
data = {
"paragraphs": [
{
"context": example.context_text,
"casename": example.case_name,
"qas": [
{
"question": example.question_text,
"answers": example.answers,
"id": example.qas_id,
"is_impossible": "true" if example.is_impossible else "false",
}
]
}
],
"caseid": example.case_id
}
data_list.append(data)
final_data = {
"data": data_list,
"version": "1.0"
}
with open(file, mode="w", encoding="utf-8") as file:
file.write(json.dumps(final_data, ensure_ascii=False))
if __name__ == '__main__':
data_file = 'data_sample/cail2021_mrc_small.json'
examples = read_examples(data_file, is_training=True)
tokenizer = BertTokenizer.from_pretrained('model_hub/chinese-bert-wwm-ext/')
# example = examples[3]
# print(example)
# print(len(example.doc_tokens))
# convert_single_example_to_features(
# example=example,
# tokenizer=tokenizer,
# max_seq_length=512,
# max_query_length=64,
# doc_stride=128,
# is_training=True
# )
class Args:
max_seq_length = 512
max_query_length = 64
doc_stride = 128
args = Args()
feature_lists = convert_examples_to_features(
examples,
tokenizer,
args,
is_training=True,
)
print(feature_lists[0])
datasets = convert_features_to_dataset(feature_lists, is_training=True)
print(datasets[0])
# for ex in examples:
# print(ex)
# context_text = "我nt爱北京nt15826458891天安门"
# all_doc_tokens = customize_tokenizer(context_text, True)
#
# k = 0
# temp_word = ""
# doc_tokens = []
# char_to_word_offset = []
# print(context_text)
# print(all_doc_tokens)
# for char in context_text:
# if _is_whitespace(char):
# char_to_word_offset.append(k - 1)
# continue
# else:
# temp_word += char
# char_to_word_offset.append(k)
# if temp_word.lower() == all_doc_tokens[k]:
# doc_tokens.append(temp_word)
# temp_word = ""
# k += 1
# print(k)
# print(doc_tokens)
# print(char_to_word_offset)
#
# tok_to_orig_index = []
# orig_to_tok_index = []
# all_doc_tokens = []
# for (i, token) in enumerate(doc_tokens):
# orig_to_tok_index.append(len(all_doc_tokens))
# sub_tokens = tokenizer.tokenize(token)
# for sub_token in sub_tokens:
# tok_to_orig_index.append(i)
# all_doc_tokens.append(sub_token)
#
# print(orig_to_tok_index)
# print(tok_to_orig_index)
# print(all_doc_tokens)
Note that the text goes through three realignment stages in total (a short sketch follows the list):
- Stage 1: the text is first roughly tokenized; this step strips special symbols, whitespace, and the like, so the answer start positions must be recalibrated (char_to_word_offset).
- Stage 2: each character (word) is then tokenized by the tokenizer; since WordPiece changes the sequence length, the answer positions must be recalibrated again (orig_to_tok_index / tok_to_orig_index).
- Stage 3: the question and the context are concatenated and a sliding window is applied, so the answer positions within each window must be recalibrated once more.
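To make the three stages concrete, here is a minimal, self-contained sketch. The context string, the answer, and the question are made up for illustration, the bert-base-chinese checkpoint is only an assumption (any Chinese WordPiece vocabulary would do), and Stage 1 is simplified to one token per non-whitespace character rather than the full customize_tokenizer logic:

from transformers import BertTokenizer

context = "我\n\t爱北京15826458891天安门"
answer = "北京"

# Stage 1: raw characters -> doc_tokens, recording char_to_word_offset
doc_tokens, char_to_word_offset, k = [], [], 0
for ch in context:
    if ch in " \t\r\n":
        char_to_word_offset.append(k - 1)
        continue
    char_to_word_offset.append(k)
    doc_tokens.append(ch)   # simplified: one token per non-whitespace character
    k += 1
offset = context.index(answer)
start1 = char_to_word_offset[offset]
end1 = char_to_word_offset[offset + len(answer) - 1]

# Stage 2: doc_tokens -> WordPiece sub-tokens, recording orig_to_tok_index
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
orig_to_tok_index, tok_to_orig_index, all_doc_tokens = [], [], []
for i, tok in enumerate(doc_tokens):
    orig_to_tok_index.append(len(all_doc_tokens))
    for sub in tokenizer.tokenize(tok):
        tok_to_orig_index.append(i)
        all_doc_tokens.append(sub)
start2, end2 = orig_to_tok_index[start1], orig_to_tok_index[end1]

# Stage 3: shift into one [CLS] [unused1] [unused2] question [SEP] context [SEP] window
question_len = len(tokenizer.tokenize("阮x4在哪里投保?")) + 2  # + the two unused tokens
doc_offset = question_len + 2                                  # + [CLS] and the first [SEP]
doc_start = 0                                                  # first sliding window
print(start2 - doc_start + doc_offset, end2 - doc_start + doc_offset)

A printed CAILFeature from the __main__ block above looks like this: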
<input_ids: [101, 1, 2, 7342, 12124, 1762, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 1905, 2832, 749, 784, 720, 924, 7372, 8043, 102, 9595, 119, 125, 1039, 132, 124, 119, 1161, 808, 7342, 12124, 510, 3342, 10871, 3118, 802, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 2526, 2360, 6589, 11960, 8129, 1039, 132, 125, 119, 7342, 12124, 510, 3342, 10871, 2824, 2857, 3315, 3428, 4638, 6401, 6390, 6589, 4500, 752, 2141, 680, 4415, 4507, 131, 8138, 2399, 8110, 3299, 124, 3189, 117, 7342, 12124, 6206, 3724, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 711, 1071, 1762, 704, 1744, 1093, 689, 7213, 6121, 5500, 819, 3300, 7361, 1062, 1385, 2128, 2551, 4689, 1146, 6121, 5852, 689, 6956, 113, 809, 678, 5042, 4917, 1093, 6121, 2128, 2551, 4689, 1146, 6121, 114, 4638, 8416, 9086, 1039, 6587, 3621, 2990, 897, 702, 782, 3867, 6589, 928, 6587, 924, 6395, 924, 7372, 117, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 2970, 1358, 2832, 924, 2400, 5041, 1355, 924, 7372, 1296, 113, 924, 1296, 5356, 1384, 131, 8752, 9723, 9131, 8756, 9086, 11906, 9446, 8311, 8152, 114, 117, 924, 6589, 5373, 5287, 3175, 2466, 711, 3309, 5373, 117, 3680, 3309, 113, 3299, 114, 9649, 8158, 1039, 117, 924, 7372, 3309, 7313, 5632, 702, 782, 3867, 6589, 928, 6587, 1394, 1398, 7555, 678, 6587, 3621, 1355, 3123, 722, 3189, 6629, 5635, 3926, 985, 1059, 6956, 6587, 3621, 3315, 2622, 722, 3189, 3632, 6421, 924, 1296, 5276, 2137, 117, 7342, 12124, 2870, 3612, 818, 862, 671, 3309, 6587, 3621, 6809, 1168, 8188, 1921, 4638, 117, 6228, 711, 924, 7372, 752, 3125, 1355, 4495, 117, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 898, 2945, 924, 7372, 1394, 1398, 5276, 2137, 2190, 1093, 6121, 2128, 2551, 4689, 1146, 6121, 6822, 6121, 6608, 985, 132, 794, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 6608, 985, 2496, 3189, 6629, 2458, 1993, 6631, 6814, 8114, 1921, 117, 2832, 924, 782, 793, 3313, 1403, 924, 7372, 782, 2495, 6820, 1059, 6956, 6608, 985, 3621, 7555, 4638, 117, 1156, 6228, 711, 2832, 924, 782, 6824, 5276, 117, 2832, 924, 782, 7444, 809, 2213, 3612, 1059, 6956, 3621, 7555, 711, 1825, 3144, 117, 794, 924, 7372, 782, 6608, 985, 2496, 3189, 2458, 1993, 6369, 5050, 117, 2902, 3680, 3189, 1283, 1146, 722, 671, 3403, 1114, 117, 1403, 924, 7372, 782, 5373, 5287, 6824, 5276, 7032, 7342, 12124, 2832, 924, 1400, 117, 794, 1093, 6121, 2128, 2551, 4689, 1146, 6121, 5815, 2533, 2207, 7583, 3867, 6589, 6587, 3621, 1066, 8416, 9086, 1039, 117, 6587, 3621, 4500, 6854, 711, 3189, 2382, 4495, 3833, 3867, 6589, 117, 955, 3621, 3309, 7361, 711, 8216, 702, 3299, 117, 2902, 3299, 5023, 7583, 3315, 2622, 6820, 3621, 1400, 1728, 7342, 12124, 3313, 2130, 1059, 2252, 6121, 1394, 1398, 5276, 2137, 4638, 6820, 3621, 721, 1218, 117, 1093, 6121, 2128, 2551, 4689, 1146, 6121, 754, 8119, 2399, 123, 3299, 8132, 3189, 1403, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 4509, 6435, 5164, 6608, 117, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 102]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
token_type_ids: [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
cls_index: 0
p_mask: [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
example_index: 4
unique_id: 7
paragraph_len: 487
* token_is_max_context: {0: True, 1: True, 2: True, 24: False, 25: False, 26: False, 27: False, 28: False, 29: False, 30: False, 31: False, 32: False, 33: False, 34: False, 35: False, 36: False, 37: False, 38: False, 39: False, 40: False, 41: False, 42: False, 43: False, 44: False, 45: False, 46: False, 47: False, 48: False, 49: False, 50: False, 51: False, 52: False, 53: False, 54: False, 55: False, 56: False, 57: False, 58: False, 59: False, 60: False, 61: False, 62: False, 63: False, 64: False, 65: False, 66: False, 67: False, 68: False, 69: False, 70: False, 71: False, 72: False, 73: False, 74: False, 75: False, 76: False, 77: False, 78: False, 79: False, 80: False, 81: False, 82: False, 83: False, 84: False, 85: False, 86: False, 87: False, 88: False, 89: False, 90: False, 91: False, 92: False, 93: False, 94: False, 95: False, 96: False, 97: False, 98: False, 99: False, 100: False, 101: False, 102: False, 103: False, 104: False, 105: False, 106: False, 107: False, 108: False, 109: False, 110: False, 111: False, 112: False, 113: False, 114: False, 115: False, 116: False, 117: False, 118: False, 119: False, 120: False, 121: False, 122: False, 123: False, 124: False, 125: False, 126: False, 127: False, 128: False, 129: False, 130: False, 131: False, 132: False, 133: False, 134: False, 135: False, 136: False, 137: False, 138: False, 139: False, 140: False, 141: False, 142: False, 143: False, 144: False, 145: False, 146: False, 147: False, 148: False, 149: False, 150: False, 151: False, 152: False, 153: False, 154: False, 155: False, 156: False, 157: False, 158: False, 159: False, 160: False, 161: False, 162: False, 163: False, 164: False, 165: False, 166: False, 167: False, 168: False, 169: False, 170: False, 171: False, 172: False, 173: False, 174: False, 175: False, 176: False, 177: False, 178: False, 179: False, 180: False, 181: False, 182: False, 183: False, 184: False, 185: False, 186: False, 187: False, 188: False, 189: False, 190: False, 191: False, 192: False, 193: False, 194: False, 195: False, 196: False, 197: False, 198: False, 199: False, 200: False, 201: False, 202: False, 203: False, 204: True, 205: True, 206: True, 207: True, 208: True, 209: True, 210: True, 211: True, 212: True, 213: True, 214: True, 215: True, 216: True, 217: True, 218: True, 219: True, 220: True, 221: True, 222: True, 223: True, 224: True, 225: True, 226: True, 227: True, 228: True, 229: True, 230: True, 231: True, 232: True, 233: True, 234: True, 235: True, 236: True, 237: True, 238: True, 239: True, 240: True, 241: True, 242: True, 243: True, 244: True, 245: True, 246: True, 247: True, 248: True, 249: True, 250: True, 251: True, 252: True, 253: True, 254: True, 255: True, 256: True, 257: True, 258: True, 259: True, 260: True, 261: True, 262: True, 263: True, 264: True, 265: True, 266: True, 267: True, 268: True, 269: True, 270: True, 271: True, 272: True, 273: True, 274: True, 275: True, 276: True, 277: True, 278: True, 279: True, 280: True, 281: True, 282: True, 283: True, 284: True, 285: True, 286: True, 287: True, 288: True, 289: True, 290: True, 291: True, 292: True, 293: True, 294: True, 295: True, 296: True, 297: True, 298: True, 299: True, 300: True, 301: True, 302: True, 303: True, 304: True, 305: True, 306: True, 307: True, 308: True, 309: True, 310: True, 311: True, 312: True, 313: True, 314: True, 315: True, 316: True, 317: True, 318: True, 319: True, 320: True, 321: True, 322: True, 323: True, 324: True, 325: True, 326: True, 327: True, 328: True, 329: True, 330: True, 331: True, 332: 
False, 333: False, 334: False, 335: False, 336: False, 337: False, 338: False, 339: False, 340: False, 341: False, 342: False, 343: False, 344: False, 345: False, 346: False, 347: False, 348: False, 349: False, 350: False, 351: False, 352: False, 353: False, 354: False, 355: False, 356: False, 357: False, 358: False, 359: False, 360: False, 361: False, 362: False, 363: False, 364: False, 365: False, 366: False, 367: False, 368: False, 369: False, 370: False, 371: False, 372: False, 373: False, 374: False, 375: False, 376: False, 377: False, 378: False, 379: False, 380: False, 381: False, 382: False, 383: False, 384: False, 385: False, 386: False, 387: False, 388: False, 389: False, 390: False, 391: False, 392: False, 393: False, 394: False, 395: False, 396: False, 397: False, 398: False, 399: False, 400: False, 401: False, 402: False, 403: False, 404: False, 405: False, 406: False, 407: False, 408: False, 409: False, 410: False, 411: False, 412: False, 413: False, 414: False, 415: False, 416: False, 417: False, 418: False, 419: False, 420: False, 421: False, 422: False, 423: False, 424: False, 425: False, 426: False, 427: False, 428: False, 429: False, 430: False, 431: False, 432: False, 433: False, 434: False, 435: False, 436: False, 437: False, 438: False, 439: False, 440: False, 441: False, 442: False, 443: False, 444: False, 445: False, 446: False, 447: False, 448: False, 449: False, 450: False, 451: False, 452: False, 453: False, 454: False, 455: False, 456: False, 457: False, 458: False, 459: False, 460: False, 461: False, 462: False, 463: False, 464: False, 465: False, 466: False, 467: False, 468: False, 469: False, 470: False, 471: False, 472: False, 473: False, 474: False, 475: False, 476: False, 477: False, 478: False, 479: False, 480: False, 481: False, 482: False, 483: False, 484: False, 485: False, 486: False, 487: False, 488: False, 489: False, 490: False, 491: False, 492: False, 493: False, 494: False, 495: False, 496: False, 497: False, 498: False, 499: False, 500: False, 501: False, 502: False, 503: False, 504: False, 505: False, 506: False, 507: False, 508: False, 509: False, 510: False}
tokens: ['[CLS]', '[unused1]', '[unused2]', '阮', 'x4', '在', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '处', '投', '了', '什', '么', '保', '险', '?', '[SEP]', '##92', '.', '4', '元', ';', '3', '.', '判', '令', '阮', 'x4', '、', '杨', 'x5', '支', '付', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '律', '师', '费', '690', '##0', '元', ';', '4', '.', '阮', 'x4', '、', '杨', 'x5', '承', '担', '本', '案', '的', '诉', '讼', '费', '用', '事', '实', '与', '理', '由', ':', '2013', '年', '12', '月', '3', '日', ',', '阮', 'x4', '要', '求', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '为', '其', '在', '中', '国', '农', '业', '银', '行', '股', '份', '有', '限', '公', '司', '安', '徽', '省', '分', '行', '营', '业', '部', '(', '以', '下', '简', '称', '农', '行', '安', '徽', '省', '分', '行', ')', '的', '94', '##000', '元', '贷', '款', '提', '供', '个', '人', '消', '费', '信', '贷', '保', '证', '保', '险', ',', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '接', '受', '投', '保', '并', '签', '发', '保', '险', '单', '(', '保', '单', '编', '号', ':', '125', '##94', '##07', '##26', '##000', '##010', '##87', '##10', '##3', ')', ',', '保', '费', '缴', '纳', '方', '式', '为', '期', '缴', ',', '每', '期', '(', '月', ')', '178', '##6', '元', ',', '保', '险', '期', '间', '自', '个', '人', '消', '费', '信', '贷', '合', '同', '项', '下', '贷', '款', '发', '放', '之', '日', '起', '至', '清', '偿', '全', '部', '贷', '款', '本', '息', '之', '日', '止', '该', '保', '单', '约', '定', ',', '阮', 'x4', '拖', '欠', '任', '何', '一', '期', '贷', '款', '达', '到', '80', '天', '的', ',', '视', '为', '保', '险', '事', '故', '发', '生', ',', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '依', '据', '保', '险', '合', '同', '约', '定', '对', '农', '行', '安', '徽', '省', '分', '行', '进', '行', '赔', '偿', ';', '从', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '赔', '偿', '当', '日', '起', '开', '始', '超', '过', '30', '天', ',', '投', '保', '人', '仍', '未', '向', '保', '险', '人', '归', '还', '全', '部', '赔', '偿', '款', '项', '的', ',', '则', '视', '为', '投', '保', '人', '违', '约', ',', '投', '保', '人', '需', '以', '尚', '欠', '全', '部', '款', '项', '为', '基', '数', ',', '从', '保', '险', '人', '赔', '偿', '当', '日', '开', '始', '计', '算', ',', '按', '每', '日', '千', '分', '之', '一', '标', '准', ',', '向', '保', '险', '人', '缴', '纳', '违', '约', '金', '阮', 'x4', '投', '保', '后', ',', '从', '农', '行', '安', '徽', '省', '分', '行', '获', '得', '小', '额', '消', '费', '贷', '款', '共', '94', '##000', '元', ',', '贷', '款', '用', '途', '为', '日', '常', '生', '活', '消', '费', ',', '借', '款', '期', '限', '为', '36', '个', '月', ',', '按', '月', '等', '额', '本', '息', '还', '款', '后', '因', '阮', 'x4', '未', '完', '全', '履', '行', '合', '同', '约', '定', '的', '还', '款', '义', '务', ',', '农', '行', '安', '徽', '省', '分', '行', '于', '2015', '年', '2', '月', '25', '日', '向', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '申', '请', '索', '赔', ',', '平', '安', '财', '险', '安', '徽', '分', '[SEP]']
token_to_orig_map: {0: -1, 1: -1, 2: -1, 24: 124, 25: 125, 26: 126, 27: 127, 28: 128, 29: 129, 30: 130, 31: 131, 32: 132, 33: 133, 34: 134, 35: 135, 36: 136, 37: 137, 38: 138, 39: 139, 40: 140, 41: 141, 42: 142, 43: 143, 44: 144, 45: 145, 46: 146, 47: 147, 48: 148, 49: 149, 50: 150, 51: 151, 52: 152, 53: 152, 54: 153, 55: 154, 56: 155, 57: 156, 58: 157, 59: 158, 60: 159, 61: 160, 62: 161, 63: 162, 64: 163, 65: 164, 66: 165, 67: 166, 68: 167, 69: 168, 70: 169, 71: 170, 72: 171, 73: 172, 74: 173, 75: 174, 76: 175, 77: 176, 78: 177, 79: 178, 80: 179, 81: 180, 82: 181, 83: 182, 84: 183, 85: 184, 86: 185, 87: 186, 88: 187, 89: 188, 90: 189, 91: 190, 92: 191, 93: 192, 94: 193, 95: 194, 96: 195, 97: 196, 98: 197, 99: 198, 100: 199, 101: 200, 102: 201, 103: 202, 104: 203, 105: 204, 106: 205, 107: 206, 108: 207, 109: 208, 110: 209, 111: 210, 112: 211, 113: 212, 114: 213, 115: 214, 116: 215, 117: 216, 118: 217, 119: 218, 120: 219, 121: 220, 122: 221, 123: 222, 124: 223, 125: 224, 126: 225, 127: 226, 128: 227, 129: 228, 130: 229, 131: 230, 132: 231, 133: 232, 134: 233, 135: 234, 136: 234, 137: 235, 138: 236, 139: 237, 140: 238, 141: 239, 142: 240, 143: 241, 144: 242, 145: 243, 146: 244, 147: 245, 148: 246, 149: 247, 150: 248, 151: 249, 152: 250, 153: 251, 154: 252, 155: 253, 156: 254, 157: 255, 158: 256, 159: 257, 160: 258, 161: 259, 162: 260, 163: 261, 164: 262, 165: 263, 166: 264, 167: 265, 168: 266, 169: 267, 170: 268, 171: 269, 172: 270, 173: 271, 174: 272, 175: 273, 176: 274, 177: 275, 178: 276, 179: 276, 180: 276, 181: 276, 182: 276, 183: 276, 184: 276, 185: 276, 186: 276, 187: 277, 188: 278, 189: 279, 190: 280, 191: 281, 192: 282, 193: 283, 194: 284, 195: 285, 196: 286, 197: 287, 198: 288, 199: 289, 200: 290, 201: 291, 202: 292, 203: 293, 204: 294, 205: 294, 206: 295, 207: 296, 208: 297, 209: 298, 210: 299, 211: 300, 212: 301, 213: 302, 214: 303, 215: 304, 216: 305, 217: 306, 218: 307, 219: 308, 220: 309, 221: 310, 222: 311, 223: 312, 224: 313, 225: 314, 226: 315, 227: 316, 228: 317, 229: 318, 230: 319, 231: 320, 232: 321, 233: 322, 234: 323, 235: 324, 236: 325, 237: 326, 238: 327, 239: 328, 240: 329, 241: 330, 242: 331, 243: 332, 244: 333, 245: 334, 246: 335, 247: 336, 248: 337, 249: 338, 250: 339, 251: 340, 252: 341, 253: 342, 254: 343, 255: 344, 256: 345, 257: 346, 258: 347, 259: 348, 260: 349, 261: 350, 262: 351, 263: 352, 264: 353, 265: 354, 266: 355, 267: 356, 268: 357, 269: 358, 270: 359, 271: 360, 272: 361, 273: 362, 274: 363, 275: 364, 276: 365, 277: 366, 278: 367, 279: 368, 280: 369, 281: 370, 282: 371, 283: 372, 284: 373, 285: 374, 286: 375, 287: 376, 288: 377, 289: 378, 290: 379, 291: 380, 292: 381, 293: 382, 294: 383, 295: 384, 296: 385, 297: 386, 298: 387, 299: 388, 300: 389, 301: 390, 302: 391, 303: 392, 304: 393, 305: 394, 306: 395, 307: 396, 308: 397, 309: 398, 310: 399, 311: 400, 312: 401, 313: 402, 314: 403, 315: 404, 316: 405, 317: 406, 318: 407, 319: 408, 320: 409, 321: 410, 322: 411, 323: 412, 324: 413, 325: 414, 326: 415, 327: 416, 328: 417, 329: 418, 330: 419, 331: 420, 332: 421, 333: 422, 334: 423, 335: 424, 336: 425, 337: 426, 338: 427, 339: 428, 340: 429, 341: 430, 342: 431, 343: 432, 344: 433, 345: 434, 346: 435, 347: 436, 348: 437, 349: 438, 350: 439, 351: 440, 352: 441, 353: 442, 354: 443, 355: 444, 356: 445, 357: 446, 358: 447, 359: 448, 360: 449, 361: 450, 362: 451, 363: 452, 364: 453, 365: 454, 366: 455, 367: 456, 368: 457, 369: 458, 370: 459, 371: 460, 372: 461, 373: 462, 374: 463, 375: 464, 376: 465, 377: 466, 378: 467, 379: 468, 380: 469, 381: 470, 382: 471, 
383: 472, 384: 473, 385: 474, 386: 475, 387: 476, 388: 477, 389: 478, 390: 479, 391: 480, 392: 481, 393: 482, 394: 483, 395: 484, 396: 485, 397: 486, 398: 487, 399: 488, 400: 489, 401: 490, 402: 491, 403: 492, 404: 493, 405: 494, 406: 495, 407: 496, 408: 497, 409: 498, 410: 499, 411: 500, 412: 501, 413: 502, 414: 503, 415: 504, 416: 505, 417: 506, 418: 507, 419: 508, 420: 509, 421: 510, 422: 511, 423: 512, 424: 512, 425: 513, 426: 514, 427: 515, 428: 516, 429: 517, 430: 518, 431: 519, 432: 520, 433: 521, 434: 522, 435: 523, 436: 524, 437: 525, 438: 526, 439: 527, 440: 528, 441: 529, 442: 530, 443: 531, 444: 532, 445: 533, 446: 534, 447: 535, 448: 536, 449: 537, 450: 538, 451: 539, 452: 540, 453: 541, 454: 542, 455: 543, 456: 544, 457: 545, 458: 546, 459: 547, 460: 548, 461: 549, 462: 550, 463: 551, 464: 552, 465: 553, 466: 554, 467: 555, 468: 556, 469: 557, 470: 558, 471: 559, 472: 560, 473: 561, 474: 562, 475: 563, 476: 564, 477: 565, 478: 566, 479: 567, 480: 568, 481: 569, 482: 570, 483: 571, 484: 572, 485: 573, 486: 574, 487: 575, 488: 576, 489: 577, 490: 578, 491: 579, 492: 580, 493: 581, 494: 582, 495: 583, 496: 584, 497: 585, 498: 586, 499: 587, 500: 588, 501: 589, 502: 590, 503: 591, 504: 592, 505: 593, 506: 594, 507: 595, 508: 596, 509: 597, 510: 598}
start_positions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
end_positions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
is_impossible: False
>
Finally, pay attention to how the yes/no, single-span, and multi-span answers are unified under one labeling scheme (see the sketch below), and how the token_type_ids of the input are set. Lastly, there is a processing script that saves the processed data so it does not have to be regenerated, and time wasted, on every run.
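As a quick illustration of that unified labeling, here is a minimal sketch of how the start/end label vectors are built as multi-hot vectors over the sequence; max_seq_length and the span positions are made-up values, not taken from the real features:

# Minimal sketch of the unified start/end labels; all positions are illustrative.
max_seq_length = 16
cls_index, yes_index, no_index = 0, 1, 2   # [CLS], [unused1], [unused2]

def build_labels(answer_spans, is_yes, is_no):
    """answer_spans: list of (start, end) token positions inside this window."""
    start = [0] * max_seq_length
    end = [0] * max_seq_length
    for s, e in answer_spans:            # single span -> one 1; multi-span -> several 1s
        start[s], end[e] = 1, 1
    if is_yes:                           # yes/no answers live on the two unused tokens
        start[yes_index], end[yes_index] = 1, 1
    if is_no:
        start[no_index], end[no_index] = 1, 1
    if 1 not in start:                   # unanswerable -> point at [CLS]
        start[cls_index], end[cls_index] = 1, 1
    return start, end

print(build_labels([(5, 7), (10, 11)], False, False))  # multi-span
print(build_labels([], True, False))                   # YES
print(build_labels([], False, False))                  # no answer

The following script then caches the processed examples and features to disk: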
"""
Data processing code
"""
import argparse
import json
from transformers import PreTrainedTokenizer, BertTokenizer
from data_process_utils import *
import gzip
import pickle
import os
from os.path import join
import logging
def convert_and_write(args, tokenizer: PreTrainedTokenizer, file, examples_fn, features_fn, is_training):
logging.info(f"Reading examples from :{file} ...")
example_list = read_examples(file, is_training=is_training)
logging.info(f"Total examples:{len(example_list)}")
logging.info(f"Start converting examples to features.")
feature_list = convert_examples_to_features(example_list, tokenizer, args, is_training)
logging.info(f"Total features:{len(feature_list)}")
logging.info(f"Converting complete, writing examples and features to file.")
with gzip.open(join(args.output_path, examples_fn), "wb") as file:
pickle.dump(example_list, file)
with gzip.open(join(args.output_path, features_fn), "wb") as file:
pickle.dump(feature_list, file)
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_file",
type=str,
required=True,
help="The file to be processed."
)
parser.add_argument(
"--for_training",
action="store_true",
help="Process for training or not."
)
parser.add_argument(
"--output_prefix",
type=str,
required=True,
help="The prefix of output file's name."
)
parser.add_argument(
"--do_lower_case",
action="store_true",
help="Set this flag if you are using an uncased model."
)
parser.add_argument(
"--tokenizer_path",
type=str,
required=True,
help="Path to tokenizer which will be used to tokenize text.(ElectraTokenizer)"
)
parser.add_argument(
"--max_seq_length",
default=512,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. "
"Longer will be truncated, and shorter will be padded."
)
parser.add_argument(
"--max_query_length",
default=64,
type=int,
help="The maximum number of tokens for the question. Questions longer will be truncated to the length."
)
parser.add_argument(
"--doc_stride",
default=128,
type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks."
)
parser.add_argument(
"--output_path",
default="./processed_data/",
type=str,
help="Output path of the constructed examples and features."
)
args = parser.parse_args()
args.max_query_length += 2  # positions for the yes and no tokens
logging.basicConfig(
format="%(asctime)s - %(levelname)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO,
)
logging.info("All input parameters:")
print(json.dumps(vars(args), sort_keys=False, indent=2))
tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path)
if not os.path.exists(args.output_path):
os.makedirs(args.output_path)
convert_and_write(args, tokenizer, args.input_file, args.output_prefix + "_examples.pkl.gz",
args.output_prefix + "_features.pkl.gz", args.for_training)
if __name__ == "__main__":
main()
Run command:
python data_process.py --input_file data_sample/cail2021_mrc_small.json --output_prefix cail2021_mrc_small --tokenizer_path model_hub/chinese-bert-wwm-ext --max_seq_length 512 --max_query_length 64 --doc_stride 128 --do_lower_case --for_training
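Once the script has run, the cached examples and features can be loaded back without re-processing. A minimal sketch, assuming the default --output_path and the --output_prefix used above (the load_cached helper itself is not part of the original code):

import gzip
import pickle
from os.path import join

def load_cached(output_path, prefix):
    # Load the gzip-pickled examples and features written by convert_and_write.
    with gzip.open(join(output_path, prefix + "_examples.pkl.gz"), "rb") as f:
        examples = pickle.load(f)
    with gzip.open(join(output_path, prefix + "_features.pkl.gz"), "rb") as f:
        features = pickle.load(f)
    return examples, features

examples, features = load_cached("./processed_data/", "cail2021_mrc_small")
print(len(examples), len(features))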