Code repository: https://github.com/china-ai-law-challenge/CAIL2021/
# /*
# * @Author: Yue.Fan
# * @Date: 2022-03-23 11:35:28
# * @Last Modified by: Yue.Fan
# * @Last Modified time: 2022-03-23 11:35:28
# */
import logging
from dataclasses import dataclass
from typing import List, Dict
import json
from tqdm import tqdm
from transformers import PreTrainedTokenizer, BasicTokenizer, BertTokenizer
from transformers.tokenization_utils import _is_whitespace, _is_punctuation, _is_control
import numpy as np
import torch
from torch.utils.data import Dataset, TensorDataset
YES_TOKEN = "[unused1]"
NO_TOKEN = "[unused2]"
class CAILExample:
def __init__(self,
qas_id: str,
question_text: str,
context_text: str,
answer_texts: List[str],
answer_start_indexes: List[int],
is_impossible: bool,
is_yes_no: bool,
is_multi_span: bool,
answers: List,
case_id: str,
case_name: str):
self.qas_id = qas_id  # unique id of each question
self.question_text = question_text  # question text
self.context_text = context_text  # context text
self.answer_texts = answer_texts  # list of answer texts
self.answer_start_indexes = answer_start_indexes  # list of answer start positions
self.is_impossible = is_impossible  # whether the question is unanswerable
self.is_yes_no = is_yes_no  # whether it is a yes/no question
self.is_multi_span = is_multi_span  # whether the answer consists of multiple spans
self.answers = answers  # raw, unprocessed answer list
self.case_id = case_id  # unique case id of each context
self.case_name = case_name  # case type
self.doc_tokens = []
self.char_to_word_offset = []
raw_doc_tokens = customize_tokenizer(context_text, True)  # preliminary tokenization
k = 0
temp_word = ""
# Some contexts contain spaces, line breaks, etc., which would shift the answer offsets after BERT tokenization.
# char_to_word_offset compensates for this. For example:
"""
我\n\t爱北京\n\t天安门
['我', '爱', '北', '京', '天', '安', '门']
[0, 0, 0, 1, 2, 3, 3, 3, 4, 5, 6]
The two extra 0s between 0 and 1 indicate two whitespace characters between 我 and 爱.
"""
for char in self.context_text:
if _is_whitespace(char):
self.char_to_word_offset.append(k - 1)
continue
else:
temp_word += char
self.char_to_word_offset.append(k)
if temp_word.lower() == raw_doc_tokens[k]:
self.doc_tokens.append(temp_word)
temp_word = ""
k += 1
assert k == len(raw_doc_tokens)
if answer_texts is not None: # if for training
start_positions = []
end_positions = []
if not is_impossible and not is_yes_no:
for i in range(len(answer_texts)):
# Using the example above again:
# "北京" starts at position 4 in the raw text
answer_offset = context_text.index(answer_texts[i])  # calling index() directly here is debatable
# answer_offset = answer_start_indexes[i]
answer_length = len(answer_texts[i])
start_position = self.char_to_word_offset[answer_offset]  # corresponding position within doc_tokens
end_position = self.char_to_word_offset[answer_offset + answer_length - 1]
start_positions.append(start_position)  # actual start position
end_positions.append(end_position)  # actual end position
else:
start_positions.append(-1)  # no answer: set to -1
end_positions.append(-1)  # no answer: set to -1
self.start_positions = start_positions
self.end_positions = end_positions
def __repr__(self):
string = ""
for key, value in self.__dict__.items():
string = f"{key}: {value}n"
# return f"<{self.__class__}>"
return string
@dataclass
class CAILFeature:
input_ids: List[int]
attention_mask: List[int]
token_type_ids: List[int]
cls_index: int
p_mask: List
example_index: int
unique_id: int
paragraph_len: int
token_is_max_context: object
tokens: List
token_to_orig_map: Dict
start_positions: List[int]
end_positions: List[int]
is_impossible: bool
@dataclass
class CAILResult:
unique_id: int
start_logits: torch.Tensor
end_logits: torch.Tensor
def read_examples(file: str, is_training: bool) -> List[CAILExample]:
example_list = []
with open(file, "r", encoding="utf-8") as file:
original_data = json.load(file)["data"]
for entry in tqdm(original_data):
case_id = entry["caseid"]
for paragraph in entry["paragraphs"]:
context = paragraph["context"]
case_name = paragraph["casename"]
for qa in paragraph["qas"]:
question = qa["question"]
qas_id = qa["id"]
answer_texts = None
answer_starts = None
is_impossible = None
is_yes_no = None
is_multi_span = None
all_answers = None
# CAIL2021 covers the following answer types: single-span, yes/no, and unanswerable; compared with
# earlier editions it additionally introduces the multi-span type, i.e. an answer composed of several spans.
if is_training:
all_answers = qa["answers"]
# all_answers == [] means there is no answer
if len(all_answers) == 0:
answer = []
else:
# otherwise take the first one
answer = all_answers[0]
# a little difference between 19 and 21 data.
# if it is a dict, wrap it in a list
if type(answer) == dict:
answer = [answer]
# no answer: initialize the answer text as "" and the start position as -1
if len(answer) == 0: # NO Answer
answer_texts = [""]
answer_starts = [-1]
else:
# otherwise collect the answers here
answer_texts = []
answer_starts = []
# a single span yields one entry;
# multiple spans are iterated over
for a in answer:
answer_texts.append(a["text"])
answer_starts.append(a["answer_start"])
# Judge YES or NO
# determine whether this is a yes/no question and set the flag
if len(answer_texts) == 1 and answer_starts[0] == -1 and (
answer_texts[0] == "YES" or answer_texts[0] == "NO"):
is_yes_no = True
else:
is_yes_no = False
# Judge Multi Span
# determine whether the answer consists of multiple spans
if len(answer_texts) > 1:
is_multi_span = True
else:
is_multi_span = False
# Judge No Answer
# mark unanswerable questions as follows
if len(answer_texts) == 1 and answer_texts[0] == "":
is_impossible = True
else:
is_impossible = False
example = CAILExample(
qas_id=qas_id,
question_text=question,
context_text=context,
answer_texts=answer_texts,
answer_start_indexes=answer_starts,
is_impossible=is_impossible,
is_yes_no=is_yes_no,
is_multi_span=is_multi_span,
answers=all_answers,
case_id=case_id,
case_name=case_name
)
# Discard possible bad example
if is_training and example.answer_start_indexes[0] >= 0:
for i in range(len(example.answer_texts)):
actual_text = "".join(
example.doc_tokens[example.start_positions[i]: (example.end_positions[i] + 1)])
cleaned_answer_text = "".join(whitespace_tokenize(example.answer_texts[i]))
if actual_text.find(cleaned_answer_text) == -1:
logging.info(f"Could not find answer: {actual_text} vs. {cleaned_answer_text}")
continue
example_list.append(example)
return example_list
def convert_examples_to_features(example_list: List[CAILExample], tokenizer: PreTrainedTokenizer, args,
is_training: bool) -> List[CAILFeature]:
# Validate there are no duplicate ids in example_list
qas_id_set = set()
for example in example_list:
if example.qas_id in qas_id_set:
raise Exception("Duplicate qas_id!")
else:
qas_id_set.add(example.qas_id)
feature_list = []
unique_id = 0
example_index = 0
i = 0
for example in tqdm(example_list):
i += 1
# if i % 100 == 0:
# print(i)
current_example_features = convert_single_example_to_features(example, tokenizer, args.max_seq_length,
args.max_query_length, args.doc_stride,
is_training)
for feature in current_example_features:
feature.example_index = example_index
feature.unique_id = unique_id
unique_id += 1
example_index += 1
feature_list.extend(current_example_features)
return feature_list
def convert_single_example_to_features(example: CAILExample, tokenizer: PreTrainedTokenizer,
max_seq_length, max_query_length, doc_stride, is_training) -> List[CAILFeature]:
"""
Convert the original text into a sequence that the model (ELECTRA/BERT-style) can accept.
Format: [CLS] YES_TOKEN NO_TOKEN question [SEP] context [SEP]
"""
features = []
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
"""
['我', '爱', '北', '京', '15826458891', '天', '安', '门']
orig_to_tok_index:[0, 1, 2, 3, 4, 9, 10, 11]
tok_to_orig_index:[0, 1, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7]
all_doc_tokens:['我', '爱', '北', '京', '158', '##26', '##45', '##88', '##91', '天', '安', '门']
"""
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)  # further split each token with WordPiece
for sub_token in sub_tokens:
tok_to_orig_index.append(i)  # every sub_token maps back to the same original index i
all_doc_tokens.append(sub_token)
if is_training:
if example.is_impossible or example.answer_start_indexes[0] == -1:
start_positions = [-1]
end_positions = [-1]
else:
start_positions = []
end_positions = []
# recalibrate the positions after tokenization
for i in range(len(example.start_positions)):
start_position = orig_to_tok_index[example.start_positions[i]]
if example.end_positions[i] < len(example.doc_tokens) - 1:
end_position = orig_to_tok_index[example.end_positions[i] + 1] - 1
else:
end_position = len(all_doc_tokens) - 1
(start_position, end_position) = _improve_answer_span(
all_doc_tokens, start_position, end_position, tokenizer, example.answer_texts[i]
)
start_positions.append(start_position)
end_positions.append(end_position)
else:
start_positions = None
end_positions = None
query_tokens = tokenizer.tokenize(example.question_text)
query_tokens = [YES_TOKEN, NO_TOKEN] + query_tokens  # prepend the yes/no tokens to the question
truncated_query = tokenizer.encode(query_tokens, add_special_tokens=False, max_length=max_query_length,
truncation=True)
sequence_pair_added_tokens = tokenizer.num_special_tokens_to_add(pair=True)
assert sequence_pair_added_tokens == 3
added_tokens_num_before_second_sequence = tokenizer.num_special_tokens_to_add(pair=False)
assert added_tokens_num_before_second_sequence == 2
span_doc_tokens = all_doc_tokens
spans = []
# print("query_tokens:", query_tokens)
# print("all_doc_tokens:", all_doc_tokens)
# print("".join(all_doc_tokens))
# print("start_positions:", start_positions)
# print("end_positions:", end_positions)
# sliding-window approach
while len(spans) * doc_stride < len(all_doc_tokens):
# print(max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,)
# slide the window with a step size of doc_stride
encoded_dict = tokenizer.encode_plus(
truncated_query,
span_doc_tokens,
max_length=max_seq_length,
return_overflowing_tokens=True,
padding="max_length",
stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
truncation="only_second",
return_token_type_ids=True
)
# print(span_doc_tokens)
# print("stride:", max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens)
# print(tokenizer.convert_ids_to_tokens(encoded_dict['input_ids']))
# print(len(encoded_dict['input_ids']))
# print(tokenizer.convert_ids_to_tokens(encoded_dict['overflowing_tokens']))
# actual length of the context portion in this span
paragraph_len = min(
len(all_doc_tokens) - len(spans) * doc_stride,
max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
)
# token ids without [PAD]
if tokenizer.pad_token_id in encoded_dict["input_ids"]:
non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
else:
non_padded_ids = encoded_dict["input_ids"]
# convert the ids back to tokens
tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
token_to_orig_map = {}
token_to_orig_map[0] = -1
token_to_orig_map[1] = -1
token_to_orig_map[2] = -1
token_is_max_context = {0: True, 1: True, 2: True}
for i in range(paragraph_len):
# index of the token within the input [CLS]query[SEP]context[SEP]
index = len(truncated_query) + added_tokens_num_before_second_sequence + i
# tok_to_orig_index is the token's index within the context
# len(spans) tells how many spans have been produced so far
# token_to_orig_map maps index back to the original position i
token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
# print(token_to_orig_map)
encoded_dict["paragraph_len"] = paragraph_len
encoded_dict["tokens"] = tokens
encoded_dict["token_to_orig_map"] = token_to_orig_map
encoded_dict["truncated_query_with_special_tokens_length"] = len(
truncated_query) added_tokens_num_before_second_sequence
encoded_dict["token_is_max_context"] = token_is_max_context
encoded_dict["start"] = len(spans) * doc_stride # 文本的起始索引
encoded_dict["length"] = paragraph_len
# here the token_type_ids of the yes/no marker tokens are set to 1 -- why?
encoded_dict["token_type_ids"][1] = 1
encoded_dict["token_type_ids"][2] = 1
# print(encoded_dict["token_type_ids"])
spans.append(encoded_dict)
if "overflowing_tokens" not in encoded_dict or len(encoded_dict["overflowing_tokens"]) == 0:
break
else:
span_doc_tokens = encoded_dict["overflowing_tokens"]
for doc_span_index in range(len(spans)):
for j in range(spans[doc_span_index]["paragraph_len"]):
is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
index = spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
spans[doc_span_index]["token_is_max_context"][index] = is_max_context
for span in spans:
cls_index = span["input_ids"].index(tokenizer.cls_token_id)
# p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens that can be in an answer)
# p_mask sets the question and [SEP] positions to 1 and everything else to 0
p_mask = np.array(span["token_type_ids"])
p_mask = np.minimum(p_mask, 1)
p_mask = 1 - p_mask
p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1
p_mask[cls_index] = 0
p_mask[1] = 0
p_mask[2] = 0
current_start_positions = None
current_end_positions = None
span_is_impossible = None
if is_training:
current_start_positions = [0 for i in range(max_seq_length)]
current_end_positions = [0 for i in range(max_seq_length)]
doc_start = span["start"]
doc_end = span["start"] span["length"] - 1 # 文本的截止索引
doc_offset = len(truncated_query) added_tokens_num_before_second_sequence # 偏移量
for i in range(len(start_positions)):
start_position = start_positions[i]
end_position = end_positions[i]
# remap start_position and end_position into this span
if start_position >= doc_start and end_position <= doc_end:
span_is_impossible = False
current_start_positions[start_position - doc_start + doc_offset] = 1
current_end_positions[end_position - doc_start + doc_offset] = 1
# print(current_start_positions)
# print(current_end_positions)
# handle yes/no questions: set index 1 or 2 to 1
if example.is_yes_no:
assert len(example.answer_start_indexes) == 1
assert 1 not in current_start_positions and 1 not in current_end_positions
if example.answer_texts[0] == "YES" and example.answer_start_indexes[0] == -1:
current_start_positions[1] = 1
current_end_positions[1] = 1
elif example.answer_texts[0] == "NO" and example.answer_start_indexes[0] == -1:
current_start_positions[2] = 1
current_end_positions[2] = 1
else:
raise Exception("example构造出错,请检查")
span_is_impossible = False
# handle unanswerable questions: set the cls index (0) to 1
if 1 not in current_start_positions: # Current Feature does not contain answer span
span_is_impossible = True
current_start_positions[cls_index] = 1
current_end_positions[cls_index] = 1
assert span_is_impossible is not None
features.append(
CAILFeature(
input_ids=span["input_ids"],
attention_mask=span["attention_mask"],
token_type_ids=span["token_type_ids"],
cls_index=cls_index,
p_mask=p_mask.tolist(),
example_index=0,
unique_id=0,
paragraph_len=span["paragraph_len"],
token_is_max_context=span["token_is_max_context"],
tokens=span["tokens"],
token_to_orig_map=span["token_to_orig_map"],
start_positions=current_start_positions,
end_positions=current_end_positions,
is_impossible=span_is_impossible
)
)
return features
def convert_features_to_dataset(features: List[CAILFeature], is_training: bool) -> Dataset:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
all_example_indexes = torch.tensor([f.example_index for f in features], dtype=torch.long)
all_feature_indexes = torch.arange(all_input_ids.size(0), dtype=torch.long)
if is_training:
all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float)
all_start_labels = torch.tensor([f.start_positions for f in features], dtype=torch.float)
all_end_labels = torch.tensor([f.end_positions for f in features], dtype=torch.float)
dataset = TensorDataset(
all_input_ids,
all_attention_masks,
all_token_type_ids,
all_start_labels,
all_end_labels,
all_cls_index,
all_p_mask,
all_is_impossible,
all_example_indexes,
all_feature_indexes
)
else:
dataset = TensorDataset(
all_input_ids,
all_attention_masks,
all_token_type_ids,
all_cls_index,
all_p_mask,
all_example_indexes,
all_feature_indexes
)
return dataset
def _is_whitespace(c):
if c == " " or c == "t" or c == "r" or c == "n" or ord(c) == 0x202F:
return True
return False
def _new_check_is_max_context(doc_spans, cur_span_index, position):
"""
Check if this is the 'max context' doc span for the token.
"""
# if len(doc_spans) == 1:
# return True
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span["start"] doc_span["length"] - 1
if position < doc_span["start"]:
continue
if position > end:
continue
num_left_context = position - doc_span["start"]
num_right_context = end - position
score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
"""
Returns tokenized answer spans that better match the annotated answer.
"""
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start: (new_end 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def customize_tokenizer(text: str, do_lower_case=True) -> List[str]:
temp_x = ""
for char in text:
# insert a space on both sides of special characters
if _is_chinese_char(ord(char)) or _is_punctuation(char) or _is_whitespace(char) or _is_control(char):
temp_x += " " + char + " "
else:
temp_x += char
# optionally convert uppercase letters to lowercase
if do_lower_case:
temp_x = temp_x.lower()
return temp_x.split()  # split on whitespace here
def _is_chinese_char(cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
def whitespace_tokenize(text: str):
if text is None:
return []
text = text.strip()
tokens = text.split()
return tokens
def write_example_orig_file(examples: List[CAILExample], file: str):
"""
convert examples to original json file
"""
data_list = []
for example in examples:
data = {
"paragraphs": [
{
"context": example.context_text,
"casename": example.case_name,
"qas": [
{
"question": example.question_text,
"answers": example.answers,
"id": example.qas_id,
"is_impossible": "true" if example.is_impossible else "false",
}
]
}
],
"caseid": example.case_id
}
data_list.append(data)
final_data = {
"data": data_list,
"version": "1.0"
}
with open(file, mode="w", encoding="utf-8") as file:
file.write(json.dumps(final_data, ensure_ascii=False))
if __name__ == '__main__':
data_file = 'data_sample/cail2021_mrc_small.json'
examples = read_examples(data_file, is_training=True)
tokenizer = BertTokenizer.from_pretrained('model_hub/chinese-bert-wwm-ext/')
# example = examples[3]
# print(example)
# print(len(example.doc_tokens))
# convert_single_example_to_features(
# example=example,
# tokenizer=tokenizer,
# max_seq_length=512,
# max_query_length=64,
# doc_stride=128,
# is_training=True
# )
class Args:
max_seq_length = 512
max_query_length = 64
doc_stride = 128
args = Args()
feature_lists = convert_examples_to_features(
examples,
tokenizer,
args,
is_training=True,
)
print(feature_lists[0])
datasets = convert_features_to_dataset(feature_lists, is_training=True)
print(datasets[0])
# for ex in examples:
# print(ex)
# context_text = "我nt爱北京nt15826458891天安门"
# all_doc_tokens = customize_tokenizer(context_text, True)
#
# k = 0
# temp_word = ""
# doc_tokens = []
# char_to_word_offset = []
# print(context_text)
# print(all_doc_tokens)
# for char in context_text:
# if _is_whitespace(char):
# char_to_word_offset.append(k - 1)
# continue
# else:
# temp_word += char
# char_to_word_offset.append(k)
# if temp_word.lower() == all_doc_tokens[k]:
# doc_tokens.append(temp_word)
# temp_word = ""
# k += 1
# print(k)
# print(doc_tokens)
# print(char_to_word_offset)
#
# tok_to_orig_index = []
# orig_to_tok_index = []
# all_doc_tokens = []
# for (i, token) in enumerate(doc_tokens):
# orig_to_tok_index.append(len(all_doc_tokens))
# sub_tokens = tokenizer.tokenize(token)
# for sub_token in sub_tokens:
# tok_to_orig_index.append(i)
# all_doc_tokens.append(sub_token)
#
# print(orig_to_tok_index)
# print(tok_to_orig_index)
# print(all_doc_tokens)
Note that the text goes through three realignment stages in total (a short sketch follows the list):
- Stage 1: the text is first roughly tokenized; this step strips special symbols, whitespace, and the like, so the answer start positions must be recalibrated (char_to_word_offset).
- Stage 2: each character (word) is then tokenized by the tokenizer; since WordPiece changes the sequence length, the answer positions must be recalibrated again (orig_to_tok_index / tok_to_orig_index).
- Stage 3: the question and the context are concatenated and a sliding window is applied, so the answer positions within each window must be recalibrated once more.
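To make the three stages concrete, here is a minimal, self-contained sketch. The context string, the answer, and the question are made up for illustration, the bert-base-chinese checkpoint is only an assumption (any Chinese WordPiece vocabulary would do), and Stage 1 is simplified to one token per non-whitespace character rather than the full customize_tokenizer logic:

from transformers import BertTokenizer

context = "我\n\t爱北京15826458891天安门"
answer = "北京"

# Stage 1: raw characters -> doc_tokens, recording char_to_word_offset
doc_tokens, char_to_word_offset, k = [], [], 0
for ch in context:
    if ch in " \t\r\n":
        char_to_word_offset.append(k - 1)
        continue
    char_to_word_offset.append(k)
    doc_tokens.append(ch)   # simplified: one token per non-whitespace character
    k += 1
offset = context.index(answer)
start1 = char_to_word_offset[offset]
end1 = char_to_word_offset[offset + len(answer) - 1]

# Stage 2: doc_tokens -> WordPiece sub-tokens, recording orig_to_tok_index
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
orig_to_tok_index, tok_to_orig_index, all_doc_tokens = [], [], []
for i, tok in enumerate(doc_tokens):
    orig_to_tok_index.append(len(all_doc_tokens))
    for sub in tokenizer.tokenize(tok):
        tok_to_orig_index.append(i)
        all_doc_tokens.append(sub)
start2, end2 = orig_to_tok_index[start1], orig_to_tok_index[end1]

# Stage 3: shift into one [CLS] [unused1] [unused2] question [SEP] context [SEP] window
question_len = len(tokenizer.tokenize("阮x4在哪里投保?")) + 2  # + the two unused tokens
doc_offset = question_len + 2                                  # + [CLS] and the first [SEP]
doc_start = 0                                                  # first sliding window
print(start2 - doc_start + doc_offset, end2 - doc_start + doc_offset)

A printed CAILFeature from the __main__ block above looks like this: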
<input_ids: [101, 1, 2, 7342, 12124, 1762, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 1905, 2832, 749, 784, 720, 924, 7372, 8043, 102, 9595, 119, 125, 1039, 132, 124, 119, 1161, 808, 7342, 12124, 510, 3342, 10871, 3118, 802, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 2526, 2360, 6589, 11960, 8129, 1039, 132, 125, 119, 7342, 12124, 510, 3342, 10871, 2824, 2857, 3315, 3428, 4638, 6401, 6390, 6589, 4500, 752, 2141, 680, 4415, 4507, 131, 8138, 2399, 8110, 3299, 124, 3189, 117, 7342, 12124, 6206, 3724, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 711, 1071, 1762, 704, 1744, 1093, 689, 7213, 6121, 5500, 819, 3300, 7361, 1062, 1385, 2128, 2551, 4689, 1146, 6121, 5852, 689, 6956, 113, 809, 678, 5042, 4917, 1093, 6121, 2128, 2551, 4689, 1146, 6121, 114, 4638, 8416, 9086, 1039, 6587, 3621, 2990, 897, 702, 782, 3867, 6589, 928, 6587, 924, 6395, 924, 7372, 117, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 2970, 1358, 2832, 924, 2400, 5041, 1355, 924, 7372, 1296, 113, 924, 1296, 5356, 1384, 131, 8752, 9723, 9131, 8756, 9086, 11906, 9446, 8311, 8152, 114, 117, 924, 6589, 5373, 5287, 3175, 2466, 711, 3309, 5373, 117, 3680, 3309, 113, 3299, 114, 9649, 8158, 1039, 117, 924, 7372, 3309, 7313, 5632, 702, 782, 3867, 6589, 928, 6587, 1394, 1398, 7555, 678, 6587, 3621, 1355, 3123, 722, 3189, 6629, 5635, 3926, 985, 1059, 6956, 6587, 3621, 3315, 2622, 722, 3189, 3632, 6421, 924, 1296, 5276, 2137, 117, 7342, 12124, 2870, 3612, 818, 862, 671, 3309, 6587, 3621, 6809, 1168, 8188, 1921, 4638, 117, 6228, 711, 924, 7372, 752, 3125, 1355, 4495, 117, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 898, 2945, 924, 7372, 1394, 1398, 5276, 2137, 2190, 1093, 6121, 2128, 2551, 4689, 1146, 6121, 6822, 6121, 6608, 985, 132, 794, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 6608, 985, 2496, 3189, 6629, 2458, 1993, 6631, 6814, 8114, 1921, 117, 2832, 924, 782, 793, 3313, 1403, 924, 7372, 782, 2495, 6820, 1059, 6956, 6608, 985, 3621, 7555, 4638, 117, 1156, 6228, 711, 2832, 924, 782, 6824, 5276, 117, 2832, 924, 782, 7444, 809, 2213, 3612, 1059, 6956, 3621, 7555, 711, 1825, 3144, 117, 794, 924, 7372, 782, 6608, 985, 2496, 3189, 2458, 1993, 6369, 5050, 117, 2902, 3680, 3189, 1283, 1146, 722, 671, 3403, 1114, 117, 1403, 924, 7372, 782, 5373, 5287, 6824, 5276, 7032, 7342, 12124, 2832, 924, 1400, 117, 794, 1093, 6121, 2128, 2551, 4689, 1146, 6121, 5815, 2533, 2207, 7583, 3867, 6589, 6587, 3621, 1066, 8416, 9086, 1039, 117, 6587, 3621, 4500, 6854, 711, 3189, 2382, 4495, 3833, 3867, 6589, 117, 955, 3621, 3309, 7361, 711, 8216, 702, 3299, 117, 2902, 3299, 5023, 7583, 3315, 2622, 6820, 3621, 1400, 1728, 7342, 12124, 3313, 2130, 1059, 2252, 6121, 1394, 1398, 5276, 2137, 4638, 6820, 3621, 721, 1218, 117, 1093, 6121, 2128, 2551, 4689, 1146, 6121, 754, 8119, 2399, 123, 3299, 8132, 3189, 1403, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 1062, 1385, 4509, 6435, 5164, 6608, 117, 2398, 2128, 6568, 7372, 2128, 2551, 1146, 102]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
token_type_ids: [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
cls_index: 0
p_mask: [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
example_index: 4
unique_id: 7
paragraph_len: 487
* token_is_max_context: {0: True, 1: True, 2: True, 24: False, 25: False, 26: False, 27: False, 28: False, 29: False, 30: False, 31: False, 32: False, 33: False, 34: False, 35: False, 36: False, 37: False, 38: False, 39: False, 40: False, 41: False, 42: False, 43: False, 44: False, 45: False, 46: False, 47: False, 48: False, 49: False, 50: False, 51: False, 52: False, 53: False, 54: False, 55: False, 56: False, 57: False, 58: False, 59: False, 60: False, 61: False, 62: False, 63: False, 64: False, 65: False, 66: False, 67: False, 68: False, 69: False, 70: False, 71: False, 72: False, 73: False, 74: False, 75: False, 76: False, 77: False, 78: False, 79: False, 80: False, 81: False, 82: False, 83: False, 84: False, 85: False, 86: False, 87: False, 88: False, 89: False, 90: False, 91: False, 92: False, 93: False, 94: False, 95: False, 96: False, 97: False, 98: False, 99: False, 100: False, 101: False, 102: False, 103: False, 104: False, 105: False, 106: False, 107: False, 108: False, 109: False, 110: False, 111: False, 112: False, 113: False, 114: False, 115: False, 116: False, 117: False, 118: False, 119: False, 120: False, 121: False, 122: False, 123: False, 124: False, 125: False, 126: False, 127: False, 128: False, 129: False, 130: False, 131: False, 132: False, 133: False, 134: False, 135: False, 136: False, 137: False, 138: False, 139: False, 140: False, 141: False, 142: False, 143: False, 144: False, 145: False, 146: False, 147: False, 148: False, 149: False, 150: False, 151: False, 152: False, 153: False, 154: False, 155: False, 156: False, 157: False, 158: False, 159: False, 160: False, 161: False, 162: False, 163: False, 164: False, 165: False, 166: False, 167: False, 168: False, 169: False, 170: False, 171: False, 172: False, 173: False, 174: False, 175: False, 176: False, 177: False, 178: False, 179: False, 180: False, 181: False, 182: False, 183: False, 184: False, 185: False, 186: False, 187: False, 188: False, 189: False, 190: False, 191: False, 192: False, 193: False, 194: False, 195: False, 196: False, 197: False, 198: False, 199: False, 200: False, 201: False, 202: False, 203: False, 204: True, 205: True, 206: True, 207: True, 208: True, 209: True, 210: True, 211: True, 212: True, 213: True, 214: True, 215: True, 216: True, 217: True, 218: True, 219: True, 220: True, 221: True, 222: True, 223: True, 224: True, 225: True, 226: True, 227: True, 228: True, 229: True, 230: True, 231: True, 232: True, 233: True, 234: True, 235: True, 236: True, 237: True, 238: True, 239: True, 240: True, 241: True, 242: True, 243: True, 244: True, 245: True, 246: True, 247: True, 248: True, 249: True, 250: True, 251: True, 252: True, 253: True, 254: True, 255: True, 256: True, 257: True, 258: True, 259: True, 260: True, 261: True, 262: True, 263: True, 264: True, 265: True, 266: True, 267: True, 268: True, 269: True, 270: True, 271: True, 272: True, 273: True, 274: True, 275: True, 276: True, 277: True, 278: True, 279: True, 280: True, 281: True, 282: True, 283: True, 284: True, 285: True, 286: True, 287: True, 288: True, 289: True, 290: True, 291: True, 292: True, 293: True, 294: True, 295: True, 296: True, 297: True, 298: True, 299: True, 300: True, 301: True, 302: True, 303: True, 304: True, 305: True, 306: True, 307: True, 308: True, 309: True, 310: True, 311: True, 312: True, 313: True, 314: True, 315: True, 316: True, 317: True, 318: True, 319: True, 320: True, 321: True, 322: True, 323: True, 324: True, 325: True, 326: True, 327: True, 328: True, 329: True, 330: True, 331: True, 332: 
False, 333: False, 334: False, 335: False, 336: False, 337: False, 338: False, 339: False, 340: False, 341: False, 342: False, 343: False, 344: False, 345: False, 346: False, 347: False, 348: False, 349: False, 350: False, 351: False, 352: False, 353: False, 354: False, 355: False, 356: False, 357: False, 358: False, 359: False, 360: False, 361: False, 362: False, 363: False, 364: False, 365: False, 366: False, 367: False, 368: False, 369: False, 370: False, 371: False, 372: False, 373: False, 374: False, 375: False, 376: False, 377: False, 378: False, 379: False, 380: False, 381: False, 382: False, 383: False, 384: False, 385: False, 386: False, 387: False, 388: False, 389: False, 390: False, 391: False, 392: False, 393: False, 394: False, 395: False, 396: False, 397: False, 398: False, 399: False, 400: False, 401: False, 402: False, 403: False, 404: False, 405: False, 406: False, 407: False, 408: False, 409: False, 410: False, 411: False, 412: False, 413: False, 414: False, 415: False, 416: False, 417: False, 418: False, 419: False, 420: False, 421: False, 422: False, 423: False, 424: False, 425: False, 426: False, 427: False, 428: False, 429: False, 430: False, 431: False, 432: False, 433: False, 434: False, 435: False, 436: False, 437: False, 438: False, 439: False, 440: False, 441: False, 442: False, 443: False, 444: False, 445: False, 446: False, 447: False, 448: False, 449: False, 450: False, 451: False, 452: False, 453: False, 454: False, 455: False, 456: False, 457: False, 458: False, 459: False, 460: False, 461: False, 462: False, 463: False, 464: False, 465: False, 466: False, 467: False, 468: False, 469: False, 470: False, 471: False, 472: False, 473: False, 474: False, 475: False, 476: False, 477: False, 478: False, 479: False, 480: False, 481: False, 482: False, 483: False, 484: False, 485: False, 486: False, 487: False, 488: False, 489: False, 490: False, 491: False, 492: False, 493: False, 494: False, 495: False, 496: False, 497: False, 498: False, 499: False, 500: False, 501: False, 502: False, 503: False, 504: False, 505: False, 506: False, 507: False, 508: False, 509: False, 510: False}
tokens: ['[CLS]', '[unused1]', '[unused2]', '阮', 'x4', '在', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '处', '投', '了', '什', '么', '保', '险', '?', '[SEP]', '##92', '.', '4', '元', ';', '3', '.', '判', '令', '阮', 'x4', '、', '杨', 'x5', '支', '付', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '律', '师', '费', '690', '##0', '元', ';', '4', '.', '阮', 'x4', '、', '杨', 'x5', '承', '担', '本', '案', '的', '诉', '讼', '费', '用', '事', '实', '与', '理', '由', ':', '2013', '年', '12', '月', '3', '日', ',', '阮', 'x4', '要', '求', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '为', '其', '在', '中', '国', '农', '业', '银', '行', '股', '份', '有', '限', '公', '司', '安', '徽', '省', '分', '行', '营', '业', '部', '(', '以', '下', '简', '称', '农', '行', '安', '徽', '省', '分', '行', ')', '的', '94', '##000', '元', '贷', '款', '提', '供', '个', '人', '消', '费', '信', '贷', '保', '证', '保', '险', ',', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '接', '受', '投', '保', '并', '签', '发', '保', '险', '单', '(', '保', '单', '编', '号', ':', '125', '##94', '##07', '##26', '##000', '##010', '##87', '##10', '##3', ')', ',', '保', '费', '缴', '纳', '方', '式', '为', '期', '缴', ',', '每', '期', '(', '月', ')', '178', '##6', '元', ',', '保', '险', '期', '间', '自', '个', '人', '消', '费', '信', '贷', '合', '同', '项', '下', '贷', '款', '发', '放', '之', '日', '起', '至', '清', '偿', '全', '部', '贷', '款', '本', '息', '之', '日', '止', '该', '保', '单', '约', '定', ',', '阮', 'x4', '拖', '欠', '任', '何', '一', '期', '贷', '款', '达', '到', '80', '天', '的', ',', '视', '为', '保', '险', '事', '故', '发', '生', ',', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '依', '据', '保', '险', '合', '同', '约', '定', '对', '农', '行', '安', '徽', '省', '分', '行', '进', '行', '赔', '偿', ';', '从', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '赔', '偿', '当', '日', '起', '开', '始', '超', '过', '30', '天', ',', '投', '保', '人', '仍', '未', '向', '保', '险', '人', '归', '还', '全', '部', '赔', '偿', '款', '项', '的', ',', '则', '视', '为', '投', '保', '人', '违', '约', ',', '投', '保', '人', '需', '以', '尚', '欠', '全', '部', '款', '项', '为', '基', '数', ',', '从', '保', '险', '人', '赔', '偿', '当', '日', '开', '始', '计', '算', ',', '按', '每', '日', '千', '分', '之', '一', '标', '准', ',', '向', '保', '险', '人', '缴', '纳', '违', '约', '金', '阮', 'x4', '投', '保', '后', ',', '从', '农', '行', '安', '徽', '省', '分', '行', '获', '得', '小', '额', '消', '费', '贷', '款', '共', '94', '##000', '元', ',', '贷', '款', '用', '途', '为', '日', '常', '生', '活', '消', '费', ',', '借', '款', '期', '限', '为', '36', '个', '月', ',', '按', '月', '等', '额', '本', '息', '还', '款', '后', '因', '阮', 'x4', '未', '完', '全', '履', '行', '合', '同', '约', '定', '的', '还', '款', '义', '务', ',', '农', '行', '安', '徽', '省', '分', '行', '于', '2015', '年', '2', '月', '25', '日', '向', '平', '安', '财', '险', '安', '徽', '分', '公', '司', '申', '请', '索', '赔', ',', '平', '安', '财', '险', '安', '徽', '分', '[SEP]']
token_to_orig_map: {0: -1, 1: -1, 2: -1, 24: 124, 25: 125, 26: 126, 27: 127, 28: 128, 29: 129, 30: 130, 31: 131, 32: 132, 33: 133, 34: 134, 35: 135, 36: 136, 37: 137, 38: 138, 39: 139, 40: 140, 41: 141, 42: 142, 43: 143, 44: 144, 45: 145, 46: 146, 47: 147, 48: 148, 49: 149, 50: 150, 51: 151, 52: 152, 53: 152, 54: 153, 55: 154, 56: 155, 57: 156, 58: 157, 59: 158, 60: 159, 61: 160, 62: 161, 63: 162, 64: 163, 65: 164, 66: 165, 67: 166, 68: 167, 69: 168, 70: 169, 71: 170, 72: 171, 73: 172, 74: 173, 75: 174, 76: 175, 77: 176, 78: 177, 79: 178, 80: 179, 81: 180, 82: 181, 83: 182, 84: 183, 85: 184, 86: 185, 87: 186, 88: 187, 89: 188, 90: 189, 91: 190, 92: 191, 93: 192, 94: 193, 95: 194, 96: 195, 97: 196, 98: 197, 99: 198, 100: 199, 101: 200, 102: 201, 103: 202, 104: 203, 105: 204, 106: 205, 107: 206, 108: 207, 109: 208, 110: 209, 111: 210, 112: 211, 113: 212, 114: 213, 115: 214, 116: 215, 117: 216, 118: 217, 119: 218, 120: 219, 121: 220, 122: 221, 123: 222, 124: 223, 125: 224, 126: 225, 127: 226, 128: 227, 129: 228, 130: 229, 131: 230, 132: 231, 133: 232, 134: 233, 135: 234, 136: 234, 137: 235, 138: 236, 139: 237, 140: 238, 141: 239, 142: 240, 143: 241, 144: 242, 145: 243, 146: 244, 147: 245, 148: 246, 149: 247, 150: 248, 151: 249, 152: 250, 153: 251, 154: 252, 155: 253, 156: 254, 157: 255, 158: 256, 159: 257, 160: 258, 161: 259, 162: 260, 163: 261, 164: 262, 165: 263, 166: 264, 167: 265, 168: 266, 169: 267, 170: 268, 171: 269, 172: 270, 173: 271, 174: 272, 175: 273, 176: 274, 177: 275, 178: 276, 179: 276, 180: 276, 181: 276, 182: 276, 183: 276, 184: 276, 185: 276, 186: 276, 187: 277, 188: 278, 189: 279, 190: 280, 191: 281, 192: 282, 193: 283, 194: 284, 195: 285, 196: 286, 197: 287, 198: 288, 199: 289, 200: 290, 201: 291, 202: 292, 203: 293, 204: 294, 205: 294, 206: 295, 207: 296, 208: 297, 209: 298, 210: 299, 211: 300, 212: 301, 213: 302, 214: 303, 215: 304, 216: 305, 217: 306, 218: 307, 219: 308, 220: 309, 221: 310, 222: 311, 223: 312, 224: 313, 225: 314, 226: 315, 227: 316, 228: 317, 229: 318, 230: 319, 231: 320, 232: 321, 233: 322, 234: 323, 235: 324, 236: 325, 237: 326, 238: 327, 239: 328, 240: 329, 241: 330, 242: 331, 243: 332, 244: 333, 245: 334, 246: 335, 247: 336, 248: 337, 249: 338, 250: 339, 251: 340, 252: 341, 253: 342, 254: 343, 255: 344, 256: 345, 257: 346, 258: 347, 259: 348, 260: 349, 261: 350, 262: 351, 263: 352, 264: 353, 265: 354, 266: 355, 267: 356, 268: 357, 269: 358, 270: 359, 271: 360, 272: 361, 273: 362, 274: 363, 275: 364, 276: 365, 277: 366, 278: 367, 279: 368, 280: 369, 281: 370, 282: 371, 283: 372, 284: 373, 285: 374, 286: 375, 287: 376, 288: 377, 289: 378, 290: 379, 291: 380, 292: 381, 293: 382, 294: 383, 295: 384, 296: 385, 297: 386, 298: 387, 299: 388, 300: 389, 301: 390, 302: 391, 303: 392, 304: 393, 305: 394, 306: 395, 307: 396, 308: 397, 309: 398, 310: 399, 311: 400, 312: 401, 313: 402, 314: 403, 315: 404, 316: 405, 317: 406, 318: 407, 319: 408, 320: 409, 321: 410, 322: 411, 323: 412, 324: 413, 325: 414, 326: 415, 327: 416, 328: 417, 329: 418, 330: 419, 331: 420, 332: 421, 333: 422, 334: 423, 335: 424, 336: 425, 337: 426, 338: 427, 339: 428, 340: 429, 341: 430, 342: 431, 343: 432, 344: 433, 345: 434, 346: 435, 347: 436, 348: 437, 349: 438, 350: 439, 351: 440, 352: 441, 353: 442, 354: 443, 355: 444, 356: 445, 357: 446, 358: 447, 359: 448, 360: 449, 361: 450, 362: 451, 363: 452, 364: 453, 365: 454, 366: 455, 367: 456, 368: 457, 369: 458, 370: 459, 371: 460, 372: 461, 373: 462, 374: 463, 375: 464, 376: 465, 377: 466, 378: 467, 379: 468, 380: 469, 381: 470, 382: 471, 
383: 472, 384: 473, 385: 474, 386: 475, 387: 476, 388: 477, 389: 478, 390: 479, 391: 480, 392: 481, 393: 482, 394: 483, 395: 484, 396: 485, 397: 486, 398: 487, 399: 488, 400: 489, 401: 490, 402: 491, 403: 492, 404: 493, 405: 494, 406: 495, 407: 496, 408: 497, 409: 498, 410: 499, 411: 500, 412: 501, 413: 502, 414: 503, 415: 504, 416: 505, 417: 506, 418: 507, 419: 508, 420: 509, 421: 510, 422: 511, 423: 512, 424: 512, 425: 513, 426: 514, 427: 515, 428: 516, 429: 517, 430: 518, 431: 519, 432: 520, 433: 521, 434: 522, 435: 523, 436: 524, 437: 525, 438: 526, 439: 527, 440: 528, 441: 529, 442: 530, 443: 531, 444: 532, 445: 533, 446: 534, 447: 535, 448: 536, 449: 537, 450: 538, 451: 539, 452: 540, 453: 541, 454: 542, 455: 543, 456: 544, 457: 545, 458: 546, 459: 547, 460: 548, 461: 549, 462: 550, 463: 551, 464: 552, 465: 553, 466: 554, 467: 555, 468: 556, 469: 557, 470: 558, 471: 559, 472: 560, 473: 561, 474: 562, 475: 563, 476: 564, 477: 565, 478: 566, 479: 567, 480: 568, 481: 569, 482: 570, 483: 571, 484: 572, 485: 573, 486: 574, 487: 575, 488: 576, 489: 577, 490: 578, 491: 579, 492: 580, 493: 581, 494: 582, 495: 583, 496: 584, 497: 585, 498: 586, 499: 587, 500: 588, 501: 589, 502: 590, 503: 591, 504: 592, 505: 593, 506: 594, 507: 595, 508: 596, 509: 597, 510: 598}
start_positions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
end_positions: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
is_impossible: False
>
Finally, pay attention to how the yes/no, single-span, and multi-span answers are unified under one labeling scheme (see the sketch below), and how the token_type_ids of the input are set. Lastly, there is a processing script that saves the processed data so it does not have to be regenerated, and time wasted, on every run.
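As a quick illustration of that unified labeling, here is a minimal sketch of how the start/end label vectors are built as multi-hot vectors over the sequence; max_seq_length and the span positions are made-up values, not taken from the real features:

# Minimal sketch of the unified start/end labels; all positions are illustrative.
max_seq_length = 16
cls_index, yes_index, no_index = 0, 1, 2   # [CLS], [unused1], [unused2]

def build_labels(answer_spans, is_yes, is_no):
    """answer_spans: list of (start, end) token positions inside this window."""
    start = [0] * max_seq_length
    end = [0] * max_seq_length
    for s, e in answer_spans:            # single span -> one 1; multi-span -> several 1s
        start[s], end[e] = 1, 1
    if is_yes:                           # yes/no answers live on the two unused tokens
        start[yes_index], end[yes_index] = 1, 1
    if is_no:
        start[no_index], end[no_index] = 1, 1
    if 1 not in start:                   # unanswerable -> point at [CLS]
        start[cls_index], end[cls_index] = 1, 1
    return start, end

print(build_labels([(5, 7), (10, 11)], False, False))  # multi-span
print(build_labels([], True, False))                   # YES
print(build_labels([], False, False))                  # no answer

The following script then caches the processed examples and features to disk: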
"""
Data processing code
"""
import argparse
import json
from transformers import PreTrainedTokenizer, BertTokenizer
from data_process_utils import *
import gzip
import pickle
import os
from os.path import join
import logging
def convert_and_write(args, tokenizer: PreTrainedTokenizer, file, examples_fn, features_fn, is_training):
logging.info(f"Reading examples from :{file} ...")
example_list = read_examples(file, is_training=is_training)
logging.info(f"Total examples:{len(example_list)}")
logging.info(f"Start converting examples to features.")
feature_list = convert_examples_to_features(example_list, tokenizer, args, is_training)
logging.info(f"Total features:{len(feature_list)}")
logging.info(f"Converting complete, writing examples and features to file.")
with gzip.open(join(args.output_path, examples_fn), "wb") as file:
pickle.dump(example_list, file)
with gzip.open(join(args.output_path, features_fn), "wb") as file:
pickle.dump(feature_list, file)
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_file",
type=str,
required=True,
help="The file to be processed."
)
parser.add_argument(
"--for_training",
action="store_true",
help="Process for training or not."
)
parser.add_argument(
"--output_prefix",
type=str,
required=True,
help="The prefix of output file's name."
)
parser.add_argument(
"--do_lower_case",
action="store_true",
help="Set this flag if you are using an uncased model."
)
parser.add_argument(
"--tokenizer_path",
type=str,
required=True,
help="Path to tokenizer which will be used to tokenize text.(ElectraTokenizer)"
)
parser.add_argument(
"--max_seq_length",
default=512,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. "
"Longer will be truncated, and shorter will be padded."
)
parser.add_argument(
"--max_query_length",
default=64,
type=int,
help="The maximum number of tokens for the question. Questions longer will be truncated to the length."
)
parser.add_argument(
"--doc_stride",
default=128,
type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks."
)
parser.add_argument(
"--output_path",
default="./processed_data/",
type=str,
help="Output path of the constructed examples and features."
)
args = parser.parse_args()
args.max_query_length += 2  # positions for the yes and no tokens
logging.basicConfig(
format="%(asctime)s - %(levelname)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO,
)
logging.info("All input parameters:")
print(json.dumps(vars(args), sort_keys=False, indent=2))
tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path)
if not os.path.exists(args.output_path):
os.makedirs(args.output_path)
convert_and_write(args, tokenizer, args.input_file, args.output_prefix + "_examples.pkl.gz",
args.output_prefix + "_features.pkl.gz", args.for_training)
if __name__ == "__main__":
main()
Run command:
python data_process.py --input_file data_sample/cail2021_mrc_small.json --output_prefix cail2021_mrc_small --tokenizer_path model_hub/chinese-bert-wwm-ext --max_seq_length 512 --max_query_length 64 --doc_stride 128 --do_lower_case --for_training
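Once the script has run, the cached examples and features can be loaded back without re-processing. A minimal sketch, assuming the default --output_path and the --output_prefix used above (the load_cached helper itself is not part of the original code):

import gzip
import pickle
from os.path import join

def load_cached(output_path, prefix):
    # Load the gzip-pickled examples and features written by convert_and_write.
    with gzip.open(join(output_path, prefix + "_examples.pkl.gz"), "rb") as f:
        examples = pickle.load(f)
    with gzip.open(join(output_path, prefix + "_features.pkl.gz"), "rb") as f:
        features = pickle.load(f)
    return examples, features

examples, features = load_cached("./processed_data/", "cail2021_mrc_small")
print(len(examples), len(features))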