这里用飞桨的高层API快速搭建模型实现情感分析比赛的结果的提交。具体的原理和分析请参考『NLP打卡营』实践课5:文本情感分析。以下将分三部分:句子级情感分析(NLPCC14-SC,ChnSentiCorp);目标级情感分析(SE-ABSA16_PHNS,SE-ABSA16_CAME);以及观点抽取(COTE-BD,COTE-DP,COTE-MFW)。
项目的使用非常简单,更改相应章节的data_name,并自己调整batch_size和epochs等以达到最佳的训练效果,并运行相应章节的所有代码即可得到对应数据集的预测结果。所有数据预测完成后,下载submission文件夹提交即可。
2021/6/18更新:添加了get_data_loader函数里的返回data_loader的shuffle选项(修复bug);更改了观点抽取中保存文件的名称(修复bug)
2021/6/20更新:更改了2,3章shuffle的错误拼写。
2021/6/21更新:观点抽取中替换了英文字母编码时的特殊符号“##”,将"[UNK]"直接替换成了空字符(可以提高大概0.003的成绩)
!pip install --upgrade paddlenlp -i https://pypi.org/simple
句子级情感分析是针对输入的一段话,判断其感情倾向,一般为积极(1)或消极(0)。
import paddlenlp
from paddlenlp.transformers import SkepForSequenceClassification, SkepTokenizer
虽然一些数据集在PaddleNLP已存在,但是为了数据处理上的一致性,这里统一从上传的datasets中处理。对于PaddleNLP已存在的数据集,强烈建议直接用API调用,非常方便。
# 解压数据
!unzip -o datasets/ChnSentiCorp
!unzip -o datasets/NLPCC14-SC
数据内部结构解析:
ChnSentiCorp:
train:
label text_a
0 房间太小。其他的都一般。。。。。。。。。
1 轻便,方便携带,性能也不错,能满足平时的工作需要,对出差人员来说非常不错
dev:
qid label text_a
0 1 這間酒店環境和服務態度亦算不錯,但房間空間太小~…
test:
qid text_a
0 这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般
… …
NLPCC14-SC:
train:
label text_a
1 请问这机不是有个遥控器的吗?
0 全是大道理啊
test:
qid text_a
0 我终于找到同道中人啦~~~~从初中开始,我就…
… …
从上可以看出两个数据集可以定义一致的读取方式,但是NLPCC14-SC没有dev数据集,因此不再为它定义dev数据集。
# 得到数据集字典
def open_func(file_path):
    """Read a tab-separated file and return its usable data rows.

    The first line is treated as a header and skipped. Every remaining line
    is stripped of surrounding whitespace and kept only if it still contains
    at least two tab-separated fields.

    Args:
        file_path: Path of the UTF-8 encoded ``.tsv`` file.

    Returns:
        A list of stripped lines (strings), in file order.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(file_path, 'r', encoding='utf8') as f:
        lines = f.readlines()[1:]
    stripped_lines = (line.strip() for line in lines)
    return [line for line in stripped_lines if len(line.split('\t')) >= 2]
# Raw sample lines for the sentence-level corpora, keyed first by the
# lower-case dataset name used for `data_name` and then by split name.
# NLPCC14-SC has no 'dev' entry.
data_dict = dict(
    chnsenticorp=dict(
        test=open_func('ChnSentiCorp/test.tsv'),
        dev=open_func('ChnSentiCorp/dev.tsv'),
        train=open_func('ChnSentiCorp/train.tsv'),
    ),
    nlpcc14sc=dict(
        test=open_func('NLPCC14-SC/test.tsv'),
        train=open_func('NLPCC14-SC/train.tsv'),
    ),
)
# 定义数据集
from paddle.io import Dataset, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
label_list = [0, 1]
# 注意,由于token type在此项任务中并没有起作用,因此这里不再考虑,让模型自行填充。
class MyDataset(Dataset):
    """Sentence-level sentiment dataset over tab-separated sample lines.

    Each sample line is split on tabs: the last field is the text and the
    second-to-last field is parsed as an integer label. Only ``input_ids``
    are produced; token type ids are left for the model to fill in.

    Args:
        data: Sequence of tab-separated sample lines.
        tokenizer: Object whose ``encode(text, max_seq_len=...)`` returns a
            mapping containing ``'input_ids'``.
        max_len: Maximum sequence length forwarded to the tokenizer.
        for_test: When true, items carry no label.
    """

    def __init__(self, data, tokenizer, max_len=512, for_test=False):
        super().__init__()
        self._data, self._tokenizer = data, tokenizer
        self._max_len, self._for_test = max_len, for_test

    def __len__(self):
        """Return the number of sample lines."""
        return len(self._data)

    def __getitem__(self, idx):
        """Return ``input_ids`` (test) or ``(input_ids, label)`` as int64 arrays."""
        fields = self._data[idx].split('\t')
        # The second-to-last field is converted even in test mode, where the
        # value is not returned.
        label_id = int(fields[-2])
        encoded = self._tokenizer.encode(fields[-1], max_seq_len=self._max_len)
        token_ids = np.array(encoded['input_ids'], dtype='int64')
        if self._for_test:
            return token_ids
        return token_ids, np.array(label_id, dtype='int64')
def batchify_fn(for_test=False):
    """Build the collate function handed to the ``DataLoader``.

    The padding value is read from the module-level ``tokenizer`` at the
    moment this factory is called.

    Args:
        for_test: When true, samples are bare ``input_ids`` arrays; otherwise
            they are ``(input_ids, label)`` pairs.

    Returns:
        A callable taking a list of samples. In test mode it returns one
        stacked array of padded ``input_ids``; otherwise a list
        ``[padded_input_ids, stacked_labels]``.
    """
    pad_ids = Pad(axis=0, pad_val=tokenizer.pad_token_id)
    if for_test:
        def collate_test(samples):
            # np.vstack replaces np.row_stack, a deprecated alias of the
            # same function.
            return np.vstack(list(pad_ids(samples)))
        return collate_test

    collator = Tuple(pad_ids, Stack())

    def collate_train(samples):
        return list(collator(samples))
    return collate_train
def get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False):
    """Wrap raw sample lines in a ``MyDataset`` and return a ``DataLoader``.

    Batches are shuffled unless ``for_test`` is true. The collate function
    comes from ``batchify_fn``, which reads the module-level ``tokenizer``
    rather than the ``tokenizer`` argument passed here.
    """
    return DataLoader(
        dataset=MyDataset(data, tokenizer, max_len, for_test),
        batch_size=batch_size,
        collate_fn=batchify_fn(for_test),
        shuffle=not for_test,
    )
模型非常简单,我们只需要调用对应的序列分类工具就行了。为了方便训练,直接用高层API Model完成训练。
import paddle
from paddle.static import InputSpec
# Model and tokenizer
model = SkepForSequenceClassification.from_pretrained('skep_ernie_1.0_large_ch', num_classes=2)
tokenizer = SkepTokenizer.from_pretrained('skep_ernie_1.0_large_ch')

# Dataset selection; valid names: chnsenticorp, nlpcc14sc
data_name = 'nlpcc14sc'  # change this value to switch datasets

# Training hyperparameters
epochs, learning_rate, batch_size, max_len = 8, 2e-5, 8, 512

# Data loaders; only 'chnsenticorp' has a dev split in data_dict
train_dataloader = get_data_loader(data_dict[data_name]['train'], tokenizer, batch_size, max_len, for_test=False)
dev_dataloader = (
    get_data_loader(data_dict[data_name]['dev'], tokenizer, batch_size, max_len, for_test=False)
    if data_name == 'chnsenticorp'
    else None
)

# NOTE(review): `input` shadows the builtin of the same name. The label spec
# declares shape (-1, 2) while the collate function stacks scalar labels;
# confirm the spec is what paddle.Model expects.
input = InputSpec((-1, -1), dtype='int64', name='input')
label = InputSpec((-1, 2), dtype='int64', name='label')
model = paddle.Model(model, [input], [label])

# Prepare the model: optimizer, loss and metric
optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, parameters=model.parameters())
model.prepare(optimizer, loss=paddle.nn.CrossEntropyLoss(), metrics=[paddle.metric.Accuracy()])

# Start training
model.fit(
    train_data=train_dataloader,
    eval_data=dev_dataloader,
    batch_size=batch_size,
    epochs=epochs,
    eval_freq=5,
    save_freq=5,
    save_dir='./checkpoints',
    log_freq=200,
)
step ?200/1250 - loss: 0.3085 - acc: 0.7544 - 308ms/step
step ?400/1250 - loss: 0.3240 - acc: 0.7750 - 305ms/step
step ?600/1250 - loss: 0.5761 - acc: 0.7800 - 311ms/step
step ?800/1250 - loss: 0.1471 - acc: 0.7880 - 307ms/step
step 1000/1250 - loss: 0.3115 - acc: 0.7925 - 303ms/step
step 1200/1250 - loss: 0.8489 - acc: 0.7985 - 302ms/step
step 1250/1250 - loss: 0.4910 - acc: 0.8007 - 302ms/step
save checkpoint at /home/aistudio/checkpoints/0
Epoch 2/8
step ?200/1250 - loss: 0.9686 - acc: 0.8825 - 293ms/step
step ?400/1250 - loss: 0.1028 - acc: 0.8772 - 304ms/step
step ?600/1250 - loss: 0.4760 - acc: 0.8773 - 310ms/step
step ?800/1250 - loss: 0.1483 - acc: 0.8755 - 306ms/step
step 1000/1250 - loss: 0.6983 - acc: 0.8740 - 310ms/step
step 1200/1250 - loss: 0.6258 - acc: 0.8735 - 309ms/step
step 1250/1250 - loss: 0.1694 - acc: 0.8729 - 310ms/step
Epoch 3/8
step ?200/1250 - loss: 0.4893 - acc: 0.9375 - 323ms/step
step ?400/1250 - loss: 0.1291 - acc: 0.9344 - 322ms/step
step ?600/1250 - loss: 0.1544 - acc: 0.9302 - 310ms/step
step ?800/1250 - loss: 0.0948 - acc: 0.9284 - 305ms/step
step 1000/1250 - loss: 0.2340 - acc: 0.9247 - 305ms/step
step 1200/1250 - loss: 0.0474 - acc: 0.9266 - 306ms/step
step 1250/1250 - loss: 0.4171 - acc: 0.9262 - 306ms/step
Epoch 4/8
step ?200/1250 - loss: 0.1890 - acc: 0.9700 - 312ms/step
step ?400/1250 - loss: 0.0281 - acc: 0.9688 - 309ms/step
step ?600/1250 - loss: 0.0076 - acc: 0.9642 - 315ms/step
step ?800/1250 - loss: 0.0038 - acc: 0.9642 - 307ms/step
step 1000/1250 - loss: 0.0757 - acc: 0.9639 - 306ms/step
step 1200/1250 - loss: 0.0072 - acc: 0.9621 - 304ms/step
step 1250/1250 - loss: 0.0525 - acc: 0.9615 - 305ms/step
Epoch 5/8
step ?200/1250 - loss: 0.0035 - acc: 0.9806 - 296ms/step
step ?400/1250 - loss: 0.0149 - acc: 0.9816 - 293ms/step
step ?600/1250 - loss: 0.0142 - acc: 0.9827 - 298ms/step
step ?800/1250 - loss: 0.0300 - acc: 0.9789 - 303ms/step
step 1000/1250 - loss: 0.0716 - acc: 0.9734 - 304ms/step
step 1200/1250 - loss: 0.0867 - acc: 0.9723 - 303ms/step
step 1250/1250 - loss: 0.0047 - acc: 0.9712 - 305ms/step
Epoch 6/8
step ?200/1250 - loss: 0.0346 - acc: 0.9800 - 293ms/step
step ?400/1250 - loss: 3.5588e-04 - acc: 0.9838 - 301ms/step
step ?600/1250 - loss: 0.1159 - acc: 0.9823 - 308ms/step
step ?800/1250 - loss: 0.0036 - acc: 0.9817 - 312ms/step
step 1000/1250 - loss: 0.0824 - acc: 0.9826 - 311ms/step
step 1200/1250 - loss: 0.0160 - acc: 0.9808 - 310ms/step
step 1250/1250 - loss: 0.1026 - acc: 0.9811 - 310ms/step
save checkpoint at /home/aistudio/checkpoints/5
Epoch 7/8
step ?200/1250 - loss: 0.0131 - acc: 0.9888 - 308ms/step
step ?400/1250 - loss: 0.0019 - acc: 0.9884 - 316ms/step
step ?600/1250 - loss: 0.0044 - acc: 0.9879 - 313ms/step
step ?800/1250 - loss: 0.0038 - acc: 0.9864 - 311ms/step
step 1000/1250 - loss: 0.0356 - acc: 0.9864 - 308ms/step
step 1200/1250 - loss: 0.0082 - acc: 0.9868 - 306ms/step
step 1250/1250 - loss: 0.0248 - acc: 0.9869 - 308ms/step
Epoch 8/8
step ?200/1250 - loss: 0.0026 - acc: 0.9912 - 294ms/step
step ?400/1250 - loss: 0.0023 - acc: 0.9891 - 293ms/step
step ?600/1250 - loss: 0.0253 - acc: 0.9879 - 311ms/step
step ?800/1250 - loss: 2.8810e-04 - acc: 0.9864 - 315ms/step
step 1000/1250 - loss: 3.1722e-04 - acc: 0.9859 - 313ms/step
step 1200/1250 - loss: 9.1276e-04 - acc: 0.9852 - 315ms/step
step 1250/1250 - loss: 4.1999e-04 - acc: 0.9850 - 315ms/step
save checkpoint at /home/aistudio/checkpoints/final?
# Load the fine-tuned weights, predict on the test split and write the
# submission file for the selected sentence-level dataset.
import os

checkpoint_path = './checkpoints/final'  # path prefix of the saved checkpoint
model = SkepForSequenceClassification.from_pretrained('skep_ernie_1.0_large_ch', num_classes=2)
input = InputSpec((-1, -1), dtype='int64', name='input')
model = paddle.Model(model, input)
model.load(checkpoint_path)

# Test data (not shuffled, so predictions line up with data_dict order)
test_dataloader = get_data_loader(data_dict[data_name]['test'], tokenizer, batch_size, max_len, for_test=True)

# Predict and save
save_file = {'chnsenticorp': './submission/ChnSentiCorp.tsv', 'nlpcc14sc': './submission/NLPCC14-SC.tsv'}
predicts = []
for batch in test_dataloader:
    predict = model.predict_batch(batch)
    predicts += predict[0].argmax(axis=-1).tolist()

# Make sure the output directory exists; open() does not create it.
os.makedirs(os.path.dirname(save_file[data_name]), exist_ok=True)
# The with-statement closes the file, so no explicit close() is needed.
with open(save_file[data_name], 'w', encoding='utf8') as f:
    f.write("index\tprediction\n")
    for idx, sample in enumerate(data_dict[data_name]['test']):
        qid = sample.split('\t')[0]
        f.write(qid + '\t' + str(predicts[idx]) + '\n')
目标级情感分析将对整句的情感倾向扩充为对多个特定属性的情感倾向,本质上仍然是序列分类,但是针对同一个序列需要进行多次分类,针对不同的属性。这里的思路是将针对的属性也作为输入的一部分传入模型,并预测情感倾向。
import paddlenlp
from paddlenlp.transformers import SkepForSequenceClassification, SkepTokenizer
# 解压数据
!unzip -o datasets/SE-ABSA16_CAME
!unzip -o datasets/SE-ABSA16_PHNS
Archive: ?datasets/SE-ABSA16_CAME.zip
? inflating: SE-ABSA16_CAME/train.tsv ?
? inflating: __MACOSX/SE-ABSA16_CAME/._train.tsv ?
? inflating: SE-ABSA16_CAME/License.pdf ?
? inflating: __MACOSX/SE-ABSA16_CAME/._License.pdf ?
? inflating: SE-ABSA16_CAME/test.tsv ?
? inflating: __MACOSX/SE-ABSA16_CAME/._test.tsv ?
? inflating: __MACOSX/._SE-ABSA16_CAME ?
Archive: ?datasets/SE-ABSA16_PHNS.zip
? inflating: SE-ABSA16_PHNS/train.tsv ?
? inflating: __MACOSX/SE-ABSA16_PHNS/._train.tsv ?
? inflating: SE-ABSA16_PHNS/License.pdf ?
? inflating: __MACOSX/SE-ABSA16_PHNS/._License.pdf ?
? inflating: SE-ABSA16_PHNS/test.tsv ?
? inflating: __MACOSX/SE-ABSA16_PHNS/._test.tsv ?
? inflating: __MACOSX/._SE-ABSA16_PHNS
数据内部结构解析(两个数据集的结构相同):
train: | ||
label | text_a | text_b |
1 | phone#design_features | 今天有幸拿到了港版白色iPhone 5真机,试玩了一下,说说感受吧:1. 真机尺寸宽度与4/4s保持一致没有变化... |
0 | software#operation_performance | 苹果iPhone5新机到手 对比4S使用感受1,外观。一开始看发布会和网上照片,我和大多数人观点一样:变化不大,有点小失望。... |
test: | ||
qid | text_a | text_b |
0 | software#usability | 刚刚入手8600,体会。刚刚从淘宝购买,1635元(包邮)。1、全新,... |
... | ... | ... |
# 得到数据集字典
def open_func(file_path):
    """Read a tab-separated file and return its usable data rows.

    The first line is treated as a header and skipped. Every remaining line
    is stripped of surrounding whitespace and kept only if it still contains
    at least two tab-separated fields.

    Args:
        file_path: Path of the UTF-8 encoded ``.tsv`` file.

    Returns:
        A list of stripped lines (strings), in file order.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(file_path, 'r', encoding='utf8') as f:
        lines = f.readlines()[1:]
    stripped_lines = (line.strip() for line in lines)
    return [line for line in stripped_lines if len(line.split('\t')) >= 2]
# Raw sample lines for the aspect-level corpora, keyed first by the
# lower-case dataset name used for `data_name` and then by split name.
data_dict = dict(
    seabsa16phns=dict(
        test=open_func('SE-ABSA16_PHNS/test.tsv'),
        train=open_func('SE-ABSA16_PHNS/train.tsv'),
    ),
    seabsa16came=dict(
        test=open_func('SE-ABSA16_CAME/test.tsv'),
        train=open_func('SE-ABSA16_CAME/train.tsv'),
    ),
)
方法与1.2中相似,基本是完全粘贴复制过来即可。这里注意需要两个text,并且要考虑token_type_id了。
# 定义数据集
from paddle.io import Dataset, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
label_list = [0, 1]
# 考虑token_type_id
class MyDataset(Dataset):
    """Aspect-level sentiment dataset over tab-separated sample lines.

    Each sample line is split on tabs: the last two fields are ``text_a``
    and ``text_b`` (encoded together as a sentence pair) and the
    third-to-last field is parsed as an integer label.

    Args:
        data: Sequence of tab-separated sample lines.
        tokenizer: Object whose ``encode(text_a, text_b, max_seq_len=...)``
            returns a mapping with ``'input_ids'`` and ``'token_type_ids'``.
        max_len: Maximum sequence length forwarded to the tokenizer.
        for_test: When true, items carry no label.
    """

    def __init__(self, data, tokenizer, max_len=512, for_test=False):
        super().__init__()
        self._data, self._tokenizer = data, tokenizer
        self._max_len, self._for_test = max_len, for_test

    def __len__(self):
        """Return the number of sample lines."""
        return len(self._data)

    def __getitem__(self, idx):
        """Return ``(input_ids, token_type_ids[, label])`` as int64 arrays."""
        fields = self._data[idx].split('\t')
        # The third-to-last field is converted even in test mode, where the
        # value is not returned.
        label_id = int(fields[-3])
        aspect, sentence = fields[-2], fields[-1]
        encoded = self._tokenizer.encode(aspect, sentence, max_seq_len=self._max_len)
        token_ids = np.array(encoded['input_ids'], dtype='int64')
        segment_ids = np.array(encoded['token_type_ids'], dtype='int64')
        if self._for_test:
            return token_ids, segment_ids
        return token_ids, segment_ids, np.array(label_id, dtype='int64')
def batchify_fn(for_test=False):
    """Build the collate function handed to the ``DataLoader``.

    Padding values for ids and token types are read from the module-level
    ``tokenizer`` at the moment this factory is called.

    Args:
        for_test: When true, samples are ``(input_ids, token_type_ids)``;
            otherwise a label is expected as a third element.

    Returns:
        A callable mapping a list of samples to a list of batched arrays
        (padded ids, padded token types and, when training, stacked labels).
    """
    pad_ids = Pad(axis=0, pad_val=tokenizer.pad_token_id)
    pad_types = Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    if for_test:
        collator = Tuple(pad_ids, pad_types)
    else:
        collator = Tuple(pad_ids, pad_types, Stack())

    def collate(samples):
        return list(collator(samples))
    return collate
def get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False):
    """Wrap raw sample lines in a ``MyDataset`` and return a ``DataLoader``.

    Batches are shuffled unless ``for_test`` is true. The collate function
    comes from ``batchify_fn``, which reads the module-level ``tokenizer``
    rather than the ``tokenizer`` argument passed here.
    """
    return DataLoader(
        dataset=MyDataset(data, tokenizer, max_len, for_test),
        batch_size=batch_size,
        collate_fn=batchify_fn(for_test),
        shuffle=not for_test,
    )
把1.3的代码复制粘贴过来,注意改数据集名称,并加上一个token_type_id的输入。
import paddle
from paddle.static import InputSpec
# Model and tokenizer
model = SkepForSequenceClassification.from_pretrained('skep_ernie_1.0_large_ch', num_classes=2)
tokenizer = SkepTokenizer.from_pretrained('skep_ernie_1.0_large_ch')
# Settings; valid dataset names: seabsa16phns, seabsa16came
data_name = 'seabsa16phns' # change this value to switch datasets
## Training hyperparameters
epochs = 1
learning_rate = 2e-5
batch_size = 8
max_len = 512
## Data
train_dataloader = get_data_loader(data_dict[data_name]['train'], tokenizer, batch_size, max_len, for_test=False)
# NOTE(review): `input` shadows the builtin of the same name.
input = InputSpec((-1, -1), dtype='int64', name='input')
token_type = InputSpec((-1, -1), dtype='int64', name='token_type')
# NOTE(review): the label spec declares shape (-1, 2) while the collate
# function stacks scalar labels -- confirm this is what paddle.Model expects.
label = InputSpec((-1, 2), dtype='int64', name='label')
# Rebinds `model` from the network to its paddle.Model wrapper.
model = paddle.Model(model, [input, token_type], [label])
# Prepare the model: optimizer, loss and metric
optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, parameters=model.parameters())
model.prepare(optimizer, loss=paddle.nn.CrossEntropyLoss(), metrics=[paddle.metric.Accuracy()])
# Start training; checkpoints are written under ./checkpoints
model.fit(train_dataloader, batch_size=batch_size, epochs=epochs, save_freq=5, save_dir='./checkpoints', log_freq=200)
# Load the fine-tuned weights, predict on the test split and write the
# submission file for the selected aspect-level dataset.
import os

checkpoint_path = './checkpoints/final'  # path prefix of the saved checkpoint
model = SkepForSequenceClassification.from_pretrained('skep_ernie_1.0_large_ch', num_classes=2)
input = InputSpec((-1, -1), dtype='int64', name='input')
token_type = InputSpec((-1, -1), dtype='int64', name='token_type')
model = paddle.Model(model, [input, token_type])
model.load(checkpoint_path)

# Test data (not shuffled, so predictions line up with data_dict order)
test_dataloader = get_data_loader(data_dict[data_name]['test'], tokenizer, batch_size, max_len, for_test=True)

# Predict and save
save_file = {'seabsa16phns': './submission/SE-ABSA16_PHNS.tsv', 'seabsa16came': './submission/SE-ABSA16_CAME.tsv'}
predicts = []
for batch in test_dataloader:
    predict = model.predict_batch(batch)
    predicts += predict[0].argmax(axis=-1).tolist()

# Make sure the output directory exists; open() does not create it.
os.makedirs(os.path.dirname(save_file[data_name]), exist_ok=True)
# The with-statement closes the file, so no explicit close() is needed.
with open(save_file[data_name], 'w', encoding='utf8') as f:
    f.write("index\tprediction\n")
    for idx, sample in enumerate(data_dict[data_name]['test']):
        qid = sample.split('\t')[0]
        f.write(qid + '\t' + str(predicts[idx]) + '\n')
import paddlenlp
from paddlenlp.transformers import SkepForTokenClassification, SkepTokenizer
# 解压数据
!unzip -o datasets/COTE-BD
!unzip -o datasets/COTE-DP
!unzip -o datasets/COTE-MFW
Archive: ?datasets/COTE-BD.zip
? ?creating: COTE-BD/
? inflating: COTE-BD/train.tsv ? ? ??
? ?creating: __MACOSX/COTE-BD/
? inflating: __MACOSX/COTE-BD/._train.tsv ?
? inflating: COTE-BD/License.pdf ? ??
? inflating: __MACOSX/COTE-BD/._License.pdf ?
? inflating: COTE-BD/test.tsv ? ? ? ?
? inflating: __MACOSX/COTE-BD/._test.tsv ?
? inflating: __MACOSX/._COTE-BD ? ? ?
Archive: ?datasets/COTE-DP.zip
? ?creating: COTE-DP/
? inflating: COTE-DP/train.tsv ? ? ??
? ?creating: __MACOSX/COTE-DP/
? inflating: __MACOSX/COTE-DP/._train.tsv ?
? inflating: COTE-DP/License.pdf ? ??
? inflating: __MACOSX/COTE-DP/._License.pdf ?
? inflating: COTE-DP/test.tsv ? ? ? ?
? inflating: __MACOSX/COTE-DP/._test.tsv ?
? inflating: __MACOSX/._COTE-DP ? ? ?
Archive: ?datasets/COTE-MFW.zip
? ?creating: COTE-MFW/
? inflating: COTE-MFW/train.tsv ? ? ?
? ?creating: __MACOSX/COTE-MFW/
? inflating: __MACOSX/COTE-MFW/._train.tsv ?
? inflating: COTE-MFW/License.pdf ? ?
? inflating: __MACOSX/COTE-MFW/._License.pdf ?
? inflating: COTE-MFW/test.tsv ? ? ??
? inflating: __MACOSX/COTE-MFW/._test.tsv ?
? inflating: __MACOSX/._COTE-MFW
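数据内部结构解析(三个数据集的结构相同)。注意:下方表格沿用了目标级情感分析数据的示例;按照后文的数据读取代码,COTE数据每行实际只有两列——train为label(待抽取的观点对象,出现在text_a中)和text_a,test为qid和text_a: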
train: | ||
label | text_a | text_b |
1 | phone#design_features | 今天有幸拿到了港版白色iPhone 5真机,试玩了一下,说说感受吧:1. 真机尺寸宽度与4/4s保持一致没有变化... |
0 | software#operation_performance | 苹果iPhone5新机到手 对比4S使用感受1,外观。一开始看发布会和网上照片,我和大多数人观点一样:变化不大,有点小失望。... |
test: | ||
qid | text_a | text_b |
0 | software#usability | 刚刚入手8600,体会。刚刚从淘宝购买,1635元(包邮)。1、全新,... ... ... ... |
# 得到数据集字典
def open_func(file_path):
    """Read a tab-separated file and return its usable data rows.

    The first line is treated as a header and skipped. Every remaining line
    is stripped of surrounding whitespace and kept only if it still contains
    at least two tab-separated fields.

    Args:
        file_path: Path of the UTF-8 encoded ``.tsv`` file.

    Returns:
        A list of stripped lines (strings), in file order.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(file_path, 'r', encoding='utf8') as f:
        lines = f.readlines()[1:]
    stripped_lines = (line.strip() for line in lines)
    return [line for line in stripped_lines if len(line.split('\t')) >= 2]
# Raw sample lines for the opinion-extraction corpora, keyed first by the
# lower-case dataset name used for `data_name` and then by split name.
data_dict = dict(
    cotebd=dict(
        test=open_func('COTE-BD/test.tsv'),
        train=open_func('COTE-BD/train.tsv'),
    ),
    cotedp=dict(
        test=open_func('COTE-DP/test.tsv'),
        train=open_func('COTE-DP/train.tsv'),
    ),
    cotemfw=dict(
        test=open_func('COTE-MFW/test.tsv'),
        train=open_func('COTE-MFW/train.tsv'),
    ),
)
思路类似,需要注意的是这一次是Tokens级的分类。在数据读取器中,将label写成BIO的形式,每一个token都对应一个label。
# 定义数据集
from paddle.io import Dataset, DataLoader
from paddlenlp.data import Pad, Stack, Tuple
import numpy as np
label_list = {'B': 0, 'I': 1, 'O': 2}
index2label = {0: 'B', 1: 'I', 2: 'O'}
# 考虑token_type_id
class MyDataset(Dataset):
    """Token-level (BIO) dataset for opinion-target extraction.

    Each sample line is split on tabs: the last field is the text and the
    second-to-last field is the target string. For training items, every
    occurrence of the target in the text is tagged ``B`` on its first token
    and ``I`` on the rest; all other tokens, including the leading and
    trailing special tokens, are tagged ``O``.

    Args:
        data: Sequence of tab-separated sample lines.
        tokenizer: Object whose ``encode(text, ...)`` returns a mapping
            containing ``'input_ids'``; the first and last ids are treated
            as the special start/end tokens.
        max_len: Maximum length of the returned sequences.
        for_test: When true, items carry no labels.
    """

    def __init__(self, data, tokenizer, max_len=512, for_test=False):
        super().__init__()
        self._data = data
        self._tokenizer = tokenizer
        self._max_len = max_len
        self._for_test = for_test

    def __len__(self):
        """Return the number of sample lines."""
        return len(self._data)

    def __getitem__(self, idx):
        """Return ``input_ids`` (test) or ``(input_ids, label_ids)`` as int64 arrays."""
        samples = self._data[idx].split('\t')
        label = samples[-2]
        text = samples[-1]
        if self._for_test:
            origin_enc = self._tokenizer.encode(text, max_seq_len=self._max_len)['input_ids']
            return np.array(origin_enc, dtype='int64')

        # Characters and tokens do not map one-to-one, so the target and the
        # text pieces around it are encoded separately and then stitched
        # back together.
        label_enc = self._tokenizer.encode(label)['input_ids']
        cls_enc, sep_enc = label_enc[0], label_enc[-1]
        label_enc = label_enc[1:-1]
        # str.split rejects an empty separator; treat an empty target as
        # "no target in this sample".
        segments = text.split(label) if label else [text]
        # Emit tags only when the target produced tokens. Otherwise
        # `[B] + [I] * -1` would add one tag with no matching token and
        # leave ids and labels misaligned.
        if label_enc:
            target_tags = [label_list['B']] + [label_list['I']] * (len(label_enc) - 1)
        else:
            target_tags = []

        origin_enc = []
        label_ids = []
        last_index = len(segments) - 1
        for index, segment in enumerate(segments):
            segment_enc = self._tokenizer.encode(segment)['input_ids'][1:-1]
            origin_enc += segment_enc
            label_ids += [label_list['O']] * len(segment_enc)
            # The target sits between consecutive segments.
            if index != last_index:
                origin_enc += label_enc
                label_ids += target_tags
        origin_enc = [cls_enc] + origin_enc + [sep_enc]
        label_ids = [label_list['O']] + label_ids + [label_list['O']]

        # Truncate to max_len while keeping the final (end) token.
        if len(origin_enc) > self._max_len:
            origin_enc = origin_enc[:self._max_len - 1] + origin_enc[-1:]
            label_ids = label_ids[:self._max_len - 1] + label_ids[-1:]
        return np.array(origin_enc, dtype='int64'), np.array(label_ids, dtype='int64')
def batchify_fn(for_test=False):
    """Build the collate function handed to the ``DataLoader``.

    The id padding value is read from the module-level ``tokenizer`` at the
    moment this factory is called; label sequences are padded with the
    ``O`` tag from ``label_list``.

    Args:
        for_test: When true, samples are bare ``input_ids`` arrays; otherwise
            they are ``(input_ids, label_ids)`` pairs.

    Returns:
        A callable taking a list of samples. In test mode it returns one
        stacked array of padded ``input_ids``; otherwise a list
        ``[padded_input_ids, padded_label_ids]``.
    """
    pad_ids = Pad(axis=0, pad_val=tokenizer.pad_token_id)
    if for_test:
        def collate_test(samples):
            # np.vstack replaces np.row_stack, a deprecated alias of the
            # same function.
            return np.vstack(list(pad_ids(samples)))
        return collate_test

    collator = Tuple(pad_ids, Pad(axis=0, pad_val=label_list['O']))

    def collate_train(samples):
        return list(collator(samples))
    return collate_train
def get_data_loader(data, tokenizer, batch_size=32, max_len=512, for_test=False):
    """Wrap raw sample lines in a ``MyDataset`` and return a ``DataLoader``.

    Batches are shuffled unless ``for_test`` is true. The collate function
    comes from ``batchify_fn``, which reads the module-level ``tokenizer``
    rather than the ``tokenizer`` argument passed here.
    """
    return DataLoader(
        dataset=MyDataset(data, tokenizer, max_len, for_test),
        batch_size=batch_size,
        collate_fn=batchify_fn(for_test),
        shuffle=not for_test,
    )
与之前不同的是模型换成了Token分类。由于Accuracy不再适用于Token分类,我们用Perplexity来大致衡量预测的准确度(接近1为最佳)。
import paddle
from paddle.static import InputSpec
from paddlenlp.metrics import Perplexity
# Model and tokenizer (three output classes: B, I, O)
model = SkepForTokenClassification.from_pretrained('skep_ernie_1.0_large_ch', num_classes=3)
tokenizer = SkepTokenizer.from_pretrained('skep_ernie_1.0_large_ch')
# Settings; valid dataset names: cotebd, cotedp, cotemfw
data_name = 'cotedp' # change this value to switch datasets
## Training hyperparameters
epochs = 1
learning_rate = 2e-5
batch_size = 8
max_len = 512
## Data
train_dataloader = get_data_loader(data_dict[data_name]['train'], tokenizer, batch_size, max_len, for_test=False)
# NOTE(review): `input` shadows the builtin of the same name.
input = InputSpec((-1, -1), dtype='int64', name='input')
# NOTE(review): the label spec declares a trailing dimension of 3 while the
# collate function pads per-token label ids -- confirm this is what
# paddle.Model expects.
label = InputSpec((-1, -1, 3), dtype='int64', name='label')
# Rebinds `model` from the network to its paddle.Model wrapper.
model = paddle.Model(model, [input], [label])
# Prepare the model: optimizer, loss and metric
optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, parameters=model.parameters())
model.prepare(optimizer, loss=paddle.nn.CrossEntropyLoss(), metrics=[Perplexity()])
# Start training; checkpoints are written under ./checkpoints
model.fit(train_dataloader, batch_size=batch_size, epochs=epochs, save_freq=5, save_dir='./checkpoints', log_freq=200)
step ?200/9381 - loss: 0.0223 - Perplexity: 1.0659 - 248ms/step
step ?400/9381 - loss: 0.0967 - Perplexity: 1.0434 - 247ms/step
step ?600/9381 - loss: 0.0789 - Perplexity: 1.0363 - 244ms/step
step ?800/9381 - loss: 0.0090 - Perplexity: 1.0323 - 244ms/step
step 1000/9381 - loss: 0.0143 - Perplexity: 1.0299 - 243ms/step
step 1200/9381 - loss: 0.0210 - Perplexity: 1.0279 - 244ms/step
step 1400/9381 - loss: 0.0419 - Perplexity: 1.0264 - 243ms/step
step 1600/9381 - loss: 0.0113 - Perplexity: 1.0252 - 244ms/step
step 1800/9381 - loss: 0.0369 - Perplexity: 1.0243 - 243ms/step
step 2000/9381 - loss: 0.0090 - Perplexity: 1.0236 - 242ms/step
step 2200/9381 - loss: 0.0095 - Perplexity: 1.0232 - 241ms/step
step 2400/9381 - loss: 0.0087 - Perplexity: 1.0226 - 241ms/step
step 2600/9381 - loss: 0.0055 - Perplexity: 1.0221 - 241ms/step
step 2800/9381 - loss: 0.0229 - Perplexity: 1.0216 - 241ms/step
step 3000/9381 - loss: 0.0281 - Perplexity: 1.0213 - 241ms/step
step 3200/9381 - loss: 0.0069 - Perplexity: 1.0209 - 241ms/step
step 3400/9381 - loss: 0.0036 - Perplexity: 1.0205 - 242ms/step
step 3600/9381 - loss: 0.0242 - Perplexity: 1.0203 - 242ms/step
step 3800/9381 - loss: 0.0070 - Perplexity: 1.0200 - 242ms/step
step 4000/9381 - loss: 0.0610 - Perplexity: 1.0197 - 242ms/step
step 4200/9381 - loss: 0.0042 - Perplexity: 1.0195 - 242ms/step
step 4400/9381 - loss: 0.0139 - Perplexity: 1.0193 - 242ms/step
step 4600/9381 - loss: 0.0333 - Perplexity: 1.0192 - 242ms/step
step 4800/9381 - loss: 0.0978 - Perplexity: 1.0189 - 242ms/step
step 5000/9381 - loss: 0.0255 - Perplexity: 1.0188 - 242ms/step
step 5200/9381 - loss: 0.0517 - Perplexity: 1.0187 - 242ms/step
step 5400/9381 - loss: 0.0153 - Perplexity: 1.0186 - 242ms/step
step 5600/9381 - loss: 0.0032 - Perplexity: 1.0185 - 242ms/step
step 5800/9381 - loss: 0.0134 - Perplexity: 1.0184 - 242ms/step
step 6000/9381 - loss: 0.0047 - Perplexity: 1.0183 - 242ms/step
step 6200/9381 - loss: 0.0198 - Perplexity: 1.0181 - 243ms/step
step 6400/9381 - loss: 0.0026 - Perplexity: 1.0180 - 243ms/step
step 6600/9381 - loss: 0.0200 - Perplexity: 1.0179 - 243ms/step
step 6800/9381 - loss: 0.0050 - Perplexity: 1.0179 - 243ms/step
step 7000/9381 - loss: 0.0191 - Perplexity: 1.0177 - 243ms/step
step 7200/9381 - loss: 0.0034 - Perplexity: 1.0176 - 244ms/step
step 7400/9381 - loss: 0.0127 - Perplexity: 1.0175 - 244ms/step
step 7600/9381 - loss: 0.0123 - Perplexity: 1.0174 - 244ms/step
step 7800/9381 - loss: 0.0934 - Perplexity: 1.0173 - 243ms/step
step 8000/9381 - loss: 0.0090 - Perplexity: 1.0173 - 243ms/step
step 8200/9381 - loss: 0.0314 - Perplexity: 1.0172 - 244ms/step
step 8400/9381 - loss: 0.0063 - Perplexity: 1.0172 - 244ms/step
step 8600/9381 - loss: 0.0073 - Perplexity: 1.0171 - 243ms/step
step 8800/9381 - loss: 0.0025 - Perplexity: 1.0170 - 244ms/step
step 9000/9381 - loss: 0.0121 - Perplexity: 1.0169 - 243ms/step
step 9200/9381 - loss: 0.0157 - Perplexity: 1.0169 - 244ms/step
step 9381/9381 - loss: 0.0330 - Perplexity: 1.0168 - 244ms/step
save checkpoint at /home/aistudio/checkpoints/0
save checkpoint at /home/aistudio/checkpoints/final
import re
# Load the fine-tuned weights
checkpoint_path = './checkpoints/final'  # path prefix of the saved checkpoint
model = SkepForTokenClassification.from_pretrained('skep_ernie_1.0_large_ch', num_classes=3)
input = InputSpec((-1, -1), dtype='int64', name='input')
model = paddle.Model(model, [input])
model.load(checkpoint_path)

# Test data (not shuffled, so predictions line up with data_dict order)
test_dataloader = get_data_loader(data_dict[data_name]['test'], tokenizer, batch_size, max_len, for_test=True)

# Output path per dataset
save_file = {
    'cotebd': './submission/COTE_BD.tsv',
    'cotedp': './submission/COTE_DP.tsv',
    'cotemfw': './submission/COTE_MFW.tsv',
}

# Collect, per test sample, the predicted tag ids and the padded input ids
predicts, input_ids = [], []
for batch in test_dataloader:
    predict = model.predict_batch(batch)
    predicts.extend(predict[0].argmax(axis=-1).tolist())
    input_ids.extend(batch.numpy().tolist())
# 先找到B所在的位置,即标号为0的位置,然后顺着该位置一直找到所有的I,即标号为1,即为所得。
def find_entity(prediction, input_ids):
    """Decode BIO tag predictions into entity strings.

    A span starts at a ``B`` tag and extends over the ``I`` tags that follow
    it. An ``I`` tag with no open span is ignored. A span is closed by an
    ``O`` tag, by the next ``B`` tag, or by the end of the sequence. The
    original version lost the span in the last two cases.

    Args:
        prediction: Sequence of predicted tag ids for one sample.
        input_ids: Token ids of the same sample, aligned with ``prediction``.

    Returns:
        A list of entity strings in order of appearance, each the
        concatenation of the span's tokens as returned by the module-level
        ``tokenizer``.
    """
    begin, inside, outside = label_list['B'], label_list['I'], label_list['O']
    entity = []
    entity_ids = []

    def close_span():
        # Convert the open span (if any) to text and record it.
        if entity_ids:
            entity.append(''.join(tokenizer.convert_ids_to_tokens(entity_ids)))

    for tag, token_id in zip(prediction, input_ids):
        if tag == begin:
            # A new span starts: keep the previous one instead of dropping it.
            close_span()
            entity_ids = [token_id]
        elif tag == inside:
            if entity_ids:
                entity_ids.append(token_id)
        elif tag == outside:
            close_span()
            entity_ids = []
    # Flush a span that runs to the end of the sequence.
    close_span()
    return entity
# Write the submission file for the selected opinion-extraction dataset.
import os

# Make sure the output directory exists; open() does not create it.
os.makedirs(os.path.dirname(save_file[data_name]), exist_ok=True)
# The with-statement closes the file, so no explicit close() is needed.
with open(save_file[data_name], 'w', encoding='utf8') as f:
    f.write("index\tprediction\n")
    for idx, sample in enumerate(data_dict[data_name]['test']):
        qid = sample.split('\t')[0]
        entity = find_entity(predicts[idx], input_ids[idx])
        # Remove the '##' sub-word marker and the literal '[UNK]' token.
        # Plain str.replace is used on purpose: as a regular expression,
        # '[UNK]' is a character class matching the single letters U, N
        # and K, not the literal token.
        entity = [e.replace('##', '').replace('[UNK]', '') for e in entity]
        # De-duplicate after cleaning (so strings that only differed in the
        # removed markers collapse), keep first-seen order for reproducible
        # output, and drop entries that became empty.
        entity = [e for e in dict.fromkeys(entity) if e]
        f.write(qid + '\t' + '\x01'.join(entity) + '\n')
!zip -r submission.zip submission
updating: submission/ (stored 0%)
updating: submission/COTE_BD.tsv (deflated 44%)
updating: submission/SE-ABSA16_CAME.tsv (deflated 65%)
updating: submission/ChnSentiCorp.tsv (deflated 63%)
updating: submission/COTE_DP.tsv (deflated 54%)
updating: submission/SE-ABSA16_PHNS.tsv (deflated 65%)
updating: submission/NLPCC14-SC.tsv (deflated 64%)
updating: submission/COTE_MFW.tsv (deflated 54%)?