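# Note to self: I often cannot find code I wrote earlier when I need it again, so I am recording it here.
# Converts the raw COQE-format data into the JSON format required by the large language model; the instruction text can be changed to suit your own needs.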
from tqdm import tqdm, trange
import os
import re
from typing import List
import json
from pdb import set_trace as stop
# Input file in the raw COQE layout (sentence lines followed by label lines).
# NOTE(review): the input is "test.txt" while the output below is "dev_w.json" -- confirm the split names are intended.
pipeline_data_path = "/public/home/hongy/qtxu/Qwen-main/data/version2/Ele-COQE/test.txt"
# Destination of the converted conversation-style JSON.
jsonl_data_path = "/public/home/hongy/qtxu/Qwen-main/data/version2/Ele-COQE/dev_w.json"

# Polarity id -> English label (not referenced by the code visible in this file).
dic_en = {
    -1: "worse",
    0: "equal",
    1: "better",
    2: "different",
}

# Polarity id -> Chinese label used when rendering the assistant answer.
dic_zh = {
    -1: "更差",
    0: "相等",
    1: "更好",
    2: "不同",
}
def str_to_span(input_str: str) -> tuple:
    """Split an annotated span into its position string and its surface text.

    Each whitespace-separated token has the form ``index&text``; for example
    ``'3&高 4&端 5&机'`` becomes ``('3:4:5', '高端机')``.

    Args:
        input_str: The annotated span. A standalone `` , `` separates
            discontinuous parts and is ignored.

    Returns:
        ``(indexs_str, span_str)``: the token indexes joined with ``':'`` and
        the token texts concatenated without a separator. Both are ``''``
        when the input contains no tokens.

    Raises:
        ValueError: If a token contains no ``&`` separator.
    """
    # '21&没 22&有 , 25&细 26&致' --> '21&没 22&有 25&细 26&致'
    tokens = input_str.replace(' , ', ' ').split()
    if not tokens:
        # Covers the empty string as well as whitespace-only input.
        return '', ''
    # Split on the FIRST '&' only, so a token whose text itself contains '&'
    # (e.g. '7&&' or '5&AT&T') keeps its full text instead of being truncated.
    indexs, span = zip(*(token.split('&', 1) for token in tokens))
    return ':'.join(indexs), ''.join(span)
# Matches one "[[sub];[obj];[asp];[op];[polarity]]" label and captures its five bracketed fields.
_QUINTUPLE_PATTERN = re.compile(r'\[\[(.*?)\];\[(.*?)\];\[(.*?)\];\[(.*?)\];\[(.*?)\]\]')

# Assistant answer emitted for a sentence flagged as non-comparative.
_EMPTY_ANSWER = "第1个五元组:(,,,,)\n元组位置:(,,,)\n"


def process_line(text_line: str, label_line: str, kind: str, i: int) -> dict:
    """Build one conversation sample from a sentence line and its label lines.

    Args:
        text_line: ``"<sentence>\\t<flag>"`` where flag is ``1`` for a
            comparative sentence and ``0`` otherwise.
        label_line: The label text that followed the sentence; every
            ``[[..];[..];[..];[..];[..]]`` group in it is one quintuple
            (subject, object, aspect, opinion, polarity id).
        kind: Accepted for interface compatibility but not read here; the
            polarity is always rendered through ``dic_zh``.
        i: Running sample number, used to build the sample id.

    Returns:
        ``{'id': 'identity_<i>', 'conversations': [user_turn, assistant_turn]}``.
        For a comparative sentence whose labels match no quintuple, the
        assistant value is the empty string.

    Raises:
        IndexError: If ``text_line`` contains no tab.
        ValueError: If the flag is not an integer, or is neither 0 nor 1.
            (Previously such a flag made the function return ``None``, which
            ended up as ``null`` in the output JSON.)
        KeyError: If a polarity id is not a key of ``dic_zh``.
    """
    fields = text_line.split('\t')
    text = fields[0].strip()
    have_triples = int(fields[1])
    if have_triples not in (0, 1):
        raise ValueError(
            f"unexpected comparative flag {have_triples!r} in line {text_line!r}; expected 0 or 1"
        )

    # NOTE(review): the prompt names the neutral polarity "等同" while dic_zh
    # emits "相等", and "一版" looks like a typo for "一般". The text is kept
    # byte-identical because changing it alters the generated dataset -- confirm.
    instruction = (
        "对比观点抽取任务:从输入语句中抽取所有的对比观点五元组(比较对象,被比较对象,对比属性,观点句,观点极性),抽取的元素允许为空。\n\n"
        "对比观点五元组解释:比较对象、被比较对象、对比属性和观点句必须是输入语句中出现的短语。比较对象和被比较对象可以是商品名、商品型号或代词。对比属性是对比的角度或方面,一版是比较对象和被比较对象的属性。观点句包含主观情感的短语。观点极性包括:更差、等同、更好、不同。\n\n"
        f"请从输入语句({text})中抽取所有的对比观点五元组,并给出比较对象、被比较对象、对比属性、观点句在输入语句中的位置(位置从0开始编号):"
    )

    if have_triples == 0:
        answer = _EMPTY_ANSWER
    else:
        rendered = []
        matches = _QUINTUPLE_PATTERN.findall(label_line)
        for number, (sub, obj, asp, op, polarity) in enumerate(matches, start=1):
            sub_index, sub_span = str_to_span(sub)
            obj_index, obj_span = str_to_span(obj)
            asp_index, asp_span = str_to_span(asp)
            op_index, op_span = str_to_span(op)
            polarity_zh = dic_zh[int(polarity)]
            quintuple_span = f"({sub_span},{obj_span},{asp_span},{op_span},{polarity_zh})"
            quintuple_indexs = f"({sub_index},{obj_index},{asp_index},{op_index})"
            rendered.append(f"第{number}个五元组:{quintuple_span}\n元组位置:{quintuple_indexs}\n")
        answer = ''.join(rendered)

    # Key order is kept as in the serialized output: from, is_compare, value.
    return {
        'id': f"identity_{i}",
        'conversations': [
            {"from": "user", "is_compare": have_triples, "value": instruction},
            {"from": "assistant", "value": answer},
        ],
    }
def load_data(path: str, kind: str) -> list:
    """Read a raw COQE file and convert every sentence into a sample.

    A line that splits into exactly two tab-separated fields starts a new
    sentence; every other line is accumulated as label text for the sentence
    currently being collected.

    Args:
        path: Path of the raw text file, read as UTF-8.
        kind: Passed unchanged to ``process_line``.

    Returns:
        The list of samples produced by ``process_line``, numbered from 0 in
        file order. Empty when the file contains no sentence line.
    """
    # Pin the encoding: the data is Chinese text and the locale default may not be UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        raw_data = f.readlines()

    all_samples = []
    text_line, label_line = '', ''
    for cur_line in tqdm(raw_data, desc='processing data for mode'):
        if len(cur_line.split('\t')) != 2:
            # Not a "<sentence>\t<flag>" line: label text of the pending sentence.
            label_line += '\n' + cur_line
            continue
        # A new sentence starts, so flush the one collected so far.
        if text_line:
            all_samples.append(process_line(text_line, label_line, kind, len(all_samples)))
        text_line, label_line = cur_line, ''

    # Flush the last sentence; skipped when the file held no sentence line at
    # all, which would otherwise make process_line fail on an empty string.
    if text_line:
        all_samples.append(process_line(text_line, label_line, kind, len(all_samples)))
    return all_samples
# NOTE(review): 'en' is passed although the data path is Ele-COQE and
# process_line does not read `kind` -- confirm the intended value.
kind = 'en'
json_data = load_data(pipeline_data_path, kind)
# ensure_ascii=False writes the Chinese text as-is, so the file encoding is
# pinned to UTF-8 instead of relying on the locale default.
with open(jsonl_data_path, 'w', encoding='utf-8') as fw:
    fw.write(json.dumps(json_data, ensure_ascii=False))
# First day of 2024 -- keep going, fellow workers~