目录
问题描述:
问题解决:
问题描述:
原始数据集中,标注以单词(word)为单位,且下标从 1 开始计数,如下图:
目标:将其转换为以字符(char)为单位、下标从 0 开始计数的数据格式。
问题解决:
# Convert the English dataset, whose annotations are stored per word, into the
# per-character layout so that it matches the format used for the Chinese data.
# `path` is opened for reading and `path_w` for writing by the conversion loop.
path = '/home/qtxu/Sentiment-SPN/data/Camera-COQE/dev.txt'
path_w = '/home/qtxu/Sentiment-SPN/data/Camera-COQE/dev_char.txt'
from pdb import set_trace as stop
def obtain_index(cur_ele):
    """Parse one bracketed annotation element into word indices and text.

    An element looks like ``'[10&&would 11&&not]'``: space-separated
    ``<1-based word index>&&<word>`` tokens wrapped in square brackets.
    Elements may contain two segments separated by ``' , '``, e.g.
    ``'[10&&would 11&&not , 17&&difference]'`` or
    ``"[13&&did 14&&n't , 20&&as 21&&well 22&&as]"``; in that case only the
    segment after the first separator is kept.

    Args:
        cur_ele: The raw element string, including the surrounding brackets.

    Returns:
        A ``(index_list, span_str)`` tuple. ``index_list`` holds the 0-based
        word indices (source index minus one) and ``span_str`` the words
        joined by single spaces. A two-character element (``'[]'``) yields
        ``([], '')``.

    Raises:
        ValueError: If a token's index part is not an integer.
        IndexError: If a token has no ``&&`` separator.
    """
    # A two-character element is the empty marker '[]'.
    if len(cur_ele) == 2:
        return [], ''

    # Locate the segment separator by the same ' ,' pattern that detects it,
    # so a word token that is itself a comma (e.g. '3&&,') is not mistaken
    # for the separator. Skipping three characters drops ' , ' as a whole.
    separator_pos = cur_ele.find(' ,')
    if separator_pos != -1:
        cur_ele = '[' + cur_ele[separator_pos + 3:]

    inner = cur_ele[1:-1]
    index_list = []
    words = []
    for token in inner.split(' '):
        # Split only once so a word that itself contains '&&' stays intact.
        fields = token.split('&&', 1)
        index_list.append(int(fields[0]) - 1)
        words.append(fields[1])
    return index_list, ' '.join(words)
def word_to_char(sentence, span, span_index):
    """Render a span as bracketed ``<char offset>&&<char>`` tokens.

    The offset of the span's first character is the length of the sentence
    words preceding ``span_index[0]`` joined by single spaces, plus one for
    the separating space unless the span starts at word 0. Every character
    of ``span`` (spaces included) then receives the next consecutive offset.

    Args:
        sentence: The sentence text; words are taken from ``split(' ')``.
        span: The span text to expand character by character.
        span_index: Word indices of the span; only the first one is used.

    Returns:
        ``'[]'`` when ``span`` is empty, otherwise the space-separated tokens
        wrapped in square brackets.
    """
    if len(span) == 0:
        return '[]'

    first_word = span_index[0]
    prefix = ' '.join(sentence.split(' ')[:first_word])
    # Account for the space between the prefix and the span, which only
    # exists when the span is not the first word of the sentence.
    base = len(prefix) + (0 if first_word == 0 else 1)

    tokens = [f"{base + step}&&{ch} " for step, ch in enumerate(span)]
    # Each token carries a trailing space; strip the whitespace at the end.
    return '[' + ''.join(tokens).rstrip() + ']'
# Copy the input file to the output file, rewriting each word-indexed
# annotation line into its char-indexed equivalent.
with open(path, 'r') as fr, open(path_w, 'w') as fw:
    for line in fr:
        # A sentence line has exactly two tab-separated fields. Any other
        # field count makes the unpacking raise ValueError, which marks the
        # line as an annotation line. Only that error is caught, so I/O
        # failures and interrupts are no longer swallowed.
        try:
            sent, label = line.strip().split('\t')
        except ValueError:
            pass
        else:
            fw.write(line)
            continue

        # Fully empty annotations need no conversion.
        if '[[];[];[];[];[]]' in line:
            fw.write(line)
            continue

        # Drop the outer brackets and split into the five elements.
        cur_line = line.strip()[1:-1]
        sub, obj, asp, op, polarity = cur_line.split(';')

        sub_index, sub_span = obtain_index(sub)
        obj_index, obj_span = obtain_index(obj)
        asp_index, asp_span = obtain_index(asp)
        op_index, op_span = obtain_index(op)

        # `sent` is the most recent sentence line seen above; annotation
        # lines are resolved against it.
        sub_char = word_to_char(sent, sub_span, sub_index)
        obj_char = word_to_char(sent, obj_span, obj_index)
        asp_char = word_to_char(sent, asp_span, asp_index)
        op_char = word_to_char(sent, op_span, op_index)

        # The polarity element is copied through unchanged.
        char_quintuple = f"[{sub_char};{obj_char};{asp_char};{op_char};{polarity}]"
        fw.write(char_quintuple + '\n')