import pkuseg
# Create a pkuseg segmenter, passing ./data/dict.txt as the user dictionary.
c = pkuseg.pkuseg(user_dict="./data/dict.txt")
# Sample sentence to segment.
sentence = '数字传播实验班'
# Segment the sentence and print the result.
print(c.cut(sentence))
字典中包含“数字传媒与人文学院”，添加自定义词典后，文本被错误地分成“数字传 播 实验班”；调试发现 solve 方法有 bug。
GPT4分析:
修改后的方法:
def solve(self, txt):
    """Split *txt* into user-dictionary words and the text between them.

    The text is scanned left to right. At each position the trie rooted at
    ``self.trie`` is followed for as long as the next character is a child
    of the current node, and the longest complete word seen on that walk
    (a node whose ``isword`` is true) is taken as a match. A prefix of a
    dictionary entry that is not itself a word never produces a match, so
    a partially matching entry cannot split the text.

    Args:
        txt: The string to pre-segment.

    Returns:
        A tuple ``(outlst, iswlst, taglst)`` of three parallel lists:
        the segments in order, whether each segment is a dictionary word,
        and the ``usertag`` of the matched trie node ('' for segments that
        are not dictionary words).
    """
    outlst = []
    iswlst = []
    taglst = []
    length = len(txt)
    last = 0  # start of the text not yet emitted
    i = 0
    while i < length:
        now = self.trie
        j = i
        found = False
        usertag = ''
        last_word_idx = -1  # index of the last char of the longest word from i
        while j < length:
            c = txt[j]
            if c not in now.children:
                break
            now = now.children[c]
            if now.isword:
                # Remember this word but keep walking: a longer one may follow.
                found = True
                last_word_idx = j
                usertag = now.usertag
            j += 1
        if found:
            if last != i:
                # Emit the unmatched text that precedes the dictionary word.
                outlst.append(txt[last:i])
                iswlst.append(False)
                taglst.append('')
            outlst.append(txt[i:last_word_idx + 1])
            iswlst.append(True)
            taglst.append(usertag)
            # Resume right after the matched word, not where the walk stopped.
            last = last_word_idx + 1
            i = last_word_idx + 1
        else:
            i += 1
    if last < length:
        # Emit whatever trails the last dictionary word.
        outlst.append(txt[last:length])
        iswlst.append(False)
        taglst.append('')
    return outlst, iswlst, taglst
重新运行结果: