import string
import re
from nltk.util import ngrams
class CleanerDedupLineByNgram:
    """Remove lines that are near-duplicates of the previously kept line.

    A text is split into lines, each line is split into "grams" on
    punctuation and spaces, and the set of n-grams of a line is compared
    (Jaccard similarity) with the n-gram set of the most recently kept
    line. A line is kept only when that similarity is below a threshold.
    """

    def __init__(self):
        # Separators used to split a text into lines; the first one is also
        # used to join the surviving lines back together.
        self.line_delimiter = list("\n")
        chinese_punctuation = ",。!?:;“”‘’()《》【】、|—"
        # Several marks in the literal above are in ASCII form, which
        # string.punctuation already covers. Add the full-width code points
        # (， ！ ？ ： ； （ ） ｜) so punctuation as typed in Chinese text
        # also separates grams.
        fullwidth_punctuation = "\uff0c\uff01\uff1f\uff1a\uff1b\uff08\uff09\uff5c"
        # Separators used to split one line into grams.
        self.gram_delimiter = (
            list(string.punctuation)
            + list(chinese_punctuation)
            + list(fullwidth_punctuation)
            + [' ']
        )

    def clean_single_text(self, text: str, n: int = 5, thre_sim: float = 0.95) -> str:
        """Return ``text`` without lines too similar to the last kept line.

        Args:
            text: Input text; it is split on ``self.line_delimiter`` and
                empty lines are discarded.
            n: N-gram order. A line with fewer than ``n`` grams uses its own
                gram count as the order instead.
            thre_sim: A line is kept only if its Jaccard similarity with the
                previously kept line is strictly below this value.

        Returns:
            The kept lines joined with ``self.line_delimiter[0]``.
        """
        # Build both patterns once per call instead of once per line. They
        # are derived from the instance attributes at call time, so changes
        # made to the delimiter lists after construction are still honoured.
        line_pattern = re.compile('|'.join(map(re.escape, self.line_delimiter)))
        gram_pattern = re.compile('|'.join(map(re.escape, self.gram_delimiter)))

        kept_lines = []
        last_ngrams = None  # n-gram set of the most recently kept line
        for line in line_pattern.split(text):
            if line == '':
                continue
            grams = [gram for gram in gram_pattern.split(line) if gram != '']
            cur_ngrams = set(ngrams(grams, min(len(grams), n)))

            if last_ngrams is None:
                # The first non-empty line is always kept.
                keep = True
            else:
                union_size = len(last_ngrams | cur_ngrams)
                # Two lines without any n-gram are treated as dissimilar.
                jaccard_sim = (
                    len(last_ngrams & cur_ngrams) / union_size
                    if union_size != 0 else 0
                )
                keep = jaccard_sim < thre_sim

            if keep:
                kept_lines.append(line)
                # Only kept lines become the reference for later comparisons.
                last_ngrams = cur_ngrams

        return self.line_delimiter[0].join(kept_lines)
隐私过滤
去除身份证号:对每个输入的文本,下面使用正则替换的方式将匹配到的身份证号替换为特定字符串
from utils.rules.regex import REGEX_IDCARD
from utils.cleaner.cleaner_base import CleanerBase
class CleanerSubstitutePassageIDCard(CleanerBase):
    """Cleaner that replaces REGEX_IDCARD matches in a text with a mask string.

    NOTE(review): ``CleanerBase`` and ``REGEX_IDCARD`` are imported from the
    project and are not visible here; the substitution itself is delegated
    to ``CleanerBase._sub_re``.
    """

    def __init__(self):
        super().__init__()

    def clean_single_text(self, text: str, repl_text: str = "**MASKED**IDCARD**") -> str:
        """Return the result of substituting ID-card matches in ``text``.

        Args:
            text: The text to clean.
            repl_text: Replacement passed to ``_sub_re`` for each match.

        Returns:
            Whatever ``self._sub_re`` returns — presumably ``text`` with every
            REGEX_IDCARD match replaced by ``repl_text``; confirm against
            ``CleanerBase``.
        """
        # Match ID-card numbers with REGEX_IDCARD and replace them by repl_text.
        return self._sub_re(text=text, re_text=REGEX_IDCARD, repl_text=repl_text)
题目要求: This time let us consider the situation in the movie "Live and Let Die" in which James Bond, the world's most famous spy, was captured by a group of drug dealers. He was sent to a small piece of land at the center of a lake fi…