背景
老人家不识字,在城市生活不便。他喜欢去基督教堂,但是听不懂,也难以和姊妹们(老头老太太们)交流。于是我想教他识字。从哪里教起呢?不如从《圣经》的常用字词开始吧。于是我花了几分钟把《圣经》的热词统计了一下,代码如下。
代码
# 从 网上下载圣经文字版本: https://drive.my-elibrary.com/%E8%97%8F%E4%B9%A6%E5%85%A5%E5%8F%A3/%E7%B3%BB%E7%BB%9F/%E5%9C%A3%E7%BB%8F%E7%89%88%E6%9C%AC%20%20%20%E4%B8%8E%20%20%20%E5%9C%A3%E7%BB%8F%E7%A0%94%E8%AF%BB/sj/
# 安装jieba分词: pip install jieba
from collections import Counter

import jieba
# Count how often each jieba token occurs in the Bible text, print the
# TOP_N most frequent tokens with their counts, and save those tokens
# (comma-separated) to a text file.
INPUT_PATH = "/Users/xxxx/Downloads/sj.txt"
OUTPUT_PATH = "/Users/xxxx/hotwds.txt"
TOP_N = 200

# Tokens skipped while counting: whitespace, punctuation, digits and the
# particle '的'. A set gives O(1) membership tests for every token.
# NOTE(review): only the exact forms listed here are filtered; any other
# punctuation variant present in the input will be counted as a word.
stop_words = {
    ' ', '\u3000', '\n',
    '“', '”', '‘', '’',
    ',', '。', ', ', '. ',
    ':', ': ', ';', '; ', '、',
    '!', '?', '(', ')',
    '的',
    *'1234567890',
}

counts = Counter()
count = 0  # number of lines read from the input file

# The encoding is explicit so the result does not depend on the platform's
# locale default.
# NOTE(review): assumes the downloaded text is UTF-8 -- confirm.
with open(INPUT_PATH, encoding="utf-8") as sj:
    for line in sj:
        count += 1
        counts.update(
            word for word in jieba.lcut(line) if word not in stop_words
        )
print(count)

# most_common() orders by frequency, highest first; tokens with equal
# counts keep the order in which they were first seen.
items = counts.most_common()
print(len(counts))

# Slicing (instead of indexing 0..199) keeps the script working when the
# text yields fewer than TOP_N distinct tokens.
top_items = items[:TOP_N]
for word, freq in top_items:
    print("{0:<10}{1:>5}".format(word, freq))

# Every token is followed by ", " (including the last one), matching the
# format the output file has always had.
res = "".join(word + ", " for word, _ in top_items)

# The output file is opened only after counting succeeded, so a failed
# read no longer leaves a truncated, empty result file behind.
with open(OUTPUT_PATH, "w", encoding="utf-8") as hotwds:
    hotwds.write(res)