Go语言开发后台框架不能只有CRUD还需有算法集成基础功能-GoFly框架集成了自然语言处理(NLP)分词、关键词提取和情感分析

前言

Go语言开发框架，我们要把Go的优势体现在框架中，不仅要有CRUD常规操作，还要把常用的算法能力自己集成到框架中，而不是去购买第三方提供的服务接口。作为开发者可以拓宽自己的技术面，获取更多成就感，同时也提升自己的竞争力和自身价值。对于项目来说，我们可以减少长期的额外成本和接口维护成本。有很多成熟的算法模型都可直接拿来用，效果和购买第三方接口服务是一样的，很多第三方接口也是基于这些算法模型集成，再把集成后的接口对外售卖。

好了，接下来我们开始介绍本文自然语言处理插件了。自然语言处理（NLP）是人工智能领域的一个重要分支，它致力于让计算机能够理解、处理和生成人类语言。近年来，自然语言处理技术的发展迅速，并且在多个领域的应用已经进入到实际落地，同时它也拓宽了软件开发的新境界，为信息技术和各个行业带来了革命性的变革。本插件我们给大家集成了文本分词、预处理、提取关键词、情感分析。我们集成这个情感分析不依赖三方插件，方便安装使用，不像网上很多是依赖python，使用时还要本地编译后再调用，很麻烦。所以我们才花时间去搞纯Go版本。

应用方向

1.分词

分词功能在文本预处理中应用较多，下面这些功能都是先对文本进行分词处理，再根据需要编写业务代码，如：词干提取、关键词提取、词云图片。

搜索引擎：用于建立搜索引擎索引时对文档进行分词处理，以便于检索和匹配用户查询。
文本分类：对文本进行分类之前，需要对文本进行分词处理，提取特征。
信息检索：在信息检索系统中，分词是将用户查询和文档内容进行匹配的重要步骤。
机器翻译：在进行机器翻译时，需要先将待翻译文本进行分词，然后进行翻译处理。
情感分析：对文本进行情感分析时，首先需要对文本进行分词，提取出情感相关的词语。

2.情感分析

商品评论情感分析获取评论分数、客服聊天分析客户反馈信息的满意度和工作人员服务态度统计。它可以帮助企业更好地了解用户及其客户服务满意度，因此在市场营销、用户体验优化等方面非常有用，更多应用范围如下：

社交媒体：识别用户在Twitter、Facebook等平台上表达的情感倾向，以了解趋势和市场。
电子商务：分析客户对产品和服务的评价，以提高产品质量和客户满意度。
新闻媒体：分析读者对新闻文章的反馈，以了解热门话题和观点。
人力资源：分析员工对公司文化和工作环境的评价，以提高员工满意度和竞争力。

情感分析算法说明：

对字符串集合执行朴素贝叶斯（Naive Bayesian）分类，可分到任意数量的类别，并支持词频-逆文档频率（TF-IDF）计算。

算法库地址：https://github.com/jbrukh/bayesian

使用前还需要对模型训练，安装好插件后我们在后台看到如下截图界面，安装界面提示进行训练。

使用介绍

引入插件

import (
    "gofly/utils/plugin"
)

1.情感接口函数

情感分析的训练

// isNew: whether to regenerate the result file; true = regenerate, false = append
err := plugin.GonlpTraining(true)

计算文本情感值和预警级别

distance, err := plugin.GonlpCalcSemti("我们公司的股票今天涨了，太开心了。")

计算文本情感值和预警级别-接口示例

// NlpSentiment is a demo endpoint that runs sentiment analysis on the "text"
// request parameter and responds with the value returned by
// plugin.GonlpCalcSemti.
//
// It responds with a failure when "text" is missing or empty, or when the
// sentiment calculation returns an error.
func (api *Test) NlpSentiment(c *gf.GinCtx) {
	// The parse error is deliberately not reported on its own: a request whose
	// parameters could not be read is expected to end up with an empty text
	// and is rejected by the check below. NOTE(review): confirm against
	// gf.RequestParam.
	param, _ := gf.RequestParam(c)

	// Convert before testing for emptiness. Comparing the raw map value with ""
	// does not catch a missing key, because a nil value never equals "".
	text := gf.String(param["text"])
	if text == "" {
		gf.Failed().SetMsg("参数text不能为空").Regin(c)
		return
	}

	result, err := plugin.GonlpCalcSemti(text)
	if err != nil {
		gf.Failed().SetMsg(err.Error()).Regin(c)
		return
	}
	gf.Success().SetMsg("计算文本情感值和预警级别").SetData(result).Regin(c)
}

接口测试截图

计算结果：

{
  "sentiType": 1,
  "sentiText": "正面",
  "scores": [
    -204.7069297251559,
    -228.64907138697052
  ]
}

2.Seg分词接口函数

加载默认词典 -简体中文

 plugin.Seg.LoadDict()

更多词典配置：

// Load the default embedded dictionary
 plugin.Seg.LoadDictEmbed()
// Load the Simplified Chinese dictionary
  plugin.Seg.LoadDict("zh_s")
 plugin.Seg.LoadDictEmbed("zh_s")
// Load the Traditional Chinese dictionary
plugin.Seg.LoadDict("zh_t")
// Load the Japanese dictionary
 plugin.Seg.LoadDict("jp")

向词典添加新词（token）

eg.AddToken("太空针", 100)
plugin.Seg.AddToken("太空针", 100, "n")

检查词（token）是否已添加到词典

freq, pos, ok := plugin.Seg.Find("太空针")
// freq=100, pos=n, ok reports whether the word exists

从词典移除词（token）

err = plugin.Seg.RemoveToken("太空针")

分词的使用

// 1. Plain segmentation; passing true uses DAG and HMM
hmm :=  plugin.Seg.Cut(text, true)
cut :=  plugin.Seg.Cut(text)
// 2. Search-engine mode: cut the string into words; passing true makes the search use HMM
hmm =  plugin.Seg.CutSearch(text, true)
hmm =  plugin.Seg.CutSearch(text)
// 3. Full mode: cut the string into words
cut =  plugin.Seg.CutAll(text)
// 4. Join the segmented words into a single string, separated by ", "
s :=  plugin.Seg.CutStr(cut, ", ")

把分割的string数组转string

cstr :=  plugin.Seg.CutStr(hmm, ", ")

Trim 去除字符串数组中字符的符号、空格和点

cut = plugin.Seg.Trim(cut)

使用DAG、HMM和正则表达式（regexp）切分字符串

reg := regexp.MustCompile(`(\d+年|\d+月|\d+日|[\p{Latin}]+|[\p{Hangul}]+|\d+\.\d+|[a-zA-Z0-9]+)`)
text1 := `搭建的测试, 2024年09月18日, 3.18`
hmm = plugin.Seg.CutDAG(text1, reg)

关键词提取

通过标签提取关键字：

// Keyword extraction by tags: bind the segmenter to the extractor, load the
// IDF data, then ask for up to 5 tags from text.
var te idf.TagExtracter
te.WithGse(plugin.Seg)
if err := te.LoadIdf(); err != nil {
	fmt.Println("load idf: ", err)
}
segments := te.ExtractTags(text, 5)

通过文本排名提取关键字：

var tr idf.TextRanker
tr.WithGse(plugin.Seg)// plugin.Seg is the segmenter object
results := tr.TextRank(text, 5)

返回数据格式：[{"Text":"科幻片","Weight":1.6002581704125},{"Text":"全片","Weight":1.449761569875},{"Text":"摄影机","Weight":1.2764747747375}]，其中Text是关键词，Weight是权重。

3.seg包完整代码示例：

var (

	// text is the default Chinese sentence used by the segmentation examples.
	text  = "《复仇者联盟3：无限战争》是全片使用IMAX摄影机拍摄制作的的科幻片."
	// text1 is the -text command-line flag; it defaults to text.
	text1 = flag.String("text", text, "要分词的文本")

	// text2 is a mixed Chinese/English sample used by the part-of-speech examples.
	text2 = "西雅图地标建筑, Seattle Space Needle, 西雅图太空针. Sky tree."
)

// main runs the demo end to end: it parses the command-line flags, loads the
// default dictionary, and then calls each example in turn.
func main() {
	flag.Parse()

	// Load the default dictionary. The error used to be dropped; report it and
	// stop, since the examples below segment text with this dictionary.
	if err := plugin.Seg.LoadDict(); err != nil {
		fmt.Println("load dict: ", err)
		return
	}
	// Loading the default dictionary with embed
	//   plugin.Seg.LoadDictEmbed()
	//
	// Loading the simple chinese dictionary
	//   plugin.Seg.LoadDict("zh_s")
	//   plugin.Seg.LoadDictEmbed("zh_s")
	//
	// Loading the traditional chinese dictionary
	//   plugin.Seg.LoadDict("zh_t")
	//
	// Loading the japanese dictionary
	//   plugin.Seg.LoadDict("jp")
	//
	//   plugin.Seg.LoadDict("../data/dict/dictionary.txt")
	//
	// Loading the custom dictionary
	//   plugin.Seg.LoadDict("zh,../../testdata/zh/test_dict.txt,../../testdata/zh/test_dict1.txt")

	addToken()

	cut()
	cutPos()
	segCut()

	extAndRank(plugin.Seg)
}

// addToken demonstrates dictionary maintenance on plugin.Seg: adding words,
// re-adding a word with a part-of-speech tag, looking it up, and removing it.
//
// The first add and the final remove always print their result, as before.
// The other calls print only when they fail, so their errors are no longer
// silently discarded.
func addToken() {
	report := func(op string, err error) {
		if err != nil {
			fmt.Println(op, err)
		}
	}

	err := plugin.Seg.AddToken("《复仇者联盟3：无限战争》", 100, "n")
	fmt.Println("add token: ", err)

	report("add token: ", plugin.Seg.AddToken("西雅图中心", 100))
	report("add token: ", plugin.Seg.AddToken("西雅图太空针", 100, "n"))
	report("add token: ", plugin.Seg.AddToken("Space Needle", 100, "n"))
	//   plugin.Seg.AddTokenForce("上海东方明珠广播电视塔", 100, "n")

	// Add the word without a tag first, then re-add it with the "n" tag.
	report("add token: ", plugin.Seg.AddToken("太空针", 100))
	report("re-add token: ", plugin.Seg.ReAddToken("太空针", 100, "n"))

	freq, pos, ok := plugin.Seg.Find("太空针")
	fmt.Println("seg.Find: ", freq, pos, ok)

	// plugin.Seg.CalcToken()
	err = plugin.Seg.RemoveToken("太空针")
	fmt.Println("remove token: ", err)
}

// cut demonstrates the segmentation modes of plugin.Seg on the package-level
// text: plain cut (with and without HMM), search-engine mode, full mode,
// joining the result into a string, and finally DAG cutting with a regexp.
//
// The sample outputs in the comments are the ones given with the example.
func cut() {
	// "《复仇者联盟3：无限战争》是全片使用IMAX摄影机拍摄制作的的科幻片."

	// use DAG and HMM
	hmm := plugin.Seg.Cut(text, true)
	fmt.Println("cut use hmm: ", hmm)
	// cut use hmm:  [《复仇者联盟3：无限战争》 是 全片 使用 imax 摄影机 拍摄 制作 的 的 科幻片 .]

	// words is deliberately not named "cut", to avoid shadowing this function.
	words := plugin.Seg.Cut(text)
	fmt.Println("cut: ", words)
	// cut:  [《 复仇者 联盟 3 ： 无限 战争 》 是 全片 使用 imax 摄影机 拍摄 制作 的 的 科幻片 .]

	hmm = plugin.Seg.CutSearch(text, true)
	fmt.Println("cut search use hmm: ", hmm)
	//cut search use hmm:  [复仇 仇者 联盟 无限 战争 复仇者 《复仇者联盟3：无限战争》 是 全片 使用 imax 摄影 摄影机 拍摄 制作 的 的 科幻 科幻片 .]
	fmt.Println("analyze: ", plugin.Seg.Analyze(hmm, text))

	words = plugin.Seg.CutSearch(text)
	fmt.Println("cut search: ", words)
	// cut search:  [《 复仇 者 复仇者 联盟 3 ： 无限 战争 》 是 全片 使用 imax 摄影 机 摄影机 拍摄 制作 的 的 科幻 片 科幻片 .]

	words = plugin.Seg.CutAll(text)
	fmt.Println("cut all: ", words)
	// cut all:  [《复仇者联盟3：无限战争》 复仇 复仇者 仇者 联盟 3 ： 无限 战争 》 是 全片 使用 i m a x 摄影 摄影机 拍摄 摄制 制作 的 的 科幻 科幻片 .]

	s := plugin.Seg.CutStr(words, ", ")
	fmt.Println("cut all to string: ", s)
	// cut all to string:  《复仇者联盟3：无限战争》, 复仇, 复仇者, 仇者, 联盟, 3, ：, 无限, 战争, 》, 是, 全片, 使用, i, m, a, x, 摄影, 摄影机, 拍摄, 摄制, 制作, 的, 的, 科幻, 科幻片, .

	analyzeAndTrim(words)

	reg := regexp.MustCompile(`(\d+年|\d+月|\d+日|[\p{Latin}]+|[\p{Hangul}]+|\d+\.\d+|[a-zA-Z0-9]+)`)
	// dated is deliberately not named "text1", to avoid shadowing the package-level flag.
	dated := `搭建的测试, 2024年09月18日, 3.18`
	hmm = plugin.Seg.CutDAG(dated, reg)
	// Index only when the result is long enough; the number of segments
	// depends on the loaded dictionary, and hmm[6] would otherwise panic.
	if len(hmm) > 6 {
		fmt.Println("Cut with hmm and regexp: ", hmm, hmm[0], hmm[6])
	} else {
		fmt.Println("Cut with hmm and regexp: ", hmm)
	}
}

// analyzeAndTrim prints the analysis of the given segments, then the segments
// after plugin.Seg.Trim, and finally the tagged-string and slice forms of the
// package-level text2.
func analyzeAndTrim(cut []string) {
	fmt.Println("analyze the segment: ", plugin.Seg.Analyze(cut, ""))
	// analyze the segment:

	trimmed := plugin.Seg.Trim(cut)
	fmt.Println("cut all: ", trimmed)
	// cut all:  [复仇者联盟3无限战争 复仇 复仇者 仇者 联盟 3 无限 战争 是 全片 使用 i m a x 摄影 摄影机 拍摄 摄制 制作 的 的 科幻 科幻片]

	tagged := plugin.Seg.String(text2, true)
	fmt.Println(tagged)
	// 西雅图/nr 地标/n 建筑/n ,/x  /x seattle/x  /x space needle/n ,/x  /x 西雅图太空针/n ./x  /x sky/x  /x tree/x ./x

	parts := plugin.Seg.Slice(text2, true)
	fmt.Println(parts)
	// [西雅图 地标 建筑 ,   seattle   space needle ,   西雅图太空针 .   sky   tree .]
}

// cutPos prints part-of-speech tagged segments twice: first from plugin.Seg on
// text2, then from posSeg (bound to plugin.Seg) on text. Each result is also
// printed after TrimWithPos with the "zg" tag.
func cutPos() {
	// "西雅图地标建筑, Seattle Space Needle, 西雅图太空针. Sky tree."

	tagged := plugin.Seg.Pos(text2, true)
	fmt.Println("pos: ", tagged)
	// pos:  [{西雅图 nr} {地标 n} {建筑 n} {, x} {  x} {seattle x} {  x} {space needle n} {, x} {  x} {西雅图太空针 n} {. x} {  x} {sky x} {  x} {tree x} {. x}]

	tagged = plugin.Seg.TrimWithPos(tagged, "zg")
	fmt.Println("trim pos: ", tagged)
	// trim pos:  [{西雅图 nr} {地标 n} {建筑 n} {, x} {  x} {seattle x} {  x} {space needle n} {, x} {  x} {西雅图太空针 n} {. x} {  x} {sky x} {  x} {tree x} {. x}]

	// NOTE(review): posSeg is not declared in the visible part of this example;
	// presumably a package-level POS segmenter. Confirm it exists.
	posSeg.WithGse(plugin.Seg)
	hmmTagged := posSeg.Cut(text, true)
	fmt.Println("pos: ", hmmTagged)
	// pos:  [{《 x} {复仇 v} {者 k} {联盟 j} {3 x} {： x} {无限 v} {战争 n} {》 x} {是 v} {全片 n} {使用 v} {imax eng} {摄影 n} {机 n} {拍摄 v} {制作 vn} {的的 u} {科幻 n} {片 q} {. m}]

	hmmTagged = posSeg.TrimWithPos(hmmTagged, "zg")
	fmt.Println("trim pos: ", hmmTagged)
	// trim pos:  [{《 x} {复仇 v} {者 k} {联盟 j} {3 x} {： x} {无限 v} {战争 n} {》 x} {是 v} {全片 n} {使用 v} {imax eng} {摄影 n} {机 n} {拍摄 v} {制作 vn} {的的 u} {科幻 n} {片 q} {. m}]
}

// 使用最短路径和动态规划分词
func segCut() {
	segments := seg.Segment([]byte(*text1))
	fmt.Println(gse.ToString(segments, true))
	// 《/x 复仇/v 者/k 复仇者/n 联盟/j 3/x ：/x 无限/v 战争/n 》/x 是/v 全片/n 使用/v imax/x 摄影/n 机/n 摄影机/n 拍摄/v 制作/vn 的/uj 的/uj 科幻/n 片/q 科幻片/n ./x

	segs := seg.Segment([]byte(text2))
	// log.Println(gse.ToString(segs, false))
	log.Println(gse.ToString(segs))
	// 西雅图/nr 地标/n 建筑/n ,/x  /x seattle/x  /x space needle/n ,/x  /x 西雅图太空针/n ./x  /x sky/x  /x tree/x ./x

	// 搜索模式主要用于给搜索引擎提供尽可能多的关键字
	// segs := seg.ModeSegment(text2, true)
	log.Println("搜索模式: ", gse.ToString(segs, true))
	// 搜索模式:  西雅图/nr 地标/n 建筑/n ,/x  /x seattle/x  /x space needle/n ,/x  /x 西雅图太空针/n ./x  /x sky/x  /x tree/x ./x

	log.Println("to slice", gse.ToSlice(segs, true))
	// to slice [西雅图 地标 建筑 ,   seattle   space needle ,   西雅图太空针 .   sky   tree .]
}

// extAndRank prints keywords of the package-level text obtained two ways with
// the given segmenter: tag extraction via idf.TagExtracter (after loading the
// IDF data) and ranking via idf.TextRanker, asking for 5 results each.
func extAndRank(segs gse.Segmenter) {
	var extractor idf.TagExtracter
	extractor.WithGse(segs)
	// The load result is printed whether or not it is nil.
	fmt.Println("load idf: ", extractor.LoadIdf())

	tags := extractor.ExtractTags(text, 5)
	fmt.Println("segments: ", len(tags), tags)
	// segments:  5 [{科幻片 1.6002581704125} {全片 1.449761569875} {摄影机 1.2764747747375} {拍摄 0.9690261695075} {制作 0.8246043033375}]

	var ranker idf.TextRanker
	ranker.WithGse(segs)

	ranked := ranker.TextRank(text, 5)
	fmt.Println("results: ", ranked)
	// results:  [{机 1} {全片 0.9931964427972227} {摄影 0.984870660504368} {使用 0.9769826633059524} {是 0.8489363954683677}]
}