本笔记内容为黑马头条项目的图片识别文字审核敏感词部分
目录
一、需求分析
二、图片文字识别
三、Tess4j案例
四、管理敏感词和图片文字识别集成到文章审核
一、需求分析
产品经理召集开会,文章审核功能已经交付了,文章也能正常发布审核。对于上次提出的自管理敏感词也很满意,这次会议核心的内容如下:
-
文章中包含的图片要识别文字,过滤掉图片文字的敏感词
二、图片文字识别
什么是OCR?
OCR (Optical Character Recognition,光学字符识别)是指电子设备(例如扫描仪或数码相机)检查纸上打印的字符,通过检测暗、亮的模式确定其形状,然后用字符识别方法将形状翻译成计算机文字的过程
方案 | 说明 |
---|---|
百度OCR | 收费 |
Tesseract-OCR | Google维护的开源OCR引擎,支持Java,Python等语言调用 |
Tess4J | 封装了Tesseract-OCR ,支持Java调用 |
三、Tess4j案例
①:创建项目导入tess4j对应的依赖
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>4.1.1</version>
</dependency>
②:导入中文字体库, 把资料中的tessdata文件夹拷贝到自己的工作空间下
③:编写测试类进行测试
package com.heima.tess4j;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import java.io.File;
public class Application {
public static void main(String[] args) {
try {
//获取本地图片
File file = new File("D:\\26.png");
//创建Tesseract对象
ITesseract tesseract = new Tesseract();
//设置字体库路径
tesseract.setDatapath("D:\\workspace\\tessdata");
//中文识别
tesseract.setLanguage("chi_sim");
//执行ocr识别
String result = tesseract.doOCR(file);
//替换回车和tal键 使结果为一行
result = result.replaceAll("\\r|\\n","-").replaceAll(" ","");
System.out.println("识别的结果为:"+result);
} catch (Exception e) {
e.printStackTrace();
}
}
}
四、管理敏感词和图片文字识别集成到文章审核
①:在heima-leadnews-common中创建工具类,简单封装一下tess4j
需要先导入pom
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>4.1.1</version>
</dependency>
工具类
package com.heima.common.tess4j;
import lombok.Getter;
import lombok.Setter;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;
import java.awt.image.BufferedImage;
@Getter
@Setter
@Component
@ConfigurationProperties(prefix = "tess4j")
public class Tess4jClient {
private String dataPath;
private String language;
public String doOCR(BufferedImage image) throws TesseractException {
//创建Tesseract对象
ITesseract tesseract = new Tesseract();
//设置字体库路径
tesseract.setDatapath(dataPath);
//中文识别
tesseract.setLanguage(language);
//执行ocr识别
String result = tesseract.doOCR(image);
//替换回车和tal键 使结果为一行
result = result.replaceAll("\\r|\\n", "-").replaceAll(" ", "");
return result;
}
}
在spring.factories配置中添加该类,完整如下:
org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
com.heima.common.exception.ExceptionCatch,\
com.heima.common.swagger.SwaggerConfiguration,\
com.heima.common.swagger.Swagger2Configuration,\
com.heima.common.aliyun.GreenTextScan,\
com.heima.common.aliyun.GreenImageScan,\
com.heima.common.tess4j.Tess4jClient
②:在heima-leadnews-wemedia中的配置中添加两个属性(字图库的位置和前缀名)
tess4j:
data-path: D:\workspace\tessdata
language: chi_sim
③:在WmNewsAutoScanServiceImpl中的handleImageScan方法上添加如下代码
try {
for (String image : images) {
byte[] bytes = fileStorageService.downLoadFile(image);
//图片识别文字审核---begin-----
//从byte[]转换为butteredImage
ByteArrayInputStream in = new ByteArrayInputStream(bytes);
BufferedImage imageFile = ImageIO.read(in);
//识别图片的文字
String result = tess4jClient.doOCR(imageFile);
//审核是否包含自管理的敏感词
boolean isSensitive = handleSensitiveScan(result, wmNews);
if(!isSensitive){
return isSensitive;
}
//图片识别文字审核---end-----
imageList.add(bytes);
}
}catch (Exception e){
e.printStackTrace();
}
最后附上文章审核的完整代码如下:
package com.heima.wemedia.service.impl;
import com.alibaba.fastjson.JSONArray;
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
import com.heima.apis.article.IArticleClient;
import com.heima.common.aliyun.GreenImageScan;
import com.heima.common.aliyun.GreenTextScan;
import com.heima.common.tess4j.Tess4jClient;
import com.heima.file.service.FileStorageService;
import com.heima.model.article.dtos.ArticleDto;
import com.heima.model.common.dtos.ResponseResult;
import com.heima.model.wemedia.pojos.WmChannel;
import com.heima.model.wemedia.pojos.WmNews;
import com.heima.model.wemedia.pojos.WmSensitive;
import com.heima.model.wemedia.pojos.WmUser;
import com.heima.utils.common.SensitiveWordUtil;
import com.heima.wemedia.mapper.WmChannelMapper;
import com.heima.wemedia.mapper.WmNewsMapper;
import com.heima.wemedia.mapper.WmSensitiveMapper;
import com.heima.wemedia.mapper.WmUserMapper;
import com.heima.wemedia.service.WmNewsAutoScanService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.BeanUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.util.*;
import java.util.stream.Collectors;
@Service
@Slf4j
@Transactional
public class WmNewsAutoScanServiceImpl implements WmNewsAutoScanService {
@Autowired
private WmNewsMapper wmNewsMapper;
/**
* 自媒体文章审核
*
* @param id 自媒体文章id
*/
@Override
@Async //标明当前方法是一个异步方法
public void autoScanWmNews(Integer id) {
// int a = 1/0;
//1.查询自媒体文章
WmNews wmNews = wmNewsMapper.selectById(id);
if (wmNews == null) {
throw new RuntimeException("WmNewsAutoScanServiceImpl-文章不存在");
}
if (wmNews.getStatus().equals(WmNews.Status.SUBMIT.getCode())) {
//从内容中提取纯文本内容和图片
Map<String, Object> textAndImages = handleTextAndImages(wmNews);
//自管理的敏感词过滤
boolean isSensitive = handleSensitiveScan((String) textAndImages.get("content"), wmNews);
if(!isSensitive) return;
//2.审核文本内容 阿里云接口
boolean isTextScan = handleTextScan((String) textAndImages.get("content"), wmNews);
if (!isTextScan) return;
//3.审核图片 阿里云接口
boolean isImageScan = handleImageScan((List<String>) textAndImages.get("images"), wmNews);
if (!isImageScan) return;
//4.审核成功,保存app端的相关的文章数据
ResponseResult responseResult = saveAppArticle(wmNews);
if (!responseResult.getCode().equals(200)) {
throw new RuntimeException("WmNewsAutoScanServiceImpl-文章审核,保存app端相关文章数据失败");
}
//回填article_id
wmNews.setArticleId((Long) responseResult.getData());
updateWmNews(wmNews, (short) 9, "审核成功");
}
}
@Autowired
private WmSensitiveMapper wmSensitiveMapper;
/**
* 自管理的敏感词审核
* @param content
* @param wmNews
* @return
*/
private boolean handleSensitiveScan(String content, WmNews wmNews) {
boolean flag = true;
//获取所有的敏感词
List<WmSensitive> wmSensitives = wmSensitiveMapper.selectList(Wrappers.<WmSensitive>lambdaQuery().select(WmSensitive::getSensitives));
List<String> sensitiveList = wmSensitives.stream().map(WmSensitive::getSensitives).collect(Collectors.toList());
//初始化敏感词库
SensitiveWordUtil.initMap(sensitiveList);
//查看文章中是否包含敏感词
Map<String, Integer> map = SensitiveWordUtil.matchWords(content);
if(map.size() >0){
updateWmNews(wmNews,(short) 2,"当前文章中存在违规内容"+map);
flag = false;
}
return flag;
}
@Autowired
private IArticleClient articleClient;
@Autowired
private WmChannelMapper wmChannelMapper;
@Autowired
private WmUserMapper wmUserMapper;
/**
* 保存app端相关的文章数据
*
* @param wmNews
*/
private ResponseResult saveAppArticle(WmNews wmNews) {
ArticleDto dto = new ArticleDto();
//属性的拷贝
BeanUtils.copyProperties(wmNews, dto);
//文章的布局
dto.setLayout(wmNews.getType());
//频道
WmChannel wmChannel = wmChannelMapper.selectById(wmNews.getChannelId());
if (wmChannel != null) {
dto.setChannelName(wmChannel.getName());
}
//作者
dto.setAuthorId(wmNews.getUserId().longValue());
WmUser wmUser = wmUserMapper.selectById(wmNews.getUserId());
if (wmUser != null) {
dto.setAuthorName(wmUser.getName());
}
//设置文章id
if (wmNews.getArticleId() != null) {
dto.setId(wmNews.getArticleId());
}
dto.setCreatedTime(new Date());
ResponseResult responseResult = articleClient.saveArticle(dto);
return responseResult;
}
@Autowired
private FileStorageService fileStorageService;
@Autowired
private GreenImageScan greenImageScan;
@Autowired
private Tess4jClient tess4jClient;
/**
* 审核图片
*
* @param images
* @param wmNews
* @return
*/
private boolean handleImageScan(List<String> images, WmNews wmNews) {
boolean flag = true;
if (images == null || images.size() == 0) {
return flag;
}
//下载图片 minIO
//图片去重
images = images.stream().distinct().collect(Collectors.toList());
List<byte[]> imageList = new ArrayList<>();
try {
for (String image : images) {
byte[] bytes = fileStorageService.downLoadFile(image);
//图片识别文字审核---begin-----
//从byte[]转换为butteredImage
ByteArrayInputStream in = new ByteArrayInputStream(bytes);
BufferedImage imageFile = ImageIO.read(in);
//识别图片的文字
String result = tess4jClient.doOCR(imageFile);
//审核是否包含自管理的敏感词
boolean isSensitive = handleSensitiveScan(result, wmNews);
if(!isSensitive){
return isSensitive;
}
//图片识别文字审核---end-----
imageList.add(bytes);
}
}catch (Exception e){
e.printStackTrace();
}
//审核图片
try {
Map map = greenImageScan.imageScan(imageList);
if (map != null) {
//审核失败
if (map.get("suggestion").equals("block")) {
flag = false;
updateWmNews(wmNews, (short) 2, "当前文章中存在违规内容");
}
//不确定信息 需要人工审核
if (map.get("suggestion").equals("review")) {
flag = false;
updateWmNews(wmNews, (short) 3, "当前文章中存在不确定内容");
}
}
} catch (Exception e) {
flag = false;
e.printStackTrace();
}
return flag;
}
@Autowired
private GreenTextScan greenTextScan;
/**
* 审核纯文本内容
*
* @param content
* @param wmNews
* @return
*/
private boolean handleTextScan(String content, WmNews wmNews) {
boolean flag = true;
if ((wmNews.getTitle() + "-" + content).length() == 0) {
return flag;
}
try {
Map map = greenTextScan.greeTextScan((wmNews.getTitle() + "-" + content));
if (map != null) {
//审核失败
if (map.get("suggestion").equals("block")) {
flag = false;
updateWmNews(wmNews, (short) 2, "当前文章中存在违规内容");
}
//不确定信息 需要人工审核
if (map.get("suggestion").equals("review")) {
flag = false;
updateWmNews(wmNews, (short) 3, "当前文章中存在不确定内容");
}
}
} catch (Exception e) {
flag = false;
e.printStackTrace();
}
return flag;
}
/**
* 修改文章内容
*
* @param wmNews
* @param status
* @param reason
*/
private void updateWmNews(WmNews wmNews, short status, String reason) {
wmNews.setStatus(status);
wmNews.setReason(reason);
wmNewsMapper.updateById(wmNews);
}
/**
* 1。从自媒体文章的内容中提取文本和图片
* 2.提取文章的封面图片
*
* @param wmNews
* @return
*/
private Map<String, Object> handleTextAndImages(WmNews wmNews) {
//存储纯文本内容
StringBuilder stringBuilder = new StringBuilder();
List<String> images = new ArrayList<>();
//1。从自媒体文章的内容中提取文本和图片
if (StringUtils.isNotBlank(wmNews.getContent())) {
List<Map> maps = JSONArray.parseArray(wmNews.getContent(), Map.class);
for (Map map : maps) {
if (map.get("type").equals("text")) {
stringBuilder.append(map.get("value"));
}
if (map.get("type").equals("image")) {
images.add((String) map.get("value"));
}
}
}
//2.提取文章的封面图片
if (StringUtils.isNotBlank(wmNews.getImages())) {
String[] split = wmNews.getImages().split(",");
images.addAll(Arrays.asList(split));
}
Map<String, Object> resultMap = new HashMap<>();
resultMap.put("content", stringBuilder.toString());
resultMap.put("images", images);
return resultMap;
}
}