一、相关的地址
https://github.com/tesseract-ocr/tessdata
Releases - OpenCV:https://opencv.org/releases/
opencv要装好,我装的是4.5.3的,最新版的没试过。
tessdata就下载了需要用的。好像还有best和fast的版本,我试了一下报错,不知道是不是版本不支持的问题。
二、主要的思路
识别的代码倒没什么特别的,就是在每一行的识别上,为了提高准确度,稍微花了点心思,但也不是很完善。
发现识别的时候会出现很多干扰,所以从“姓”这一行开始算第一行,然后一行一行分析。
如果图片不清晰,不是太准,有待改进。
三、代码
pom.xml的依赖:
<!-- OpenCV (the author reports testing against OpenCV 4.5.3 only) -->
<dependency>
<groupId>org.openpnp</groupId>
<artifactId>opencv</artifactId>
<version>4.5.3-4</version>
</dependency>
<!-- Tesseract OCR, accessed through Tess4J -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>5.11.0</version> <!-- author's note: use the latest version -->
</dependency>
JAVA代码:
package com.yutiandada.idcardread.demos.test;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.Size;
import org.opencv.imgcodecs.Imgcodecs;
import org.opencv.imgproc.Imgproc;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.List;
import net.sourceforge.tess4j.*;
/**
 * Demo that sharpens an ID card image with OpenCV, runs Tesseract OCR on it
 * and prints the fields it can pick out of the recognized lines.
 *
 * <p>Field extraction is positional: the first recognized line containing
 * "姓" or "名" is treated as line 0, followed by gender/ethnicity, birth date,
 * two address lines and the ID number. Lines that OCR did not produce are
 * treated as empty instead of aborting the run.
 */
public class IDCardRecognition2 {

    /** Directory that is searched first for the OpenCV native library. */
    private static final String OPENCV_NATIVE_DIR = "D:\\opencv\\opencv\\build\\java\\x64";

    /** Directory holding the Tesseract traineddata files. */
    private static final String TESSDATA_PATH = "d:/tessdata";

    /** Tesseract language code (Simplified Chinese). */
    private static final String OCR_LANGUAGE = "chi_sim";

    /** Image used when no path is given on the command line. */
    private static final String DEFAULT_IMAGE_PATH = "D:/images/test01.png";

    static {
        loadOpenCvNativeLibrary();
    }

    /**
     * Loads the OpenCV native library.
     *
     * <p>The library is loaded by absolute path from {@link #OPENCV_NATIVE_DIR}
     * when it exists there; otherwise the regular {@code java.library.path}
     * lookup is used. This avoids overwriting {@code java.library.path} and
     * resetting the private {@code ClassLoader.sys_paths} field by reflection,
     * which is not possible on newer JDKs.
     */
    private static void loadOpenCvNativeLibrary() {
        File nativeLibrary =
                new File(OPENCV_NATIVE_DIR, System.mapLibraryName(Core.NATIVE_LIBRARY_NAME));
        if (nativeLibrary.isFile()) {
            System.load(nativeLibrary.getAbsolutePath());
        } else {
            System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
        }
    }

    /**
     * Reads an image and sharpens it with an unsharp mask
     * ({@code 1.5 * image - 0.5 * gaussianBlur(image)}).
     *
     * @param imagePath path of the image file to read
     * @return the sharpened image; the caller owns the returned {@code Mat}
     * @throws IllegalArgumentException if the image cannot be read
     */
    public static Mat preprocessImage(String imagePath) {
        Mat image = Imgcodecs.imread(imagePath);
        // imread signals failure with an empty Mat rather than an exception.
        if (image.empty()) {
            throw new IllegalArgumentException("Cannot read image: " + imagePath);
        }
        Mat blurred = new Mat();
        Mat sharpened = new Mat();
        try {
            // Size(0, 0) lets OpenCV derive the kernel size from sigma = 3.
            Imgproc.GaussianBlur(image, blurred, new Size(0, 0), 3);
            Core.addWeighted(image, 1.5, blurred, -0.5, 0, sharpened);
            return sharpened;
        } finally {
            blurred.release();
            image.release();
        }
    }

    /**
     * Runs Tesseract OCR on the given image.
     *
     * <p>The image is written to a temporary PNG file, which is removed again
     * whether or not recognition succeeds. The page segmentation mode is not
     * set here, so the Tess4J default applies.
     *
     * @param mat image to recognize
     * @return the recognized lines, trimmed, with blank lines removed
     * @throws TesseractException if the temporary file cannot be created or
     *     written, or if Tesseract itself fails
     */
    public static List<String> recognizeText(Mat mat) throws TesseractException {
        File tempFile = null;
        try {
            tempFile = File.createTempFile("temp", ".png");
            if (!Imgcodecs.imwrite(tempFile.getAbsolutePath(), mat)) {
                throw new TesseractException("Could not write image to " + tempFile);
            }

            Tesseract tesseract = new Tesseract();
            tesseract.setLanguage(OCR_LANGUAGE);
            tesseract.setDatapath(TESSDATA_PATH);

            String result = tesseract.doOCR(tempFile);

            List<String> lines = new ArrayList<>();
            if (result == null) {
                return lines;
            }
            for (String rawLine : result.split("\\r?\\n")) {
                String trimmed = rawLine.trim();
                if (!trimmed.isEmpty()) {
                    lines.add(trimmed);
                }
            }
            return lines;
        } catch (IOException e) {
            throw new TesseractException("Error during OCR processing", e);
        } finally {
            // Clean up even when OCR throws; fall back to deletion at JVM exit.
            if (tempFile != null && !tempFile.delete()) {
                tempFile.deleteOnExit();
            }
        }
    }

    /**
     * Recognizes an ID card image and prints the extracted fields.
     *
     * @param args optional; {@code args[0]} is the image path, otherwise
     *     {@code D:/images/test01.png} is used
     */
    public static void main(String[] args) {
        try {
            String imagePath = (args != null && args.length > 0) ? args[0] : DEFAULT_IMAGE_PATH;
            Mat processedImage = preprocessImage(imagePath);
            List<String> recognizedLines;
            try {
                recognizedLines = recognizeText(processedImage);
            } finally {
                processedImage.release();
            }
            System.out.println("Recognized Text:");

            // Text above the name line is treated as noise and skipped.
            int startIndex = findNameLineIndex(recognizedLines);
            if (startIndex == -1) {
                System.out.println("No line containing '姓' found.");
                return;
            }
            List<String> cardLines = recognizedLines.subList(startIndex, recognizedLines.size());

            // Line 0: name.
            System.out.println("Name: " + extractName(lineAt(cardLines, 0)));

            // Line 1: gender and ethnicity.
            String secondLine = lineAt(cardLines, 1);
            System.out.println("Gender: " + extractGender(secondLine));
            System.out.println("Ethnicity: " + extractEthnicity(secondLine));

            // Line 2: birth date, digits only (yyyyMMdd).
            printBirthDate(lineAt(cardLines, 2));

            // Lines 3-4: address, wrapped over two lines.
            System.out.println("Address: " + extractAddress(lineAt(cardLines, 3), lineAt(cardLines, 4)));

            // Line 5: ID number, letters and digits only.
            String idNumber = lineAt(cardLines, 5).replaceAll("[^a-zA-Z0-9]", "").trim();
            System.out.println("ID Number: " + idNumber);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Returns the index of the first line containing "姓" or "名", or -1 if there is none. */
    private static int findNameLineIndex(List<String> lines) {
        for (int i = 0; i < lines.size(); i++) {
            String line = lines.get(i);
            if (line.contains("姓") || line.contains("名")) {
                return i;
            }
        }
        return -1;
    }

    /** Returns the line at {@code index}, or an empty string when OCR produced fewer lines. */
    private static String lineAt(List<String> lines, int index) {
        return index < lines.size() ? lines.get(index) : "";
    }

    /**
     * Strips everything up to and including the name label. The label may be
     * split by whitespace or partly lost by OCR, so "姓名", "姓 名", "姓" and "名"
     * are all accepted; the earliest match wins.
     */
    private static String extractName(String line) {
        return line.replaceFirst("^.*?(?:姓\\s*名|姓|名)", "").trim();
    }

    /** Returns "男" or "女" if the line contains it, otherwise an empty string. */
    private static String extractGender(String line) {
        if (line.contains("男")) {
            return "男";
        }
        if (line.contains("女")) {
            return "女";
        }
        return "";
    }

    /**
     * Returns the text after the two-character label that starts with "民",
     * or an empty string when "民" is absent or nothing follows the label.
     */
    private static String extractEthnicity(String line) {
        int labelIndex = line.indexOf("民");
        if (labelIndex < 0) {
            return "";
        }
        int valueStart = labelIndex + 2;
        return valueStart < line.length() ? line.substring(valueStart).trim() : "";
    }

    /** Prints year, month and day when the line holds exactly eight digits. */
    private static void printBirthDate(String line) {
        String digits = line.replaceAll("[^0-9]", "").trim();
        if (digits.length() == 8) {
            System.out.println("Birth Year: " + digits.substring(0, 4));
            System.out.println("Birth Month: " + digits.substring(4, 6));
            System.out.println("Birth Day: " + digits.substring(6, 8));
        } else {
            System.out.println("Invalid date format in the third line.");
        }
    }

    /**
     * Joins the two address lines after removing all whitespace and dropping
     * the first two characters of the first line (the address label).
     */
    private static String extractAddress(String firstLine, String secondLine) {
        String firstPart = firstLine.replaceAll("\\s+", "");
        firstPart = firstPart.length() >= 2 ? firstPart.substring(2).trim() : "";
        String secondPart = secondLine.replaceAll("\\s+", "").trim();
        return firstPart + secondPart;
    }
}
在 IDEA 里面需要配置一下:
把本机安装好的 OpenCV 添加到项目中。
四、效果
网上随便找的图
识别的效果