基于Tesseract

基于Tesseract_OCR识别

news2024/11/13 10:08:30

1、安装Tesseract
Mac版本，通过Homebrew进行安装即可

brew install tesseract

Windows版本安装

下载地址：https://digi.bib.uni-mannheim.de/tesseract/

2、更换语言包

下载语言包（例如简体中文 chi_sim.traineddata），并将其放入 Tesseract 安装目录下的 tessdata 文件夹

https://github.com/tesseract-ocr/tessdata（语言包 .traineddata 文件所在仓库；Tesseract 主项目地址：https://github.com/tesseract-ocr/tesseract）

亦可参照这篇 CSDN 博客：《Tesseract最新版语言包chi_sim.traineddata(4.0.0)GitHub官方获取免csdn积分，各个版本语言包全有》

3、程序

import os
from PIL import Image
import pytesseract
import re
import pandas as pd

# 如果你使用 Windows，指定 Tesseract 的安装路径
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# 1. 遍历文件夹中的所有图片并进行OCR识别
def process_images(folder_path, output_txt):
    """OCR every image in a folder, dump the text, then clean the dump.

    For each image file in ``folder_path`` the recognised text is written to
    ``output_txt`` as an ``Image: <filename>`` header, the text with all
    whitespace removed from every line, and a 40-character ``=`` separator.
    Afterwards ``clean_txt`` is run on the dump.

    Args:
        folder_path: Directory that contains the images to recognise.
        output_txt: Path of the raw OCR dump; overwritten on every call.

    Returns:
        Path of the cleaned text file: the same directory as ``output_txt``
        with ``cleaned_`` prefixed to the file name.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    # Characters that this script treats as a misrecognised yen sign.
    symbol_fixes = str.maketrans({'#': '¥', 'Y': '¥', '*': '¥'})
    whitespace = re.compile(r'\s')

    with open(output_txt, 'w', encoding='utf-8') as f:
        # Sort so the dump order is reproducible; os.listdir order is arbitrary.
        for filename in sorted(os.listdir(folder_path)):
            # Compare case-insensitively so "SCAN.PNG" is not silently skipped.
            if not filename.lower().endswith(image_suffixes):
                continue

            image_path = os.path.join(folder_path, filename)
            print(f"Processing image: {filename}")

            # The context manager releases the file handle after OCR.
            with Image.open(image_path) as img:
                text = pytesseract.image_to_string(img, lang='chi_sim')

            text = text.translate(symbol_fixes)

            # Drop every whitespace character inside each line, keep line breaks.
            processed_text = '\n'.join(
                whitespace.sub('', line) for line in text.splitlines()
            )

            f.write(f"Image: {filename}\n")
            f.write(processed_text)
            f.write("\n" + "=" * 40 + "\n")

    # Prefix only the file name, so "out/raw.txt" becomes "out/cleaned_raw.txt"
    # rather than the non-existent "cleaned_out/raw.txt".
    out_dir, out_name = os.path.split(output_txt)
    cleaned_txt_file = os.path.join(out_dir, 'cleaned_' + out_name)
    clean_txt(output_txt, cleaned_txt_file)
    print(f"Cleaned text saved to {cleaned_txt_file}")
    return cleaned_txt_file


# 2. 清理txt文件中的无关内容，只保留指定信息
def clean_txt(txt_file, cleaned_txt_file):
    """Copy only the known voucher fields from an OCR dump to a new file.

    A line is kept when it contains one of the field names in ``headers``.
    The "套餐内容" (package content) value may span several lines: from its
    header line onward, following lines are concatenated until another field
    line, a record boundary, or the end of the file, and the result is
    written as a single line. Every other line is dropped.

    Record boundaries are lines that start with ``Image:`` or consist only of
    ``=`` characters. They close a pending package-content value and are
    never written, so one image's text cannot leak into the next record.

    Args:
        txt_file: Path of the UTF-8 OCR dump to read.
        cleaned_txt_file: Path of the UTF-8 file to write; overwritten.
    """
    headers = ["券码", "券类型", "套餐内容", "验证时间", "消费金额", "消费明细", "订单号", "验券账号"]
    with open(txt_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    with open(cleaned_txt_file, 'w', encoding='utf-8') as f:
        buffer = []  # pieces of the package-content value being assembled
        is_in_taocan = False  # True while inside a package-content value

        for raw_line in lines:
            line = raw_line.strip()

            # Per-image header or separator line: end of the current record.
            is_boundary = line.startswith("Image:") or (
                line != '' and set(line) == {'='}
            )
            if is_boundary:
                if is_in_taocan:
                    f.write(''.join(buffer) + '\n')
                    buffer = []
                    is_in_taocan = False
                continue

            if any(header in line for header in headers):
                if "套餐内容" in line:
                    # Start (or keep extending) the multi-line value.
                    buffer.append(line)
                    is_in_taocan = True
                else:
                    if is_in_taocan:
                        # Another field begins: emit the assembled value first.
                        f.write(''.join(buffer) + '\n')
                        buffer = []
                        is_in_taocan = False
                    f.write(line + '\n')
            elif is_in_taocan:
                # Continuation line of the package-content value.
                buffer.append(line)
            # Any other line carries no wanted field and is skipped.

        if buffer:
            # The file ended while a package-content value was still open.
            f.write(''.join(buffer) + '\n')

# Driver: OCR the image folder, clean the dump, then export the fields to Excel.
image_folder = '/pytorchPeoject/testDemo1/img/'  # folder holding the input images
output_txt_file = 'output_02.txt'  # intermediate file for the raw OCR text

# Runs OCR and cleaning; the return value is the path of the cleaned text file.
cleaned_txt_file = process_images(image_folder, output_txt_file)

# Read the cleaned text as one string so the regex can span several lines.
with open(cleaned_txt_file, 'r', encoding='utf-8') as src:
    data = src.read()

# One match per voucher record; date and time are captured as separate groups.
pattern = re.compile(r'券码:(\d+)\s+券类型:(.+?)\s+套餐内容:(.+?)\s+验证时间:(\d{4}-\d{2}-\d{2})(\d{2}:\d{2}:\d{2})\s+消费金额:(.+?)\s+消费明细:(.+?)\s+订单号:(\d+)\s+验券账号:(\w+)')
records = pattern.findall(data)

# Column order mirrors the order of the capture groups above.
column_names = ['券码', '券类型', '套餐内容', '验证时间', '时间', '消费金额', '消费明细', '订单号', '验券账号']
df = pd.DataFrame(records, columns=column_names)

# Join date and time back into one column, then discard the helper column.
df['验证时间'] = df['验证时间'] + ' ' + df['时间']
df = df.drop(columns='时间')

# Write the result without the DataFrame index.
df.to_excel('output.xlsx', index=False)

print("数据提取并保存到Excel文件成功！")

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.coloradmin.cn/o/2148883.html

如若内容造成侵权/违法违规/事实不符，请联系多彩编程网进行投诉反馈，一经查实，立即删除！