Python OCR 识别验证码

news2025/1/4 19:26:05

1. 抓取网页验证码图像并保存

from urllib.parse import urljoin

import lxml.html
import urllib3

# Fetch the login page with urllib3.
# NOTE: `site` must hold the URL of the page that displays the captcha;
# define it before running this snippet.
http = urllib3.PoolManager()
html = http.request('GET', site).data

# Parse the page with lxml, which repairs incomplete or malformed markup.
tree = lxml.html.fromstring(html)
# Pretty-printed copy of the repaired markup; useful for inspecting the page
# when working out the selector below, not used further in this snippet.
fixedhtml = lxml.html.tostring(tree, pretty_print=True)

# Locate the captcha <img> element with a CSS selector.
# cssselect() returns a list of matching elements:
#   [0]         takes the first match,
#   .get('src') reads the address of the image resource.
# Adjust the selector to the structure of the target page.
img = tree.cssselect("#login-form > div:nth-child(4) > img")[0].get('src')

# The src attribute is frequently relative (e.g. "/captcha.jpg") or
# protocol-relative ("//host/captcha.jpg"); resolve it against the page URL
# so the request below gets a full address. Absolute URLs pass through
# unchanged.
img = urljoin(site, img)

# Download the raw image bytes from the resolved address.
pic = http.request('GET', img).data

# Write the image to disk in binary mode.
fn = 'C:/a.jpg'  # destination path of the saved captcha image
with open(fn, 'wb') as fp:
    fp.write(pic)

2. 验证码图像识别

使用 pytesseract 进行图像识别

from PIL import Image
from pytesseract import pytesseract as pt

# Load the captcha image from disk.
# NOTE: `img_file_path` must be set to the path of the saved captcha image.
img = Image.open(img_file_path)

# Run Tesseract OCR over the image and show the text it extracted.
strocr = pt.image_to_string(image=img)
print('Img Word: ', strocr)

将图像转换为灰度模式以增强对比度，提高图像识别率

from PIL import Image
from pytesseract import pytesseract as pt

# Open the original captcha image.
# NOTE: `img_file_path` and `save_img_path` must be set beforehand.
img = Image.open(img_file_path)

# Convert the image to mode 'L' (grayscale); see help(img.convert) for the
# other available modes.
img = img.convert('L')

# Optional: remap the pixel values one by one to boost contrast. point()
# returns a new image, so the result has to be assigned; tune the threshold
# for the captcha at hand.
# img = img.point(lambda x: 0 if x < 1 else 255, 'L')

# Save the converted image.
img.save(save_img_path)

# Re-open the *converted* file (not the original) so that OCR actually runs
# on the grayscale version that was just written.
img = Image.open(save_img_path)
strocr = pt.image_to_string(img)
print('Img Word: ', strocr)

使用 easyocr 进行图像识别

import easyocr

# Build the OCR reader for Simplified Chinese and English.
# Models are looked up in ./model; with download_enabled=False they are
# presumably not fetched automatically, so they should already be there.
reader = easyocr.Reader(
    ['ch_sim', 'en'],
    gpu=True,
    model_storage_directory='./model',
    verbose=True,
    download_enabled=False,
)

# Recognize the text in the image file and print the raw result
# (a list of (bounding box, text, confidence) entries).
# NOTE: `img_file_path` must be set to the path of the captcha image.
strocr = reader.readtext(img_file_path)
print(strocr)

3. 验证码识别效果

有干扰线/点的验证码图像识别率低，多数识别为空
>>> strocr
''
easyocr 识别效果似乎比 pytesseract 要好一些

>> pytesseract:

>> pytesseract L:

>> easyocr:
CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.
[([[0, 1], [133, 1], [133, 38], [0, 38]], 'KODAU', 0.9993924878037527)]