开源通用验证码识别OCR —— DdddOcr 源码赏析(二)

文章目录

前言
DdddOcr
分类识别
- 调用识别功能
- classification 函数源码
- classification 函数源码解读
  - 1. 分类功能不支持目标检测
  - 2. 转换为Image对象
  - 3. 根据模型配置调整图片尺寸和色彩模式
  - 4. 图像数据转换为浮点数据并归一化
  - 5. 图像数据预处理
  - 6. 运行模型，返回预测结果
总结

前言

DdddOcr 源码赏析
上文我们读到了分类识别部分的源码，这里我们继续往下进行
在这里插入图片描述

DdddOcr

DdddOcr是开源的通用验证码识别OCR
官方传送门

分类识别

调用识别功能

# Read the captcha image as raw bytes; the context manager closes the file
# handle as soon as the read finishes instead of leaving it to the GC.
with open("example.jpg", "rb") as f:
    image = f.read()
# classification() accepts the encoded image bytes directly.
result = ocr.classification(image)
print(result)

classification 函数源码

def classification(self, img, png_fix: bool = False, probability=False):
    """Recognise the characters in an image with the loaded classification model.

    Args:
        img: The input image. Accepted forms are raw encoded ``bytes``
            (opened through ``io.BytesIO``), a ``str`` (handed to
            ``base64_to_image``), a ``pathlib.PurePath`` (opened as a file)
            or a PIL ``Image.Image`` (copied, so the caller's image is not
            modified).
        png_fix: Only read when an imported model expects more than one
            channel. When true, the image is passed through
            ``png_rgba_black_preprocess`` instead of ``convert('RGB')``.
        probability: Only read for the built-in model in non-word mode.
            When true, per-step probabilities are returned instead of the
            decoded string.

    Returns:
        The decoded string, or, when ``probability`` applies, a dict with
        the keys ``'charsets'`` and ``'probability'``. Characters of
        ``self.__charset_range`` that are missing from ``self.__charset``
        get the placeholder probability ``-1``.

    Raises:
        TypeError: If the instance is in detection mode (``self.det``) or
            ``img`` is not one of the accepted types.
    """
    if self.det:
        raise TypeError("当前识别类型为目标检测")
    if not isinstance(img, (bytes, str, pathlib.PurePath, Image.Image)):
        raise TypeError("未知图片类型")

    if isinstance(img, bytes):
        image = Image.open(io.BytesIO(img))
    elif isinstance(img, Image.Image):
        image = img.copy()
    elif isinstance(img, str):
        image = base64_to_image(img)
    else:
        # Only pathlib.PurePath can reach this point after the check above.
        image = Image.open(img)

    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name and also exists in older releases.
    resample = Image.LANCZOS

    if not self.use_import_onnx:
        # Built-in model: height 64, width scaled to keep the aspect ratio,
        # grayscale input.
        scaled_width = int(image.size[0] * (64 / image.size[1]))
        image = image.resize((scaled_width, 64), resample).convert('L')
    else:
        if self.__resize[0] == -1:
            if self.__word:
                # Word mode with a free width uses a square input.
                target_size = (self.__resize[1], self.__resize[1])
            else:
                # Free width: keep the aspect ratio at the configured height.
                target_size = (
                    int(image.size[0] * (self.__resize[1] / image.size[1])),
                    self.__resize[1],
                )
        else:
            target_size = (self.__resize[0], self.__resize[1])
        image = image.resize(target_size, resample)

        if self.__channel == 1:
            image = image.convert('L')
        elif png_fix:
            image = png_rgba_black_preprocess(image)
        else:
            image = image.convert('RGB')

    # Scale pixel values to [0, 1] and prepend one axis.
    image = np.array(image).astype(np.float32)
    image = np.expand_dims(image, axis=0) / 255.
    if not self.use_import_onnx:
        image = (image - 0.5) / 0.5
    elif self.__channel == 1:
        image = (image - 0.456) / 0.224
    else:
        image = (image - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225])
        # Drop the axis added above, then reorder the axes (2, 0, 1),
        # presumably HWC -> CHW.
        image = image[0]
        image = image.transpose((2, 0, 1))

    ort_inputs = {'input1': np.array([image]).astype(np.float32)}
    ort_outs = self.__ort_session.run(None, ort_inputs)

    if self.__word:
        # The original collected these characters but fell off the end of
        # the function, so word mode always returned None.
        return ''.join(self.__charset[item] for item in ort_outs[1])

    if not self.use_import_onnx and probability:
        # Probability output is only available for the built-in model.
        logits = ort_outs[0]
        # Softmax over the character axis; subtracting the per-step maximum
        # keeps np.exp from overflowing on large logits.
        exps = np.exp(logits - np.max(logits, axis=2, keepdims=True))
        step_probabilities = exps / np.sum(exps, axis=2, keepdims=True)
        step_probabilities = np.squeeze(step_probabilities).tolist()

        if len(self.__charset_range) == 0:
            # No restriction configured: return every character.
            return {'charsets': self.__charset, 'probability': step_probabilities}

        # -1 marks characters that are not part of the model's charset.
        charset_indices = [
            self.__charset.index(item) if item in self.__charset else -1
            for item in self.__charset_range
        ]
        return {
            'charsets': self.__charset_range,
            'probability': [
                [row[i] if i != -1 else -1 for i in charset_indices]
                for row in step_probabilities
            ],
        }

    if not self.use_import_onnx:
        # atleast_1d keeps a single-step output iterable after squeeze.
        indices = np.atleast_1d(np.squeeze(np.argmax(ort_outs[0], axis=2)))
    else:
        indices = ort_outs[0][0]

    # Greedy decode: collapse consecutive repeats and drop index 0, which is
    # treated as the blank symbol.
    result = []
    last_item = 0
    for item in indices:
        if item == last_item:
            continue
        last_item = item
        if item != 0:
            result.append(self.__charset[item])
    return ''.join(result)

classification 函数源码解读

1. 分类功能不支持目标检测

# Excerpt: an instance in detection mode (self.det truthy) rejects
# classification calls before any image handling happens.
if self.det:
	raise TypeError("当前识别类型为目标检测")

2. 转换为Image对象

 if not isinstance(img, (bytes, str, pathlib.PurePath, Image.Image)):
            raise TypeError("未知图片类型")
        if isinstance(img, bytes):
            image = Image.open(io.BytesIO(img))
        elif isinstance(img, Image.Image):
            image = img.copy()
        elif isinstance(img, str):
            image = base64_to_image(img)
        else:
            assert isinstance(img, pathlib.PurePath)
            image = Image.open(img)

3. 根据模型配置调整图片尺寸和色彩模式

 if not self.use_import_onnx:
            image = image.resize((int(image.size[0] * (64 / image.size[1])), 64), Image.ANTIALIAS).convert('L')
        else:
            if self.__resize[0] == -1:
                if self.__word:
                    image = image.resize((self.__resize[1], self.__resize[1]), Image.ANTIALIAS)
                else:
                    image = image.resize((int(image.size[0] * (self.__resize[1] / image.size[1])), self.__resize[1]),
                                         Image.ANTIALIAS)
            else:
                image = image.resize((self.__resize[0], self.__resize[1]), Image.ANTIALIAS)
            if self.__channel == 1:
                image = image.convert('L')
            else:
                if png_fix:
                    image = png_rgba_black_preprocess(image)
                else:
                    image = image.convert('RGB')

如果使用ddddocr自带的官方模型，则将图像高度调整为64并保持原来的宽高比，同时将图片转为灰度图
如果使用自己传入的模型，则根据从charsets_path读取的charset info调整图片尺寸，之后根据模型配置的通道数转换为灰度图片或RGB模式的图片；其中png_fix为True时调用的png_rgba_black_preprocess会把带透明通道的图片贴到一张白色背景上，得到的同样是RGB模式的图片

def png_rgba_black_preprocess(img: Image.Image):
    """Flatten *img* onto an opaque white RGB canvas of the same size.

    The image is converted to RGBA first so that its alpha band can serve
    as the paste mask. Passing the image itself as the mask, as before,
    raises ``ValueError`` for modes without an alpha band (e.g. RGB, P) and
    misuses the luminance as transparency for ``L`` images. RGBA input
    produces the same result as before.

    Args:
        img: The PIL image to flatten; it is not modified.

    Returns:
        A new RGB image in which transparent areas show white.
    """
    rgba = img if img.mode == 'RGBA' else img.convert('RGBA')
    canvas = Image.new('RGB', size=(rgba.width, rgba.height), color=(255, 255, 255))
    # The alpha band of the RGBA image decides how much of each pixel is
    # copied over the white background.
    canvas.paste(rgba, (0, 0), mask=rgba)
    return canvas

4. 图像数据转换为浮点数据并归一化

# Excerpt: turn the image into a float32 NumPy array.
image = np.array(image).astype(np.float32)
# Prepend a new axis 0, then divide by 255 so 0-255 pixel values land in [0, 1].
image = np.expand_dims(image, axis=0) / 255.

image = np.array(image).astype(np.float32)：首先，将图像从PIL图像或其他格式转换为NumPy数组，并确保数据类型为float32。这是为了后续的数学运算，特别是归一化和标准化。
image = np.expand_dims(image, axis=0) / 255.：然后，通过np.expand_dims在最前面（axis=0）增加一个维度。对于灰度图，形状由（高度、宽度）变为（1、高度、宽度），新增的这一维相当于通道维；对于RGB图，这一维会在后面的预处理中通过image[0]去掉。真正的批处理维度是在推理前通过np.array([image])添加的。之后，将图像数据除以255，将其归一化到[0, 1]区间内。

5. 图像数据预处理

# Excerpt: normalise the [0, 1] pixel data according to the model in use.
if not self.use_import_onnx:
   # Built-in model: (x - 0.5) / 0.5 maps [0, 1] onto [-1, 1].
   image = (image - 0.5) / 0.5
else:
    if self.__channel == 1:
        # Imported single-channel model: fixed mean 0.456 and scale 0.224.
        image = (image - 0.456) / 0.224
    else:
        # Imported multi-channel model: one mean/scale value per channel,
        # broadcast along the last axis.
        image = (image - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225])
        # Drop the leading axis, then reorder the axes (2, 0, 1),
        # presumably HWC -> CHW.
        image = image[0]
        image = image.transpose((2, 0, 1))

这段代码主要进行了图像数据的预处理，具体地，根据是否使用私人的onnx模型(self.use_import_onnx)以及图像的通道数(self.__channel)，对图像数据image进行了不同的归一化处理。这种处理在机器学习和深度学习模型中是常见的，特别是当使用预训练的模型进行推理时，需要确保输入数据与模型训练时使用的数据具有相同的分布。

如果不使用私人的ONNX模型 (self.use_import_onnx 为 False, 也就是使用官方的模型)

图像数据image会先减去0.5，然后除以0.5，实现了一个简单的归一化。由于上一步已经除以255，此时像素值位于[0, 1]范围内，这一步将其线性映射到[-1, 1]范围。这种归一化方式可能适用于某些特定训练的模型。

如果使用私人的ONNX模型 (self.use_import_onnx 为 True)

首先，根据图像的通道数self.__channel进行不同的处理。
如果图像是单通道(self.__channel == 1)，则图像数据image会先减去0.456，然后除以0.224，实现另一种归一化。这两个参数（0.456和0.224）与下面三通道分支中第二个通道（G通道）所用的均值和标准差相同，这里被直接用于单通道图像（如灰度图）。
如果图像是多通道（通常是RGB三通道），则图像数据image会先减去一个包含三个值的数组[0.485, 0.456, 0.406]（这些值分别是RGB三通道的均值），然后除以另一个包含三个值的数组[0.229, 0.224, 0.225]（这些值分别是RGB三通道的标准差或缩放因子）。这种归一化方式是为了将图像数据标准化到常见的分布，与许多预训练的深度学习模型（如ResNet, VGG等）训练时使用的数据分布相匹配。
接着，对于多通道图像，还执行了两个额外的步骤：
image = image[0]：由于之前通过np.expand_dims增加了一个维度，这里通过索引[0]将其移除，恢复到原始的三维形状（高度、宽度、通道数）。
image = image.transpose((2, 0, 1))：最后，将图像的维度从（高度、宽度、通道数）转换为（通道数、高度、宽度）。这是因为某些模型（特别是使用PyTorch等框架训练的模型）期望输入数据的维度顺序为（通道数、高度、宽度）。

6. 运行模型，返回预测结果

ort_inputs = {'input1': np.array([image]).astype(np.float32)}
ort_outs = self.__ort_session.run(None, ort_inputs)
result = []
if self.__word:
    for item in ort_outs[1]:
        result.append(self.__charset[item])
else:
    if not self.use_import_onnx:
         # 概率输出仅限于使用官方模型
         if probability:
             ort_outs = ort_outs[0]
             ort_outs = np.exp(ort_outs) / np.sum(np.exp(ort_outs))
             ort_outs_sum = np.sum(ort_outs, axis=2)
             ort_outs_probability = np.empty_like(ort_outs)
             for i in range(ort_outs.shape[0]):
                 ort_outs_probability[i] = ort_outs[i] / ort_outs_sum[i]
             ort_outs_probability = np.squeeze(ort_outs_probability).tolist()
             result = {}
             if len(self.__charset_range) == 0:
                 # 返回全部
                 result['charsets'] = self.__charset
                 result['probability'] = ort_outs_probability
             else:
                 result['charsets'] = self.__charset_range
                 probability_result_index = []
                 for item in self.__charset_range:
                     if item in self.__charset:
                         probability_result_index.append(self.__charset.index(item))
                     else:
                         # 未知字符
                         probability_result_index.append(-1)
                 probability_result = []
                 for item in ort_outs_probability:
                     probability_result.append([item[i] if i != -1 else -1 for i in probability_result_index ])
                 result['probability'] = probability_result
             return result
         else:
             last_item = 0
             argmax_result = np.squeeze(np.argmax(ort_outs[0], axis=2))
             for item in argmax_result:
                 if item == last_item:
                     continue
                 else:
                     last_item = item
                 if item != 0:
                     result.append(self.__charset[item])
             return ''.join(result)

     else:
         last_item = 0
         for item in ort_outs[0][0]:
             if item == last_item:
                 continue
             else:
                 last_item = item
             if item != 0:
                 result.append(self.__charset[item])
         return ''.join(result)