paddle实现获取pdf的内容
- 1. 环境安装
- 2. 实现代码
- 源码链接
1. 环境安装
- 安装paddlepaddle
- gpu版本
python -m pip install paddlepaddle-gpu -i https://pypi.tuna.tsinghua.edu.cn/simple
- cpu版本:
python -m pip install paddlepaddle -i https://pypi.tuna.tsinghua.edu.cn/simple
- gpu版本
- 安装PaddleOCR
pip install "paddleocr>=2.0.1" # Recommend to use version 2.0.1+
- 其他库
pip install Pillow==9.5.0 pip install fitz==0.0.1.dev2 pip install numpy==1.24.4 pip install PyMuPDF==1.19.0 pip install opencv-python==4.6.0.66
2. 实现代码
- 代码
import cv2 import fitz import numpy as np from PIL import Image from paddleocr import PaddleOCR, draw_ocr ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=427) # page_num=pdf文件页数 img_path = 'data/深度学习进阶自然语言处理.pdf' result = ocr.ocr(img_path, cls=True) for idx in range(len(result)): res = result[idx] for line in res: print(line) # draw result imgs = [] with fitz.open(img_path) as pdf: for pg in range(0, pdf.pageCount): page = pdf[pg] mat = fitz.Matrix(2, 2) pm = page.getPixmap(matrix=mat, alpha=False) # if width or height > 2000 pixels, don't enlarge the image if pm.width > 2000 or pm.height > 2000: pm = page.getPixmap(matrix=fitz.Matrix(1, 1), alpha=False) img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR) imgs.append(img) for idx in range(len(result)): # 保存获取的文本 with open(f'data/data_txt/text_{idx}.txt', 'w', encoding='utf-8') as f: res = result[idx] image = imgs[idx] boxes = [line[0] for line in res] txts = [line[1][0] for line in res] for line in txts: f.write(line) f.write('\n') scores = [line[1][1] for line in res] im_show = draw_ocr(image, boxes, txts, scores, font_path='doc/fonts/simfang.ttf') im_show = Image.fromarray(im_show) # 保存图片 im_show.save('data/images/page_{}.jpg'.format(idx))
- 结果展示