# Step 1: convert the PDF into page images.
import os
from pdf2image import convert_from_path
# Render every page of the PDF to a PNG file inside output_folder.
pdf_path = '/Users/xxx/2022.pdf'
output_folder = './output_images2022'
output_name = 'page'

# exist_ok=True replaces the separate "exists?" check and is safe to re-run.
os.makedirs(output_folder, exist_ok=True)

images = convert_from_path(pdf_path, dpi=300)

# Files are numbered from 1: page_1.png, page_2.png, ...
# The file name must contain no stray spaces, otherwise the target directory
# (" ./output_images2022 ") would not match the one created above.
for page_number, image in enumerate(images, start=1):
    target = os.path.join(output_folder, f'{output_name}_{page_number}.png')
    image.save(target, 'PNG')
# Step 2: OCR the page images and count keyword occurrences.
from PIL import ImageEnhance
import pytesseract
from PIL import Image
from openpyxl import Workbook
def enhance_image(img):
    """Return ``img`` converted to mode ``'L'`` with its contrast doubled.

    Args:
        img: Image object offering ``convert``; it is not modified, a new
            image is returned instead.

    Returns:
        The result of ``ImageEnhance.Contrast(...).enhance(2.0)`` applied to
        the mode-``'L'`` (single-channel grayscale) version of ``img``.
    """
    grayscale = img.convert('L')
    contrast = ImageEnhance.Contrast(grayscale)
    return contrast.enhance(2.0)
def allimngs(image_path):
    """OCR the image stored at ``image_path`` and return its text.

    The image is opened, passed through ``enhance_image`` and recognised with
    ``pytesseract.image_to_string`` using the ``chi_sim`` language data.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognised text with every space character removed (line breaks
        are left in place).
    """
    prepared = enhance_image(Image.open(image_path))
    recognised = pytesseract.image_to_string(prepared, lang="chi_sim")
    return recognised.replace(' ', '')
class TrieNode:
    """A single node of a character trie."""

    def __init__(self):
        # Child nodes, keyed by the character that leads to each of them.
        self.children = dict()
        # Keywords recorded on this node; starts out empty.
        self.keywords = list()
class Trie:
    """Character trie whose nodes remember the keywords that end on them."""

    def __init__(self):
        # The root stands for the empty prefix.
        self.root = TrieNode()

    def insert(self, keyword):
        """Store ``keyword``, creating one node per character where needed.

        The keyword is appended to the ``keywords`` list of the node reached
        after its last character (the root itself for an empty keyword).
        """
        cursor = self.root
        for ch in keyword:
            child = cursor.children.get(ch)
            if child is None:
                child = TrieNode()
                cursor.children[ch] = child
            cursor = child
        cursor.keywords.append(keyword)
def count_keywords(text, keywords):
    """Count keyword occurrences in ``text`` using greedy longest matching.

    The text is scanned left to right. At each position the longest keyword
    starting there is counted once and the scan resumes right after it, so
    counted occurrences never overlap. If no keyword starts at the position,
    the scan advances by one character.

    Args:
        text: The string to search.
        keywords: Iterable of keyword strings; duplicates are ignored.

    Returns:
        A dict mapping every distinct keyword to its count (0 when absent),
        ordered by first appearance in ``keywords``.
    """
    # dict.fromkeys de-duplicates while keeping the caller's order, so the
    # result order is stable between runs (a set would not guarantee that).
    unique_keywords = list(dict.fromkeys(keywords))

    trie = Trie()
    for kw in unique_keywords:
        trie.insert(kw)

    counters = dict.fromkeys(unique_keywords, 0)
    n = len(text)
    i = 0
    while i < n:
        node = trie.root
        matched_node = None  # node where the longest keyword found so far ends
        end_pos = i
        for j in range(i, n):
            node = node.children.get(text[j])
            if node is None:
                break
            if node.keywords:
                matched_node = node
                end_pos = j + 1

        if matched_node is None:
            i += 1
            continue

        # Count from the node of the longest *complete* keyword, not from the
        # deepest node reached: the walk may have continued into a longer
        # keyword that was never completed, and that node carries no keywords.
        for kw in matched_node.keywords:
            counters[kw] += 1
        i = end_pos
    return counters
if __name__ == "__main__":
    # Keywords whose occurrences are counted in the OCR text.
    keywords = [
        '矮小',
        '安于现状',
        '暗藏',
        '暗淡',
        '暗黑',
    ]

    # Page images are read as page_1.png .. page_108.png.
    # NOTE(review): 108 is presumably the page count of the source PDF - confirm.
    page_count = 108

    # OCR the pages in order and join once instead of growing a string
    # page by page.
    page_texts = [
        allimngs(f"/Users/xxx/output_images2022/page_{page}.png")
        for page in range(1, page_count + 1)
    ]
    # Drop spaces and line breaks so keywords split across lines still match.
    all_text = ''.join(page_texts).replace(' ', '').replace('\n', '')

    result = count_keywords(all_text, keywords)

    # One spreadsheet row per keyword: column A = keyword, column B = count.
    workbook = Workbook()
    sheet = workbook.active
    for num, (k, v) in enumerate(result.items(), start=1):
        sheet[f'A{num}'] = k
        sheet[f'B{num}'] = v
        print(k, v, num)
    workbook.save(filename='2022.xlsx')