一款基于Python的从常规文档里提取图片的简单工具开发方案
1. 环境准备
安装必需库
pip install python-docx PyMuPDF openpyxl beautifulsoup4 pillow requests
pip install pdfplumber
tkinter 随 Python 标准发行版自带,无需通过 pip 安装(PyPI 上的 tk 包并不是 tkinter;Linux 下如缺失可执行 sudo apt install python3-tk)
工具选择
开发环境:VSCode + Python插件
调试工具:Python IDLE(初学者友好)
打包工具:pyinstaller(可选,用于生成exe)
2. 项目架构设计
image-extractor/
├── main.py # 主程序入口
├── core/
│ ├── docx_extractor.py
│ ├── pdf_extractor.py
│ ├── excel_extractor.py
│ └── html_extractor.py
└── outputs/ # 默认输出目录
3. 核心功能实现
(1) Word文档提取(docx_extractor.py)
import zipfile
import os
from PIL import Image
def extract_docx_images(file_path, output_dir):
    """Copy every image embedded in a .docx file into ``output_dir``.

    A .docx file is a ZIP archive; this function treats every archive member
    whose name starts with ``word/media/`` as an embedded image. Each member
    is written straight to ``output_dir`` under its base name, so no
    ``word/media`` directory tree is left behind in the output directory and
    an already existing file of the same name is overwritten rather than
    making ``os.rename`` fail (as it does on Windows).

    Args:
        file_path: Path to the .docx file.
        output_dir: Directory that receives the images; created if missing.

    Returns:
        The number of image files written.

    Raises:
        zipfile.BadZipFile: If ``file_path`` is not a valid ZIP archive.
        OSError: If the archive cannot be read or an image cannot be written.
    """
    os.makedirs(output_dir, exist_ok=True)
    written = 0
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        for member in zip_ref.namelist():
            if not member.startswith('word/media/'):
                continue
            file_name = os.path.basename(member)
            if not file_name:
                # Directory entries end with "/" and carry no image data.
                continue
            with open(os.path.join(output_dir, file_name), 'wb') as out:
                out.write(zip_ref.read(member))
            written += 1
    return written
(2) PDF文件提取(pdf_extractor.py)
import fitz
import os
def extract_pdf_images(file_path, output_dir):
    """Save every image referenced by the pages of a PDF into ``output_dir``.

    Pages are visited in order; for each entry returned by
    ``page.get_images(full=True)`` the image bytes are pulled from the
    document by xref and written as
    ``pdf_page<page>_img<index>.<ext>``. The extension comes from the
    ``ext`` field reported by PyMuPDF, so a JPEG stream is no longer saved
    under a misleading ``.png`` name.

    Args:
        file_path: Path to the PDF file.
        output_dir: Directory that receives the images; created if missing.

    Returns:
        The number of image files written.

    Raises:
        OSError: If an image file cannot be written.
        Exception: Whatever ``fitz.open`` raises for an unreadable document.
    """
    os.makedirs(output_dir, exist_ok=True)
    img_count = 0
    doc = fitz.open(file_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                if not base_image:
                    # Nothing extractable behind this xref; skip instead of
                    # failing the whole document with a KeyError.
                    continue
                # Fall back to the historical extension if none is reported.
                ext = base_image.get("ext") or "png"
                img_path = os.path.join(
                    output_dir, f"pdf_page{page_num}_img{img_index}.{ext}"
                )
                with open(img_path, "wb") as f:
                    f.write(base_image["image"])
                img_count += 1
    finally:
        # Release the file handle even when extraction fails midway.
        doc.close()
    return img_count
(3) Excel文件提取(excel_extractor.py)
from openpyxl import load_workbook
import os
def extract_excel_images(file_path, output_dir):
    """Save the images attached to every worksheet of an .xlsx workbook.

    Images are read from each worksheet's private ``_images`` list and
    written as ``excel_<sheet title>_img<n>.<ext>``, where ``n`` counts
    images across the whole workbook.

    Args:
        file_path: Path to the .xlsx file.
        output_dir: Directory that receives the images; created if missing.

    Returns:
        The number of image files written.

    Raises:
        OSError: If an image file cannot be written.
        Exception: Whatever ``load_workbook`` raises for an unreadable file.
    """
    os.makedirs(output_dir, exist_ok=True)
    wb = load_workbook(file_path)
    img_count = 0
    try:
        for sheet in wb.worksheets:
            for image in sheet._images:
                # In openpyxl, Image._data is a method returning the raw
                # bytes; writing the bound method itself raised TypeError.
                raw = image._data
                payload = raw() if callable(raw) else raw
                # NOTE(review): relies on openpyxl keeping gif/jpeg/png bytes
                # untouched and re-encoding everything else as PNG -- confirm
                # against the installed openpyxl version.
                ext = (getattr(image, "format", None) or "png").lower()
                if ext not in ("gif", "jpeg", "png"):
                    ext = "png"
                img_path = os.path.join(
                    output_dir, f"excel_{sheet.title}_img{img_count}.{ext}"
                )
                with open(img_path, "wb") as f:
                    f.write(payload)
                img_count += 1
    finally:
        wb.close()
    return img_count
(4) HTML文件提取(html_extractor.py)
import requests
from bs4 import BeautifulSoup
import os
import base64
def extract_html_images(html_path, output_dir):
    """Save the inline ``data:image`` pictures of an HTML page.

    Only ``<img>`` tags whose ``src`` is a ``data:image/...`` URI are
    extracted; images referenced by URL or file path are not downloaded.
    Files are written as ``html_img<n>.<format>``.

    Args:
        html_path: Local path of an HTML file, or an ``http://`` /
            ``https://`` URL to fetch.
        output_dir: Directory that receives the images; created if missing.

    Returns:
        The number of image files written.

    Raises:
        requests.RequestException: If the page cannot be fetched or the
            server answers with an error status.
        OSError: If the local file cannot be read or an image written.
        binascii.Error: If a base64 payload is malformed.
    """
    from urllib.parse import unquote_to_bytes

    os.makedirs(output_dir, exist_ok=True)

    if html_path.startswith(('http://', 'https://')):
        # A timeout keeps a stalled server from hanging the caller forever.
        response = requests.get(html_path, timeout=30)
        response.raise_for_status()
        markup = response.content
    else:
        # Read bytes and let BeautifulSoup detect the encoding instead of
        # depending on the platform's default text encoding.
        with open(html_path, 'rb') as f:
            markup = f.read()
    soup = BeautifulSoup(markup, 'html.parser')

    img_count = 0
    for img in soup.find_all('img'):
        src = (img.get('src') or '').strip()
        if not src.startswith('data:image'):
            # Covers tags without src as well as non-inline images.
            continue
        header, sep, data = src.partition(',')
        if not sep:
            continue  # Malformed data URI without a payload.
        media_type, *params = header[len('data:'):].split(';')
        if '/' not in media_type:
            continue
        # "svg+xml" -> "svg" so the file gets a usable extension.
        img_format = media_type.split('/', 1)[1].split('+', 1)[0] or 'bin'
        if any(p.strip().lower() == 'base64' for p in params):
            img_data = base64.b64decode(data)
        else:
            # Non-base64 data URIs carry percent-encoded content.
            img_data = unquote_to_bytes(data)
        img_path = os.path.join(output_dir, f"html_img{img_count}.{img_format}")
        with open(img_path, 'wb') as f:
            f.write(img_data)
        img_count += 1
    return img_count
4. 交互界面开发(main.py)
import tkinter as tk
from tkinter import filedialog, messagebox
from core import docx_extractor, pdf_extractor, excel_extractor, html_extractor
import os
class ImageExtractorApp:
    """Tkinter front end that extracts images from a document.

    The window offers an input file field, an output directory field
    (default ``outputs``), a start button and a log area. Extraction is
    delegated to the extractor modules imported from ``core``.
    """

    def __init__(self, root):
        """Attach the application to ``root`` (a Tk window) and build the UI."""
        self.root = root
        self.root.title("多格式图片提取工具")
        self.file_path = tk.StringVar()
        self.output_dir = tk.StringVar(value="outputs")
        self.create_widgets()

    def create_widgets(self):
        """Lay out the file row, the output row, the start button and the log."""
        tk.Label(self.root, text="选择文件:").grid(row=0, column=0, padx=5, pady=5)
        tk.Entry(self.root, textvariable=self.file_path, width=40).grid(row=0, column=1)
        tk.Button(self.root, text="浏览", command=self.select_file).grid(row=0, column=2)

        tk.Label(self.root, text="输出目录:").grid(row=1, column=0)
        tk.Entry(self.root, textvariable=self.output_dir, width=40).grid(row=1, column=1)
        tk.Button(self.root, text="选择目录", command=self.select_output_dir).grid(row=1, column=2)

        tk.Button(self.root, text="开始提取", command=self.start_extraction).grid(
            row=2, column=1, pady=10
        )

        self.log_text = tk.Text(self.root, height=10, width=50)
        self.log_text.grid(row=3, column=0, columnspan=3)

    def select_file(self):
        """Ask for an input file; a cancelled dialog keeps the old value."""
        file_types = [
            ('支持的文件类型', '*.docx *.pdf *.xlsx *.html'),
            ('Word文档', '*.docx'),
            ('PDF文件', '*.pdf'),
            ('Excel文件', '*.xlsx'),
            ('网页文件', '*.html'),
        ]
        chosen = filedialog.askopenfilename(filetypes=file_types)
        if chosen:
            self.file_path.set(chosen)

    def select_output_dir(self):
        """Ask for an output directory; a cancelled dialog keeps the old value."""
        chosen = filedialog.askdirectory()
        if chosen:
            self.output_dir.set(chosen)

    def start_extraction(self):
        """Run the extractor matching the selected file and log the result.

        Shows an error dialog when no file is selected, when the file type
        is unsupported, or when the extractor raises.
        """
        file_path = self.file_path.get().strip()
        # An emptied output field falls back to the default directory
        # instead of making os.makedirs('') raise.
        output_dir = self.output_dir.get().strip() or "outputs"

        if not file_path:
            messagebox.showerror("错误", "请先选择文件")
            return

        # Built here rather than at class level so the extractor modules are
        # only looked up when an extraction is actually requested.
        extractors = {
            '.docx': docx_extractor.extract_docx_images,
            '.pdf': pdf_extractor.extract_pdf_images,
            '.xlsx': excel_extractor.extract_excel_images,
            '.html': html_extractor.extract_html_images,
        }
        if file_path.startswith(('http://', 'https://')):
            # The HTML extractor accepts URLs, which rarely end in ".html".
            extractor = html_extractor.extract_html_images
        else:
            ext = os.path.splitext(file_path)[1].lower()
            extractor = extractors.get(ext)

        if extractor is None:
            messagebox.showerror("错误", "不支持的文件类型")
            return

        try:
            os.makedirs(output_dir, exist_ok=True)
            count = extractor(file_path, output_dir)
        except Exception as e:
            # Top-level UI boundary: report any failure to the user instead
            # of letting it vanish into Tk's callback error handler.
            messagebox.showerror("错误", f"提取失败: {e}")
            return

        self.log_text.insert(tk.END, f"成功提取 {count} 张图片到 {output_dir}\n")
        self.log_text.see(tk.END)
def main():
    """Create the Tk root window, attach the app and run the event loop."""
    window = tk.Tk()
    # Keep a reference for the lifetime of the event loop.
    application = ImageExtractorApp(window)
    window.mainloop()
    return application


if __name__ == "__main__":
    main()
5. 使用说明
操作步骤
1. 运行 main.py
2. 点击"浏览"选择文件(支持 .docx/.pdf/.xlsx/.html)
3. 选择输出目录(默认 outputs)
4. 点击"开始提取"
5. 查看底部日志区域的提取结果
效果示例
成功提取 5 张图片到 outputs/
成功提取 3 张图片到 outputs/
6. 常见问题解决
Q1: Excel图片无法提取?
原因:openpyxl只能提取嵌入式图片,无法提取浮动图片。
解决方案:改用xlrd+图像坐标识别(需更复杂处理)。
Q2: PDF提取的图片模糊?
原因:PDF内嵌低分辨率图片。
解决方案:使用pdfplumber的更高精度提取模式。
Q3: 程序无响应?
原因:大文件处理耗时,阻塞主线程。
解决方案:改用多线程处理(参考threading模块)。
7. 项目扩展建议
增加批量处理:支持文件夹批量导入
添加图片预览:在界面中显示缩略图
支持压缩包:直接解压ZIP/RAR文件并处理内容
增加格式转换:自动转换HEIC/WEBP等特殊格式