Python办公自动化（2）对Word和PDF的操作

一、操作word文档

终端下载操作word文件的工具库：

pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple python-docx

1.遍历文档中内容

paragraphs：段落属性，返回由段落对象组成的列表；遍历每一个段落对象，通过其text属性获得文本

# Library for working with .docx documents
from docx import Document
# Open the Word file
doc = Document('需求规约v1.0.docx')
# Debug aid: print(doc.paragraphs) shows the raw paragraph objects
paragraph_texts = [paragraph.text for paragraph in doc.paragraphs]
# Emit the plain text of each paragraph, one per line
for paragraph_text in paragraph_texts:
    print(paragraph_text)

2.遍历文档中所有表格

from docx import Document
doc = Document('需求规约v1.0.docx')
for t in doc.tables:  # every table in the document
    for row in t.rows:  # every row of the current table
        # Build the row text first, then print it once per row: the cell
        # texts are joined with '|' so a table row reads as a single line.
        _row_str = '|'.join(cell.text for cell in row.cells)
        print(_row_str)

3.检索文档中的关键字

批量查找word文档，检索哪些文档中含有关键字。

1.引库

无需多言

import glob
from docx import Document

2.创建ReadDoc类

用于读取 Word 文档中的段落和表格内容，并将它们分别存储到类的属性中。

class ReadDoc(object):
    """Collect the paragraph text and the table text of one Word document.

    After construction ``p_text`` holds the text of every paragraph, each
    followed by a newline, and ``table_text`` holds one line per table row
    in which every cell's text is followed by a comma.
    """

    def __init__(self, path):
        # Open the document and run both extraction passes right away so the
        # two text attributes are filled by the time construction returns.
        self.doc = Document(path)
        self.p_text = ''      # accumulated paragraph text
        self.table_text = ''  # accumulated table text
        self.get_para()
        self.get_table()

    def get_para(self):
        """Append every paragraph's text plus a newline to ``p_text``."""
        self.p_text += ''.join(
            paragraph.text + '\n' for paragraph in self.doc.paragraphs
        )

    def get_table(self):
        """Append one comma-terminated line per table row to ``table_text``."""
        for tbl in self.doc.tables:
            for row in tbl.rows:
                row_line = ''.join(cell.text + ',' for cell in row.cells)
                self.table_text += row_line + '\n'

3.创建search_word函数

用于在指定路径下的 Word 文档中查找是否包含所有指定的关键字。

def search_word(path, targets):
    """Return the .docx files matched by a glob pattern that contain every keyword.

    Args:
        path: Glob pattern handed to ``glob.glob`` (for example ``'/docs/*'``).
        targets: Iterable of keyword strings; a file qualifies only when all
            of them occur in its combined paragraph and table text.

    Returns:
        A list of the matching file paths, in the order ``glob.glob``
        produced them.
    """
    # Import os directly instead of reaching it through ``glob.os``, which
    # only works because the glob module happens to import os internally.
    import os

    final_result = []
    for candidate in glob.glob(path):
        # Only regular files with a .docx suffix are opened as Word documents.
        if not (os.path.isfile(candidate) and candidate.endswith('.docx')):
            continue
        doc = ReadDoc(candidate)
        all_text = doc.p_text + doc.table_text
        # all() stops at the first missing keyword, like the original break.
        if all(target in all_text for target in targets):
            final_result.append(candidate)
    return final_result

4.主程序

用于在当前目录下的所有文件中查找包含指定关键字的 Word 文档。

if __name__ == '__main__':
    # Use os directly rather than the undocumented ``glob.os`` attribute.
    import os

    # Glob pattern matching every entry of the current working directory
    path = os.path.join(os.getcwd(), '*')
    res = search_word(path, ['python', 'golang', '最佳'])
    print(res)

4.生成word文件

1.创建一个docx文件

from docx import Document
doc = Document()
# ... add content here ...
doc.save('text.docx') # write the Word file to disk

2.添加/追加标题

添加标题
参数1：标题内容；参数2：标题级别0-9（0为文档主标题，数字越大标题级别越低）

# Arguments: the heading text, then the heading level (0 here)
title = doc.add_heading('My Title',0)

追加标题
即在大标题下写一个小标题

# Append a further run of text, starting with a newline, to the heading object
title.add_run('\n123456')

3.添加段落

# Create a paragraph with its initial text, then append a second run to it
p = doc.add_paragraph('今天下雨辣')
p.add_run('\n其实也可能不下雨')

4.添加图片

参数1：图片的名称；参数2：图片的宽度，Inches：英寸单位

from docx.shared import Inches

# Insert the image file, passing a width of 2 inches
doc.add_picture('tupian.jpg',width=Inches(2))

5.添加表格

# Build the header row
table_title = ['name','age','sex']  # column captions
table = doc.add_table(rows=1,cols=3)  # start with one row of three columns
title_cells = table.rows[0].cells  # cells of that first row
for title_cell, caption in zip(title_cells, table_title):
    title_cell.text = caption
# Body records, one tuple per table row
data = [
    ('Adela','18','woman'),
    ('Hecate','15','woman'),
    ('Hela','14','woman'),
]
# Append one row per record and fill its cells left to right (name, age, sex)
for d in data:
    row_cells = table.add_row().cells
    for body_cell, value in zip(row_cells, d):
        body_cell.text = value

6.添加分页

doc.add_page_break()
# title1 = doc.add_heading('My Title2',0)  <- optional: add any content after the break

5.设置word样式

1.创建一个docx文件

from docx import Document
# ... import further helper modules here ...
doc = Document()
# ... add content and styling here ...
doc.save('test.docx')

2. 定义全局样式

from docx.oxml.ns import qn
from docx.shared import RGBColor, Pt

# Restyle the built-in 'Normal' style: font, colour and size
style = doc.styles['Normal']
style.font.name = '微软雅黑'
# font.name does not fill the East Asian font slot, so Chinese text would keep
# the default font; set w:eastAsia explicitly as well (the same technique the
# notice generator later in this file uses for its heading run).
style.element.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')
style.font.color.rgb = RGBColor(255,0,0)
style.font.size = Pt(16)
doc.add_paragraph('Java语言和Python已经成为开发者的必备语言')

3.定义标题样式

from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt
# Add a heading, centre it, then set the font size on the heading's style object
title = doc.add_heading('My Title',0)
title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
title.style.font.size = Pt(20)

4.添加表格样式

from docx.enum.style import WD_STYLE_TYPE
# List the names of the table styles available in this document; this output
# exists only to help pick a value for the `style` argument below.
for i in doc.styles:
    if i.type == WD_STYLE_TYPE.TABLE:
        print(i.name)
# Build the table with a named style. The `style=` keyword is the only
# difference from an unstyled table; the highlight markers that surrounded it
# were not valid Python and have been removed.
table_title = ['name','age','sex']  # column captions
table = doc.add_table(rows=1,cols=3,style='Colorful Grid Accent 3')  # one header row, three columns
title_cells = table.rows[0].cells  # cells of the header row
title_cells[0].text = table_title[0]
title_cells[1].text = table_title[1]
title_cells[2].text = table_title[2]
# Body records, one tuple per table row
data = [
    ('Adela','18','woman'),
    ('Hecate','15','woman'),
    ('Hela','14','woman'),
]
# Append one row per record and fill its cells
for d in data:
    row_cells = table.add_row().cells  # add a row and fetch its cells
    row_cells[0].text = d[0]  # name
    row_cells[1].text = d[1]  # age
    row_cells[2].text = d[2]  # sex

二、操作PDF文档

1.安装工具

1.wkhtmltopdf

将 HTML 页面或网页转换为 PDF 文件工具：

下载网址：wkhtmltopdf（https://wkhtmltopdf.org/downloads.html）

配置环境变量后可在pycharm终端中查询版本：

wkhtmltopdf -V

2.pdfkit

将 HTML、CSS 和 JavaScript 转换为 PDF 格式的工具：

在pycharm终端下载工具库即可：

pip3 install pdfkit

3.pywin32

在 Python 环境中直接使用 Windows 的各种功能:

包括操作系统的文件系统、注册表、图形用户界面等

pip3 install pywin32

4.pypdf2

用于处理 PDF 文件的 Python 库:

它提供了丰富的功能，包括读取、合并、拆分、加密、解密 PDF 文件等。

pip3 install pypdf2

5.pdfplumber

也是用于处理 PDF 文件的 Python 库:

主要功能是从 PDF 文件中提取文本、表格、图像等数据。

pip3 install pdfplumber

2.生成PDF文件

1.html转换成pdf

参数1：html文件；参数2：转换pdf文件的名字

import pdfkit
# Arguments: the source HTML file, then the name of the PDF to write
pdfkit.from_file('htmldemo.html','test0.pdf')

2.网址的html转换成pdf

# Render several web pages into one PDF by passing a list of URLs.
# NOTE(review): the author reported this call failing and blamed page size;
# the second URL was malformed ('www/jd.com' instead of 'www.jd.com'), which
# is the likelier cause. Corrected here; re-run to confirm.
pdfkit.from_url(['https://www.baidu.com','https://www.jd.com'],'test1.pdf')

3.字符串转pdf

import pdfkit
# HTML source held in a string; the meta tag declares UTF-8 so the Chinese
# text in the body is encoded consistently.
html = '''
<html>
    <head>
        <meta charset="utf-8"/>
    </head>
    <body>
        <p>你好</p>
    </body>
</html>
'''
# Arguments: the HTML string, then the name of the PDF to write
pdfkit.from_string(html,'test2.pdf')

3.生成通知书

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Pt, RGBColor, Inches

def create_doc(car_no,year,month,day,hour,minute,money,type_info,save_path='通知书.docx'):
    """Generate a vehicle-violation notice as a Word document.

    Args:
        car_no: Plate characters inserted after the fixed '辽A' prefix.
        year, month, day, hour, minute: Parts of the violation timestamp,
            interpolated into the notice text as given.
        money: Fine amount interpolated into the notice text.
        type_info: Description of the violation.
        save_path: Where to write the document. Defaults to the file name the
            function always used, so existing callers are unaffected.
    """
    doc = Document()

    # Heading: one large red run, centred.
    title = doc.add_paragraph()
    p1 = title.add_run('车辆违章处罚通知单')
    p1.font.size = Pt(30)
    p1.font.color.rgb = RGBColor(255,0,0)
    # Set a real font name instead of an empty string; assigning font.name is
    # also what creates the rFonts element that the next line edits.
    p1.font.name = '黑体'
    # The East Asian font slot has to be set separately for Chinese text.
    p1._element.rPr.rFonts.set(qn('w:eastAsia'),'黑体')
    title.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # NOTE(review): parts of the wording below look garbled (for example
    # '公可按票安企法规'); the text is left unchanged because it is program
    # output. Confirm the intended wording with the author.
    info = f'''辽A{car_no}车于{year}年{month}月{day}日{hour}时{minute}分在营过程中出现{type_info}(违章)现象。公可按票安企法规和公司相关制度发定决定对该车驾驶员处以{money}元款，要求你在今后的营运过程中严格按照相关法律法规运行。(注，罚款金请在返程后立即到公司缴纳)
                                                                
                                                                        驾驶员签字:                  年       月       日     '''
    content = doc.add_paragraph()
    content.add_run(info)
    content.paragraph_format.first_line_indent = Inches(0.25)

    doc.save(save_path)

if __name__ == '__main__':
    # Demo record passed by keyword so each value is labelled at the call site
    create_doc(
        car_no='123456',
        year=2030,
        month=8,
        day=8,
        hour=16,
        minute=25,
        money=200,
        type_info='违停',
    )

4.通过模板生成文档

from docx import Document
import os

# One record per notice; position n of a record replaces placeholder '{n}'.
# Presumably plate, year, month, day, hour, minute, violation, fine; verify
# against the template file.
infos = [
    ['辽A00001',2030,12,12,12,12,'违停',200],
    ['辽A00002',2030,11,11,11,11,'闯红灯',500],
    ['辽A00003',2030,10,10,10,10,'压线',200],
]

# Create the output folder once; exist_ok replaces the exists() check that
# was repeated for every record.
os.makedirs('./通知', exist_ok=True)

for info in infos:
    # Reload the template for each record so every notice starts from
    # untouched placeholders.
    doc = Document('word_模板.docx')
    for p in doc.paragraphs:
        for run in p.runs:
            filled = run.text
            # Substitute '{0}', '{1}', ... in order with the record's values.
            for idx, value in enumerate(info):
                filled = filled.replace(f'{{{idx}}}', str(value))
            # Write back only when a placeholder was actually replaced, so
            # runs without placeholders are left untouched.
            if filled != run.text:
                run.text = filled

    doc.save(f'./通知/{info[0]}.docx')

5.word转换pdf文件

from win32com.client import constants,gencache

def createPdf(wordPath,pdfPath):
    """Export a Word document to PDF through the Word COM application.

    Args:
        wordPath: Path of the source .docx file, passed to Documents.Open.
        pdfPath: Path of the PDF to create, passed to ExportAsFixedFormat.
    """
    # Get the Word application object through COM
    word = gencache.EnsureDispatch('Word.Application')
    try:
        # Open the source read-only; it is only being exported
        doc = word.Documents.Open(wordPath,ReadOnly=1)
        # Export as PDF with the same Item and CreateBookmarks options as before
        doc.ExportAsFixedFormat(
            pdfPath,
            constants.wdExportFormatPDF,
            Item=constants.wdExportDocumentWithMarkup,
            CreateBookmarks=constants.wdExportCreateHeadingBookmarks,
        )
    finally:
        # Quit even when opening or exporting raises, so a failed conversion
        # does not leave the Word application running.
        word.Quit(constants.wdDoNotSaveChanges)

if __name__ == '__main__':
    # Folder prefix shared by the source document and the exported PDF
    base_dir = 'D:/workspace/demo/PythonOfficeAutomation/example2/'
    createPdf(f'{base_dir}简历1.docx', f'{base_dir}简历1copy.pdf')

6.读取pdf文件

def read_pdf2(path):
    """Print the extracted text of every page of the PDF at *path*."""
    import pdfplumber
    with pdfplumber.open(path) as pdf:
        # Walk the pages directly instead of indexing by position
        for page in pdf.pages:
            print(page.extract_text())
if __name__ == '__main__':
    # Demo run: print the text of a sample PDF given by relative path
    read_pdf2('简历1copy.pdf')

7.合并pdf文件

def merger_pdf(path1,path2,out_path='./合并pdf.pdf'):
    """Concatenate two PDF files into one.

    Args:
        path1: First input PDF; its pages come first in the result.
        path2: Second input PDF; its pages follow.
        out_path: Where to write the merged PDF. Defaults to the file name
            the function always used.
    """
    from contextlib import ExitStack
    from PyPDF2 import PdfWriter,PdfReader

    write = PdfWriter()
    # ExitStack keeps both inputs open until the merged file has been written
    # and then closes them; previously the handles were never closed.
    with ExitStack() as stack:
        for path in (path1, path2):
            src = stack.enter_context(open(path,'rb'))
            for page in PdfReader(src).pages:
                write.add_page(page)

        # Write once, after all pages are collected, instead of rewriting the
        # output after each input file.
        with open(out_path,'wb') as out:
            write.write(out)
if __name__ == '__main__':
    # Demo run: merge two sample PDFs given by relative path
    merger_pdf('简历1copy.pdf','test0.pdf')

8.拆分pdf文件

def chaifen_pdf(path):
    """Split the PDF at *path* into one single-page PDF per page.

    Page number k (counting from 1) is written to ``./拆分_k.pdf``.
    """
    from PyPDF2 import PdfWriter,PdfReader

    # Keep the source open while pages are copied, then close it; previously
    # the handle was never closed.
    with open(path,'rb') as src:
        pdf = PdfReader(src)
        for i,page in enumerate(pdf.pages):
            # A fresh writer per page so each output holds exactly one page
            writer = PdfWriter()
            writer.add_page(page)
            with open(f'./拆分_{i+1}.pdf','wb') as out:
                writer.write(out)
if __name__ == '__main__':
    # Demo run: split the PDF that merger_pdf writes by default
    chaifen_pdf('./合并pdf.pdf')

9.加密解密pdf文件

def jiami(path,password='123456',out_path='加密pdf.pdf'):
    """Write a password-protected copy of the PDF at *path*.

    Args:
        path: Source PDF.
        password: Password set on the copy. Defaults to the value the
            function always used.
        out_path: Where to write the encrypted copy. Defaults to the file
            name the function always used.
    """
    from PyPDF2 import PdfWriter,PdfReader

    # Keep the source open until the copy is written, then close it;
    # previously the handle was never closed.
    with open(path,'rb') as src:
        pdf = PdfReader(src)
        # pdf.decrypt('...')  # needed first when the source itself is password-protected
        writer = PdfWriter()
        for page in pdf.pages:
            writer.add_page(page)
        # Set the password after the pages are added, the order used in the
        # library's documented example.
        writer.encrypt(password)
        with open(out_path,'wb') as target:
            writer.write(target)

if __name__ == '__main__':
    # Demo run: encrypt the first page file written by chaifen_pdf
    jiami('拆分_1.pdf')