Dify中集成magic-pdf实现文档解析agent与多模态大模型图文问答

大模型相关目录

大模型，包括部署、微调、Prompt/Agent应用开发、知识库增强、数据库增强、知识图谱增强、自然语言处理、多模态等大模型应用开发内容
从0起步，扬帆起航。

swift与Internvl下的多模态大模型分布式微调指南（附代码和数据）
多模态大模型Internvl-1.5-26B微调后部署及测试实录（附代码）
多模态大模型Internvl-2-26B的OCR赋能方案（附代码）
miniconda+xinference的大模型推理部署指南
Mem0：大模型最强赋能“有记忆的LLM”
再谈Agent：Dify智能体实现Text2SQL
Moe模式：或将是最好的大模型应用开发路径
一文带你了解大模型RAG
详细记录swift微调InternVL2-8B多模态大模型进行目标检测（附代码）
Dify中集成magic-pdf实现文档解析agent与多模态大模型图文问答

文章目录

大模型相关目录
前言
技术方案
效果

前言

Dify 是一款开源的大语言模型（LLM）应用开发平台。它结合了后端即服务（Backend as a Service）和 LLMOps 的理念，使开发者能够快速构建生产级的生成式 AI 应用。Dify 提供了内置的模型支持、Prompt 编排、RAG 引擎、Agent 框架、流程编排等技术栈，以及易用的界面和 API，支持自部署和数据控制。这个平台特别适合于创业公司快速将 AI 应用创意变为现实，也适用于将 LLM 集成到现有业务中，或作为企业级的 LLM 基础设施。

Magic-PDF 是一款将 PDF 转化为 markdown 格式的工具。支持多种前端模型输入；删除页眉、页脚、脚注、页码等元素；符合人类阅读顺序的排版格式；保留原文档的结构和格式，包括标题、段落、列表等；提取图像和表格并在markdown中展示；将公式转换成latex。

本文的目标是在 Dify 平台中集成 Magic-PDF 的 PDF 转 Markdown 功能，实现文档解析智能体，并在问答中支持图文对话。

技术方案

在这里插入图片描述

pdf预处理

def save_first_ten_pages(pdf_path, max_pages=10,
                         output_dir='/home/super/lyq/PDF-Extract-Kit/file_folder/'):
    """Copy the leading pages of a PDF into a new, truncated PDF file.

    Args:
        pdf_path: Path of the source PDF.
        max_pages: Upper bound on the number of pages copied (default 10).
            When the source has fewer pages, all of them are copied.
        output_dir: Directory the truncated copy is written to. This function
            does not create it. The default is the folder that used to be
            hard-coded here.

    Returns:
        The path of the written file: ``output_dir`` joined with
        ``first_ten_pages_<original file name>``. The prefix stays the same
        even when ``max_pages`` is not 10, so paths built from it elsewhere
        keep working.
    """
    source_pdf = PdfReader(pdf_path)
    truncated_pdf = PdfWriter()

    # min() covers PDFs shorter than the limit; a non-positive limit copies nothing.
    pages_to_copy = min(max_pages, len(source_pdf.pages))
    for page_index in range(pages_to_copy):
        truncated_pdf.add_page(source_pdf.pages[page_index])

    # Only the file name of the source is reused; its directory is dropped.
    target_path = os.path.join(
        output_dir, f"first_ten_pages_{os.path.basename(pdf_path)}"
    )
    with open(target_path, "wb") as output_file:
        truncated_pdf.write(output_file)

    return target_path

生成sh脚本并运行

def write_to_script(file_path, script_path="/home/super/lyq/pdf-kit-dify/test.sh"):
    """Generate the bash script that runs magic-pdf on one PDF.

    The script activates the ``pdf_e_k`` conda environment and then calls
    ``magic-pdf pdf-command`` on ``file_path``.

    Args:
        file_path: Path of the PDF to parse. It is shell-quoted before being
            embedded, so spaces, quotes, ``$(...)`` or backticks in the path
            reach magic-pdf literally instead of being interpreted by bash.
        script_path: Where the script is written. Defaults to the location
            that used to be hard-coded; the file is overwritten on each call.
    """
    import shlex  # local import keeps this block self-contained

    quoted_pdf = shlex.quote(str(file_path))
    script_content = f"""
source activate
conda deactivate
conda activate pdf_e_k
magic-pdf pdf-command --pdf {quoted_pdf} --inside_model true
echo 'deal success'
"""
    with open(script_path, "w", encoding="utf-8") as file:
        file.write(script_content)


def run_bash_script(script_path):
    """Run a bash script and report whether it succeeded.

    Args:
        script_path: Path of the script handed to ``bash``.

    Returns:
        True when the script ran and exited with status 0. False when the
        script file is missing, ``bash`` cannot be launched, or the script
        exits non-zero. Earlier versions always returned None, so callers
        that ignore the result are unaffected.
    """
    # bash signals a missing script through a non-zero exit status
    # (CalledProcessError), not FileNotFoundError, so test for the file here.
    if not os.path.isfile(script_path):
        print(f"Script '{script_path}' not found.")
        return False

    try:
        subprocess.run(['bash', script_path], check=True)
    except subprocess.CalledProcessError as e:
        print(f"An error occurred while executing '{script_path}': {e}")
        return False
    except FileNotFoundError:
        # Raised only when the `bash` executable itself cannot be found.
        print(f"Could not launch bash to run '{script_path}'.")
        return False

    print(f"Script '{script_path}' executed successfully.")
    return True

读取markdown数据并后处理为dify可处理格式

def read_markdown_file(file_path):
    """Return the text of a UTF-8 encoded Markdown file.

    Args:
        file_path (str): Location of the Markdown file.

    Returns:
        str: The complete file text on success. No exception is raised on
        failure; a human-readable error message is returned in its place.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as md_file:
            return md_file.read()
    except FileNotFoundError:
        message = "文件未找到，请检查路径是否正确。"
    except Exception as err:
        # Any other failure (permissions, decoding, ...) is reported as text.
        message = f"读取文件时发生错误: {err}"
    return message

def url_deal(md_string: str, input_file_name, base_url: str = "http://172.20.32.127:6007") -> dict:
    """Point every Markdown image link at the static image server.

    Each ``![alt](path)`` becomes
    ``![alt](<base_url>/<input_file_name>/auto/images/<file name>)``, where
    ``<file name>`` is the last ``/``-separated component of the original
    path. Alt text and all non-image text are left untouched.

    Args:
        md_string: Markdown text to rewrite.
        input_file_name: Path component inserted between the server root and
            ``auto/images/``.
        base_url: Root URL of the HTTP server that serves the images. A
            trailing slash is tolerated. Defaults to the address that used
            to be hard-coded here.

    Returns:
        A dict with the single key ``"result"`` holding the rewritten text.
    """
    image_root = f"{base_url.rstrip('/')}/{input_file_name}/auto/images/"

    def _rewrite(match):
        # Keep the alt text, swap the path for <image_root><file name>.
        alt_text, image_url = match.groups()
        return f"![{alt_text}]({image_root}{image_url.split('/')[-1]})"

    # A single re.sub pass replaces one whole-string str.replace per image.
    return {
        "result": re.sub(r'!\[(.*?)\]\((.*?)\)', _rewrite, md_string),
    }

值得注意的是，要使 Dify 能够解析图文，需配合代码在 magic-pdf 的输出目录下启动一个静态文件 server（端口需与 url_deal 中使用的 6007 保持一致）：

(base) super@super-SYS-420GP-TNR:~/lyq/PDF-Extract-Kit/temp_out/magic-pdf$

python3 -m http.server --directory=. 6007

效果

汇总代码：将上述工具函数汇总为接口函数，并以 FastAPI 服务的形式启动。

import subprocess
from PyPDF2 import PdfReader, PdfWriter
import os
import re
from fastapi import FastAPI, HTTPException, Depends


# FastAPI application object; the @app.get routes in this file are registered on it.
app = FastAPI()

def url_deal(md_string: str, input_file_name, base_url: str = "http://172.20.32.127:6007") -> dict:
    """Point every Markdown image link at the static image server.

    Each ``![alt](path)`` becomes
    ``![alt](<base_url>/<input_file_name>/auto/images/<file name>)``, where
    ``<file name>`` is the last ``/``-separated component of the original
    path. Alt text and all non-image text are left untouched.

    Args:
        md_string: Markdown text to rewrite.
        input_file_name: Path component inserted between the server root and
            ``auto/images/``.
        base_url: Root URL of the HTTP server that serves the images. A
            trailing slash is tolerated. Defaults to the address that used
            to be hard-coded here.

    Returns:
        A dict with the single key ``"result"`` holding the rewritten text.
    """
    image_root = f"{base_url.rstrip('/')}/{input_file_name}/auto/images/"

    def _rewrite(match):
        # Keep the alt text, swap the path for <image_root><file name>.
        alt_text, image_url = match.groups()
        return f"![{alt_text}]({image_root}{image_url.split('/')[-1]})"

    # A single re.sub pass replaces one whole-string str.replace per image.
    return {
        "result": re.sub(r'!\[(.*?)\]\((.*?)\)', _rewrite, md_string),
    }

def read_markdown_file(file_path):
    """Return the text of a UTF-8 encoded Markdown file.

    Args:
        file_path (str): Location of the Markdown file.

    Returns:
        str: The complete file text on success. No exception is raised on
        failure; a human-readable error message is returned in its place.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as md_file:
            return md_file.read()
    except FileNotFoundError:
        message = "文件未找到，请检查路径是否正确。"
    except Exception as err:
        # Any other failure (permissions, decoding, ...) is reported as text.
        message = f"读取文件时发生错误: {err}"
    return message
def save_first_ten_pages(pdf_path, max_pages=10,
                         output_dir='/home/super/lyq/PDF-Extract-Kit/file_folder/'):
    """Copy the leading pages of a PDF into a new, truncated PDF file.

    Args:
        pdf_path: Path of the source PDF.
        max_pages: Upper bound on the number of pages copied (default 10).
            When the source has fewer pages, all of them are copied.
        output_dir: Directory the truncated copy is written to. This function
            does not create it. The default is the folder that used to be
            hard-coded here.

    Returns:
        The path of the written file: ``output_dir`` joined with
        ``first_ten_pages_<original file name>``. The prefix stays the same
        even when ``max_pages`` is not 10, so paths built from it elsewhere
        keep working.
    """
    source_pdf = PdfReader(pdf_path)
    truncated_pdf = PdfWriter()

    # min() covers PDFs shorter than the limit; a non-positive limit copies nothing.
    pages_to_copy = min(max_pages, len(source_pdf.pages))
    for page_index in range(pages_to_copy):
        truncated_pdf.add_page(source_pdf.pages[page_index])

    # Only the file name of the source is reused; its directory is dropped.
    target_path = os.path.join(
        output_dir, f"first_ten_pages_{os.path.basename(pdf_path)}"
    )
    with open(target_path, "wb") as output_file:
        truncated_pdf.write(output_file)

    return target_path

def write_to_script(file_path, script_path="/home/super/lyq/pdf-kit-dify/test.sh"):
    """Generate the bash script that runs magic-pdf on one PDF.

    The script activates the ``pdf_e_k`` conda environment and then calls
    ``magic-pdf pdf-command`` on ``file_path``.

    Args:
        file_path: Path of the PDF to parse. It is shell-quoted before being
            embedded, so spaces, quotes, ``$(...)`` or backticks in the path
            reach magic-pdf literally instead of being interpreted by bash.
        script_path: Where the script is written. Defaults to the location
            that used to be hard-coded; the file is overwritten on each call.
    """
    import shlex  # local import keeps this block self-contained

    quoted_pdf = shlex.quote(str(file_path))
    script_content = f"""
source activate
conda deactivate
conda activate pdf_e_k
magic-pdf pdf-command --pdf {quoted_pdf} --inside_model true
echo 'deal success'
"""
    with open(script_path, "w", encoding="utf-8") as file:
        file.write(script_content)


def run_bash_script(script_path):
    """Run a bash script and report whether it succeeded.

    Args:
        script_path: Path of the script handed to ``bash``.

    Returns:
        True when the script ran and exited with status 0. False when the
        script file is missing, ``bash`` cannot be launched, or the script
        exits non-zero. Earlier versions always returned None, so callers
        that ignore the result are unaffected.
    """
    # bash signals a missing script through a non-zero exit status
    # (CalledProcessError), not FileNotFoundError, so test for the file here.
    if not os.path.isfile(script_path):
        print(f"Script '{script_path}' not found.")
        return False

    try:
        subprocess.run(['bash', script_path], check=True)
    except subprocess.CalledProcessError as e:
        print(f"An error occurred while executing '{script_path}': {e}")
        return False
    except FileNotFoundError:
        # Raised only when the `bash` executable itself cannot be found.
        print(f"Could not launch bash to run '{script_path}'.")
        return False

    print(f"Script '{script_path}' executed successfully.")
    return True

@app.get("/deal_pdf")
def exe(file_path):
    """Parse the first ten pages of a PDF with magic-pdf.

    Steps: truncate the PDF, write the bash script that calls magic-pdf on
    the truncated copy, then run that script.

    Args:
        file_path: Server-side path of the PDF, taken from the query string.
            NOTE(review): any path on the server is accepted; restrict it to
            an upload directory if this endpoint is reachable by untrusted
            clients.

    Returns:
        The directory expected to hold the magic-pdf output for this file.

    Raises:
        HTTPException: 404 when ``file_path`` is not an existing file; 500
            when the script runner reports a failure.
    """
    if not os.path.isfile(file_path):
        raise HTTPException(status_code=404, detail=f"PDF not found: {file_path}")

    new_file_path = save_first_ten_pages(file_path)
    write_to_script(new_file_path)

    # Must be the same script file that write_to_script writes by default.
    bash_script_path = '/home/super/lyq/pdf-kit-dify/test.sh'

    # `is False` rather than falsiness: a runner that returns None (no status
    # reported) is treated as success, exactly as before.
    if run_bash_script(bash_script_path) is False:
        raise HTTPException(status_code=500, detail="magic-pdf failed to process the PDF")

    # Strip only the final extension so "a.b.pdf" maps to "a.b", matching the
    # truncated file's name. Assumes magic-pdf names its output folder after
    # the input file's stem; confirm for the installed version.
    name_str = os.path.splitext(os.path.basename(file_path))[0]
    return f'/home/super/lyq/PDF-Extract-Kit/temp_out/magic-pdf/first_ten_pages_{name_str}/auto/'
@app.get("/tw_content")
def exe2(file_path):
    """Return the parsed Markdown of a PDF with its image links rewritten.

    Args:
        file_path: Path of the original PDF. Only its file name is used, to
            locate the Markdown file in the magic-pdf output directory.

    Returns:
        The dict produced by ``url_deal``. When the Markdown file cannot be
        read, ``read_markdown_file`` returns an error message, and that text
        is passed through in place of the document content.
    """
    # Strip only the final extension so "a.b.pdf" maps to "a.b". Assumes
    # magic-pdf names its output after the input file's stem; confirm for the
    # installed version.
    name_str = os.path.splitext(os.path.basename(file_path))[0]
    parsed_name = f'first_ten_pages_{name_str}'
    mid_path = f'/home/super/lyq/PDF-Extract-Kit/temp_out/magic-pdf/{parsed_name}/auto/{parsed_name}.md'
    file_content = read_markdown_file(mid_path)
    return url_deal(file_content, parsed_name)

if __name__ == "__main__":
    # When run as a script, serve the API with uvicorn on host 0.0.0.0
    # (all IPv4 interfaces), port 6008.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=6008)
# # Example usage
# file_path = "/home/super/lyq/PDF-Extract-Kit/半年项目创新点总结.pdf"
# print(exe(file_path))
# print(exe2(file_path))