Enterprise RAG with LLMs in Practice: A Hands-On Guide to Building Your Own RAG System
A complete RAG application consists of two stages:
Data preparation: (1) data extraction -> (2) chunking -> (3) vectorization (embedding) -> (4) loading into the vector store
Retrieval and generation: (1) vectorize the question -> (2) match it against the stored vectors -> (3) fetch the indexed chunks -> (4) inject them into the prompt -> (5) let the LLM generate the answer
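Before walking through the full implementation, here is a minimal sketch of the two stages in isolation. It uses the same stack as the code below (LangChain, FAISS, HuggingFace embeddings); the sample text and question are made up for illustration, and prompt injection plus generation are deferred to the ChatDoc class that follows.

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.vectorstores import FAISS

# Stage 1: data preparation: extract, chunk, embed, and store
chunks = [Document(page_content="RAG retrieves relevant chunks before generating an answer.")]
db = FAISS.from_documents(chunks, HuggingFaceEmbeddings())

# Stage 2: retrieval: the question is embedded and matched against the store;
# the hits would then be injected into the prompt for the LLM
hits = db.similarity_search("What does RAG do?", k=1)
print(hits[0].page_content)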
# Imports
from langchain.document_loaders import UnstructuredExcelLoader, PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
# Multi-query retrieval and the LLM
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chat_models import ChatLiteLLM
from langchain.prompts import ChatPromptTemplate
# Streaming callbacks used in chatWithDoc below
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
# Swap the stdlib sqlite3 for pysqlite3, a common fix for environments
# (such as some Kaggle/Colab images) whose bundled sqlite3 is too old
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
# Define ChatDoc
class ChatDoc:
    def __init__(self):
        self.doc = None
        self.splitText = []
        self.template = [
            ("system", "You are a secretary who works with documents. You never say you are a large language model or an AI assistant. Answer questions based on the context provided below.\nContext:\n{context}\n"),
            ('human', 'Hello!'),
            ('ai', 'Hello!'),
            ('human', '{question}')
        ]
        self.prompt = ChatPromptTemplate.from_messages(self.template)
    # Load the document
    def getFile(self):
        doc = self.doc
        loaders = {
            "docx": Docx2txtLoader,
            "pdf": PyPDFLoader,
            "xlsx": UnstructuredExcelLoader,
        }
        file_extension = doc.split('.')[-1]  # pick the loader by file extension
        loader_class = loaders.get(file_extension)
        if loader_class:
            try:
                loader = loader_class(doc)
                text = loader.load()
                return text
            except Exception as e:
                print(f'Error loading {file_extension} file: {e}')
                return None
        else:
            print(f'Unsupported file extension: {file_extension}')
            return None
    # Split the document into chunks
    def splitSentences(self):
        full_text = self.getFile()  # fetch the document content
        if full_text is not None:
            # chunk_size caps the characters per chunk; chunk_overlap is the
            # number of characters shared between adjacent chunks
            text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=20)
            self.splitText = text_splitter.split_documents(full_text)
    # Embed the chunks and store them in a vector database
    def embeddingAndVectorDB(self):
        embeddings = HuggingFaceEmbeddings()
        db = FAISS.from_documents(documents=self.splitText, embedding=embeddings)
        return db
    # Ask a question and retrieve the relevant chunks
    def askAndFindFiles(self, question):
        db = self.embeddingAndVectorDB()
        # Let the LLM rewrite the question from several angles to improve recall
        llm = ChatLiteLLM()
        retriever_from_llm = MultiQueryRetriever.from_llm(
            retriever=db.as_retriever(),
            llm=llm
        )
        return retriever_from_llm.get_relevant_documents(question)
    # Chat with the document in natural language
    def chatWithDoc(self, question):
        contents = ""
        context = self.askAndFindFiles(question)
        for doc in context:
            contents += doc.page_content
        messages = self.prompt.format_messages(context=contents, question=question)
        chat = ChatLiteLLM(
            streaming=True,
            verbose=True,
            callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
        )
        return chat.invoke(messages)
def main():
    chat_doc = ChatDoc()
    chat_doc.doc = '/kaggle/input/data-docxdata-docx/.docx'  # point this at your own document
    chat_doc.splitSentences()
    result = chat_doc.chatWithDoc('Who are you?')
    print(result)

if __name__ == '__main__':
    main()
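One note on configuration, as a hedged sketch rather than a definitive setup: ChatLiteLLM routes calls through LiteLLM, so the underlying provider is selected via the usual API-key environment variables plus an optional model name. The key placeholder and model name below are illustrative only; substitute whatever provider your environment supports.

import os
os.environ["OPENAI_API_KEY"] = "sk-..."    # assumption: an OpenAI-backed model; use your own key
chat = ChatLiteLLM(model="gpt-3.5-turbo")  # model name is illustrative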