# %%
# Load a PDF, extract the text of every page, and split it into overlapping
# chunks for later embedding/retrieval.
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter

pdf_path = '2023-LiuGuokai-Meas.pdf'
pdf_reader = PdfReader(pdf_path)

# Concatenate the extracted text of all pages in page order.
# `or ""` keeps the join safe should a page yield no text (e.g. an
# image-only page) -- NOTE(review): confirm against the PyPDF2 version in use.
text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Split into chunks: break on newlines, at most 1000 characters per chunk
# (measured with len), with 200 characters of overlap between chunks.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
chunks = text_splitter.split_text(text)
# %%
C. 对话问答代码
# Reference: "Embedding models", Ollama blog (Ollama Chinese site)
# https://ollama.org.cn/blog/embedding-models

# Step 1: generate embeddings
import ollama
import chromadb

documents = [
    "Llamas are members of the camelid family meaning they're pretty closely related to vicuñas and camels",
    "Llamas were first domesticated and used as pack animals 4,000 to 5,000 years ago in the Peruvian highlands",
    "Llamas can grow as much as 6 feet tall though the average llama between 5 feet 6 inches and 5 feet 9 inches tall",
    "Llamas weigh between 280 and 450 pounds and can carry 25 to 30 percent of their body weight",
    "Llamas are vegetarians and have very efficient digestive systems",
    "Llamas live to be about 20 years old, though some only live for 15 years and others live to be 30 years old",
]

client = chromadb.Client()
collection = client.create_collection(name="docs")

# Store each document in a vector embedding database, using the document's
# position in the list as its id.
for i, d in enumerate(documents):
    response = ollama.embeddings(model="mxbai-embed-large", prompt=d)
    embedding = response["embedding"]
    collection.add(
        ids=[str(i)],
        embeddings=[embedding],
        documents=[d],
    )

# Step 2: retrieve
# An example prompt
prompt = "What animals are llamas related to?"

# Generate an embedding for the prompt (same model as used for the documents)
# and retrieve the single most relevant doc.
response = ollama.embeddings(
    prompt=prompt,
    model="mxbai-embed-large",
)
results = collection.query(
    query_embeddings=[response["embedding"]],
    n_results=1,
)
# First (and only) query -> first (and only) returned document.
data = results['documents'][0][0]

# Step 3: generate
# Generate a response combining the prompt and the data retrieved in step 2.
output = ollama.generate(
    model="qwen2:7b",
    prompt=f"Using this data: {data}. Respond to this prompt: {prompt}",
)

print(output['response'])
原论文:On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes
摘要
知识蒸馏(KD)被广泛用于通过训练较小的学生模型来压缩教师模型,以降低推理成本和内存占用。然而,当前用于自回归序…
01.Selenium4.0实现搜索功能
1.安装Selenium及查看Selenium版本
pip install selenium
pip show seleniumfrom selenium import webdriver
from chromedriver_py import binary_path
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.commo…
利用导数描绘函数图形的一般步骤如下: (1)确定函数 $y = f(x)$ 的定义域及函数所具有的某些特性(如奇偶性、周期性等),并求出函数的一阶导数 $f'(x)$ 和二阶导数 $f''(x)$ …
目录:说明、图片示例
说明
数据集格式:YOLO格式
图片数量:55952
标注数量(txt文件个数):55952
标注类别数:7
标注类别名称: one two three four five good ok
数据集下载:手势识别数据集
图片示例
数…