# Randomly generate a dataset of vectors
import numpy as np
import pandas as pd
# Build the synthetic dataset: one million (id, vector) pairs, where each
# vector holds 300 samples from np.random.random (uniform on [0, 1)).
data_vectors = [
    (row_id, np.random.random(300))
    for row_id in range(1000000)
]
# One row per pair; the whole vector is stored as a single cell of 'vector'.
df = pd.DataFrame(data_vectors, columns=['idx', 'vector'])
df  # bare expression; presumably kept so a notebook cell displays the frame
# Convert to Parquet format in batches; converting everything in one go blows up (runs out of memory)
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm.notebook import tqdm
def df2parquet(df, batch_size=100000):
    """Write ``df`` to disk as a series of Parquet files, one per row batch.

    Rows are sliced positionally into consecutive chunks of at most
    ``batch_size`` rows, and chunk ``i`` is written to ``part_{i}.parquet``
    (a relative path, so it lands in the current working directory). An
    empty frame produces no files. Nothing is deleted, so files left over
    from an earlier run with more batches stay on disk.

    Args:
        df: pandas DataFrame to export.
        batch_size: Maximum number of rows per output file; must be positive.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    # Without this guard, 0 fails with an opaque ZeroDivisionError and a
    # negative value yields a negative batch count, so the loop silently
    # writes nothing at all.
    if batch_size <= 0:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    # Ceiling division: a trailing partial batch still gets its own file.
    num_batch = (len(df) + batch_size - 1) // batch_size
    for i in tqdm(range(num_batch)):
        start = i * batch_size
        table = pa.Table.from_pandas(df.iloc[start:start + batch_size])
        pq.write_table(table, f'part_{i}.parquet')
# Export the whole frame as part_<i>.parquet files, 100000 rows per file.
df2parquet(df, batch_size=100000)
# Build the query statement and retrieve the most similar vectors
import chdb
def query(vec):
    """Find the five stored vectors closest to ``vec`` by L2 distance.

    Builds a SQL statement that reads every file matching
    ``part_*.parquet``, casts the ``vector`` column to ``Array(Float64)``,
    computes ``L2Distance`` against ``vec`` and keeps the five smallest
    scores. The statement is executed through ``chdb.query``.

    Args:
        vec: Iterable of numbers (list, tuple, NumPy array, ...) forming the
            query vector. Presumably it must have the same length as the
            stored vectors -- confirm against the data being queried.

    Returns:
        Whatever ``chdb.query`` returns for the ``'Dataframe'`` output
        format; the selected columns are ``idx`` and ``score``.

    Raises:
        ValueError, TypeError: If a component of ``vec`` cannot be converted
            to ``float``.
    """
    # Render the array literal explicitly from built-in floats. Relying on
    # str(list) uses each element's repr, which for NumPy scalars under
    # NumPy >= 2.0 is "np.float64(...)" -- not valid SQL. Coercing through
    # float() also keeps non-numeric text from being spliced into the query.
    embeddings = "[" + ", ".join(repr(float(x)) for x in vec) + "]"
    sql = """
    SELECT idx,L2Distance(CAST(vector,'Array(Float64)'),{embeddings}) AS score FROM
    file("part_*.parquet", Parquet)
    ORDER BY score ASC LIMIT 5
    """.format(embeddings=embeddings)
    res = chdb.query(sql, 'Dataframe')
    return res
# Draw a random 300-component query vector and run the search.
# tolist() yields built-in floats; list(ndarray) would yield NumPy scalars,
# whose repr under NumPy >= 2.0 is "np.float64(...)" and would corrupt SQL
# text produced by formatting the list into a query string.
q = np.random.random(300).tolist()
query(q)