使用向量数据库pinecone构建应用05：人脸相似度查询Facial Similarity Search

news2025/7/15 15:41:41

Building Applications with Vector Databases

下面是这门课的学习笔记：https://www.deeplearning.ai/short-courses/building-applications-vector-databases/

Learn to create six exciting applications of vector databases and implement them using Pinecone.

Build a hybrid search app that combines both text and images for improved multimodal search results.

Learn how to build an app that measures and ranks facial similarity.

文章目录

Building Applications with Vector Databases
Lesson 5 - Facial Similarity Search
  - Import the Needed Packages
  - Load the Dataset
  - Setup Pinecone
  - Create Embeddings Using DeepFace
  - Plot the Data of Images
  - Store the Embeddings in Pinecone
  - Calculate the Similarity Scores
  - Check the Matching Images

Lesson 5 - Facial Similarity Search

人脸相似度查询：看看孩子更像母亲还是父亲

在这里插入图片描述

Import the Needed Packages

import warnings
warnings.filterwarnings('ignore')

from deepface import DeepFace
from pinecone import Pinecone, ServerlessSpec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from tqdm import tqdm
from DLAIUtils import Utils


import contextlib
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import time

# Read the Pinecone API key through the Utils helper imported from DLAIUtils;
# the key is used further down to construct the Pinecone client.
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

Load the Dataset

def show_img(f):
  """Load the image file at path ``f`` and draw it on a new 4x3 figure."""
  pixels = plt.imread(f)
  # Open a fresh figure so that each call gets its own canvas.
  plt.figure(figsize=(4, 3))
  plt.imshow(pixels)

show_img('family/dad/P06260_face5.jpg')

Output

在这里插入图片描述

Setup Pinecone

# Model name handed to DeepFace.represent whenever an embedding is computed.
MODEL = "Facenet"
# Index name for this lesson, produced by the Utils helper from the 'dl-ai'
# argument (presumably used as a prefix — confirm in DLAIUtils).
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

# Pinecone client used below to list, delete, create and open indexes.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

Create Embeddings Using DeepFace

def generate_vectors():
  """Embed every file under family/{mom,dad,child}/ and write the results to ./vectors.vec.

  One line is written per image, in the form ``person:file_name:embedding``,
  where ``embedding`` is the 'embedding' value of the first entry returned by
  DeepFace.represent for that image. Images for which DeepFace raises
  ValueError, UnboundLocalError or AttributeError are skipped after printing
  the error.
  """
  out_path = "./vectors.vec"

  # Remove output left over from an earlier run; a missing file is fine.
  with contextlib.suppress(FileNotFoundError):
    os.remove(out_path)

  with open(out_path, "w") as out:
    for person in ["mom", "dad", "child"]:
      image_paths = glob.glob(f'family/{person}/*')
      for image_path in tqdm(image_paths):
        try:
          represented = DeepFace.represent(img_path=image_path, model_name=MODEL, enforce_detection=False)
          embedding = represented[0]['embedding']
          file_name = os.path.basename(image_path)
          out.write(f'{person}:{file_name}:{embedding}\n')
        except (ValueError, UnboundLocalError, AttributeError) as err:
          print(err)

generate_vectors()

!head -10 vectors.vec

Plot the Data of Images

绘图分为如下两步：先用 PCA 将 128 维的嵌入向量降到 8 维，再用 t-SNE 降到 2 维并画出散点图。
在这里插入图片描述
PCA介绍

在这里插入图片描述

t-SNE plot介绍
在这里插入图片描述

def gen_tsne_df(person, perplexity):
    """Project one person's face embeddings to 2-D with PCA followed by t-SNE.

    Reads ./vectors.vec (lines of the form ``person:file_name:embedding``),
    keeps the embeddings whose label equals ``person``, reduces them to 8
    components with PCA and then to 2 dimensions with t-SNE.

    Args:
        person: label to select from the vector file (e.g. 'dad').
        perplexity: perplexity value passed to TSNE.

    Returns:
        A pandas DataFrame with columns 'x' and 'y', one row per embedding.

    Raises:
        ValueError: if a line does not contain the two ':' separators, or if
            its vector field is not a valid Python literal.
    """
    # Local import keeps this block self-contained. literal_eval only accepts
    # Python literals, so a tampered vectors.vec cannot execute code the way
    # eval() would.
    import ast

    vectors = []
    with open('./vectors.vec', 'r') as f:
        for line in tqdm(f):
            # Split the label off the left and the vector off the right so a
            # ':' inside the file name cannot break the parse.
            p, rest = line.split(':', 1)
            _orig_img, v = rest.rsplit(':', 1)
            if person == p:
                vectors.append(ast.literal_eval(v.strip()))
    pca = PCA(n_components=8)
    tsne = TSNE(2, perplexity=perplexity, random_state=0, n_iter=1000,
                verbose=0, metric='euclidean', learning_rate=75)
    print(f'transform {len(vectors)} vectors')
    pca_transform = pca.fit_transform(vectors)
    embeddings2d = tsne.fit_transform(pca_transform)
    return pd.DataFrame({'x': embeddings2d[:, 0], 'y': embeddings2d[:, 1]})

def plot_tsne(perplexity, model):
    """Draw a single scatter plot with one colour-coded t-SNE point cloud per person.

    Args:
        perplexity: forwarded to gen_tsne_df and shown in the subtitle.
        model: name shown in the plot title only.
    """
    palette = {'dad':'#ee8933', 'child':'#4fad5b', 'mom':'#4c93db'}
    spine_colors = (
        ('top', None),
        ('right', None),
        ('left', '#2B2F30'),
        ('bottom', '#2B2F30'),
    )

    _fig, axes = plt.subplots(figsize=(8, 5))
    plt.grid(color='#EAEAEB', linewidth=0.5)
    # Top/right spines get colour None, left/bottom a dark grey.
    for side, shade in spine_colors:
        axes.spines[side].set_color(shade)

    # One scatter series per person; each gets its own t-SNE projection.
    for who, shade in palette.items():
        points = gen_tsne_df(who, perplexity)
        axes.scatter(points.x, points.y, alpha=.5, label=who, color=shade)

    plt.title(f'Scatter plot of faces using {model}', fontsize=16, fontweight='bold', pad=20)
    plt.suptitle(f't-SNE [perplexity={perplexity}]', y=0.92, fontsize=13)
    plt.legend(loc='best', frameon=True)
    plt.show()

plot_tsne(44, 'facenet')

Output

在这里插入图片描述

Store the Embeddings in Pinecone

# Start from a clean index: delete one with the same name if it already exists.
if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
  pinecone.delete_index(INDEX_NAME)
# dimension must equal the length of the vectors upserted below (128 here,
# presumably the Facenet embedding size — confirm); similarity is cosine.
pinecone.create_index(name=INDEX_NAME, dimension=128, metric='cosine',
  spec=ServerlessSpec(cloud='aws', region='us-west-2'))

# Handle to the index, used by the upsert and query calls that follow.
index = pinecone.Index(INDEX_NAME)

def store_vectors(batch_size=100):
  """Upsert every embedding from vectors.vec into the Pinecone index.

  Each line of the file has the form ``person:file_name:embedding``. A record
  is stored with id ``{person}-{file_name}``, the embedding as its values and
  ``{"person": ..., "file": ...}`` as metadata.

  Args:
    batch_size: number of records sent per upsert call. Batching replaces
      one network round trip per vector with one per batch.

  Raises:
    ValueError: if a line does not contain the two ':' separators, or if its
      vector field is not a valid Python literal.
  """
  # Local import keeps this block self-contained. literal_eval only accepts
  # Python literals, so the file contents are never executed as code.
  import ast

  batch = []
  with open("vectors.vec", "r") as f:
    for line in tqdm(f):
      # Label from the left, vector from the right: a ':' inside the file
      # name therefore stays part of the file name.
      person, rest = line.split(':', 1)
      file, vec = rest.rsplit(':', 1)
      batch.append((f'{person}-{file}', ast.literal_eval(vec.strip()), {"person":person, "file":file}))
      if len(batch) >= batch_size:
        index.upsert(batch)
        batch = []
  # Flush the final, partially filled batch.
  if batch:
    index.upsert(batch)
store_vectors()

index.describe_index_stats()

Output

{'dimension': 128,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 319}},
 'total_vector_count': 319}

Calculate the Similarity Scores

def test(vec_groups, parent, child, k=10, sample_size=10):
  """Print the average similarity score between a parent's photos and the child's.

  For each of the first ``sample_size`` vectors of ``parent``, the index is
  queried for the ``k`` nearest vectors whose "person" metadata equals
  ``child``; the scores of all returned matches are averaged and printed.

  Args:
    vec_groups: dict mapping a person label to a list of embedding vectors.
    parent: key of ``vec_groups`` whose vectors are used as queries.
    child: "person" metadata value the matches are restricted to.
    k: number of matches requested per query.
    sample_size: maximum number of parent vectors to query with.
  """
  index = pinecone.Index(INDEX_NAME)
  # Slicing tolerates a parent with fewer than sample_size vectors, where
  # indexing by position would raise IndexError.
  samples = vec_groups[parent][:sample_size]
  total = 0
  match_count = 0
  for query_vec in tqdm(samples):
    query_response = index.query(
      top_k=k,
      vector = query_vec,
      filter={
        "person": {"$eq": child}
      }
    )
    for row in query_response["matches"]:
      total = total + row["score"]
      match_count += 1
  # Divide by the number of matches actually returned: a query may yield fewer
  # than k results, and a fixed divisor would then understate the average.
  average = total / match_count if match_count else 0.0
  print(f'\n\n{parent} AVG: {average}')

def compute_scores():
  """Load all embeddings from vectors.vec and print dad-vs-child and mom-vs-child averages.

  The file is parsed into one list of vectors per person ("dad", "mom",
  "child"); the lists are then handed to ``test`` once with "dad" and once
  with "mom" as the parent, both against "child".

  Raises:
    KeyError: if a line carries a person label other than dad, mom or child.
    ValueError: if a line does not contain the two ':' separators, or if its
      vector field is not a valid Python literal.
  """
  # Local import keeps this block self-contained. literal_eval only accepts
  # Python literals, so the file contents are never executed as code.
  import ast

  vec_groups = {"dad":[], "mom":[], "child":[]}
  with open("vectors.vec", "r") as f:
    for line in tqdm(f):
      # Label from the left, vector from the right, so a ':' inside the file
      # name cannot break the parse.
      person, rest = line.split(':', 1)
      _file, vec = rest.rsplit(':', 1)
      vec_groups[person].append(ast.literal_eval(vec.strip()))
  print(f"DAD {'-' * 20}")
  test(vec_groups, "dad", "child")
  print(f"MOM {'-' * 20}")
  test(vec_groups, "mom", "child")

compute_scores()

Output

dad AVG: 0.41020248437000006
mom AVG: 0.3494142116000002

Check the Matching Images

孩子的照片

# Path of the child photo that serves as the query image in the steps below.
child_base = 'family/child/P06310_face1.jpg'
show_img(child_base)

Output

在这里插入图片描述

# Now find the closest photos, given that dad came out as the "most similar" parent.
# The child's photo is embedded with the same MODEL that produced the stored vectors;
# [0]['embedding'] takes the embedding of the first entry DeepFace returns.
embedding = DeepFace.represent(img_path=child_base, model_name=MODEL)[0]['embedding']
print(embedding)

在dad文件夹中查找结果

# Request the 3 nearest stored vectors to the child's embedding, restricted to
# records whose "person" metadata equals "dad". include_metadata=True makes
# each match carry its metadata, which is read later to get the file name.
query_response = index.query(
      top_k=3,
      vector = embedding,
      filter={
        "person": {"$eq": "dad"}
      },
      include_metadata=True
)

在这里，filter 参数中的 “$eq” 表示等于运算符（equality operator）。它用于对索引中的特定字段进行过滤，只返回那些字段值等于给定值的条目。

在这个例子中，filter 参数的目的是对索引中的 “person” 字段进行过滤，只返回 “person” 字段值等于 “dad” 的条目。这样可以限制搜索结果，只返回符合指定条件的结果。

print(query_response)

在这里，vector 参数用于指定查询时所使用的向量。具体来说，它表示查询向量，即用于与索引中存储的向量进行相似度比较的向量。

当执行查询时，系统会使用 vector 参数中指定的向量与索引中存储的向量进行相似度计算，从而找到与查询向量最相似的向量，并返回相应的搜索结果。

在这个例子中，vector 参数就是上面用 DeepFace（Facenet 模型）为孩子照片生成的嵌入向量 embedding。

Output（注意：下面的输出包含 10 条匹配，是 top_k=10 时的结果；按上面代码中的 top_k=3 运行时只会返回得分最高的前 3 条）

{'matches': [{'id': 'dad-P06396_face3.jpg',
              'metadata': {'file': 'P06396_face3.jpg', 'person': 'dad'},
              'score': 0.438557684,
              'values': []},
             {'id': 'dad-P11886_face3.jpg',
              'metadata': {'file': 'P11886_face3.jpg', 'person': 'dad'},
              'score': 0.419384569,
              'values': []},
             {'id': 'dad-P04408_face0.jpg',
              'metadata': {'file': 'P04408_face0.jpg', 'person': 'dad'},
              'score': 0.407050818,
              'values': []},
             {'id': 'dad-P11991_face7.jpg',
              'metadata': {'file': 'P11991_face7.jpg', 'person': 'dad'},
              'score': 0.368852,
              'values': []},
             {'id': 'dad-P06265_face2.jpg',
              'metadata': {'file': 'P06265_face2.jpg', 'person': 'dad'},
              'score': 0.36260435,
              'values': []},
             {'id': 'dad-P04407_face3.jpg',
              'metadata': {'file': 'P04407_face3.jpg', 'person': 'dad'},
              'score': 0.357501268,
              'values': []},
             {'id': 'dad-P11995_face2.jpg',
              'metadata': {'file': 'P11995_face2.jpg', 'person': 'dad'},
              'score': 0.336032152,
              'values': []},
             {'id': 'dad-P11975_face0.jpg',
              'metadata': {'file': 'P11975_face0.jpg', 'person': 'dad'},
              'score': 0.317324311,
              'values': []},
             {'id': 'dad-P11984_face2.jpg',
              'metadata': {'file': 'P11984_face2.jpg', 'person': 'dad'},
              'score': 0.310575306,
              'values': []},
             {'id': 'dad-P06260_face5.jpg',
              'metadata': {'file': 'P06260_face5.jpg', 'person': 'dad'},
              'score': 0.30459854,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

取出得分最高的匹配，显示与孩子最相似的那张父亲照片：

# Take the file name from the first match's metadata and display that photo
# from the dad folder.
photo = query_response['matches'][0]['metadata']['file']
show_img(f'family/dad/{photo}')

Output

在这里插入图片描述

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.coloradmin.cn/o/1465556.html

如若内容造成侵权/违法违规/事实不符，请联系多彩编程网进行投诉反馈，一经查实，立即删除！