RAG Systems · Lesson 7 of 24
Vector Stores: FAISS, Chroma, Pinecone, Qdrant, pgvector
What a Vector Store Does
A vector store (vector database) indexes embeddings for fast approximate nearest-neighbor (ANN) search. At query time, you provide a query embedding and retrieve the K most similar document embeddings from potentially millions of stored vectors.
Without a vector store: Brute-force cosine similarity against all documents is O(N×D) — feasible for thousands of documents, unusably slow for millions.
With a vector store: ANN indexes reduce search to roughly O(log N) for HNSW, or O(nprobe × N/n_clusters) for IVF (only the nprobe clusters closest to the query are scanned), trading a small accuracy loss for massive speed gains.
Chroma: Simple Local Development
import chromadb
from chromadb.utils import embedding_functions
# Local persistent client rooted at ./chroma_db
client = chromadb.PersistentClient(path="./chroma_db")
# Use OpenAI embeddings ("your-api-key" is a placeholder to be replaced)
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key="your-api-key",
    model_name="text-embedding-3-small",
)
# Create or get a collection, configured with the embedding function above
collection = client.get_or_create_collection(
    name="clinical_documents",
    embedding_function=openai_ef,
    metadata={"hnsw:space": "cosine"},  # Distance metric
)
# Ingest documents
def ingest_to_chroma(
    documents: list[dict],
    collection,
) -> None:
    """Ingest documents with metadata into Chroma.

    Args:
        documents: Dicts with required ``id``, ``content`` and ``title``
            keys; ``category`` and ``source`` are optional and default
            to ``""``.
        collection: Target collection. Only its
            ``add(ids=..., documents=..., metadatas=...)`` method is used.

    Raises:
        KeyError: If a document lacks ``id``, ``content`` or ``title``.
    """
    # Skip the call for an empty batch: sending empty id/document lists is
    # rejected by Chroma's input validation instead of being a no-op.
    if documents:
        collection.add(
            ids=[doc["id"] for doc in documents],
            documents=[doc["content"] for doc in documents],
            metadatas=[
                {
                    "title": doc["title"],
                    "category": doc.get("category", ""),
                    "source": doc.get("source", ""),
                }
                for doc in documents
            ],
        )
    print(f"Ingested {len(documents)} documents")
# Query
def query_chroma(
    query_text: str,
    collection,
    n_results: int = 5,
    where: dict = None,  # Metadata filter
) -> list[dict]:
    """Return the documents in ``collection`` most similar to ``query_text``.

    Each hit is a dict with ``content``, ``metadata`` and ``score`` keys,
    where ``score`` is ``1 - distance`` as reported by the collection.
    """
    response = collection.query(
        query_texts=[query_text],
        n_results=n_results,
        where=where,
        include=["documents", "metadatas", "distances"],
    )

    # A single query text was sent, so only the first result list matters.
    texts = response["documents"][0]
    metadatas = response["metadatas"][0]
    distances = response["distances"][0]

    hits = []
    for text, metadata, distance in zip(texts, metadatas, distances):
        hits.append(
            {
                "content": text,
                "metadata": metadata,
                "score": 1 - distance,  # Convert distance to similarity
            }
        )
    return hits
# Example with metadata filtering
results = query_chroma(
    query_text="warfarin drug interactions",
    collection=collection,
    where={"category": "drug_interaction"},  # Only search interaction docs
)

# Pinecone: Managed Cloud Vector Database
from pinecone import Pinecone, ServerlessSpec
# Module-level Pinecone client used by create_pinecone_index and
# get_pinecone_index (the API key shown is a placeholder)
pc = Pinecone(api_key="your-pinecone-api-key")
# Create index (do once)
def create_pinecone_index(
    index_name: str = "clinical-rag",
    dimension: int = 1536,  # text-embedding-3-small
) -> None:
    """Create a serverless Pinecone index unless the name is already taken.

    The index uses the cosine metric and is placed on AWS in ``us-east-1``.
    Nothing is created or printed when ``index_name`` already exists.
    """
    existing_names = [idx.name for idx in pc.list_indexes()]
    if index_name in existing_names:
        return

    serverless = ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    )
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=serverless,
    )
    print(f"Created index: {index_name}")
def get_pinecone_index(index_name: str = "clinical-rag"):
    """Return the handle that the shared ``pc`` client gives for ``index_name``."""
    index_handle = pc.Index(index_name)
    return index_handle
# Upsert documents
def upsert_to_pinecone(
    documents: list[dict],
    index,
    embeddings: list[list[float]],
    batch_size: int = 100,
) -> None:
    """Upsert document embeddings to Pinecone in batches.

    Args:
        documents: Dicts with required ``id``, ``content`` and ``title``
            keys; ``category`` is optional and defaults to ``""``.
        index: Object exposing ``upsert(vectors=...)``.
        embeddings: One vector per document, in the same order.
        batch_size: Maximum number of vectors per ``upsert`` call.

    Raises:
        ValueError: If ``documents`` and ``embeddings`` differ in length,
            or if ``batch_size`` is smaller than 1.
        KeyError: If a document lacks ``id``, ``content`` or ``title``.
    """
    # zip() would silently drop the surplus items of the longer list, so a
    # mismatch has to be reported before anything is sent.
    if len(documents) != len(embeddings):
        raise ValueError(
            f"Got {len(documents)} documents but {len(embeddings)} embeddings"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    vectors = [
        {
            "id": doc["id"],
            "values": emb,
            "metadata": {
                "content": doc["content"][:1000],  # Pinecone metadata limit
                "title": doc["title"],
                "category": doc.get("category", ""),
            },
        }
        for doc, emb in zip(documents, embeddings)
    ]

    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])
    print(f"Upserted {len(vectors)} vectors")
# Query Pinecone
def query_pinecone(
    query_embedding: list[float],
    index,
    top_k: int = 5,
    filter: dict = None,
) -> list[dict]:
    """Query Pinecone for similar vectors.

    Args:
        query_embedding: Embedding of the query.
        index: Object exposing ``query(vector=..., top_k=..., filter=...,
            include_metadata=...)`` whose result has a ``matches`` attribute.
        top_k: Number of matches to request.
        filter: Optional metadata filter, passed through unchanged.

    Returns:
        One dict per match with ``id``, ``score``, ``content`` (``""`` when
        the match has no stored content) and ``metadata`` keys.
    """
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        filter=filter,  # Metadata filtering
        include_metadata=True,
    )

    hits = []
    for match in results.matches:
        # A match may carry no metadata (e.g. a vector upserted without
        # any); use an empty dict so the content lookup cannot fail.
        metadata = getattr(match, "metadata", None) or {}
        hits.append(
            {
                "id": match.id,
                "score": match.score,
                "content": metadata.get("content", ""),
                "metadata": metadata,
            }
        )
    return hits


# Qdrant: Self-Hosted with Rich Filtering
from qdrant_client import QdrantClient
from qdrant_client.models import (
Distance,
VectorParams,
PointStruct,
Filter,
FieldCondition,
MatchValue,
)
# Initialize client (local or remote); this module-level instance is the one
# the Qdrant helper functions in this section call
qdrant_client = QdrantClient(url="http://localhost:6333")
# For cloud: QdrantClient(url="https://your-cluster.qdrant.io", api_key="your-key")
def create_qdrant_collection(
    collection_name: str,
    vector_size: int = 1536,
) -> None:
    """Create a cosine-distance Qdrant collection if it does not exist yet.

    Args:
        collection_name: Name of the collection to create.
        vector_size: Dimension of the stored vectors; must match the
            embedding model.
    """
    # Re-running setup should be a no-op rather than fail because the
    # collection is already there.
    if qdrant_client.collection_exists(collection_name):
        return

    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=vector_size,
            distance=Distance.COSINE,
        ),
    )
def upsert_to_qdrant(
    collection_name: str,
    documents: list[dict],
    embeddings: list[list[float]],
) -> None:
    """Upsert points to Qdrant.

    Args:
        collection_name: Target collection.
        documents: Dicts with required ``content`` and ``title`` keys;
            ``category`` and ``source`` default to ``""``. An ``id`` key,
            when present, determines the point id.
        embeddings: One vector per document, in the same order.

    Raises:
        ValueError: If ``documents`` and ``embeddings`` differ in length.
        KeyError: If a document lacks ``content`` or ``title``.
    """
    import uuid  # local import keeps the new dependency inside this block

    # zip() would silently drop the surplus items of the longer list.
    if len(documents) != len(embeddings):
        raise ValueError(
            f"Got {len(documents)} documents but {len(embeddings)} embeddings"
        )

    points = []
    for position, (doc, emb) in enumerate(zip(documents, embeddings)):
        if "id" in doc:
            # Positional ids restart at 0 on every call, so a second batch
            # would overwrite the first. A UUID derived from the document id
            # is stable across calls and is a point-id form Qdrant accepts.
            point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, str(doc["id"])))
        else:
            # No document id available: keep the positional id.
            point_id = position
        points.append(
            PointStruct(
                id=point_id,
                vector=emb,
                payload={
                    "content": doc["content"],
                    "title": doc["title"],
                    "category": doc.get("category", ""),
                    "source": doc.get("source", ""),
                },
            )
        )

    qdrant_client.upsert(collection_name=collection_name, points=points)
    print(f"Upserted {len(points)} points to Qdrant")
def query_qdrant(
    collection_name: str,
    query_embedding: list[float],
    top_k: int = 5,
    category_filter: str = None,
) -> list[dict]:
    """Search a Qdrant collection, optionally restricted to one category.

    Returns one dict per hit with ``score``, ``content``, ``title`` and
    ``metadata`` (the full payload) keys.
    """
    # A missing or empty category means the search is unfiltered.
    category_only = (
        Filter(
            must=[
                FieldCondition(
                    key="category",
                    match=MatchValue(value=category_filter),
                )
            ]
        )
        if category_filter
        else None
    )

    hits = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k,
        query_filter=category_only,
        with_payload=True,
    )

    formatted = []
    for hit in hits:
        formatted.append(
            {
                "score": hit.score,
                "content": hit.payload.get("content", ""),
                "title": hit.payload.get("title", ""),
                "metadata": hit.payload,
            }
        )
    return formatted


# pgvector: Vector Search in PostgreSQL
For applications already using PostgreSQL, pgvector adds vector similarity search without a separate service:
import psycopg2
import numpy as np
def setup_pgvector(connection_string: str, dimension: int = 1536) -> None:
    """Set up the pgvector extension, the documents table and its HNSW index.

    Every statement uses ``IF NOT EXISTS``, so the function can be re-run.

    Args:
        connection_string: Connection string handed to ``psycopg2.connect``.
        dimension: Size of the ``embedding`` column; must match the
            embedding model. Only takes effect when the table is created.

    Raises:
        ValueError: If ``dimension`` is not a positive integer.
    """
    # The dimension is interpolated into DDL (it cannot be a bound
    # parameter), so only a plain positive int is accepted.
    if not isinstance(dimension, int) or dimension < 1:
        raise ValueError(f"dimension must be a positive integer, got {dimension!r}")

    conn = psycopg2.connect(connection_string)
    try:
        cur = conn.cursor()
        try:
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            cur.execute(f"""
                CREATE TABLE IF NOT EXISTS documents (
                    id SERIAL PRIMARY KEY,
                    content TEXT NOT NULL,
                    title TEXT,
                    category TEXT,
                    source TEXT,
                    embedding vector({dimension:d}), -- Match your embedding model dimensions
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
            """)
            # Create HNSW index for fast similarity search
            cur.execute("""
                CREATE INDEX IF NOT EXISTS documents_embedding_idx
                ON documents USING hnsw (embedding vector_cosine_ops)
                WITH (m = 16, ef_construction = 64);
            """)
            conn.commit()
        finally:
            # Release the cursor even when a statement fails.
            cur.close()
    finally:
        # Always close the connection so a failed setup does not leak it.
        conn.close()
def upsert_document_pg(
    conn,
    content: str,
    title: str,
    embedding: list[float],
    category: str = None,
    source: str = None,
) -> int:
    """Insert a document with embedding into PostgreSQL.

    The insert is committed on success. On any error the transaction is
    rolled back and the exception is re-raised, so ``conn`` stays usable.

    Args:
        conn: Open DB-API connection (``cursor``, ``commit``, ``rollback``).
        content: Document text.
        title: Document title.
        embedding: Vector stored in the ``embedding`` column.
        category: Optional category.
        source: Optional source.

    Returns:
        The ``id`` of the newly inserted row.
    """
    cur = conn.cursor()
    try:
        cur.execute(
            """
            INSERT INTO documents (content, title, embedding, category, source)
            VALUES (%s, %s, %s, %s, %s)
            RETURNING id;
            """,
            (content, title, embedding, category, source),
        )
        doc_id = cur.fetchone()[0]
        conn.commit()
    except Exception:
        # Without a rollback the connection would stay in an aborted
        # transaction and reject every later statement.
        conn.rollback()
        raise
    finally:
        cur.close()
    return doc_id
def search_similar_pg(
    conn,
    query_embedding: list[float],
    top_k: int = 5,
    category_filter: str = None,
) -> list[dict]:
    """Find similar documents using cosine similarity.

    Args:
        conn: Open DB-API connection.
        query_embedding: Embedding of the query.
        top_k: Maximum number of rows to return.
        category_filter: When non-empty, only rows with this category
            are considered.

    Returns:
        Dicts with ``id``, ``content``, ``title``, ``category`` and
        ``similarity`` keys, most similar first.
    """
    # The WHERE fragment is a fixed literal; every value is a bound parameter.
    where_clause = ""
    params = [query_embedding]
    if category_filter:
        where_clause = "WHERE category = %s"
        params.append(category_filter)
    params.extend([query_embedding, top_k])

    # Use pgvector's <=> operator for cosine distance. Ordering by the
    # distance expression itself (ascending) is what lets the HNSW index
    # serve the query; ordering by the derived similarity alias does not.
    # Ascending order also puts rows with a NULL embedding last.
    sql = f"""
        SELECT id, content, title, category,
               1 - (embedding <=> %s::vector) AS similarity
        FROM documents
        {where_clause}
        ORDER BY embedding <=> %s::vector
        LIMIT %s;
    """

    cur = conn.cursor()
    try:
        cur.execute(sql, tuple(params))
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()

    return [
        {
            "id": row[0],
            "content": row[1],
            "title": row[2],
            "category": row[3],
            "similarity": float(row[4]),
        }
        for row in rows
    ]


# Choosing a Vector Store
| Factor | Chroma | Pinecone | Qdrant | pgvector |
|---|---|---|---|---|
| Deployment | Local/self-hosted | Managed cloud | Self-hosted/cloud | PostgreSQL extension |
| Setup complexity | Low | Very low | Medium | Low (if using PG) |
| Scale | Under 1M vectors | 100M+ | 100M+ | Millions (hardware-dependent) |
| Filtering | Basic | Good | Excellent | Full SQL |
| Cost | Free | Per-vector | Infrastructure | PostgreSQL cost |
| Maintenance | None (local) | None (managed) | You manage | PostgreSQL ops |
| Privacy | Full (local) | Data leaves infra | Full (self-hosted) | Full (if self-hosted) |
Recommendation by scenario:
- Prototype/dev: Chroma (zero setup, runs in-process)
- Production, no infra team: Pinecone (fully managed, scales automatically)
- Production, need full control: Qdrant (feature-rich, excellent filtering)
- Already using PostgreSQL: pgvector (no extra service, ACID transactions, full SQL filters)
- Healthcare with PHI: Qdrant or pgvector self-hosted (data stays on-premises)