Naive RAG: The Basic Pipeline
Build the foundational RAG pipeline: chunk documents, embed, store, retrieve top-k, and generate. Understand its real limitations before optimizing.
Naive RAG: The Basic Pipeline
Naive RAG is the straight-line implementation of the retrieval-augmented generation idea: split documents into chunks, embed them, store the vectors, retrieve the top-k most similar chunks at query time, and hand them to an LLM. No query rewriting. No reranking. No hybrid search. Just the core loop.
Understanding naive RAG deeply — including where it fails — gives you the foundation you need to apply every advanced technique with clarity.
The Five-Step Pipeline
Documents ──► Chunk ──► Embed ──► Store ──► (at query time) Retrieve ──► Generate
Each step has design decisions that dramatically affect quality.
Step 1: Load Documents
# loaders.py
from pathlib import Path
def load_text_files(directory: str) -> list[dict]:
    """Load every ``.txt`` and ``.md`` file found under *directory*.

    The tree is walked recursively. Only regular files are read, so a
    directory whose name happens to end in ``.txt`` or ``.md`` is skipped
    instead of crashing the load. Extensions are matched case-insensitively,
    and results come back in sorted path order so repeated runs see the
    documents in the same sequence.

    Args:
        directory: Root directory to search.

    Returns:
        One dict per file with the keys ``"source"`` (the path as a string)
        and ``"text"`` (the file contents decoded as UTF-8).

    Raises:
        UnicodeDecodeError: If a matching file is not valid UTF-8.
    """
    wanted_suffixes = {".txt", ".md"}
    docs = []
    # rglob yields entries in filesystem order; sorting makes the output
    # deterministic.
    for path in sorted(Path(directory).rglob("*")):
        if path.suffix.lower() in wanted_suffixes and path.is_file():
            docs.append({
                "source": str(path),
                "text": path.read_text(encoding="utf-8"),
            })
    return docs
# Example with LangChain loaders
from langchain_community.document_loaders import (
DirectoryLoader,
PyPDFLoader,
TextLoader,
)
def load_with_langchain(directory: str):
    """Load every PDF under *directory* through LangChain's ``DirectoryLoader``.

    Files are selected with the ``**/*.pdf`` glob and handed to
    ``PyPDFLoader``; the result of ``loader.load()`` is returned unchanged.
    """
    pdf_loader = DirectoryLoader(directory, glob="**/*.pdf", loader_cls=PyPDFLoader)
    # Each Document has .page_content and .metadata
    return pdf_loader.load()


# Step 2: Chunk Documents
# chunker.py
from langchain_text_splitters import RecursiveCharacterTextSplitter
def chunk_documents(documents, chunk_size=512, chunk_overlap=64):
    """Split *documents* into overlapping chunks and print the resulting counts.

    ``chunk_size`` and ``chunk_overlap`` are measured with ``len`` (the
    splitter is configured with ``length_function=len``), so they count
    characters of text here rather than model tokens.

    Args:
        documents: Documents accepted by the splitter's ``split_documents``.
        chunk_size: Maximum chunk length passed to the splitter.
        chunk_overlap: Overlap between neighbouring chunks passed to the splitter.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    pieces = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks")
    return pieces


# Chunk size is one of the highest-leverage decisions in RAG. Too large: you
# retrieve irrelevant text that drowns the signal. Too small: you lose context
# the model needs to answer correctly. The sweet spot for most use cases is
# 256–512 tokens with 10–15% overlap.
Step 3: Embed Chunks
# embedder.py
import os
import time
from openai import OpenAI
# Shared OpenAI client used by the embedding and generation helpers.
# Indexing os.environ directly raises KeyError as soon as this line runs
# when OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def embed_batch(texts: list[str], model="text-embedding-3-small") -> list[list[float]]:
    """Embed a batch of texts, backing off exponentially on rate limits.

    Only ``openai.RateLimitError`` is retried. Any other failure (bad API
    key, invalid request, ...) propagates immediately instead of being
    retried for ~15 seconds under a misleading "Rate limit hit" message.

    Args:
        texts: Strings to embed. An empty list returns ``[]`` without calling
            the API.
        model: Name of the embedding model to use.

    Returns:
        One embedding vector per input text, taken from ``response.data`` in
        the order the API returned them.

    Raises:
        openai.RateLimitError: If the request is still rate limited on the
            fifth attempt.
    """
    if not texts:
        # Nothing to embed; skip the network round trip entirely.
        return []

    # Imported here so the block stays self-contained; the file already
    # depends on the openai package.
    from openai import RateLimitError

    max_attempts = 5
    for attempt in range(max_attempts):
        try:
            response = client.embeddings.create(model=model, input=texts)
        except RateLimitError:
            if attempt == max_attempts - 1:
                raise
            wait = 2 ** attempt  # 1s, 2s, 4s, 8s
            print(f"Rate limit hit, waiting {wait}s...")
            time.sleep(wait)
        else:
            return [e.embedding for e in response.data]
def embed_all_chunks(chunks, batch_size=100):
    """Embed all chunks in batches to avoid API limits.

    Args:
        chunks: Objects exposing a ``page_content`` attribute.
        batch_size: Number of texts sent per ``embed_batch`` call. Must be at
            least 1.

    Returns:
        A flat list of embeddings in the same order as *chunks*.

    Raises:
        ValueError: If *batch_size* is smaller than 1. Without this check a
            negative value would make the loop a no-op and silently return
            an empty list.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    texts = [c.page_content for c in chunks]
    total = len(texts)
    all_embeddings = []
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        all_embeddings.extend(embed_batch(texts[start:end]))
        print(f"Embedded {end}/{total} chunks")
    return all_embeddings


# Step 4: Store in a Vector Database
# store.py — using Qdrant (local in-memory mode for development)
from qdrant_client import QdrantClient
from qdrant_client.models import (
Distance,
VectorParams,
PointStruct,
)
import uuid
# Name of the Qdrant collection used by the create, upsert and search helpers.
COLLECTION = "documents"
# Vector length configured for the collection; it has to agree with the
# dimension of the embeddings that get upserted.
VECTOR_SIZE = 1536 # text-embedding-3-small dimension
def create_store() -> QdrantClient:
    """Create an in-memory Qdrant client holding an empty ``COLLECTION``.

    The collection is configured for ``VECTOR_SIZE``-dimensional vectors
    compared by cosine distance.

    Returns:
        The client, ready for upserts and searches against ``COLLECTION``.
    """
    # use url="http://localhost:6333" for persistent
    qdrant = QdrantClient(":memory:")
    vector_params = VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
    qdrant.create_collection(collection_name=COLLECTION, vectors_config=vector_params)
    return qdrant
def upsert_chunks(store: QdrantClient, chunks, embeddings):
    """Write one point per (chunk, embedding) pair into ``COLLECTION``.

    Each point gets a fresh random UUID as its id and a payload holding the
    chunk text plus its ``source`` and ``page`` metadata (defaulting to
    ``"unknown"`` and ``0`` when absent).

    Args:
        store: Qdrant client that owns the collection.
        chunks: Objects exposing ``page_content`` and a ``metadata`` dict.
        embeddings: Vectors aligned index-for-index with *chunks*.

    Raises:
        ValueError: If *chunks* and *embeddings* differ in length. ``zip``
            would otherwise drop the unmatched tail without any warning and
            leave part of the corpus unindexed.
    """
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"chunks and embeddings must have the same length "
            f"(got {len(chunks)} chunks and {len(embeddings)} embeddings)"
        )

    points = [
        PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            payload={
                "text": chunk.page_content,
                "source": chunk.metadata.get("source", "unknown"),
                "page": chunk.metadata.get("page", 0),
            },
        )
        for chunk, embedding in zip(chunks, embeddings)
    ]
    store.upsert(collection_name=COLLECTION, points=points)
    print(f"Upserted {len(points)} vectors")


# Step 5: Retrieve
# retriever.py
def retrieve(
    store: QdrantClient,
    query: str,
    top_k: int = 4,
    embed_fn=None,
) -> list[dict]:
    """Embed *query* and return the ``top_k`` closest chunks from ``COLLECTION``.

    Args:
        store: Qdrant client to search.
        query: Natural-language query text.
        top_k: Maximum number of hits to request.
        embed_fn: Callable that takes a list of strings and returns a list of
            vectors (for example ``embed_batch``). Required despite the
            ``None`` default.

    Returns:
        One dict per hit with the keys ``"text"``, ``"source"`` and
        ``"score"``, in the order the store returned them.

    Raises:
        TypeError: If *embed_fn* is not supplied. The original failed with an
            opaque "'NoneType' object is not callable"; the exception type is
            unchanged but the message now names the missing argument.
    """
    if embed_fn is None:
        raise TypeError(
            "retrieve() requires embed_fn: a callable mapping list[str] to a list of vectors"
        )

    # embed_fn works on batches, so wrap the query and unwrap the single result.
    query_vector = embed_fn([query])[0]
    results = store.search(
        collection_name=COLLECTION,
        query_vector=query_vector,
        limit=top_k,
        with_payload=True,
    )
    return [
        {
            "text": r.payload["text"],
            "source": r.payload["source"],
            "score": r.score,
        }
        for r in results
    ]


# Step 6: Generate
# generator.py
def generate_answer(question: str, chunks: list[dict]) -> str:
    """Answer *question* with the chat model, grounded in the retrieved *chunks*.

    Args:
        question: The user's question.
        chunks: Retrieved chunks; each dict must provide ``"source"`` and
            ``"text"``.

    Returns:
        The model's answer. When *chunks* is empty there is no context to
        answer from, so the same "not enough information" sentence the system
        prompt prescribes is returned without calling the model. If the
        response carries no text content, an empty string is returned so the
        declared ``str`` return type always holds.
    """
    no_answer = "I don't have enough information to answer this."
    if not chunks:
        return no_answer

    # Sources are numbered from 1 so the model can cite them as [Source N].
    context = "\n\n---\n\n".join(
        f"[Source {i}: {chunk['source']}]\n{chunk['text']}"
        for i, chunk in enumerate(chunks, start=1)
    )
    system_prompt = (
        "You are a helpful assistant that answers questions based on provided context.\n"
        "Rules:\n"
        "- Answer ONLY from the provided context\n"
        f"- If the context doesn't contain the answer, say: \"{no_answer}\"\n"
        "- Be concise but complete\n"
        "- Cite sources using [Source N] notation"
    )
    user_message = f"Context:\n{context}\nQuestion: {question}"

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        temperature=0,
        max_tokens=512,
    )
    content = response.choices[0].message.content
    return content if content is not None else ""


# Putting It All Together
# pipeline.py — full naive RAG pipeline
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
def build_pipeline(data_dir: str):
    """Index every ``.txt`` file under *data_dir* and return the vector store.

    Runs the indexing half of the pipeline in order: load, chunk, embed,
    store. Progress is printed after loading and after chunking.
    """
    # Stage 1: read the raw text files.
    raw_docs = DirectoryLoader(data_dir, glob="**/*.txt", loader_cls=TextLoader).load()
    print(f"Loaded {len(raw_docs)} documents")

    # Stage 2: cut the documents into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
    pieces = text_splitter.split_documents(raw_docs)
    print(f"Created {len(pieces)} chunks")

    # Stage 3: turn each chunk into a vector.
    vectors = embed_all_chunks(pieces)

    # Stage 4: create the collection and write the vectors into it.
    vector_store = create_store()
    upsert_chunks(vector_store, pieces, vectors)
    return vector_store
def query_pipeline(store, question: str) -> str:
    """Retrieve the top four chunks for *question* and generate an answer.

    Each hit's rank, similarity score and source is printed before the
    answer is generated, which makes retrieval quality easy to eyeball.
    """
    # Retrieval step.
    hits = retrieve(store, question, top_k=4, embed_fn=embed_batch)
    for rank, hit in enumerate(hits, start=1):
        print(f" [{rank}] score={hit['score']:.3f} | {hit['source']}")

    # Generation step.
    return generate_answer(question, hits)
# Usage
if __name__ == "__main__":
    # Index the sample corpus once, then run a single question through it.
    store = build_pipeline("./data")
    answer = query_pipeline(store, "What is the refund policy?")
    print("\nAnswer:", answer)


# LangChain One-Liner (for comparison)
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Qdrant
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import DirectoryLoader
# Build: load the .txt files, split them, and index the chunks in an
# in-memory Qdrant collection.
loader = DirectoryLoader("./data", glob="**/*.txt")
docs = loader.load()
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
chunks = text_splitter.split_documents(docs)
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Qdrant.from_documents(
    chunks,
    embedding_model,
    location=":memory:",
    collection_name="documents",
)

# Query: wire the retriever (top 4 chunks) and the chat model into one chain.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)
result = qa.invoke("What is the refund policy?")
print(result["result"])


# Limitations of Naive RAG
1. Semantic Gap
The user asks "How do I cancel my account?" but the relevant document says "Account termination procedure." The embedding similarity may not be high enough to retrieve it. Naive RAG has no mechanism to bridge this gap.
Query vector: [cancel, account, how]
Document chunk: [terminate, account, procedure, steps]
Cosine sim: 0.71 ← may not rank in top-4
2. No Query Rewriting
Short, ambiguous queries return noisy results. "Python speed" could mean Python performance optimization, Python's speed compared to other languages, or something entirely different. Naive RAG retrieves whatever the embedding happens to rank.
3. No Reranking
The embedding model encodes semantics coarsely. The top result by cosine similarity is often not the most relevant result. A cross-encoder reranker that sees both the query and the candidate chunk would rank more accurately but is too slow to run over the full index.
4. Chunk Boundary Problems
If the answer spans two chunks, neither chunk alone may have enough information. Fixed-size chunking ignores document structure, sometimes splitting a sentence mid-thought.
5. No Quality Filter
Naive RAG retrieves the top-k regardless of their actual relevance scores. If the best matching chunk has a similarity of only 0.45, it is probably irrelevant — but the system includes it anyway.
# Simple quality filter — drop chunks below a threshold
def retrieve_with_threshold(store, query, top_k=8, threshold=0.60, embed_fn=None, max_results=4):
    """Retrieve candidates and keep only those scoring at least *threshold*.

    Over-fetches ``top_k`` candidates via ``retrieve``, drops every hit whose
    ``"score"`` is below *threshold*, and returns at most *max_results* of
    the survivors in the order ``retrieve`` produced them.

    Args:
        store: Vector store passed through to ``retrieve``.
        query: Query text.
        top_k: Number of candidates to fetch before filtering.
        threshold: Minimum similarity score a hit must reach to be kept.
        embed_fn: Embedding callable passed through to ``retrieve``.
        max_results: Upper bound on the number of hits returned. Defaults to
            4, the cap that used to be hard-coded.

    Returns:
        The filtered hits. The list is empty when nothing clears the
        threshold; the caller should handle that "no relevant context" case.

    Raises:
        ValueError: If *max_results* is negative (a negative slice bound
            would drop hits from the end instead of capping the list).
    """
    if max_results < 0:
        raise ValueError(f"max_results must be non-negative, got {max_results!r}")

    candidates = retrieve(store, query, top_k=top_k, embed_fn=embed_fn)
    relevant = [c for c in candidates if c["score"] >= threshold]
    # Slicing an empty list already yields [], so no special case is needed.
    return relevant[:max_results]


# Measuring Naive RAG Quality
Before optimizing, establish a baseline:
# eval_baseline.py
import json
def _keyword_overlap(expected: str, answer: str) -> float:
    """Return the fraction of distinct words in *expected* that occur in *answer*."""
    expected_words = set(expected.lower().split())
    if not expected_words:
        # An empty reference answer has nothing to match; avoid dividing by zero.
        return 0.0
    answer_words = set(answer.lower().split())
    return len(expected_words & answer_words) / len(expected_words)


def evaluate_pipeline(store, qa_pairs: list[dict]) -> dict:
    """Run each golden question through retrieval + generation and score it.

    Answer quality is approximated by keyword overlap: the share of distinct
    lower-cased words of the expected answer that also appear in the
    generated one. This is a cheap proxy metric, not precision@k.

    Args:
        store: Vector store to retrieve from.
        qa_pairs: ``[{"question": "...", "expected": "..."}, ...]``

    Returns:
        A dict with ``"avg_answer_overlap"`` and ``"avg_top_retrieval_score"``
        (both rounded to three decimals) and ``"num_evaluated"``. For an
        empty *qa_pairs* both averages are ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    results = []
    for qa in qa_pairs:
        chunks = retrieve(store, qa["question"], top_k=4, embed_fn=embed_batch)
        answer = generate_answer(qa["question"], chunks)
        results.append({
            "question": qa["question"],
            "overlap": _keyword_overlap(qa["expected"], answer),
            "num_chunks": len(chunks),
            # Score of the best hit, or 0 when retrieval found nothing.
            "top_score": chunks[0]["score"] if chunks else 0,
        })

    count = len(results)
    avg_overlap = sum(r["overlap"] for r in results) / count if count else 0.0
    avg_top_score = sum(r["top_score"] for r in results) / count if count else 0.0
    return {
        "avg_answer_overlap": round(avg_overlap, 3),
        "avg_top_retrieval_score": round(avg_top_score, 3),
        "num_evaluated": count,
    }
# Load your golden test set
# JSON files are UTF-8; reading with the platform default encoding can
# garble non-ASCII questions on systems whose default is something else.
with open("golden_qa.json", encoding="utf-8") as f:
    qa_pairs = json.load(f)
# NOTE(review): `store` is expected to be the vector store returned by
# build_pipeline; confirm it is in scope where this snippet runs.
metrics = evaluate_pipeline(store, qa_pairs)
print(json.dumps(metrics, indent=2))


# When Naive RAG Is Good Enough
Naive RAG is not always inadequate. It works well when:
- Documents are well-written, informative prose (not fragmented tables or scanned text)
- Queries are natural language questions that map cleanly to document language
- The domain vocabulary is standard (no specialized jargon that differs from common usage)
- You need a fast prototype to validate the use case before investing in advanced techniques
If your naive RAG gets above 70% of answers right, you have a solid foundation. If it's below 50%, investigate chunking and embedding model choice before anything else — those have the highest leverage.
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.