Naive RAG: The Basic Pipeline

Naive RAG is the straight-line implementation of the retrieval-augmented generation idea: split documents into chunks, embed them, store the vectors, retrieve the top-k most similar chunks at query time, and hand them to an LLM. No query rewriting. No reranking. No hybrid search. Just the core loop.

Understanding naive RAG deeply — including where it fails — gives you the foundation you need to apply every advanced technique with clarity.

The Six-Step Pipeline

Documents ──► Load ──► Chunk ──► Embed ──► Store ──► (at query time) Retrieve ──► Generate

Each step has design decisions that dramatically affect quality.

Step 1: Load Documents

Python

# loaders.py
from pathlib import Path

def load_text_files(directory: str) -> list[dict]:
    """Load every .txt and .md file found under a directory.

    The directory is walked recursively. Suffix matching is case-insensitive
    (``NOTES.TXT`` and ``README.MD`` are loaded too), and only regular files
    are read, so a directory that happens to be named ``archive.md`` is
    skipped instead of raising. Paths are visited in sorted order so the
    result is deterministic across runs.

    Args:
        directory: Root directory to search.

    Returns:
        One ``{"source": <path as str>, "text": <file contents>}`` dict per
        matching file, decoded as UTF-8.

    Raises:
        UnicodeDecodeError: If a matching file is not valid UTF-8.
    """
    wanted_suffixes = {".txt", ".md"}
    return [
        {"source": str(path), "text": path.read_text(encoding="utf-8")}
        for path in sorted(Path(directory).rglob("*"))
        if path.is_file() and path.suffix.lower() in wanted_suffixes
    ]

# Example with LangChain loaders
from langchain_community.document_loaders import (
    DirectoryLoader,
    PyPDFLoader,
    TextLoader,
)

def load_with_langchain(directory: str):
    """Load every PDF under ``directory`` (recursively) with LangChain.

    Returns the list produced by ``DirectoryLoader.load()``; each Document
    exposes ``.page_content`` and ``.metadata``.
    """
    pdf_loader = DirectoryLoader(directory, glob="**/*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

Step 2: Chunk Documents

Python

# chunker.py
from langchain_text_splitters import RecursiveCharacterTextSplitter

def chunk_documents(documents, chunk_size=512, chunk_overlap=64):
    """Split documents into overlapping chunks and print how many resulted.

    Sizes are measured with ``len``, i.e. in characters. The splitter tries
    the separators in order: paragraph break, line break, sentence end,
    space, and finally the empty string.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", ". ", " ", ""],
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks")
    return pieces

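Chunk size is one of the highest-leverage decisions in RAG. Too large: you retrieve irrelevant text that drowns the signal. Too small: you lose context the model needs to answer correctly. The sweet spot for most use cases is 256–512 tokens with 10–15% overlap. Note that the splitter above uses length_function=len, so chunk_size=512 and chunk_overlap=64 are measured in characters, not tokens; pass a token-counting length function (or scale the sizes up) if you want token-based chunks.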

Step 3: Embed Chunks

Python

# embedder.py
import os
import time
from openai import OpenAI

# Module-level OpenAI client used by embed_batch() and generate_answer().
# os.environ[...] raises KeyError at import time if OPENAI_API_KEY is unset.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def embed_batch(
    texts: list[str],
    model="text-embedding-3-small",
    max_attempts: int = 5,
) -> list[list[float]]:
    """Embed a batch of texts, retrying rate-limit errors with backoff.

    Only ``openai.RateLimitError`` is retried. Any other failure (bad API
    key, malformed request, ...) propagates immediately instead of being
    retried and misreported as a rate limit.

    Args:
        texts: Strings to embed. An empty list returns ``[]`` without
            calling the API.
        model: Embedding model name.
        max_attempts: Total number of tries before the rate-limit error is
            re-raised. Waits between tries are 1s, 2s, 4s, ...

    Returns:
        One embedding vector per input text, in input order.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
        openai.RateLimitError: If every attempt was rate limited.
    """
    if not texts:
        return []
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    # Imported lazily so the empty-input fast path needs nothing from openai;
    # the file already depends on the openai package.
    from openai import RateLimitError

    for attempt in range(max_attempts):
        try:
            response = client.embeddings.create(model=model, input=texts)
        except RateLimitError:
            if attempt == max_attempts - 1:
                raise
            wait = 2 ** attempt
            print(f"Rate limit hit, waiting {wait}s...")
            time.sleep(wait)
        else:
            return [e.embedding for e in response.data]

def embed_all_chunks(chunks, batch_size=100):
    """Embed every chunk's ``page_content``, ``batch_size`` texts per request.

    Progress is printed after each batch. The returned list holds one
    embedding per chunk, in the same order as ``chunks``.
    """
    texts = [chunk.page_content for chunk in chunks]
    total = len(texts)
    vectors = []

    for start in range(0, total, batch_size):
        stop = start + batch_size
        vectors += embed_batch(texts[start:stop])
        print(f"Embedded {min(stop, total)}/{total} chunks")

    return vectors

Step 4: Store in a Vector Database

Python

# store.py — using Qdrant (local in-memory mode for development)
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance,
    VectorParams,
    PointStruct,
)
import uuid

# Name of the Qdrant collection that create_store(), upsert_chunks(), and
# retrieve() all operate on.
COLLECTION = "documents"
# The collection is created with this vector size, so the embeddings passed to
# upsert_chunks() are expected to have this length.
VECTOR_SIZE = 1536  # text-embedding-3-small dimension

def create_store() -> QdrantClient:
    """Create an in-memory Qdrant client holding an empty ``COLLECTION``.

    The collection stores ``VECTOR_SIZE``-dimensional vectors compared by
    cosine distance.
    """
    qdrant = QdrantClient(":memory:")  # use url="http://localhost:6333" for persistent
    cosine_vectors = VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
    qdrant.create_collection(collection_name=COLLECTION, vectors_config=cosine_vectors)
    return qdrant

def upsert_chunks(store: QdrantClient, chunks, embeddings):
    """Write chunks and their embeddings into the ``COLLECTION`` collection.

    Each chunk becomes one point with a fresh random UUID as its id, the
    embedding as its vector, and a payload of ``text``, ``source`` and
    ``page`` (the latter two default to ``"unknown"`` and ``0`` when absent
    from the chunk metadata).

    Args:
        store: Qdrant client that already contains ``COLLECTION``.
        chunks: Objects exposing ``.page_content`` and ``.metadata``.
        embeddings: One vector per chunk, in the same order as ``chunks``.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length. A bare
            ``zip`` would silently drop the unmatched tail.
    """
    # Materialize so len() works even when a caller passes generators.
    chunks = list(chunks)
    embeddings = list(embeddings)
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"Got {len(chunks)} chunks but {len(embeddings)} embeddings; "
            "refusing to upsert misaligned data"
        )

    points = [
        PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            payload={
                "text": chunk.page_content,
                "source": chunk.metadata.get("source", "unknown"),
                "page": chunk.metadata.get("page", 0),
            },
        )
        for chunk, embedding in zip(chunks, embeddings)
    ]
    # Nothing to write for empty input; skip the round trip.
    if points:
        store.upsert(collection_name=COLLECTION, points=points)
    print(f"Upserted {len(points)} vectors")

Step 5: Retrieve

Python

# retriever.py
def retrieve(
    store: QdrantClient,
    query: str,
    top_k: int = 4,
    embed_fn=None,
) -> list[dict]:
    """Return the ``top_k`` stored chunks most similar to ``query``.

    Args:
        store: Qdrant client that contains ``COLLECTION``.
        query: Natural-language query text.
        top_k: Maximum number of hits to return.
        embed_fn: Callable taking a list of strings and returning one vector
            per string (for example ``embed_batch``). Required, despite the
            ``None`` default kept for signature compatibility.

    Returns:
        A list of ``{"text", "source", "score"}`` dicts in the order the
        vector store returned them.

    Raises:
        ValueError: If ``embed_fn`` is not supplied.
    """
    if embed_fn is None:
        raise ValueError("retrieve() requires embed_fn, e.g. embed_fn=embed_batch")

    query_vector = embed_fn([query])[0]

    # Newer qdrant-client releases deprecate search() in favour of
    # query_points(); fall back to search() on clients that predate it.
    if hasattr(store, "query_points"):
        hits = store.query_points(
            collection_name=COLLECTION,
            query=query_vector,
            limit=top_k,
            with_payload=True,
        ).points
    else:
        hits = store.search(
            collection_name=COLLECTION,
            query_vector=query_vector,
            limit=top_k,
            with_payload=True,
        )

    return [
        {
            "text": hit.payload["text"],
            "source": hit.payload["source"],
            "score": hit.score,
        }
        for hit in hits
    ]

Step 6: Generate

Python

# generator.py
def generate_answer(question: str, chunks: list[dict]) -> str:
    """Answer ``question`` using only the retrieved ``chunks`` as context.

    Args:
        question: The user's question.
        chunks: Retrieved chunks, each a dict with ``"source"`` and
            ``"text"`` keys. They are numbered from 1 in the prompt so the
            model can cite them as ``[Source N]``.

    Returns:
        The model's answer. With no chunks, the fixed "not enough
        information" sentence is returned without calling the model, since
        the prompt instructs the model to say exactly that when context is
        missing. An empty string is returned if the model reply has no text
        content.
    """
    no_answer = "I don't have enough information to answer this."
    if not chunks:
        return no_answer

    context = "\n\n---\n\n".join(
        f"[Source {i}: {chunk['source']}]\n{chunk['text']}"
        for i, chunk in enumerate(chunks, start=1)
    )

    system_prompt = """You are a helpful assistant that answers questions based on provided context.

Rules:
- Answer ONLY from the provided context
- If the context doesn't contain the answer, say: "I don't have enough information to answer this."
- Be concise but complete
- Cite sources using [Source N] notation"""

    user_message = f"""Context:
{context}

Question: {question}"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        temperature=0,  # deterministic output for a factual QA task
        max_tokens=512,
    )
    # message.content is optional in the API response; keep the str contract.
    return response.choices[0].message.content or ""

Putting It All Together

Python

# pipeline.py — full naive RAG pipeline
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def build_pipeline(data_dir: str):
    """Index every .txt file under ``data_dir`` and return the vector store.

    Runs the four indexing stages in order (load, chunk, embed, store) and
    prints the document and chunk counts along the way.
    """
    # 1. Load
    raw_docs = DirectoryLoader(data_dir, glob="**/*.txt", loader_cls=TextLoader).load()
    print(f"Loaded {len(raw_docs)} documents")

    # 2. Chunk
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
    pieces = text_splitter.split_documents(raw_docs)
    print(f"Created {len(pieces)} chunks")

    # 3. Embed
    vectors = embed_all_chunks(pieces)

    # 4. Store
    vector_store = create_store()
    upsert_chunks(vector_store, pieces, vectors)
    return vector_store

def query_pipeline(store, question: str) -> str:
    """Retrieve the top 4 chunks for ``question`` and generate an answer.

    Prints each hit's rank, similarity score, and source before generating.
    """
    # 5. Retrieve
    hits = retrieve(store, question, top_k=4, embed_fn=embed_batch)
    for rank, hit in enumerate(hits, start=1):
        print(f"  [{rank}] score={hit['score']:.3f} | {hit['source']}")

    # 6. Generate
    return generate_answer(question, hits)

# Usage
if __name__ == "__main__":
    # Index everything under ./data, then ask a single sample question.
    # NOTE: `store` becomes a module-level name here; the evaluation snippet
    # later in this file passes it to evaluate_pipeline().
    store = build_pipeline("./data")
    answer = query_pipeline(store, "What is the refund policy?")
    print("\nAnswer:", answer)

LangChain One-Liner (for comparison)

Python

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Qdrant
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import DirectoryLoader

# Build: load the .txt files, split them, then embed and index the chunks.
loader = DirectoryLoader("./data", glob="**/*.txt")
docs = loader.load()

from langchain_text_splitters import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
chunks = splitter.split_documents(docs)

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Qdrant.from_documents(
    chunks,
    embedding_model,
    location=":memory:",
    collection_name="documents",
)

# Query: top-4 retrieval feeding a deterministic (temperature 0) chat model.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)
result = qa.invoke("What is the refund policy?")
print(result["result"])

Limitations of Naive RAG

1. Semantic Gap

The user asks "How do I cancel my account?" but the relevant document says "Account termination procedure." The embedding similarity may not be high enough to retrieve it. Naive RAG has no mechanism to bridge this gap.

Query vector:   [cancel, account, how]
Document chunk: [terminate, account, procedure, steps]
Cosine sim: 0.71  ← may not rank in top-4

2. No Query Rewriting

Short, ambiguous queries return noisy results. "Python speed" could mean Python performance optimization, Python's speed compared to other languages, or something entirely different. Naive RAG retrieves whatever the embedding happens to rank.

3. No Reranking

The embedding model encodes semantics coarsely. The top result by cosine similarity is often not the most relevant result. A cross-encoder reranker that sees both the query and the candidate chunk would rank more accurately but is too slow to run over the full index.

4. Chunk Boundary Problems

If the answer spans two chunks, neither chunk alone may have enough information. Fixed-size chunking ignores document structure, sometimes splitting a sentence mid-thought.

5. No Quality Filter

Naive RAG retrieves the top-k regardless of their actual relevance scores. If the best matching chunk has a similarity of only 0.45, it is probably irrelevant — but the system includes it anyway.

Python

# Simple quality filter — drop chunks below a threshold
def retrieve_with_threshold(
    store,
    query,
    top_k=8,
    threshold=0.60,
    embed_fn=None,
    max_results=4,
):
    """Retrieve candidates and keep only those scoring at least ``threshold``.

    Args:
        store: Vector store passed through to ``retrieve``.
        query: Query text.
        top_k: Number of candidates to fetch before filtering.
        threshold: Minimum similarity score a candidate must reach.
        embed_fn: Embedding callable passed through to ``retrieve``.
        max_results: Cap on the number of chunks returned after filtering
            (previously hard-coded to 4).

    Returns:
        Up to ``max_results`` chunk dicts, in the order ``retrieve`` returned
        them. An empty list means nothing cleared the threshold; the caller
        should handle that "no relevant context" case.
    """
    candidates = retrieve(store, query, top_k=top_k, embed_fn=embed_fn)
    relevant = [c for c in candidates if c["score"] >= threshold]
    # Slicing an empty list yields [], so no separate empty branch is needed.
    return relevant[:max_results]

Measuring Naive RAG Quality

Before optimizing, establish a baseline:

Python

# eval_baseline.py
import json

def evaluate_pipeline(store, qa_pairs: list[dict]) -> dict:
    """Run each golden question through the pipeline and average two proxies.

    For every pair the top 4 chunks are retrieved and an answer generated.
    Two rough signals are recorded: the fraction of the expected answer's
    (lower-cased, whitespace-split) words that also appear in the generated
    answer, and the similarity score of the best retrieved chunk.

    Args:
        store: Vector store to query.
        qa_pairs: ``[{"question": "...", "expected": "..."}, ...]``

    Returns:
        ``{"avg_answer_overlap", "avg_top_retrieval_score",
        "num_evaluated"}``, with averages rounded to 3 decimals. When
        ``qa_pairs`` is empty both averages are ``0.0`` rather than raising
        ``ZeroDivisionError``.
    """
    results = []
    for qa in qa_pairs:
        chunks = retrieve(store, qa["question"], top_k=4, embed_fn=embed_batch)
        answer = generate_answer(qa["question"], chunks)

        # Simple keyword overlap as a proxy metric
        expected_words = set(qa["expected"].lower().split())
        answer_words = set(answer.lower().split())
        # An empty expected answer has no words to match; score it 0.0
        # instead of dividing by zero.
        overlap = (
            len(expected_words & answer_words) / len(expected_words)
            if expected_words
            else 0.0
        )

        results.append({
            "question": qa["question"],
            "overlap": overlap,
            "num_chunks": len(chunks),
            "top_score": chunks[0]["score"] if chunks else 0,
        })

    if not results:
        return {
            "avg_answer_overlap": 0.0,
            "avg_top_retrieval_score": 0.0,
            "num_evaluated": 0,
        }

    avg_overlap = sum(r["overlap"] for r in results) / len(results)
    avg_top_score = sum(r["top_score"] for r in results) / len(results)

    return {
        "avg_answer_overlap": round(avg_overlap, 3),
        "avg_top_retrieval_score": round(avg_top_score, 3),
        "num_evaluated": len(results),
    }

# Load your golden test set.
# JSON files are UTF-8 by specification; pass the encoding explicitly so the
# read does not depend on the platform's locale default.
with open("golden_qa.json", encoding="utf-8") as f:
    qa_pairs = json.load(f)

# `store` is the vector store built earlier (see build_pipeline).
metrics = evaluate_pipeline(store, qa_pairs)
print(json.dumps(metrics, indent=2))

When Naive RAG Is Good Enough

Naive RAG is not always inadequate. It works well when:

- Documents are well-written, informative prose (not fragmented tables or scanned text)
- Queries are natural language questions that map cleanly to document language
- The domain vocabulary is standard (no specialized jargon that differs from common usage)
- You need a fast prototype to validate the use case before investing in advanced techniques

If your naive RAG gets above 70% of answers right, you have a solid foundation. If it's below 50%, investigate chunking and embedding model choice before anything else — those have the highest leverage.

Naive RAG: The Basic Pipeline

Naive RAG: The Basic Pipeline

The Six-Step Pipeline

Step 1: Load Documents

Step 2: Chunk Documents

Step 3: Embed Chunks

Step 4: Store in a Vector Database

Step 5: Retrieve

Step 6: Generate

Putting It All Together

LangChain One-Liner (for comparison)

Limitations of Naive RAG

1. Semantic Gap

2. No Query Rewriting

3. No Reranking

4. Chunk Boundary Problems

5. No Quality Filter

Measuring Naive RAG Quality

When Naive RAG Is Good Enough

Enjoyed this article?

Leave a comment