Context Stuffing: Maximizing What the Model Knows
Techniques for packing the right information into a context window. Covers document selection, truncation strategies, context ordering, and the lost-in-the-middle problem.
What is Context Stuffing?
Context stuffing is the practice of including external information in the prompt so the model can use it when answering — rather than relying solely on its pretraining knowledge. This is the augmentation step in RAG (Retrieval-Augmented Generation): the documents found by retrieval are placed into the prompt before generation.
The challenge: your context window is limited (8k–200k tokens). Not all retrieved documents fit. Even when they do fit, their placement within the context affects whether the model attends to them.
Document Selection: Quality Over Quantity
Retrieving more documents doesn't always help. A context with 20 weakly relevant documents often performs worse than 5 highly relevant ones:
from openai import OpenAI
from sentence_transformers import SentenceTransformer
import numpy as np
client = OpenAI()
embedder = SentenceTransformer("all-MiniLM-L6-v2")
def select_relevant_documents(
    query: str,
    documents: list[dict],  # Each has "content" and "title"
    max_docs: int = 5,
    min_similarity: float = 0.5,
) -> list[dict]:
    """Select only genuinely relevant documents, not just top-k.

    Each document is scored by the cosine similarity between the query
    embedding and the embedding of the first 500 characters of its
    ``"content"``. Documents scoring below ``min_similarity`` are dropped;
    the rest are returned best-first, capped at ``max_docs``.

    Args:
        query: The question to match documents against.
        documents: Dicts with at least a ``"content"`` key.
        max_docs: Maximum number of documents to return.
        min_similarity: Minimum cosine similarity a document must reach.

    Returns:
        The selected document dicts, sorted by descending similarity.
        Empty when ``documents`` is empty or nothing clears the threshold.
    """
    if not documents:
        # Nothing to score, so skip both embedding calls.
        return []

    query_embedding = embedder.encode([query])[0]
    doc_embeddings = embedder.encode([d["content"][:500] for d in documents])

    # The query norm does not change between documents; compute it once.
    query_norm = np.linalg.norm(query_embedding)

    scored = []
    for doc, emb in zip(documents, doc_embeddings):
        denominator = query_norm * np.linalg.norm(emb)
        if denominator == 0:
            # Cosine similarity is undefined for a zero vector. Skip the
            # document instead of producing NaN and a runtime warning.
            continue
        similarity = float(np.dot(query_embedding, emb) / denominator)
        if similarity >= min_similarity:
            scored.append({"doc": doc, "similarity": similarity})

    # Sort by similarity, keep top max_docs
    scored.sort(key=lambda x: x["similarity"], reverse=True)
    return [s["doc"] for s in scored[:max_docs]]
# Example: pick the documents relevant to a warfarin/clarithromycin question.
query = "What is the mechanism of warfarin's drug interaction with clarithromycin?"
documents = [
    {"title": "Warfarin and CYP2C9", "content": "Warfarin is primarily metabolized by CYP2C9 (S-warfarin) and CYP3A4 (R-warfarin). S-warfarin is 3-5× more potent..."},
    {"title": "Clarithromycin pharmacokinetics", "content": "Clarithromycin is a macrolide antibiotic that inhibits CYP3A4 and to a lesser extent CYP2C9..."},
    {"title": "Aspirin dosing guidelines", "content": "Aspirin 81mg daily is recommended for secondary prevention..."},  # Less relevant
    {"title": "Drug-drug interactions in anticoagulation", "content": "CYP inhibitors that interact with warfarin include..."},
]
relevant = select_relevant_documents(query, documents, max_docs=3, min_similarity=0.4)
print(f"Selected {len(relevant)} relevant documents")
for doc in relevant:
    print(f" - {doc['title']}")


# Context Assembly
Once documents are selected, assemble the context efficiently:
import tiktoken
enc = tiktoken.encoding_for_model("gpt-4o")
def assemble_context(
    query: str,
    documents: list[dict],
    system_prompt: str,
    max_context_tokens: int = 50_000,
) -> str:
    """Assemble context, respecting token budget.

    The document budget is ``max_context_tokens`` minus the tokens of
    ``system_prompt`` and ``query`` and a fixed 200-token formatting
    overhead. Documents are added in the order given, each rendered as
    ``"## <title>\\n<content>\\n"``. The first document that does not fit is
    included in truncated form if more than 200 tokens of budget remain;
    assembly stops there either way.

    Args:
        query: The user question (counted against the budget, not emitted).
        documents: Dicts with ``"title"`` and ``"content"`` keys.
        system_prompt: The system prompt (counted, not emitted).
        max_context_tokens: Total token budget for the whole request.

    Returns:
        The rendered documents joined with newlines, or ``""`` when there
        are no documents or none fit.
    """
    if not documents:
        # Same result as joining zero parts, without tokenizing anything.
        return ""

    system_tokens = len(enc.encode(system_prompt))
    query_tokens = len(enc.encode(query))
    overhead = 200  # Tokens for formatting, role headers, etc.
    available_for_docs = max_context_tokens - system_tokens - query_tokens - overhead

    context_parts = []
    tokens_used = 0
    for doc in documents:
        doc_text = f"## {doc['title']}\n{doc['content']}\n"
        doc_tokens = len(enc.encode(doc_text))
        if tokens_used + doc_tokens <= available_for_docs:
            context_parts.append(doc_text)
            tokens_used += doc_tokens
            continue

        # This document overflows the budget: try a truncated version.
        budget_remaining = available_for_docs - tokens_used
        if budget_remaining > 200:  # Only include if we have meaningful space
            header = f"## {doc['title']}\n"
            marker = " [truncated]\n"
            # Reserve room for the header and marker actually emitted. A flat
            # 50-token reserve is kept as a floor, but a long title must not
            # push the truncated entry past the budget.
            reserved = max(50, len(enc.encode(header)) + len(enc.encode(marker)))
            content_budget = budget_remaining - reserved
            if content_budget > 0:
                truncated_content = enc.decode(
                    enc.encode(doc["content"])[:content_budget]
                )
                context_parts.append(f"{header}{truncated_content}{marker}")
        break
    return "\n".join(context_parts)
# Assemble the selected documents under a small budget and report the size.
context = assemble_context(
    query="How does clarithromycin interact with warfarin?",
    documents=relevant,
    system_prompt="You are a clinical pharmacology assistant.",
    max_context_tokens=4_000,
)
print(f"Context length: {len(enc.encode(context))} tokens")


# The Lost-in-the-Middle Problem
Research (Liu et al., 2023) found that language models perform better when relevant information is at the beginning or end of the context, and worse when it's in the middle:
Context: [Doc1] [Doc2] [Doc3 - RELEVANT] [Doc4] [Doc5]
                              ↑ model attends less well here
Strategic ordering of context:
def order_context_strategically(
    documents: list[dict],
    strategy: str = "primacy_recency",
) -> list[dict]:
    """Order documents to maximize model attention.

    ``documents`` is assumed to be sorted by relevance, highest first.

    Strategies:
        ``"primacy_recency"``: most relevant first, second most relevant
            last, with relevance decreasing toward the middle. For ranks
            1..5 the result is ``[1, 3, 5, 4, 2]``.
        ``"relevance_first"``: keep the given order (most relevant first).
        ``"relevance_last"``: reverse the order, so the most relevant
            document sits immediately before the query.

    Args:
        documents: Documents sorted by descending relevance.
        strategy: One of the strategy names above.

    Returns:
        A new list with the documents reordered; the input is not modified.

    Raises:
        ValueError: If ``strategy`` is not a known strategy name.
    """
    if strategy == "primacy_recency":
        # Odd ranks (1st, 3rd, ...) fill from the front and even ranks
        # (2nd, 4th, ...) fill from the back, so the weakest documents land
        # in the middle where the model attends least.
        front = list(documents[0::2])
        back = list(documents[1::2])
        back.reverse()
        return front + back
    if strategy == "relevance_first":
        # Simply most relevant at the start; input is already sorted.
        return list(documents)
    if strategy == "relevance_last":
        # Most relevant immediately before the query (strongest recency)
        return list(reversed(documents))
    raise ValueError(
        f"Unknown strategy {strategy!r}; expected 'primacy_recency', "
        "'relevance_first', or 'relevance_last'"
    )
def build_prompt_with_strategic_context(
    query: str,
    documents: list[dict],
    strategy: str = "primacy_recency",
) -> str:
    """Build a prompt: an instruction line, the documents, then the question.

    The documents are first reordered by ``order_context_strategically``
    using ``strategy``, then numbered from 1 in their new order.

    Args:
        query: The question placed at the end of the prompt.
        documents: Dicts with ``"title"`` and ``"content"`` keys.
        strategy: Ordering strategy passed to ``order_context_strategically``.

    Returns:
        The full prompt text.
    """
    ordered = order_context_strategically(documents, strategy)
    # Collect the pieces and join once instead of repeated string +=.
    parts = ["Use the following information to answer the question:\n\n"]
    parts.extend(
        f"[Document {i}]: {doc['title']}\n{doc['content']}\n\n"
        for i, doc in enumerate(ordered, 1)
    )
    parts.append(f"Question: {query}")
    return "".join(parts)


# Chunking Long Documents
When individual documents are too long, chunk them before retrieval:
def chunk_document(
    text: str,
    chunk_size_tokens: int = 400,
    overlap_tokens: int = 50,
    title: str = "",
) -> list[dict]:
    """Split a document into overlapping chunks.

    The text is tokenized, cut into windows of ``chunk_size_tokens`` tokens
    that each start ``chunk_size_tokens - overlap_tokens`` tokens after the
    previous one, and every window is decoded back to text.

    Args:
        text: The document to split.
        chunk_size_tokens: Maximum tokens per chunk; must be positive.
        overlap_tokens: Tokens shared between consecutive chunks; must be
            at least 0 and smaller than ``chunk_size_tokens``.
        title: Optional document title used to label the chunks.

    Returns:
        A list of dicts with keys ``"title"`` (``"<title> (part N)"`` or
        ``"Chunk N"`` when no title is given), ``"content"``,
        ``"chunk_index"`` (0-based) and ``"source"`` (the title). Empty when
        ``text`` encodes to no tokens.

    Raises:
        ValueError: If ``chunk_size_tokens`` is not positive, or
            ``overlap_tokens`` is negative or not smaller than
            ``chunk_size_tokens``.
    """
    # Validate before tokenizing: an overlap as large as the chunk size
    # would never advance the window and loop forever.
    if chunk_size_tokens <= 0:
        raise ValueError("chunk_size_tokens must be positive")
    if not 0 <= overlap_tokens < chunk_size_tokens:
        raise ValueError(
            "overlap_tokens must be >= 0 and smaller than chunk_size_tokens"
        )

    tokens = enc.encode(text)
    step = chunk_size_tokens - overlap_tokens  # Overlap for context continuity
    chunks = []
    for chunk_num, start in enumerate(range(0, len(tokens), step)):
        end = min(start + chunk_size_tokens, len(tokens))
        chunks.append({
            "title": f"{title} (part {chunk_num + 1})" if title else f"Chunk {chunk_num + 1}",
            "content": enc.decode(tokens[start:end]),
            "chunk_index": chunk_num,
            "source": title,
        })
        if end >= len(tokens):
            # This window reached the end; a further one would only repeat
            # the overlap.
            break
    return chunks
# For a 50-page drug monograph:
# Read with an explicit encoding so the result does not depend on the
# platform default (assumes the file is UTF-8 — adjust if it is not).
with open("warfarin_monograph.txt", encoding="utf-8") as f:
    full_text = f.read()
chunks = chunk_document(full_text, chunk_size_tokens=400, title="Warfarin Monograph")
print(f"Split into {len(chunks)} chunks")
# Then embed each chunk and retrieve only the most relevant ones


# Citing Sources in the Context
When asking the model to use documents, tell it to cite them:
def ask_with_citations(query: str, documents: list[dict]) -> str:
    """Ask the model to answer from the given documents and cite them.

    The documents are numbered from 1 and listed under a "Reference
    documents" header. The prompt tells the model to use only those
    documents, cite them by bracketed number, and say so when they do not
    cover the question. The request goes to ``gpt-4o`` at temperature 0.

    Args:
        query: The question to answer.
        documents: Dicts with ``"title"`` and ``"content"`` keys.

    Returns:
        The message content of the first completion choice.
    """
    numbered_docs = "".join(
        f"[{idx}] {entry['title']}:\n{entry['content']}\n\n"
        for idx, entry in enumerate(documents, 1)
    )
    context = "Reference documents:\n\n" + numbered_docs
    prompt = f"""{context}
Using ONLY information from the reference documents above, answer:
{query}
For every claim, cite the document number in brackets like [1] or [2].
If the documents don't contain information needed to answer, say "The provided documents don't contain information about this aspect."
Do not use knowledge not found in the documents."""
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
# Ask a question against the previously selected documents.
answer = ask_with_citations(
    "How does clarithromycin affect warfarin's metabolism?",
    relevant,
)
print(answer)


# Handling Conflicting Documents
When retrieved documents contradict each other, prompt the model to acknowledge it:
# Prompt template asking the model to surface contradictions between sources.
# It carries two placeholders, {context} and {query}, suitable for
# str.format(context=..., query=...).
CONFLICT_AWARE_PROMPT = """You have access to the following reference documents.
If documents contain contradictory information:
1. Note the contradiction explicitly
2. Indicate which source you believe is more reliable and why
3. Present both perspectives if the answer depends on which source is correct
Reference documents:
{context}
Question: {query}"""


# Practical Context Budget Allocation
def estimate_budget(
    model: str = "gpt-4o",
    reserved_for_output: int = 1_000,
) -> dict:
    """Estimate how many input tokens a model leaves for context.

    Looks up the model's context window in a built-in table (unknown models
    fall back to 8,000 tokens), subtracts the tokens reserved for the
    reply, and reports how many 400- and 800-token documents fit in the
    remainder.

    Args:
        model: Model name to look up.
        reserved_for_output: Tokens set aside for the model's answer.

    Returns:
        A dict with ``total_context``, ``reserved_output``,
        ``available_for_input``, ``documents_at_400_tokens_each`` and
        ``documents_at_800_tokens_each``.
    """
    window_by_model = {
        "gpt-4o": 128_000,
        "gpt-4o-mini": 128_000,
        "gpt-3.5-turbo": 16_385,
        "claude-3-5-sonnet": 200_000,
    }
    if model in window_by_model:
        window = window_by_model[model]
    else:
        window = 8_000
    input_room = window - reserved_for_output

    report = {
        "total_context": window,
        "reserved_output": reserved_for_output,
        "available_for_input": input_room,
    }
    # One capacity entry per reference document size.
    for doc_size in (400, 800):
        report[f"documents_at_{doc_size}_tokens_each"] = input_room // doc_size
    return report
# Show how many 400-token chunks fit in gpt-4o's input budget.
budget = estimate_budget("gpt-4o")
print(f"gpt-4o can hold up to {budget['documents_at_400_tokens_each']} 400-token chunks")
# gpt-4o can hold up to 317 400-token chunks
# In practice, use far fewer — diminishing returns beyond 10-20 relevant chunks


# Context Compression
For large documents, compress before including:
def compress_document(document: str, query: str, target_tokens: int = 300) -> str:
    """Extract only the query-relevant parts of a long document.

    Documents that already fit within ``target_tokens`` are returned
    unchanged. Otherwise the first 3000 characters are sent to
    ``gpt-4o-mini`` with an instruction to return only the passages
    relevant to ``query``.

    Args:
        document: The document text to compress.
        query: The question that decides what counts as relevant.
        target_tokens: Token size at or below which no compression is done;
            also used to derive the word limit given to the model.

    Returns:
        The original document when it is short enough, otherwise the
        model's reply. The prompt tells the model to answer
        ``NOT RELEVANT`` when nothing applies, so callers should expect
        that string as a possible result.
    """
    if len(enc.encode(document)) <= target_tokens:
        return document
    # Only the first 3000 characters are shown to the model; anything
    # relevant beyond that point cannot be extracted.
    # NOTE(review): target_tokens // 4 looks like the "4 characters per
    # token" rule of thumb applied to words. At roughly 0.75 words per
    # token the limit could be about three times larger. Confirm intent.
    prompt = f"""The following document may contain information relevant to: "{query}"
Document:
{document[:3000]}
Extract and return ONLY the sentences or paragraphs that are directly relevant to the query.
Keep your extraction under {target_tokens // 4} words.
If nothing is relevant, respond with: NOT RELEVANT"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Use cheaper model for compression
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return response.choices[0].message.content


# Context compression trades latency (one extra LLM call per document) for
# token efficiency. At scale, using gpt-4o-mini for compression and gpt-4o for
# the final answer can reduce costs significantly.
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.