Query Rewriting and Expansion in RAG
Improve RAG retrieval quality by transforming user queries before search. This guide covers HyDE, multi-query generation, query decomposition, step-back prompting, and conversation-aware query rewriting.
The Query-Document Gap
User queries and indexed documents are often semantically misaligned:
- User: "My patient can't take this with ibuprofen — why?" (conversational)
- Document: "Pharmacodynamic interaction: NSAIDs reduce renal prostaglandin synthesis, attenuating natriuretic effects of ACE inhibitors." (technical)
The embedding distance between these is larger than it should be. Query rewriting bridges this gap by transforming queries into forms more likely to match the document vocabulary and style.
HyDE: Hypothetical Document Embeddings
HyDE (Gao et al., 2022) generates a hypothetical answer to the query, then uses that answer as the retrieval query:
from openai import OpenAI
import numpy as np
# Shared OpenAI client used by the query-rewriting helpers in this file.
# NOTE(review): constructed with no arguments, so credentials are presumably
# read from the environment — confirm before running.
client = OpenAI()
def hyde_retrieval(
    query: str,
    embedding_fn,
    retriever,
    top_k: int = 5,
) -> dict:
    """
    Retrieve documents using HyDE (Hypothetical Document Embeddings).

    Instead of embedding the query directly, ask the chat model for a
    hypothetical document that would answer the query, then embed that
    document for retrieval. The hypothetical document's embedding sits in the
    same space as real documents (not user query space), improving retrieval
    quality.

    Args:
        query: The user's question.
        embedding_fn: Callable that takes a string; its return value is passed
            unchanged to ``retriever.retrieve``.
        retriever: Object exposing ``retrieve(embedding, top_k=...)``.
        top_k: Number of documents requested from the retriever.

    Returns:
        A dict with the keys ``"query"``, ``"hypothetical_document"`` and
        ``"retrieved_documents"``. If the model returns no usable text, the
        original query is embedded instead and is reported as the
        hypothetical document.
    """
    # Step 1: Generate a hypothetical answer
    hyde_prompt = f"""Write a short, factual paragraph that would directly answer this question.
Use the same technical vocabulary as a clinical reference document.
Question: {query}
Write only the factual paragraph, as if you were a drug information database entry."""
    completion = client.chat.completions.create(
        model="gpt-4o-mini",  # Cheap model — this is just for retrieval, not the final answer
        messages=[{"role": "user", "content": hyde_prompt}],
        max_tokens=200,
        temperature=0,
    )
    generated = completion.choices[0].message.content
    # The API types message.content as optional; embedding None or a blank
    # string would fail or retrieve noise, so fall back to the raw query.
    hypothetical_document = generated if generated and generated.strip() else query

    # Step 2: Embed the hypothetical document (not the original query)
    hyde_embedding = embedding_fn(hypothetical_document)

    # Step 3: Retrieve using hypothetical document embedding
    results = retriever.retrieve(hyde_embedding, top_k=top_k)

    return {
        "query": query,
        "hypothetical_document": hypothetical_document,
        "retrieved_documents": results,
    }
# Example: send a conversational clinical question through HyDE retrieval.
# NOTE(review): `embed_text` and `retriever` are not defined in this snippet;
# presumably they come from the surrounding application — confirm.
result = hyde_retrieval(
    "Can I give this patient aspirin while they're on warfarin?",
    embedding_fn=lambda text: embed_text(text),
    retriever=retriever,
)
# Hypothetical doc: "Concomitant use of aspirin and warfarin presents a major pharmacodynamic
# interaction. Aspirin inhibits platelet aggregation via COX-1, while warfarin inhibits
# Vitamin K-dependent clotting factors. Together, these mechanisms synergistically increase
# bleeding risk by 2-3×..."
# This embeds much closer to actual drug interaction monographs than the original question
Multi-Query Generation
Generate multiple reformulations of the query, retrieve for each, then deduplicate:
def generate_query_variants(
    query: str,
    n_variants: int = 3,
    model: str = "gpt-4o-mini",
) -> list[str]:
    """
    Generate up to N semantically equivalent reformulations of a query.

    The model is asked, in JSON mode, for an object of the form
    ``{"queries": [...]}``. JSON mode produces a JSON object, so the prompt
    requests an object rather than a bare array.

    Args:
        query: The original user question.
        n_variants: Maximum number of reformulations to return.
        model: Chat model used to generate the reformulations.

    Returns:
        At most ``n_variants`` non-empty strings. If the response contains no
        list at all, falls back to ``[query]`` (truncated to ``n_variants``).
    """
    import json

    prompt = f"""Generate {n_variants} different ways to phrase this question.
Each variant should capture the same information need but use different vocabulary.
Include technical, clinical, and layperson phrasings.
Original question: {query}
Return a JSON object with a single key "queries" holding {n_variants} strings:
{{"queries": ["variant 1", "variant 2", ...]}}"""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0.7,  # Some variation here is good
    )
    # message.content is optional in the API; treat a missing body as "{}".
    payload = json.loads(response.choices[0].message.content or "{}")

    # Accept a bare list, the requested {"queries": [...]}, the legacy
    # {"variants": [...]}, or any other single list-valued key.
    candidates = None
    if isinstance(payload, list):
        candidates = payload
    elif isinstance(payload, dict):
        for key in ("queries", "variants"):
            if isinstance(payload.get(key), list):
                candidates = payload[key]
                break
        else:
            candidates = next(
                (value for value in payload.values() if isinstance(value, list)),
                None,
            )

    if candidates is None:
        return [query][:n_variants]

    # Drop anything that is not a usable query string.
    variants = [
        item.strip()
        for item in candidates
        if isinstance(item, str) and item.strip()
    ]
    return variants[:n_variants]
def multi_query_retrieval(
    query: str,
    embedding_fn,
    retriever,
    n_queries: int = 3,
    top_k_per_query: int = 5,
) -> list[dict]:
    """
    Generate multiple query variants, retrieve for each, then deduplicate.

    Takes the union of retrieved documents across the original query and its
    variants, keeping the best-scoring hit per document id.

    Args:
        query: The original user question; always searched in addition to the
            generated variants.
        embedding_fn: Callable that takes a string; its return value is passed
            unchanged to ``retriever.retrieve``.
        retriever: Object exposing ``retrieve(embedding, top_k=...)`` whose
            results are dicts with ``["document"]["id"]`` and ``["score"]``.
        n_queries: Number of variants requested from
            ``generate_query_variants``.
        top_k_per_query: Number of hits requested per query; also the maximum
            length of the returned list.

    Returns:
        The deduplicated hits sorted by descending score, truncated to
        ``top_k_per_query`` entries.
    """
    # Generate variants
    variants = generate_query_variants(query, n_variants=n_queries)

    # Skip blank or non-string variants and collapse exact duplicates (the
    # variant generator may echo the original query) so each distinct query
    # costs exactly one embedding call and one retrieval call.
    # dict.fromkeys preserves first-seen order.
    usable_variants = [v for v in variants if isinstance(v, str) and v.strip()]
    all_queries = list(dict.fromkeys([query, *usable_variants]))

    # Retrieve for each query, remembering the best-scoring hit per document
    best_by_id = {}
    for q in all_queries:
        hits = retriever.retrieve(embedding_fn(q), top_k=top_k_per_query)
        for hit in hits:
            doc_id = hit["document"]["id"]
            if doc_id not in best_by_id or hit["score"] > best_by_id[doc_id]["score"]:
                best_by_id[doc_id] = hit

    # Return deduplicated, sorted by best score
    ranked = sorted(best_by_id.values(), key=lambda x: x["score"], reverse=True)
    return ranked[:top_k_per_query]
# Example variants for "warfarin and ibuprofen interaction":
# 1. "What is the pharmacokinetic interaction between warfarin and ibuprofen?"
# 2. "Can NSAIDs affect warfarin anticoagulation?"
# 3. "Ibuprofen effect on INR and bleeding risk with warfarin therapy"
Query Decomposition
Break complex multi-hop questions into simpler sub-questions:
def decompose_query(query: str) -> list[str]:
    """
    Break a complex query into simpler sub-questions for sequential retrieval.

    Args:
        query: The complex, possibly multi-hop, user question.

    Returns:
        The non-empty sub-question strings returned by the model. Falls back
        to ``[query]`` when the response has no usable ``"sub_questions"``
        list, so callers always have at least one question to retrieve for.
    """
    import json

    prompt = f"""Decompose this complex question into 2-4 simpler sub-questions.
Each sub-question should be answerable independently.
The answers to sub-questions should collectively answer the main question.
Main question: {query}
Return JSON: {{"sub_questions": ["question 1", "question 2", ...]}}"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0,
    )
    # message.content is optional in the API; treat a missing body as "{}".
    payload = json.loads(response.choices[0].message.content or "{}")

    raw = payload.get("sub_questions") if isinstance(payload, dict) else None
    if not isinstance(raw, list):
        return [query]

    sub_questions = [
        item.strip()
        for item in raw
        if isinstance(item, str) and item.strip()
    ]
    # An empty decomposition would make the caller skip retrieval entirely.
    return sub_questions or [query]
def decomposition_retrieval(
    query: str,
    embedding_fn,
    retriever,
    generator,
    top_k: int = 3,
) -> dict:
    """
    Sequential retrieval using query decomposition.

    The query is split into sub-questions; each one is retrieved for and
    answered in turn, and the intermediate answers are then synthesized into
    a final answer to the original query.

    Args:
        query: The complex user question.
        embedding_fn: Callable that takes a string; its return value is passed
            unchanged to ``retriever.retrieve``.
        retriever: Object exposing ``retrieve(embedding, top_k=...)`` whose
            results are dicts with ``["document"]["id"]`` and
            ``["document"]["content"]``.
        generator: Object exposing ``answer(question, context=...)``; the
            final call additionally passes ``instruction=...``.
        top_k: Number of documents retrieved per sub-question.

    Returns:
        A dict with ``"original_query"``, ``"sub_questions"`` (one entry per
        sub-question with its question, context document ids and answer),
        ``"final_answer"``, and ``"retrieved_docs"`` (every retrieved
        document, deduplicated by id in first-seen order).
    """
    sub_questions = decompose_query(query)
    sub_answers = []
    # Previously the retrieved documents were collected but never returned;
    # keep them, keyed by id, so callers can cite sources.
    retrieved_by_id = {}

    for sub_q in sub_questions:
        # Retrieve for this sub-question. Materialize the result because it
        # is iterated more than once below.
        docs = list(retriever.retrieve(embedding_fn(sub_q), top_k=top_k))
        for doc in docs:
            retrieved_by_id.setdefault(doc["document"]["id"], doc)

        # Generate intermediate answer
        context = "\n\n".join(doc["document"]["content"] for doc in docs)
        answer = generator.answer(sub_q, context=context)
        sub_answers.append({
            "question": sub_q,
            "context_docs": [doc["document"]["id"] for doc in docs],
            "answer": answer,
        })

    # Final synthesis: combine sub-answers to answer the main question
    synthesis_context = "\n\n".join(
        f"Q: {sa['question']}\nA: {sa['answer']}"
        for sa in sub_answers
    )
    final_answer = generator.answer(
        query,
        context=synthesis_context,
        instruction="Use the intermediate answers above to construct a comprehensive final answer.",
    )

    return {
        "original_query": query,
        "sub_questions": sub_answers,
        "final_answer": final_answer,
        "retrieved_docs": list(retrieved_by_id.values()),
    }
# Example decomposition:
# Query: "What anticoagulation adjustments are needed if my warfarin patient starts dialysis and needs antibiotics?"
# Sub-questions:
# 1. "How does dialysis affect warfarin metabolism and INR control?"
# 2. "Which antibiotics commonly interact with warfarin and by what mechanism?"
# 3. "How should warfarin be dosed in dialysis patients?"
Step-Back Prompting
Generate a more general version of the query to retrieve background knowledge:
def step_back_retrieval(
    query: str,
    embedding_fn,
    retriever,
    top_k: int = 5,
) -> dict:
    """
    Step-back prompting: retrieve for a higher-level question first,
    then retrieve for the original specific question.

    Provides background context that helps answer specific questions.

    Args:
        query: The specific user question.
        embedding_fn: Callable that takes a string; its return value is passed
            unchanged to ``retriever.retrieve``.
        retriever: Object exposing ``retrieve(embedding, top_k=...)`` whose
            results are dicts with ``["document"]["id"]``.
        top_k: Number of documents retrieved for the specific question and the
            maximum length of ``"combined_docs"``. Background retrieval uses
            ``top_k // 2``.

    Returns:
        A dict with ``"original_query"``, ``"step_back_query"``,
        ``"background_docs"``, ``"specific_docs"`` and ``"combined_docs"``
        (background first, then specific, deduplicated by id and truncated to
        ``top_k``).
    """
    step_back_prompt = f"""What is the more general principle or background knowledge needed to answer this specific question?
Specific question: {query}
Write the more general question (1-2 sentences)."""
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": step_back_prompt}],
        max_tokens=100,
        temperature=0,
    )
    # message.content is optional in the API; avoid calling .strip() on None.
    general_question = (completion.choices[0].message.content or "").strip()

    # Retrieve for general question (background). Skip the call when there is
    # nothing to search for: a blank step-back question, or a background
    # budget of zero (top_k < 2).
    background_k = top_k // 2
    if general_question and background_k > 0:
        general_docs = retriever.retrieve(
            embedding_fn(general_question), top_k=background_k
        )
    else:
        general_docs = []

    # Retrieve for specific question
    specific_docs = retriever.retrieve(embedding_fn(query), top_k=top_k)

    # Combine (general context + specific details), first occurrence wins
    seen_ids = set()
    combined_docs = []
    for doc_list in (general_docs, specific_docs):
        for doc in doc_list:
            doc_id = doc["document"]["id"]
            if doc_id not in seen_ids:
                seen_ids.add(doc_id)
                combined_docs.append(doc)

    return {
        "original_query": query,
        "step_back_query": general_question,
        "background_docs": general_docs,
        "specific_docs": specific_docs,
        "combined_docs": combined_docs[:top_k],
    }
# Example:
# Original: "What dose of metformin is safe if eGFR is 35?"
# Step-back: "How does renal function affect drug elimination and dosing in general?"
# → Retrieves general pharmacokinetics context + specific metformin renal dosing info
Query Rewriting with Conversation Context
In multi-turn conversations, follow-up queries often depend on earlier turns and must be rewritten into standalone queries before retrieval:
def rewrite_with_context(
    current_query: str,
    conversation_history: list[dict],
    model: str = "gpt-4o-mini",
) -> str:
    """
    Rewrite a follow-up query to be standalone.

    "What's its half-life?" → "What is the half-life of warfarin?"

    Args:
        current_query: The latest user question, possibly containing
            references that only make sense with the earlier turns.
        conversation_history: Earlier messages as dicts with ``"role"`` and
            ``"content"`` keys, oldest first. Only the last six messages are
            included in the prompt.
        model: Chat model used for the rewrite.

    Returns:
        The rewritten standalone question. Returns ``current_query``
        unchanged when there is no history or when the model produces no
        text, so the caller always gets a usable retrieval query.
    """
    if not conversation_history:
        return current_query

    # Format history for the prompt
    history_text = "\n".join(
        f"{msg['role'].capitalize()}: {msg['content']}"
        for msg in conversation_history[-6:]  # Last 3 turns
    )
    prompt = f"""Given this conversation history:
{history_text}
Rewrite the following follow-up question to be completely standalone (self-contained):
"{current_query}"
Return only the rewritten question, nothing else."""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=100,
        temperature=0,
    )
    # message.content is optional in the API; an empty rewrite would turn
    # into an empty retrieval query, so keep the user's wording instead.
    rewritten = (response.choices[0].message.content or "").strip()
    return rewritten or current_query
# Example:
# History: "Tell me about warfarin" → "Warfarin is a vitamin K antagonist..."
# Follow-up: "What's its typical INR target?"
# Rewritten: "What is the typical INR target for warfarin therapy?"
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.