AI Systems · Advanced
Multimodal RAG: Images and Documents
Extend RAG to handle images, charts, and mixed-media documents. Caption-based indexing, CLIP embeddings for image search, and multimodal context assembly.
Asma Hafeez Khan · May 16, 2026 · 6 min read
RAG · Multimodal · Vision · Image Search · CLIP
Why Multimodal RAG
Many domain knowledge bases contain valuable information in non-text formats:
- Clinical: ECG tracings, X-rays, lab result tables as images
- Scientific: charts, diagrams, figures in research papers
- Technical: schematics, flowcharts, screenshots
A text-only RAG system ignores all this visual information. Multimodal RAG indexes and retrieves from images as well as text.
Approach 1: Caption-Based Indexing
Extract text descriptions from images, then index the captions:
Python
from openai import OpenAI
import base64
from pathlib import Path
from dataclasses import dataclass
client = OpenAI()
@dataclass
class ImageDocument:
    """An image together with the generated text that makes it searchable."""

    # Caller-supplied identifier for this document.
    id: str
    # Path of the source image file.
    image_path: str
    # Short caption used for search indexing.
    caption: str
    # Longer description of the visible content.
    detailed_description: str
    image_type: str  # "chart", "diagram", "photo", "screenshot", "table", or "other"
    extracted_text: str  # OCR text from the image
    # Free-form, caller-supplied metadata.
    metadata: dict
def generate_image_caption(
    image_path: str,
    context: str = "",
) -> dict:
    """Generate a searchable caption and detailed description for an image.

    The image is read from disk, base64-encoded into a data URL and sent to
    the ``gpt-4o`` chat-completions endpoint together with ``context``. The
    system prompt is written for a medical knowledge base.

    Args:
        image_path: Path of the image file to analyze.
        context: Optional surrounding text passed to the model as a hint.

    Returns:
        The JSON object parsed from the model reply. The prompt requests the
        keys ``caption``, ``detailed_description``, ``extracted_text`` and
        ``image_type``, but the reply is not validated here, so callers
        should read them defensively (e.g. with ``.get``).

    Raises:
        OSError: If ``image_path`` cannot be opened or read.
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    import json

    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode("utf-8")

    # Build the MIME type from the file extension. ".jpg" has to be sent as
    # "jpeg", and a path without any extension falls back to "jpeg" rather
    # than producing the invalid media type "image/".
    ext = Path(image_path).suffix.lower().lstrip(".")
    if ext in ("", "jpg"):
        ext = "jpeg"
    media_type = f"image/{ext}"

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": """You analyze images for a medical knowledge base.
Provide four things:
1. A concise caption (1-2 sentences) suitable for search indexing
2. A detailed description (3-5 sentences) of all content visible
3. Any text visible in the image (transcribe exactly)
4. Image type classification
Return JSON:
{
"caption": "...",
"detailed_description": "...",
"extracted_text": "...",
"image_type": "chart|diagram|photo|screenshot|table|other"
}""",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{image_data}",
                            "detail": "high",
                        },
                    },
                    {
                        "type": "text",
                        "text": f"Context: {context}\n\nAnalyze this image for medical knowledge base indexing.",
                    },
                ],
            },
        ],
        # JSON mode so the reply can be parsed directly below.
        response_format={"type": "json_object"},
        temperature=0,
    )

    return json.loads(response.choices[0].message.content)
def ingest_image_document(
    image_path: str,
    document_id: str,
    context: str = "",
    metadata: dict = None,
) -> ImageDocument:
    """Process an image into a searchable document.

    Args:
        image_path: Path of the image file to caption.
        document_id: Identifier stored as the document's ``id``.
        context: Optional surrounding text forwarded to the captioning call.
        metadata: Optional metadata stored on the document; an empty dict is
            used when omitted.

    Returns:
        An ``ImageDocument`` populated from the captioning result. Keys
        missing from that result default to ``""`` (``"other"`` for
        ``image_type``).
    """
    caption_data = generate_image_caption(image_path, context=context)

    # The text to embed for search is the caption, detailed description and
    # extracted text joined together; all three are stored on the document
    # so the indexing step can assemble it.
    return ImageDocument(
        id=document_id,
        image_path=image_path,
        caption=caption_data.get("caption", ""),
        detailed_description=caption_data.get("detailed_description", ""),
        image_type=caption_data.get("image_type", "other"),
        extracted_text=caption_data.get("extracted_text", ""),
        metadata=metadata or {},
    )
# Approach 2: CLIP Embeddings for Native Image Search
Embed images and queries in the same semantic space using CLIP:
Python
from transformers import CLIPModel, CLIPProcessor
import torch
import torch.nn.functional as F
from PIL import Image
import numpy as np
class CLIPRetriever:
    """Image retrieval using CLIP's shared vision-language embedding space.

    Image embeddings are kept in an in-memory list and compared against the
    query embedding with a dot product; both sides are L2-normalized, so the
    score is the cosine similarity.
    """

    def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
        """Load the CLIP model and processor and start with an empty index."""
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model.eval()
        self.image_embeddings: list[np.ndarray] = []
        self.image_documents: list[dict] = []

    def embed_image(self, image: "Image.Image") -> np.ndarray:
        """Embed a single image using CLIP's vision encoder."""
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            features = self.model.get_image_features(**inputs)
        return F.normalize(features, dim=-1).numpy()[0]

    def embed_text(self, text: str) -> np.ndarray:
        """Embed a text query using CLIP's text encoder."""
        inputs = self.processor(text=[text], return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            features = self.model.get_text_features(**inputs)
        return F.normalize(features, dim=-1).numpy()[0]

    @staticmethod
    def _load_rgb(image_path: str) -> "Image.Image":
        """Open an image as RGB and close the underlying file handle."""
        with Image.open(image_path) as img:
            return img.convert("RGB")

    def _rank(self, query_embedding: np.ndarray, top_k: int) -> list[dict]:
        """Return the ``top_k`` indexed images most similar to the query."""
        if not self.image_embeddings or top_k <= 0:
            return []
        image_matrix = np.stack(self.image_embeddings)
        # Cosine similarity (embeddings are normalized)
        similarities = image_matrix @ query_embedding
        top_indices = np.argsort(-similarities)[:top_k]
        return [
            {
                "image_path": self.image_documents[i]["image_path"],
                "metadata": self.image_documents[i]["metadata"],
                "similarity": float(similarities[i]),
            }
            for i in top_indices
        ]

    def add_image(self, image_path: str, metadata: dict = None) -> None:
        """Add an image to the retrieval index."""
        embedding = self.embed_image(self._load_rgb(image_path))
        self.image_embeddings.append(embedding)
        self.image_documents.append({
            "image_path": image_path,
            "metadata": metadata or {},
        })

    def search_by_text(self, query: str, top_k: int = 5) -> list[dict]:
        """Find images most similar to a text query.

        Returns a list of dicts with ``image_path``, ``metadata`` and
        ``similarity``, best match first; empty when nothing is indexed.
        """
        if not self.image_embeddings:
            return []
        return self._rank(self.embed_text(query), top_k)

    def search_by_image(self, query_image_path: str, top_k: int = 5) -> list[dict]:
        """Find similar images to a query image.

        Returns the same result shape as ``search_by_text``; empty when
        nothing is indexed (the query image is not opened in that case).
        """
        if not self.image_embeddings:
            return []
        query_embedding = self.embed_image(self._load_rgb(query_image_path))
        return self._rank(query_embedding, top_k)
# Approach 3: Late Interaction (ColPali)
ColPali embeds document page images directly and uses late-interaction for retrieval, without OCR:
Python
# ColPali: Efficient Document Retrieval with Vision Language Models
# pip install colpali-engine
from colpali_engine.models import ColPali
from colpali_engine.utils.torch_utils import get_torch_device
from PIL import Image
import torch
def load_colpali_model() -> ColPali:
    """Load ColPali model for visual document retrieval.

    Returns:
        The ``vidore/colpali-v1.2`` checkpoint loaded in bfloat16 and mapped
        to the device returned by ``get_torch_device("auto")``.
    """
    device = get_torch_device("auto")
    return ColPali.from_pretrained(
        "vidore/colpali-v1.2",
        torch_dtype=torch.bfloat16,
        device_map=device,
    )
def embed_page_image(model: ColPali, image: Image.Image) -> torch.Tensor:
    """Embed a document page image using ColPali.

    Args:
        model: A loaded ColPali model.
        image: The page rendered as a PIL image.

    Returns:
        The model output for the single-image batch.
    """
    from colpali_engine.utils.torch_utils import ListDataset
    from torch.utils.data import DataLoader

    # NOTE(review): this assumes the model exposes a ``processor`` attribute
    # and that ``ListDataset`` accepts ``transform`` — confirm against the
    # installed colpali-engine version, which may ship the processor as a
    # separate ``ColPaliProcessor`` object.
    dataset = ListDataset([image], transform=model.processor.process_images)
    loader = DataLoader(dataset, batch_size=1)

    with torch.no_grad():
        # Exactly one batch is produced because the dataset holds one image.
        for batch in loader:
            batch = {k: v.to(model.device) for k, v in batch.items()}
            page_embeddings = model(**batch)

    return page_embeddings  # (1, n_patches, d_model)
# ColPali advantages:
# - No OCR required — processes page images directly
# - Excellent at text-heavy document pages
# - Handles complex layouts naturally (it "sees" the page)
# - State-of-the-art on DocVQA and ViDoRe benchmarks
Combining Text and Image Retrieval
Python
class MultimodalRetriever:
    """Combines text and image retrieval for mixed-media knowledge bases."""

    def __init__(self, text_retriever, clip_retriever: "CLIPRetriever"):
        """Store the two retrievers.

        Args:
            text_retriever: Object exposing ``retrieve(query_embedding, top_k=...)``.
            clip_retriever: Image retriever exposing ``search_by_text(query, top_k=...)``.
        """
        self.text_retriever = text_retriever
        self.clip_retriever = clip_retriever

    def retrieve(
        self,
        query: str,
        query_embedding: list[float],
        top_k: int = 5,
        image_weight: float = 0.3,  # Weight for image results
        text_weight: float = 0.7,
    ) -> dict:
        """Retrieve from both the text and the image index.

        The two result lists are returned side by side; no score fusion or
        re-ranking is performed. ``image_weight`` and ``text_weight`` are
        accepted but currently unused, so they do not affect the output.

        Args:
            query: Raw query string, used for the CLIP image search.
            query_embedding: Pre-computed embedding of the query, passed to
                the text retriever.
            top_k: Number of results requested from each retriever.
            image_weight: Reserved weight for image results (unused).
            text_weight: Reserved weight for text results (unused).

        Returns:
            ``{"text_results": ..., "image_results": ...}`` holding whatever
            each retriever returned.
        """
        # Text retrieval runs first, then image retrieval using CLIP.
        return {
            "text_results": self.text_retriever.retrieve(query_embedding, top_k=top_k),
            "image_results": self.clip_retriever.search_by_text(query, top_k=top_k),
        }
def answer_multimodal_question(
    question: str,
    text_results: list[dict],
    image_results: list[dict],
) -> str:
    """Generate an answer using both text and image context.

    Args:
        question: The user's question.
        text_results: Retrieved text documents; each dict must carry a
            ``"content"`` key. Only the first three are used.
        image_results: Retrieved images; each dict must carry an
            ``"image_path"`` key. Only the first two are used.

    Returns:
        The content of the model's reply.

    Raises:
        OSError: If one of the image files cannot be read.
    """
    # Build text context
    text_context = "\n\n".join([
        f"[Text Document {i+1}]: {result['content']}"
        for i, result in enumerate(text_results[:3])
    ])

    # Build image content blocks
    image_content = []
    for i, img_result in enumerate(image_results[:2]):
        image_path = img_result["image_path"]
        with open(image_path, "rb") as f:
            img_data = base64.b64encode(f.read()).decode("utf-8")

        # Label the data URL with the file's actual format instead of always
        # claiming JPEG; ".jpg" and extension-less paths map to "jpeg".
        ext = Path(image_path).suffix.lower().lstrip(".")
        if ext in ("", "jpg"):
            ext = "jpeg"

        image_content.append({"type": "text", "text": f"[Image {i+1}]:"})
        image_content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/{ext};base64,{img_data}",
                "detail": "high",
            },
        })

    # Generate answer using all context
    messages = [
        {
            "role": "system",
            "content": "Answer using the provided text documents and images. Cite your sources.",
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": f"Context:\n{text_context}\n\nAdditional visual context:"},
                *image_content,
                {"type": "text", "text": f"\nQuestion: {question}"},
            ],
        },
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message.content
# Figure Extraction from PDFs
Extract figures from PDFs for separate indexing:
Python
import fitz # PyMuPDF
def extract_figures_from_pdf(
    pdf_path: str,
    output_dir: str,
    min_width: int = 100,
    min_height: int = 100,
) -> list[dict]:
    """Extract embedded images from a PDF.

    Each image that is at least ``min_width`` x ``min_height`` pixels is
    written to ``output_dir`` as ``figure_p<page>_i<index>.<ext>``.

    Args:
        pdf_path: Path of the PDF to scan.
        output_dir: Directory for the extracted files; created if missing.
        min_width: Images narrower than this are skipped.
        min_height: Images shorter than this are skipped.

    Returns:
        One dict per saved figure with the keys ``path``, ``page``
        (1-based), ``width`` and ``height``.
    """
    import os

    os.makedirs(output_dir, exist_ok=True)
    figures = []
    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc):
            image_list = page.get_images(full=True)
            for img_idx, img in enumerate(image_list):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                width = base_image["width"]
                height = base_image["height"]
                if width < min_width or height < min_height:
                    continue  # Skip icons and decorative elements

                # Save figure
                figure_path = os.path.join(
                    output_dir,
                    f"figure_p{page_num+1}_i{img_idx}.{base_image['ext']}",
                )
                with open(figure_path, "wb") as f:
                    f.write(image_bytes)
                figures.append({
                    "path": figure_path,
                    "page": page_num + 1,
                    "width": width,
                    "height": height,
                })
    finally:
        # Close the document even when extraction or a file write fails.
        doc.close()
    return figures
# Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.