Multimodal RAG: Images and Documents

Why Multimodal RAG

Many domain knowledge bases contain valuable information in non-text formats:

Clinical: ECG tracings, X-rays, lab result tables as images
Scientific: charts, diagrams, figures in research papers
Technical: schematics, flowcharts, screenshots

A text-only RAG system ignores all this visual information. Multimodal RAG indexes and retrieves from images as well as text.

Approach 1: Caption-Based Indexing

Generate text descriptions of each image with a vision-language model, then index those descriptions like any other text:

Python

from openai import OpenAI
import base64
from pathlib import Path
from dataclasses import dataclass

# Shared module-level OpenAI client; generate_image_caption and
# answer_multimodal_question both send their chat-completion requests through it.
client = OpenAI()

@dataclass
class ImageDocument:
    """An image paired with model-generated text so it can be indexed for RAG.

    Instances are built by ``ingest_image_document`` from the dict returned by
    ``generate_image_caption``.
    """
    # Caller-supplied document identifier.
    id: str
    # Path of the source image file.
    image_path: str
    # Short caption intended for search indexing.
    caption: str
    # Longer description of the content visible in the image.
    detailed_description: str
    image_type: str       # "chart", "diagram", "photo", "screenshot", "table"; ingestion falls back to "other"
    extracted_text: str   # text visible in the image, as transcribed by the captioning model
    # Arbitrary caller-supplied metadata.
    metadata: dict


def generate_image_caption(
    image_path: str,
    context: str = "",
) -> dict:
    """Generate a searchable caption and detailed description for an image.

    The image is read from disk, base64-encoded into a ``data:`` URL and sent
    to ``gpt-4o`` together with ``context``. The model is asked to answer with
    a JSON object, which is parsed and returned.

    Args:
        image_path: Path to the image file. The media type of the data URL is
            derived from the file extension.
        context: Optional free-text hint about the image; it is included
            verbatim in the user message.

    Returns:
        The parsed JSON object. The prompt requests the keys ``caption``,
        ``detailed_description``, ``extracted_text`` and ``image_type``, but
        the reply is not validated here, so callers should not assume every
        key is present.

    Raises:
        OSError: If the image file cannot be read.
        ValueError: If the model returns no content, or content that is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    import json

    # Extensions whose registered image subtype differs from the extension.
    subtype_aliases = {"jpg": "jpeg", "jpe": "jpeg", "tif": "tiff", "svg": "svg+xml"}

    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode("utf-8")

    ext = Path(image_path).suffix.lower().lstrip(".")
    media_type = f"image/{subtype_aliases.get(ext, ext)}"

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": """You analyze images for a medical knowledge base.
Provide four things:
1. A concise caption (1-2 sentences) suitable for search indexing
2. A detailed description (3-5 sentences) of all content visible
3. Any text visible in the image (transcribe exactly)
4. Image type classification

Return JSON:
{
  "caption": "...",
  "detailed_description": "...",
  "extracted_text": "...",
  "image_type": "chart|diagram|photo|screenshot|table|other"
}""",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{image_data}",
                            "detail": "high",
                        },
                    },
                    {
                        "type": "text",
                        "text": f"Context: {context}\n\nAnalyze this image for medical knowledge base indexing.",
                    },
                ],
            },
        ],
        # Ask for a JSON object so the reply can be parsed directly.
        response_format={"type": "json_object"},
        temperature=0,
    )

    content = response.choices[0].message.content
    if not content:
        # Without this guard json.loads(None) would fail with an opaque TypeError.
        raise ValueError(f"Model returned no content for image: {image_path}")
    return json.loads(content)


def ingest_image_document(
    image_path: str,
    document_id: str,
    context: str = "",
    metadata: dict = None,
) -> ImageDocument:
    """Process an image into a searchable document.

    Calls ``generate_image_caption`` and copies its fields into an
    ``ImageDocument``. To embed the result as a single string, join
    ``caption``, ``detailed_description`` and ``extracted_text``.

    Args:
        image_path: Path to the image file to caption.
        document_id: Identifier stored as the document's ``id``.
        context: Optional hint forwarded to ``generate_image_caption``.
        metadata: Optional metadata stored on the document; an empty dict is
            used when omitted.

    Returns:
        An ``ImageDocument`` whose text fields are empty strings (and whose
        ``image_type`` is ``"other"``) wherever the caption data lacks a value.
    """
    caption_data = generate_image_caption(image_path, context=context)

    def field_or(key: str, default: str) -> str:
        # A missing key and an explicit JSON null are treated the same way, so
        # the dataclass's ``str`` fields never end up holding None.
        value = caption_data.get(key)
        return default if value is None else value

    return ImageDocument(
        id=document_id,
        image_path=image_path,
        caption=field_or("caption", ""),
        detailed_description=field_or("detailed_description", ""),
        image_type=field_or("image_type", "other"),
        extracted_text=field_or("extracted_text", ""),
        metadata=metadata or {},
    )

Approach 2: CLIP Embeddings for Native Image Search

Embed images and queries in the same semantic space using CLIP:

Python

from transformers import CLIPModel, CLIPProcessor
import torch
import torch.nn.functional as F
from PIL import Image
import numpy as np

class CLIPRetriever:
    """Image retrieval using CLIP's shared vision-language embedding space.

    Images are embedded once when added and kept in memory next to their path
    and metadata. A query (text or image) is embedded with the same model and
    ranked by dot product; because every embedding is L2-normalized, that dot
    product is the cosine similarity.
    """

    def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
        """Load the CLIP model and processor and start with an empty index."""
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model.eval()

        # Parallel lists: image_embeddings[i] belongs to image_documents[i].
        self.image_embeddings: list[np.ndarray] = []
        self.image_documents: list[dict] = []

    def embed_image(self, image: Image.Image) -> np.ndarray:
        """Embed a single image using CLIP's vision encoder.

        Returns the L2-normalized feature vector of the first (only) batch row.
        """
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            features = self.model.get_image_features(**inputs)
        return F.normalize(features, dim=-1).numpy()[0]

    def embed_text(self, text: str) -> np.ndarray:
        """Embed a text query using CLIP's text encoder.

        Returns the L2-normalized feature vector of the first (only) batch row.
        """
        inputs = self.processor(text=[text], return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            features = self.model.get_text_features(**inputs)
        return F.normalize(features, dim=-1).numpy()[0]

    def add_image(self, image_path: str, metadata: dict = None) -> None:
        """Add an image to the retrieval index.

        Args:
            image_path: Path of the image file to embed.
            metadata: Optional metadata returned with search hits; an empty
                dict is stored when omitted.
        """
        # convert() produces an independent image, so the file handle can be
        # released as soon as the block exits.
        with Image.open(image_path) as raw:
            image = raw.convert("RGB")
        embedding = self.embed_image(image)

        self.image_embeddings.append(embedding)
        self.image_documents.append({
            "image_path": image_path,
            "metadata": metadata or {},
        })

    def _rank(self, query_embedding: np.ndarray, top_k: int) -> list[dict]:
        """Return up to ``top_k`` indexed images, most similar first."""
        if not self.image_embeddings or top_k <= 0:
            return []

        image_matrix = np.stack(self.image_embeddings)

        # Cosine similarity (embeddings are normalized)
        similarities = image_matrix @ query_embedding
        top_indices = np.argsort(-similarities)[:top_k]

        return [
            {
                "image_path": self.image_documents[i]["image_path"],
                "metadata": self.image_documents[i]["metadata"],
                "similarity": float(similarities[i]),
            }
            for i in top_indices
        ]

    def search_by_text(self, query: str, top_k: int = 5) -> list[dict]:
        """Find images most similar to a text query.

        Returns a list of dicts with ``image_path``, ``metadata`` and
        ``similarity``, best match first; empty when nothing is indexed.
        """
        if not self.image_embeddings:
            # Skip the model call entirely when there is nothing to rank.
            return []
        return self._rank(self.embed_text(query), top_k)

    def search_by_image(self, query_image_path: str, top_k: int = 5) -> list[dict]:
        """Find similar images to a query image.

        Returns a list of dicts with ``image_path``, ``metadata`` and
        ``similarity``, best match first; empty when nothing is indexed.
        """
        if not self.image_embeddings:
            # np.stack rejects an empty list, so an empty index is handled
            # here, the same way search_by_text handles it.
            return []
        with Image.open(query_image_path) as raw:
            query_image = raw.convert("RGB")
        return self._rank(self.embed_image(query_image), top_k)

Approach 3: Late Interaction (ColPali)

ColPali embeds document page images directly and retrieves them with late interaction, matching each query token embedding against the page's patch embeddings, so no OCR step is needed:

Python

# ColPali: Efficient Document Retrieval with Vision Language Models
# pip install colpali-engine

from colpali_engine.models import ColPali
from colpali_engine.utils.torch_utils import get_torch_device
from PIL import Image
import torch

def load_colpali_model(model_name: str = "vidore/colpali-v1.2") -> ColPali:
    """Load a ColPali checkpoint for visual document retrieval.

    Args:
        model_name: Checkpoint identifier passed to ``ColPali.from_pretrained``.
            Defaults to ``"vidore/colpali-v1.2"``, so calling without
            arguments loads the same checkpoint as before.

    Returns:
        The model loaded with ``torch.bfloat16`` weights and placed via
        ``device_map`` on the device returned by ``get_torch_device("auto")``.
    """
    device = get_torch_device("auto")
    return ColPali.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map=device,
    )


def embed_page_image(model: ColPali, image: Image.Image) -> torch.Tensor:
    """Embed a document page image using ColPali.

    Wraps ``image`` in a one-element dataset, runs it through ``model`` under
    ``torch.no_grad()`` and returns the model's output for that batch.

    NOTE(review): this relies on ``model.processor`` being available and on
    ``ListDataset`` accepting a ``transform`` argument; neither is visible in
    this file -- confirm against the installed colpali-engine version.
    """
    from colpali_engine.utils.torch_utils import ListDataset
    from torch.utils.data import DataLoader

    # Single-item dataset; presumably each item is preprocessed by
    # process_images when it is fetched -- TODO confirm.
    dataset = ListDataset([image], transform=model.processor.process_images)
    loader = DataLoader(dataset, batch_size=1)

    with torch.no_grad():
        # One image with batch_size=1 should yield exactly one batch, so
        # page_embeddings is assigned once.
        for batch in loader:
            # Move every tensor in the batch to the model's device before the forward pass.
            batch = {k: v.to(model.device) for k, v in batch.items()}
            page_embeddings = model(**batch)

    return page_embeddings  # expected shape (1, n_patches, d_model) -- TODO confirm


# ColPali advantages:
# - No OCR required — processes page images directly
# - Excellent at text-heavy document pages
# - Handles complex layouts naturally (it "sees" the page)
# - State-of-the-art on DocVQA and ViDoRe benchmarks

Combining Text and Image Retrieval

Python

class MultimodalRetriever:
    """Query a text index and a CLIP image index side by side.

    The two result lists are returned separately; no score fusion or
    re-ranking is performed by this class.
    """

    def __init__(self, text_retriever, clip_retriever: CLIPRetriever):
        """Keep references to the text retriever and the CLIP image retriever."""
        self.text_retriever = text_retriever
        self.clip_retriever = clip_retriever

    def retrieve(
        self,
        query: str,
        query_embedding: list[float],
        top_k: int = 5,
        image_weight: float = 0.3,
        text_weight: float = 0.7,
    ) -> dict:
        """Run the query against both indices and return both hit lists.

        Args:
            query: Raw query text, passed to the CLIP retriever's
                ``search_by_text``.
            query_embedding: Query vector, passed to the text retriever's
                ``retrieve``.
            top_k: Number of hits requested from each retriever.
            image_weight: Accepted but currently unused.
            text_weight: Accepted but currently unused.

        Returns:
            A dict with ``"text_results"`` and ``"image_results"``, each
            holding whatever the corresponding retriever returned.
        """
        # The text index is queried first, then the image index.
        text_hits = self.text_retriever.retrieve(query_embedding, top_k=top_k)
        image_hits = self.clip_retriever.search_by_text(query, top_k=top_k)
        return dict(text_results=text_hits, image_results=image_hits)


def _image_data_url(image_path: str) -> str:
    """Return the file at ``image_path`` as a base64 ``data:`` URL."""
    # Extensions whose registered image subtype differs from the extension.
    subtype_aliases = {"jpg": "jpeg", "jpe": "jpeg", "tif": "tiff", "svg": "svg+xml"}
    ext = Path(image_path).suffix.lower().lstrip(".")
    # With no extension, fall back to JPEG, which is what this code used to
    # declare for every image.
    subtype = subtype_aliases.get(ext, ext) or "jpeg"

    with open(image_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    return f"data:image/{subtype};base64,{encoded}"


def answer_multimodal_question(
    question: str,
    text_results: list[dict],
    image_results: list[dict],
    max_text_docs: int = 3,
    max_images: int = 2,
) -> str:
    """Generate an answer using both text and image context.

    Args:
        question: The user's question.
        text_results: Text hits; each must have a ``"content"`` key.
        image_results: Image hits; each must have an ``"image_path"`` key
            pointing at a readable file.
        max_text_docs: How many leading text hits to include (default 3).
        max_images: How many leading image hits to attach (default 2).

    Returns:
        The model's answer text, or an empty string if the model returned no
        content.

    Raises:
        KeyError: If a used hit lacks ``"content"`` / ``"image_path"``.
        OSError: If an image file cannot be read.
    """
    # Build text context
    text_context = "\n\n".join(
        f"[Text Document {i+1}]: {result['content']}"
        for i, result in enumerate(text_results[:max_text_docs])
    )

    # Build image content blocks: a text label followed by the image itself,
    # so the model can refer to images by number.
    image_content = []
    for i, img_result in enumerate(image_results[:max_images]):
        image_content.append({"type": "text", "text": f"[Image {i+1}]:"})
        image_content.append({
            "type": "image_url",
            "image_url": {
                # Media type follows the file extension instead of always
                # claiming JPEG.
                "url": _image_data_url(img_result["image_path"]),
                "detail": "high",
            },
        })

    # Generate answer using all context
    messages = [
        {
            "role": "system",
            "content": "Answer using the provided text documents and images. Cite your sources.",
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": f"Context:\n{text_context}\n\nAdditional visual context:"},
                *image_content,
                {"type": "text", "text": f"\nQuestion: {question}"},
            ],
        },
    ]

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0,
    )
    # content can be None; honour the declared ``str`` return type.
    return response.choices[0].message.content or ""

Figure Extraction from PDFs

Extract figures from PDFs for separate indexing:

Python

import fitz  # PyMuPDF

def extract_figures_from_pdf(
    pdf_path: str,
    output_dir: str,
    min_width: int = 100,
    min_height: int = 100,
) -> list[dict]:
    """Extract embedded images from a PDF.

    Every embedded image at least ``min_width`` x ``min_height`` pixels is
    written to ``output_dir`` as ``figure_p<page>_i<index>.<ext>``.

    Args:
        pdf_path: Path of the PDF to read.
        output_dir: Directory for the extracted files; created if missing.
        min_width: Images narrower than this are skipped.
        min_height: Images shorter than this are skipped.

    Returns:
        One dict per saved figure with ``path``, ``page`` (1-based),
        ``width`` and ``height``.
    """
    import os
    os.makedirs(output_dir, exist_ok=True)

    figures = []
    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc, start=1):
            for img_idx, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                if not base_image:
                    # Nothing could be extracted for this xref; skip it rather
                    # than fail on the missing keys below.
                    continue

                width = base_image["width"]
                height = base_image["height"]
                if width < min_width or height < min_height:
                    continue  # Skip icons and decorative elements

                # Save figure
                figure_path = os.path.join(
                    output_dir,
                    f"figure_p{page_num}_i{img_idx}.{base_image['ext']}",
                )
                with open(figure_path, "wb") as f:
                    f.write(base_image["image"])

                figures.append({
                    "path": figure_path,
                    "page": page_num,
                    "width": width,
                    "height": height,
                })
    finally:
        # Close the document even if extraction or a file write fails.
        doc.close()

    return figures

Multimodal RAG: Images and Documents

Why Multimodal RAG

Approach 1: Caption-Based Indexing

Approach 2: CLIP Embeddings for Native Image Search

Approach 3: Late Interaction (ColPali)

Combining Text and Image Retrieval

Figure Extraction from PDFs

Enjoyed this article?

Leave a comment