Multimodal LLMs: Vision, Audio, and Beyond

How Multimodal Models Work

A text-only LLM processes token sequences. A multimodal LLM processes sequences of tokens drawn from multiple modalities. The key challenge: images, audio, and video must be converted to representations that can be interleaved with text tokens in the same sequence.

The general approach:

1. A modality-specific encoder converts raw input (pixels, audio samples) to a sequence of vectors.
2. A projection layer maps these vectors into the LLM's embedding space.
3. Interleaving places the image tokens between the text tokens.
4. Causal attention processes the combined sequence as usual.

The LLM backbone doesn't change — it still processes a 1D sequence of embeddings. Only the inputs change.

Vision Language Models: Architecture

Python

from typing import Optional

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, CLIPVisionModel, CLIPImageProcessor

class VisionLanguageModel(nn.Module):
    """
    Simplified vision-language model architecture.
    Combines a vision encoder (CLIP) with an LLM backbone.

    Images are encoded to patch embeddings, projected into the LLM's
    embedding space, and written over the embeddings of the image
    placeholder tokens found in ``input_ids``.
    """

    # Label value skipped by the Hugging Face causal-LM loss.
    IGNORE_INDEX = -100

    def __init__(self, vision_model_name: str, llm_name: str):
        """
        Load the pretrained vision encoder and LLM and build the projector.

        vision_model_name: checkpoint name/path passed to CLIP's from_pretrained
        llm_name: checkpoint name/path passed to AutoModelForCausalLM.from_pretrained
        """
        super().__init__()

        # Vision encoder: converts images to patch embeddings
        self.vision_encoder = CLIPVisionModel.from_pretrained(vision_model_name)
        self.image_processor = CLIPImageProcessor.from_pretrained(vision_model_name)

        # LLM backbone
        self.llm = AutoModelForCausalLM.from_pretrained(llm_name)

        # Projection layer: maps vision features (d_vision) to LLM space (d_llm)
        d_vision = self.vision_encoder.config.hidden_size    # e.g., 1024 for CLIP-L
        d_llm = self.llm.config.hidden_size                  # e.g., 4096 for 7B LLM

        # Two-layer MLP projector (a single Linear also works, but aligns worse)
        self.projector = nn.Sequential(
            nn.Linear(d_vision, d_llm),
            nn.GELU(),
            nn.Linear(d_llm, d_llm),
        )

    def encode_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Encode image into a sequence of vectors in LLM embedding space.
        pixel_values: (B, 3, H, W) — preprocessed image tensor
        Returns: (B, n_patches, d_llm) — image patch embeddings
        """
        # The vision encoder runs without gradient tracking, so only the
        # projector (and the LLM) receive gradients through this path.
        with torch.no_grad():
            vision_output = self.vision_encoder(pixel_values=pixel_values)

        # Use patch embeddings (not CLS token — we want spatial detail)
        # vision_output.last_hidden_state: (B, n_patches + 1, d_vision)
        patch_embeddings = vision_output.last_hidden_state[:, 1:, :]  # Skip CLS

        # Project to LLM space: (B, n_patches, d_llm)
        return self.projector(patch_embeddings)

    def forward(
        self,
        input_ids: torch.Tensor,         # Text tokens
        pixel_values: torch.Tensor,       # Image pixels
        image_token_id: int = 32000,     # Special token marking image position
        labels: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        """
        Forward pass with interleaved image and text tokens.

        input_ids: (B, T) token ids; image slots hold ``image_token_id``
        pixel_values: (B, 3, H, W) preprocessed images, one per batch row
        image_token_id: placeholder id marking where patch embeddings go
        labels: optional (B, T) targets; image slots are excluded from the loss
        attention_mask: optional (B, T) mask forwarded to the LLM so padded
            positions are not attended to
        Returns: whatever the wrapped LLM returns for ``inputs_embeds``
            (a model output object, not a bare tensor).
        """
        # Find image token positions first so they can be kept out of the
        # embedding lookup: the placeholder id may lie outside the LLM's
        # vocabulary (e.g. 32000 with a 32000-token vocab), which would
        # otherwise raise an IndexError.
        image_positions = (input_ids == image_token_id)
        safe_ids = input_ids.masked_fill(image_positions, 0)

        # Get text embeddings from LLM's embedding table
        text_embeddings = self.llm.get_input_embeddings()(safe_ids)

        # Encode image to patch embeddings in LLM space: (B, n_patches, d_llm)
        image_embeddings = self.encode_image(pixel_values)
        # Indexed assignment requires matching dtype/device (e.g. fp16 LLM
        # with an fp32 projector).
        image_embeddings = image_embeddings.to(
            dtype=text_embeddings.dtype, device=text_embeddings.device
        )

        # Create the interleaved sequence
        combined_embeddings = text_embeddings.clone()
        for b in range(input_ids.shape[0]):
            pos = image_positions[b].nonzero(as_tuple=True)[0]
            if pos.numel() > 0:
                # Replace image placeholder tokens with visual patch embeddings
                # (simplified — real implementation handles variable-length images)
                n_patches = min(image_embeddings.shape[1], pos.numel())
                combined_embeddings[b, pos[:n_patches]] = image_embeddings[b, :n_patches]

        # Never train the LLM to "predict" image placeholder ids.
        # masked_fill returns a new tensor, so the caller's labels are untouched.
        if labels is not None:
            labels = labels.masked_fill(image_positions, self.IGNORE_INDEX)

        # Run LLM on combined sequence
        return self.llm(
            inputs_embeds=combined_embeddings,
            attention_mask=attention_mask,
            labels=labels,
        )

Image Tokenization Strategies

Different VLMs handle images differently:

Python

class ImageTokenizationStrategy:
    """Different approaches to converting images to token sequences."""

    @staticmethod
    def fixed_patch_grid(
        image: torch.Tensor,
        patch_size: int = 14,
    ) -> torch.Tensor:
        """
        Split image into fixed NxN grid of patches (ViT/CLIP approach).
        224x224 image with 14x14 patches = 256 patch tokens.

        image: (B, C, H, W) tensor; H and W must be multiples of patch_size
        patch_size: side length of each square patch, in pixels
        Returns: (B, n_patches, C * patch_size * patch_size), patches in
            row-major order (left-to-right, then top-to-bottom)
        Raises: ValueError if patch_size is not positive or does not evenly
            divide H and W
        """
        B, C, H, W = image.shape
        if patch_size <= 0:
            raise ValueError(f"patch_size must be positive, got {patch_size}")
        if H % patch_size or W % patch_size:
            # Without this check the reshape below fails with an opaque
            # "shape is invalid for input of size" error.
            raise ValueError(
                f"image size {H}x{W} is not divisible by patch_size {patch_size}"
            )
        n_h = H // patch_size
        n_w = W // patch_size

        # Split H into (n_h, patch) and W into (n_w, patch), then bring the
        # two grid axes forward so each patch's pixels are contiguous.
        patches = image.reshape(B, C, n_h, patch_size, n_w, patch_size)
        patches = patches.permute(0, 2, 4, 1, 3, 5)  # (B, n_h, n_w, C, patch, patch)
        patches = patches.reshape(B, n_h * n_w, C * patch_size * patch_size)
        return patches

    @staticmethod
    def dynamic_resolution(
        image: torch.Tensor,
        target_tokens: int = 256,
        min_patches: int = 1,
        max_patches: int = 6,
    ) -> tuple[torch.Tensor, tuple[int, int]]:
        """
        InternVL/LLaVA-Next approach: select optimal tile configuration.
        Handles high-resolution images by splitting into multiple tiles.

        image: (B, C, H, W) tensor
        target_tokens: currently unused; kept for interface compatibility
        min_patches: minimum number of tiles (n_rows * n_cols) allowed
        max_patches: maximum number of tiles (n_rows * n_cols) allowed
        Returns: (image, (n_rows, n_cols)) — the image is returned unchanged;
            the grid is the allowed tiling whose n_cols / n_rows ratio is
            closest to the image's W / H. Ties go to the first candidate in
            row-then-column iteration order.
        Raises: ValueError if no tiling satisfies the min/max tile bounds
        """
        _, _, H, W = image.shape
        aspect_ratio = W / H

        # Find best tiling that preserves aspect ratio
        best_config = None
        best_waste = float('inf')

        for n_rows in range(1, max_patches + 1):
            for n_cols in range(1, max_patches + 1):
                n_tiles = n_rows * n_cols
                if n_tiles < min_patches or n_tiles > max_patches:
                    continue
                waste = abs(n_cols / n_rows - aspect_ratio)
                if waste < best_waste:
                    best_waste = waste
                    best_config = (n_rows, n_cols)

        if best_config is None:
            raise ValueError(
                f"no tiling uses between {min_patches} and {max_patches} tiles"
            )

        # In practice: resize image, split into tiles, encode each tile separately
        return image, best_config

Using GPT-4V

Python

from openai import OpenAI
import base64
from pathlib import Path

# OpenAI client shared by the image helpers below.
client = OpenAI()

def analyze_medical_image(image_path: str, question: str) -> str:
    """
    Ask the OpenAI vision-capable chat model (gpt-4o) about a medical image.

    The image file is read from disk, base64-encoded, and sent inline as a
    data URL together with ``question``. The system prompt restricts the
    model to describing observations rather than giving diagnoses.

    image_path: path to the image; the MIME subtype is taken from the file
        extension (``jpg`` is sent as ``jpeg``)
    question: the user's question about the image
    Returns: the text content of the first completion choice
    """
    with open(image_path, "rb") as handle:
        encoded = base64.b64encode(handle.read()).decode("utf-8")

    # Derive the MIME subtype from the extension; "jpg" is not a valid subtype.
    suffix = Path(image_path).suffix.lower().lstrip(".")
    subtype = "jpeg" if suffix == "jpg" else suffix
    data_url = f"data:image/{subtype};base64,{encoded}"

    # Spelled out line by line so the whitespace-only second line is visible.
    system_prompt = (
        "You are a medical image analysis assistant supporting licensed clinicians.\n"
        "                \n"
        "IMPORTANT LIMITATIONS:\n"
        "- Image analysis requires clinical correlation — do not make diagnostic conclusions\n"
        "- Describe what you observe in the image, not what diagnosis it implies\n"
        "- Always recommend formal radiologist or specialist review for clinical decisions"
    )

    image_part = {
        "type": "image_url",
        # High detail for medical images
        "image_url": {"url": data_url, "detail": "high"},
    }
    question_part = {"type": "text", "text": question}

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": [image_part, question_part]},
        ],
        max_tokens=1024,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def _image_to_data_url(image_path: str) -> str:
    """Return the image file at ``image_path`` as a base64 ``data:`` URL."""
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    # Unknown extensions fall back to JPEG, the type this function always
    # assumed before the extension was inspected.
    media_type = media_types.get(Path(image_path).suffix.lower(), "image/jpeg")
    with open(image_path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{media_type};base64,{encoded}"


def extract_text_from_image(image_path: str) -> str:
    """
    OCR-like text extraction from an image (prescription, form, label).

    The image is read from disk and sent inline to gpt-4o as a data URL
    whose MIME type matches the file extension.

    image_path: path to the image file to transcribe
    Returns: the text content of the first completion choice
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": _image_to_data_url(image_path),
                            "detail": "high",
                        },
                    },
                    {
                        "type": "text",
                        "text": """Transcribe ALL text visible in this image exactly as written.
Preserve formatting where meaningful (tables, lists).
Return ONLY the transcribed text, nothing else.""",
                    },
                ],
            }
        ],
        temperature=0,
    )
    return response.choices[0].message.content

Claude's Vision API

Python

import anthropic
import base64

# Anthropic client used by analyze_with_claude.
claude_client = anthropic.Anthropic()

def analyze_with_claude(image_path: str, prompt: str) -> str:
    """
    Send an image plus a text prompt to Claude and return its reply.

    image_path: path to the image; PNG, GIF and WebP are recognised by
        extension, everything else is sent as JPEG
    prompt: the instruction or question to accompany the image
    Returns: the text of the first content block in the response
    """
    with open(image_path, "rb") as handle:
        payload = base64.standard_b64encode(handle.read()).decode("utf-8")

    # .jpg/.jpeg and any unrecognised extension are all labelled image/jpeg.
    suffix = Path(image_path).suffix.lower()
    if suffix in (".png", ".gif", ".webp"):
        media_type = f"image/{suffix[1:]}"
    else:
        media_type = "image/jpeg"

    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": media_type,
            "data": payload,
        },
    }
    text_block = {"type": "text", "text": prompt}

    reply = claude_client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[{"role": "user", "content": [image_block, text_block]}],
    )
    first_block = reply.content[0]
    return first_block.text

Multimodal Embeddings (CLIP)

CLIP learns a shared embedding space for images and text — useful for image search:

Python

from transformers import CLIPModel, CLIPProcessor
import torch
import torch.nn.functional as F

# Load the CLIP checkpoint and its matching processor once at import time;
# compute_image_text_similarity reuses both on every call.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def compute_image_text_similarity(
    images: list,     # List of PIL images
    texts: list[str],
) -> torch.Tensor:
    """
    Compute cross-modal similarity between images and text queries.

    images: list of PIL images
    texts: list of text queries
    Returns: (n_images, n_texts) matrix of cosine similarities multiplied by
        the model's learned logit scale. The tensor is detached from
        autograd, so it can be converted with ``.numpy()`` / ``.tolist()``.
    """
    # Process inputs
    inputs = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
    )

    # Everything, including the logit-scale multiply, runs under no_grad:
    # logit_scale is a trainable parameter, and scaling outside this block
    # would return a tensor that requires grad.
    with torch.no_grad():
        outputs = model(**inputs)

        # L2-normalized embeddings in shared space
        image_embeds = F.normalize(outputs.image_embeds, dim=-1)
        text_embeds = F.normalize(outputs.text_embeds, dim=-1)

        # Cosine similarity matrix: (n_images, n_texts)
        similarity = image_embeds @ text_embeds.T
        return similarity * model.logit_scale.exp()

# Example: find which text description matches each image
texts = [
    "An ECG tracing showing atrial fibrillation",
    "A chest X-ray with bilateral infiltrates",
    "A lab report with elevated INR",
]
# similarity = compute_image_text_similarity(images, texts)
# Most likely text for each image: similarity.argmax(dim=1)

Audio and Speech (Whisper Integration)

Python

from openai import OpenAI
import tempfile
import os

# OpenAI client used by the transcription and summary helpers below.
client = OpenAI()

def transcribe_clinical_audio(audio_path: str) -> dict:
    """
    Transcribe a clinical recording (patient consultation, voice notes).

    audio_path: path to the audio file, opened in binary mode and uploaded
    Returns: a dict with a single key, ``"text"``, holding the transcript
    """
    # Fixed request options: English audio, with a prompt that nudges the
    # model toward preserving medical vocabulary.
    request_options = {
        "model": "whisper-1",
        "language": "en",
        "prompt": "This is a clinical consultation. Medical terminology should be preserved accurately.",
    }

    with open(audio_path, "rb") as stream:
        result = client.audio.transcriptions.create(file=stream, **request_options)

    return {"text": result.text}


def voice_to_clinical_summary(audio_path: str) -> str:
    """
    Turn a recorded voice note into structured clinical information.

    The audio is transcribed first, then the transcript is sent to gpt-4o
    with a request to extract medications, symptoms, plan items and
    follow-ups.

    audio_path: path to the voice note audio file
    Returns: the model's reply as a string (the prompt asks for JSON, but
        the reply is returned as-is, unparsed)
    """
    transcript_text = transcribe_clinical_audio(audio_path)["text"]

    system_message = {
        "role": "system",
        "content": "Extract structured clinical information from this voice note transcript.",
    }
    # Assembled line by line; joined with newlines this matches the
    # multi-line prompt exactly.
    user_prompt = "\n".join(
        [
            "Voice note transcript:",
            f"{transcript_text}",
            "",
            "Extract:",
            "1. Medications mentioned (name, dose, frequency)",
            "2. Patient complaints or symptoms",
            "3. Clinical decisions or plan items",
            "4. Follow-up actions required",
            "",
            "Format as structured JSON.",
        ]
    )
    user_message = {"role": "user", "content": user_prompt}

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
        temperature=0,
    )
    return completion.choices[0].message.content

Multimodal RAG: Images in the Knowledge Base

Python

def build_multimodal_document(
    text: str,
    image_paths: list[str],
    document_id: str,
) -> dict:
    """
    Build one knowledge-base entry that makes a document's images searchable.

    Each image is captioned via analyze_with_claude; the captions are then
    appended to the text as numbered figure descriptions so a text-only
    embedding model can retrieve on image content.

    text: the document's original text
    image_paths: paths of the images belonging to the document
    document_id: identifier stored under ``"id"``
    Returns: dict with keys ``id``, ``text``, ``image_paths``,
        ``image_captions`` and ``combined_content`` (text plus captions)
    """
    caption_prompt = (
        "Describe this image in detail. Include all visible text, numbers, and clinical content."
    )
    captions = [analyze_with_claude(path, caption_prompt) for path in image_paths]

    # One "[Figure N description]" paragraph per caption, numbered from 1.
    figure_notes = [
        f"\n\n[Figure {number} description]: {caption}"
        for number, caption in enumerate(captions, start=1)
    ]

    document = {
        "id": document_id,
        "text": text,
        "image_paths": image_paths,
        "image_captions": captions,
    }
    # Used for embedding/retrieval
    document["combined_content"] = text + "".join(figure_notes)
    return document

Multimodal LLMs: Vision, Audio, and Beyond

How Multimodal Models Work

Vision Language Models: Architecture

Image Tokenization Strategies

Using GPT-4V

Claude's Vision API

Multimodal Embeddings (CLIP)

Audio and Speech (Whisper Integration)

Multimodal RAG: Images in the Knowledge Base

Enjoyed this article?

Leave a comment