Multimodal LLMs: Vision, Audio, and Beyond
How multimodal LLMs process images, audio, and video alongside text. Vision encoders, cross-modal attention, GPT-4V internals, and building multimodal applications.
How Multimodal Models Work
A text-only LLM processes token sequences. A multimodal LLM processes sequences of tokens drawn from multiple modalities. The key challenge: images, audio, and video must be converted to representations that can be interleaved with text tokens in the same sequence.
The general approach:
- A modality-specific encoder converts raw input (pixels, audio samples) to a sequence of vectors
- Projection layer maps these vectors into the LLM's embedding space
- Interleaving places image tokens between text tokens
- Causal attention processes the combined sequence normally
The LLM backbone doesn't change — it still processes a 1D sequence of embeddings. Only the inputs change.
Vision Language Models: Architecture
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, CLIPVisionModel, CLIPImageProcessor
class VisionLanguageModel(nn.Module):
    """
    Simplified vision-language model architecture.

    Combines a vision encoder (CLIP) with an LLM backbone. Images are turned
    into patch embeddings by the vision encoder, mapped into the LLM's
    embedding space by a small MLP projector, and written over the embeddings
    of placeholder tokens in the text sequence.
    """

    def __init__(self, vision_model_name: str, llm_name: str):
        """
        Load the pretrained components and build the projector.

        vision_model_name: checkpoint passed to CLIPVisionModel / CLIPImageProcessor
        llm_name: checkpoint passed to AutoModelForCausalLM
        """
        super().__init__()
        # Vision encoder: converts images to patch embeddings
        self.vision_encoder = CLIPVisionModel.from_pretrained(vision_model_name)
        # Stored for callers that need to preprocess raw images; not used
        # by the methods of this class.
        self.image_processor = CLIPImageProcessor.from_pretrained(vision_model_name)
        # LLM backbone
        self.llm = AutoModelForCausalLM.from_pretrained(llm_name)
        # Projection layer: maps vision features (d_vision) to LLM space (d_llm)
        d_vision = self.vision_encoder.config.hidden_size  # e.g., 1024 for CLIP-L
        d_llm = self.llm.config.hidden_size  # e.g., 4096 for 7B LLM
        # Two-layer MLP projector (Linear -> GELU -> Linear)
        self.projector = nn.Sequential(
            nn.Linear(d_vision, d_llm),
            nn.GELU(),
            nn.Linear(d_llm, d_llm),
        )

    def encode_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Encode image into a sequence of vectors in LLM embedding space.

        pixel_values: (B, 3, H, W) — preprocessed image tensor
        Returns: (B, n_patches, d_llm) — image patch embeddings
        """
        # The vision encoder runs without gradient tracking; the projector
        # below runs outside this context so it can still receive gradients.
        with torch.no_grad():
            vision_output = self.vision_encoder(pixel_values=pixel_values)
        # Use patch embeddings (not CLS token — we want spatial detail)
        # vision_output.last_hidden_state: (B, n_patches + 1, d_vision)
        patch_embeddings = vision_output.last_hidden_state[:, 1:, :]  # Skip CLS
        # Project to LLM space: (B, n_patches, d_llm)
        return self.projector(patch_embeddings)

    def forward(
        self,
        input_ids: torch.Tensor,  # Text tokens
        pixel_values: torch.Tensor,  # Image pixels
        image_token_id: int = 32000,  # Special token marking image position
        labels: "torch.Tensor | None" = None,
    ):
        """
        Forward pass with interleaved image and text tokens.

        input_ids: (B, T) token ids; positions equal to image_token_id are
            placeholders whose embeddings get overwritten by image patches.
        pixel_values: (B, 3, H, W) preprocessed images, one per batch row.
        image_token_id: placeholder id. It is embedded like any other token
            before being overwritten, so it must be a valid index into the
            LLM's embedding table.
        labels: optional targets forwarded unchanged to the LLM.
        Returns: whatever self.llm returns for inputs_embeds/labels (the
            LLM's output object, not a bare tensor).
        """
        # Get text embeddings from LLM's embedding table
        text_embeddings = self.llm.get_input_embeddings()(input_ids)
        # Encode image to patch embeddings in LLM space: (B, n_patches, d_llm)
        image_embeddings = self.encode_image(pixel_values)
        # Indexed assignment requires matching dtype/device, which is not
        # guaranteed when the LLM and projector use different precisions.
        image_embeddings = image_embeddings.to(
            device=text_embeddings.device, dtype=text_embeddings.dtype
        )
        # Find image token positions and replace with visual embeddings
        image_positions = input_ids == image_token_id
        # Clone so the embedding lookup result is not modified in place
        combined_embeddings = text_embeddings.clone()
        for b in range(input_ids.shape[0]):
            pos = image_positions[b].nonzero(as_tuple=True)[0]
            if pos.numel() == 0:
                continue
            # Simplified: if placeholder and patch counts differ, only the
            # first min(...) of each are paired; the rest are left untouched
            # (extra placeholders) or dropped (extra patches).
            n_patches = min(image_embeddings.shape[1], pos.numel())
            combined_embeddings[b, pos[:n_patches]] = image_embeddings[b, :n_patches]
        # Run LLM on combined sequence
        return self.llm(
            inputs_embeds=combined_embeddings,
            labels=labels,
        )


# Image Tokenization Strategies
Different VLMs handle images differently:
class ImageTokenizationStrategy:
    """Different approaches to converting images to token sequences."""

    @staticmethod
    def fixed_patch_grid(
        image: torch.Tensor,
        patch_size: int = 14,
    ) -> torch.Tensor:
        """
        Split image into a fixed grid of square patches (ViT/CLIP approach).

        224x224 image with 14x14 patches = 256 patch tokens.

        image: (B, C, H, W) tensor
        patch_size: side length of each square patch, must be positive
        Returns: (B, n_patches, C * patch_size * patch_size), patches in
            row-major order over the grid. Trailing rows/columns that do not
            fill a whole patch are dropped.
        Raises: ValueError if patch_size is not positive.
        """
        if patch_size <= 0:
            raise ValueError(f"patch_size must be positive, got {patch_size}")
        B, C, H, W = image.shape
        n_h = H // patch_size
        n_w = W // patch_size
        # Crop to a whole number of patches so the reshape below is valid
        # even when H or W is not a multiple of patch_size.
        image = image[:, :, : n_h * patch_size, : n_w * patch_size]
        # Reshape into patches: (B, n_patches, patch_size*patch_size*C)
        patches = image.reshape(B, C, n_h, patch_size, n_w, patch_size)
        patches = patches.permute(0, 2, 4, 1, 3, 5)  # (B, n_h, n_w, C, patch, patch)
        return patches.reshape(B, n_h * n_w, C * patch_size * patch_size)

    @staticmethod
    def dynamic_resolution(
        image: torch.Tensor,
        target_tokens: int = 256,
        min_patches: int = 1,
        max_patches: int = 6,
    ) -> tuple[torch.Tensor, tuple[int, int]]:
        """
        InternVL/LLaVA-Next approach: select optimal tile configuration.

        Handles high-resolution images by splitting into multiple tiles.
        Picks the (n_rows, n_cols) grid whose column/row ratio is closest to
        the image's width/height ratio, among grids with between min_patches
        and max_patches tiles. Ties go to the grid found first (fewest rows,
        then fewest columns).

        image: (B, C, H, W) tensor
        target_tokens: accepted for interface compatibility; not used here
        min_patches: minimum number of tiles allowed
        max_patches: maximum number of tiles allowed
        Returns: (image, (n_rows, n_cols)); the image is returned unchanged.
        Raises: ValueError if the image height is zero or no tiling satisfies
            the min/max tile bounds.
        """
        _, _, H, W = image.shape
        if H == 0:
            raise ValueError("image height must be non-zero")
        aspect_ratio = W / H
        # Find best tiling that preserves aspect ratio
        best_config = None
        best_waste = float("inf")
        for n_rows in range(1, max_patches + 1):
            # Only column counts that keep the tile total within max_patches
            for n_cols in range(1, max_patches // n_rows + 1):
                if n_rows * n_cols < min_patches:
                    continue
                waste = abs(n_cols / n_rows - aspect_ratio)
                if waste < best_waste:
                    best_waste = waste
                    best_config = (n_rows, n_cols)
        if best_config is None:
            raise ValueError(
                f"no tiling with between {min_patches} and {max_patches} tiles"
            )
        # In practice: resize image, split into tiles, encode each tile separately
        return image, best_config


# Using GPT-4V
from openai import OpenAI
import base64
from pathlib import Path
# Module-level OpenAI client used by the GPT-4o helper functions below.
# Built with no arguments, so credentials/configuration come from the SDK's
# defaults (presumably the OPENAI_API_KEY environment variable — confirm).
client = OpenAI()
def analyze_medical_image(image_path: str, question: str) -> str:
    """
    Analyze a medical image using GPT-4o vision.

    The image is read from disk, base64-encoded, and sent inline as a data
    URL together with the question. The system prompt restricts the model to
    describing observations rather than giving diagnoses.

    image_path: path to the image file; the media type is derived from the
        file extension ("jpg" is normalized to "jpeg", and a path without an
        extension falls back to "jpeg").
    question: text question sent alongside the image.
    Returns: the content of the first choice's message.
    """
    path = Path(image_path)
    image_data = base64.b64encode(path.read_bytes()).decode("utf-8")
    ext = path.suffix.lower().lstrip(".")
    # An empty suffix would otherwise produce the invalid media type "image/"
    if ext in ("", "jpg"):
        ext = "jpeg"
    media_type = f"image/{ext}"
    system_prompt = (
        "You are a medical image analysis assistant supporting licensed clinicians.\n"
        "IMPORTANT LIMITATIONS:\n"
        "- Image analysis requires clinical correlation — do not make diagnostic conclusions\n"
        "- Describe what you observe in the image, not what diagnosis it implies\n"
        "- Always recommend formal radiologist or specialist review for clinical decisions"
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{image_data}",
                            "detail": "high",  # High detail for medical images
                        },
                    },
                    {"type": "text", "text": question},
                ],
            },
        ],
        max_tokens=1024,
        temperature=0,
    )
    return response.choices[0].message.content
def extract_text_from_image(image_path: str) -> str:
    """
    OCR-like text extraction from an image (prescription, form, label).

    The image is read from disk, base64-encoded, and sent inline as a data
    URL with a transcription-only instruction.

    image_path: path to the image file; the media type is derived from the
        file extension ("jpg" is normalized to "jpeg", and a path without an
        extension falls back to "jpeg").
    Returns: the content of the first choice's message.
    """
    path = Path(image_path)
    # Encode inline: no module-level encode_image() helper exists in this file
    image_data = base64.b64encode(path.read_bytes()).decode("utf-8")
    ext = path.suffix.lower().lstrip(".")
    if ext in ("", "jpg"):
        ext = "jpeg"
    media_type = f"image/{ext}"
    instruction = (
        "Transcribe ALL text visible in this image exactly as written.\n"
        "Preserve formatting where meaningful (tables, lists).\n"
        "Return ONLY the transcribed text, nothing else."
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{image_data}",
                            "detail": "high",
                        },
                    },
                    {
                        "type": "text",
                        "text": instruction,
                    },
                ],
            }
        ],
        temperature=0,
    )
    return response.choices[0].message.content


# Claude's Vision API
import anthropic
import base64
# Module-level Anthropic client used by analyze_with_claude below.
# Built with no arguments, so credentials/configuration come from the SDK's
# defaults (presumably the ANTHROPIC_API_KEY environment variable — confirm).
claude_client = anthropic.Anthropic()
def analyze_with_claude(image_path: str, prompt: str) -> str:
    """
    Send one image plus a text prompt to Claude and return its text reply.

    image_path: path to the image file. The media type is looked up from the
        lower-cased extension; unrecognized extensions are sent as image/jpeg.
    prompt: text placed after the image in the same user message.
    Returns: the text of the first content block in the response.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_image = base64.standard_b64encode(raw_bytes).decode("utf-8")

    known_media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    suffix = Path(image_path).suffix.lower()
    media_type = known_media_types.get(suffix, "image/jpeg")

    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": media_type,
            "data": encoded_image,
        },
    }
    text_block = {"type": "text", "text": prompt}

    reply = claude_client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[{"role": "user", "content": [image_block, text_block]}],
    )
    return reply.content[0].text


# Multimodal Embeddings (CLIP)
CLIP learns a shared embedding space for images and text — useful for image search:
from transformers import CLIPModel, CLIPProcessor
import torch
import torch.nn.functional as F
# Load the CLIP checkpoint and its matching processor once at import time;
# compute_image_text_similarity below reads both as module-level globals.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
def compute_image_text_similarity(
    images: list,  # List of PIL images
    texts: list[str],
) -> torch.Tensor:
    """
    Compute cross-modal similarity scores between images and text queries.

    images: list of PIL images
    texts: list of text queries
    Returns: (n_images, n_texts) matrix of cosine similarities multiplied by
        the model's learned logit scale (exp(logit_scale)). The values are
        therefore scaled logits, not raw cosines in [-1, 1]; the ranking
        along each row is the same as for the cosines. The result carries no
        autograd history.
    """
    # Process inputs
    inputs = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
    )
    # Inference only: keep the whole computation, including the multiply by
    # the logit_scale parameter, out of autograd.
    with torch.no_grad():
        outputs = model(**inputs)
        # L2-normalized embeddings in shared space
        image_embeds = F.normalize(outputs.image_embeds, dim=-1)
        text_embeds = F.normalize(outputs.text_embeds, dim=-1)
        # Cosine similarity matrix: (n_images, n_texts)
        similarity = image_embeds @ text_embeds.T
        return similarity * model.logit_scale.exp()
# Example: find which text description matches each image
# Candidate descriptions; every image is scored against each of them.
texts = [
"An ECG tracing showing atrial fibrillation",
"A chest X-ray with bilateral infiltrates",
"A lab report with elevated INR",
]
# similarity = compute_image_text_similarity(images, texts)
# Most likely text for each image: similarity.argmax(dim=1)

# Audio and Speech (Whisper Integration)
from openai import OpenAI
import tempfile
import os
# OpenAI client used by the audio helpers below. This rebinds the same
# module-level name `client` that was already created earlier in this file
# with an identical call.
client = OpenAI()
def transcribe_clinical_audio(audio_path: str) -> dict:
    """
    Transcribe clinical audio (patient consultation, voice notes).

    audio_path: path to the audio file, opened in binary mode and uploaded.
    Returns: a dict with a single key, "text", holding the transcript.
    """
    request_options = {
        "model": "whisper-1",
        "language": "en",
        # Context hint passed to the transcription model
        "prompt": "This is a clinical consultation. Medical terminology should be preserved accurately.",
    }
    with open(audio_path, "rb") as audio_stream:
        result = client.audio.transcriptions.create(
            file=audio_stream,
            **request_options,
        )
    return {"text": result.text}
def voice_to_clinical_summary(audio_path: str) -> str:
    """
    Transcribe a voice note and extract structured clinical information.

    The audio is transcribed with transcribe_clinical_audio, and the
    transcript is then sent to GPT-4o with an extraction prompt.

    audio_path: path to the voice note audio file.
    Returns: the content of the first choice's message (the prompt asks for
        JSON, but the reply is returned as-is without parsing).
    """
    transcript_text = transcribe_clinical_audio(audio_path)["text"]

    system_message = {
        "role": "system",
        "content": "Extract structured clinical information from this voice note transcript.",
    }
    user_prompt = (
        "Voice note transcript:\n"
        f"{transcript_text}\n"
        "Extract:\n"
        "1. Medications mentioned (name, dose, frequency)\n"
        "2. Patient complaints or symptoms\n"
        "3. Clinical decisions or plan items\n"
        "4. Follow-up actions required\n"
        "Format as structured JSON."
    )
    user_message = {"role": "user", "content": user_prompt}

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
        temperature=0,
    )
    return completion.choices[0].message.content


# Multimodal RAG: Images in the Knowledge Base
def build_multimodal_document(
    text: str,
    image_paths: list[str],
    document_id: str,
) -> dict:
    """
    Create a multimodal document entry for a RAG system.

    Images are captioned and the captions are added to the searchable text.

    text: the document's original text.
    image_paths: paths of the images belonging to the document; each one is
        captioned via analyze_with_claude. Any iterable of paths is accepted.
    document_id: identifier stored under "id".
    Returns: dict with keys "id", "text", "image_paths" (a new list),
        "image_captions" (one caption per path, same order) and
        "combined_content" (text followed by one
        "[Figure N description]: ..." paragraph per image, N starting at 1).
    """
    # Materialize once: the paths are needed both for captioning and for the
    # returned record, and a one-shot iterator would otherwise be exhausted.
    image_paths = list(image_paths)
    caption_prompt = (
        "Describe this image in detail. Include all visible text, numbers, and clinical content."
    )
    # Generate captions for all images
    captions = [analyze_with_claude(img_path, caption_prompt) for img_path in image_paths]
    # Combine text and image captions for embedding
    figure_sections = "".join(
        f"\n\n[Figure {number} description]: {caption}"
        for number, caption in enumerate(captions, start=1)
    )
    combined_content = text + figure_sections
    return {
        "id": document_id,
        "text": text,
        "image_paths": image_paths,
        "image_captions": captions,
        "combined_content": combined_content,  # Used for embedding/retrieval
    }


# Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.