Multimodal Prompting: Vision and Images

Sending Images to the Model

Modern vision models (GPT-4o, Claude 3.5 Sonnet, Gemini 1.5) accept images alongside text in the same API call:

Python

import anthropic
import base64
from pathlib import Path

# Module-level Anthropic client shared by ask_about_image() below.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment (e.g. ANTHROPIC_API_KEY) — confirm for your deployment.
claude_client = anthropic.Anthropic()

def image_to_base64(image_path: str) -> str:
    """Read a local image file and return its contents as base64 text.

    Args:
        image_path: Path to the file on disk; it is read in binary mode.

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 string.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def ask_about_image(image_path: str, question: str) -> str:
    """Send a local image plus a text question to Claude and return the reply.

    Args:
        image_path: Path to an image file on disk. The media type sent to the
            API is chosen from the file extension (case-insensitive).
        question: Text prompt placed after the image in the user message.

    Returns:
        The text of the first content block in the response.
    """
    image_data = image_to_base64(image_path)

    # Path.suffix keeps the leading dot (".jpg"), so every key must include it.
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    ext = Path(image_path).suffix.lower()
    # Best-effort fallback: unknown or missing extensions are sent as JPEG.
    media_type = media_types.get(ext, "image/jpeg")

    response = claude_client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data,
                        },
                    },
                    {
                        "type": "text",
                        "text": question,
                    },
                ],
            }
        ],
    )
    return response.content[0].text

# OpenAI equivalent
from openai import OpenAI
# Module-level OpenAI client shared by the OpenAI-based helpers in this file.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment (e.g. OPENAI_API_KEY) — confirm for your deployment.
openai_client = OpenAI()

def ask_about_image_openai(image_path: str, question: str, detail: str = "auto") -> str:
    """Send a local image plus a text question to GPT-4o and return the reply.

    Args:
        image_path: Path to an image file on disk. The media type in the data
            URL is derived from the file extension (case-insensitive).
        question: Text prompt placed after the image in the user message.
        detail: 'low' (faster, cheaper), 'high' (more accurate), or 'auto'.

    Returns:
        The message content of the first choice in the completion.
    """
    image_data = image_to_base64(image_path)

    ext = Path(image_path).suffix.lower().lstrip(".")
    # "jpg" is not a valid subtype, and a path without an extension would
    # otherwise yield the malformed media type "image/"; both map to JPEG.
    subtype = ext if ext not in ("", "jpg") else "jpeg"
    media_type = f"image/{subtype}"

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{image_data}",
                            "detail": detail,
                        },
                    },
                    {"type": "text", "text": question},
                ],
            }
        ],
    )
    return response.choices[0].message.content

Structured Data Extraction from Images

Extract typed data from screenshots, forms, and documents:

Python

from pydantic import BaseModel
from typing import Optional
import json

class ExtractedLabResult(BaseModel):
    """One laboratory test result extracted from a lab report image.

    The fields mirror the per-result JSON schema that
    extract_lab_results_from_image() asks the model to return.
    """

    # Test name as requested in the prompt, e.g. 'INR', 'Hemoglobin'.
    test_name: str
    # Numeric result; the unit is carried separately in `unit`.
    value: float
    # Unit string as requested in the prompt, e.g. 'mg/dL', 'g/dL', 'ratio'.
    unit: str
    # Reference range bounds; None when the model returns null for them.
    reference_range_low: Optional[float] = None
    reference_range_high: Optional[float] = None
    # Required boolean supplied by the model; it is not computed here from
    # the value and the reference range.
    is_abnormal: bool
    flag: Optional[str] = None  # "H" for high, "L" for low, "C" for critical

class LabReport(BaseModel):
    """A full lab report: optional header fields plus the list of results.

    Only `results` is required; the header fields default to None.
    """

    patient_id: Optional[str] = None
    # The extraction prompt asks for YYYY-MM-DD, but this is stored as a plain
    # string, so the format is not validated by the model.
    collection_date: Optional[str] = None
    results: list[ExtractedLabResult]
    ordering_provider: Optional[str] = None

def extract_lab_results_from_image(image_path: str) -> LabReport:
    """Extract structured lab results from a lab report image.

    Sends the image to GPT-4o in JSON mode with a schema-describing prompt,
    parses the reply, and validates it as a LabReport.

    Args:
        image_path: Path to the lab report image on disk. The media type in
            the data URL is derived from the file extension.

    Returns:
        A LabReport built from the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        Exception: Whatever LabReport raises when the parsed data does not
            match its fields (presumably a pydantic validation error).
    """

    prompt = """Extract all laboratory test results from this lab report image.

Return ONLY valid JSON matching this schema:
{
  "patient_id": "string or null",
  "collection_date": "YYYY-MM-DD or null",
  "ordering_provider": "string or null",
  "results": [
    {
      "test_name": "string (e.g., 'INR', 'Hemoglobin', 'Creatinine')",
      "value": number,
      "unit": "string (e.g., 'mg/dL', 'g/dL', 'ratio')",
      "reference_range_low": number or null,
      "reference_range_high": number or null,
      "is_abnormal": boolean,
      "flag": "H or L or C or null"
    }
  ]
}

Extract EVERY test result shown. Do not skip any results."""

    # Label the payload with its real type instead of assuming JPEG; "jpg"
    # and a missing extension both map to the "jpeg" subtype.
    ext = Path(image_path).suffix.lower().lstrip(".")
    subtype = ext if ext not in ("", "jpg") else "jpeg"
    image_data = image_to_base64(image_path)

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/{subtype};base64,{image_data}",
                            "detail": "high",  # High detail for text-heavy medical documents
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        # JSON mode plus temperature 0 keeps the reply parseable and repeatable.
        response_format={"type": "json_object"},
        temperature=0,
    )

    data = json.loads(response.choices[0].message.content)
    return LabReport(**data)

# Process a lab report image and print one line per extracted result
lab_report = extract_lab_results_from_image("patient_labs.jpg")
print(f"Extracted {len(lab_report.results)} lab results")
for result in lab_report.results:
    # Append the bracketed flag only when the result carries one.
    if result.flag:
        flag = f" [{result.flag}]"
    else:
        flag = ""
    print(f"  {result.test_name}: {result.value} {result.unit}{flag}")

Chart and Graph Analysis

Python

def analyze_clinical_chart(image_path: str, chart_type: str = "unknown") -> dict:
    """Extract data and insights from a clinical chart or graph image.

    Args:
        image_path: Path to the chart image on disk. The media type in the
            data URL is derived from the file extension.
        chart_type: Free-text description of the chart, interpolated into the
            first line of the prompt.

    Returns:
        The model's JSON reply parsed into a dict. The prompt requests the
        keys chart_type, x_axis, y_axis, time_range, key_values, trend,
        clinical_significance and anomalies, but they are not validated here.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """

    prompt = f"""This is a {chart_type} clinical chart/graph.

Analyze it and provide:
1. CHART_TYPE: What type of chart/graph this is
2. AXES: X-axis label and Y-axis label with units
3. TIME_RANGE: Date/time range shown (if applicable)
4. KEY_VALUES: List the key data points or values visible
5. TRENDS: Describe any trends (increasing, decreasing, stable, cyclical)
6. CLINICAL_SIGNIFICANCE: What this pattern suggests clinically
7. ANOMALIES: Any unusual values, outliers, or notable events marked

Return as JSON:
{{
  "chart_type": "...",
  "x_axis": "...",
  "y_axis": "...",
  "time_range": "... or null",
  "key_values": [{{ "label": "...", "value": "...", "unit": "..." }}],
  "trend": "...",
  "clinical_significance": "...",
  "anomalies": ["..."]
}}"""

    # Label the payload with its real type instead of assuming JPEG; "jpg"
    # and a missing extension both map to the "jpeg" subtype.
    ext = Path(image_path).suffix.lower().lstrip(".")
    subtype = ext if ext not in ("", "jpg") else "jpeg"
    image_data = image_to_base64(image_path)

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/{subtype};base64,{image_data}",
                            "detail": "high",
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        response_format={"type": "json_object"},
        temperature=0,
    )
    return json.loads(response.choices[0].message.content)

# Example: INR trend chart
# analyze_clinical_chart() returns a plain dict parsed from the model's JSON,
# so .get() is used: a key the model omitted prints as None instead of raising.
inr_analysis = analyze_clinical_chart("inr_trend.png", chart_type="INR trend over time")
print(f"Trend: {inr_analysis.get('trend')}")
print(f"Clinical significance: {inr_analysis.get('clinical_significance')}")

Multi-Image Comparison

Compare multiple images in a single prompt:

Python

def compare_images(
    image_paths: list[str],
    comparison_question: str,
) -> str:
    """Compare multiple local images in a single GPT-4o request.

    Each image is preceded by a text label ("Image 1:", "Image 2:", ...) in
    list order so the question can refer to images by number.

    Args:
        image_paths: Paths to the image files on disk. Each image's media
            type is derived from its own file extension.
        comparison_question: Text prompt appended after all images.

    Returns:
        The message content of the first choice in the completion.
    """

    content = []
    for i, path in enumerate(image_paths, 1):
        # Label each payload with its real type instead of assuming JPEG;
        # "jpg" and a missing extension both map to the "jpeg" subtype.
        ext = Path(path).suffix.lower().lstrip(".")
        subtype = ext if ext not in ("", "jpg") else "jpeg"
        content.append({"type": "text", "text": f"Image {i}:"})
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/{subtype};base64,{image_to_base64(path)}",
                "detail": "auto",
            },
        })

    content.append({"type": "text", "text": comparison_question})

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        temperature=0,
    )
    return response.choices[0].message.content

# Compare two ECG tracings
# compare_images() labels the images "Image 1:", "Image 2:", ... in list
# order, which is what the question text below refers to.
comparison = compare_images(
    ["ecg_before.jpg", "ecg_after.jpg"],
    "Compare these two ECG tracings. What changed between Image 1 (baseline) and Image 2 (after medication change)? Focus on rate, rhythm, and any interval changes.",
)
print(comparison)

URL-Based Images (No Base64 Needed)

For images already on the web, pass the URL directly:

Python

def analyze_image_from_url(image_url: str, question: str) -> str:
    """Ask GPT-4o a question about an image referenced by URL.

    Args:
        image_url: URL of the image, passed to the API unchanged.
        question: Text prompt placed after the image in the user message.

    Returns:
        The message content of the first choice in the completion.
    """
    image_part = {
        "type": "image_url",
        "image_url": {"url": image_url, "detail": "auto"},
    }
    question_part = {"type": "text", "text": question}
    user_message = {"role": "user", "content": [image_part, question_part]}

    completion = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

Cost and Performance Considerations

Python

def _high_detail_tokens(width: int, height: int) -> int:
    """Return the estimated token count for an image sent with detail='high'.

    Applies the resize-then-tile rule: fit the image inside a 2048 x 2048
    square, shrink it so the shortest side is at most 768 px, then charge
    170 tokens per 512 x 512 tile plus a fixed 85 tokens.
    """
    # Step 1: fit inside 2048 x 2048, preserving aspect ratio. Integer math
    # avoids float rounding at tile boundaries; clamp so a very thin image
    # never collapses to a zero-pixel side.
    longest = max(width, height)
    if longest > 2048:
        width, height = (
            max(1, width * 2048 // longest),
            max(1, height * 2048 // longest),
        )

    # Step 2: bring the shortest side down to 768 px.
    # NOTE(review): smaller images are left as-is (scale down only) — confirm
    # against the current provider documentation.
    shortest = min(width, height)
    if shortest > 768:
        width, height = (
            max(1, width * 768 // shortest),
            max(1, height * 768 // shortest),
        )

    tiles = ((width + 511) // 512) * ((height + 511) // 512)
    return 85 + 170 * tiles


def estimate_image_cost(
    image_path: str,
    detail: str = "auto",
    model: str = "gpt-4o",
) -> dict:
    """Estimate the input-token count and dollar cost of sending an image.

    Args:
        image_path: Path to the image on disk; only its dimensions are read.
        detail: 'low' (flat 85 tokens), 'high' (tiled estimate), or any other
            value, which is treated as 'auto'.
        model: Accepted for interface compatibility. It is not currently used:
            pricing is fixed at the GPT-4o input rate below.

    Returns:
        A dict with keys image_size ("WxH"), detail_mode, estimated_tokens
        and estimated_cost_usd.
    """
    from PIL import Image

    # The context manager closes the underlying file once the size is read.
    with Image.open(image_path) as img:
        width, height = img.size

    if detail == "low":
        # Fixed cost: 85 tokens
        image_tokens = 85
    elif detail == "high" or width > 512 or height > 512:
        # 'auto' is modelled as: high detail when either side exceeds 512 px.
        # NOTE(review): this selection rule is this file's heuristic.
        image_tokens = _high_detail_tokens(width, height)
    else:
        image_tokens = 85

    # gpt-4o: $2.50 per 1M input tokens
    cost_per_token = 2.50 / 1_000_000
    estimated_cost = image_tokens * cost_per_token

    return {
        "image_size": f"{width}x{height}",
        "detail_mode": detail,
        "estimated_tokens": image_tokens,
        "estimated_cost_usd": estimated_cost,
    }

# Use 'low' detail for documents where exact pixel-level accuracy isn't needed
# Use 'high' for medical images, charts with small text, or dense data tables

Vision Prompt Best Practices

Be specific about what to look at:

# Vague
"Analyze this image."

# Specific
"In this INR trend chart, identify: (1) the date range, (2) the therapeutic range (2.0-3.0) boundaries if shown, (3) any values outside the therapeutic range, and (4) the overall trend direction over the last 30 days."

Ask for structured output:

"Return the extracted data as JSON with fields: test_name, value, unit, reference_range, and flag."

Chain image and text processing:

Python

# Step 1: Extract raw data from image
# (the result is a LabReport model, not raw text)
raw_data = extract_lab_results_from_image("labs.jpg")

# Step 2: Analyze extracted data with text-only prompt (cheaper)
# The report is serialized to JSON and embedded in the prompt, so no image is
# sent in this second call. `analysis` is the raw completion object; the other
# examples in this file read the reply text from `.choices[0].message.content`.
analysis = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=[{
        "role": "user",
        "content": f"Given these lab results for a warfarin patient: {raw_data.model_dump_json()}\nAre any values concerning? What clinical action is indicated?",
    }],
)

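This hybrid approach (vision for extraction, text-only for reasoning) reduces costs: the image is sent only once, and the follow-up reasoning call contains just the extracted JSON, which typically uses fewer tokens than re-sending the image.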

Multimodal Prompting: Vision and Images

Sending Images to the Model

Structured Data Extraction from Images

Chart and Graph Analysis

Multi-Image Comparison

URL-Based Images (No Base64 Needed)

Cost and Performance Considerations

Vision Prompt Best Practices

Enjoyed this article?

Leave a comment