Learnixo
Back to blog
AI Systems · Intermediate

Multimodal Prompting: Vision and Images

Prompt LLMs with images, screenshots, and documents using vision APIs. Extract structured data from visual content, analyze charts, and process medical images.

Asma Hafeez Khan · May 16, 2026 · 6 min read
Prompt Engineering · Multimodal · Vision · GPT-4o
Share:𝕏

Sending Images to the Model

Modern vision models (GPT-4o, Claude 3.5 Sonnet, Gemini 1.5) accept images alongside text in the same API call:

Python
import anthropic
import base64
from pathlib import Path

# Module-level Anthropic client, created with default constructor arguments
# and used by ask_about_image below.
claude_client = anthropic.Anthropic()

def image_to_base64(image_path: str) -> str:
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, mode="rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def ask_about_image(image_path: str, question: str) -> str:
    """Send a local image plus a text question to Claude and return the reply.

    Args:
        image_path: Path to the image file; its bytes are base64-encoded and
            sent inline with the request.
        question: Text prompt placed after the image in the same user turn.

    Returns:
        The ``text`` of the first content block in the model's response.
    """
    image_data = image_to_base64(image_path)
    # Path.suffix keeps the leading dot (".jpg"), so every key must carry one.
    # Unrecognized extensions fall back to JPEG.
    ext = Path(image_path).suffix.lower()
    media_type = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }.get(ext, "image/jpeg")

    response = claude_client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data,
                        },
                    },
                    {
                        "type": "text",
                        "text": question,
                    },
                ],
            }
        ],
    )
    return response.content[0].text

# OpenAI equivalent of the Claude helper above.
from openai import OpenAI
# Module-level OpenAI client, created with default constructor arguments and
# reused by the OpenAI-based examples in this article.
openai_client = OpenAI()

def ask_about_image_openai(image_path: str, question: str, detail: str = "auto") -> str:
    """Send a local image plus a text question to GPT-4o and return the reply.

    Args:
        image_path: Path to the image file; its bytes are base64-encoded and
            embedded in a ``data:`` URL.
        question: Text prompt placed after the image in the same user turn.
        detail: 'low' (faster, cheaper), 'high' (more accurate), or 'auto'.

    Returns:
        The message content of the first choice in the response.
    """
    image_data = image_to_base64(image_path)
    suffix = Path(image_path).suffix.lower().lstrip(".")
    # "jpg" is not a MIME subtype, and a path with no extension would otherwise
    # produce the invalid type "image/", so both map to JPEG.
    subtype = "jpeg" if suffix in ("", "jpg") else suffix
    media_type = f"image/{subtype}"

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{image_data}",
                            "detail": detail,
                        },
                    },
                    {"type": "text", "text": question},
                ],
            }
        ],
    )
    return response.choices[0].message.content

Structured Data Extraction from Images

Extract typed data from screenshots, forms, and documents:

Python
from pydantic import BaseModel
from typing import Optional
import json

class ExtractedLabResult(BaseModel):
    """One laboratory test result as extracted from a lab report image."""
    # Test name, e.g. 'INR', 'Hemoglobin', 'Creatinine' (examples from the extraction prompt).
    test_name: str
    # Numeric result; the extraction prompt asks the model for a JSON number.
    value: float
    # Unit string, e.g. 'mg/dL', 'g/dL', 'ratio' (examples from the extraction prompt).
    unit: str
    # Reference range bounds; None when not supplied.
    reference_range_low: Optional[float] = None
    reference_range_high: Optional[float] = None
    # Required boolean; taken from the model's output, not computed from the range here.
    is_abnormal: bool
    flag: Optional[str] = None  # "H" for high, "L" for low, "C" for critical

class LabReport(BaseModel):
    """A lab report: optional header fields plus the list of extracted results."""
    # Optional header fields default to None when not supplied.
    patient_id: Optional[str] = None
    # The extraction prompt asks for "YYYY-MM-DD or null"; the format is not validated here.
    collection_date: Optional[str] = None
    # Required list of individual test results.
    results: list[ExtractedLabResult]
    ordering_provider: Optional[str] = None

def extract_lab_results_from_image(image_path: str) -> LabReport:
    """Extract structured lab results from a lab report image.

    The image is sent to ``gpt-4o`` in high-detail mode together with a prompt
    that spells out the expected JSON schema. The JSON reply is parsed and
    loaded into a ``LabReport``.

    Args:
        image_path: Path to the lab report image on disk.

    Returns:
        A ``LabReport`` built from the model's JSON response.

    Raises:
        json.JSONDecodeError: If the response content is not valid JSON.
        pydantic.ValidationError: If the parsed JSON does not fit ``LabReport``.
    """

    prompt = """Extract all laboratory test results from this lab report image.

Return ONLY valid JSON matching this schema:
{
  "patient_id": "string or null",
  "collection_date": "YYYY-MM-DD or null",
  "ordering_provider": "string or null",
  "results": [
    {
      "test_name": "string (e.g., 'INR', 'Hemoglobin', 'Creatinine')",
      "value": number,
      "unit": "string (e.g., 'mg/dL', 'g/dL', 'ratio')",
      "reference_range_low": number or null,
      "reference_range_high": number or null,
      "is_abnormal": boolean,
      "flag": "H or L or C or null"
    }
  ]
}

Extract EVERY test result shown. Do not skip any results."""

    # Label the data URL with the file's actual type instead of always
    # claiming JPEG; unrecognized extensions keep the previous JPEG default.
    media_type = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }.get(Path(image_path).suffix.lower(), "image/jpeg")

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{image_to_base64(image_path)}",
                            "detail": "high",  # High detail for text-heavy medical documents
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        # JSON mode plus temperature 0 for parseable, repeatable output.
        response_format={"type": "json_object"},
        temperature=0,
    )

    data = json.loads(response.choices[0].message.content)
    return LabReport(**data)

# Run the extractor on a sample report, then list each result with its flag (if any)
lab_report = extract_lab_results_from_image("patient_labs.jpg")
print(f"Extracted {len(lab_report.results)} lab results")
for result in lab_report.results:
    if result.flag:
        flag = f" [{result.flag}]"
    else:
        flag = ""
    print(f"  {result.test_name}: {result.value} {result.unit}{flag}")

Chart and Graph Analysis

Python
def analyze_clinical_chart(image_path: str, chart_type: str = "unknown") -> dict:
    """Extract data and insights from clinical charts and graphs.

    Args:
        image_path: Path to the chart image on disk.
        chart_type: Short description of the chart, interpolated into the
            prompt's first sentence.

    Returns:
        The model's JSON response parsed into a dict. The prompt requests the
        keys ``chart_type``, ``x_axis``, ``y_axis``, ``time_range``,
        ``key_values``, ``trend``, ``clinical_significance`` and ``anomalies``,
        but their presence is not validated here.

    Raises:
        json.JSONDecodeError: If the response content is not valid JSON.
    """

    prompt = f"""This is a {chart_type} clinical chart/graph.

Analyze it and provide:
1. CHART_TYPE: What type of chart/graph this is
2. AXES: X-axis label and Y-axis label with units
3. TIME_RANGE: Date/time range shown (if applicable)
4. KEY_VALUES: List the key data points or values visible
5. TRENDS: Describe any trends (increasing, decreasing, stable, cyclical)
6. CLINICAL_SIGNIFICANCE: What this pattern suggests clinically
7. ANOMALIES: Any unusual values, outliers, or notable events marked

Return as JSON:
{{
  "chart_type": "...",
  "x_axis": "...",
  "y_axis": "...",
  "time_range": "... or null",
  "key_values": [{{ "label": "...", "value": "...", "unit": "..." }}],
  "trend": "...",
  "clinical_significance": "...",
  "anomalies": ["..."]
}}"""

    # Charts are often PNGs, so label the data URL from the file extension
    # instead of always claiming JPEG; unrecognized extensions keep the
    # previous JPEG default.
    media_type = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }.get(Path(image_path).suffix.lower(), "image/jpeg")

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{image_to_base64(image_path)}",
                            "detail": "high",
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        response_format={"type": "json_object"},
        temperature=0,
    )
    return json.loads(response.choices[0].message.content)

# Example: analyze an INR trend chart and print two fields of the result
inr_analysis = analyze_clinical_chart(
    image_path="inr_trend.png",
    chart_type="INR trend over time",
)
print(f"Trend: {inr_analysis.get('trend')}")
print(f"Clinical significance: {inr_analysis.get('clinical_significance')}")

Multi-Image Comparison

Compare multiple images in a single prompt:

Python
def compare_images(
    image_paths: list[str],
    comparison_question: str,
) -> str:
    """Compare multiple images with a single prompt.

    Each image is preceded by a text part ``"Image N:"`` (numbered from 1) so
    the question can refer to images by number. The question is appended last.

    Args:
        image_paths: Paths of the local images, in the order they should be
            numbered.
        comparison_question: The question to ask about the images.

    Returns:
        The message content of the first choice in the response.
    """
    # Label each data URL from its own file extension instead of always
    # claiming JPEG; unrecognized extensions keep the previous JPEG default.
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }

    content = []
    for i, path in enumerate(image_paths, 1):
        media_type = media_types.get(Path(path).suffix.lower(), "image/jpeg")
        content.append({"type": "text", "text": f"Image {i}:"})
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{media_type};base64,{image_to_base64(path)}",
                "detail": "auto",
            },
        })

    content.append({"type": "text", "text": comparison_question})

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        temperature=0,
    )
    return response.choices[0].message.content

# Example: ask for the differences between a baseline and a follow-up ECG tracing
comparison = compare_images(
    image_paths=["ecg_before.jpg", "ecg_after.jpg"],
    comparison_question="Compare these two ECG tracings. What changed between Image 1 (baseline) and Image 2 (after medication change)? Focus on rate, rhythm, and any interval changes.",
)
print(comparison)

URL-Based Images (No Base64 Needed)

For images already on the web, pass the URL directly:

Python
def analyze_image_from_url(image_url: str, question: str) -> str:
    """Ask GPT-4o a question about an image referenced by URL.

    The URL is passed through unchanged; nothing is downloaded or
    base64-encoded on this side.
    """
    image_part = {
        "type": "image_url",
        "image_url": {"url": image_url, "detail": "auto"},
    }
    text_part = {"type": "text", "text": question}
    user_turn = {"role": "user", "content": [image_part, text_part]}

    completion = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[user_turn],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

Cost and Performance Considerations

Python
def estimate_image_cost(
    image_path: str,
    detail: str = "auto",
    model: str = "gpt-4o",
) -> dict:
    """Estimate the input-token count and USD cost of sending one image.

    Args:
        image_path: Path to the image file; only its pixel dimensions are read.
        detail: ``"low"``, ``"high"``, or anything else (treated as ``"auto"``).
            Auto is approximated as high detail when either side exceeds
            512px and low detail otherwise.
        model: Accepted for interface compatibility. The price used is always
            the gpt-4o input rate hard-coded below.

    Returns:
        A dict with ``image_size`` (``"WxH"`` of the original file),
        ``detail_mode``, ``estimated_tokens`` and ``estimated_cost_usd``.
    """
    from PIL import Image

    # The context manager releases the file handle once the size is read.
    with Image.open(image_path) as img:
        width, height = img.size

    def _high_detail_tokens(w: int, h: int) -> int:
        """Return the high-detail token count after the documented resizing."""
        # Step 1: shrink (never enlarge) to fit within a 2048x2048 square.
        longest = max(w, h)
        if longest > 2048:
            w = max(1, round(w * 2048 / longest))
            h = max(1, round(h * 2048 / longest))
        # Step 2: shrink so that the shortest side is at most 768px.
        shortest = min(w, h)
        if shortest > 768:
            w = max(1, round(w * 768 / shortest))
            h = max(1, round(h * 768 / shortest))
        # Step 3: count 512px tiles (ceiling division); 170 tokens per tile
        # plus a fixed 85-token base.
        tiles_x = (w + 511) // 512
        tiles_y = (h + 511) // 512
        return 85 + tiles_x * tiles_y * 170

    if detail == "low":
        # Fixed cost: 85 tokens regardless of image size
        image_tokens = 85
    elif detail == "high":
        image_tokens = _high_detail_tokens(width, height)
    else:  # auto
        # Heuristic: high detail for images larger than 512x512, low otherwise
        if width > 512 or height > 512:
            image_tokens = _high_detail_tokens(width, height)
        else:
            image_tokens = 85

    # gpt-4o: $2.50 per 1M input tokens
    cost_per_token = 2.50 / 1_000_000
    estimated_cost = image_tokens * cost_per_token

    return {
        "image_size": f"{width}x{height}",
        "detail_mode": detail,
        "estimated_tokens": image_tokens,
        "estimated_cost_usd": estimated_cost,
    }

# Use 'low' detail for documents where exact pixel-level accuracy isn't needed
# Use 'high' for medical images, charts with small text, or dense data tables

Vision Prompt Best Practices

Be specific about what to look at:

# Vague
"Analyze this image."

# Specific
"In this INR trend chart, identify: (1) the date range, (2) the therapeutic range (2.0-3.0) boundaries if shown, (3) any values outside the therapeutic range, and (4) the overall trend direction over the last 30 days."

Ask for structured output:

"Return the extracted data as JSON with fields: test_name, value, unit, reference_range, and flag."

Chain image and text processing:

Python
# Step 1: one vision call turns the image into structured data
raw_data = extract_lab_results_from_image("labs.jpg")

# Step 2: reason over the extracted JSON with a text-only request (cheaper)
analysis = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": (
                f"Given these lab results for a warfarin patient: {raw_data.model_dump_json()}\n"
                "Are any values concerning? What clinical action is indicated?"
            ),
        }
    ],
)

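This hybrid approach (vision for extraction, text-only for reasoning) reduces costs because the image is tokenized only once, in the extraction call; the follow-up reasoning call sends just the compact extracted JSON, which is cheaper than resending the image.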

Enjoyed this article?

Explore the AI Systems learning path for more.

Found this helpful?

Share:𝕏

Leave a comment

Have a question, correction, or just found this helpful? Leave a note below.