AI Systems · Intermediate
Multimodal Prompting: Vision and Images
Prompt LLMs with images, screenshots, and documents using vision APIs. Extract structured data from visual content, analyze charts, and process medical images.
Asma Hafeez Khan · May 16, 2026 · 6 min read
Prompt Engineering · Multimodal · Vision · GPT-4o
Sending Images to the Model
Modern vision models (GPT-4o, Claude 3.5 Sonnet, Gemini 1.5) accept images alongside text in the same API call:
Python
import anthropic
import base64
from pathlib import Path
# Module-level Anthropic client; ask_about_image sends its requests through it.
claude_client = anthropic.Anthropic()
def image_to_base64(image_path: str) -> str:
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    raw_bytes = Path(image_path).read_bytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def ask_about_image(image_path: str, question: str) -> str:
    """Send a local image together with a question to Claude.

    Args:
        image_path: Path of the image file to upload.
        question: Text prompt sent alongside the image.

    Returns:
        The text of the first content block in Claude's reply.
    """
    image_data = image_to_base64(image_path)

    # Path.suffix keeps the leading dot, so every key must include it; a bare
    # "jpg" key can never match.  Unknown extensions fall back to JPEG.
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    ext = Path(image_path).suffix.lower()
    media_type = media_types.get(ext, "image/jpeg")

    response = claude_client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data,
                        },
                    },
                    {
                        "type": "text",
                        "text": question,
                    },
                ],
            }
        ],
    )
    return response.content[0].text
# OpenAI equivalent
from openai import OpenAI
# Module-level OpenAI client shared by the OpenAI-based helpers in this file.
openai_client = OpenAI()
def ask_about_image_openai(image_path: str, question: str, detail: str = "auto") -> str:
    """Ask GPT-4o a question about a local image.

    ``detail`` is forwarded unchanged as the image's detail setting:
    'low' (faster, cheaper), 'high' (more accurate), or 'auto'.
    """
    encoded_image = image_to_base64(image_path)

    # Build the MIME subtype from the file extension; "jpg" is spelled "jpeg".
    subtype = Path(image_path).suffix.lower().lstrip(".")
    if subtype == "jpg":
        subtype = "jpeg"
    media_type = f"image/{subtype}"

    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:{media_type};base64,{encoded_image}",
            "detail": detail,
        },
    }
    question_part = {"type": "text", "text": question}

    completion = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": [image_part, question_part]}],
    )
    return completion.choices[0].message.content


# Structured Data Extraction from Images
Extract typed data from screenshots, forms, and documents:
Python
from pydantic import BaseModel
from typing import Optional
import json
class ExtractedLabResult(BaseModel):
    """A single test row extracted from a lab report image."""

    # What was measured and the reading itself.
    test_name: str
    value: float
    unit: str

    # Reference interval bounds; either may be missing.
    reference_range_low: Optional[float] = None
    reference_range_high: Optional[float] = None

    # Abnormality markers.
    is_abnormal: bool
    flag: Optional[str] = None  # "H" for high, "L" for low, "C" for critical
class LabReport(BaseModel):
    """A lab report: optional header fields plus the list of extracted results."""

    patient_id: Optional[str] = None
    # Kept as a plain string; no date parsing is done on this field.
    collection_date: Optional[str] = None
    # The only required field.
    results: list[ExtractedLabResult]
    ordering_provider: Optional[str] = None
def extract_lab_results_from_image(image_path: str) -> LabReport:
    """Extract structured lab results from a lab report image.

    The image is sent to GPT-4o in high-detail mode with a prompt that asks
    for JSON only; the reply is parsed and loaded into a ``LabReport``.

    Args:
        image_path: Path of the lab report image on disk.

    Returns:
        The ``LabReport`` built from the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        Exception: Whatever ``LabReport`` raises when the JSON does not fit
            the model's fields.
    """
    import mimetypes

    prompt = """Extract all laboratory test results from this lab report image.
Return ONLY valid JSON matching this schema:
{
"patient_id": "string or null",
"collection_date": "YYYY-MM-DD or null",
"ordering_provider": "string or null",
"results": [
{
"test_name": "string (e.g., 'INR', 'Hemoglobin', 'Creatinine')",
"value": number,
"unit": "string (e.g., 'mg/dL', 'g/dL', 'ratio')",
"reference_range_low": number or null,
"reference_range_high": number or null,
"is_abnormal": boolean,
"flag": "H or L or C or null"
}
]
}
Extract EVERY test result shown. Do not skip any results."""

    # Label the data URL with the file's real image type instead of always
    # claiming JPEG; unknown or non-image types keep the previous JPEG label.
    guessed, _ = mimetypes.guess_type(image_path)
    media_type = guessed if guessed and guessed.startswith("image/") else "image/jpeg"

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{image_to_base64(image_path)}",
                            "detail": "high",  # High detail for text-heavy medical documents
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        response_format={"type": "json_object"},
        temperature=0,
    )
    data = json.loads(response.choices[0].message.content)
    return LabReport(**data)
# Run the extractor on a sample report and print one line per result.
lab_report = extract_lab_results_from_image("patient_labs.jpg")
print(f"Extracted {len(lab_report.results)} lab results")
for result in lab_report.results:
    # Only show the bracketed marker when the result carries a flag.
    flag = ""
    if result.flag:
        flag = f" [{result.flag}]"
    print(f" {result.test_name}: {result.value} {result.unit}{flag}")


# Chart and Graph Analysis
Python
def analyze_clinical_chart(image_path: str, chart_type: str = "unknown") -> dict:
    """Extract data and insights from a clinical chart or graph image.

    Args:
        image_path: Path of the chart image on disk.
        chart_type: Free-text description of the chart, inserted into the
            prompt's first sentence.

    Returns:
        The model's JSON reply parsed into a dict.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    import mimetypes

    prompt = f"""This is a {chart_type} clinical chart/graph.
Analyze it and provide:
1. CHART_TYPE: What type of chart/graph this is
2. AXES: X-axis label and Y-axis label with units
3. TIME_RANGE: Date/time range shown (if applicable)
4. KEY_VALUES: List the key data points or values visible
5. TRENDS: Describe any trends (increasing, decreasing, stable, cyclical)
6. CLINICAL_SIGNIFICANCE: What this pattern suggests clinically
7. ANOMALIES: Any unusual values, outliers, or notable events marked
Return as JSON:
{{
"chart_type": "...",
"x_axis": "...",
"y_axis": "...",
"time_range": "... or null",
"key_values": [{{ "label": "...", "value": "...", "unit": "..." }}],
"trend": "...",
"clinical_significance": "...",
"anomalies": ["..."]
}}"""

    # Label the data URL with the file's real image type (e.g. PNG charts)
    # instead of always claiming JPEG; unknown types keep the JPEG label.
    guessed, _ = mimetypes.guess_type(image_path)
    media_type = guessed if guessed and guessed.startswith("image/") else "image/jpeg"

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{image_to_base64(image_path)}",
                            "detail": "high",
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
        response_format={"type": "json_object"},
        temperature=0,
    )
    return json.loads(response.choices[0].message.content)
# Demo: analyze an INR trend chart and print two fields of the reply.
inr_analysis = analyze_clinical_chart(
    "inr_trend.png",
    chart_type="INR trend over time",
)
print(f"Trend: {inr_analysis.get('trend')}")
print(f"Clinical significance: {inr_analysis.get('clinical_significance')}")


# Multi-Image Comparison
Compare multiple images in a single prompt:
Python
def compare_images(
    image_paths: list[str],
    comparison_question: str,
) -> str:
    """Compare multiple images with a single prompt.

    Each image is preceded by an "Image N:" text label (numbered from 1) so
    the question can refer to images by number; the question goes last.

    Args:
        image_paths: Paths of the local image files, in the order they
            should be numbered.
        comparison_question: The question to ask about the images.

    Returns:
        The text content of the model's first choice.
    """
    import mimetypes

    content = []
    for i, path in enumerate(image_paths, 1):
        # Label each data URL with that file's real image type instead of
        # always claiming JPEG; unknown types keep the JPEG label.
        guessed, _ = mimetypes.guess_type(path)
        media_type = guessed if guessed and guessed.startswith("image/") else "image/jpeg"
        content.append({"type": "text", "text": f"Image {i}:"})
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{media_type};base64,{image_to_base64(path)}",
                "detail": "auto",
            },
        })
    content.append({"type": "text", "text": comparison_question})

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        temperature=0,
    )
    return response.choices[0].message.content
# Demo: ask for the differences between a baseline and a follow-up ECG.
comparison = compare_images(
    image_paths=["ecg_before.jpg", "ecg_after.jpg"],
    comparison_question="Compare these two ECG tracings. What changed between Image 1 (baseline) and Image 2 (after medication change)? Focus on rate, rhythm, and any interval changes.",
)
print(comparison)


# URL-Based Images (No Base64 Needed)
For images already on the web, pass the URL directly:
Python
def analyze_image_from_url(image_url: str, question: str) -> str:
    """Ask GPT-4o a question about an image referenced by URL.

    The URL is handed to the API as-is; nothing is downloaded or
    base64-encoded here.
    """
    image_part = {
        "type": "image_url",
        "image_url": {"url": image_url, "detail": "auto"},
    }
    question_part = {"type": "text", "text": question}

    completion = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": [image_part, question_part]}],
    )
    return completion.choices[0].message.content


# Cost and Performance Considerations
Python
def _high_detail_tokens(width: int, height: int) -> int:
    """Return the high-detail token estimate for an image of the given size.

    Follows the resize-then-tile procedure from OpenAI's vision guide: fit the
    image inside 2048x2048, shrink so the shortest side is at most 768, then
    charge 170 tokens per 512x512 tile plus an 85-token base.
    """
    longest = max(width, height)
    if longest > 2048:
        scale = 2048 / longest
        width, height = round(width * scale), round(height * scale)

    # NOTE(review): only downscaling is applied; small images are assumed not
    # to be upscaled to 768 -- confirm against the current OpenAI docs.
    shortest = min(width, height)
    if shortest > 768:
        scale = 768 / shortest
        width, height = round(width * scale), round(height * scale)

    tiles_x = (width + 511) // 512
    tiles_y = (height + 511) // 512
    return 85 + tiles_x * tiles_y * 170


def estimate_image_cost(
    image_path: str,
    detail: str = "auto",
    model: str = "gpt-4o",
) -> dict:
    """Estimate the input-token cost of sending an image to the model.

    Args:
        image_path: Path of the image file; only its dimensions are read.
        detail: 'low' (flat 85 tokens), 'high' (tiled estimate), or anything
            else, which is treated as 'auto'.
        model: Accepted for interface compatibility but not used; the price
            is always the gpt-4o rate of $2.50 per 1M input tokens.

    Returns:
        A dict with keys ``image_size`` ("WxH"), ``detail_mode``,
        ``estimated_tokens`` and ``estimated_cost_usd``.
    """
    from PIL import Image

    # The context manager closes the underlying file once the size is read.
    with Image.open(image_path) as img:
        width, height = img.size

    if detail == "low":
        # Fixed cost: 85 tokens
        image_tokens = 85
    elif detail == "high":
        image_tokens = _high_detail_tokens(width, height)
    else:  # auto
        # Heuristic: treat auto as high for images larger than 512x512 in
        # either dimension, low otherwise.
        if width > 512 or height > 512:
            image_tokens = _high_detail_tokens(width, height)
        else:
            image_tokens = 85

    # gpt-4o: $2.50 per 1M input tokens
    cost_per_token = 2.50 / 1_000_000
    estimated_cost = image_tokens * cost_per_token
    return {
        "image_size": f"{width}x{height}",
        "detail_mode": detail,
        "estimated_tokens": image_tokens,
        "estimated_cost_usd": estimated_cost,
    }
# Use 'low' detail for documents where exact pixel-level accuracy isn't needed
# Use 'high' for medical images, charts with small text, or dense data tables

Vision Prompt Best Practices
Be specific about what to look at:
# Vague
"Analyze this image."
# Specific
"In this INR trend chart, identify: (1) the date range, (2) the therapeutic range (2.0-3.0) boundaries if shown, (3) any values outside the therapeutic range, and (4) the overall trend direction over the last 30 days."

Ask for structured output:
"Return the extracted data as JSON with fields: test_name, value, unit, reference_range, and flag."

Chain image and text processing:
Python
# Step 1 (vision call): pull the structured results out of the image.
raw_data = extract_lab_results_from_image("labs.jpg")

# Step 2 (text-only call, cheaper): reason over the extracted JSON.
analysis = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": f"Given these lab results for a warfarin patient: {raw_data.model_dump_json()}\nAre any values concerning? What clinical action is indicated?",
        }
    ],
)

# This hybrid approach (vision for extraction, text-only for reasoning) reduces
# costs since text-only calls are cheaper than vision calls.
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.