Learnixo
Back to blog
AI Systems · Intermediate

Dictionary Comprehensions

Build and transform dicts with comprehensions: basic syntax, filtering, inverting mappings, grouping, and patterns used in AI data pipelines and LangChain metadata handling.

Asma Hafeez Khan · May 16, 2026 · 5 min read
Python · Dictionary Comprehension · Data Transformation · Functional Programming
Share:𝕏

Basic Syntax

A dict comprehension creates a new dictionary by mapping keys to values across an iterable:

{key_expr: value_expr for item in iterable}
Python
# Without comprehension
drug_lengths = {}
for drug in ["warfarin", "aspirin", "metformin"]:
    drug_lengths[drug] = len(drug)
# {"warfarin": 8, "aspirin": 7, "metformin": 9}

# With comprehension — same result
drug_lengths = {drug: len(drug) for drug in ["warfarin", "aspirin", "metformin"]}

With Filtering

Python
# {key: value for item in iterable if condition}

drug_doses = {
    "warfarin": 5,
    "aspirin": 81,
    "metformin": 500,
    "lisinopril": 10,
    "atorvastatin": 20,
}

# Keep only low-dose drugs (under 50mg)
low_dose = {drug: dose for drug, dose in drug_doses.items() if dose < 50}
# {"warfarin": 5, "lisinopril": 10, "atorvastatin": 20}

# Normalize names and keep high doses
high_dose_normalized = {
    drug.upper(): dose
    for drug, dose in drug_doses.items()
    if dose >= 100
}
# {"METFORMIN": 500}  (aspirin at 81 does not satisfy dose >= 100)

From a List of Tuples

Python
# Perfect for converting from a sequence of (key, value) pairs
pairs = [("warfarin", "anticoagulant"), ("metformin", "antidiabetic"), ("aspirin", "nsaid")]
category_map = {drug: category for drug, category in pairs}
# {"warfarin": "anticoagulant", "metformin": "antidiabetic", "aspirin": "nsaid"}

# Equivalent — the dict() constructor also works from pairs
category_map = dict(pairs)

# Useful: enumerate() to create an index → value mapping
drugs = ["warfarin", "aspirin", "metformin"]
index_to_drug = {i: drug for i, drug in enumerate(drugs)}
# {0: "warfarin", 1: "aspirin", 2: "metformin"}

drug_to_index = {drug: i for i, drug in enumerate(drugs)}
# {"warfarin": 0, "aspirin": 1, "metformin": 2}

Inverting a Dictionary

Python
original = {
    "warfarin": "anticoagulant",
    "heparin": "anticoagulant",
    "metformin": "antidiabetic",
    "glipizide": "antidiabetic",
}

# Simple inversion (works only if values are unique)
inverted = {v: k for k, v in original.items()}
# {"anticoagulant": "heparin", "antidiabetic": "glipizide"}
# PROBLEM: duplicate values — "heparin" and "glipizide" overwrite the earlier "warfarin" and "metformin"!

# Safe inversion: group duplicates into lists
from collections import defaultdict

def invert_grouped(d: dict) -> dict:
    """Invert a mapping, collecting all keys that share a value into a list.

    Args:
        d: Mapping whose values are hashable.

    Returns:
        A plain dict mapping each value of ``d`` to the list of keys that
        pointed at it, in the order those keys appear in ``d``.
    """
    buckets: dict = {}
    for key, value in d.items():
        # setdefault creates the list the first time a value is seen.
        buckets.setdefault(value, []).append(key)
    return buckets

grouped = invert_grouped(original)
# {"anticoagulant": ["warfarin", "heparin"], "antidiabetic": ["metformin", "glipizide"]}

Transforming Values

Python
# Apply a transformation to all values
drug_doses_mg = {"warfarin": 5, "aspirin": 81, "metformin": 500}

# Convert to grams
drug_doses_g = {drug: dose / 1000 for drug, dose in drug_doses_mg.items()}
# {"warfarin": 0.005, "aspirin": 0.081, "metformin": 0.5}

# Round all values
drug_costs = {"warfarin": 0.1234, "aspirin": 0.0567, "metformin": 0.2341}
rounded_costs = {drug: round(cost, 2) for drug, cost in drug_costs.items()}
# {"warfarin": 0.12, "aspirin": 0.06, "metformin": 0.23}

Grouping and Counting

Python
from collections import defaultdict

# Group drugs by category
drug_category_pairs = [
    ("warfarin", "anticoagulant"),
    ("heparin", "anticoagulant"),
    ("metformin", "antidiabetic"),
    ("glipizide", "antidiabetic"),
    ("lisinopril", "antihypertensive"),
]

# Grouping appends to existing lists, so use a plain loop with defaultdict.
# A comprehension run only for its side effects would build a throwaway
# collection of None values and obscure the intent.
groups = defaultdict(list)
for drug, cat in drug_category_pairs:
    groups[cat].append(drug)
# Convert back to a regular dict so missing keys raise KeyError again.
groups = dict(groups)
# {"anticoagulant": ["warfarin", "heparin"],
#  "antidiabetic": ["metformin", "glipizide"],
#  "antihypertensive": ["lisinopril"]}


# Count occurrences
drug_mentions = ["warfarin", "aspirin", "warfarin", "metformin", "warfarin", "aspirin"]
mention_counts = {}
for drug in drug_mentions:
    mention_counts[drug] = mention_counts.get(drug, 0) + 1
# {"warfarin": 3, "aspirin": 2, "metformin": 1}

# Faster with Counter:
from collections import Counter
mention_counts = dict(Counter(drug_mentions))

Set Comprehensions (Same Syntax)

While this article is about dict comprehensions, set comprehensions use the same pattern with {} but without the `key: value` colon:

Python
# Set comprehension: {expression for item in iterable}
drug_names = ["Warfarin", "Aspirin", "METFORMIN", "warfarin"]
unique_lower = {name.lower() for name in drug_names}
# {"warfarin", "aspirin", "metformin"} — unique, unordered

Practical AI/ML Patterns

Python
# 1. Build metadata dict from list of documents
from langchain_core.documents import Document

def build_source_map(docs: list[Document]) -> dict[str, str]:
    """Map each document's chunk ID to its source.

    The key is the ``chunk_id`` metadata entry, falling back to
    ``chunk_<position>`` when it is absent. The value is the ``source``
    metadata entry, falling back to ``"unknown"``. If two documents resolve
    to the same key, the later one wins.
    """
    source_by_chunk: dict[str, str] = {}
    for position, document in enumerate(docs):
        meta = document.metadata
        chunk_key = meta.get("chunk_id", f"chunk_{position}")
        source_by_chunk[chunk_key] = meta.get("source", "unknown")
    return source_by_chunk

# 2. Score-keyed results from retrieval
def results_by_score(retrieval_results: list[tuple]) -> dict[float, str]:
    """Map each rounded similarity score to a snippet of its document.

    Each item in ``retrieval_results`` is unpacked as ``(doc, score)``. The
    score is rounded to three decimals and used as the key; the value is the
    first 100 characters of ``doc.page_content``. Results whose scores round
    to the same value share a key, so the later one overwrites the earlier.
    """
    snippets: dict[float, str] = {}
    for document, similarity in retrieval_results:
        rounded = round(similarity, 3)
        snippets[rounded] = document.page_content[:100]
    return snippets

# 3. Config merging with comprehension
base_config = {"temperature": 0, "model": "gpt-4o", "max_tokens": 500}
user_overrides = {"temperature": 0.3, "max_tokens": 1000}

merged = {
    key: user_overrides.get(key, base_config[key])
    for key in base_config
}
# {"temperature": 0.3, "model": "gpt-4o", "max_tokens": 1000}

# 4. Build prompt variables from multiple retrievers
def build_context(retrievers: dict[str, object], query: str) -> dict[str, str]:
    """Invoke every retriever with ``query`` and gather the text per name.

    For each ``name -> retriever`` entry, ``retriever.invoke(query)`` is
    called and the ``page_content`` of each returned item is joined with
    newlines. The result maps the same names to those joined strings.
    """
    contexts: dict[str, str] = {}
    for label, source in retrievers.items():
        hits = source.invoke(query)
        contexts[label] = "\n".join(hit.page_content for hit in hits)
    return contexts

contexts = build_context(
    retrievers={"drug_info": drug_retriever, "interactions": interaction_retriever},
    query="warfarin aspirin combination",
)
# {"drug_info": "...", "interactions": "..."}

# 5. Filter metadata for a specific tenant
all_docs_metadata = {
    "doc_1": {"tenant": "hospital_a", "category": "anticoagulant"},
    "doc_2": {"tenant": "hospital_b", "category": "antidiabetic"},
    "doc_3": {"tenant": "hospital_a", "category": "antihypertensive"},
}

hospital_a_docs = {
    doc_id: meta
    for doc_id, meta in all_docs_metadata.items()
    if meta["tenant"] == "hospital_a"
}
# {"doc_1": {...}, "doc_3": {...}}

When to Use vs. a Loop

Python
# Use comprehension: simple key-value transformation
drug_upper = {drug: drug.upper() for drug in drugs}

# Use a loop: building a dict with complex, multi-step logic
results = {}
for doc in documents:
    key = doc.metadata.get("source")
    if key is None:
        continue
    existing = results.get(key, [])
    existing.append(doc.page_content[:100])
    results[key] = existing

# Use comprehension: inversion, filtering, renaming keys
filtered = {k: v for k, v in config.items() if v is not None}

# Use dict() constructor: from list of pairs
config = dict([("model", "gpt-4o"), ("temperature", 0)])

Enjoyed this article?

Explore the AI Systems learning path for more.

Found this helpful?

Share:𝕏

Leave a comment

Have a question, correction, or just found this helpful? Leave a note below.