AI Systems · Intermediate
Dictionary Comprehensions
Build and transform dicts with comprehensions: basic syntax, filtering, inverting mappings, grouping, and patterns used in AI data pipelines and LangChain metadata handling.
Asma Hafeez Khan · May 16, 2026 · 5 min read
Python · Dictionary Comprehension · Data Transformation · Functional Programming
Basic Syntax
A dict comprehension creates a new dictionary by mapping keys to values across an iterable:
{key_expr: value_expr for item in iterable}
Python
# Without comprehension: start from an empty dict and add one key per iteration
drug_lengths = {}
for drug in ["warfarin", "aspirin", "metformin"]:
    # The loop body must be indented; key = drug name, value = length of that name
    drug_lengths[drug] = len(drug)
# {"warfarin": 8, "aspirin": 7, "metformin": 9}
# With comprehension — same result
drug_lengths = {drug: len(drug) for drug in ["warfarin", "aspirin", "metformin"]}
With Filtering
Python
# {key: value for item in iterable if condition}
drug_doses = {
    "warfarin": 5,
    "aspirin": 81,
    "metformin": 500,
    "lisinopril": 10,
    "atorvastatin": 20,
}
# Keep only low-dose drugs (under 50mg)
low_dose = {drug: dose for drug, dose in drug_doses.items() if dose < 50}
# {"warfarin": 5, "lisinopril": 10, "atorvastatin": 20}
# Normalize names and keep high doses (50mg and above, i.e. everything
# that low_dose excludes); the key expression and the filter are independent
high_dose_normalized = {
    drug.upper(): dose
    for drug, dose in drug_doses.items()
    if dose >= 50
}
# {"ASPIRIN": 81, "METFORMIN": 500}
From a List of Tuples
Python
# A sequence of (key, value) tuples maps directly onto a dict comprehension
pairs = [
    ("warfarin", "anticoagulant"),
    ("metformin", "antidiabetic"),
    ("aspirin", "nsaid"),
]
category_map = {name: drug_class for name, drug_class in pairs}
# {"warfarin": "anticoagulant", "metformin": "antidiabetic", "aspirin": "nsaid"}
# The dict() constructor accepts the same pairs and produces an identical dict
category_map = dict(pairs)
# enumerate() supplies positions, so lookups can be built in either direction
drugs = ["warfarin", "aspirin", "metformin"]
index_to_drug = {position: name for position, name in enumerate(drugs)}
# {0: "warfarin", 1: "aspirin", 2: "metformin"}
drug_to_index = {name: position for position, name in enumerate(drugs)}
# {"warfarin": 0, "aspirin": 1, "metformin": 2}
Inverting a Dictionary
Python
original = {
    "warfarin": "anticoagulant",
    "heparin": "anticoagulant",
    "metformin": "antidiabetic",
    "glipizide": "antidiabetic",
}
# Simple inversion (works only if values are unique)
inverted = {v: k for k, v in original.items()}
# {"anticoagulant": "heparin", "antidiabetic": "glipizide"}
# PROBLEM: duplicate values — the last key seen for each value wins, so
# "heparin" overwrites "warfarin" and "glipizide" overwrites "metformin"!
# Safe inversion: group duplicates into lists
from collections import defaultdict
def invert_grouped(d: dict) -> dict:
    """Invert a mapping without losing keys that share a value.

    Args:
        d: Mapping whose values are hashable (they become the new keys).

    Returns:
        A plain dict mapping each distinct value of ``d`` to the list of
        keys that had that value, in the order they appear in ``d``.
    """
    result = defaultdict(list)
    for k, v in d.items():
        # defaultdict creates the empty list the first time a value is seen
        result[v].append(k)
    # Return a regular dict so later lookups of missing keys raise KeyError
    # instead of silently inserting empty lists
    return dict(result)
grouped = invert_grouped(original)
# {"anticoagulant": ["warfarin", "heparin"], "antidiabetic": ["metformin", "glipizide"]}
Transforming Values
Python
# Keys stay the same; only the value expression changes
drug_doses_mg = {"warfarin": 5, "aspirin": 81, "metformin": 500}
# Milligrams -> grams
drug_doses_g = {
    name: milligrams / 1000
    for name, milligrams in drug_doses_mg.items()
}
# {"warfarin": 0.005, "aspirin": 0.081, "metformin": 0.5}
# Round every value to two decimal places
drug_costs = {"warfarin": 0.1234, "aspirin": 0.0567, "metformin": 0.2341}
rounded_costs = {
    name: round(price, 2)
    for name, price in drug_costs.items()
}
# {"warfarin": 0.12, "aspirin": 0.06, "metformin": 0.23}
Grouping and Counting
Python
from collections import defaultdict
# Group drugs by category
drug_category_pairs = [
    ("warfarin", "anticoagulant"),
    ("heparin", "anticoagulant"),
    ("metformin", "antidiabetic"),
    ("glipizide", "antidiabetic"),
    ("lisinopril", "antihypertensive"),
]
# Loop with defaultdict: grouping mutates the lists, so a plain for loop is
# the right tool. A comprehension here would only run append() for its side
# effect and build a throwaway set containing None.
groups = defaultdict(list)
for drug, cat in drug_category_pairs:
    groups[cat].append(drug)
# Convert to a regular dict so missing categories raise KeyError afterwards
groups = dict(groups)
# {"anticoagulant": ["warfarin", "heparin"], ...}
# Count occurrences
drug_mentions = ["warfarin", "aspirin", "warfarin", "metformin", "warfarin", "aspirin"]
mention_counts = {}
for drug in drug_mentions:
    # .get(drug, 0) treats a drug that has not been seen yet as a count of 0
    mention_counts[drug] = mention_counts.get(drug, 0) + 1
# {"warfarin": 3, "aspirin": 2, "metformin": 1}
# Faster with Counter:
from collections import Counter
mention_counts = dict(Counter(drug_mentions))
Set Comprehensions (Same Syntax)
While this article is about dict comprehensions, set comprehensions use the same pattern with `{}` but no `key:` part — just a single expression:
Python
# Set comprehension: {expression for item in iterable}
drug_names = [
    "Warfarin",
    "Aspirin",
    "METFORMIN",
    "warfarin",
]
# Lower-casing before collecting makes differently-cased spellings collapse
unique_lower = {spelling.lower() for spelling in drug_names}
# {"warfarin", "aspirin", "metformin"} — unique, unordered
Practical AI/ML Patterns
Python
# 1. Build metadata dict from list of documents
from langchain_core.documents import Document
def build_source_map(docs: list[Document]) -> dict[str, str]:
    """Map chunk IDs to source filenames.

    Args:
        docs: Documents whose ``metadata`` dict may contain ``"chunk_id"``
            and ``"source"`` entries.

    Returns:
        A dict keyed by each document's ``"chunk_id"`` (or ``"chunk_<i>"``,
        where ``<i>`` is the document's position in ``docs``, when the ID is
        missing) with the document's ``"source"`` as the value, or
        ``"unknown"`` when no source is recorded. If two documents resolve
        to the same key, the later one replaces the earlier one.
    """
    return {
        doc.metadata.get("chunk_id", f"chunk_{i}"): doc.metadata.get("source", "unknown")
        for i, doc in enumerate(docs)
    }
# 2. Score-keyed results from retrieval
def results_by_score(retrieval_results: list[tuple]) -> dict[float, str]:
    """Map each similarity score to a preview of the matching document.

    Args:
        retrieval_results: ``(doc, score)`` pairs; each ``doc`` must expose
            a ``page_content`` string.

    Returns:
        A dict keyed by the score rounded to 3 decimal places, with the
        first 100 characters of the document's content as the value.

    Note:
        Scores that round to the same key collide, and only the last such
        document is kept.
    """
    return {
        round(score, 3): doc.page_content[:100]
        for doc, score in retrieval_results
    }
# 3. Config merging with comprehension
base_config = {
    "temperature": 0,
    "model": "gpt-4o",
    "max_tokens": 500,
}
user_overrides = {
    "temperature": 0.3,
    "max_tokens": 1000,
}
# Walk the base settings; an override wins whenever one exists for that key.
# Keys that appear only in user_overrides are not carried over.
merged = {
    option: user_overrides.get(option, default)
    for option, default in base_config.items()
}
# {"temperature": 0.3, "model": "gpt-4o", "max_tokens": 1000}
# 4. Build prompt variables from multiple retrievers
def build_context(retrievers: dict[str, object], query: str) -> dict[str, str]:
    """Run each retriever and collect results into a dict of context strings.

    Args:
        retrievers: Mapping of context name to a retriever object. Each
            retriever must provide ``invoke(query)`` returning an iterable
            of documents with a ``page_content`` string.
        query: The query passed unchanged to every retriever.

    Returns:
        A dict with the same keys as ``retrievers``. Each value is the
        ``page_content`` of that retriever's documents joined with newlines
        (an empty string when the retriever returns no documents).
    """
    return {
        name: "\n".join(d.page_content for d in retriever.invoke(query))
        for name, retriever in retrievers.items()
    }
# NOTE: drug_retriever and interaction_retriever are not defined in this
# snippet; they must already exist before this call runs.
contexts = build_context(
    retrievers={"drug_info": drug_retriever, "interactions": interaction_retriever},
    query="warfarin aspirin combination",
)
# {"drug_info": "...", "interactions": "..."}
# 5. Filter metadata for a specific tenant
all_docs_metadata = {
    "doc_1": {"tenant": "hospital_a", "category": "anticoagulant"},
    "doc_2": {"tenant": "hospital_b", "category": "antidiabetic"},
    "doc_3": {"tenant": "hospital_a", "category": "antihypertensive"},
}
# Keep an entry only when its metadata names the wanted tenant; the values
# are the same metadata dicts as in all_docs_metadata, not copies
hospital_a_docs = {
    key: info
    for key, info in all_docs_metadata.items()
    if info["tenant"] == "hospital_a"
}
# {"doc_1": {...}, "doc_3": {...}}
When to Use vs. a Loop
Python
# Use comprehension: simple key-value transformation
drug_upper = {drug: drug.upper() for drug in drugs}
# Use a loop: building a dict with complex, multi-step logic
# (documents is assumed to be an existing iterable of objects with
# .metadata and .page_content; it is not defined in this snippet)
results = {}
for doc in documents:
    key = doc.metadata.get("source")
    if key is None:
        # Documents without a recorded source are left out entirely
        continue
    # Fetch the previews collected so far for this source (or start a new list)
    existing = results.get(key, [])
    existing.append(doc.page_content[:100])
    results[key] = existing
# Use comprehension: inversion, filtering, renaming keys
filtered = {k: v for k, v in config.items() if v is not None}
# Use dict() constructor: from list of pairs
config = dict([("model", "gpt-4o"), ("temperature", 0)])
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.