What is a Set and when should we use it?
Master Python sets: O(1) membership testing, set operations (union, intersection, difference), frozenset, and practical use cases in AI pipelines for deduplication and fast lookup.
What is a Set?
A Python set is an unordered collection of unique, hashable objects backed by a hash table. The key properties:
- Unordered — no indexing, no slicing
- Unique — duplicate values are automatically discarded
- Mutable — you can add and remove elements
- O(1) membership test — `x in my_set` is constant time (on average) regardless of size
# Create a set
approved_drugs = {"warfarin", "aspirin", "metformin", "lisinopril"}
# Duplicates are silently dropped
with_dupes = {"warfarin", "aspirin", "warfarin", "metformin", "aspirin"}
print(with_dupes) # {"warfarin", "aspirin", "metformin"} — 3 elements, not 5 (display order may vary)
# From a list
drug_list = ["warfarin", "aspirin", "warfarin", "metformin"]
unique_drugs = set(drug_list)
print(unique_drugs) # {"warfarin", "aspirin", "metformin"}
# Empty set — must use set(), not {} (that creates an empty dict)
empty_set = set()
empty_dict = {} # This is a dict, not a set
print(type(empty_set)) # <class 'set'>
print(type(empty_dict)) # <class 'dict'>

Membership Testing: Why Sets Beat Lists
# O(n) — must scan the entire list
def is_approved_list(drug: str, approved: list[str]) -> bool:
    """Return True if ``drug`` is present in the ``approved`` list.

    List membership compares elements one by one, so the cost grows
    linearly with the length of ``approved``.
    """
    return drug in approved  # Checks each element until found
# O(1) — hash lookup, size doesn't matter
def is_approved_set(drug: str, approved: set[str]) -> bool:
    """Return True if ``drug`` is present in the ``approved`` set.

    Set membership is a hash lookup, so the cost does not depend on how
    many elements ``approved`` holds.
    """
    return drug in approved  # Constant time
# Performance difference becomes dramatic at scale
import timeit
# NOTE: drug_list is rebound here to a collection of integers for the benchmark;
# it no longer holds drug names after this point.
drug_list = list(range(1_000_000))
drug_set = set(range(1_000_000))
# 999_999 is the last element of the list, so the list scan has to walk every
# element before it finds a match. globals=globals() makes drug_list and
# drug_set visible to the timed statement; number=1000 repeats each lookup
# 1000 times and reports the total.
list_time = timeit.timeit("999_999 in drug_list", globals=globals(), number=1000)
set_time = timeit.timeit("999_999 in drug_set", globals=globals(), number=1000)
print(f"List: {list_time:.4f}s | Set: {set_time:.6f}s")
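# Illustrative output: List: ~0.1s | Set: ~0.000002s — sets are ~50,000x faster for this lookup
# (exact timings vary by machine and Python version; the gap is what matters)

Rule: If you're checking membership repeatedly, convert to a set first.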
Modifying Sets
drug_set = {"warfarin", "aspirin"}
# Add elements
drug_set.add("metformin") # Add single element
drug_set.update(["lisinopril", "atorvastatin"]) # Add multiple
# Remove elements
drug_set.remove("aspirin") # Raises KeyError if not found
drug_set.discard("ibuprofen") # Silent — no error if not found
popped = drug_set.pop() # Remove and return an arbitrary element (unordered!)
drug_set.clear() # Empty the set

Set Operations
The most powerful feature of sets: mathematical set operations.
patient_a_meds = {"warfarin", "aspirin", "lisinopril", "atorvastatin"}
patient_b_meds = {"warfarin", "metformin", "atorvastatin", "glipizide"}
# Union — all medications across both patients
all_meds = patient_a_meds | patient_b_meds
all_meds = patient_a_meds.union(patient_b_meds) # Same
# {"warfarin", "aspirin", "lisinopril", "atorvastatin", "metformin", "glipizide"}
# Intersection — medications both patients share
shared = patient_a_meds & patient_b_meds
shared = patient_a_meds.intersection(patient_b_meds)
# {"warfarin", "atorvastatin"}
# Difference — medications in A but not B
only_a = patient_a_meds - patient_b_meds
only_a = patient_a_meds.difference(patient_b_meds)
# {"aspirin", "lisinopril"}
# Symmetric difference — medications in one but not both
exclusive = patient_a_meds ^ patient_b_meds
exclusive = patient_a_meds.symmetric_difference(patient_b_meds)
# {"aspirin", "lisinopril", "metformin", "glipizide"}
# Subset / superset checks
formulary = {"warfarin", "aspirin", "metformin", "lisinopril", "atorvastatin", "glipizide"}
print(patient_a_meds.issubset(formulary)) # True — all patient A's drugs are in formulary
print(formulary.issuperset(patient_a_meds)) # True — same check from other direction
print(patient_a_meds.isdisjoint({"penicillin", "sulfa"})) # True — no overlap

frozenset: Immutable Sets
frozenset is the immutable version of set. It is hashable and can be used as a dictionary key or stored in a set.
MAJOR_ANTICOAGULANTS = frozenset({"warfarin", "heparin", "apixaban", "rivaroxaban"})

# Cannot modify: frozenset has no mutating methods, so .add() raises
# AttributeError. The error is caught here so the rest of the example
# still runs instead of stopping at this line.
try:
    MAJOR_ANTICOAGULANTS.add("dabigatran")
except AttributeError as exc:
    print(f"Cannot modify a frozenset: {exc}")

# Can use as dict key (regular sets cannot)
protocol_by_drugs = {
    frozenset({"warfarin", "aspirin"}): "bleeding_risk_protocol",
    frozenset({"metformin", "contrast"}): "contrast_hold_protocol",
}
# Lookup depends only on the elements, not on the order they were written in.
patient_drugs = frozenset({"warfarin", "aspirin"})
print(protocol_by_drugs.get(patient_drugs))  # "bleeding_risk_protocol"

# frozenset still supports all set operations (returns new frozensets)
print(MAJOR_ANTICOAGULANTS & frozenset({"warfarin", "metformin"}))
# frozenset({"warfarin"})

Practical Use Cases in AI Pipelines
# 1. Deduplication in RAG pipelines
def deduplicate_sources(retrieved_docs: list[dict]) -> list[dict]:
    """Remove duplicate documents by source URL.

    The first document seen for each source is kept and the input order is
    preserved. A document with no usable source (missing or ``None``
    metadata, or a missing/empty ``source``) cannot be identified as a
    duplicate of anything, so it is always kept rather than being collapsed
    together with every other source-less document.

    Args:
        retrieved_docs: Documents shaped like
            ``{"metadata": {"source": "<url>"}, ...}``.

    Returns:
        A new list holding the de-duplicated documents.
    """
    seen_sources: set[str] = set()
    unique: list[dict] = []
    for doc in retrieved_docs:
        # `or {}` also covers an explicit `"metadata": None`.
        source = (doc.get("metadata") or {}).get("source")
        if not source:
            unique.append(doc)
            continue
        if source not in seen_sources:
            seen_sources.add(source)
            unique.append(doc)
    return unique
# 2. Vocabulary building for NLP
def build_vocabulary(texts: list[str], min_frequency: int = 2) -> set[str]:
    """Return the tokens that occur at least ``min_frequency`` times.

    Each text is lower-cased and split on whitespace; occurrences are
    counted across all texts together.

    Args:
        texts: Raw input strings.
        min_frequency: Minimum total number of occurrences a token needs
            to be included in the vocabulary.

    Returns:
        The set of sufficiently frequent lower-cased tokens.
    """
    from collections import Counter

    # Feed each text straight into the counter instead of first collecting
    # every token into one large intermediate list.
    counts = Counter()
    for text in texts:
        counts.update(text.lower().split())
    return {token for token, count in counts.items() if count >= min_frequency}
# 3. Detecting missing required fields
# frozenset: this is a constant, like the other module-level lookup sets.
REQUIRED_FIELDS = frozenset({"drug_name", "dose_mg", "frequency", "route"})


def validate_prescription(prescription: dict) -> list[str]:
    """Return the required field names that ``prescription`` lacks.

    Only key presence is checked; a key whose value is ``None`` still
    counts as present.

    Args:
        prescription: Mapping of field name to value.

    Returns:
        The missing field names in sorted order (empty when none are
        missing).
    """
    # difference() accepts any iterable; iterating a dict yields its keys,
    # so no temporary set of keys is needed.
    missing = REQUIRED_FIELDS.difference(prescription)
    return sorted(missing)  # Sort for deterministic output


prescription = {"drug_name": "warfarin", "dose_mg": 5}
print(validate_prescription(prescription))  # ["frequency", "route"]
# 4. Fast lookup for allowed values
ALLOWED_ROUTES = frozenset({"PO", "IV", "IM", "SC", "SL", "IN"})
MAJOR_SEVERITIES = frozenset({"Major", "Critical"})


def needs_pharmacist_review(interaction_severity: str, route: str) -> bool:
    """Return True when a prescription should be flagged for review.

    A review is needed when the interaction severity is one of
    ``MAJOR_SEVERITIES`` or the route is not one of ``ALLOWED_ROUTES``.
    Both comparisons are exact, case-sensitive string matches.
    """
    return interaction_severity in MAJOR_SEVERITIES or route not in ALLOWED_ROUTES
# 5. Tracking processed items (avoiding reprocessing in pipelines)
processed_ids: set[str] = set()
for doc_id, doc in document_stream:
    if doc_id in processed_ids:
        continue # Already processed — skip
    process(doc)
    processed_ids.add(doc_id)

Set Comprehensions
# {expression for item in iterable [if condition]}
texts = ["Warfarin 5mg", "Metformin 500mg", "Warfarin 10mg", "Aspirin 81mg"]
# Extract unique drug names (set automatically deduplicates)
drug_names = {text.split()[0].lower() for text in texts}
# {"warfarin", "metformin", "aspirin"} — "warfarin" appears once despite two entries
# Filter while building
long_drug_names = {name for name in drug_names if len(name) > 7}
# {"warfarin", "metformin"}

Sets vs Lists vs Dicts
| Need | Use |
|---|---|
| Fast membership test (x in ...) | set |
| Deduplication | set |
| Set math (union, intersection) | set |
| Ordered collection with duplicates | list |
| Key-value pairs | dict |
| Immutable set (as dict key) | frozenset |
| Ordered unique collection | list(dict.fromkeys(items)) — preserves order |
# Preserve order while deduplicating (set doesn't preserve order)
drugs = ["warfarin", "aspirin", "warfarin", "metformin", "aspirin"]
# Fast but unordered:
deduped_unordered = set(drugs)
# Order preserved (Python 3.7+):
# dict keys are unique and keep insertion order, so each drug stays at the
# position of its first occurrence.
deduped_ordered = list(dict.fromkeys(drugs))
# ["warfarin", "aspirin", "metformin"] — insertion order preserved

Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.