Learnixo

Python Essentials for AI Engineers · Lesson 7 of 36

What is a Set and when should we use it?

What is a Set?

A Python set is an unordered collection of unique, hashable objects backed by a hash table. The key properties:

  • Unordered — no indexing, no slicing
  • Unique — duplicate values are automatically discarded
  • Mutable — you can add and remove elements
  • O(1) membership test — `x in my_set` takes constant time on average, regardless of size
Python
# Set literal: curly braces around comma-separated elements
approved_drugs = {"warfarin", "aspirin", "metformin", "lisinopril"}

# Repeated values in a literal collapse to a single entry
with_dupes = {"warfarin", "aspirin", "warfarin", "metformin", "aspirin"}
print(with_dupes)   # {'warfarin', 'aspirin', 'metformin'} — 3 elements, not 5 (display order is arbitrary)

# Converting a list to a set deduplicates it as well
drug_list = ["warfarin", "aspirin", "warfarin", "metformin"]
unique_drugs = set(drug_list)
print(unique_drugs)   # {'warfarin', 'aspirin', 'metformin'} (display order is arbitrary)

# {} is the empty *dict* literal, so an empty set has to be spelled set()
empty_dict = {}      # a dict, not a set
empty_set = set()
print(type(empty_set), type(empty_dict), sep="\n")
# <class 'set'>
# <class 'dict'>

Membership Testing: Why Sets Beat Lists

Python
# O(n) — a list has to be searched element by element
def is_approved_list(drug: str, approved: list[str]) -> bool:
    """Return True when *drug* occurs in the *approved* list.

    ``in`` on a list compares entries one after another until it finds a
    match, so the worst case touches every element.
    """
    found = drug in approved
    return found

# O(1) — a set finds the element by its hash, whatever the set's size
def is_approved_set(drug: str, approved: set[str]) -> bool:
    """Return True when *drug* is a member of the *approved* set.

    ``in`` on a set is a hash lookup, so the cost does not grow with the
    number of elements.
    """
    found = drug in approved
    return found


# Performance difference becomes dramatic at scale
import timeit

# One million integers in each container (stand-ins for real drug IDs)
drug_list = list(range(1_000_000))
drug_set  = set(range(1_000_000))

# 999_999 is the last element of the list, so each list lookup is a full
# scan (the worst case). Each statement is executed 1000 times, and
# timeit returns the total elapsed seconds for all runs.
list_time = timeit.timeit("999_999 in drug_list", globals=globals(), number=1000)
set_time  = timeit.timeit("999_999 in drug_set",  globals=globals(), number=1000)

print(f"List: {list_time:.4f}s | Set: {set_time:.6f}s")
# Exact timings depend on the machine and Python version; expect the list
# total to be orders of magnitude larger than the set total.
# NOTE(review): the figures originally quoted here (List ~0.1s, Set ~0.000002s,
# "~50,000x faster") look optimistic for 1000 full scans of a million-element
# list — re-measure before quoting concrete numbers.

Rule: If you're checking membership repeatedly, convert to a set first.


Modifying Sets

Python
drug_set = {"warfarin", "aspirin"}

# Growing the set
drug_set.add("metformin")                         # one element at a time
drug_set.update(("lisinopril", "atorvastatin"))   # every element of an iterable

# Shrinking the set
drug_set.remove("aspirin")       # strict: KeyError when the element is absent
drug_set.discard("ibuprofen")    # lenient: an absent element is simply ignored

# pop() removes one element and hands it back; which one is arbitrary
# because sets are unordered
popped = drug_set.pop()

# Drop whatever is left
drug_set.clear()

Set Operations

The most powerful feature of sets: mathematical set operations.

Python
patient_a_meds = {"warfarin", "aspirin", "lisinopril", "atorvastatin"}
patient_b_meds = {"warfarin", "metformin", "atorvastatin", "glipizide"}

# Union — every medication taken by either patient
all_meds = patient_a_meds.union(patient_b_meds)
all_meds = patient_a_meds | patient_b_meds            # operator spelling, same result
# {'warfarin', 'aspirin', 'lisinopril', 'atorvastatin', 'metformin', 'glipizide'}

# Intersection — medications the two patients have in common
shared = patient_a_meds.intersection(patient_b_meds)
shared = patient_a_meds & patient_b_meds              # operator spelling, same result
# {'warfarin', 'atorvastatin'}

# Difference — medications patient A takes that patient B does not
only_a = patient_a_meds.difference(patient_b_meds)
only_a = patient_a_meds - patient_b_meds              # operator spelling, same result
# {'aspirin', 'lisinopril'}

# Symmetric difference — medications taken by exactly one of the two patients
exclusive = patient_a_meds.symmetric_difference(patient_b_meds)
exclusive = patient_a_meds ^ patient_b_meds           # operator spelling, same result
# {'aspirin', 'lisinopril', 'metformin', 'glipizide'}

# Subset / superset / disjoint checks
formulary = {"warfarin", "aspirin", "metformin", "lisinopril", "atorvastatin", "glipizide"}
print(patient_a_meds <= formulary)    # True — same as patient_a_meds.issubset(formulary)
print(formulary >= patient_a_meds)    # True — same as formulary.issuperset(patient_a_meds)
print(patient_a_meds.isdisjoint({"penicillin", "sulfa"}))  # True — the two sets share nothing

frozenset: Immutable Sets

frozenset is the immutable version of set. It is hashable and can be used as a dictionary key or stored in a set.

Python
MAJOR_ANTICOAGULANTS = frozenset({"warfarin", "heparin", "apixaban", "rivaroxaban"})

# Cannot modify: frozenset has no mutating methods such as add().
# The call is wrapped in try/except so the rest of the example still runs;
# left bare, the AttributeError would abort the script at this line.
try:
    MAJOR_ANTICOAGULANTS.add("dabigatran")
except AttributeError as exc:
    print(f"Cannot modify: {exc}")   # 'frozenset' object has no attribute 'add'

# Can be used as a dict key (a regular set cannot, because it is unhashable)
protocol_by_drugs = {
    frozenset({"warfarin", "aspirin"}): "bleeding_risk_protocol",
    frozenset({"metformin", "contrast"}): "contrast_hold_protocol",
}

# Element order is irrelevant: equal frozensets hash equally, so this
# lookup finds the first entry above.
patient_drugs = frozenset({"warfarin", "aspirin"})
print(protocol_by_drugs.get(patient_drugs))   # bleeding_risk_protocol

# frozenset still supports every non-mutating set operation
# (each one returns a new frozenset)
print(MAJOR_ANTICOAGULANTS & frozenset({"warfarin", "metformin"}))
# frozenset({'warfarin'})

Practical Use Cases in AI Pipelines

Python
# 1. Deduplication in RAG pipelines
def deduplicate_sources(retrieved_docs: list[dict]) -> list[dict]:
    """Remove duplicate documents by source URL, keeping first occurrences.

    Args:
        retrieved_docs: Documents as dicts; the source URL is read from
            ``doc["metadata"]["source"]``.

    Returns:
        A new list in the original order containing the first document seen
        for each source. Documents with no usable source (missing or ``None``
        metadata, missing or empty ``source``) are always kept: with nothing
        to compare, they cannot be shown to be duplicates of one another.
    """
    seen_sources: set[str] = set()
    unique: list[dict] = []
    for doc in retrieved_docs:
        # `or {}` also covers an explicit `"metadata": None`
        source = (doc.get("metadata") or {}).get("source")
        if not source:
            # No source to key on — keep the document rather than lumping
            # every source-less document together under one empty key.
            unique.append(doc)
            continue
        if source in seen_sources:
            continue
        seen_sources.add(source)
        unique.append(doc)
    return unique


# 2. Vocabulary building for NLP
def build_vocabulary(texts: list[str], min_frequency: int = 2) -> set[str]:
    """Return the set of tokens that occur at least *min_frequency* times.

    Each text is lower-cased and split on whitespace; occurrences are
    counted across all texts together.
    """
    from collections import Counter

    frequencies = Counter(
        word for text in texts for word in text.lower().split()
    )
    vocabulary = set()
    for word, seen in frequencies.items():
        if seen >= min_frequency:
            vocabulary.add(word)
    return vocabulary


# 3. Reporting which required fields are absent
REQUIRED_FIELDS = {"drug_name", "dose_mg", "frequency", "route"}


def validate_prescription(prescription: dict) -> list[str]:
    """Return the names of required fields missing from *prescription*, sorted."""
    absent = REQUIRED_FIELDS.difference(prescription.keys())
    ordered = list(absent)
    ordered.sort()   # sets have no order, so sort for a repeatable result
    return ordered


prescription = {"drug_name": "warfarin", "dose_mg": 5}
print(validate_prescription(prescription))   # ['frequency', 'route']


# 4. Constant-time checks against fixed sets of allowed values
ALLOWED_ROUTES = frozenset({"PO", "IV", "IM", "SC", "SL", "IN"})
MAJOR_SEVERITIES = frozenset({"Major", "Critical"})


def needs_pharmacist_review(interaction_severity: str, route: str) -> bool:
    """Return True for a major/critical severity or a route that is not allowed."""
    if interaction_severity in MAJOR_SEVERITIES:
        return True
    return route not in ALLOWED_ROUTES


# 5. Skipping items a pipeline has already handled
processed_ids: set[str] = set()

# NOTE: `document_stream` and `process` are not defined in this snippet;
# they stand in for whatever the surrounding pipeline provides.
for doc_id, doc in document_stream:
    if doc_id not in processed_ids:
        process(doc)
        processed_ids.add(doc_id)   # recorded only after process() returns

Set Comprehensions

Python
# General form: {expression for item in iterable [if condition]}
texts = ["Warfarin 5mg", "Metformin 500mg", "Warfarin 10mg", "Aspirin 81mg"]

# First word of each entry, lower-cased; the set keeps one copy of each name
drug_names = {entry.split(maxsplit=1)[0].lower() for entry in texts}
# {'warfarin', 'metformin', 'aspirin'} — "warfarin" appears once despite two entries


# A trailing `if` filters elements while the set is being built
long_drug_names = {drug for drug in drug_names if len(drug) > 7}
# {'warfarin', 'metformin'}

Sets vs Lists vs Dicts

| Need | Use |
|---|---|
| Fast membership test (x in ...) | set |
| Deduplication | set |
| Set math (union, intersection) | set |
| Ordered collection with duplicates | list |
| Key-value pairs | dict |
| Immutable set (as dict key) | frozenset |
| Ordered unique collection | list(dict.fromkeys(items)) — preserves order |

Python
# Deduplicating with and without keeping the original order
drugs = ["warfarin", "aspirin", "warfarin", "metformin", "aspirin"]

# Via a set: fast, but the resulting order is arbitrary
deduped_unordered = {*drugs}

# Via dict keys: dicts remember insertion order (Python 3.7+), so the first
# occurrence of each drug keeps its position
deduped_ordered = [*dict.fromkeys(drugs)]
# ['warfarin', 'aspirin', 'metformin'] — insertion order preserved