Learnixo

Python Essentials for AI Engineers · Lesson 34 of 36

Dictionary and Hashing Problems

Problem 1: Word Frequency Map

Problem: Given a list of words, return a dict mapping each word to its frequency, sorted by frequency descending.

Python
from collections import Counter

def word_frequency(words: list[str]) -> dict[str, int]:
    """Return word → count, sorted by count descending."""
    counts = Counter(words)
    return dict(counts.most_common())   # most_common() returns sorted list


# Token frequency analysis for LLM outputs
tokens = ["warfarin", "dose", "warfarin", "daily", "dose", "warfarin", "INR"]
freq = word_frequency(tokens)
print(freq)
# {"warfarin": 3, "dose": 2, "daily": 1, "INR": 1}


# Manual version (no Counter)
def word_frequency_manual(words: list[str]) -> dict[str, int]:
    counts: dict[str, int] = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return dict(sorted(counts.items(), key=lambda x: x[1], reverse=True))

Problem 2: Group Anagrams

Problem: Given a list of strings, group anagrams together.

Python
from collections import defaultdict

def group_anagrams(words: list[str]) -> list[list[str]]:
    """
    Group words that are anagrams of each other.
    Key insight: anagrams share the same sorted characters.
    O(n * k log k) where k is the max word length.
    """
    groups: defaultdict[str, list[str]] = defaultdict(list)

    for word in words:
        key = "".join(sorted(word))   # Canonical form: sorted characters
        groups[key].append(word)

    return list(groups.values())


print(group_anagrams(["eat", "tea", "tan", "ate", "nat", "bat"]))
# [["eat", "tea", "ate"], ["tan", "nat"], ["bat"]]


# AI context: grouping drug names with same characters (e.g., brand vs generic)
drug_names = ["ASPIRIN", "ANISPIR", "WARFARIN", "RAINFRAW"]
groups = group_anagrams([d.lower() for d in drug_names])
print(groups)
# [["aspirin", "anispir"], ["warfarin", "rainfraw"]]

Problem 3: LRU Cache

Problem: Implement a Least Recently Used (LRU) cache with O(1) get and put operations.

Python
from collections import OrderedDict

class LRUCache:
    """
    LRU cache using OrderedDict (maintains insertion order).
    O(1) get and put.
    
    AI context: caching LLM responses for repeated queries.
    """

    def __init__(self, capacity: int):
        self.capacity = capacity
        self._cache: OrderedDict[str, str] = OrderedDict()

    def get(self, key: str) -> str | None:
        if key not in self._cache:
            return None
        self._cache.move_to_end(key)   # Mark as most recently used
        return self._cache[key]

    def put(self, key: str, value: str) -> None:
        if key in self._cache:
            self._cache.move_to_end(key)
        self._cache[key] = value
        if len(self._cache) > self.capacity:
            self._cache.popitem(last=False)   # Evict least recently used (first item)

    def __len__(self) -> int:
        return len(self._cache)


cache = LRUCache(capacity=3)
cache.put("query_1", "response about warfarin")
cache.put("query_2", "response about aspirin")
cache.put("query_3", "response about metformin")

print(cache.get("query_1"))   # "response about warfarin"  moves to end
cache.put("query_4", "response about lisinopril")   # Evicts query_2 (LRU)

print(cache.get("query_2"))   # None  evicted
print(cache.get("query_4"))   # "response about lisinopril"

Problem 4: Top-K Frequent Elements

Problem: Given an array of elements, return the k most frequent elements. If there is a tie in frequency, any order is valid.

Python
import heapq
from collections import Counter

def top_k_frequent(items: list, k: int) -> list:
    """
    Return the k most frequent elements.
    O(n log k) using a heap — better than O(n log n) full sort.
    """
    counts = Counter(items)

    # heapq.nlargest: returns k largest by the key function
    return heapq.nlargest(k, counts.keys(), key=counts.get)


tokens = ["warfarin", "dose", "warfarin", "daily", "dose", "warfarin", "INR", "dose"]
print(top_k_frequent(tokens, k=2))   # ["warfarin", "dose"]


# Bucket sort variant: O(n) time
def top_k_frequent_bucket(items: list, k: int) -> list:
    """O(n) using bucket sort."""
    counts = Counter(items)
    n = len(items)

    # buckets[i] = list of items that appear exactly i times
    buckets: list[list] = [[] for _ in range(n + 1)]
    for item, count in counts.items():
        buckets[count].append(item)

    result = []
    for freq in range(n, 0, -1):
        result.extend(buckets[freq])
        if len(result) >= k:
            break

    return result[:k]


print(top_k_frequent_bucket(tokens, k=2))   # ["warfarin", "dose"]

Problem 5: Subarray Sum Equals K

Problem: Given an array of integers and a target k, count the number of contiguous subarrays that sum to k.

Python
def subarray_sum(nums: list[int], k: int) -> int:
    """
    Count subarrays that sum to k.
    O(n) using prefix sums + a hash map.
    
    Key insight: if prefix_sum[j] - prefix_sum[i] = k,
    then the subarray from i+1 to j sums to k.
    """
    count = 0
    prefix_sum = 0
    seen: dict[int, int] = {0: 1}   # prefix_sum  count of times seen

    for num in nums:
        prefix_sum += num
        # If (prefix_sum - k) was seen before, those earlier positions
        # form subarrays with current position summing to k
        count += seen.get(prefix_sum - k, 0)
        seen[prefix_sum] = seen.get(prefix_sum, 0) + 1

    return count


print(subarray_sum([1, 1, 1], 2))            # 2  ([1,1] at positions 0-1 and 1-2)
print(subarray_sum([1, 2, 3], 3))            # 2  ([3] and [1,2])
print(subarray_sum([1, -1, 1, -1, 1], 0))   # 4

Problem 6: Longest Substring Without Repeating Characters

Problem: Given a string, find the length of the longest substring without repeating characters.

Python
def length_of_longest_substring(s: str) -> int:
    """
    Sliding window + hash map.
    O(n) time, O(min(n, alphabet)) space.
    
    AI context: finding the longest unique token sequence in LLM output.
    """
    last_seen: dict[str, int] = {}   # char  last seen index
    start = 0
    max_len = 0

    for end, char in enumerate(s):
        if char in last_seen and last_seen[char] >= start:
            # Shrink window: move start past the last occurrence of this char
            start = last_seen[char] + 1

        last_seen[char] = end
        max_len = max(max_len, end - start + 1)

    return max_len


print(length_of_longest_substring("abcabcbb"))   # 3  "abc"
print(length_of_longest_substring("bbbbb"))       # 1  "b"
print(length_of_longest_substring("pwwkew"))      # 3  "wke"
print(length_of_longest_substring(""))             # 0

Problem 7: Four Sum Count

Problem: Given four lists A, B, C, D of integers, count tuples (i, j, k, l) such that A[i] + B[j] + C[k] + D[l] == 0.

Python
from collections import Counter

def four_sum_count(A: list[int], B: list[int], C: list[int], D: list[int]) -> int:
    """
    O(n²) using a 2+2 split approach.
    Brute force would be O(n⁴).
    
    Phase 1: compute all A+B sums and count them.
    Phase 2: for each C+D sum, check how many -(C+D) sums exist in Phase 1.
    """
    ab_sums: Counter = Counter(a + b for a in A for b in B)

    count = 0
    for c in C:
        for d in D:
            count += ab_sums.get(-(c + d), 0)

    return count


print(four_sum_count([1, 2], [-2, -1], [-1, 2], [0, 2]))   # 2
# (1, -1, -1, 0) and (2, -2, -1, 2)  both sum to 0... let me verify:
# A[0]+B[0]+C[0]+D[0] = 1 + (-2) + (-1) + 2 = 0 
# A[1]+B[1]+C[0]+D[0] = 2 + (-1) + (-1) + 0 = 0 

Complexity Summary

| Problem | Time | Space | Key Technique | |---|---|---|---| | Word Frequency | O(n) | O(n) | Counter / dict accumulation | | Group Anagrams | O(n·k log k) | O(n) | Sorted key as canonical form | | LRU Cache | O(1) get/put | O(capacity) | OrderedDict + move_to_end | | Top-K Frequent | O(n log k) | O(n) | Counter + heapq.nlargest | | Subarray Sum = K | O(n) | O(n) | Prefix sum + hash map | | Longest Unique Substring | O(n) | O(alphabet) | Sliding window + last-seen map | | Four Sum Count | O(n²) | O(n²) | 2+2 split with Counter |