Dictionary and Hashing Interview Problems
Common dictionary and hashing interview problems for AI engineers: frequency maps, grouping anagrams, LRU cache, top-K frequent elements, and two-sum variants.
Problem 1: Word Frequency Map
Problem: Given a list of words, return a dict mapping each word to its frequency, sorted by frequency descending.
from collections import Counter
def word_frequency(words: list[str]) -> dict[str, int]:
"""Return word ā count, sorted by count descending."""
counts = Counter(words)
return dict(counts.most_common()) # most_common() returns sorted list
# Token frequency analysis for LLM outputs
tokens = ["warfarin", "dose", "warfarin", "daily", "dose", "warfarin", "INR"]
freq = word_frequency(tokens)
print(freq)
# {"warfarin": 3, "dose": 2, "daily": 1, "INR": 1}
# Manual version (no Counter)
def word_frequency_manual(words: list[str]) -> dict[str, int]:
counts: dict[str, int] = {}
for word in words:
counts[word] = counts.get(word, 0) + 1
return dict(sorted(counts.items(), key=lambda x: x[1], reverse=True))Problem 2: Group Anagrams
Problem: Given a list of strings, group anagrams together.
from collections import defaultdict
def group_anagrams(words: list[str]) -> list[list[str]]:
"""
Group words that are anagrams of each other.
Key insight: anagrams share the same sorted characters.
O(n * k log k) where k is the max word length.
"""
groups: defaultdict[str, list[str]] = defaultdict(list)
for word in words:
key = "".join(sorted(word)) # Canonical form: sorted characters
groups[key].append(word)
return list(groups.values())
print(group_anagrams(["eat", "tea", "tan", "ate", "nat", "bat"]))
# [["eat", "tea", "ate"], ["tan", "nat"], ["bat"]]
# AI context: grouping drug names with same characters (e.g., brand vs generic)
drug_names = ["ASPIRIN", "ANISPIR", "WARFARIN", "RAINFRAW"]
groups = group_anagrams([d.lower() for d in drug_names])
print(groups)
# [["aspirin", "anispir"], ["warfarin", "rainfraw"]]Problem 3: LRU Cache
Problem: Implement a Least Recently Used (LRU) cache with O(1) get and put operations.
from collections import OrderedDict
class LRUCache:
"""
LRU cache using OrderedDict (maintains insertion order).
O(1) get and put.
AI context: caching LLM responses for repeated queries.
"""
def __init__(self, capacity: int):
self.capacity = capacity
self._cache: OrderedDict[str, str] = OrderedDict()
def get(self, key: str) -> str | None:
if key not in self._cache:
return None
self._cache.move_to_end(key) # Mark as most recently used
return self._cache[key]
def put(self, key: str, value: str) -> None:
if key in self._cache:
self._cache.move_to_end(key)
self._cache[key] = value
if len(self._cache) > self.capacity:
self._cache.popitem(last=False) # Evict least recently used (first item)
def __len__(self) -> int:
return len(self._cache)
cache = LRUCache(capacity=3)
cache.put("query_1", "response about warfarin")
cache.put("query_2", "response about aspirin")
cache.put("query_3", "response about metformin")
print(cache.get("query_1")) # "response about warfarin" ā moves to end
cache.put("query_4", "response about lisinopril") # Evicts query_2 (LRU)
print(cache.get("query_2")) # None ā evicted
print(cache.get("query_4")) # "response about lisinopril"Problem 4: Top-K Frequent Elements
Problem: Given an array of elements, return the k most frequent elements. If there is a tie in frequency, any order is valid.
import heapq
from collections import Counter
def top_k_frequent(items: list, k: int) -> list:
"""
Return the k most frequent elements.
O(n log k) using a heap ā better than O(n log n) full sort.
"""
counts = Counter(items)
# heapq.nlargest: returns k largest by the key function
return heapq.nlargest(k, counts.keys(), key=counts.get)
tokens = ["warfarin", "dose", "warfarin", "daily", "dose", "warfarin", "INR", "dose"]
print(top_k_frequent(tokens, k=2)) # ["warfarin", "dose"]
# Bucket sort variant: O(n) time
def top_k_frequent_bucket(items: list, k: int) -> list:
"""O(n) using bucket sort."""
counts = Counter(items)
n = len(items)
# buckets[i] = list of items that appear exactly i times
buckets: list[list] = [[] for _ in range(n + 1)]
for item, count in counts.items():
buckets[count].append(item)
result = []
for freq in range(n, 0, -1):
result.extend(buckets[freq])
if len(result) >= k:
break
return result[:k]
print(top_k_frequent_bucket(tokens, k=2)) # ["warfarin", "dose"]Problem 5: Subarray Sum Equals K
Problem: Given an array of integers and a target k, count the number of contiguous subarrays that sum to k.
def subarray_sum(nums: list[int], k: int) -> int:
"""
Count subarrays that sum to k.
O(n) using prefix sums + a hash map.
Key insight: if prefix_sum[j] - prefix_sum[i] = k,
then the subarray from i+1 to j sums to k.
"""
count = 0
prefix_sum = 0
seen: dict[int, int] = {0: 1} # prefix_sum ā count of times seen
for num in nums:
prefix_sum += num
# If (prefix_sum - k) was seen before, those earlier positions
# form subarrays with current position summing to k
count += seen.get(prefix_sum - k, 0)
seen[prefix_sum] = seen.get(prefix_sum, 0) + 1
return count
print(subarray_sum([1, 1, 1], 2)) # 2 ā ([1,1] at positions 0-1 and 1-2)
print(subarray_sum([1, 2, 3], 3)) # 2 ā ([3] and [1,2])
print(subarray_sum([1, -1, 1, -1, 1], 0)) # 4Problem 6: Longest Substring Without Repeating Characters
Problem: Given a string, find the length of the longest substring without repeating characters.
def length_of_longest_substring(s: str) -> int:
"""
Sliding window + hash map.
O(n) time, O(min(n, alphabet)) space.
AI context: finding the longest unique token sequence in LLM output.
"""
last_seen: dict[str, int] = {} # char ā last seen index
start = 0
max_len = 0
for end, char in enumerate(s):
if char in last_seen and last_seen[char] >= start:
# Shrink window: move start past the last occurrence of this char
start = last_seen[char] + 1
last_seen[char] = end
max_len = max(max_len, end - start + 1)
return max_len
print(length_of_longest_substring("abcabcbb")) # 3 ā "abc"
print(length_of_longest_substring("bbbbb")) # 1 ā "b"
print(length_of_longest_substring("pwwkew")) # 3 ā "wke"
print(length_of_longest_substring("")) # 0Problem 7: Four Sum Count
Problem: Given four lists A, B, C, D of integers, count tuples (i, j, k, l) such that A[i] + B[j] + C[k] + D[l] == 0.
from collections import Counter
def four_sum_count(A: list[int], B: list[int], C: list[int], D: list[int]) -> int:
"""
O(n²) using a 2+2 split approach.
Brute force would be O(nā“).
Phase 1: compute all A+B sums and count them.
Phase 2: for each C+D sum, check how many -(C+D) sums exist in Phase 1.
"""
ab_sums: Counter = Counter(a + b for a in A for b in B)
count = 0
for c in C:
for d in D:
count += ab_sums.get(-(c + d), 0)
return count
print(four_sum_count([1, 2], [-2, -1], [-1, 2], [0, 2])) # 2
# (1, -1, -1, 0) and (2, -2, -1, 2) ā both sum to 0... let me verify:
# A[0]+B[0]+C[0]+D[0] = 1 + (-2) + (-1) + 2 = 0 ā
# A[1]+B[1]+C[0]+D[0] = 2 + (-1) + (-1) + 0 = 0 āComplexity Summary
| Problem | Time | Space | Key Technique | |---|---|---|---| | Word Frequency | O(n) | O(n) | Counter / dict accumulation | | Group Anagrams | O(n·k log k) | O(n) | Sorted key as canonical form | | LRU Cache | O(1) get/put | O(capacity) | OrderedDict + move_to_end | | Top-K Frequent | O(n log k) | O(n) | Counter + heapq.nlargest | | Subarray Sum = K | O(n) | O(n) | Prefix sum + hash map | | Longest Unique Substring | O(n) | O(alphabet) | Sliding window + last-seen map | | Four Sum Count | O(n²) | O(n²) | 2+2 split with Counter |
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.