Live Coding Interview Prep · Lesson 2 of 16
Arrays and Dicts: Problems AI Interviewers Love
The Foundation of Every Interview
Arrays and hash maps solve roughly 40% of all LeetCode-style interview problems. In AI engineering interviews, the same patterns appear — but dressed in domain-specific clothing. "Two Sum" becomes "find two document IDs whose combined score equals a budget." Frequency counting becomes "find the most common tokens in a training corpus."
This lesson teaches the patterns, then shows you how they map to real AI problems.
Two Sum — The Archetype
Problem: Given a list of integers and a target, return the indices of two numbers that add up to the target.
Naive Approach — O(n²)
def two_sum_naive(nums: list[int], target: int) -> list[int]:
"""Brute force: check every pair. O(n^2) time, O(1) space."""
n = len(nums)
for i in range(n):
for j in range(i + 1, n):
if nums[i] + nums[j] == target:
return [i, j]
return []
# This is always the wrong answer in an interview — say it, then immediately move onOptimal Approach — O(n)
def two_sum(nums: list[int], target: int) -> list[int]:
"""
Hash map: for each number, check if its complement is already seen.
O(n) time, O(n) space.
Key insight: instead of looking forward for a pair,
look backward in a hash map of already-seen values.
"""
seen: dict[int, int] = {} # value → index
for i, num in enumerate(nums):
complement = target - num
if complement in seen:
return [seen[complement], i]
seen[num] = i
return []
# Test cases — always write these in an interview
assert two_sum([2, 7, 11, 15], 9) == [0, 1]
assert two_sum([3, 2, 4], 6) == [1, 2]
assert two_sum([3, 3], 6) == [0, 1]
assert two_sum([1], 2) == []
print("All tests passed")Interview commentary: "I recognize this as the classic complement-lookup pattern. For each element, I need its complement. If I store previously seen elements in a hash map, I can check in O(1). Total time is O(n), space is O(n)."
AI Context: Deduplicating Documents by Hash
The same complement-lookup pattern applies to document deduplication.
import hashlib
from dataclasses import dataclass
@dataclass
class Document:
doc_id: str
text: str
def deduplicate_documents(documents: list[Document]) -> list[Document]:
"""
Remove exact duplicate documents using content hashing.
O(n * L) time where L = avg document length (for hashing)
O(n) space for the hash set
"""
seen_hashes: set[str] = set()
unique_docs: list[Document] = []
for doc in documents:
# MD5 is fast and good enough for dedup (not security use)
content_hash = hashlib.md5(doc.text.encode()).hexdigest()
if content_hash not in seen_hashes:
seen_hashes.add(content_hash)
unique_docs.append(doc)
duplicates_removed = len(documents) - len(unique_docs)
print(f"Removed {duplicates_removed} duplicates from {len(documents)} documents")
return unique_docs
# Fuzzy dedup: find near-duplicates using MinHash (simplified)
def shingling(text: str, k: int = 3) -> set[str]:
"""Create k-shingles (k-grams of words) for a document."""
words = text.lower().split()
if len(words) < k:
return {" ".join(words)}
return {" ".join(words[i:i+k]) for i in range(len(words) - k + 1)}
def jaccard_similarity(set_a: set, set_b: set) -> float:
"""Jaccard similarity between two sets: |A ∩ B| / |A ∪ B|"""
if not set_a and not set_b:
return 1.0
intersection = len(set_a & set_b)
union = len(set_a | set_b)
return intersection / union
def find_near_duplicates(
documents: list[Document],
threshold: float = 0.8,
shingle_k: int = 3,
) -> list[tuple[str, str, float]]:
"""
Brute-force near-duplicate detection using Jaccard similarity.
O(n^2 * shingle_size) — use LSH for production at scale.
"""
shingles = [(doc.doc_id, shingling(doc.text, shingle_k)) for doc in documents]
near_dups = []
n = len(shingles)
for i in range(n):
for j in range(i + 1, n):
sim = jaccard_similarity(shingles[i][1], shingles[j][1])
if sim >= threshold:
near_dups.append((shingles[i][0], shingles[j][0], sim))
return near_dupsFrequency Counting — The AI Engineer's Bread and Butter
Python dict vs defaultdict vs Counter
Understanding the right tool is itself an interview signal.
from collections import defaultdict, Counter
corpus = [
"the cat sat on the mat",
"the dog sat on the log",
"the cat and the dog",
]
# Method 1: plain dict — verbose, error-prone
def count_tokens_dict(corpus: list[str]) -> dict[str, int]:
counts: dict[str, int] = {}
for doc in corpus:
for token in doc.split():
if token in counts:
counts[token] += 1
else:
counts[token] = 0 + 1 # common mistake: forgetting initialization
return counts
# Method 2: defaultdict — cleaner, same performance
def count_tokens_defaultdict(corpus: list[str]) -> dict[str, int]:
counts: defaultdict[str, int] = defaultdict(int)
for doc in corpus:
for token in doc.split():
counts[token] += 1 # no KeyError, auto-initializes to 0
return dict(counts)
# Method 3: Counter — idiomatic Python, most readable
def count_tokens_counter(corpus: list[str]) -> Counter:
all_tokens = []
for doc in corpus:
all_tokens.extend(doc.split())
return Counter(all_tokens)
# Counter also gives you most_common() for free
token_counts = count_tokens_counter(corpus)
print("Top 5 tokens:", token_counts.most_common(5))
# Top 5 tokens: [('the', 5), ('cat', 2), ('sat', 2), ('on', 2), ('dog', 2)]
# Counter arithmetic — useful for vocabulary operations
doc1_counts = Counter("the cat sat on the mat".split())
doc2_counts = Counter("the dog sat on the log".split())
union = doc1_counts | doc2_counts # max of each count
intersection = doc1_counts & doc2_counts # min of each count
print("Shared tokens:", dict(intersection))Practice Problem: K Most Frequent Tokens
Problem: Given a list of documents, return the k tokens with the highest total frequency across the entire corpus. Handle ties by returning lexicographically smaller tokens first.
from collections import Counter
import heapq
def k_most_frequent_tokens(
documents: list[str],
k: int,
stop_words: set[str] | None = None,
) -> list[tuple[str, int]]:
"""
Find k most frequent tokens in a corpus.
Approach 1: Counter + most_common()
Time: O(n * avg_doc_len) to count + O(V log V) to sort vocab V
Space: O(V) for counter
For large vocabularies where k << V, a heap is more efficient.
"""
if stop_words is None:
stop_words = set()
# Step 1: count all tokens — O(n * avg_doc_len)
counts: Counter[str] = Counter()
for doc in documents:
tokens = doc.lower().split()
for token in tokens:
# Strip basic punctuation
token = token.strip(".,!?;:\"'()[]")
if token and token not in stop_words:
counts[token] += 1
# Step 2a: most_common gives top k — O(V) with a heap internally
return counts.most_common(k)
def k_most_frequent_tokens_heap(
documents: list[str],
k: int,
) -> list[tuple[str, int]]:
"""
Heap-based approach: O(V log k) which is better when k << V.
Uses a min-heap of size k.
"""
counts: Counter[str] = Counter()
for doc in documents:
for token in doc.lower().split():
counts[token] += 1
# Min-heap of (count, token): O(V log k)
# We negate count to simulate max-heap behavior
heap: list[tuple[int, str]] = []
for token, count in counts.items():
heapq.heappush(heap, (count, token))
if len(heap) > k:
heapq.heappop(heap) # remove smallest
# Sort descending by count, then lexicographically for ties
result = sorted(heap, key=lambda x: (-x[0], x[1]))
return [(token, count) for count, token in result]
# Test
docs = [
"the quick brown fox jumps over the lazy dog",
"the fox ran quickly through the forest",
"a quick brown dog outpaced a lazy fox",
"the dog slept all day in the forest",
]
print("Top 5 tokens (Counter approach):")
for token, count in k_most_frequent_tokens(docs, k=5):
print(f" {token}: {count}")
print("\nTop 5 tokens (Heap approach):")
for token, count in k_most_frequent_tokens_heap(docs, k=5):
print(f" {token}: {count}")
# Both should produce:
# the: 8
# fox: 3
# quick: 2 (or brown/dog/lazy depending on tie-breaking)Sliding Window on Arrays
The sliding window pattern avoids recomputing work by maintaining a running state as a window moves.
def max_sum_subarray(nums: list[int], k: int) -> int:
"""
Find maximum sum of any k consecutive elements.
O(n) with sliding window vs O(n*k) with nested loops.
"""
if len(nums) < k:
return 0
# Initialize first window
window_sum = sum(nums[:k]) # O(k)
max_sum = window_sum
# Slide: add right element, remove left element
for i in range(k, len(nums)):
window_sum += nums[i] - nums[i - k] # O(1) update
max_sum = max(max_sum, window_sum)
return max_sum
# AI application: find the window of k tokens with highest average TF-IDF score
def highest_scoring_window(
tfidf_scores: list[float],
window_size: int,
) -> tuple[int, int, float]:
"""
Given per-token TF-IDF scores, find the window of window_size tokens
with the highest total score. Returns (start, end, score).
Application: identify the most relevant passage in a long document.
"""
n = len(tfidf_scores)
if n < window_size:
return (0, n, sum(tfidf_scores))
window_sum = sum(tfidf_scores[:window_size])
best_start = 0
best_sum = window_sum
for i in range(window_size, n):
window_sum += tfidf_scores[i] - tfidf_scores[i - window_size]
if window_sum > best_sum:
best_sum = window_sum
best_start = i - window_size + 1
return (best_start, best_start + window_size, best_sum)
# Test
scores = [0.1, 0.5, 0.9, 0.8, 0.2, 0.1, 0.7, 0.6, 0.3]
start, end, total = highest_scoring_window(scores, window_size=3)
print(f"Best window: [{start}:{end}], total TF-IDF = {total:.2f}")
# Best window: [1:4], total TF-IDF = 2.20Two-Pointer Technique
Two pointers solve problems where you need pairs satisfying a condition in a sorted structure.
def find_document_pairs_by_combined_score(
scores: list[float],
target: float,
tolerance: float = 0.01,
) -> list[tuple[int, int]]:
"""
Find all pairs of documents whose combined relevance score is close to target.
Assumes scores is sorted in ascending order.
O(n) with two pointers vs O(n^2) with nested loops.
Application: find complementary document pairs that together cover a topic.
"""
left, right = 0, len(scores) - 1
pairs = []
while left < right:
combined = scores[left] + scores[right]
if abs(combined - target) <= tolerance:
pairs.append((left, right))
left += 1
right -= 1
elif combined < target:
left += 1 # need higher sum, move left pointer right
else:
right -= 1 # need lower sum, move right pointer left
return pairs
# Test
doc_scores = [0.1, 0.3, 0.5, 0.6, 0.7, 0.9]
pairs = find_document_pairs_by_combined_score(doc_scores, target=1.0, tolerance=0.05)
print("Document pairs with combined score ~1.0:", pairs)
# [(1, 5), (2, 4)] → scores 0.3+0.9=1.2? Let's check tolerance
# Actually: 0.1+0.9=1.0 ✓, 0.3+0.7=1.0 ✓, 0.5+...no exact match within toleranceInterview Patterns Summary
| Pattern | Key Idea | Time | Space | AI Use Case | |---------|----------|------|-------|-------------| | Hash map lookup | Store seen values | O(n) | O(n) | Dedup, vocabulary | | Counter | Frequency table | O(n) | O(V) | Token frequency | | Sliding window | Running state | O(n) | O(1) | Passage scoring | | Two pointers | Sort + converge | O(n) | O(1) | Pair matching | | Top-k heap | Min-heap of size k | O(n log k) | O(k) | Top tokens |
Common Mistakes to Avoid
# Mistake 1: modifying a list while iterating
tokens = ["the", "cat", "the", "mat"]
# BAD:
# for t in tokens:
# if t == "the":
# tokens.remove(t) # skips elements!
# GOOD:
tokens = [t for t in tokens if t != "the"]
# Mistake 2: using a list for O(1) lookups
stop_words_list = ["the", "a", "an", "is"] # O(n) lookup!
stop_words_set = {"the", "a", "an", "is"} # O(1) lookup — always use set
# Mistake 3: Counter vs dict for existence check
counts = Counter(["a", "b", "a"])
# Accessing missing key returns 0 (not KeyError) — be aware
print(counts["z"]) # 0, not KeyError — Counter is safe
# Mistake 4: forgetting that dict preserves insertion order (Python 3.7+)
# If you need sorted order, sort explicitly — don't assume insertion order = sorted order