Live Coding Interview Prep · Lesson 16 of 16
Full 45-Minute Mock Live Coding Session
How to Use This
Work through each problem as if in an interview. Read the problem, spend 2–3 minutes planning, then implement. Compare against the ideal approach afterward.
Problem 1: Simple Token Counter (10 min)
Prompt: "Implement a function that counts the approximate number of tokens in a string. Assume 1 token ≈ 4 characters for English text. Add a method to check if a text fits within a model's context limit."
Interviewer is looking for: Basic implementation, clear thinking about edge cases, understanding of tokenization approximation.
# Your implementation here
Ideal solution:
class TokenCounter:
    """Approximate token counter based on a fixed characters-per-token ratio.

    The estimate assumes roughly four characters per token, a rule of thumb
    for English text. It is not an exact tokenizer count.
    """

    CHARS_PER_TOKEN = 4  # Approximation for English

    def count_tokens(self, text: str) -> int:
        """Return the approximate token count of ``text``.

        Empty text counts as 0 tokens; any non-empty text counts as at least
        1 token, even when it is shorter than ``CHARS_PER_TOKEN``.
        Not exact — use tiktoken for GPT models.
        """
        if not text:
            return 0
        return max(1, len(text) // self.CHARS_PER_TOKEN)

    def fits_in_context(
        self,
        text: str,
        max_tokens: int,
        reserved_for_response: int = 512,
    ) -> bool:
        """Check if text fits within a context limit, reserving space for response.

        The budget is ``max_tokens - reserved_for_response``. When the
        reservation leaves no budget, non-empty text never fits.
        """
        available = max_tokens - reserved_for_response
        return self.count_tokens(text) <= available

    def truncate_to_fit(self, text: str, max_tokens: int) -> str:
        """Truncate text to fit within token limit.

        The cut is moved back to the last space so a word is not split,
        unless the truncated text contains no usable space. A non-positive
        ``max_tokens`` yields an empty string.
        """
        if max_tokens <= 0:
            # A negative bound would otherwise slice from the END of the
            # string and return most of the text.
            return ""
        max_chars = max_tokens * self.CHARS_PER_TOKEN
        if len(text) <= max_chars:
            return text
        truncated = text[:max_chars]
        if text[max_chars] == " ":
            # The cut already falls on a word boundary; keep the last word.
            return truncated
        # Truncate at word boundary
        last_space = truncated.rfind(" ")
        return truncated[:last_space] if last_space > 0 else truncated
counter = TokenCounter()
text = "Metformin is a biguanide antidiabetic medication used to treat type 2 diabetes."
print(counter.count_tokens(text))  # ~19 tokens
# fits_in_context subtracts the default 512-token response reserve from
# max_tokens, so the limit must exceed 512 before non-empty text can fit.
print(counter.fits_in_context(text, max_tokens=4096))  # True

# Common mistakes:
- Returning 0 for single-character strings
- Forgetting to reserve space for the model's response
- Integer division without handling empty string
Follow-up: "How would you make this exact instead of approximate?" → Use the tiktoken library: len(tiktoken.encoding_for_model('gpt-4o').encode(text)) gives the exact token count (encode() returns the list of token ids).
Problem 2: Embedding Cache (20 min)
Prompt: "Implement a cache for text embeddings. The cache should: (1) avoid recomputing embeddings for the same text, (2) have a configurable max size, (3) evict the least recently used entry when full."
Interviewer is looking for: LRU cache knowledge, thread safety awareness, correct use of data structures.
# Your implementation here
Ideal solution:
from collections import OrderedDict
import hashlib
import numpy as np
class EmbeddingCache:
def __init__(self, max_size: int = 1000):
self.max_size = max_size
self._cache: OrderedDict[str, np.ndarray] = OrderedDict()
self._hits = 0
self._misses = 0
def _key(self, text: str) -> str:
return hashlib.sha256(text.encode()).hexdigest()
def get(self, text: str) -> np.ndarray | None:
key = self._key(text)
if key in self._cache:
self._cache.move_to_end(key) # Mark as recently used
self._hits += 1
return self._cache[key]
self._misses += 1
return None
def set(self, text: str, embedding: np.ndarray):
key = self._key(text)
if key in self._cache:
self._cache.move_to_end(key)
self._cache[key] = embedding
if len(self._cache) > self.max_size:
self._cache.popitem(last=False) # Remove LRU
@property
def hit_rate(self) -> float:
total = self._hits + self._misses
return self._hits / total if total > 0 else 0.0
def cached_embed(self, text: str, embed_fn) -> np.ndarray:
"""Get embedding from cache or compute and cache it."""
cached = self.get(text)
if cached is not None:
return cached
embedding = embed_fn(text)
self.set(text, embedding)
return embeddingCommon mistakes:
- Using a plain dict instead of OrderedDict (dicts are unordered before Python 3.7; from 3.7 on they keep insertion order but have no move_to_end() or popitem(last=False), so recency can only be tracked by deleting and re-inserting entries)
- Not hashing the key (storing raw text strings wastes memory)
- Forgetting to call move_to_end on cache hit (this is what makes LRU work)
Follow-up: "How would you make this thread-safe?" → Add threading.Lock() and acquire it in get() and set().
Problem 3: Context Window Manager (25 min)
Prompt: "Build a context window manager for a multi-turn chatbot. It should: keep track of conversation messages, fit within a token limit, and automatically drop the oldest messages (but keep the system prompt) when the limit is exceeded."
Ideal solution:
from dataclasses import dataclass, field
@dataclass
class Message:
role: str # "system", "user", "assistant"
content: str
@property
def token_count(self) -> int:
return max(1, len(self.content) // 4) # Approximate
class ContextWindowManager:
def __init__(
self,
max_tokens: int = 8192,
reserved_for_response: int = 1024,
):
self.max_tokens = max_tokens
self.reserved = reserved_for_response
self.available = max_tokens - reserved_for_response
self.messages: list[Message] = []
@property
def system_message(self) -> Message | None:
if self.messages and self.messages[0].role == "system":
return self.messages[0]
return None
@property
def token_count(self) -> int:
return sum(m.token_count for m in self.messages)
def add(self, role: str, content: str):
msg = Message(role=role, content=content)
if msg.token_count > self.available:
# Truncate this message to fit
max_chars = self.available * 4
content = content[:max_chars]
msg = Message(role=role, content=content)
self.messages.append(msg)
self._trim()
def _trim(self):
"""Remove oldest non-system messages until within token limit."""
while self.token_count > self.available and len(self.messages) > 1:
# Find oldest non-system message
for i, msg in enumerate(self.messages):
if msg.role != "system":
self.messages.pop(i)
break
else:
break # Only system message left
def get_messages(self) -> list[dict]:
return [{"role": m.role, "content": m.content} for m in self.messages]
# Test
# 200-token window with 50 reserved for the response: 150 tokens of history.
mgr = ContextWindowManager(max_tokens=200, reserved_for_response=50)
mgr.add("system", "You are a clinical pharmacology assistant.")
mgr.add("user", "What is metformin?")
mgr.add("assistant", "Metformin is a biguanide antidiabetic drug that reduces hepatic glucose production.")
mgr.add("user", "What are its side effects?")
mgr.add("assistant", "Common side effects include GI symptoms: nausea, diarrhea, and abdominal discomfort.")
mgr.add("user", "Is it safe in kidney disease?")
print(f"Messages: {len(mgr.messages)}")
print(f"Total tokens: {mgr.token_count}")
# Show each retained message, clipped to its first 60 characters.
for m in mgr.get_messages():
    print(f" [{m['role']}]: {m['content'][:60]}")

# Common mistakes:
- Dropping the system prompt during trimming
- Not checking token count before appending (can still overflow)
- Off-by-one errors in trimming logic
Problem 4: Simple RAG Pipeline (30 min)
Prompt: "Implement a minimal RAG pipeline. It should: (1) index documents using a simple vector store, (2) retrieve top-k relevant documents for a query, (3) generate an answer using the retrieved context."
Ideal solution:
import numpy as np
from openai import OpenAI
client = OpenAI()
class MinimalRAG:
    """Minimal retrieval-augmented generation pipeline.

    Documents are embedded and held in memory. A query is answered by
    retrieving the ``k`` most cosine-similar documents and passing them as
    context to a chat model. Uses the module-level ``client``.
    """

    def __init__(self, k: int = 3):
        """Create an empty index that retrieves up to ``k`` documents per query."""
        self.k = k
        self.documents: list[str] = []
        self.embeddings: list[np.ndarray] = []

    def _embed(self, text: str) -> np.ndarray:
        """Return the embedding of ``text`` as a NumPy array."""
        resp = client.embeddings.create(input=text, model="text-embedding-3-small")
        return np.array(resp.data[0].embedding)

    def index(self, documents: list[str]):
        """Embed and store all documents.

        Replaces any previously indexed documents. The input is copied so
        later changes to the caller's list cannot desynchronise documents
        from embeddings. State is only updated once every document has been
        embedded, so a failure midway leaves the previous index intact.
        """
        docs = list(documents)
        vectors = [self._embed(doc) for doc in docs]
        self.documents = docs
        self.embeddings = vectors
        print(f"Indexed {len(docs)} documents")

    def retrieve(self, query: str) -> list[str]:
        """Find top-k relevant documents.

        Returns at most ``k`` documents ordered from most to least similar
        by cosine similarity. Returns an empty list when nothing is indexed
        or ``k`` is not positive.
        """
        # np.stack([]) raises, and a [-0:] slice would return everything.
        if not self.embeddings or self.k <= 0:
            return []
        query_emb = self._embed(query)
        # The epsilon guards against division by zero for all-zero vectors.
        query_norm = query_emb / (np.linalg.norm(query_emb) + 1e-10)
        corpus = np.stack(self.embeddings)
        norms = np.linalg.norm(corpus, axis=1, keepdims=True)
        corpus_norm = corpus / (norms + 1e-10)
        similarities = corpus_norm @ query_norm
        # Descending order; slicing clamps when k exceeds the corpus size.
        top_k_idx = np.argsort(similarities)[::-1][: self.k]
        return [self.documents[i] for i in top_k_idx]

    def answer(self, query: str) -> str:
        """Retrieve context and generate answer."""
        context_docs = self.retrieve(query)
        context = "\n\n".join(f"- {doc}" for doc in context_docs)
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "Answer questions using ONLY the provided context. If the context doesn't contain the answer, say so.",
                },
                {
                    "role": "user",
                    "content": f"Context:\n{context}\n\nQuestion: {query}",
                },
            ],
            temperature=0.1,
        )
        return resp.choices[0].message.content
# Test (without actual API calls in this demo)
rag = MinimalRAG(k=2)
docs = [
    "Metformin inhibits hepatic gluconeogenesis via AMPK activation.",
    "Warfarin inhibits vitamin K epoxide reductase, reducing clotting factors.",
    "Ibuprofen is an NSAID that inhibits COX-1 and COX-2.",
]
# Left commented out because these two calls go through the API client:
# rag.index(docs)
# answer = rag.answer("How does metformin work?")

# Common mistakes:
- Not normalizing embeddings before cosine similarity
- Forgetting to handle the case where k is larger than the corpus
- System prompt that allows the model to use its parametric knowledge (defeats the purpose of RAG)
Follow-up: "How would you improve this for production?" → Vector database for scale, chunk documents rather than using full texts, reranking after retrieval, RAGAS evaluation, caching embeddings.