Mock Live Coding Interview
Full mock live coding interview for AI engineers: 4 problems with interviewer notes, expected approach, common mistakes, and follow-up questions.
How to Use This
Work through each problem as if in an interview. Read the problem, spend 2–3 minutes planning, then implement. Compare against the ideal approach afterward.
Problem 1: Simple Token Counter (10 min)
Prompt: "Implement a function that counts the approximate number of tokens in a string. Assume 1 token ≈ 4 characters for English text. Add a method to check if a text fits within a model's context limit."
Interviewer is looking for: Basic implementation, clear thinking about edge cases, understanding of tokenization approximation.
# Your implementation here
Ideal solution:
class TokenCounter:
    """Estimate token counts with a fixed characters-per-token heuristic.

    The numbers are approximations (roughly four characters per token for
    English text), not the output of a real tokenizer.
    """

    CHARS_PER_TOKEN = 4  # Approximation for English

    def count_tokens(self, text: str) -> int:
        """Approximate token count. Not exact — use tiktoken for GPT models.

        Returns 0 for empty input and at least 1 for any non-empty input, so
        strings shorter than ``CHARS_PER_TOKEN`` are not rounded down to zero.
        """
        if not text:
            return 0
        return max(1, len(text) // self.CHARS_PER_TOKEN)

    def fits_in_context(
        self,
        text: str,
        max_tokens: int,
        reserved_for_response: int = 512,
    ) -> bool:
        """Check if text fits within a context limit, reserving space for response.

        Args:
            text: Prompt text to measure.
            max_tokens: Total context size of the model.
            reserved_for_response: Tokens held back for the model's reply.

        Returns:
            True when the estimated token count is at most
            ``max_tokens - reserved_for_response``.
        """
        available = max_tokens - reserved_for_response
        return self.count_tokens(text) <= available

    def truncate_to_fit(self, text: str, max_tokens: int) -> str:
        """Truncate text to fit within token limit, preferring a word boundary.

        Args:
            text: Text to shorten.
            max_tokens: Token budget; zero or negative budgets yield ``""``.

        Returns:
            ``text`` unchanged when it already fits, otherwise a prefix of at
            most ``max_tokens * CHARS_PER_TOKEN`` characters, cut back to the
            last space when the cut would otherwise split a word.
        """
        max_chars = max_tokens * self.CHARS_PER_TOKEN
        if max_chars <= 0:
            # A negative slice bound would count from the END of the string
            # and return most of the text instead of nothing.
            return ""
        if len(text) <= max_chars:
            return text

        truncated = text[:max_chars]
        # The cut already sits exactly at the end of a word: keep that word
        # instead of backing up to the previous space.
        if text[max_chars] == " " and not truncated.endswith(" "):
            return truncated

        # Otherwise drop the partial trailing word, if there is a space to cut at.
        last_space = truncated.rfind(" ")
        return truncated[:last_space] if last_space > 0 else truncated
# Demo: estimate the token count of a short clinical sentence.
text = "Metformin is a biguanide antidiabetic medication used to treat type 2 diabetes."
counter = TokenCounter()
estimated_tokens = counter.count_tokens(text)
print(estimated_tokens)  # 79 characters // 4 -> 19 tokens
print(counter.fits_in_context(text, max_tokens=128)) # True
Common mistakes:
- Returning 0 for single-character strings
- Forgetting to reserve space for the model's response
- Integer division without handling empty string
Follow-up: "How would you make this exact instead of approximate?" → Use the tiktoken library: len(tiktoken.encoding_for_model('gpt-4o').encode(text)) gives the exact token count (encode() returns the list of token ids).
Problem 2: Embedding Cache (20 min)
Prompt: "Implement a cache for text embeddings. The cache should: (1) avoid recomputing embeddings for the same text, (2) have a configurable max size, (3) evict the least recently used entry when full."
Interviewer is looking for: LRU cache knowledge, thread safety awareness, correct use of data structures.
# Your implementation here
Ideal solution:
from collections import OrderedDict
import hashlib
import numpy as np
class EmbeddingCache:
def __init__(self, max_size: int = 1000):
self.max_size = max_size
self._cache: OrderedDict[str, np.ndarray] = OrderedDict()
self._hits = 0
self._misses = 0
def _key(self, text: str) -> str:
return hashlib.sha256(text.encode()).hexdigest()
def get(self, text: str) -> np.ndarray | None:
key = self._key(text)
if key in self._cache:
self._cache.move_to_end(key) # Mark as recently used
self._hits += 1
return self._cache[key]
self._misses += 1
return None
def set(self, text: str, embedding: np.ndarray):
key = self._key(text)
if key in self._cache:
self._cache.move_to_end(key)
self._cache[key] = embedding
if len(self._cache) > self.max_size:
self._cache.popitem(last=False) # Remove LRU
@property
def hit_rate(self) -> float:
total = self._hits + self._misses
return self._hits / total if total > 0 else 0.0
def cached_embed(self, text: str, embed_fn) -> np.ndarray:
"""Get embedding from cache or compute and cache it."""
cached = self.get(text)
if cached is not None:
return cached
embedding = embed_fn(text)
self.set(text, embedding)
        return embedding
Common mistakes:
- Using a plain dict instead of OrderedDict (before Python 3.7 a dict does not guarantee any order; from 3.7 on it keeps insertion order but has no move_to_end, so recency can only be tracked by deleting and re-inserting entries)
- Not hashing the key (storing raw text strings wastes memory)
- Forgetting to call move_to_end on cache hit (this is what makes LRU work)
Follow-up: "How would you make this thread-safe?" → Add threading.Lock() and acquire it in get() and set().
Problem 3: Context Window Manager (25 min)
Prompt: "Build a context window manager for a multi-turn chatbot. It should: keep track of conversation messages, fit within a token limit, and automatically drop the oldest messages (but keep the system prompt) when the limit is exceeded."
Ideal solution:
from dataclasses import dataclass, field
@dataclass
class Message:
role: str # "system", "user", "assistant"
content: str
@property
def token_count(self) -> int:
return max(1, len(self.content) // 4) # Approximate
class ContextWindowManager:
def __init__(
self,
max_tokens: int = 8192,
reserved_for_response: int = 1024,
):
self.max_tokens = max_tokens
self.reserved = reserved_for_response
self.available = max_tokens - reserved_for_response
self.messages: list[Message] = []
@property
def system_message(self) -> Message | None:
if self.messages and self.messages[0].role == "system":
return self.messages[0]
return None
@property
def token_count(self) -> int:
return sum(m.token_count for m in self.messages)
def add(self, role: str, content: str):
msg = Message(role=role, content=content)
if msg.token_count > self.available:
# Truncate this message to fit
max_chars = self.available * 4
content = content[:max_chars]
msg = Message(role=role, content=content)
self.messages.append(msg)
self._trim()
def _trim(self):
"""Remove oldest non-system messages until within token limit."""
while self.token_count > self.available and len(self.messages) > 1:
# Find oldest non-system message
for i, msg in enumerate(self.messages):
if msg.role != "system":
self.messages.pop(i)
break
else:
break # Only system message left
def get_messages(self) -> list[dict]:
return [{"role": m.role, "content": m.content} for m in self.messages]
# Demo: replay a short conversation through a deliberately small window
# (200 total tokens, 50 of them reserved for the reply).
mgr = ContextWindowManager(max_tokens=200, reserved_for_response=50)
conversation = [
    ("system", "You are a clinical pharmacology assistant."),
    ("user", "What is metformin?"),
    ("assistant", "Metformin is a biguanide antidiabetic drug that reduces hepatic glucose production."),
    ("user", "What are its side effects?"),
    ("assistant", "Common side effects include GI symptoms: nausea, diarrhea, and abdominal discomfort."),
    ("user", "Is it safe in kidney disease?"),
]
for role, content in conversation:
    mgr.add(role, content)
message_total = len(mgr.messages)
print(f"Messages: {message_total}")
token_total = mgr.token_count
print(f"Total tokens: {token_total}")
for m in mgr.get_messages():
    print(f" [{m['role']}]: {m['content'][:60]}")
Common mistakes:
- Dropping the system prompt during trimming
- Not checking token count before appending (can still overflow)
- Off-by-one errors in trimming logic
Problem 4: Simple RAG Pipeline (30 min)
Prompt: "Implement a minimal RAG pipeline. It should: (1) index documents using a simple vector store, (2) retrieve top-k relevant documents for a query, (3) generate an answer using the retrieved context."
Ideal solution:
import numpy as np
from openai import OpenAI
# Module-level OpenAI client; MinimalRAG's methods use it for both the
# embeddings and the chat-completion calls.
client = OpenAI()
class MinimalRAG:
    """Minimal retrieval-augmented generation pipeline over an in-memory index.

    Documents are embedded through the module-level ``client``, ranked by
    cosine similarity against the query embedding, and the top ``k`` are
    handed to a chat model as its only permitted context.
    """

    def __init__(self, k: int = 3):
        self.k = k
        self.documents: list[str] = []
        self.embeddings: list[np.ndarray] = []

    def _embed(self, text: str) -> np.ndarray:
        """Return the embedding of *text* as a NumPy array."""
        resp = client.embeddings.create(input=text, model="text-embedding-3-small")
        return np.array(resp.data[0].embedding)

    def index(self, documents: list[str]):
        """Embed and store all documents, replacing any previous index."""
        # Copy so later mutation of the caller's list cannot desynchronise
        # documents from embeddings.
        docs = list(documents)
        # Embed everything before touching self: if an embedding call fails
        # the previous index stays intact instead of being half replaced.
        vectors = [self._embed(doc) for doc in docs]
        self.documents = docs
        self.embeddings = vectors
        print(f"Indexed {len(docs)} documents")

    def retrieve(self, query: str) -> list[str]:
        """Find top-k relevant documents.

        Returns:
            Up to ``k`` documents ordered from most to least similar; an
            empty list when nothing is indexed or ``k`` is not positive.
        """
        # np.stack([]) raises on an empty index, and a slice of [-0:] would
        # return every document, so handle both before doing any work.
        if self.k <= 0 or not self.embeddings:
            return []

        query_emb = self._embed(query)
        # The small epsilon guards against division by zero for zero vectors.
        query_norm = query_emb / (np.linalg.norm(query_emb) + 1e-10)
        corpus = np.stack(self.embeddings)
        norms = np.linalg.norm(corpus, axis=1, keepdims=True)
        corpus_norm = corpus / (norms + 1e-10)
        similarities = corpus_norm @ query_norm

        limit = min(self.k, len(self.documents))
        top_k_idx = np.argsort(similarities)[::-1][:limit]
        return [self.documents[i] for i in top_k_idx]

    def answer(self, query: str) -> str:
        """Retrieve context and generate answer."""
        context_docs = self.retrieve(query)
        context = "\n\n".join(f"- {doc}" for doc in context_docs)
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "Answer questions using ONLY the provided context. If the context doesn't contain the answer, say so.",
                },
                {
                    "role": "user",
                    "content": f"Context:\n{context}\n\nQuestion: {query}",
                },
            ],
            temperature=0.1,
        )
        # The API types message content as nullable; keep the declared str
        # return type instead of leaking None to callers.
        return resp.choices[0].message.content or ""
# Demo setup only: the calls that would reach the API are left commented out.
docs = [
    "Metformin inhibits hepatic gluconeogenesis via AMPK activation.",
    "Warfarin inhibits vitamin K epoxide reductase, reducing clotting factors.",
    "Ibuprofen is an NSAID that inhibits COX-1 and COX-2.",
]
rag = MinimalRAG(k=2)  # retrieve the two closest documents per query
# rag.index(docs)
# answer = rag.answer("How does metformin work?")
Common mistakes:
- Not normalizing embeddings before cosine similarity
- Forgetting to handle the case where k is larger than the corpus
- System prompt that allows the model to use its parametric knowledge (defeats the purpose of RAG)
Follow-up: "How would you improve this for production?" → Vector database for scale, chunk documents rather than using full texts, reranking after retrieval, RAGAS evaluation, caching embeddings.
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.