AI Systems · Advanced
Interview: NumPy Problem Walk-Through
5 NumPy interview problems with full solutions: vectorized cosine similarity, z-score normalization, top-k retrieval, confusion matrix, and an embedding similarity pipeline.
Asma Hafeez Khan · May 16, 2026 · 7 min read
Python · NumPy · Interview · Machine Learning · Embeddings · Linear Algebra
Q1: Implement cosine similarity between two embedding vectors without using scipy.
Answer:
Python
import numpy as np
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Compute cosine similarity between two vectors.

    cosine_sim = dot(a, b) / (||a|| * ||b||)
    Range: -1 (opposite) to 1 (identical), 0 (orthogonal)

    Args:
        a: 1-D vector of shape (dim,). Any array-like is accepted.
        b: 1-D vector of shape (dim,). Any array-like is accepted.

    Returns:
        A built-in float in [-1.0, 1.0]. By convention 0.0 is returned when
        either input is the zero vector.

    Raises:
        ValueError: Propagated from np.dot when the shapes do not line up.
    """
    # Work in floating point so integer inputs cannot overflow in the dot product.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0  # Convention: zero vector is not similar to anything

    similarity = np.dot(a, b) / (norm_a * norm_b)
    # Rounding can yield e.g. 1.0000000000000002; clip so callers can safely
    # feed the result to arccos, and return a plain float as annotated.
    return float(np.clip(similarity, -1.0, 1.0))
# Sanity check on standard basis vectors: v2 duplicates v1, v3 is orthogonal to it.
v1 = np.array([1.0, 0.0, 0.0])
v2 = v1.copy()
v3 = np.array([0.0, 1.0, 0.0])
# First line printed: 1.0 (identical); second line: 0.0 (orthogonal).
print(
    cosine_similarity(v1, v2),
    cosine_similarity(v1, v3),
    sep="\n",
)
# Batch version: one query vs many documents
def batch_cosine_similarity(query: np.ndarray, docs: np.ndarray) -> np.ndarray:
    """
    Efficient vectorized cosine similarity of one query against many documents.

    query: (dim,)
    docs: (n_docs, dim)
    Returns: (n_docs,) — one score per doc

    Zero vectors (the query or any document row) score 0.0, the same
    convention as the scalar cosine_similarity.
    """
    query = np.asarray(query)
    docs = np.asarray(docs)

    query_norm = np.linalg.norm(query)
    doc_norms = np.linalg.norm(docs, axis=1, keepdims=True)  # (n_docs, 1)

    # Substitute 1.0 only where a norm is exactly zero. Adding a fixed epsilon
    # to every norm would shrink the scores of legitimately small vectors.
    safe_query_norm = query_norm if query_norm > 0 else 1.0
    safe_doc_norms = np.where(doc_norms == 0, 1.0, doc_norms)

    unit_query = query / safe_query_norm
    unit_docs = docs / safe_doc_norms
    return unit_docs @ unit_query  # (n_docs, dim) @ (dim,) = (n_docs,)
# Demo: score one random 1536-dimensional query against 100 random documents.
# The draws are unseeded, so the printed score range differs from run to run.
query = np.random.randn(1536)
docs = np.random.randn(100, 1536)
# One similarity score per document row.
scores = batch_cosine_similarity(query, docs)
print(scores.shape) # (100,)
print(f"Range: [{scores.min():.3f}, {scores.max():.3f}]")
Q2: Normalize a batch of features using z-score normalization (per feature, not globally). Handle the case where a feature has zero variance.
Answer:
Python
import numpy as np
def zscore_normalize(X: np.ndarray, epsilon: float = 1e-8) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Z-score normalize features: (x - mean) / std, per feature.

    X: (n_samples, n_features)
    epsilon: features whose std is below this threshold are treated as constant
    Returns: (X_normalized, mean, std) — save mean/std to apply to new data.
             `std` is the raw per-feature standard deviation (zeros included).
    """
    X = np.asarray(X)
    mean = X.mean(axis=0)  # (n_features,)
    std = X.std(axis=0)    # (n_features,)

    # A (near-)constant feature cannot be rescaled meaningfully. Dividing by a
    # tiny epsilon would multiply any residual by ~1/epsilon, so use a scale
    # of 1.0 instead: the feature is centred and left otherwise untouched.
    std_safe = np.where(std < epsilon, 1.0, std)

    X_normalized = (X - mean) / std_safe  # Broadcasting: (n_samples, n_features)
    return X_normalized, mean, std
def apply_normalization(X: np.ndarray, mean: np.ndarray, std: np.ndarray, epsilon: float = 1e-8) -> np.ndarray:
    """
    Apply pre-computed normalization to new data (e.g., test set).

    X: (n_samples, n_features)
    mean, std: (n_features,) statistics computed on the training data
    epsilon: features whose training std is below this threshold are treated
             as constant
    Returns: (n_samples, n_features) array of (X - mean) / std
    """
    X = np.asarray(X)
    mean = np.asarray(mean)
    std = np.asarray(std)

    # For a feature that was constant during training, scale by 1.0 rather
    # than epsilon. Otherwise any unseen value that differs from the training
    # constant is magnified by ~1/epsilon (1e8 by default).
    std_safe = np.where(std < epsilon, 1.0, std)
    return (X - mean) / std_safe
# Example: 100 samples, 10 features, each feature scaled by a different factor.
# Seeding the global RNG makes the draws below reproducible.
np.random.seed(42)
X_train = np.random.randn(100, 10) * np.array([1, 5, 0, 2, 10, 1, 1, 1, 1, 1])
# The feature at index 2 (third column) is multiplied by 0, so it is all zeros
# and has std=0.
X_norm, mean, std = zscore_normalize(X_train)
# Verify the per-feature statistics of the normalized training data.
print(f"Mean of normalized features: {X_norm.mean(axis=0).round(4)}")
# Should all be ~0
print(f"Std of normalized features: {X_norm.std(axis=0).round(4)}")
# Should all be ~1, except the zero-variance feature at index 2, which stays at 0
# Apply the training-set mean/std to the test set (never refit on test data).
X_test = np.random.randn(20, 10) * 2
X_test_norm = apply_normalization(X_test, mean, std)
print(X_test_norm.shape) # (20, 10)
Interview talking point: Always fit (compute mean/std) on the training data only, then apply that same transform to both train and test. Never fit on test data — that leaks information.
Q3: Given a matrix of document embeddings and a query embedding, find the top-5 most similar documents. Optimize for speed with many documents.
Answer:
Python
import numpy as np
def top_k_similarity(
    query: np.ndarray,
    documents: np.ndarray,
    k: int = 5,
    normalize: bool = True,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Find the k most similar documents to a query.

    query: (dim,)
    documents: (n_docs, dim)
    k: number of results; values larger than n_docs are clamped to n_docs
    normalize: if True, score by cosine similarity; if False, by raw dot
               product (use this when the inputs are already unit vectors)
    Returns: (top_k_indices, top_k_scores), both sorted by descending score

    Raises:
        ValueError: if k is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    query = np.asarray(query)
    documents = np.asarray(documents)

    # Vectorized dot product: (n_docs, dim) @ (dim,) = (n_docs,)
    scores = documents @ query

    if normalize:
        # Divide the n_docs scores by the norms rather than normalizing the
        # (n_docs, dim) matrix itself: same result, no full-matrix copy.
        doc_norms = np.linalg.norm(documents, axis=1)
        query_norm = np.linalg.norm(query)
        # Zero vectors already have a dot product of 0; swap their norm for
        # 1.0 so they score exactly 0 without biasing every other score.
        scores = scores / np.where(doc_norms == 0, 1.0, doc_norms)
        if query_norm > 0:
            scores = scores / query_norm

    k = min(k, len(scores))
    if k == 0:
        # Guard explicitly: scores[-0:] would select *every* document.
        no_hits = np.empty(0, dtype=np.intp)
        return no_hits, scores[no_hits]

    if k < len(scores):
        # np.argpartition is O(n) vs O(n log n) for a full argsort. It only
        # guarantees the k largest land in the last k slots, unsorted.
        candidates = np.argpartition(scores, -k)[-k:]
        # Sort just the k winners (much cheaper than sorting everything).
        order = np.argsort(scores[candidates])[::-1]
        top_k_indices = candidates[order]
    else:
        top_k_indices = np.argsort(scores)[::-1]

    return top_k_indices, scores[top_k_indices]
# Demo: time a top-5 search over 100,000 random float32 embeddings.
import time

np.random.seed(42)
n_docs = 100_000
dim = 1536
documents = np.random.randn(n_docs, dim).astype(np.float32)
query = np.random.randn(dim).astype(np.float32)

# perf_counter() is the monotonic, high-resolution clock intended for timing
# short intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
indices, scores = top_k_similarity(query, documents, k=5)
elapsed = (time.perf_counter() - start) * 1000  # seconds -> milliseconds
print(f"Top-5 doc indices: {indices}")
print(f"Top-5 scores: {scores.round(4)}")
print(f"Search time for {n_docs:,} docs: {elapsed:.1f}ms")
argpartition vs argsort: np.argpartition is O(n) while np.argsort is O(n log n). For top-k where k is much smaller than n, partitioning first and then sorting only the k selected scores is significantly faster than sorting every score.
Q4: Compute a confusion matrix from predicted and true labels without using sklearn.
Answer:
Python
import numpy as np
def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    """
    Compute confusion matrix.

    y_true: (n_samples,) ground-truth labels
    y_pred: (n_samples,) predicted labels
    Returns: (n_classes, n_classes) integer matrix
    Where: cm[i, j] = number of samples with true label i predicted as j.
           Rows/columns follow the sorted unique labels found in either input,
           so non-contiguous labels are handled.

    Raises:
        ValueError: if y_true and y_pred contain a different number of labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        # zip() would silently drop the unmatched tail and hide the mistake.
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.size} and {y_pred.size}"
        )

    # return_inverse maps every label to its position in the sorted class list.
    classes, inverse = np.unique(
        np.concatenate([y_true, y_pred]), return_inverse=True
    )
    inverse = inverse.reshape(-1)
    true_idx = inverse[: y_true.size]
    pred_idx = inverse[y_true.size:]

    n_classes = len(classes)
    cm = np.zeros((n_classes, n_classes), dtype=int)
    # add.at accumulates repeated (i, j) pairs, unlike cm[i, j] += 1, which
    # would count each distinct pair only once.
    np.add.at(cm, (true_idx, pred_idx), 1)
    return cm
def compute_metrics(cm: np.ndarray) -> dict:
    """
    Compute accuracy and macro-averaged precision, recall, F1 from a confusion matrix.

    cm: (n_classes, n_classes), cm[i, j] = true label i predicted as j
    Returns: dict with keys "accuracy", "precision", "recall", "f1", each a
             built-in float rounded to 4 decimals. A ratio whose denominator
             is zero counts as 0.0, so an empty or all-zero matrix yields 0.0
             everywhere instead of NaN.
    """
    cm = np.asarray(cm)

    def _safe_ratio(numerator: np.ndarray, denominator: np.ndarray) -> np.ndarray:
        # Element-wise division that leaves 0.0 wherever the denominator is 0.
        result = np.zeros_like(numerator, dtype=float)
        np.divide(numerator, denominator, out=result, where=denominator > 0)
        return result

    def _macro(values: np.ndarray) -> float:
        # Unweighted mean over classes; 0.0 when there are no classes.
        return float(values.mean()) if values.size else 0.0

    true_positives = np.diag(cm).astype(float)
    predicted_totals = cm.sum(axis=0).astype(float)  # tp + fp per class
    actual_totals = cm.sum(axis=1).astype(float)     # tp + fn per class

    precision = _safe_ratio(true_positives, predicted_totals)
    recall = _safe_ratio(true_positives, actual_totals)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    total = cm.sum()
    accuracy = float(np.trace(cm) / total) if total > 0 else 0.0  # correct / total

    # Convert to built-in floats so the dict prints and serializes cleanly
    # (NumPy scalars would show up as np.float64(...) in the repr).
    return {
        "accuracy": round(accuracy, 4),
        "precision": round(_macro(precision), 4),
        "recall": round(_macro(recall), 4),
        "f1": round(_macro(f1), 4),
    }
# Worked example: 10 samples spread over 3 classes.
y_true = np.array([0, 1, 2, 0, 1, 2, 0, 1, 2, 0])
y_pred = np.array([0, 2, 2, 0, 0, 2, 0, 1, 1, 1])

cm = confusion_matrix(y_true, y_pred)
# Header and matrix on separate lines.
print("Confusion Matrix:", cm, sep="\n")

metrics = compute_metrics(cm)
print(f"\nMetrics: {metrics}")
Q5: Build a vectorized embedding similarity pipeline: embed a list of drug names, find the most similar drugs to a query, and return results with scores.
Answer:
Python
import numpy as np
from dataclasses import dataclass
@dataclass
class SimilarityResult:
    """One search hit: the query text, the matched corpus entry, and its score."""

    # Query text the search was run for.
    query: str
    # Corpus entry that matched the query.
    drug: str
    # Similarity score of `drug` against `query`.
    score: float
class EmbeddingSimilarityPipeline:
    """
    A simple embedding-based similarity search pipeline using NumPy.
    In production this would use a real embedding model and vector DB.

    Usage: call index() once with the corpus, then search() or batch_search().
    """

    def __init__(self, embedding_dim: int = 64):
        self.embedding_dim = embedding_dim
        self._corpus: list[str] = []
        self._embeddings: np.ndarray | None = None  # (n_docs, dim)
        # Not used by the mock embedder below; kept so the attribute remains
        # available to any code that reads it.
        self._rng = np.random.default_rng(seed=0)

    def _embed(self, text: str) -> np.ndarray:
        """Mock embedding — in production: call OpenAI/Cohere API.

        Deterministic: the RNG seed is the sum of the text's code points, so
        the same text always maps to the same unit vector. Texts with equal
        code-point sums (e.g. anagrams) therefore share one embedding.
        """
        seed = sum(ord(c) for c in text)
        rng = np.random.default_rng(seed=seed)
        raw = rng.standard_normal(self.embedding_dim)
        return raw / np.linalg.norm(raw)  # Unit vector

    def _embed_many(self, texts: list[str]) -> np.ndarray:
        """Embed several texts into a (len(texts), dim) matrix; (0, dim) if empty."""
        if not texts:
            # np.array([]) would be 1-D and break the axis=1 / matmul code paths.
            return np.empty((0, self.embedding_dim))
        return np.array([self._embed(t) for t in texts])

    @staticmethod
    def _top_k_indices(scores: np.ndarray, k: int) -> np.ndarray:
        """Return indices of the k highest scores, best first ([] when k <= 0)."""
        k = min(k, len(scores))
        if k <= 0:
            # Guard explicitly: scores[-0:] would select every entry.
            return np.empty(0, dtype=np.intp)
        if k < len(scores):
            # argpartition finds the k largest in O(n); only those get sorted.
            candidates = np.argpartition(scores, -k)[-k:]
            return candidates[np.argsort(scores[candidates])[::-1]]
        return np.argsort(scores)[::-1]

    def index(self, texts: list[str]) -> None:
        """Embed and index a list of texts."""
        # Copy so later mutation of the caller's list cannot desynchronize
        # the corpus from the embedding matrix.
        self._corpus = list(texts)
        embeddings = self._embed_many(self._corpus)
        # Normalize all embeddings (enables fast dot product search).
        # Zero rows keep a norm of 1.0 so they stay zero instead of NaN.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        self._embeddings = embeddings / np.where(norms == 0, 1.0, norms)
        print(f"Indexed {len(texts)} documents. Shape: {self._embeddings.shape}")

    def search(self, query: str, k: int = 5) -> list[SimilarityResult]:
        """Find the k most similar texts to the query.

        Returns at most min(k, corpus size) results, best first; an empty
        list when k <= 0.

        Raises:
            RuntimeError: if index() has not been called yet.
        """
        if self._embeddings is None:
            raise RuntimeError("Call index() before search()")

        query_emb = self._embed(query)  # Already normalized in _embed
        # Vectorized cosine similarity (dot product on normalized vectors)
        scores = self._embeddings @ query_emb  # (n_docs,)

        return [
            SimilarityResult(
                query=query,
                drug=self._corpus[idx],
                score=float(scores[idx]),
            )
            for idx in self._top_k_indices(scores, k)
        ]

    def batch_search(self, queries: list[str], k: int = 3) -> list[list[SimilarityResult]]:
        """Search for multiple queries at once — vectorized.

        Returns one result list per query, in the same order as `queries`.

        Raises:
            RuntimeError: if index() has not been called yet.
        """
        if self._embeddings is None:
            raise RuntimeError("Call index() before batch_search()")
        if not queries:
            return []

        query_embs = self._embed_many(queries)  # (n_queries, dim)
        # (n_queries, dim) @ (dim, n_docs) = (n_queries, n_docs)
        all_scores = query_embs @ self._embeddings.T

        results = []
        for query_text, scores in zip(queries, all_scores):
            results.append([
                SimilarityResult(query_text, self._corpus[j], float(scores[j]))
                for j in self._top_k_indices(scores, k)
            ])
        return results
# Demo: index a small drug corpus, then run one free-text query against it.
drug_corpus = [
    "warfarin anticoagulant vitamin K antagonist VKORC1",
    "heparin anticoagulant antithrombin factor Xa",
    "apixaban direct oral anticoagulant factor Xa inhibitor",
    "metformin biguanide antidiabetic AMPK hepatic glucose",
    "glipizide sulfonylurea antidiabetic insulin secretagogue",
    "lisinopril ACE inhibitor antihypertensive",
    "amlodipine calcium channel blocker antihypertensive",
    "atorvastatin statin HMG-CoA reductase cholesterol",
]

pipeline = EmbeddingSimilarityPipeline(embedding_dim=128)
pipeline.index(drug_corpus)

# Keep only the three best matches.
results = pipeline.search(
    "oral anticoagulant blood clot prevention",
    k=3,
)
print("\nTop 3 results for 'oral anticoagulant blood clot prevention':")
for r in results:
    print(f" [{r.score:.4f}] {r.drug[:60]}")
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.