Learnixo

Python Essentials for AI Engineers · Lesson 31 of 36

Interview: NumPy Problem Walk-Through

Q1: Implement cosine similarity between two embedding vectors without using scipy.

Answer:

Python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Compute cosine similarity between two 1-D vectors of equal length.

    cosine_sim = dot(a, b) / (||a|| * ||b||)

    Range: -1 (opposite direction) to 1 (same direction), 0 (orthogonal).

    Returns:
        A built-in Python float in [-1.0, 1.0]. If either vector has zero
        norm the similarity is undefined; 0.0 is returned by convention
        (a zero vector is not similar to anything).
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)

    if norm_a == 0 or norm_b == 0:
        return 0.0   # Convention: zero vector is not similar to anything

    similarity = np.dot(a, b) / (norm_a * norm_b)

    # Floating-point rounding can push the ratio a hair outside [-1, 1]
    # (e.g. 1.0000000000000002), which breaks callers that feed it to arccos.
    # Clamp, and convert to a plain float so both return paths share one type.
    return float(np.clip(similarity, -1.0, 1.0))


# Test: sanity-check cosine_similarity on axis-aligned unit vectors.
v1 = np.array([1.0, 0.0, 0.0])
v2 = np.array([1.0, 0.0, 0.0])   # Same as v1
v3 = np.array([0.0, 1.0, 0.0])   # Perpendicular to v1 (dot product is 0)

print(cosine_similarity(v1, v2))   # 1.0  identical
print(cosine_similarity(v1, v3))   # 0.0  orthogonal


# Batch version: one query vs many documents
def batch_cosine_similarity(query: np.ndarray, docs: np.ndarray) -> np.ndarray:
    """
    Score every document row against a single query with cosine similarity.

    Both sides are scaled to (approximately) unit length first, so one
    matrix-vector product yields all the cosines at once.

    query: (dim,)
    docs:  (n_docs, dim)
    Returns: (n_docs,) — one score per doc
    """
    eps = 1e-10   # Added to every norm so zero vectors do not divide by zero

    unit_query = query / (np.linalg.norm(query) + eps)

    # keepdims gives (n_docs, 1) so the division broadcasts across each row
    row_lengths = np.linalg.norm(docs, axis=1, keepdims=True) + eps
    unit_docs = docs / row_lengths

    # (n_docs, dim) @ (dim,) = (n_docs,)
    return unit_docs @ unit_query


# Demo: one random 1536-dimensional query scored against 100 random documents.
query = np.random.randn(1536)
docs  = np.random.randn(100, 1536)
scores = batch_cosine_similarity(query, docs)
print(scores.shape)   # (100,)
# Smallest and largest similarity found across the 100 documents
print(f"Range: [{scores.min():.3f}, {scores.max():.3f}]")

Q2: Normalize a batch of features using z-score normalization (per feature, not globally). Handle the case where a feature has zero variance.

Answer:

Python
import numpy as np

def zscore_normalize(X: np.ndarray, epsilon: float = 1e-8) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Z-score normalize features: (x - mean) / std, per feature.

    X: (n_samples, n_features)
    epsilon: a feature whose std is below this threshold is treated as
             constant (zero variance).
    Returns: (X_normalized, mean, std) — save mean/std to apply to new data.
             `std` is the raw per-feature standard deviation (zeros included).
    """
    mean = X.mean(axis=0)   # (n_features,)
    std  = X.std(axis=0)    # (n_features,)

    # Features with zero variance would cause division by zero.
    # Use a scale of 1.0 for them (the same choice scikit-learn's StandardScaler
    # makes): the centered values are already ~0, and dividing by a tiny epsilon
    # instead would amplify any rounding residue by 1/epsilon.
    scale = np.where(std < epsilon, 1.0, std)

    X_normalized = (X - mean) / scale   # Broadcasting: (n_samples, n_features)

    return X_normalized, mean, std


def apply_normalization(X: np.ndarray, mean: np.ndarray, std: np.ndarray, epsilon: float = 1e-8) -> np.ndarray:
    """
    Apply pre-computed normalization to new data (e.g., test set).

    X:    (n_samples, n_features)
    mean: (n_features,) per-feature mean computed on the training data
    std:  (n_features,) per-feature std computed on the training data
    epsilon: a feature whose training std is below this threshold is treated
             as constant.
    Returns: (n_samples, n_features) array of (X - mean) / std, where constant
             features are only centered, not rescaled.
    """
    # A feature that was constant in training has std ~ 0. Dividing by epsilon
    # would multiply any deviation in the new data by 1/epsilon (~1e8), so use
    # a neutral scale of 1.0 for those features instead.
    scale = np.where(std < epsilon, 1.0, std)
    return (X - mean) / scale


# Example: fit z-score statistics on a training set, then reuse them on a test set.
np.random.seed(42)
# Scale each of the 10 columns by a different factor so the features have different spreads
X_train = np.random.randn(100, 10) * np.array([1, 5, 0, 2, 10, 1, 1, 1, 1, 1])
# Feature 2 (0-based column index) has std=0: every value is zero after multiplying by 0

X_norm, mean, std = zscore_normalize(X_train)

# Verify
print(f"Mean of normalized features: {X_norm.mean(axis=0).round(4)}")
# Should all be ~0

print(f"Std of normalized features: {X_norm.std(axis=0).round(4)}")
# Should all be ~1 (except feature 2 which had zero variance)

# Apply same normalization to test set (reuses the training mean/std; nothing is re-fit)
X_test = np.random.randn(20, 10) * 2
X_test_norm = apply_normalization(X_test, mean, std)
print(X_test_norm.shape)   # (20, 10)

Interview talking point: Always fit (compute mean/std) on training data, then transform both train and test. Never fit on test data — that leaks information.


Q3: Given a matrix of document embeddings and a query embedding, find the top-5 most similar documents. Optimize for speed with many documents.

Answer:

Python
import numpy as np

def top_k_similarity(
    query: np.ndarray,
    documents: np.ndarray,
    k: int = 5,
    normalize: bool = True,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Find the k most similar documents to a query.

    query:     (dim,)
    documents: (n_docs, dim)
    k:         number of results wanted; capped at n_docs. k=0 yields empty arrays.
    normalize: if True, scale query and documents to unit length so the dot
               product is cosine similarity; if False, raw dot products are ranked
               (use this when the inputs are already unit vectors).
    Returns:   (top_k_indices, top_k_scores), both ordered best match first.
    Raises:    ValueError if k is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    if normalize:
        # Unit-length vectors make the dot product equal to cosine similarity.
        # The small constant keeps zero vectors from dividing by zero.
        query = query / (np.linalg.norm(query) + 1e-10)
        doc_norms = np.linalg.norm(documents, axis=1, keepdims=True) + 1e-10
        documents = documents / doc_norms

    # Vectorized dot product: (n_docs, dim) @ (dim,) = (n_docs,)
    scores = documents @ query

    k = min(k, len(scores))
    if k == 0:
        # Must be handled explicitly: -0 == 0, so argpartition(scores, -k)[-k:]
        # would return every document instead of none.
        no_hits = np.empty(0, dtype=np.intp)
        return no_hits, scores[no_hits]

    if k < len(scores):
        # np.argpartition is faster than argsort for top-k (O(n) vs O(n log n)).
        # It guarantees the k largest land in the last k slots, but not sorted.
        candidates = np.argpartition(scores, -k)[-k:]
        # Sort just the top-k (much cheaper than sorting all)
        top_k_indices = candidates[np.argsort(scores[candidates])[::-1]]
    else:
        # Every document is wanted, so a full descending sort is required anyway
        top_k_indices = np.argsort(scores)[::-1]

    return top_k_indices, scores[top_k_indices]


# Demo: time a top-5 search over 100,000 random float32 embeddings.
np.random.seed(42)
n_docs = 100_000
dim = 1536

documents = np.random.randn(n_docs, dim).astype(np.float32)
query = np.random.randn(dim).astype(np.float32)

import time
# Wall-clock timing of a single call, converted from seconds to milliseconds
start = time.time()
indices, scores = top_k_similarity(query, documents, k=5)
elapsed = (time.time() - start) * 1000

print(f"Top-5 doc indices: {indices}")
print(f"Top-5 scores: {scores.round(4)}")
print(f"Search time for {n_docs:,} docs: {elapsed:.1f}ms")

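argpartition vs argsort: np.argpartition runs in O(n) time, while np.argsort is O(n log n). argpartition only guarantees that the k largest scores end up in the last k positions, not that they are in order, so you sort just those k afterwards — O(n + k log k) overall. For top-k where k is much smaller than n, this is significantly faster than sorting everything.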


Q4: Compute a confusion matrix from predicted and true labels without using sklearn.

Answer:

Python
import numpy as np

def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    """
    Compute confusion matrix.

    y_true: (n_samples,) ground-truth labels
    y_pred: (n_samples,) predicted labels
    Returns: (n_classes, n_classes) integer matrix
    Where: cm[i, j] = number of samples with true label i predicted as j.
           Rows/columns follow the sorted order of all labels that appear in
           either input, so non-contiguous labels are handled.
    Raises: ValueError if the inputs are not 1-D or differ in length.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.ndim != 1 or y_pred.ndim != 1:
        raise ValueError("y_true and y_pred must be 1-D arrays of labels")
    if y_true.shape != y_pred.shape:
        # Pairing the labels positionally would otherwise silently drop the
        # surplus samples of the longer input.
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {len(y_true)} and {len(y_pred)}"
        )

    # One np.unique call yields the sorted class list and, via the inverse
    # mapping, the class index of every label in the concatenated input.
    classes, inverse = np.unique(np.concatenate([y_true, y_pred]), return_inverse=True)
    inverse = inverse.ravel()
    n_classes = len(classes)
    n_samples = len(y_true)
    true_idx = inverse[:n_samples]
    pred_idx = inverse[n_samples:]

    cm = np.zeros((n_classes, n_classes), dtype=int)
    # np.add.at accumulates correctly when the same (i, j) cell occurs many times;
    # plain fancy-index assignment (cm[i, j] += 1) would count each cell only once.
    np.add.at(cm, (true_idx, pred_idx), 1)

    return cm


def compute_metrics(cm: np.ndarray) -> dict:
    """
    Compute accuracy and macro-averaged precision, recall, F1 from a confusion matrix.

    cm: (n_classes, n_classes) matrix with cm[i, j] = count of samples whose
        true class is i and predicted class is j.
    Returns: dict with keys "accuracy", "precision", "recall", "f1", each a
             built-in float rounded to 4 decimals. Any ratio whose denominator
             is zero (class never predicted, class never present, or an empty
             matrix) is reported as 0.0 rather than nan.
    """
    cm = np.asarray(cm)

    tp = np.diag(cm).astype(float)            # Correct predictions per class
    predicted = cm.sum(axis=0).astype(float)  # Column sums: tp + fp per class
    actual = cm.sum(axis=1).astype(float)     # Row sums:    tp + fn per class

    # `where` skips the division for zero denominators; those slots keep the
    # 0.0 they were initialised with in `out`.
    precision = np.divide(tp, predicted, out=np.zeros_like(tp), where=predicted > 0)
    recall = np.divide(tp, actual, out=np.zeros_like(tp), where=actual > 0)
    pr_sum = precision + recall
    f1 = np.divide(2 * precision * recall, pr_sum, out=np.zeros_like(tp), where=pr_sum > 0)

    total = cm.sum()
    accuracy = tp.sum() / total if total > 0 else 0.0   # Correct predictions / total

    def macro(per_class: np.ndarray) -> float:
        # Unweighted mean over classes; 0.0 when there are no classes at all
        return float(per_class.mean()) if per_class.size else 0.0

    return {
        "accuracy":  round(float(accuracy), 4),
        "precision": round(macro(precision), 4),
        "recall":    round(macro(recall), 4),
        "f1":        round(macro(f1), 4),
    }


# Test: 10 samples over 3 classes; 6 of the 10 predictions match the true label.
y_true = np.array([0, 1, 2, 0, 1, 2, 0, 1, 2, 0])
y_pred = np.array([0, 2, 2, 0, 0, 2, 0, 1, 1, 1])

cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm)
# Rows are true labels, columns are predictions:
# [[3 1 0]
#  [1 1 1]
#  [0 1 2]]

metrics = compute_metrics(cm)
print(f"\nMetrics: {metrics}")
# accuracy is 0.6; macro precision, recall and F1 each round to 0.5833

Q5: Build a vectorized embedding similarity pipeline: embed a list of drug names, find the most similar drugs to a query, and return results with scores.

Answer:

Python
import numpy as np
from dataclasses import dataclass

@dataclass
class SimilarityResult:
    """One search hit: a corpus entry paired with its similarity to a query."""

    query: str    # The query text this hit was retrieved for
    drug: str     # The matching corpus entry (the full indexed text)
    score: float  # Similarity between query and entry; higher means more similar

class EmbeddingSimilarityPipeline:
    """
    A simple embedding-based similarity search pipeline using NumPy.
    In production this would use a real embedding model and vector DB.
    """

    def __init__(self, embedding_dim: int = 64):
        self.embedding_dim = embedding_dim
        self._corpus: list[str] = []
        self._embeddings: np.ndarray | None = None   # (n_docs, dim)
        self._rng = np.random.default_rng(seed=0)

    def _embed(self, text: str) -> np.ndarray:
        """Mock embedding — in production: call OpenAI/Cohere API."""
        seed = sum(ord(c) for c in text)
        rng = np.random.default_rng(seed=seed)
        raw = rng.standard_normal(self.embedding_dim)
        return raw / np.linalg.norm(raw)   # Unit vector

    def index(self, texts: list[str]) -> None:
        """Embed and index a list of texts."""
        self._corpus = texts
        embeddings = np.array([self._embed(t) for t in texts])
        # Normalize all embeddings (enables fast dot product search)
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        self._embeddings = embeddings / (norms + 1e-10)
        print(f"Indexed {len(texts)} documents. Shape: {self._embeddings.shape}")

    def search(self, query: str, k: int = 5) -> list[SimilarityResult]:
        """Find the k most similar texts to the query."""
        if self._embeddings is None:
            raise RuntimeError("Call index() before search()")

        query_emb = self._embed(query)   # Already normalized in _embed

        # Vectorized cosine similarity (dot product on normalized vectors)
        scores = self._embeddings @ query_emb   # (n_docs,)

        # Get top-k using argpartition (faster than argsort)
        k_actual = min(k, len(self._corpus))
        if k_actual < len(scores):
            top_k_unsorted = np.argpartition(scores, -k_actual)[-k_actual:]
            top_k_sorted = top_k_unsorted[np.argsort(scores[top_k_unsorted])[::-1]]
        else:
            top_k_sorted = np.argsort(scores)[::-1]

        return [
            SimilarityResult(
                query=query,
                drug=self._corpus[idx],
                score=float(scores[idx]),
            )
            for idx in top_k_sorted
        ]

    def batch_search(self, queries: list[str], k: int = 3) -> list[list[SimilarityResult]]:
        """Search for multiple queries at once — vectorized."""
        query_embs = np.array([self._embed(q) for q in queries])   # (n_queries, dim)
        # (n_queries, dim) @ (dim, n_docs) = (n_queries, n_docs)
        all_scores = query_embs @ self._embeddings.T

        results = []
        for i, scores in enumerate(all_scores):
            k_actual = min(k, len(self._corpus))
            top_indices = np.argsort(scores)[::-1][:k_actual]
            results.append([
                SimilarityResult(queries[i], self._corpus[j], float(scores[j]))
                for j in top_indices
            ])
        return results


# Demo: index eight drug descriptions and retrieve the top 3 for a free-text query.
# NOTE: _embed is a mock seeded from character codes, so the ranking printed here
# is deterministic but not semantically meaningful.
pipeline = EmbeddingSimilarityPipeline(embedding_dim=128)

drug_corpus = [
    "warfarin anticoagulant vitamin K antagonist VKORC1",
    "heparin anticoagulant antithrombin factor Xa",
    "apixaban direct oral anticoagulant factor Xa inhibitor",
    "metformin biguanide antidiabetic AMPK hepatic glucose",
    "glipizide sulfonylurea antidiabetic insulin secretagogue",
    "lisinopril ACE inhibitor antihypertensive",
    "amlodipine calcium channel blocker antihypertensive",
    "atorvastatin statin HMG-CoA reductase cholesterol",
]

pipeline.index(drug_corpus)

results = pipeline.search("oral anticoagulant blood clot prevention", k=3)
print("\nTop 3 results for 'oral anticoagulant blood clot prevention':")
for r in results:
    # Score to 4 decimals, followed by the first 60 characters of the matched text
    print(f"  [{r.score:.4f}] {r.drug[:60]}")