Machine Learning Foundations · Lesson 66 of 70
Why is My Model Not Learning?
Symptoms of a Model Not Learning
1. Training loss barely decreases (or doesn't decrease at all)
2. Training accuracy ≈ random baseline (e.g., 50% for balanced binary)
3. All predictions are the same class
4. Loss is NaN after a few iterations
5. Model performs identically to a dummy classifier
The Debug Protocol
Before changing the model, check the fundamentals.
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
def pre_flight_checks(X_train, y_train, X_val, y_val, model):
    """
    Run these checks BEFORE debugging the model architecture.

    Prints a short report: a dummy-classifier baseline, the class
    distribution of the labels, NaN/Inf/zero-variance counts for the
    features, and whether features and labels have the same length.

    The structural facts (row count, label classes) are computed up front
    so that the dummy baseline is skipped, instead of raising, when the
    data is in a state that the later checks are meant to report
    (mismatched lengths, or fewer than two classes, for which ROC AUC is
    undefined).

    Args:
        X_train: 2-D numeric feature array; must support ``.shape``,
            ``.std(axis=0)`` and ``np.isnan`` / ``np.isinf``.
        y_train: Training labels (anything ``np.unique`` and ``len`` accept).
        X_val: Unused; kept so existing call sites keep working.
        y_val: Unused; kept so existing call sites keep working.
        model: Unused; kept so existing call sites keep working.

    Returns:
        None. All results are printed.
    """
    print("=== Pre-flight Checks ===\n")

    # Gather the cheap structural facts first: the baseline below cannot be
    # computed unless these hold.
    n_rows = X_train.shape[0]
    unique, counts = np.unique(y_train, return_counts=True)
    aligned = len(y_train) == n_rows

    # 1. Baseline: can a dummy model do this?
    if not aligned:
        print("1. Dummy AUC: skipped (features and labels are misaligned)")
    elif len(unique) < 2:
        print("1. Dummy AUC: skipped (ROC AUC needs at least two classes)")
    else:
        dummy = DummyClassifier(strategy="most_frequent")
        dummy_scores = cross_val_score(dummy, X_train, y_train, cv=5, scoring="roc_auc")
        print(f"1. Dummy AUC: {dummy_scores.mean():.3f} (target: model should beat this)")

    # 2. Target distribution
    print(f"2. Class distribution: {dict(zip(unique, counts))}")
    if len(unique) == 1:
        print(" ERROR: only one class in training labels — model can't learn")

    # 3. Feature sanity
    print(f"3. Features: {n_rows} samples × {X_train.shape[1]} features")
    nan_count = np.isnan(X_train).sum()
    inf_count = np.isinf(X_train).sum()
    print(f" NaN count: {nan_count} {'← ERROR: will prevent training' if nan_count > 0 else '✓'}")
    print(f" Inf count: {inf_count} {'← ERROR' if inf_count > 0 else '✓'}")

    # 3b. Feature variance (reported under section 3 of the printout)
    zero_var = (X_train.std(axis=0) == 0).sum()
    print(f" Zero-variance features: {zero_var} {'← will confuse tree splits' if zero_var > 0 else '✓'}")

    # 4. Label-feature alignment
    print(f"4. y_train length: {len(y_train)}, X_train rows: {n_rows}")
    if not aligned:
        print(" ERROR: mismatch between features and labels")

    print("\n=== Done ===")
pre_flight_checks(X_train, y_train, X_val, y_val, model)
Data Issues
NaN and Inf in Features
import numpy as np
import pandas as pd
def diagnose_feature_issues(X: np.ndarray, feature_names: list) -> None:
    """Print which features contain NaN, contain Inf, or are constant.

    ``X`` is wrapped in a DataFrame whose columns are ``feature_names``.
    Three checks run in order, and each prints a heading, the offending
    columns, and a suggested fix only when it finds something. Clean input
    prints nothing.

    Args:
        X: Feature matrix, one column per feature.
        feature_names: Column labels, one per column of ``X``.

    Returns:
        None. Findings are printed.
    """
    frame = pd.DataFrame(X, columns=feature_names)

    def _report(heading, payload, fix):
        # Every finding is printed in the same heading / details / fix shape.
        print(heading)
        print(payload)
        print(fix)

    # Missing values: per-column NaN totals, keeping only non-zero ones.
    missing = frame.isnull().sum().loc[lambda totals: totals > 0]
    if not missing.empty:
        _report(
            "Features with NaN:",
            missing,
            "→ Fix: add SimpleImputer to pipeline before the model",
        )

    # Infinite values: per-column count of +inf / -inf entries.
    infinite = frame.isin([np.inf, -np.inf]).sum().loc[lambda totals: totals > 0]
    if not infinite.empty:
        _report(
            "Features with Inf:",
            infinite,
            "→ Fix: df.replace([np.inf, -np.inf], np.nan), then impute",
        )

    # Constant columns: standard deviation below a tiny tolerance.
    is_flat = frame.std() < 1e-10
    flat_columns = frame.columns[is_flat]
    if len(flat_columns):
        _report(
            "Constant features (no variance):",
            flat_columns.tolist(),
            "→ Fix: use VarianceThreshold to remove them",
        )


# Target Leakage (Model Learns Too Easily)
# Symptom: training accuracy 99-100%, val accuracy similar → no problem yet
# Or: training accuracy 99%, val accuracy 95% → suspicious on a hard problem
# How to detect leakage:
# 1. Check if any features are derived from the target or future data
# 2. Train a minimal model — if it performs perfectly, leakage is likely
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# A depth-1 tree makes exactly one split, so a very high cross-validated AUC
# (the check below warns above 0.90) means a single feature almost determines
# the target on its own.
# X_train, y_train and feature_names are expected to be defined by the
# surrounding session.
dt = DecisionTreeClassifier(max_depth=1)
scores = cross_val_score(dt, X_train, y_train, cv=5, scoring="roc_auc")
print(f"Depth-1 tree AUC: {scores.mean():.3f}")
if scores.mean() > 0.90:
    print("WARNING: suspiciously high for a single split — check for leakage")
    # Identify which feature drives the single split.
    # NOTE(review): the source's indentation was lost; this lookup is assumed
    # to belong under the warning branch — confirm against the original.
    # dt is fitted here explicitly before its tree_ attribute is read.
    dt.fit(X_train, y_train)
    # presumably index 0 of tree_.feature is the root node's split feature —
    # verify against the scikit-learn tree structure documentation
    split_feature = feature_names[dt.tree_.feature[0]]
    print(f"Splitting feature: {split_feature}")
    print(f"→ Check: is '{split_feature}' derived from the target or future data?")

# Model-Specific Issues
All Predictions Are the Same Class
import numpy as np
# Predict on the training data itself: if even here only one distinct label
# comes back, the model has collapsed to a constant prediction.
# model, X_train and y_train are expected to be defined by the surrounding
# session.
y_pred = model.predict(X_train)
unique_preds = np.unique(y_pred)
if len(unique_preds) == 1:
    print(f"Model predicts only class {unique_preds[0]}")
    print("\nPossible causes:")
    print(" 1. Severe class imbalance — model learned the majority class is always safe")
    print(" Fix: use class_weight='balanced', or SMOTE, or lower threshold")
    print(" 2. Regularization too strong — all coefficients near zero → always predicts majority")
    print(" Fix: increase C (less regularization)")
    print(" 3. Learning rate too small for neural nets — training hasn't started yet")
    print(" Fix: increase lr, verify gradients are flowing")
    print(" 4. Wrong loss function for the task")

# Check class imbalance
# NOTE(review): the source's indentation was lost; this section is assumed to
# run unconditionally rather than only inside the branch above — confirm.
from sklearn.utils.class_weight import compute_class_weight
classes = np.unique(y_train)
# One "balanced" weight per class label found in y_train, printed as a
# label → weight mapping.
weights = compute_class_weight("balanced", classes=classes, y=y_train)
print(f"\nClass weights (balanced): {dict(zip(classes, weights))}")

# Loss Is NaN
import numpy as np
def check_for_nan_loss(model, X_batch, y_batch, criterion):
    """For neural networks: diagnose NaN loss and exploding gradients.

    Runs one forward pass on the batch, prints the usual causes if the loss
    is NaN, then prints the global L2 norm of the gradients of that loss
    with respect to the model's trainable parameters.

    The gradients are computed with ``torch.autograd.grad``, so the
    parameters' ``.grad`` attributes are neither read nor modified. Only
    when the loss is not attached to a graph (e.g. the call is made under
    ``torch.no_grad()``) does the function fall back to whatever ``.grad``
    values are already stored on the parameters.

    Args:
        model: Callable module exposing ``parameters()``.
        X_batch: Input batch passed to ``model``.
        y_batch: Targets passed to ``criterion``.
        criterion: Loss callable ``criterion(pred, target)`` returning a
            scalar tensor.

    Returns:
        None. Findings are printed.
    """
    import math

    # Local import: the imports next to this helper only bring in numpy.
    import torch

    pred = model(X_batch)
    loss = criterion(pred, y_batch)

    if torch.isnan(loss):
        print("Loss is NaN — possible causes:")
        print("1. Learning rate too large (exploding gradients)")
        print(" Fix: reduce lr by 10×")
        print("2. Log of zero (in cross-entropy) — check for extreme probabilities")
        print(" Fix: add epsilon: log(p + 1e-8)")
        print("3. Division by zero in the model")
        print("4. NaN in input features")
        print(" Fix: X_batch.isnan().any()")

    # Check gradient norms
    trainable = [p for p in model.parameters() if p.requires_grad]
    if trainable and loss.requires_grad:
        # Differentiate the loss computed above; without this the norm would
        # describe stale gradients from some earlier step, or nothing at all.
        grads = torch.autograd.grad(loss, trainable, allow_unused=True)
    else:
        grads = [p.grad for p in model.parameters()]

    squared_sum = 0.0
    for grad in grads:
        if grad is not None:
            norm = grad.detach().norm(2).item()
            # Multiply rather than use ** 2: float power raises
            # OverflowError on huge values instead of yielding inf.
            squared_sum += norm * norm
    total_norm = math.sqrt(squared_sum) if squared_sum >= 0 else float("nan")
    print(f"Gradient norm: {total_norm:.4f}")

    if not math.isfinite(total_norm):
        # NaN/Inf compares False against any threshold, so report it explicitly.
        print("WARNING: gradient norm is not finite → gradients contain NaN/Inf")
    elif total_norm > 100:
        print("WARNING: gradient norm is very large → exploding gradients")
        print("Fix: gradient clipping → torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)")


# The Overfit-to-One-Batch Sanity Check
# Classic neural network debug technique:
# If a model can't overfit to a single batch, there's a bug in the architecture
import torch
import torch.nn as nn
import torch.optim as optim
def sanity_check_overfit_one_batch(
    model,
    X_small,
    y_small,
    n_steps=100,
    batch_size=16,
    lr=0.01,
    loss_threshold=0.01,
):
    """
    Train on a tiny batch for many steps.
    A correct model should overfit (training loss → 0).
    If it doesn't, there's a model bug.

    The check is non-destructive: the model's parameters and buffers, its
    train/eval mode and its gradients are put back the way they were, so
    running this before real training does not start that training from
    weights overfit to one batch.

    Args:
        model: Module whose output is a probability in [0, 1] with one
            value per sample (the loss is ``nn.BCELoss``).
        X_small: Features; the first ``batch_size`` rows are used.
        y_small: Binary targets; the first ``batch_size`` entries are used.
        n_steps: Number of optimizer steps (must be >= 1).
        batch_size: Number of samples in the batch to overfit.
        lr: Adam learning rate.
        loss_threshold: The check passes when the last recorded loss is
            below this value.

    Returns:
        None. Progress and the verdict are printed.

    Raises:
        ValueError: If ``n_steps`` is smaller than 1 (there would be no
            loss to judge).
    """
    import copy

    if n_steps < 1:
        raise ValueError("n_steps must be at least 1")

    X_tensor = torch.as_tensor(X_small[:batch_size], dtype=torch.float32)
    # One target column, matching a model that emits one value per sample.
    y_tensor = torch.as_tensor(y_small[:batch_size], dtype=torch.float32).reshape(-1, 1)

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Snapshot everything the loop mutates so it can be undone afterwards.
    initial_state = copy.deepcopy(model.state_dict())
    was_training = model.training

    model.train()
    try:
        for step in range(n_steps):
            optimizer.zero_grad()
            pred = model(X_tensor)
            loss = criterion(pred, y_tensor)
            loss.backward()
            optimizer.step()
            if step % 20 == 0:
                print(f"Step {step}: loss = {loss.item():.4f}")
        # Loss from the last forward pass, i.e. before the final update.
        final_loss = loss.item()
    finally:
        model.load_state_dict(initial_state)
        model.train(was_training)
        optimizer.zero_grad()

    if final_loss < loss_threshold:
        print("✓ Model correctly overfits one batch — architecture is valid")
    else:
        print("✗ Model fails to overfit one batch — check architecture/activations/loss")
# Run this before any full training
sanity_check_overfit_one_batch(model, X_train, y_train)
Minimal Reproducible Example
# When debugging, reduce to the simplest case first
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Synthetic data takes the real dataset out of the picture: whatever goes
# wrong from here on cannot be blamed on data quality.
synthetic_spec = {
    "n_samples": 500,
    "n_features": 10,
    "n_informative": 5,
    "n_redundant": 2,
    "random_state": 42,
}
X_synthetic, y_synthetic = make_classification(**synthetic_spec)

# Plain logistic regression, scored with 5-fold cross-validated ROC AUC.
lr = LogisticRegression(max_iter=1000)
scores = cross_val_score(
    lr,
    X_synthetic,
    y_synthetic,
    cv=5,
    scoring="roc_auc",
)
print(f"Sanity check on synthetic data: AUC={scores.mean():.3f}")
# Expected: AUC > 0.80 for n_informative=5
# If it fails here: pipeline bug, not a data bug

# Interview Answer Template
Q: Your model's training accuracy is stuck at 55% (just above chance). How do you debug this?
I start with the data before touching the model. Check: are there NaNs or Infs in features? Does the training set have both classes? Is there target leakage (a depth-1 tree getting 95%+ AUC is a red flag)? Next, compare to a dummy classifier — if the dummy gets 53% and my model gets 55%, it's barely learning. Then I check model-specific issues: is regularization too strong (all coefficients near zero)? For neural networks: is the learning rate too small? Are gradients flowing (check gradient norms)? I also run a one-batch overfit test — train on 16 samples for 100 steps. A correct model should overfit that to near-zero loss; if it doesn't, there's an architectural bug. Once those pass, I look at the features themselves: variance, scale, missing values, feature-label alignment. Systematic debugging order: data first, then pipeline, then model, never the reverse.