Machine Learning Foundations · Lesson 58 of 70
L2 Regularization (Ridge) Explained
The L2 Penalty
L2-regularized loss = Standard loss + λ × Σ(wᵢ²)
Where:
wᵢ² = squared weight for each feature
λ = regularization strength
In sklearn:
Ridge regression: alpha = λ (directly)
LogisticRegression: C = 1/λ (penalty="l2" is the default)
Larger alpha/smaller C = stronger regularization

Why L2 Shrinks But Never Zeros Weights
import numpy as np
# L2 gradient: d(w²)/dw = 2w
# Proportional to the weight — as w → 0, the gradient → 0
# Weight is updated by: w ← w - learning_rate × 2λw = w × (1 - 2λ × learning_rate)
# This is exponential decay — approaches zero but never reaches it
def l2_weight_decay(
    initial_weight: float,
    lambda_: float,
    steps: int = 10,
    *,
    learning_rate: float = 0.1,
) -> None:
    """Print how one weight shrinks under gradient descent on the L2 penalty alone.

    Only the penalty term λ·w² is modelled (there is no data loss), so every
    update is ``w ← w - learning_rate · 2λw``, which multiplies ``w`` by the
    constant factor ``1 - 2 · lambda_ · learning_rate``.

    Args:
        initial_weight: Starting value of the weight.
        lambda_: Regularization strength λ.
        steps: Number of table rows (update steps) to print.
        learning_rate: Gradient-descent step size. Defaults to 0.1, the value
            that used to be hard-coded in the decay factor.

    Note:
        The weight decays smoothly toward zero only while
        ``0 < 2 * lambda_ * learning_rate < 1``. At exactly 1 the weight is
        zeroed in a single step, between 1 and 2 its sign flips every step,
        and above 2 its magnitude grows instead of shrinking.
    """
    w = initial_weight
    # Constant per-step multiplier implied by the update rule above.
    decay_factor = 1 - 2 * lambda_ * learning_rate
    print(f"{'Step':>4} {'Weight':>12} {'Gradient':>12}")
    print("-" * 32)
    for step in range(steps):
        # Gradient of λ·w² with respect to w, shown alongside the current weight.
        gradient = 2 * lambda_ * w
        print(f"{step:>4} {w:>12.6f} {gradient:>12.6f}")
        w = w * decay_factor


l2_weight_decay(initial_weight=2.0, lambda_=0.5)
# With λ = 0.5 and learning_rate = 0.1 the decay factor is 0.9: the weight shrinks by 10% each step but never hits exactly zero
# Contrast with L1: weight hits 0 in a fixed number of steps

Ridge Regression
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
# Warfarin dose prediction — regression task
# Features: age, weight, height, creatinine, INR, medications...
# Candidate Ridge penalties spanning several orders of magnitude.
alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

for alpha in alphas:
    # Scaling lives inside the pipeline so it is refit together with the model.
    pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("model", Ridge(alpha=alpha))])
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="r2")

    # Refit on the full training set to inspect the size of the learned coefficients.
    pipeline.fit(X_train, y_train)
    ridge_step = pipeline.named_steps["model"]
    coef_norm = np.linalg.norm(ridge_step.coef_)
    print(f"alpha={alpha:6}: R²={scores.mean():.3f} ± {scores.std():.3f}, |w|={coef_norm:.3f}")
# alpha near 0: low bias, high variance (may overfit with many features)
# optimal alpha: best R² in cross-validation
# alpha very high: high bias, low variance (coefficients all near zero → constant prediction)

Coefficient Interpretation After Standardization
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Standardize the inputs, then fit Ridge, so coefficient magnitudes share one scale.
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", Ridge(alpha=1.0)),
])
pipeline.fit(X_train, y_train)
coefs = pipeline.named_steps["model"].coef_

# NOTE(review): assumes X_train has exactly these eight columns in this order;
# zip() below silently truncates on a length mismatch. Verify against the data.
feature_names = [
    "age",
    "weight_kg",
    "serum_creatinine",
    "hba1c",
    "num_medications",
    "systolic_bp",
    "prior_admissions",
    "length_of_stay",
]

print("Ridge coefficients (standardized — comparable by magnitude):")
print(f"{'Feature':<25} {'Coefficient':>12} {'Relative importance':>20}")
print("-" * 60)

# Largest absolute coefficient first.
ranked = sorted(zip(feature_names, coefs), key=lambda pair: abs(pair[1]), reverse=True)
for name, coef in ranked:
    # One block character per 0.1 of absolute coefficient.
    bar = "█" * int(abs(coef) * 10)
    print(f"{name:<25} {coef:>12.4f} {bar}")
# After standardization: larger |coef| = more important feature
# Without standardization: coefficient reflects units (age in years vs creatinine in mg/dL)

Handling Correlated Features
# L2's key advantage over L1: stable with correlated features
# Example: age and BMI are correlated
# L1: tends to keep one (usually age) and can shrink the BMI coefficient all the way to 0 — which feature survives is somewhat arbitrary and depends on alpha and feature scale
# L2: distributes weight between age and BMI — both remain non-zero
from sklearn.linear_model import Lasso, Ridge
import numpy as np
# Synthetic data: bmi is built from age, so the two columns are correlated.
np.random.seed(42)
n_samples = 200
age = np.random.normal(60, 12, n_samples)
bmi = 0.3 * age + np.random.normal(0, 2, n_samples)
X_corr = np.column_stack([age, bmi])
y_corr = 0.5 * age + 0.4 * bmi + np.random.normal(0, 3, n_samples)

# L1 (Lasso) fit on the two correlated columns.
lasso = Lasso(alpha=0.5).fit(X_corr, y_corr)
print(f"Lasso: age={lasso.coef_[0]:.3f}, bmi={lasso.coef_[1]:.3f}")

# L2 (Ridge) fit on the same data for comparison.
ridge = Ridge(alpha=1.0).fit(X_corr, y_corr)
print(f"Ridge: age={ridge.coef_[0]:.3f}, bmi={ridge.coef_[1]:.3f}")
# Ridge gives more stable, interpretable coefficients for correlated inputs

L2 in Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# L2 is the default penalty for LogisticRegression
# Tune C (= 1/λ) by cross-validation
# Standardize, then fit an L2-penalized logistic regression.
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(penalty="l2", max_iter=1000)),
])

# Candidate C values, each a factor of 10 apart; "model__" routes C to the pipeline's model step.
param_grid = {"model__C": [0.001, 0.01, 0.1, 1, 10, 100]}

# Shuffled, stratified 5-fold split with a fixed seed so the search is repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv, scoring="roc_auc")
search.fit(X_train, y_train)

print(f"Best C: {search.best_params_['model__C']}")
print(f"Best CV AUC: {search.best_score_:.3f}")
for p, score in zip(search.cv_results_["params"], search.cv_results_["mean_test_score"]):
    print(f" C={p['model__C']:6}: AUC={score:.3f}")

L2 for Neural Networks (Weight Decay)
import torch
import torch.nn as nn
import torch.optim as optim
class ClinicalNet(nn.Module):
    """Small fully connected network: n_features → 64 → 32 → 1, ending in a sigmoid."""

    def __init__(self, n_features: int):
        super().__init__()
        # Two ReLU hidden layers followed by a single sigmoid output unit.
        layers = [
            nn.Linear(n_features, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the sigmoid outputs."""
        outputs = self.net(x)
        return outputs
model = ClinicalNet(n_features=20)

# weight_decay on the optimizer acts as an L2 penalty. model.parameters()
# hands over every parameter, so biases are decayed along with the weights.
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)  # λ = 0.0001
# At each step, the optimizer adds λ × w to the gradient
# Equivalent to minimizing loss + (λ/2) × Σ(wᵢ²), whose gradient with respect to wᵢ is λ × wᵢ

Ridge vs No Regularization: When It Matters
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
import numpy as np
# Scenario: 50 features, only 150 samples (high feature-to-sample ratio)
# Without regularization: overfits badly
# With Ridge: substantially better generalization
# Identical 5-fold R² evaluation for both models; only the estimator differs.
eval_kwargs = {"cv": 5, "scoring": "r2"}
lr_scores = cross_val_score(LinearRegression(), X_train, y_train, **eval_kwargs)
ridge_scores = cross_val_score(Ridge(alpha=10.0), X_train, y_train, **eval_kwargs)

print(f"LinearRegression: R²={lr_scores.mean():.3f} ± {lr_scores.std():.3f}")
print(f"Ridge (α=10): R²={ridge_scores.mean():.3f} ± {ridge_scores.std():.3f}")
# Ridge is typically much better when n_features is close to n_samples

Interview Answer Template
Q: What is L2 regularization (Ridge) and when do you use it?
L2 regularization adds the sum of squared weights to the loss function: Loss + λ × Σ(wᵢ²). The gradient of w² is 2w — proportional to the weight — so large weights are penalized heavily while small weights are penalized lightly. This causes weights to shrink proportionally toward zero but never reach exactly zero. All features remain in the model, just with reduced magnitude. L2 is the default regularization for logistic regression in sklearn. The key advantage over L1: stability with correlated features — L2 distributes weight across correlated predictors, while L1 arbitrarily picks one and zeros the others. I use L2 when I expect most features to have some signal (dense features), or when features are correlated and I want stable coefficient estimates. The regularization strength (λ or C = 1/λ) must be tuned by cross-validation — it's a hyperparameter like any other.