Deep Learning for AI Interviews · Lesson 35 of 56
Vanishing Gradient Problem
The Problem
In backpropagation, gradients flow backward through layers via the chain rule:
δ_L = δ_{L+1} × W_{L+1}.T × σ'(z_L)
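Because σ'(z_L) ≤ 1 at every layer for sigmoid and tanh (sigmoid: at most 0.25; tanh: exactly 1 only at z = 0, smaller everywhere else),
the gradient tends to shrink with each layer it passes through unless the weights are large enough to compensate.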
Sigmoid: σ'(z) = σ(z)(1 - σ(z)), max value = 0.25 at z = 0
In saturated regions (|z| >> 0): σ'(z) ≈ 0
For a 10-layer sigmoid network:
gradient at layer 1 ≈ 0.25^10 × (weight terms) ≈ 9.5 × 10⁻⁷
This means weights in early layers receive nearly zero gradient → they don't learn.
The early layers (which learn low-level features) are effectively frozen while late layers train.

Demonstrating Vanishing Gradients
import torch
import torch.nn as nn
import numpy as np
def measure_gradient_flow(
    n_layers: int,
    activation: str = "sigmoid",
) -> list[float]:
    """Measure the weight-gradient norm of every linear layer after one backward pass.

    Builds ``n_layers`` blocks of ``Linear(32, 32)`` + activation followed by a
    ``Linear(32, 1)`` head, runs a single forward/backward pass on random data
    with a binary cross-entropy loss, and collects the norm of each weight
    gradient.

    Args:
        n_layers: Number of hidden (Linear + activation) blocks.
        activation: One of ``"sigmoid"``, ``"tanh"`` or ``"relu"``.

    Returns:
        Gradient norms in forward order: index 0 belongs to the first
        (input-side) layer and the last entry to the output head, so the
        list has ``n_layers + 1`` entries.

    Raises:
        KeyError: If ``activation`` is not one of the supported names.
    """
    act_map = {
        "sigmoid": nn.Sigmoid,
        "tanh": nn.Tanh,
        "relu": nn.ReLU,
    }
    Act = act_map[activation]

    # Build deep network
    layers = []
    for _ in range(n_layers):
        layers.extend([nn.Linear(32, 32), Act()])
    layers.append(nn.Linear(32, 1))
    model = nn.Sequential(*layers)

    # Xavier init to give sigmoid a fair chance
    for m in model.modules():
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            nn.init.zeros_(m.bias)

    X = torch.randn(64, 32)
    y = torch.randint(0, 2, (64,)).float()
    criterion = nn.BCEWithLogitsLoss()
    # squeeze(-1) drops only the trailing feature dim, never the batch dim
    loss = criterion(model(X).squeeze(-1), y)
    loss.backward()

    # named_parameters() walks the Sequential front to back, so the result is
    # ordered from the input-side layer to the output head.
    return [
        param.grad.norm().item()
        for name, param in model.named_parameters()
        if "weight" in name and param.grad is not None
    ]
# Print per-layer gradient norms for a sigmoid and a ReLU network, output head
# first. measure_gradient_flow returns the norms in forward order (input-side
# layer at index 0), so the list is reversed for display and every value is
# labelled with its true depth: "Layer 1" is always the input-side layer.
print("Sigmoid gradient norms (output → input):")
sig_norms = measure_gradient_flow(n_layers=8, activation="sigmoid")
for depth, norm in zip(range(len(sig_norms), 0, -1), reversed(sig_norms)):
    print(f" Layer {depth}: {norm:.2e}")

print("\nReLU gradient norms (output → input):")
relu_norms = measure_gradient_flow(n_layers=8, activation="relu")
for depth, norm in zip(range(len(relu_norms), 0, -1), reversed(relu_norms)):
    print(f" Layer {depth}: {norm:.2e}")

# Fix 1: ReLU Activation
import torch
import torch.nn as nn
# Sigmoid: derivative lies in (0, 0.25] and collapses towards 0 when saturated
# ReLU: derivative is exactly 0 or 1 — no attenuation for positive inputs

# Compare derivative magnitudes for a single layer on the same random inputs
z = torch.randn(100)
sigmoid_a = torch.sigmoid(z)
sigmoid_grad = sigmoid_a * (1 - sigmoid_a)  # closed form: σ'(z) = σ(z)(1 − σ(z))
print(f"Sigmoid gradient — mean: {sigmoid_grad.mean():.4f}, max: {sigmoid_grad.max():.4f}")

relu_grad = (z > 0).float()  # 1 where the unit is active, 0 elsewhere
print(f"ReLU gradient — mean: {relu_grad.mean():.4f}, max: {relu_grad.max():.4f}")

# Leaky ReLU: avoids dead neurons (gradient = α for z < 0)
leaky_relu = nn.LeakyReLU(negative_slope=0.01)
z_neg = torch.tensor([-2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2.0])
# enable_grad() keeps autograd recording even if this snippet is run inside a
# surrounding no_grad() context.
with torch.enable_grad():
    z_neg.requires_grad_(True)
    out = leaky_relu(z_neg).sum()
    out.backward()
print(f"\nLeaky ReLU gradients: {z_neg.grad.numpy()}")
# All positive: 0.01 for negatives, 1.0 for positives

# Fix 2: Batch Normalisation
import torch
import torch.nn as nn
# BatchNorm re-centres and rescales activations before the next layer
# This prevents inputs from saturating sigmoid/tanh activations
class DeepSigmoidWithBN(nn.Module):
    """Deep sigmoid MLP with BatchNorm inserted before every activation.

    Each hidden block is ``Linear(dim, dim) -> BatchNorm1d(dim) -> Sigmoid``;
    a final ``Linear(dim, 1)`` produces one logit per sample.

    Args:
        n_layers: Number of hidden blocks.
        dim: Width of the input and of every hidden layer.
    """

    def __init__(self, n_layers: int = 8, dim: int = 32):
        super().__init__()
        layers = []
        for _ in range(n_layers):
            layers.extend([
                nn.Linear(dim, dim),
                nn.BatchNorm1d(dim),  # normalise BEFORE activation
                nn.Sigmoid(),
            ])
        layers.append(nn.Linear(dim, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for a batch ``x`` with ``dim`` features per row."""
        return self.net(x)
# One forward/backward pass through the BatchNorm network on random data.
model = DeepSigmoidWithBN(n_layers=8)
X = torch.randn(64, 32)
y = torch.randint(0, 2, (64,)).float()
criterion = nn.BCEWithLogitsLoss()
loss = criterion(model(X).squeeze(-1), y)
loss.backward()

# Select Linear weights by module type. Parameters inside nn.Sequential are
# named by position ("net.0.weight", "net.1.weight", ...), so a substring test
# for "linear" never matches and would leave this list empty.
grad_norms = [
    (f"{name}.weight", module.weight.grad.norm().item())
    for name, module in model.named_modules()
    if isinstance(module, nn.Linear) and module.weight.grad is not None
]
print("Sigmoid + BatchNorm gradient norms (early to late layers):")
for name, norm in grad_norms:
    print(f" {name}: {norm:.4f}")
# Gradients should be more consistent across layers than without BatchNorm

# Fix 3: Residual Connections (Skip Connections)
import torch
import torch.nn as nn
class ResidualBlock(nn.Module):
    """Skip connection: output = ReLU(F(x) + x). Gradient flows directly through +.

    ``F`` is ``Linear -> BatchNorm1d -> ReLU -> Linear -> BatchNorm1d``, all at
    width ``dim``, so the input can be added to the branch output unchanged.

    Args:
        dim: Feature width of both the input and the output.
    """

    def __init__(self, dim: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU(),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
        )
        self.relu = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the residual branch, add the identity path, then ReLU."""
        return self.relu(self.net(x) + x)  # skip connection
class DeepResNet(nn.Module):
    """Stack of residual blocks followed by a single-logit linear head.

    Args:
        d_in: Feature width of the input; every residual block keeps this width.
        n_blocks: Number of ``ResidualBlock`` modules applied in sequence.
    """

    def __init__(self, d_in: int = 32, n_blocks: int = 10):
        super().__init__()
        self.blocks = nn.Sequential(*[ResidualBlock(d_in) for _ in range(n_blocks)])
        self.head = nn.Linear(d_in, 1)

    def forward(self, x):
        """Run ``x`` through all residual blocks, then project to one logit."""
        return self.head(self.blocks(x))
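# ResNet gradient flow: the + operation passes the gradient unchanged to both branches
# dL/dx = dL/d(F(x) + x) × (dF/dx + 1); the factor (dF/dx + 1) is ≥ 1 whenever dF/dx ≥ 0
# and stays close to 1 even when dF/dx ≈ 0 (scalar sketch; ignores the block's final ReLU)
# The "+1" term provides a direct gradient highway to early layers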
# Build a 20-block residual network and run one forward/backward pass on
# random data to inspect the gradients reaching the earliest block.
deep_resnet = DeepResNet(d_in=32, n_blocks=20)
n_params = sum(p.numel() for p in deep_resnet.parameters())
print(f"Deep ResNet (20 blocks): {n_params:,} parameters")

X = torch.randn(64, 32)
y = torch.randint(0, 2, (64,)).float()
loss = nn.BCEWithLogitsLoss()(deep_resnet(X).squeeze(-1), y)
loss.backward()

# All weight gradients should be non-zero (no vanishing).
# Only weights are listed: a Linear bias that feeds straight into BatchNorm
# gets an analytically zero gradient (the batch-mean subtraction cancels any
# constant shift), which would look like vanishing but is unrelated to depth.
weight_grads = [
    (name, p.grad.norm().item())
    for name, p in deep_resnet.named_parameters()
    if name.endswith("weight") and p.grad is not None
]
for name, grad_norm in weight_grads[:6]:
    print(f"{name}: {grad_norm:.4f}")

# Fix 4: Proper Initialisation
import torch
import torch.nn as nn
# Poor init (e.g., all zeros, large random) → saturated activations from step 1
# Xavier (Glorot) init: Var(W) = 2/(fan_in + fan_out), normal or uniform variant → keeps activation variance roughly constant from layer to layer
def compare_init_gradient_flow(init_strategy: str, n_layers: int = 5) -> float:
    """Return the first-layer weight-gradient norm for a given weight init.

    Builds ``n_layers`` blocks of ``Linear(64, 64)`` + ``Sigmoid`` plus a
    ``Linear(64, 1)`` head, initialises the hidden weights according to
    ``init_strategy``, and runs one forward/backward pass on random data.

    Args:
        init_strategy: ``"zeros"``, ``"large"`` (normal, std 5), ``"xavier"``
            or ``"kaiming"``. Any other value leaves PyTorch's default
            ``nn.Linear`` initialisation in place.
        n_layers: Number of hidden (Linear + Sigmoid) blocks.

    Returns:
        Norm of the gradient of the first parameter of the model (the first
        layer's weight), or ``0.0`` if that parameter has no gradient.
    """
    initialisers = {
        "zeros": nn.init.zeros_,
        "large": lambda w: nn.init.normal_(w, std=5.0),
        "xavier": nn.init.xavier_uniform_,
        "kaiming": lambda w: nn.init.kaiming_normal_(w, nonlinearity="relu"),
    }
    init_fn = initialisers.get(init_strategy)

    layers = []
    for _ in range(n_layers):
        linear = nn.Linear(64, 64)
        if init_fn is not None:
            init_fn(linear.weight)  # biases and the head keep their default init
        layers.extend([linear, nn.Sigmoid()])
    layers.append(nn.Linear(64, 1))
    model = nn.Sequential(*layers)

    X = torch.randn(32, 64)
    y = torch.randint(0, 2, (32,)).float()
    loss = nn.BCEWithLogitsLoss()(model(X).squeeze(-1), y)
    loss.backward()

    # parameters() yields the first layer's weight first
    first_layer_grad = next(model.parameters()).grad
    return first_layer_grad.norm().item() if first_layer_grad is not None else 0.0
# Compare how much gradient reaches the first layer under each initialisation.
for strategy in ["zeros", "large", "xavier", "kaiming"]:
    grad_norm = compare_init_gradient_flow(strategy)
    print(f"{strategy:10s}: first-layer gradient norm = {grad_norm:.2e}")

# Interview Answer
"Vanishing gradients occur when gradient signals shrink to near-zero as they propagate backward through many layers. The root cause: the chain rule multiplies many per-layer terms together, and for saturating activations those terms are small — the sigmoid derivative is at most 0.25 (tanh's is at most 1 and falls off quickly away from zero), so multiplying ten sigmoid terms gives ~10⁻⁶. Early layers receive almost no gradient and don't learn. Four solutions: (1) ReLU activation — gradient is 0 or 1, not a decaying fraction; (2) BatchNorm — keeps activations in the non-saturated regime of sigmoid/tanh by normalising before activation; (3) Residual connections (ResNet) — add x directly to F(x), providing a gradient highway that doesn't pass through activations; (4) Proper initialisation (Xavier/Kaiming) — ensures activations start in the appropriate range. ReLU + Kaiming init is the standard for most architectures; residual connections are added for networks deeper than ~10 layers."