Learnixo

Deep Learning for AI Interviews · Lesson 35 of 56

Vanishing Gradient Problem

The Problem

In backpropagation, gradients flow backward through layers via the chain rule:
  δ_L = δ_{L+1} × W_{L+1}.T × σ'(z_L)

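If |σ'(z_L)| < 1 at every layer, the gradient shrinks with each layer it passes
through (unless the weight terms compensate). Sigmoid always satisfies this
(σ'(z) ≤ 0.25); tanh satisfies it everywhere except exactly at z = 0, where tanh'(0) = 1.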

Sigmoid: σ'(z) = σ(z)(1 - σ(z)), max value = 0.25 at z = 0
         In saturated regions (|z| >> 0): σ'(z) ≈ 0

For a 10-layer sigmoid network:
  gradient at layer 1 ≈ 0.25^10 × (weight terms) ≈ 9.5 × 10⁻⁷

This means weights in early layers receive nearly zero gradient → they don't learn.
The early layers (which learn low-level features) are frozen while late layers train.

Demonstrating Vanishing Gradients

Python
import torch
import torch.nn as nn
import numpy as np

def measure_gradient_flow(
    n_layers: int,
    activation: str = "sigmoid",
) -> list[float]:
    """Return the weight-gradient norm of every Linear layer after one backward pass.

    Builds ``n_layers`` blocks of ``Linear(32, 32)`` followed by the chosen
    activation, plus a final ``Linear(32, 1)`` head. Linear layers get
    Xavier-uniform weights and zero biases. One forward/backward pass is run
    on a random batch of 64 samples with random binary targets.

    Args:
        n_layers: Number of hidden ``Linear`` + activation blocks.
        activation: One of ``"sigmoid"``, ``"tanh"`` or ``"relu"``.

    Returns:
        ``n_layers + 1`` gradient norms in forward order: index 0 is the
        first (input-side) layer, the last entry is the output head.

    Raises:
        KeyError: If ``activation`` is not one of the supported names.
    """
    act_map = {
        "sigmoid": nn.Sigmoid,
        "tanh":    nn.Tanh,
        "relu":    nn.ReLU,
    }
    Act = act_map[activation]

    # Build deep network
    layers = []
    for _ in range(n_layers):
        layers.extend([nn.Linear(32, 32), Act()])
    layers.append(nn.Linear(32, 1))
    model = nn.Sequential(*layers)

    # Xavier init to give sigmoid a fair chance
    for m in model.modules():
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            nn.init.zeros_(m.bias)

    X = torch.randn(64, 32)
    y = torch.randint(0, 2, (64,)).float()

    criterion = nn.BCEWithLogitsLoss()
    # squeeze(-1) removes only the trailing logit dimension, so the batch
    # dimension can never be squeezed away by accident.
    loss = criterion(model(X).squeeze(-1), y)
    loss.backward()

    # named_parameters() yields parameters in registration (forward) order,
    # so the resulting list runs from the input-side layer to the output head.
    return [
        param.grad.norm().item()
        for name, param in model.named_parameters()
        if "weight" in name and param.grad is not None
    ]

# measure_gradient_flow returns norms in forward order (index 0 = first layer).
# Walk the list backwards to print output → input, but label every entry with
# its true 1-based depth so "Layer 1" is the layer closest to the input.
print("Sigmoid gradient norms (output → input):")
sig_norms = measure_gradient_flow(n_layers=8, activation="sigmoid")
for depth, norm in reversed(list(enumerate(sig_norms, start=1))):
    print(f"  Layer {depth}: {norm:.2e}")

print("\nReLU gradient norms (output → input):")
relu_norms = measure_gradient_flow(n_layers=8, activation="relu")
for depth, norm in reversed(list(enumerate(relu_norms, start=1))):
    print(f"  Layer {depth}: {norm:.2e}")

Fix 1: ReLU Activation

Python
import torch
import torch.nn as nn

# Sigmoid: derivative lies in (0, 0.25] and collapses towards 0 once saturated
# ReLU: derivative is exactly 0 or 1 → no attenuation for positive inputs

# Per-unit derivative of each activation on the same random pre-activations
z = torch.randn(100)

sigmoid_a = z.sigmoid()
sigmoid_grad = sigmoid_a * (1 - sigmoid_a)  # closed form σ'(z) = σ(z)(1 - σ(z))
print(f"Sigmoid gradient — mean: {sigmoid_grad.mean():.4f}, max: {sigmoid_grad.max():.4f}")

relu_grad = z.gt(0).float()  # 1 where the unit is active, 0 elsewhere
print(f"ReLU gradient    — mean: {relu_grad.mean():.4f}, max: {relu_grad.max():.4f}")

# Leaky ReLU keeps a small slope α on the negative side, so no unit is fully dead
leaky_relu = nn.LeakyReLU(negative_slope=0.01)
z_neg = torch.tensor([-2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2.0])
with torch.enable_grad():
    z_neg.requires_grad_(True)
    out = leaky_relu(z_neg).sum()
    out.backward()
# Every entry is positive: 0.01 for negative inputs, 1.0 for positive inputs
print(f"\nLeaky ReLU gradients: {z_neg.grad.numpy()}")

Fix 2: Batch Normalisation

Python
import torch
import torch.nn as nn

# BatchNorm re-centres and rescales activations before the next layer
# This prevents inputs from saturating sigmoid/tanh activations

class DeepSigmoidWithBN(nn.Module):
    """Stack of Linear → BatchNorm1d → Sigmoid blocks (width 32) with a one-logit head."""

    def __init__(self, n_layers: int = 8):
        super().__init__()
        stack = []
        for _ in range(n_layers):
            stack.append(nn.Linear(32, 32))
            # BatchNorm sits between the Linear layer and the sigmoid, so the
            # sigmoid receives normalised pre-activations.
            stack.append(nn.BatchNorm1d(32))
            stack.append(nn.Sigmoid())
        stack.append(nn.Linear(32, 1))  # single-logit output head
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        logits = self.net(x)
        return logits

model = DeepSigmoidWithBN(n_layers=8)
X = torch.randn(64, 32)
y = torch.randint(0, 2, (64,)).float()

criterion = nn.BCEWithLogitsLoss()
# squeeze(-1) drops only the trailing logit dimension so the shape matches y
loss = criterion(model(X).squeeze(-1), y)
loss.backward()

# Parameter names inside nn.Sequential are positional ("net.0.weight", ...)
# and never contain "linear", so filtering by name would select nothing.
# Select the Linear layers by module type instead; named_modules() walks them
# in forward order.
grad_norms = [
    (f"{name}.weight", module.weight.grad.norm().item())
    for name, module in model.named_modules()
    if isinstance(module, nn.Linear) and module.weight.grad is not None
]

print("Sigmoid + BatchNorm gradient norms (early to late layers):")
for name, norm in grad_norms:
    print(f"  {name}: {norm:.4f}")
# Gradients should be more consistent across layers vs without BN

Fix 3: Residual Connections (Skip Connections)

Python
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    """Residual unit computing ``relu(F(x) + x)``.

    ``F`` is Linear → BatchNorm1d → ReLU → Linear → BatchNorm1d, all of width
    ``dim``. The identity branch lets the gradient bypass ``F`` through the
    addition.
    """

    def __init__(self, dim: int):
        super().__init__()
        branch = [
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU(),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
        ]
        self.net = nn.Sequential(*branch)
        self.relu = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        transformed = self.net(x)
        # Skip connection: add the untouched input back before the final ReLU
        return self.relu(transformed + x)

class DeepResNet(nn.Module):
    """``n_blocks`` ResidualBlocks of width ``d_in`` followed by a one-logit Linear head."""

    def __init__(self, d_in: int = 32, n_blocks: int = 10):
        super().__init__()
        residual_stack = (ResidualBlock(d_in) for _ in range(n_blocks))
        self.blocks = nn.Sequential(*residual_stack)
        self.head = nn.Linear(d_in, 1)

    def forward(self, x):
        features = self.blocks(x)
        return self.head(features)

# ResNet gradient flow: the addition hands the incoming gradient to both branches unchanged
# dL/dx = dL/d(F(x)+x) × (dF/dx + 1), and that factor stays > 0 whenever dF/dx > -1
# The "+1" term provides a direct gradient highway to early layers

deep_resnet = DeepResNet(d_in=32, n_blocks=20)
n_params = sum(p.numel() for p in deep_resnet.parameters())
print(f"Deep ResNet (20 blocks): {n_params:,} parameters")

X = torch.randn(64, 32)
y = torch.randint(0, 2, (64,)).float()
criterion = nn.BCEWithLogitsLoss()
logits = deep_resnet(X).squeeze()
loss = criterion(logits, y)
loss.backward()

# Report the first six parameters (the first block's Linear, BatchNorm and
# second Linear). Weight gradients should be non-zero (no vanishing); a Linear
# bias that feeds BatchNorm is cancelled by the mean subtraction, so its
# gradient is ~0 by construction.
for idx, (name, p) in enumerate(deep_resnet.named_parameters()):
    if idx >= 6:
        break
    if p.grad is None:
        continue
    print(f"{name}: {p.grad.norm():.4f}")

Fix 4: Proper Initialisation

Python
import torch
import torch.nn as nn

# Poor init → little or no gradient from step 1: all-zero weights block the backward signal,
#             large random weights saturate the sigmoid
# Xavier init: Var(W) = 2/(fan_in + fan_out) → keeps activation variance roughly constant across layers

def compare_init_gradient_flow(init_strategy: str, n_layers: int = 5) -> float:
    """Return the first-layer weight-gradient norm of a sigmoid MLP for one init scheme.

    Builds ``n_layers`` blocks of ``Linear(64, 64)`` + ``Sigmoid`` and a
    ``Linear(64, 1)`` head, then runs one forward/backward pass on a random
    batch of 32 samples with random binary targets.

    Args:
        init_strategy: ``"zeros"``, ``"large"``, ``"xavier"`` or ``"kaiming"``.
            Any other value leaves PyTorch's default Linear initialisation.
        n_layers: Number of hidden ``Linear`` + ``Sigmoid`` blocks.

    Returns:
        Norm of the gradient of the model's first parameter (the first
        layer's weight), or ``0.0`` if that gradient is ``None``.
    """
    initialisers = {
        "zeros": nn.init.zeros_,
        "large": lambda w: nn.init.normal_(w, std=5.0),
        "xavier": nn.init.xavier_uniform_,
        "kaiming": lambda w: nn.init.kaiming_normal_(w, nonlinearity="relu"),
    }
    # Unknown names yield None, which keeps the layer's default initialisation
    init_fn = initialisers.get(init_strategy)

    stack = []
    for _ in range(n_layers):
        hidden = nn.Linear(64, 64)
        if init_fn is not None:
            init_fn(hidden.weight)
        stack += [hidden, nn.Sigmoid()]
    stack.append(nn.Linear(64, 1))
    model = nn.Sequential(*stack)

    X = torch.randn(32, 64)
    y = torch.randint(0, 2, (32,)).float()
    criterion = nn.BCEWithLogitsLoss()
    criterion(model(X).squeeze(), y).backward()

    # The first parameter of the Sequential is the first layer's weight matrix
    grad = next(model.parameters()).grad
    if grad is None:
        return 0.0
    return grad.norm().item()

# Same sigmoid network each time; only the weight initialisation differs
strategies = ("zeros", "large", "xavier", "kaiming")
for strategy in strategies:
    grad_norm = compare_init_gradient_flow(strategy)
    print(f"{strategy:10s}: first-layer gradient norm = {grad_norm:.2e}")

Interview Answer

"Vanishing gradients occur when gradient signals shrink to near-zero as they propagate backward through many layers. The root cause: chain rule multiplies many terms together, and for sigmoid/tanh activations, each term is at most 0.25 (sigmoid) — multiplying ten of these gives ~10⁻⁶. Early layers receive almost no gradient and don't learn. Four solutions: (1) ReLU activation — gradient is 0 or 1, not a decaying fraction; (2) BatchNorm — keeps activations in the non-saturated regime of sigmoid/tanh by normalising before activation; (3) Residual connections (ResNet) — add x directly to F(x), providing a gradient highway that doesn't pass through activations; (4) Proper initialisation (Xavier/Kaiming) — ensures activations start in the appropriate range. ReLU + Kaiming init is the standard for most architectures; residual connections are added for networks deeper than ~10 layers."