Backpropagation
How backprop computes gradients layer by layer using the chain rule — the algorithm that makes deep learning trainable.
The Problem Backprop Solves
We need dL/dW for every weight W in the network.
Naively computing this for a 100M parameter network would take 100M forward passes.
Backpropagation solves this in two forward+backward passes:
1. Forward pass: compute predictions and save intermediate activations
2. Backward pass: propagate gradient signal from output to input
using the chain rule
Result: dL/dW for every W in one pass — O(forward pass cost).
This is what makes training deep networks feasible.The Chain Rule
If L = f(g(x)), then dL/dx = (dL/dg) × (dg/dx)
For a 3-layer network:
L = loss(a3)
a3 = σ(z3), z3 = a2 @ W3 + b3
a2 = σ(z2), z2 = a1 @ W2 + b2
a1 = σ(z1), z1 = x @ W1 + b1
dL/dW3 = dL/da3 × da3/dz3 × dz3/dW3 (only 3 terms)
dL/dW2 = dL/da3 × da3/dz3 × dz3/da2 × da2/dz2 × dz2/dW2
dL/dW1 = ... × dz2/da1 × da1/dz1 × dz1/dW1
Key insight: each gradient reuses the gradient from the layer above (δ_L).
δ_L = dL/dz_L (error signal for layer L)
δ_{L-1} = (W_L.T @ δ_L) * σ'(z_{L-1}) (backpropagate δ)Manual Backprop for a 2-Layer Network
import numpy as np
class ManualMLP:
"""Two-layer MLP with manual backpropagation (sigmoid activations)."""
def __init__(self, d_in: int, d_hidden: int, d_out: int):
self.W1 = np.random.randn(d_in, d_hidden) * 0.1
self.b1 = np.zeros(d_hidden)
self.W2 = np.random.randn(d_hidden, d_out) * 0.1
self.b2 = np.zeros(d_out)
@staticmethod
def sigmoid(z): return 1 / (1 + np.exp(-np.clip(z, -500, 500)))
@staticmethod
def sigmoid_grad(a): return a * (1 - a) # σ'(z) in terms of a = σ(z)
def forward(self, X: np.ndarray) -> np.ndarray:
"""Forward pass — save intermediate values for backward."""
self.X = X
self.z1 = X @ self.W1 + self.b1 # (n, d_hidden)
self.a1 = self.sigmoid(self.z1) # (n, d_hidden)
self.z2 = self.a1 @ self.W2 + self.b2 # (n, d_out)
self.a2 = self.sigmoid(self.z2) # (n, d_out)
return self.a2
def backward(self, y: np.ndarray, lr: float = 0.01) -> float:
"""Backward pass — chain rule, layer by layer from output to input."""
n = len(y)
y = y.reshape(-1, 1)
# Loss: BCE = -[y·log(a2) + (1-y)·log(1-a2)]
loss = -np.mean(y * np.log(self.a2 + 1e-8) + (1-y) * np.log(1-self.a2 + 1e-8))
# dL/da2
dL_da2 = -(y / (self.a2 + 1e-8) - (1-y) / (1-self.a2 + 1e-8)) / n
# dL/dz2 = dL/da2 × da2/dz2 = dL/da2 × σ'(z2) = dL/da2 × a2(1-a2)
delta2 = dL_da2 * self.sigmoid_grad(self.a2) # (n, d_out)
# dL/dW2 = a1.T @ delta2
dW2 = self.a1.T @ delta2 # (d_hidden, d_out)
db2 = delta2.sum(axis=0) # (d_out,)
# dL/da1 = delta2 @ W2.T (backprop through linear layer)
dL_da1 = delta2 @ self.W2.T # (n, d_hidden)
# dL/dz1 = dL/da1 × σ'(z1)
delta1 = dL_da1 * self.sigmoid_grad(self.a1) # (n, d_hidden)
# dL/dW1 = X.T @ delta1
dW1 = self.X.T @ delta1 # (d_in, d_hidden)
db1 = delta1.sum(axis=0) # (d_hidden,)
# Weight updates
self.W2 -= lr * dW2
self.b2 -= lr * db2
self.W1 -= lr * dW1
self.b1 -= lr * db1
return float(loss)
# Test on XOR
mlp = ManualMLP(d_in=2, d_hidden=4, d_out=1)
X = np.array([[0,0],[0,1],[1,0],[1,1]], dtype=np.float32)
y = np.array([0, 1, 1, 0], dtype=np.float32)
for epoch in range(5000):
preds = mlp.forward(X)
loss = mlp.backward(y, lr=0.5)
if epoch % 1000 == 0:
print(f"Epoch {epoch:5d}: loss={loss:.4f}, preds={preds.flatten().round(2)}")PyTorch Autograd
import torch
import torch.nn as nn
# PyTorch builds the computation graph dynamically
# and traverses it backward automatically
model = nn.Sequential(
nn.Linear(2, 4),
nn.Sigmoid(),
nn.Linear(4, 1),
nn.Sigmoid(),
)
X = torch.tensor([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
y = torch.tensor([0.0, 1.0, 1.0, 0.0])
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)
# One step, comparing manual vs autograd gradients
optimizer.zero_grad()
preds = model(X).squeeze()
loss = criterion(preds, y)
loss.backward() # autograd computes all gradients
# Inspect computed gradients
for name, param in model.named_parameters():
print(f"{name:15s}: grad norm = {param.grad.norm().item():.6f}")
# Verify: the gradients should match what ManualMLP computes
optimizer.step()
print(f"Loss after step: {criterion(model(X).squeeze(), y).item():.4f}")Gradient Flow Visualisation
import torch
import torch.nn as nn
def check_gradient_flow(model: nn.Module) -> None:
"""Print gradient statistics for each layer after backward."""
print(f"{'Layer':40s} {'Mean |grad|':>12} {'Max |grad|':>12} {'Has NaN?':>10}")
print("-" * 80)
for name, param in model.named_parameters():
if param.grad is None:
print(f"{name:40s} {'(no grad)':>12}")
continue
g = param.grad.abs()
has_nan = torch.isnan(g).any().item()
print(f"{name:40s} {g.mean().item():>12.2e} {g.max().item():>12.2e} {str(has_nan):>10}")
# Build a deep network, do a forward+backward, then check
deep_model = nn.Sequential(
*[nn.Sequential(nn.Linear(32, 32), nn.Sigmoid()) for _ in range(6)],
nn.Linear(32, 1),
)
X = torch.randn(16, 32)
y = torch.randint(0, 2, (16,)).float()
criterion = nn.BCEWithLogitsLoss()
loss = criterion(deep_model(X).squeeze(), y)
loss.backward()
print("Gradient flow through 6-layer sigmoid network:")
check_gradient_flow(deep_model)
# Expect: gradient norms shrink dramatically in early layers (vanishing gradient)Interview Answer
"Backpropagation computes dL/dW for every weight in O(one forward pass) time by recursively applying the chain rule from output to input. Forward pass: propagate input through layers, saving intermediate activations. Backward pass: compute 'delta' (error signal) at the output layer, then propagate it backward — delta for layer L is: δ_L = (W_.T @ δ_) × σ'(z_L). The weight gradient is δ_L.T @ a_. PyTorch implements this automatically via dynamic computation graphs (autograd): every tensor operation records itself, and backward() traverses the graph applying chain rule. The key insight is that intermediate activations must be saved during the forward pass to compute gradients during the backward pass — this is why larger batch sizes require more GPU memory. Without backprop, training a neural network would require one forward pass per parameter to compute finite-difference gradients — completely infeasible for modern networks."
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.