Deep Learning for AI Interviews · Lesson 39 of 56
Leaky ReLU, ELU, and GELU
The Dead Neuron Problem
ReLU(z) = max(0, z)
If a neuron's pre-activation is negative for ALL training examples:
- Output is always 0
- Gradient is always 0
- Weight update: ΔW = 0 — the neuron never learns
Causes:
1. Large negative bias (initialisation or gradient update)
2. Large learning rate causing a bad weight update
3. Data distribution mismatch
Consequences:
- Loss of information-processing capacity (dead neurons contribute nothing to the model)
- Model may plateau even with more training
Leaky ReLU fixes this by allowing a small gradient for negative inputs.
Leaky ReLU
import torch
import torch.nn as nn
# LeakyReLU(z) = z if z > 0 else α·z
# α (negative_slope) is small: default 0.01
# Gradient: 1 for z > 0, α for z ≤ 0 — never exactly 0
leaky = nn.LeakyReLU(negative_slope=0.01)

# Track gradients from creation so backward() populates z.grad.
z = torch.tensor([-3.0, -1.0, -0.5, 0.0, 0.5, 1.0, 3.0], requires_grad=True)
out = leaky(z)
# Summing gives a scalar to differentiate; d(sum)/dz_i is the slope at z_i.
out.sum().backward()

print("Leaky ReLU:")
print(f"{'z':>8} {'output':>10} {'gradient':>10}")
# Convert to plain Python floats once instead of calling .item() per element.
for zi, oi, gi in zip(z.detach().tolist(), out.detach().tolist(), z.grad.tolist()):
    print(f"{zi:>8.2f} {oi:>10.4f} {gi:>10.4f}")
# Gradient is 0.01 for negative, 1.0 for positive — dead neurons avoided
All Major ReLU Variants
import torch
import torch.nn as nn
import numpy as np
def compare_activations() -> None:
    """Print a table comparing the major ReLU variants on z = -3 ... 3.

    Each row is one input value and each column one activation. Column
    headers are cut to their first eight characters before being
    right-aligned in a 10-wide column.
    """
    z = torch.linspace(-3, 3, 7)
    activations = {
        "ReLU": nn.ReLU(),
        "Leaky ReLU": nn.LeakyReLU(0.01),
        "ELU": nn.ELU(alpha=1.0),
        "SELU": nn.SELU(),
        "GELU": nn.GELU(),
        "SiLU (Swish)": nn.SiLU(),
        "Mish": nn.Mish(),
    }

    print(f"{'z':>8}", end="")
    for name in activations:
        print(f" {name[:8]:>10}", end="")
    print()

    # Apply each activation once to the whole input vector rather than once
    # per scalar; the modules are applied element-wise, so the table is the same.
    columns = [act(z).tolist() for act in activations.values()]

    # zip(*columns) turns per-activation columns into per-input rows.
    for z_val, row in zip(z.tolist(), zip(*columns)):
        print(f"{z_val:>8.2f}", end="")
        for val in row:
            print(f" {val:>10.4f}", end="")
        print()


compare_activations()
# Key differences at z = -1:
# ReLU: 0.0000 (dead if z always negative)
# Leaky: -0.0100 (tiny gradient preserved)
# ELU: -0.6321 (smooth, continuous at 0)
# GELU: -0.1587 (smooth, probabilistic gating; the tanh approximation gives -0.1588)
# SiLU: -0.2689 (smooth, self-gated)
ELU: Exponential Linear Unit
import torch
import torch.nn as nn
# ELU(z) = z if z > 0 else α(e^z - 1)
# Properties:
# - Smooth at z=0 for α = 1 (continuous first derivative, unlike ReLU)
# - Negative output for z < 0 → mean activation pulled closer to zero
# - Never exactly dead: gradient = α·e^z > 0 for z < 0
elu = nn.ELU(alpha=1.0)

z = torch.tensor([-3.0, -1.0, -0.1, 0.0, 0.1, 1.0, 3.0], requires_grad=True)
out = elu(z)
# backward() on the scalar sum fills z.grad with the per-element derivative.
out.sum().backward()

# Report the z = -1 entry from the tensors computed above, so the printed
# gradient comes from autograd instead of a hand-written α·e^z formula.
idx = z.detach().tolist().index(-1.0)

# Seeded generator so the sampled mean is reproducible between runs.
gen = torch.Generator().manual_seed(0)
mean_activation = elu(torch.randn(10000, generator=gen)).mean().item()

print("ELU properties:")
print(f" At z=-1: output={out[idx].item():.4f}")
print(f" Gradient at z=-1: {z.grad[idx].item():.4f}")
# For N(0,1) inputs the mean is about 0.16: closer to zero than ReLU's
# (about 0.40), but not exactly zero.
print(f" Zero-centred mean for N(0,1) inputs: {mean_activation:.4f}")
# When to use ELU:
# - When dead neurons are a problem and Leaky ReLU isn't sufficient
# - When you want zero-centred activations (can improve convergence)
# - Slower than ReLU due to exp() computation
GELU: Gaussian Error Linear Unit
import torch
import torch.nn as nn
import math
# GELU(z) = z · Φ(z) where Φ is the standard normal CDF
# Approximation: 0.5z · (1 + tanh(√(2/π) · (z + 0.044715z³)))
# Properties:
# - Smooth everywhere (differentiable)
# - Non-zero output for negative z (unlike ReLU, which is flat at 0 there)
# - Probabilistic gating: weights input by Φ(z) = P(X ≤ z) for X ~ N(0,1)
# - Standard in Transformers: BERT, GPT-2, GPT-3, RoBERTa, T5
gelu = nn.GELU()


# Manual approximation (used in original BERT)
def gelu_approx(z: torch.Tensor) -> torch.Tensor:
    """Return the tanh approximation of GELU, applied element-wise.

    Computes 0.5·z·(1 + tanh(√(2/π)·(z + 0.044715·z³))).

    Args:
        z: Input tensor.

    Returns:
        A tensor with the same shape as ``z``.
    """
    scale = math.sqrt(2 / math.pi)
    inner = scale * (z + 0.044715 * z**3)
    return 0.5 * z * (1 + torch.tanh(inner))
# Compare PyTorch's GELU module with the manual tanh approximation.
z_test = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
gelu_exact = gelu(z_test)
gelu_manual = gelu_approx(z_test)

print("GELU (PyTorch) vs GELU approximation:")
# Convert to plain Python floats once instead of calling .item() per element.
for zi, ge, gm in zip(z_test.tolist(), gelu_exact.tolist(), gelu_manual.tolist()):
    print(f" z={zi:>5.1f}: exact={ge:.4f}, approx={gm:.4f}")
# Use GELU for:
# Transformer FFN blocks (BERT, GPT, T5, ViT)
# Any architecture where smooth activation improves performance
SiLU (Swish)
import torch
import torch.nn as nn
# SiLU(z) = z · σ(z) = z / (1 + e^{-z})
# Properties:
# - Self-gated: output is input weighted by its own sigmoid
# - Smooth, non-monotonic (has a local minimum at z ≈ -1.28)
# - Used in: EfficientNet, MobileNetV3, many modern CNNs
# - Empirically outperforms ReLU on many vision tasks
silu = nn.SiLU()
z = torch.tensor([-3.0, -1.0, 0.0, 1.0, 3.0])

# Evaluate both forms once on the whole tensor instead of per element.
silu_vals = silu(z)
manual_vals = z * torch.sigmoid(z)  # Manual: z * sigmoid(z)

print("SiLU values:")
for zi, val, manual in zip(z.tolist(), silu_vals.tolist(), manual_vals.tolist()):
    print(f" z={zi:>5.1f}: SiLU={val:.4f}, manual={manual:.4f}")

# Practical Decision Guide
import torch.nn as nn
def choose_activation(
    architecture: str,
    dataset_size: int,
    dead_neuron_problem: bool = False,
) -> nn.Module:
    """Choose an activation module based on context.

    Args:
        architecture: Architecture family. "transformer" and
            "efficientnet_style" get dedicated activations; every other
            value (including "mlp" and "resnet") falls through to the
            ReLU-family logic.
        dataset_size: Number of training examples. Only consulted when
            ``dead_neuron_problem`` is true.
        dead_neuron_problem: Whether dead ReLU units have been observed.
            Ignored for the two architectures with dedicated activations.

    Returns:
        A freshly constructed activation module.
    """
    # Architecture conventions take precedence over everything else.
    if architecture == "transformer":
        return nn.GELU()  # standard for attention-based models
    if architecture == "efficientnet_style":
        return nn.SiLU()  # smooth gating, good for CNNs

    if not dead_neuron_problem:
        # "mlp", "resnet" and any unrecognised architecture share the same
        # fast, reliable default, so one return covers them all.
        return nn.ReLU()

    if dataset_size < 10_000:
        return nn.ELU()  # smooth, non-dying
    return nn.LeakyReLU(0.01)  # simple fix
# Demo: print the recommendation for a few (architecture, dataset size) pairs.
for arch, n in [("mlp", 10000), ("transformer", 100000), ("efficientnet_style", 50000)]:
    act = choose_activation(arch, n)
    print(f"{arch:20s} (n={n:>7,}): {type(act).__name__}")

# Interview Answer
"ReLU variants solve the dead neuron problem: neurons that output 0 for all training inputs receive zero gradient and never update. Leaky ReLU uses a small negative slope (0.01) for z ≤ 0, providing a tiny gradient that keeps neurons alive. ELU uses an exponential for z < 0, producing smooth, zero-centred activations that can improve convergence — at the cost of an extra exp() computation. GELU, the standard for Transformers (BERT, GPT), weights the input by P(x > 0) under a Gaussian, producing a smooth probabilistic gating effect. SiLU (Swish), z·σ(z), is used in EfficientNet and modern CNNs. Practical recommendation: ReLU with Kaiming init for MLP and ResNet; GELU for any Transformer architecture; LeakyReLU if dead neurons are detected (monitor via the fraction of neurons outputting 0 for a test batch)."