Leaky ReLU and ReLU Variants
Why Leaky ReLU, ELU, PReLU, and GELU were invented, what problem each solves, and when to use them over plain ReLU.
The Dead Neuron Problem
ReLU(z) = max(0, z)
If a neuron's pre-activation is negative for ALL training examples:
- Output is always 0
- Gradient is always 0
- Weight update: ΔW = 0 — the neuron never learns
Causes:
1. Large negative bias (initialisation or gradient update)
2. Large learning rate causing a bad weight update
3. Data distribution mismatch
Consequences:
- Loss of information-processing capacity (a dead neuron contributes nothing to the network's output)
- Model may plateau even with more training
Leaky ReLU fixes this by allowing a small gradient for negative inputs.
Leaky ReLU
import torch
import torch.nn as nn
# LeakyReLU(z) = z if z > 0 else α·z
# α (negative_slope) is small: default 0.01
# Gradient: 1 for z > 0, α for z ≤ 0 — never exactly 0
leaky = nn.LeakyReLU(negative_slope=0.01)
z = torch.tensor([-3.0, -1.0, -0.5, 0.0, 0.5, 1.0, 3.0])
# NOTE: nothing below references torch.autograd by name; the import is kept
# only so the snippet's import set stays unchanged.
import torch.autograd
# Track gradients on z so that backward() fills z.grad with d(out)/dz.
z.requires_grad_(True)
out = leaky(z)
# Summing gives a scalar to differentiate; z.grad[i] is then the slope of
# the activation at z[i].
out.sum().backward()
print("Leaky ReLU:")
print(f"{'z':>8} {'output':>10} {'gradient':>10}")
# detach() so the table is built from plain values, outside the autograd graph.
for zi, oi, gi in zip(z.detach(), out.detach(), z.grad):
    print(f"{zi.item():>8.2f} {oi.item():>10.4f} {gi.item():>10.4f}")
# Gradient is 0.01 for negative, 1.0 for positive — dead neurons avoided
All Major ReLU Variants
import torch
import torch.nn as nn
import numpy as np
def compare_activations() -> None:
    """Print a table comparing the major ReLU variants on a small input grid.

    Rows are seven evenly spaced inputs from -3 to 3; each column is one
    activation. Header labels are cut to their first eight characters and
    right-aligned in ten-character columns, matching the value columns.
    """
    z = torch.linspace(-3, 3, 7)
    activations = {
        "ReLU": nn.ReLU(),
        "Leaky ReLU": nn.LeakyReLU(0.01),
        "ELU": nn.ELU(alpha=1.0),
        "SELU": nn.SELU(),
        "GELU": nn.GELU(),
        "SiLU (Swish)": nn.SiLU(),
        "Mish": nn.Mish(),
    }

    # Header row.
    print(f"{'z':>8}", end="")
    for name in activations:
        print(f" {name[:8]:>10}", end="")
    print()

    # One row per input; dict order keeps values under their headers.
    for z_val in z:
        print(f"{z_val.item():>8.2f}", end="")
        for act in activations.values():
            # unsqueeze(0) turns the 0-d element into a 1-element tensor
            # before it is passed to the module.
            val = act(z_val.unsqueeze(0)).item()
            print(f" {val:>10.4f}", end="")
        print()
# Run the comparison; the table is printed to stdout.
compare_activations()
# Key differences at z = -1 (values as printed by the table above):
# ReLU: 0.0000 (output and gradient are both zero; dead if z is always negative)
# Leaky: -0.0100 (small negative slope keeps a non-zero gradient)
# ELU: -0.6321 (smooth, continuous at 0)
# GELU: -0.1587 (smooth, probabilistic gating; the tanh approximation gives -0.1588)
# SiLU: -0.2689 (smooth, self-gated)
ELU: Exponential Linear Unit
import torch
import torch.nn as nn
# ELU(z) = z if z > 0 else α(e^z - 1)
# Properties:
# - Smooth at z=0 when α = 1 (continuous first derivative, unlike ReLU)
# - Negative output for z < 0 → zero-centred mean activation
# - Never exactly dead: gradient = α·e^z > 0 for z < 0
elu = nn.ELU(alpha=1.0)
z = torch.tensor([-3.0, -1.0, -0.1, 0.0, 0.1, 1.0, 3.0], requires_grad=True)
out = elu(z)
# Summing yields a scalar, so backward() leaves the per-element slope of ELU
# in z.grad.
out.sum().backward()
neg_one = 1  # position of the z = -1.0 entry in the tensor above
print("ELU properties:")
# Reuse the forward pass already computed instead of evaluating ELU again.
print(f" At z=-1: output={out[neg_one].item():.4f}")
# Report the slope autograd computed rather than a hand-written α·e^z with α
# hard-coded; the printed value stays correct if alpha above is changed.
print(f" Gradient at z=-1: {z.grad[neg_one].item():.4f}")
# The sample mean is not exactly zero: for standard-normal inputs it sits
# around 0.16, compared with roughly 0.40 for plain ReLU.
print(f" Zero-centred mean for N(0,1) inputs: {elu(torch.randn(10000)).mean().item():.4f}")
# When to use ELU:
# - When dead neurons are a problem and Leaky ReLU isn't sufficient
# - When you want zero-centred activations (can improve convergence)
# - Slower than ReLU due to exp() computation
GELU: Gaussian Error Linear Unit
import torch
import torch.nn as nn
import math
# GELU(z) = z · Φ(z) where Φ is the standard normal CDF
# Approximation: 0.5z · (1 + tanh(√(2/π) · (z + 0.044715z³)))
# Properties:
# - Smooth everywhere (differentiable)
# - Small non-zero output for negative z (ReLU clamps these to exactly 0); GELU(0) = 0
# - Probabilistic gating: scales the input by Φ(z) = P(X ≤ z) for X ~ N(0,1)
# - Standard in Transformers: BERT, GPT-2, GPT-3, RoBERTa, T5
gelu = nn.GELU()
# Manual approximation (used in original BERT)
def gelu_approx(z: torch.Tensor) -> torch.Tensor:
    """Return the tanh approximation of GELU, applied element-wise.

    Evaluates 0.5·z·(1 + tanh(√(2/π)·(z + 0.044715·z³))).

    Args:
        z: Input tensor of any shape.

    Returns:
        A tensor with the same shape as ``z``.
    """
    scale = math.sqrt(2 / math.pi)
    inner = scale * (z + 0.044715 * z**3)
    return 0.5 * z * (1 + torch.tanh(inner))
# Evaluate the same inputs with the built-in module and with the hand-written
# approximation, then print the two results side by side.
z_test = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
gelu_exact = gelu(z_test)
gelu_manual = gelu_approx(z_test)
print("GELU (PyTorch) vs GELU approximation:")
for zi, ge, gm in zip(z_test, gelu_exact, gelu_manual):
    print(f" z={zi.item():>5.1f}: exact={ge.item():.4f}, approx={gm.item():.4f}")
# Use GELU for:
# Transformer FFN blocks (BERT, GPT, T5, ViT)
# Any architecture where smooth activation improves performance
SiLU (Swish)
import torch
import torch.nn as nn
# SiLU(z) = z · σ(z) = z / (1 + e^{-z})
# Properties:
# - Self-gated: output is input weighted by its own sigmoid
# - Smooth, non-monotonic (dips to a minimum of about -0.28 near z ≈ -1.28)
# - Used in: EfficientNet, MobileNetV3, many modern CNNs
# - Empirically outperforms ReLU on many vision tasks
silu = nn.SiLU()
z = torch.tensor([-3.0, -1.0, 0.0, 1.0, 3.0])
print("SiLU values:")
# Print the module's output next to z * sigmoid(z) computed by hand, one
# input at a time.
for zi in z:
    val = silu(zi).item()
    # Manual: z * sigmoid(z)
    manual = zi.item() * torch.sigmoid(zi).item()
    print(f" z={zi.item():>5.1f}: SiLU={val:.4f}, manual={manual:.4f}")

# Practical Decision Guide
import torch.nn as nn
def choose_activation(
    architecture: str,
    dataset_size: int,
    dead_neuron_problem: bool = False,
) -> nn.Module:
    """Choose an activation module based on context.

    Rules are checked in order and the first match wins:

    1. ``"transformer"`` -> GELU
    2. ``"efficientnet_style"`` -> SiLU
    3. ``dead_neuron_problem`` set -> ELU when ``dataset_size`` is below
       10,000, otherwise LeakyReLU with slope 0.01
    4. anything else (including ``"mlp"`` and ``"resnet"``) -> ReLU

    Args:
        architecture: Architecture tag, compared by exact string equality.
        dataset_size: Number of training examples; only consulted when
            ``dead_neuron_problem`` is true.
        dead_neuron_problem: Whether dead neurons have been observed.

    Returns:
        A freshly constructed activation module.
    """
    if architecture == "transformer":
        return nn.GELU()  # standard for attention-based models
    if architecture == "efficientnet_style":
        return nn.SiLU()  # smooth gating, good for CNNs
    if dead_neuron_problem:
        if dataset_size < 10_000:
            return nn.ELU()  # smooth, non-dying
        return nn.LeakyReLU(0.01)  # simple fix
    # "mlp" and "resnet" used to have their own branch that returned the same
    # module as this fall-through, so a single default covers them.
    return nn.ReLU()  # fast, reliable default
# Show which activation is picked for a few (architecture, dataset size) pairs.
for arch, n in [("mlp", 10000), ("transformer", 100000), ("efficientnet_style", 50000)]:
    act = choose_activation(arch, n)
    print(f"{arch:20s} (n={n:>7,}): {type(act).__name__}")

# Interview Answer
"ReLU variants solve the dead neuron problem: neurons that output 0 for all training inputs receive zero gradient and never update. Leaky ReLU uses a small negative slope (0.01) for z ≤ 0, providing a tiny gradient that keeps neurons alive. ELU uses an exponential for z < 0, producing smooth activations whose mean is closer to zero, which can improve convergence — at the cost of an extra exp() computation. GELU, the standard for Transformers (BERT, GPT), weights the input by Φ(z), the probability that a standard Gaussian variable falls below z, producing a smooth probabilistic gating effect. SiLU (Swish), z·σ(z), is used in EfficientNet and modern CNNs. Practical recommendation: ReLU with Kaiming init for MLP and ResNet; GELU for any Transformer architecture; LeakyReLU if dead neurons are detected (monitor via the fraction of neurons outputting 0 for a test batch)."
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.