Deep Learning for AI Interviews · Lesson 39 of 56

Leaky ReLU, ELU, and GELU

The Dead Neuron Problem

ReLU(z) = max(0, z)

If a neuron's pre-activation is negative for ALL training examples:
  - Output is always 0
  - Gradient is always 0
  - Weight update: ΔW = 0 — the neuron never learns

Causes:
  1. Large negative bias (initialisation or gradient update)
  2. Large learning rate causing a bad weight update
  3. Data distribution mismatch (e.g. a shift in the inputs that pushes the pre-activation negative for every example)

Consequences:
  - Loss of information-processing capacity (a dead neuron contributes nothing to the output)
  - Model may plateau even with more training

Leaky ReLU fixes this by allowing a small gradient for negative inputs.

Leaky ReLU

Python

import torch
import torch.nn as nn

# Leaky ReLU keeps positive inputs unchanged and scales the rest by α:
#   LeakyReLU(z) = z      if z > 0
#                = α·z    otherwise
# α is the `negative_slope` argument (0.01 here, which is also the default).
# Gradient: 1 for z > 0, α for z ≤ 0 — never exactly 0.

import torch.autograd

leaky = nn.LeakyReLU(negative_slope=0.01)

# Track gradients on the input so d(out)/dz can be read back from z.grad.
z = torch.tensor([-3.0, -1.0, -0.5, 0.0, 0.5, 1.0, 3.0], requires_grad=True)
out = leaky(z)

# backward() needs a scalar; summing makes each element's gradient equal to
# the local slope of the activation at that input.
out.sum().backward()

header = f"{'z':>8} {'output':>10} {'gradient':>10}"
print("Leaky ReLU:")
print(header)
rows = zip(z.detach().tolist(), out.detach().tolist(), z.grad.tolist())
for z_val, out_val, grad_val in rows:
    print(f"{z_val:>8.2f} {out_val:>10.4f} {grad_val:>10.4f}")

# The gradient column reads 0.01 for z ≤ 0 and 1.0 for z > 0, so a neuron
# whose pre-activation is negative still receives a (small) update.

All Major ReLU Variants

Python

import torch
import torch.nn as nn
import numpy as np

def compare_activations() -> None:
    """Print a table of the major ReLU variants evaluated on z = -3, -2, ..., 3.

    Each row is one input value and each column one activation function.
    Columns are sized to the longest activation name so every header is
    printed in full. Nothing is returned; the table goes to stdout.
    """
    z = torch.linspace(-3, 3, 7)

    activations = {
        "ReLU":          nn.ReLU(),
        "Leaky ReLU":    nn.LeakyReLU(0.01),
        "ELU":           nn.ELU(alpha=1.0),
        "SELU":          nn.SELU(),
        "GELU":          nn.GELU(),
        "SiLU (Swish)":  nn.SiLU(),
        "Mish":          nn.Mish(),
    }

    # Fit the column to the longest label (never narrower than 10) so names
    # such as "Leaky ReLU" and "SiLU (Swish)" are not cut off in the header.
    width = max(10, *(len(name) for name in activations))

    # Apply every activation once to the whole input vector; columns[k][i]
    # is activation k evaluated at z[i].
    columns = [act(z).tolist() for act in activations.values()]

    header = f"{'z':>8}" + "".join(f" {name:>{width}}" for name in activations)
    print(header)

    # zip(*columns) transposes the per-activation columns into per-input rows.
    for z_val, outputs in zip(z.tolist(), zip(*columns)):
        cells = "".join(f" {val:>{width}.4f}" for val in outputs)
        print(f"{z_val:>8.2f}{cells}")


compare_activations()

# Key differences at z = -1:
# ReLU:     0.0000  (dead if z always negative)
# Leaky:   -0.0100  (small slope keeps a non-zero gradient)
# ELU:     -0.6321  (smooth, continuous at 0)
# GELU:    -0.1587  (smooth, probabilistic gating; the tanh approximation gives -0.1588)
# SiLU:    -0.2689  (smooth, self-gated)

ELU: Exponential Linear Unit

Python

import torch
import torch.nn as nn

# ELU(z) = z if z > 0 else α(e^z - 1)
# Properties:
#   - Smooth at z=0 when α = 1 (continuous first derivative, unlike ReLU)
#   - Negative output for z < 0 → mean activation is pulled towards zero
#     (closer to zero than ReLU's, though not exactly zero)
#   - Never exactly dead: gradient = α·e^z > 0 for z < 0

elu = nn.ELU(alpha=1.0)
z = torch.tensor([-3.0, -1.0, -0.1, 0.0, 0.1, 1.0, 3.0], requires_grad=True)
out = elu(z)
# Summing gives a scalar to differentiate; z.grad then holds dELU/dz per input.
out.sum().backward()

# Position of z = -1 in the tensor above. Output and gradient are read from
# the forward/backward pass so the printed gradient is what autograd computed
# (it matches the closed form α·e^z).
neg_one = 1

# One shared sample so the ELU and ReLU means are directly comparable.
sample = torch.randn(10000)

print("ELU properties:")
print(f"  At z=-1: output={out[neg_one].item():.4f}")
print(f"  Gradient at z=-1: {z.grad[neg_one].item():.4f}")
# For N(0,1) inputs the ELU mean is about 0.16 versus about 0.40 for ReLU:
# nearer zero, but not zero, so the label says "mean activation".
print(
    f"  Mean activation for N(0,1) inputs: "
    f"ELU={elu(sample).mean().item():.4f}, "
    f"ReLU={torch.relu(sample).mean().item():.4f}"
)

# When to use ELU:
# - When dead neurons are a problem and Leaky ReLU isn't sufficient
# - When you want activations whose mean is closer to zero (can improve
#   convergence)
# - Trade-off: slower than ReLU because of the exp() computation

GELU: Gaussian Error Linear Unit

Python

import torch
import torch.nn as nn
import math

# GELU(z) = z · Φ(z), where Φ is the CDF of the standard normal distribution.
# Tanh approximation: 0.5z · (1 + tanh(√(2/π) · (z + 0.044715z³)))
# Properties:
#   - Smooth (differentiable) everywhere
#   - Small negative output for z < 0 instead of ReLU's hard zero
#     (the output is 0 at z = 0)
#   - Probabilistic gating: scales the input by Φ(z) = P(X ≤ z), X ~ N(0,1)
#   - Standard in Transformers: BERT, GPT-2, GPT-3, RoBERTa
#     (T5 v1.1 uses a gated-GELU feed-forward; the original T5 used ReLU)

# With default arguments nn.GELU evaluates the exact (erf-based) form.
gelu = nn.GELU()


def gelu_approx(z: torch.Tensor) -> torch.Tensor:
    """Return the tanh approximation of GELU (the form used in original BERT).

    Computes 0.5 · z · (1 + tanh(√(2/π) · (z + 0.044715 · z³))) element-wise.
    """
    cubic = z + 0.044715 * z**3
    gate = 1 + torch.tanh(math.sqrt(2/math.pi) * cubic)
    return 0.5 * z * gate


z_test = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
gelu_exact = gelu(z_test)
gelu_manual = gelu_approx(z_test)

# Side-by-side comparison: the two forms agree to about three decimals here.
print("GELU (PyTorch) vs GELU approximation:")
comparison = zip(z_test.tolist(), gelu_exact.tolist(), gelu_manual.tolist())
for z_val, exact_val, approx_val in comparison:
    print(f"  z={z_val:>5.1f}: exact={exact_val:.4f}, approx={approx_val:.4f}")

# Use GELU for:
# - Transformer FFN blocks (BERT, GPT, ViT)
# - Any architecture where a smooth activation improves performance

SiLU (Swish)

Python

import torch
import torch.nn as nn

# SiLU(z) = z · σ(z) = z / (1 + e^{-z})
# Properties:
#   - Self-gated: the input is scaled by its own sigmoid
#   - Smooth and non-monotonic (dips to a minimum near z ≈ -1.28)
#   - Used in: EfficientNet, MobileNetV3 (as the hard-swish approximation),
#     many modern CNNs
#   - Reported to outperform ReLU on many vision tasks

silu = nn.SiLU()
z = torch.tensor([-3.0, -1.0, 0.0, 1.0, 3.0])

print("SiLU values:")
for point in z.unbind():
    z_val = point.item()
    module_out = silu(point).item()
    # Hand-computed z · sigmoid(z) as a cross-check on the module's output.
    by_hand = z_val * torch.sigmoid(point).item()
    print(f"  z={z_val:>5.1f}: SiLU={module_out:.4f}, manual={by_hand:.4f}")

Practical Decision Guide

Python

import torch.nn as nn

def choose_activation(
    architecture: str,
    dataset_size: int,
    dead_neuron_problem: bool = False,
) -> nn.Module:
    """Return a freshly constructed activation module suited to the context.

    Args:
        architecture: Architecture family, e.g. "transformer",
            "efficientnet_style", "mlp" or "resnet". Any other value falls
            back to ReLU.
        dataset_size: Number of training examples. Only consulted when
            ``dead_neuron_problem`` is true.
        dead_neuron_problem: Whether dead ReLU units have been observed.

    Returns:
        GELU for transformers and SiLU for EfficientNet-style models,
        regardless of the other arguments. Otherwise, if dead neurons are a
        problem, ELU below 10,000 examples and LeakyReLU(0.01) from 10,000
        up. ReLU in every remaining case.
    """
    # Architectures with an established convention take priority over the
    # dead-neuron heuristics below.
    conventions = (
        ("transformer", nn.GELU),          # standard for attention-based models
        ("efficientnet_style", nn.SiLU),   # smooth gating, good for CNNs
    )
    for arch_name, factory in conventions:
        if architecture == arch_name:
            return factory()

    if dead_neuron_problem:
        # Small datasets get the smooth, non-dying ELU; larger ones the
        # simpler LeakyReLU fix.
        return nn.ELU() if dataset_size < 10_000 else nn.LeakyReLU(0.01)

    # "mlp", "resnet" and anything unrecognised: fast, reliable default.
    return nn.ReLU()

# Demo: show the activation picked for a few (architecture, dataset size)
# pairs, leaving dead_neuron_problem at its default.
demo_cases = [
    ("mlp", 10000),
    ("transformer", 100000),
    ("efficientnet_style", 50000),
]
for arch, n in demo_cases:
    act = choose_activation(arch, n)
    chosen_name = type(act).__name__
    print(f"{arch:20s} (n={n:>7,}): {chosen_name}")

Interview Answer

"ReLU variants solve the dead neuron problem: neurons that output 0 for all training inputs receive zero gradient and never update. Leaky ReLU uses a small negative slope (0.01) for z ≤ 0, providing a tiny gradient that keeps neurons alive. ELU uses an exponential for z < 0, producing smooth activations whose mean is closer to zero, which can improve convergence — at the cost of an extra exp() computation. GELU, the standard for Transformers (BERT, GPT), weights the input z by Φ(z), the probability that a standard normal variable falls below z, producing a smooth probabilistic gating effect. SiLU (Swish), z·σ(z), is used in EfficientNet and modern CNNs. Practical recommendation: ReLU with Kaiming init for MLP and ResNet; GELU for any Transformer architecture; LeakyReLU if dead neurons are detected (monitor via the fraction of neurons outputting 0 for a test batch)."

ReLU and the Dying ReLU Problem

Next Lesson

Softmax for Multi-Class Output