Learning Rate Schedulers

Why Schedule the Learning Rate

A fixed learning rate has two problems:

1. Early training: need a large lr to move quickly through parameter space
2. Late training: large lr causes the model to overshoot the minimum
   → loss oscillates near convergence rather than settling

Solution: start with a larger lr, then decay as training progresses.

Rule of thumb: a schedule is almost always better than a fixed lr.
  The only exception: very short training runs where the time spent tuning the schedule outweighs the benefit.

Common Schedulers

Python

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

model = nn.Linear(10, 1)


def make_optimizer():
    """Return a fresh SGD optimizer (lr=0.1) over the demo model's parameters."""
    return optim.SGD(model.parameters(), lr=0.1)


# A scheduler starts managing its optimizer's lr as soon as it is constructed
# (OneCycleLR, for instance, resets lr to max_lr / div_factor and, by default,
# also cycles momentum). The schedulers below are ALTERNATIVES, not a stack,
# so each example gets its own optimizer. In real training, attach exactly one
# scheduler to your optimizer unless you chain them deliberately.
optimizer = make_optimizer()

# ── 1. StepLR: multiply lr by gamma every step_size epochs ──
step_scheduler = optim.lr_scheduler.StepLR(
    optimizer, step_size=30, gamma=0.1
)
# epoch 0–29: lr=0.1, epoch 30–59: lr=0.01, epoch 60+: lr=0.001

# ── 2. MultiStepLR: decay at specific milestones ──
multi_scheduler = optim.lr_scheduler.MultiStepLR(
    make_optimizer(), milestones=[30, 60, 90], gamma=0.1
)
# Common for ResNet-style training: drops at epochs 30/60/90.
# Note: a milestone only takes effect if training runs PAST it, so the drop
# at 90 is a no-op for a run that stops after exactly 90 epochs.

# ── 3. ExponentialLR: multiply by gamma every epoch ──
exp_scheduler = optim.lr_scheduler.ExponentialLR(
    make_optimizer(), gamma=0.95   # 5% decay per epoch
)

# ── 4. CosineAnnealingLR: smoothly decay from lr to eta_min ──
cosine_scheduler = optim.lr_scheduler.CosineAnnealingLR(
    make_optimizer(), T_max=100, eta_min=1e-6
)
# Recommended default for most training runs.
# T_max is counted in scheduler.step() calls (epochs if stepped per epoch).

# ── 5. ReduceLROnPlateau: decay when metric stops improving ──
plateau_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    make_optimizer(), mode="min", factor=0.5, patience=10, min_lr=1e-6
)
# Called with: plateau_scheduler.step(val_loss)  — after each epoch

# ── 6. OneCycleLR: warmup → peak → cosine decay (1 cycle) ──
one_cycle = optim.lr_scheduler.OneCycleLR(
    make_optimizer(),
    max_lr=0.01,
    steps_per_epoch=100,   # batches per epoch
    epochs=10,
    pct_start=0.3,          # 30% of training spent warming up
)
# Call after each batch (not each epoch): the schedule spans
# steps_per_epoch * epochs = 1000 scheduler steps in total.

Cosine Annealing (Recommended Default)

Python

import math

def cosine_lr(
    step: int,
    total_steps: int,
    max_lr: float = 1e-3,
    min_lr: float = 1e-6,
) -> float:
    """Return the cosine-annealed learning rate at ``step``.

    The rate follows half a cosine period: it starts at ``max_lr`` for
    step 0 and approaches ``min_lr`` as ``step`` nears ``total_steps``.
    For any ``step >= total_steps`` the rate is exactly ``min_lr``.
    """
    if step < total_steps:
        # Fraction of the schedule already consumed, in [0, 1) for step >= 0.
        fraction_done = step / total_steps
        # Map that fraction onto the cosine curve: 1.0 at the start, → 0.0 at the end.
        weight = 0.5 * (1 + math.cos(math.pi * fraction_done))
        return min_lr + (max_lr - min_lr) * weight
    return min_lr

# Evaluate the schedule at every step, then print a few checkpoints
total_steps = 1000
lrs = [cosine_lr(step, total_steps) for step in range(total_steps)]
for checkpoint in (0, 250, 500, 750, 999):
    print(f"Step {checkpoint:3d}: lr = {lrs[checkpoint]:.2e}")

# PyTorch equivalent: built-in scheduler configured with the same peak lr,
# horizon (T_max) and floor (eta_min) as the hand-written function above
model = nn.Linear(10, 1)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=1000,
    eta_min=1e-6,
)

Warmup + Cosine Decay (Transformer Standard)

Python

def warmup_cosine_schedule(
    step: int,
    warmup_steps: int,
    total_steps: int,
    max_lr: float = 3e-4,
    min_lr: float = 1e-6,
) -> float:
    """Linear warmup then cosine decay — standard for Transformers.

    Args:
        step: Current training step (0-based).
        warmup_steps: Number of steps over which lr ramps linearly from 0
            up to ``max_lr``.
        total_steps: Step at which the cosine decay reaches ``min_lr``.
        max_lr: Peak learning rate, reached at ``step == warmup_steps``.
        min_lr: Floor learning rate, held for every ``step >= total_steps``.

    Returns:
        The learning rate to use at ``step``.
    """
    if step < warmup_steps:
        # Linear warmup from 0 to max_lr
        return max_lr * step / warmup_steps

    # Cosine decay from max_lr to min_lr.
    # max(1, ...) avoids dividing by zero when warmup covers the whole run.
    decay_steps = max(1, total_steps - warmup_steps)
    # Clamp at 1.0: without it, progress > 1 walks further along the cosine
    # and the lr climbs back toward max_lr once training passes total_steps.
    progress = min(1.0, (step - warmup_steps) / decay_steps)
    cosine = 0.5 * (1 + math.cos(math.pi * progress))
    return min_lr + (max_lr - min_lr) * cosine

# Example: 500 warmup steps, 10000 total — probe both phases and the boundary between them
warmup, total = 500, 10000
steps = [0, 100, 499, 500, 2000, 5000, 9999]
for s in steps:
    # Label the phase so the printed trace shows where warmup hands over to decay
    if s < warmup:
        phase = "warmup"
    else:
        phase = "decay"
    lr = warmup_cosine_schedule(s, warmup, total, max_lr=3e-4)
    print(f"Step {s:5d} ({phase}): lr = {lr:.2e}")

# In PyTorch using LambdaLR
import torch.optim as optim

def get_warmup_cosine_scheduler(optimizer, warmup_steps, total_steps):
    """Build a LambdaLR that applies linear warmup followed by cosine decay.

    LambdaLR multiplies each param group's initial lr by the value returned
    from ``schedule``, so the function below returns a multiplier, not an lr:
    it ramps 0 → 1 over ``warmup_steps``, then follows a cosine from 1 down
    to 0 at ``total_steps`` and stays at 0 afterwards.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        warmup_steps: Number of scheduler steps spent in linear warmup.
        total_steps: Scheduler step at which the decay reaches 0.

    Returns:
        A ``torch.optim.lr_scheduler.LambdaLR``; call ``.step()`` once per
        training step (batch), since both arguments are counted in steps.
    """
    # max(1, ...) avoids ZeroDivisionError when warmup_steps == total_steps.
    decay_steps = max(1, total_steps - warmup_steps)

    def schedule(step):
        if step < warmup_steps:
            return step / warmup_steps
        # Clamp at 1.0 so the multiplier stays at 0 past total_steps instead
        # of riding the cosine back up toward 1.
        progress = min(1.0, (step - warmup_steps) / decay_steps)
        return 0.5 * (1 + math.cos(math.pi * progress))

    return optim.lr_scheduler.LambdaLR(optimizer, schedule)

Scheduler in Training Loop

Python

import torch
from torch.utils.data import DataLoader, TensorDataset

def full_training_loop(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    n_epochs: int = 50,
) -> dict:
    """Train ``model`` with AdamW and a per-batch warmup + cosine lr schedule.

    Each epoch runs one pass over ``train_loader`` (with gradient-norm
    clipping at 1.0 and one scheduler step per batch), then one pass over
    ``val_loader`` without gradients. Progress is printed every 10 epochs.

    Args:
        model: Network producing one logit per sample; a trailing dimension
            of size 1 on its output is squeezed away before the loss.
        train_loader: Yields ``(X, y)`` training batches. ``y`` must match the
            squeezed output shape, as BCEWithLogitsLoss requires.
        val_loader: Yields ``(X, y)`` validation batches, same layout.
        n_epochs: Number of passes over ``train_loader``.

    Returns:
        Dict with per-epoch lists under ``"train_loss"``, ``"val_loss"`` and
        ``"lr"`` (the lr in effect after the epoch's last scheduler step).
    """
    optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss()

    # Linear warmup + cosine decay, measured in optimizer steps (batches),
    # spanning the whole run
    total_steps = n_epochs * len(train_loader)
    scheduler = get_warmup_cosine_scheduler(
        optimizer,
        warmup_steps=total_steps // 10,  # 10% warmup
        total_steps=total_steps,
    )

    history = {"train_loss": [], "val_loss": [], "lr": []}

    for epoch in range(n_epochs):
        # ── Training ──
        model.train()
        epoch_loss = 0.0
        for X, y in train_loader:
            optimizer.zero_grad()
            # squeeze(-1) drops only the trailing logit dimension. A bare
            # squeeze() would also drop the batch dimension when a batch
            # holds a single sample, and the loss would then fail on the
            # shape mismatch with y.
            loss = criterion(model(X).squeeze(-1), y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()   # step per BATCH: the schedule is defined in steps
            epoch_loss += loss.item()

        train_loss = epoch_loss / len(train_loader)
        current_lr = optimizer.param_groups[0]["lr"]

        # ── Validation ──
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X, y in val_loader:
                val_loss += criterion(model(X).squeeze(-1), y).item()
        val_loss /= len(val_loader)

        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["lr"].append(current_lr)

        if epoch % 10 == 0:
            print(f"Epoch {epoch:3d}: train={train_loss:.4f}, val={val_loss:.4f}, lr={current_lr:.2e}")

    return history

Scheduler Comparison

Scheduler        | When to use                        | Key parameter
-----------------|------------------------------------|-----------------
StepLR           | Simple baselines, ResNet           | step_size, gamma
MultiStepLR      | Known decay schedule (ResNet)      | milestones, gamma
CosineAnnealingLR| Most tasks, safe default           | T_max (total epochs or steps)
ReduceLROnPlateau| Unsure of epoch budget             | patience, factor
OneCycleLR       | Super-convergence, fast training   | max_lr, pct_start
WarmupCosine     | Transformers, LLMs                 | warmup_steps
ExponentialLR    | Continuous gentle decay            | gamma (≈0.95–0.99)

Interview Answer

"Learning rate schedulers decay the step size over training — starting large to move through parameter space quickly, then reducing to converge precisely. The most common scheduler for general use is cosine annealing: lr decreases smoothly from max to min following a cosine curve, avoiding abrupt drops. For Transformers, the standard is linear warmup + cosine decay: the warmup (typically first 5–10% of training) prevents early gradient explosion when weights are random. ReduceLROnPlateau is adaptive — it halves the lr when validation loss stops improving for N epochs — good when total training epochs are unknown. In PyTorch, step-based schedulers (LambdaLR, CosineAnnealingLR) call scheduler.step() after every batch; epoch-based schedulers (ReduceLROnPlateau) call it after every epoch with the monitored metric. Always log the current learning rate to detect schedule bugs early."

Learning Rate Schedulers

Why Schedule the Learning Rate

Common Schedulers

Cosine Annealing (Recommended Default)

Warmup + Cosine Decay (Transformer Standard)

Scheduler in Training Loop

Scheduler Comparison

Interview Answer

Enjoyed this article?

Leave a comment