Deep Learning for AI Interviews · Lesson 23 of 56

Learning Rate Schedulers and Warm-Up

Why Schedule the Learning Rate

A fixed learning rate cannot satisfy two competing needs:

1. Early training: a large lr is needed to move quickly through parameter space
2. Late training: a large lr causes the model to overshoot the minimum
   → loss oscillates near convergence rather than settling

Solution: start with a larger lr, then decay as training progresses.

Rule of thumb: a schedule is almost always better than a fixed lr.
  The only exception: very short training runs, where the time spent tuning the schedule outweighs the benefit.

Common Schedulers

Python

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)


def _new_optimizer(lr: float = 0.1) -> optim.Optimizer:
    """Return a fresh SGD optimizer over ``model`` for a single scheduler demo.

    A scheduler rewrites its optimizer's ``param_groups[...]["lr"]`` when it
    is constructed and on every ``step()``. If all the examples below shared
    one optimizer they would overwrite each other's learning rate (OneCycleLR,
    for instance, resets it to its own starting value), so the lr values
    quoted in the comments would no longer hold. One optimizer per scheduler
    keeps every example independent.
    """
    return optim.SGD(model.parameters(), lr=lr)


# ── 1. StepLR: multiply lr by gamma every step_size epochs ──
step_scheduler = optim.lr_scheduler.StepLR(
    _new_optimizer(), step_size=30, gamma=0.1
)
# epoch 0–29: lr=0.1, epoch 30–59: lr=0.01, epoch 60–89: lr=0.001, ...

# ── 2. MultiStepLR: decay at specific milestones ──
multi_scheduler = optim.lr_scheduler.MultiStepLR(
    _new_optimizer(), milestones=[30, 60, 90], gamma=0.1
)
# Common for ResNet-style training. A milestone only takes effect if training
# runs past it: a 90-epoch run (epochs 0–89) sees the drops at 30 and 60 only.

# ── 3. ExponentialLR: multiply by gamma every epoch ──
exp_scheduler = optim.lr_scheduler.ExponentialLR(
    _new_optimizer(), gamma=0.95   # 5% decay per epoch
)

# ── 4. CosineAnnealingLR: smoothly decay from lr to eta_min ──
cosine_scheduler = optim.lr_scheduler.CosineAnnealingLR(
    _new_optimizer(), T_max=100, eta_min=1e-6
)
# Recommended default for most training runs

# ── 5. ReduceLROnPlateau: decay when metric stops improving ──
plateau_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    _new_optimizer(), mode="min", factor=0.5, patience=10, min_lr=1e-6
)
# Called with: plateau_scheduler.step(val_loss)  — after each epoch

# ── 6. OneCycleLR: warmup → peak → cosine decay (1 cycle) ──
# OneCycleLR ignores the optimizer's own lr: the schedule is derived from
# max_lr, so the lr=0.1 passed to SGD here is overwritten at construction.
one_cycle = optim.lr_scheduler.OneCycleLR(
    _new_optimizer(),
    max_lr=0.01,
    steps_per_epoch=100,   # batches per epoch
    epochs=10,
    pct_start=0.3,          # 30% of training spent warming up
)
# Call after each batch (not each epoch)

Cosine Annealing (Recommended Default)

Python

import math

def cosine_lr(
    step: int,
    total_steps: int,
    max_lr: float = 1e-3,
    min_lr: float = 1e-6,
) -> float:
    """Return the learning rate at ``step`` on a half-cosine decay curve.

    The curve starts at ``max_lr`` for ``step == 0`` and falls smoothly
    towards ``min_lr``; from ``total_steps`` onwards it stays at ``min_lr``.
    """
    if step < total_steps:
        fraction_done = step / total_steps
        # Maps fraction_done in [0, 1) onto a weight that falls from 1 to 0.
        weight = (math.cos(fraction_done * math.pi) + 1) / 2
        return min_lr + weight * (max_lr - min_lr)

    # Schedule finished: hold the floor value.
    return min_lr

# Trace the lr schedule: evaluate cosine_lr (default max_lr/min_lr) at every
# step of a 1000-step run, then print a few checkpoints from start to end.
total_steps = 1000
lrs = [cosine_lr(s, total_steps) for s in range(total_steps)]  # lrs[i] is the lr at step i
print(f"Step   0: lr = {lrs[0]:.2e}")
print(f"Step 250: lr = {lrs[250]:.2e}")
print(f"Step 500: lr = {lrs[500]:.2e}")
print(f"Step 750: lr = {lrs[750]:.2e}")
print(f"Step 999: lr = {lrs[999]:.2e}")

# PyTorch equivalent
# The optimizer's lr (1e-3), eta_min (1e-6) and T_max (1000) mirror the
# max_lr, min_lr and total_steps used in the trace above.
# NOTE(review): this only matches cosine_lr if scheduler.step() is called
# once per training step and no more than T_max times; cosine_lr clamps to
# min_lr after total_steps — confirm CosineAnnealingLR's behaviour past T_max
# against the PyTorch docs before training longer.
model = nn.Linear(10, 1)
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=1000, eta_min=1e-6)

Warmup + Cosine Decay (Transformer Standard)

Python

def warmup_cosine_schedule(
    step: int,
    warmup_steps: int,
    total_steps: int,
    max_lr: float = 3e-4,
    min_lr: float = 1e-6,
) -> float:
    """Linear warmup then cosine decay — standard for Transformers.

    Args:
        step: Current optimizer step (0-based).
        warmup_steps: Number of steps over which lr rises linearly from 0
            to ``max_lr``.
        total_steps: Step at which the cosine decay reaches ``min_lr``.
        max_lr: Peak learning rate, reached at ``step == warmup_steps``.
        min_lr: Floor learning rate, held from ``total_steps`` onwards.

    Returns:
        The learning rate to use at ``step``.
    """
    if step < warmup_steps:
        # Linear warmup from 0 to max_lr
        return max_lr * step / warmup_steps

    # Cosine decay from max_lr to min_lr.
    # max(1, ...) avoids dividing by zero when there is no decay phase.
    decay_steps = max(1, total_steps - warmup_steps)
    # Clamp at 1.0: without it the cosine keeps cycling and the lr climbs
    # back towards max_lr when training runs past total_steps.
    progress = min(1.0, (step - warmup_steps) / decay_steps)
    cosine = 0.5 * (1 + math.cos(math.pi * progress))
    return min_lr + (max_lr - min_lr) * cosine

# Demo run: 500 warmup steps out of 10000, sampled on both sides of the
# warmup/decay boundary.
warmup, total = 500, 10000
steps = [0, 100, 499, 500, 2000, 5000, 9999]
for s in steps:
    # Label the phase first, then look up the lr for that step.
    phase = "decay" if s >= warmup else "warmup"
    lr = warmup_cosine_schedule(s, warmup, total, max_lr=3e-4)
    print(f"Step {s:5d} ({phase}): lr = {lr:.2e}")

# In PyTorch using LambdaLR
import torch.optim as optim

def get_warmup_cosine_scheduler(optimizer, warmup_steps, total_steps):
    """Build a LambdaLR that applies linear warmup followed by cosine decay.

    The returned scheduler multiplies the optimizer's base lr by a factor
    that rises linearly from 0 to 1 over ``warmup_steps`` and then follows a
    half cosine down to 0 at ``total_steps``, where it stays.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        warmup_steps: Number of ``scheduler.step()`` calls spent warming up.
        total_steps: Number of ``scheduler.step()`` calls after which the
            multiplier reaches 0.

    Returns:
        An ``optim.lr_scheduler.LambdaLR`` wrapping ``optimizer``.
    """
    # max(1, ...) keeps the schedule defined when warmup covers the whole run
    # (total_steps == warmup_steps would otherwise divide by zero).
    decay_steps = max(1, total_steps - warmup_steps)

    def schedule(step):
        if step < warmup_steps:
            return step / warmup_steps
        # Clamp at 1.0 so the multiplier stays at 0 instead of rising again
        # if step() is called more than total_steps times.
        progress = min(1.0, (step - warmup_steps) / decay_steps)
        return 0.5 * (1 + math.cos(math.pi * progress))

    return optim.lr_scheduler.LambdaLR(optimizer, schedule)

Scheduler in Training Loop

Python

import torch
from torch.utils.data import DataLoader, TensorDataset

def full_training_loop(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    n_epochs: int = 50,
) -> dict:
    """Train a binary classifier with AdamW and a warmup + cosine lr schedule.

    Each epoch runs one pass over ``train_loader`` (with gradient-norm
    clipping at 1.0 and one scheduler step per batch) followed by one
    no-grad pass over ``val_loader``.

    Assumes ``model(X)`` yields one logit per sample, shape ``(batch, 1)``,
    and ``y`` is a float tensor of shape ``(batch,)`` — TODO confirm against
    the caller's dataset.

    Args:
        model: Network to train; updated in place.
        train_loader: Batches of ``(X, y)`` used for optimisation.
        val_loader: Batches of ``(X, y)`` used for validation loss only.
        n_epochs: Number of passes over ``train_loader``.

    Returns:
        Dict with per-epoch lists under ``"train_loss"``, ``"val_loss"`` and
        ``"lr"`` (the lr in effect after the epoch's last scheduler step).
    """
    optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss()

    # Linear warmup + cosine decay, measured in batches over the whole run
    total_steps = n_epochs * len(train_loader)
    scheduler = get_warmup_cosine_scheduler(
        optimizer,
        warmup_steps=total_steps // 10,  # 10% warmup
        total_steps=total_steps,
    )

    history = {"train_loss": [], "val_loss": [], "lr": []}

    for epoch in range(n_epochs):
        # ── Training ──
        model.train()
        epoch_loss = 0.0
        for X, y in train_loader:
            optimizer.zero_grad()
            # squeeze(-1) drops only the trailing logit dim. A bare squeeze()
            # would also drop the batch dim when a batch holds one sample,
            # and the loss would then reject the shape mismatch with y.
            loss = criterion(model(X).squeeze(-1), y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # Step per BATCH: total_steps above is counted in batches
            scheduler.step()
            epoch_loss += loss.item()

        train_loss = epoch_loss / len(train_loader)
        current_lr = optimizer.param_groups[0]["lr"]

        # ── Validation ──
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X, y in val_loader:
                val_loss += criterion(model(X).squeeze(-1), y).item()
        val_loss /= len(val_loader)

        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["lr"].append(current_lr)

        if epoch % 10 == 0:
            print(f"Epoch {epoch:3d}: train={train_loss:.4f}, val={val_loss:.4f}, lr={current_lr:.2e}")

    return history

Scheduler Comparison

Scheduler        | When to use                        | Key parameter
-----------------|------------------------------------|-----------------
StepLR           | Simple baselines, ResNet           | step_size, gamma
MultiStepLR      | Known decay schedule (ResNet)      | milestones, gamma
CosineAnnealingLR| Most tasks, safe default           | T_max (total step() calls)
ReduceLROnPlateau| Unsure of epoch budget             | patience, factor
OneCycleLR       | Super-convergence, fast training   | max_lr, pct_start
WarmupCosine     | Transformers, LLMs                 | warmup_steps
ExponentialLR    | Continuous gentle decay            | gamma (≈0.95–0.99)

Interview Answer

"Learning rate schedulers decay the step size over training — starting large to move through parameter space quickly, then reducing to converge precisely. The most common scheduler for general use is cosine annealing: lr decreases smoothly from max to min following a cosine curve, avoiding abrupt drops. For Transformers, the standard is linear warmup + cosine decay: the warmup (typically the first 5–10% of training) keeps the first updates small while the weights are still random, which prevents early gradient explosion. ReduceLROnPlateau is adaptive — it multiplies the lr by a factor (for example, halves it) when validation loss stops improving for N epochs — good when the total number of training epochs is unknown. In PyTorch, how often you call scheduler.step() must match the unit the schedule was defined in: schedulers configured in optimizer steps (OneCycleLR, or LambdaLR/CosineAnnealingLR given a total step count) are stepped after every batch; schedulers configured in epochs (StepLR, MultiStepLR, or CosineAnnealingLR with T_max in epochs) are stepped once per epoch; ReduceLROnPlateau is stepped after every epoch with the monitored metric. Always log the current learning rate to detect schedule bugs early."

Learning Rate: Biggest Hyperparameter in DL

Next Lesson

Interview: Which Optimizer Would You Choose?