Deep Learning for AI Interviews · Lesson 37 of 56

Sigmoid Activation: Use Case and Problems

The Sigmoid Function

σ(z) = 1 / (1 + e^{-z})

Range: (0, 1) — maps any real number to a probability-like value
Output at z=0: 0.5
Output at z→+∞: → 1
Output at z→-∞: → 0

Gradient: σ'(z) = σ(z) · (1 - σ(z))
Max gradient: 0.25 (at z = 0)
Gradient approaches 0 at ±∞ (saturation)

Sigmoid Properties

Python

import torch
import numpy as np

# Sigmoid values at key points
z_vals = torch.tensor([-5.0, -2.0, -1.0, 0.0, 1.0, 2.0, 5.0])
sigma   = torch.sigmoid(z_vals)
grad    = sigma * (1 - sigma)   # analytical gradient: σ'(z) = σ(z) · (1 - σ(z))

# The label is bound to a name first: a backslash-escaped quote inside an
# f-string replacement field is a SyntaxError on Python versions before 3.12.
grad_label = "σ'(z)"
print(f"{'z':>8} {'σ(z)':>10} {grad_label:>10}")
for z, s, g in zip(z_vals, sigma, grad):
    print(f"{z.item():>8.1f} {s.item():>10.4f} {g.item():>10.4f}")

# Key observations:
# z =-5: σ ≈ 0.007, grad ≈ 0.007 (nearly zero)
# z = 0: σ = 0.500, grad = 0.250 (maximum)
# z =+5: σ ≈ 0.993, grad ≈ 0.007 (nearly zero)

# This saturation at extremes = vanishing gradient problem in hidden layers

When to Use Sigmoid

Python

import torch
import torch.nn as nn

# ── USE CASE 1: Binary classification output ──
# Sigmoid maps logit to P(y=1|x)

class BinaryClassifier(nn.Module):
    """Small MLP that outputs one raw logit per example.

    The network is Linear(n_features, 64) -> ReLU -> Linear(64, 32) -> ReLU
    -> Linear(32, 1). No sigmoid is part of the module itself; it is applied
    only in ``predict_proba``.
    """

    def __init__(self, n_features: int):
        super().__init__()
        layers = [
            nn.Linear(n_features, 64),
            nn.ReLU(),  # hidden layers use ReLU rather than sigmoid
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),  # final layer emits the raw logit
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw logits for ``x``; sigmoid is deliberately not applied."""
        logits = self.net(x)
        return logits

    def predict_proba(self, x: torch.Tensor) -> torch.Tensor:
        """Return sigmoid(logits) for ``x``, computed with gradients disabled."""
        with torch.no_grad():
            return self.net(x).sigmoid()

# During training: BCEWithLogitsLoss applies sigmoid internally (more stable)
model = BinaryClassifier(n_features=20)
criterion = nn.BCEWithLogitsLoss()

X = torch.randn(32, 20)
y = torch.randint(0, 2, (32,)).float()
# squeeze(-1) removes only the trailing logit dimension: (batch, 1) -> (batch,).
# A bare squeeze() would also collapse the batch dimension when batch size is 1,
# leaving a 0-d tensor that no longer matches the shape of y.
loss = criterion(model(X).squeeze(-1), y)

# For inference
probs = model.predict_proba(X)
print(f"Probability range: [{probs.min():.3f}, {probs.max():.3f}]")
print(f"Predictions: {(probs.squeeze(-1) > 0.5).sum().item()} of {len(probs)} predicted positive")

# ── USE CASE 2: Multi-label classification output ──
# Independent sigmoid per label (not softmax — labels are not mutually exclusive)

class MultiLabelClassifier(nn.Module):
    """MLP that emits one raw logit per label.

    The network is Linear(n_features, 64) -> ReLU -> Linear(64, n_labels).
    Each label is thresholded independently after an element-wise sigmoid.
    """

    def __init__(self, n_features: int, n_labels: int):
        super().__init__()
        hidden = nn.Linear(n_features, 64)
        head = nn.Linear(64, n_labels)
        self.net = nn.Sequential(hidden, nn.ReLU(), head)

    def forward(self, x):
        """Return logits of shape (batch, n_labels)."""
        logits = self.net(x)
        return logits

    def predict(self, x, threshold=0.5):
        """Return an int tensor with 1 where sigmoid(logit) > threshold, else 0."""
        with torch.no_grad():
            label_probs = self.forward(x).sigmoid()
            return label_probs.gt(threshold).int()

When NOT to Use Sigmoid in Hidden Layers

Python

import torch
import torch.nn as nn

# BAD: sigmoid in hidden layers of a deep network
class BadDeepNet(nn.Module):
    """Anti-pattern example: a deep MLP with sigmoid after every hidden layer.

    Maps inputs with 10 features to a single raw logit. Each sigmoid has a
    derivative of at most 0.25, so the gradient reaching the first layer is
    scaled down by the product of those factors.
    """

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(10, 64), nn.Sigmoid(),   # PROBLEM: vanishing gradient
            nn.Linear(64, 64), nn.Sigmoid(),   # gradient ≤ 0.25 at each layer
            nn.Linear(64, 64), nn.Sigmoid(),   # compounded → ~0.25^3 ≈ 0.016
            nn.Linear(64, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw logits of shape (batch, 1).

        Without this method, calling the module raises NotImplementedError.
        """
        return self.net(x)

# GOOD: ReLU in hidden layers, sigmoid only at output (if needed)
class GoodDeepNet(nn.Module):
    """Recommended layout: ReLU in the hidden layers, raw logit at the output.

    Maps inputs with 10 features to a single raw logit. The sigmoid is left to
    the loss function (e.g. BCEWithLogitsLoss) rather than applied here.
    """

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(10, 64), nn.ReLU(),       # gradient = 0 or 1, no saturation
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, 1),                   # raw logit; sigmoid in loss fn
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw logits of shape (batch, 1).

        Without this method, calling the module raises NotImplementedError.
        """
        return self.net(x)

# Compare gradient norms at first layer
X = torch.randn(32, 10)
y = torch.randint(0, 2, (32,)).float()
criterion = nn.BCEWithLogitsLoss()

for name, model in [("Bad (Sigmoid hidden)", BadDeepNet()), ("Good (ReLU hidden)", GoodDeepNet())]:
    # squeeze(-1) drops only the trailing logit dimension, so the logits keep
    # shape (batch,) and match y even if the batch size were 1.
    loss = criterion(model(X).squeeze(-1), y)
    loss.backward()
    # parameters() yields the first Linear layer's weight first; next() reads
    # it without building a list of every parameter.
    first_weight_grad_norm = next(model.parameters()).grad.norm().item()
    print(f"{name:30s}: first layer grad norm = {first_weight_grad_norm:.2e}")

Sigmoid Numerical Stability

Python

import torch
import numpy as np

# Naive sigmoid implementation can overflow for very negative z
def sigmoid_naive(z: float) -> float:
    """Return 1 / (1 + exp(-z)) computed directly from the textbook formula.

    Kept deliberately naive: np.exp(-z) is evaluated as-is with no guarding
    against large magnitudes of ``z``.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

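# For z = -1000: exp(1000) overflows to inf (NumPy emits an overflow RuntimeWarning) → 1/(1+inf) = 0 (correct value, but noisy)
# For z = +1000: exp(-1000) underflows to 0 → 1/(1+0) = 1 (works)
# For z = -500 in float32: exp(500) = inf → 1/(1+inf) = 0, again with an overflow warning.
# NaN appears in the alternative form exp(z)/(1+exp(z)) for large positive z, where inf/inf = NaN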

# Numerically stable version:
def sigmoid_stable(z: float) -> float:
    """Return sigmoid(z), only ever exponentiating a non-positive number.

    For z >= 0 the form 1 / (1 + exp(-z)) is used; for z < 0 the algebraically
    equal form exp(z) / (1 + exp(z)) is used. In both branches the argument
    passed to np.exp is <= 0, so the exponential cannot overflow.
    """
    if z >= 0:
        return 1 / (1 + np.exp(-z))
    # Negative input: exp(z) lies in (0, 1), so the ratio stays finite.
    decayed = np.exp(z)
    return decayed / (1 + decayed)

# PyTorch's torch.sigmoid is already numerically stable
z_extreme = torch.tensor([-1000.0, -500.0, 0.0, 500.0, 1000.0])
print("PyTorch sigmoid on extreme values:")
print(torch.sigmoid(z_extreme))   # [0, 0, 0.5, 1, 1] — no NaN

# BCEWithLogitsLoss is more stable than applying sigmoid then BCELoss:
# Uses: -y·z + log(1 + exp(z)) = max(z, 0) - z·y + log(1 + exp(-|z|))
logits = torch.tensor([100.0, -100.0, 0.0])
labels = torch.tensor([1.0, 0.0, 0.5])

# Sigmoid then BCELoss: extreme logits saturate the probabilities to (nearly)
# 0 or 1, so the log terms lose precision. PyTorch's BCELoss clamps its log
# outputs to >= -100 to avoid inf/NaN, which hides the problem rather than
# solving it.
probs = torch.sigmoid(logits)
print(f"Probs from extreme logits: {probs}")

# BCEWithLogitsLoss: stable
# Referenced as torch.nn.* because this snippet imports only torch and numpy.
bce_with_logits = torch.nn.BCEWithLogitsLoss()
loss = bce_with_logits(logits, labels)
print(f"BCEWithLogitsLoss (stable): {loss.item():.4f}")

Interview Answer

"Sigmoid maps any real number to (0, 1): σ(z) = 1/(1+e^{-z}). Its gradient is σ(z)(1-σ(z)), with maximum 0.25 at z=0 and approaching 0 at ±∞ (saturation). Use sigmoid only at the output layer for binary classification (to get a probability) or multi-label classification (independent probability per label). Never use sigmoid in hidden layers of deep networks — the maximum gradient of 0.25 causes vanishing gradients: in a 4-layer sigmoid network the activations alone scale gradients by at most 0.25^4 ≈ 0.004, making early layers effectively untrainable. In practice: use ReLU (or variants) in hidden layers and let BCEWithLogitsLoss handle the sigmoid at the output — this is more numerically stable than applying sigmoid yourself because BCEWithLogitsLoss uses a log-sum-exp trick to avoid overflow. One exception: sigmoid can appear in gates within LSTM cells, where it controls information flow rather than serving as a hidden-layer activation."

Exploding Gradients and Gradient Clipping

Next Lesson

ReLU and the Dying ReLU Problem