Deep Learning for AI Interviews · Lesson 37 of 56
Sigmoid Activation: Use Case and Problems
The Sigmoid Function
σ(z) = 1 / (1 + e^{-z})
Range: (0, 1) — maps any real number to a probability-like value
Output at z=0: 0.5
Output at z→+∞: → 1
Output at z→-∞: → 0
Gradient: σ'(z) = σ(z) · (1 - σ(z))
Max gradient: 0.25 (at z = 0)
Gradient approaches 0 at ±∞ (saturation)
Sigmoid Properties
import torch
import numpy as np
# Sigmoid values at key points
z_vals = torch.tensor([-5.0, -2.0, -1.0, 0.0, 1.0, 2.0, 5.0])
sigma = torch.sigmoid(z_vals)
grad = sigma * (1 - sigma)  # analytical gradient: σ'(z) = σ(z)·(1 - σ(z))

# The header label is built outside the f-string: a backslash inside an
# f-string replacement field is a SyntaxError before Python 3.12.
grad_label = "σ'(z)"
print(f"{'z':>8} {'σ(z)':>10} {grad_label:>10}")
for z, s, g in zip(z_vals, sigma, grad):
    print(f"{z.item():>8.1f} {s.item():>10.4f} {g.item():>10.4f}")
# Key observations:
# z =-5: σ ≈ 0.007, grad ≈ 0.007 (nearly zero)
# z = 0: σ = 0.500, grad = 0.250 (maximum)
# z =+5: σ ≈ 0.993, grad ≈ 0.007 (nearly zero)
# This saturation at extremes = vanishing gradient problem in hidden layers
When to Use Sigmoid
import torch
import torch.nn as nn
# ── USE CASE 1: Binary classification output ──
# Sigmoid maps logit to P(y=1|x)
class BinaryClassifier(nn.Module):
    """Feed-forward binary classifier that emits one raw logit per sample.

    Hidden layers use ReLU. ``forward`` deliberately applies no sigmoid so its
    output can be fed straight into ``nn.BCEWithLogitsLoss``; call
    ``predict_proba`` when probabilities are needed.
    """

    def __init__(self, n_features: int):
        super().__init__()
        layers = [
            nn.Linear(n_features, 64),
            nn.ReLU(),  # ReLU in HIDDEN layers
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),  # raw logit
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw logits with a trailing axis of size 1 (no sigmoid here)."""
        logits = self.net(x)
        return logits

    def predict_proba(self, x: torch.Tensor) -> torch.Tensor:
        """Return P(y=1|x): sigmoid of the logits, computed without gradient tracking."""
        with torch.no_grad():
            return self.net(x).sigmoid()
# During training: BCEWithLogitsLoss applies sigmoid internally (more stable)
model = BinaryClassifier(n_features=20)
criterion = nn.BCEWithLogitsLoss()
X = torch.randn(32, 20)
y = torch.randint(0, 2, (32,)).float()
# squeeze(-1) removes only the trailing logit axis: (batch, 1) -> (batch,).
# A bare squeeze() would also remove the batch axis when batch == 1, and the
# result would no longer match the shape of y.
loss = criterion(model(X).squeeze(-1), y)

# For inference
probs = model.predict_proba(X)
print(f"Probability range: [{probs.min():.3f}, {probs.max():.3f}]")
n_positive = (probs.squeeze(-1) > 0.5).sum().item()
print(f"Predictions: {n_positive} of {len(probs)} predicted positive")
# ── USE CASE 2: Multi-label classification output ──
# Independent sigmoid per label (not softmax — labels are not mutually exclusive)
class MultiLabelClassifier(nn.Module):
    """Multi-label classifier that produces one independent logit per label."""

    def __init__(self, n_features: int, n_labels: int):
        super().__init__()
        hidden = nn.Linear(n_features, 64)
        head = nn.Linear(64, n_labels)
        self.net = nn.Sequential(hidden, nn.ReLU(), head)

    def forward(self, x):
        """Return raw logits, one column per label: (batch, n_labels)."""
        return self.net(x)

    def predict(self, x, threshold=0.5):
        """Return a 0/1 int tensor marking labels whose sigmoid probability exceeds ``threshold``."""
        with torch.no_grad():
            label_probs = self.forward(x).sigmoid()
        return label_probs.gt(threshold).int()


# ── When NOT to Use Sigmoid in Hidden Layers ──
import torch
import torch.nn as nn
# BAD: sigmoid in hidden layers of a deep network
class BadDeepNet(nn.Module):
    """Anti-pattern demo: sigmoid activations in every hidden layer.

    Each sigmoid contributes a derivative factor of at most 0.25 during
    backpropagation, so the gradient reaching the first layer shrinks
    multiplicatively with depth (vanishing gradient).
    """

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(10, 64), nn.Sigmoid(),  # PROBLEM: vanishing gradient
            nn.Linear(64, 64), nn.Sigmoid(),  # gradient ≤ 0.25 at each layer
            nn.Linear(64, 64), nn.Sigmoid(),  # compounded → ≤ 0.25^3 ≈ 0.016
            nn.Linear(64, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw logits of shape ``(..., 1)`` for inputs of shape ``(..., 10)``.

        Defined explicitly because ``nn.Module``'s default ``forward`` raises
        ``NotImplementedError``, which would make ``model(x)`` unusable.
        """
        return self.net(x)
# GOOD: ReLU in hidden layers, sigmoid only at output (if needed)
class GoodDeepNet(nn.Module):
    """Recommended layout: ReLU in hidden layers, raw logit at the output.

    No sigmoid is applied inside the network; it is left to the loss function
    (e.g. ``nn.BCEWithLogitsLoss``).
    """

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(10, 64), nn.ReLU(),  # derivative is 0 or 1; no saturation for positive inputs
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, 1),  # raw logit; sigmoid in loss fn
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw logits of shape ``(..., 1)`` for inputs of shape ``(..., 10)``.

        Defined explicitly because ``nn.Module``'s default ``forward`` raises
        ``NotImplementedError``, which would make ``model(x)`` unusable.
        """
        return self.net(x)
# Compare gradient norms at first layer
X = torch.randn(32, 10)
y = torch.randint(0, 2, (32,)).float()
criterion = nn.BCEWithLogitsLoss()
for name, model in [("Bad (Sigmoid hidden)", BadDeepNet()), ("Good (ReLU hidden)", GoodDeepNet())]:
    # squeeze(-1) removes only the trailing logit axis, so the batch axis
    # survives even for a batch of one and the shape keeps matching y.
    loss = criterion(model(X).squeeze(-1), y)
    loss.backward()
    # parameters() yields the first Linear layer's weight first; next() reads
    # it without materialising the whole parameter list.
    first_weight_grad_norm = next(model.parameters()).grad.norm().item()
    print(f"{name:30s}: first layer grad norm = {first_weight_grad_norm:.2e}")


# ── Sigmoid Numerical Stability ──
import torch
import numpy as np
# Naive sigmoid implementation can overflow for very negative z
def sigmoid_naive(z: float) -> float:
    """Textbook sigmoid 1 / (1 + e^{-z}), evaluated directly.

    Kept deliberately naive: for very negative ``z`` the term ``exp(-z)``
    overflows to ``inf`` (the quotient still comes out as 0).
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator
# For z = -1000: exp(1000) overflows to inf → 1/(1+inf) = 0 (actually works)
# For z = +1000: exp(-1000) underflows to 0 → 1/(1+0) = 1 (works)
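# For z = -500 in float32: exp(500) = inf → 1/(1+inf) = 0 (no NaN, but NumPy emits an overflow RuntimeWarning)
# NaN appears in the algebraically equivalent form exp(z)/(1+exp(z)): for large positive z it evaluates inf/inf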
# Numerically stable version:
def sigmoid_stable(z: float) -> float:
    """Sigmoid of a scalar, evaluated without exponentiating a positive number.

    Only ``exp(-|z|)`` is computed; it never exceeds 1, so it cannot overflow.
    For ``z >= 0`` the result is ``1 / (1 + exp(-z))``; for ``z < 0`` it is the
    equivalent ``exp(z) / (1 + exp(z))``.
    """
    decay = np.exp(-abs(z))  # exp(-z) when z >= 0, exp(z) when z < 0
    numerator = 1 if z >= 0 else decay
    return numerator / (1 + decay)
# PyTorch's torch.sigmoid is already numerically stable
z_extreme = torch.tensor([-1000.0, -500.0, 0.0, 500.0, 1000.0])
extreme_probs = torch.sigmoid(z_extreme)
print("PyTorch sigmoid on extreme values:")
print(extreme_probs)  # [0, 0, 0.5, 1, 1] — no NaN

# BCEWithLogitsLoss is more stable than applying sigmoid then BCELoss:
# Uses: -y·z + log(1 + exp(z)) = max(z, 0) - z·y + log(1 + exp(-|z|))
logits = torch.tensor([100.0, -100.0, 0.0])
labels = torch.tensor([1.0, 0.0, 0.5])

# Sigmoid first: sigmoid(100) rounds to exactly 1.0 in float32, so log(1 - p)
# is -inf and a hand-written BCE evaluates 0 · (-inf) = NaN. nn.BCELoss avoids
# the NaN only by clamping its log outputs to >= -100, which costs precision.
probs = torch.sigmoid(logits)
print(f"Probs from extreme logits: {probs}")

# BCEWithLogitsLoss: stable
# Referenced through torch.nn so this snippet runs with only `import torch`.
bce_with_logits = torch.nn.BCEWithLogitsLoss()
loss = bce_with_logits(logits, labels)
print(f"BCEWithLogitsLoss (stable): {loss.item():.4f}")


# ── Interview Answer ──
"Sigmoid maps any real number to (0, 1): σ(z) = 1 / (1 + e^{-z}). Its gradient is σ(z)(1-σ(z)), with maximum 0.25 at z=0 and approaching 0 at ±∞ (saturation). Use sigmoid only at the output layer for binary classification (to get a probability) or multi-label classification (independent probability per label). Never use sigmoid in hidden layers of deep networks — the maximum gradient of 0.25 causes vanishing gradients: a 4-layer sigmoid network attenuates gradients by 0.25^4 ≈ 0.004, making early layers untrainable. In practice: use ReLU (or variants) in hidden layers and let BCEWithLogitsLoss handle the sigmoid at the output — this is more numerically stable than applying sigmoid yourself because BCEWithLogitsLoss uses a log-sum-exp trick to avoid overflow. One exception: sigmoid can appear in gates within LSTM cells, where it controls information flow rather than serving as a hidden-layer activation."