Sigmoid Activation
The sigmoid function — its formula, gradient, saturation problem, and when to use it (output layer for binary classification) vs when to avoid it (hidden layers).
The Sigmoid Function
σ(z) = 1 / (1 + e^{-z})
Range: (0, 1) — maps any real number to a probability-like value
Output at z=0: 0.5
Output at z→+∞: → 1
Output at z→-∞: → 0
Gradient: σ'(z) = σ(z) · (1 - σ(z))
Max gradient: 0.25 (at z = 0)
Gradient approaches 0 at ±∞ (saturation)
Sigmoid Properties
import torch
import numpy as np
# Sigmoid values at key points
z_vals = torch.tensor([-5.0, -2.0, -1.0, 0.0, 1.0, 2.0, 5.0])
sigma = torch.sigmoid(z_vals)
grad = sigma * (1 - sigma)  # analytical gradient: σ'(z) = σ(z)·(1 - σ(z))

# The column labels are bound to names first: a backslash-escaped quote inside
# an f-string replacement field is a SyntaxError before Python 3.12.
sigma_label = "σ(z)"
grad_label = "σ'(z)"
print(f"{'z':>8} {sigma_label:>10} {grad_label:>10}")
for z, s, g in zip(z_vals, sigma, grad):
    print(f"{z.item():>8.1f} {s.item():>10.4f} {g.item():>10.4f}")

# Key observations:
# z =-5: σ ≈ 0.007, grad ≈ 0.007 (nearly zero)
# z = 0: σ = 0.500, grad = 0.250 (maximum)
# z =+5: σ ≈ 0.993, grad ≈ 0.007 (nearly zero)
# This saturation at extremes = vanishing gradient problem in hidden layers


# When to Use Sigmoid
import torch
import torch.nn as nn
# ── USE CASE 1: Binary classification output ──
# Sigmoid maps the output logit to P(y=1|x)
class BinaryClassifier(nn.Module):
    """Small MLP for binary classification that outputs a raw logit.

    Hidden layers use ReLU; the final ``nn.Linear(32, 1)`` emits one
    un-squashed logit per sample. Sigmoid is applied only in
    ``predict_proba``; during training the logit is meant to be fed to a
    logit-based loss such as ``nn.BCEWithLogitsLoss``.

    Args:
        n_features: Size of the last dimension of the input tensor.
    """

    def __init__(self, n_features: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, 64),
            nn.ReLU(),  # ReLU in HIDDEN layers
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),  # raw logit
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw logits with a trailing dimension of 1 (no sigmoid)."""
        return self.net(x)  # do NOT apply sigmoid here

    def predict_proba(self, x: torch.Tensor) -> torch.Tensor:
        """Return sigmoid(logits), i.e. P(y=1|x), computed without gradients."""
        with torch.no_grad():
            # Route through forward() (via __call__) instead of self.net so
            # inference always uses the same computation as training.
            logits = self(x)
            return torch.sigmoid(logits)  # sigmoid at inference for probabilities
# During training: BCEWithLogitsLoss applies sigmoid internally (more stable)
model = BinaryClassifier(n_features=20)
criterion = nn.BCEWithLogitsLoss()
X = torch.randn(32, 20)
y = torch.randint(0, 2, (32,)).float()
# squeeze(-1) removes only the trailing logit dimension, (32, 1) -> (32,), so
# the output matches y. A bare squeeze() would also drop the batch dimension
# when the batch holds a single sample.
loss = criterion(model(X).squeeze(-1), y)

# For inference
probs = model.predict_proba(X)
print(f"Probability range: [{probs.min():.3f}, {probs.max():.3f}]")
print(f"Predictions: {(probs.squeeze(-1) > 0.5).sum().item()} of {len(probs)} predicted positive")
# ── USE CASE 2: Multi-label classification output ──
# Independent sigmoid per label (not softmax — labels are not mutually exclusive)
class MultiLabelClassifier(nn.Module):
    """One-hidden-layer MLP that emits one independent logit per label.

    Args:
        n_features: Size of the last dimension of the input tensor.
        n_labels: Number of labels, i.e. output logits per sample.
    """

    def __init__(self, n_features: int, n_labels: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, 64),
            nn.ReLU(),
            nn.Linear(64, n_labels),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw per-label logits (no sigmoid applied)."""
        return self.net(x)  # (batch, n_labels) logits

    def predict(self, x: torch.Tensor, threshold: float = 0.5) -> torch.Tensor:
        """Return an int tensor with 1 where sigmoid(logit) > threshold, else 0.

        Each label is thresholded independently; gradients are not tracked.
        """
        with torch.no_grad():
            probs = torch.sigmoid(self.forward(x))
            return (probs > threshold).int()


# When NOT to Use Sigmoid in Hidden Layers
import torch
import torch.nn as nn
# BAD: sigmoid in hidden layers of a deep network
class BadDeepNet(nn.Module):
    """Counter-example: an MLP (10 inputs -> 1 logit) with sigmoid hidden activations.

    Each sigmoid contributes a derivative factor of at most 0.25 during
    backpropagation, so the gradient reaching the first layer shrinks with depth.
    """

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(10, 64), nn.Sigmoid(),  # PROBLEM: vanishing gradient
            nn.Linear(64, 64), nn.Sigmoid(),  # gradient ≤ 0.25 at each layer
            nn.Linear(64, 64), nn.Sigmoid(),  # compounded → ~0.25^3 ≈ 0.016
            nn.Linear(64, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw logits with a trailing dimension of 1.

        Without this method, calling the module raises NotImplementedError.
        """
        return self.net(x)
# GOOD: ReLU in hidden layers, sigmoid only at output (if needed)
class GoodDeepNet(nn.Module):
    """Same layer sizes as the sigmoid variant (10 inputs -> 1 logit), with ReLU hidden activations.

    The network ends in a plain ``nn.Linear``; the sigmoid is left to the loss
    function (e.g. ``nn.BCEWithLogitsLoss``).
    """

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(10, 64), nn.ReLU(),  # gradient = 0 or 1, no saturation for positive inputs
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, 1),  # raw logit; sigmoid in loss fn
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw logits with a trailing dimension of 1.

        Without this method, calling the module raises NotImplementedError.
        """
        return self.net(x)
# Compare gradient norms at first layer
X = torch.randn(32, 10)
y = torch.randint(0, 2, (32,)).float()
criterion = nn.BCEWithLogitsLoss()
for name, model in [("Bad (Sigmoid hidden)", BadDeepNet()), ("Good (ReLU hidden)", GoodDeepNet())]:
    # squeeze(-1) removes only the trailing logit dimension so the output
    # lines up with y; a bare squeeze() would also drop a batch dimension of 1.
    loss = criterion(model(X).squeeze(-1), y)
    loss.backward()
    # The first parameter yielded is the first registered one (the first Linear's weight).
    first_weight_grad_norm = next(model.parameters()).grad.norm().item()
    print(f"{name:30s}: first layer grad norm = {first_weight_grad_norm:.2e}")


# Sigmoid Numerical Stability
import torch
import numpy as np
# Naive sigmoid implementation: exp(-z) overflows for very negative z
def sigmoid_naive(z: float) -> float:
    """Return 1 / (1 + exp(-z)) evaluated directly, with no overflow guard.

    Kept deliberately naive for comparison with the numerically stable variant.
    """
    return 1 / (1 + np.exp(-z))


# For z = -1000: exp(1000) overflows to inf → 1/(1+inf) = 0 (right value, but NumPy emits an overflow RuntimeWarning)
# For z = +1000: exp(-1000) underflows to 0 → 1/(1+0) = 1 (works)
# For z = -500 in float32: exp(500) = inf as well → 1/(1+inf) = 0, not NaN.
# NaN appears in the algebraically equivalent form exp(z)/(1+exp(z)) for large positive z (inf/inf).
# Numerically stable version:
def sigmoid_stable(z: "float | np.ndarray") -> "float | np.ndarray":
    """Return the sigmoid of ``z`` without ever exponentiating a positive number.

    Uses exp(-|z|), which always lies in (0, 1] and therefore cannot overflow:
      * z >= 0: 1 / (1 + exp(-z))
      * z <  0: exp(z) / (1 + exp(z))

    Args:
        z: A scalar or an array-like of real values.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like ``z``.
    """
    z_arr = np.asarray(z)
    decay = np.exp(-np.abs(z_arr))  # exp(-z) when z >= 0, exp(z) when z < 0
    result = np.where(z_arr >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns the 0-d array produced for scalar input back into
    # a NumPy scalar and leaves real arrays untouched.
    return result[()]
# PyTorch's torch.sigmoid is already numerically stable
z_extreme = torch.tensor([-1000.0, -500.0, 0.0, 500.0, 1000.0])
print("PyTorch sigmoid on extreme values:")
print(torch.sigmoid(z_extreme))  # [0, 0, 0.5, 1, 1] — no NaN

# BCEWithLogitsLoss is more stable than applying sigmoid then BCELoss:
# Uses: -y·z + log(1 + exp(z)) = max(z, 0) - z·y + log(1 + exp(-|z|))
logits = torch.tensor([100.0, -100.0, 0.0])
labels = torch.tensor([1.0, 0.0, 0.5])

# Sigmoid first: extreme logits saturate (sigmoid(100) is exactly 1.0 in
# float32), so a following log(1 - p) would be log(0) = -inf. BCELoss guards
# against this by clamping its log terms rather than computing them exactly.
probs = torch.sigmoid(logits)
print(f"Probs from extreme logits: {probs}")

# BCEWithLogitsLoss: stable
# Referenced as torch.nn because this snippet imports only torch and numpy.
bce_with_logits = torch.nn.BCEWithLogitsLoss()
loss = bce_with_logits(logits, labels)
print(f"BCEWithLogitsLoss (stable): {loss.item():.4f}")


# Interview Answer
"Sigmoid maps any real number to (0, 1): σ(z) = 1/(1+e^{-z}). Its gradient is σ(z)(1-σ(z)), with maximum 0.25 at z=0 and approaching 0 at ±∞ (saturation). Use sigmoid only at the output layer for binary classification (to get a probability) or multi-label classification (independent probability per label). Never use sigmoid in hidden layers of deep networks — the maximum gradient of 0.25 causes vanishing gradients: a 4-layer sigmoid network scales gradients by at most 0.25^4 ≈ 0.004 through the activations alone, making early layers untrainable. In practice: use ReLU (or variants) in hidden layers and let BCEWithLogitsLoss handle the sigmoid at the output — this is more numerically stable than applying sigmoid yourself because BCEWithLogitsLoss uses a log-sum-exp trick to avoid overflow. One exception: sigmoid can appear in gates within LSTM cells, where it controls information flow rather than serving as a hidden-layer activation."
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.