MLP Architecture
Multi-layer perceptrons from scratch — hidden layers, activation functions, parameter counting, and building MLPs for clinical tabular data.
What an MLP Is
Multi-Layer Perceptron (MLP) = Feedforward Neural Network
Structure:
Input Layer — receives features
Hidden Layer(s) — learn intermediate representations
Output Layer — produces predictions
Each layer: Z = X @ W.T + b → A = activation(Z)
"Multi-layer" means at least one hidden layer.
"Perceptron" refers to a single neuron (linear threshold unit).
"Fully connected" / "Dense" are synonyms for MLP layers.
Building an MLP in PyTorch
import torch
import torch.nn as nn
# ── Approach 1: Sequential (simple) ──
# Baseline stack: 20 inputs -> 128 -> 64 -> 1 output. Each hidden Linear is
# followed by ReLU and dropout (p=0.3); the last Linear has no activation.
mlp_simple = nn.Sequential(
    nn.Linear(in_features=20, out_features=128),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    nn.Linear(in_features=128, out_features=64),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    nn.Linear(in_features=64, out_features=1),
)
# ── Approach 2: Custom Module (flexible) ──
class ClinicalMLP(nn.Module):
    """
    MLP for clinical tabular data (readmission prediction).

    Every hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout; a final
    Linear layer maps to a single logit with no activation.

    Input:  (batch, n_features), e.g. 20 features (age, vitals, lab values, meds)
    Output: (batch, 1) logit for 30-day readmission

    Args:
        n_features: Number of input features.
        hidden_dims: Width of each hidden layer, in order. ``None`` selects
            the default ``[128, 64, 32]``; an empty sequence yields a single
            Linear layer from the inputs straight to the logit.
        dropout: Dropout probability applied after every hidden activation.
    """

    def __init__(
        self,
        n_features: int = 20,
        hidden_dims: "list[int] | None" = None,
        dropout: float = 0.3,
    ):
        super().__init__()
        # Build the default per instance: a list literal in the signature is
        # a single object shared by every call.
        if hidden_dims is None:
            hidden_dims = [128, 64, 32]

        # Unpacking (rather than list concatenation) also accepts tuples.
        dims = [n_features, *hidden_dims]

        layers = []
        for in_dim, out_dim in zip(dims[:-1], dims[1:]):
            layers.extend([
                nn.Linear(in_dim, out_dim),
                nn.BatchNorm1d(out_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
        # dims[-1] is the last hidden width, or n_features when there are no
        # hidden layers, so an empty hidden_dims no longer raises IndexError.
        layers.append(nn.Linear(dims[-1], 1))  # output head
        self.net = nn.Sequential(*layers)
        self._init_weights()

    def _init_weights(self) -> None:
        """Kaiming-normal weights and zero biases for every Linear layer.

        Other module types (BatchNorm1d, Dropout, ReLU) are left untouched.
        """
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.kaiming_normal_(module.weight, nonlinearity="relu")
                nn.init.zeros_(module.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw logit for each row of ``x``."""
        return self.net(x)
# Instantiate with the same values the constructor declares as defaults.
model = ClinicalMLP(n_features=20, hidden_dims=[128, 64, 32], dropout=0.3)
# Inspect: run a random batch of 32 rows x 20 features through the model
X = torch.randn(32, 20)
out = model(X)
print(f"Input: {X.shape}") # torch.Size([32, 20])
print(f"Output: {out.shape}") # (32, 1)
Parameter Counting
def count_parameters(model: nn.Module) -> dict:
    """Print a per-parameter table and return the parameter totals.

    One row is printed for each parameter that requires gradients, followed
    by the overall and trainable element counts. The returned dict has the
    keys ``"total"`` and ``"trainable"``.
    """
    named = list(model.named_parameters())
    trainable_named = [(name, p) for name, p in named if p.requires_grad]

    total = sum(p.numel() for _, p in named)
    trainable = sum(p.numel() for _, p in trainable_named)

    rule = "-" * 80
    print(f"{'Layer':40s} {'Shape':25s} {'Params':>10s}")
    print(rule)
    for name, p in trainable_named:
        print(f"{name:40s} {str(tuple(p.shape)):25s} {p.numel():>10,}")
    print(rule)
    print(f"{'Total':40s} {'':25s} {total:>10,}")
    print(f"{'Trainable':40s} {'':25s} {trainable:>10,}")

    return {"total": total, "trainable": trainable}
# Formula: Linear(in, out) → in × out + out parameters
# 20→128: 20×128 + 128 = 2,688
# 128→64: 128×64 + 64 = 8,256
# 64→32: 64×32 + 32 = 2,080
# 32→1: 32×1 + 1 = 33
# Total (linear params): 13,057 (+ BatchNorm params: 2 × (128 + 64 + 32) = 448 → 13,505 overall)
# Rebuild the model with all-default arguments, then print the per-parameter
# table; count_parameters returns the "total" and "trainable" counts.
model = ClinicalMLP()
stats = count_parameters(model)
print(f"\nTotal parameters: {stats['total']:,}")
Depth vs Width
import torch
import torch.nn as nn
def make_mlp(
    n_features: int,
    architecture: str,
) -> nn.Module:
    """Build a single-output MLP from a named depth/width profile.

    ``architecture`` selects the hidden-layer widths. Every hidden Linear is
    followed by ReLU and Dropout(0.2); the final Linear (to one output) has
    no activation. An unknown profile name raises ``KeyError``.
    """
    hidden_by_profile = {
        "shallow-wide": [512, 512],
        "deep-narrow": [64, 64, 64, 64, 64],
        "standard": [128, 64, 32],
        "pyramid": [256, 128, 64, 32],
        "inverse-pyramid": [32, 64, 128, 256],
    }
    widths = hidden_by_profile[architecture]

    modules = []
    fan_in = n_features
    for width in widths:
        modules += [nn.Linear(fan_in, width), nn.ReLU(), nn.Dropout(0.2)]
        fan_in = width
    modules.append(nn.Linear(fan_in, 1))  # output layer stays bare
    return nn.Sequential(*modules)
# Compare total parameter counts across profiles for a 20-feature input.
for arch in ("shallow-wide", "deep-narrow", "standard", "pyramid"):
    m = make_mlp(20, arch)
    n_params = sum(weights.numel() for weights in m.parameters())
    print(f"{arch:20s}: {n_params:>8,} params")
# General rule: pyramid (wider early, narrower late) works well for tabular data
# Depth helps with complex feature interactions
# Width helps with representing many features simultaneously
MLP for Different Output Types
import torch.nn as nn
class FlexibleMLP(nn.Module):
    """MLP with configurable output for different tasks.

    A backbone of Linear -> ReLU -> Dropout(0.2) stages feeds one Linear head
    with ``n_outputs`` units. ``forward`` always returns the head's raw
    output; ``task`` is stored as a label only and does not change the
    computation. Suggested loss per task:

        "binary"      -> BCEWithLogitsLoss
        "multiclass"  -> CrossEntropyLoss
        "regression"  -> MSELoss
        "multilabel"  -> BCEWithLogitsLoss per output

    Args:
        n_features: Number of input features.
        n_outputs: Number of units in the output head.
        task: Task label kept on ``self.task``.
        hidden_dims: Hidden-layer widths in order. ``None`` selects
            ``[128, 64]``; an empty sequence gives a head directly on the
            inputs.
    """

    def __init__(
        self,
        n_features: int,
        n_outputs: int,
        task: str = "binary",
        hidden_dims: "list[int] | None" = None,
    ):
        super().__init__()
        # Test for None explicitly so a caller's empty list is respected
        # instead of being silently replaced by the default.
        if hidden_dims is None:
            hidden_dims = [128, 64]
        dims = [n_features, *hidden_dims]

        backbone = []
        for in_d, out_d in zip(dims[:-1], dims[1:]):
            backbone.extend([nn.Linear(in_d, out_d), nn.ReLU(), nn.Dropout(0.2)])
        self.backbone = nn.Sequential(*backbone)
        # dims[-1] falls back to n_features when there are no hidden layers.
        self.head = nn.Linear(dims[-1], n_outputs)
        self.task = task

    def forward(self, x):
        """Return raw head outputs (logits or regression values)."""
        # Every task branch returned the same tensor, so no dispatch is needed.
        return self.head(self.backbone(x))
# Binary: patient readmitted? → (batch, 1) logit
binary_model = FlexibleMLP(20, 1, task="binary")
# Multi-class: severity level 1–5? → (batch, 5) logits
multiclass_model = FlexibleMLP(20, 5, task="multiclass")
# Regression: predict INR value → (batch, 1) value
regression_model = FlexibleMLP(20, 1, task="regression")
# Multi-label: which conditions present? → (batch, n_conditions) logits
multilabel_model = FlexibleMLP(20, 10, task="multilabel")
Training Template
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
def train_mlp(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    n_epochs: int = 50,
    lr: float = 3e-4,
) -> nn.Module:
    """Train a binary classifier and restore its best-validation weights.

    Uses AdamW (weight_decay=1e-4), BCEWithLogitsLoss, a cosine learning-rate
    schedule over ``n_epochs`` and gradient-norm clipping at 1.0. After each
    epoch the mean of the per-batch validation losses is computed; the
    weights from the epoch with the lowest value are loaded back before
    returning. Training always runs for all ``n_epochs``.

    Args:
        model: Network whose output has a trailing dimension of size 1 that
            is squeezed to match the targets (e.g. ``(batch, 1)`` logits
            against ``(batch,)`` targets).
        train_loader: Iterable of ``(X, y)`` training batches.
        val_loader: Iterable of ``(X, y)`` validation batches.
        n_epochs: Number of epochs; also the cosine schedule's ``T_max``.
        lr: Initial learning rate for AdamW.

    Returns:
        ``model`` moved to the selected device (CUDA if available, else CPU).

    Raises:
        ValueError: If ``val_loader`` yields no batches.
    """
    import copy  # hoisted: was re-imported inside the epoch loop

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss()
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)

    best_val_loss = float("inf")
    best_weights = None

    for epoch in range(n_epochs):
        # Training
        model.train()
        for X, y in train_loader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            # squeeze(-1) drops only the trailing logit dimension. A bare
            # squeeze() also removes a batch dimension of size 1, which makes
            # the shapes disagree with y.
            logits = model(X).squeeze(-1)
            # BCEWithLogitsLoss needs floating-point targets; this is a no-op
            # when y already has the logits' dtype.
            loss = criterion(logits, y.to(logits.dtype))
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0.0
        n_val_batches = 0  # counted directly so an empty loader is detected
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(device), y.to(device)
                logits = model(X).squeeze(-1)
                val_loss += criterion(logits, y.to(logits.dtype)).item()
                n_val_batches += 1
        if n_val_batches == 0:
            raise ValueError("val_loader yielded no batches; cannot compute validation loss")
        val_loss /= n_val_batches

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_weights = copy.deepcopy(model.state_dict())

        scheduler.step()
        if epoch % 10 == 0:
            print(f"Epoch {epoch:3d}: val_loss={val_loss:.4f}")

    # best_weights stays None when no epoch ran (n_epochs <= 0) or every
    # validation loss was NaN; keep the current weights in that case.
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return model


# Interview Answer
"An MLP (Multi-Layer Perceptron) is a fully-connected feedforward network: input β hidden layers β output. Each layer computes Z = X @ W.T + b then applies a non-linear activation (ReLU for hidden layers). Parameter count: Linear(in, out) has inΓout + out parameters. In PyTorch, build with nn.Sequential or subclass nn.Module. Key design choices: (1) Width (neurons per layer) vs depth (number of layers) β for tabular clinical data, a pyramid (wider early) with 2β4 layers typically works well; (2) Always include Dropout and BatchNorm in hidden layers for regularisation; (3) Kaiming init for ReLU activations. Output layer has no activation β use BCEWithLogitsLoss for binary, CrossEntropyLoss for multi-class, MSE for regression. The full recipe: AdamW optimiser, cosine LR schedule, gradient clipping at norm 1.0, early stopping on validation loss."
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.