Deep Learning for AI Interviews · Lesson 28 of 56
Multi-Layer Perceptron Architecture
What an MLP Is
Multi-Layer Perceptron (MLP) = Feedforward Neural Network
Structure:
Input Layer → receives features
Hidden Layer(s) → learn intermediate representations
Output Layer → produces predictions
Each layer: Z = X @ W.T + b → A = activation(Z)
"Multi-layer" means at least one hidden layer.
"Perceptron" refers to a single neuron (linear threshold unit).
"Fully connected" / "Dense" are synonyms for MLP layers.
Building an MLP in PyTorch
import torch
import torch.nn as nn
# ── Approach 1: Sequential (simple) ──
# Three Linear layers (20 → 128 → 64 → 1). Each hidden layer is followed by
# ReLU and 30% dropout; the last layer emits one raw value with no activation.
mlp_simple = nn.Sequential(
    nn.Linear(in_features=20, out_features=128),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    nn.Linear(in_features=128, out_features=64),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    nn.Linear(in_features=64, out_features=1),
)
# ── Approach 2: Custom Module (flexible) ──
class ClinicalMLP(nn.Module):
    """
    MLP for clinical tabular data (readmission prediction).

    Input: 20 features (age, vitals, lab values, meds)
    Output: logit for 30-day readmission

    Every hidden stage is Linear → BatchNorm1d → ReLU → Dropout. The output
    head is a single Linear unit with no activation, so the model returns a
    raw logit of shape (batch, 1).

    Args:
        n_features: Number of input features per sample.
        hidden_dims: Width of each hidden layer, in order. ``None`` selects
            the default ``[128, 64, 32]``. An empty sequence builds a model
            with no hidden layers (a single Linear map to one logit).
        dropout: Dropout probability applied after every hidden activation.
    """

    def __init__(
        self,
        n_features: int = 20,
        hidden_dims: "list[int] | None" = None,
        dropout: float = 0.3,
    ):
        super().__init__()
        # A None sentinel replaces the former mutable list default, which
        # would be shared by every instance created without the argument.
        if hidden_dims is None:
            hidden_dims = [128, 64, 32]
        # Unpacking accepts any sequence (list or tuple) of widths.
        dims = [n_features, *hidden_dims]
        layers = []
        for in_dim, out_dim in zip(dims[:-1], dims[1:]):
            layers.extend([
                nn.Linear(in_dim, out_dim),
                nn.BatchNorm1d(out_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
        # dims[-1] rather than hidden_dims[-1]: equals the last hidden width,
        # and falls back to n_features when there are no hidden layers.
        layers.append(nn.Linear(dims[-1], 1))  # output head
        self.net = nn.Sequential(*layers)
        self._init_weights()

    def _init_weights(self) -> None:
        """Apply Kaiming-normal weights and zero biases to every Linear layer."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.kaiming_normal_(module.weight, nonlinearity="relu")
                nn.init.zeros_(module.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the layer stack and return the raw logit."""
        return self.net(x)
# Build the network, spelling out every constructor argument explicitly.
model = ClinicalMLP(n_features=20, hidden_dims=[128, 64, 32], dropout=0.3)
# Inspect: push a random batch (32 rows × 20 features) through the network
X = torch.randn(32, 20)
out = model(X)
print(f"Input: {X.shape}") # prints torch.Size([32, 20])
print(f"Output: {out.shape}") # (32, 1)
Parameter Counting
def count_parameters(model: nn.Module) -> dict:
    """Print a per-parameter table and return the parameter totals.

    One row is printed for each trainable parameter (name, shape, element
    count), followed by the overall and trainable totals.

    Returns:
        ``{"total": ..., "trainable": ...}`` with element counts as ints.
    """
    rule = "-" * 80
    total = 0
    trainable = 0
    rows = []
    # One pass gathers both totals and the rows for the table.
    for name, param in model.named_parameters():
        size = param.numel()
        total += size
        if not param.requires_grad:
            continue  # frozen parameters count towards the total only
        trainable += size
        rows.append(f"{name:40s} {str(tuple(param.shape)):25s} {size:>10,}")

    print(f"{'Layer':40s} {'Shape':25s} {'Params':>10s}")
    print(rule)
    for row in rows:
        print(row)
    print(rule)
    print(f"{'Total':40s} {'':25s} {total:>10,}")
    print(f"{'Trainable':40s} {'':25s} {trainable:>10,}")
    return {"total": total, "trainable": trainable}
# Formula: Linear(in, out) → in × out + out parameters
# 20→128: 20×128 + 128 = 2,688
# 128→64: 128×64 + 64 = 8,256
# 64→32: 64×32 + 32 = 2,080
# 32→1: 32×1 + 1 = 33
# Total (linear params): 13,057 (+ BatchNorm params)
# Rebuild the model with its default arguments, then print the per-parameter
# table; `stats` holds the returned {"total", "trainable"} counts.
model = ClinicalMLP()
stats = count_parameters(model)
print(f"\nTotal parameters: {stats['total']:,}")
Depth vs Width
import torch
import torch.nn as nn
def make_mlp(
    n_features: int,
    architecture: str,
    dropout: float = 0.2,
) -> nn.Module:
    """Create MLPs with different depth/width profiles.

    Every hidden Linear layer is followed by ReLU and Dropout; the final
    Linear layer maps to a single output and has no activation.

    Args:
        n_features: Number of input features.
        architecture: One of "shallow-wide", "deep-narrow", "standard",
            "pyramid" or "inverse-pyramid".
        dropout: Dropout probability after each hidden activation
            (defaults to 0.2).

    Returns:
        An ``nn.Sequential`` ending in ``Linear(..., 1)``.

    Raises:
        KeyError: If ``architecture`` is not a known profile name.
    """
    configs = {
        # name: hidden layer widths, input side first
        "shallow-wide": [512, 512],
        "deep-narrow": [64, 64, 64, 64, 64],
        "standard": [128, 64, 32],
        "pyramid": [256, 128, 64, 32],
        "inverse-pyramid": [32, 64, 128, 256],
    }
    try:
        hidden = configs[architecture]
    except KeyError:
        # Still a KeyError for existing callers, but now it names the options.
        raise KeyError(
            f"Unknown architecture {architecture!r}; "
            f"expected one of {sorted(configs)}"
        ) from None

    dims = [n_features, *hidden, 1]
    output_index = len(dims) - 2  # position of the final (output) Linear layer
    layers = []
    for i, (in_d, out_d) in enumerate(zip(dims[:-1], dims[1:])):
        layers.append(nn.Linear(in_d, out_d))
        if i < output_index:  # no activation on output layer
            layers.extend([nn.ReLU(), nn.Dropout(dropout)])
    return nn.Sequential(*layers)
# Compare how many parameters each depth/width profile needs for 20 inputs.
for arch in ("shallow-wide", "deep-narrow", "standard", "pyramid"):
    m = make_mlp(20, arch)
    n_params = sum(tensor.numel() for tensor in m.parameters())
    print(f"{arch:20s}: {n_params:>8,} params")
# General rule: pyramid (wider early, narrower late) works well for tabular data
# Depth helps with complex feature interactions
# Width helps with representing many features simultaneously
MLP for Different Output Types
import torch.nn as nn
class FlexibleMLP(nn.Module):
    """MLP with configurable output for different tasks.

    The network always returns the raw output of the final Linear head (no
    sigmoid or softmax). ``task`` is stored for reference and indicates
    which loss to pair with the model:

        "binary"      → BCEWithLogitsLoss
        "multiclass"  → CrossEntropyLoss
        "regression"  → MSELoss
        "multilabel"  → BCEWithLogitsLoss per output

    Args:
        n_features: Number of input features.
        n_outputs: Width of the output head.
        task: Task label; does not change the computation.
        hidden_dims: Hidden layer widths. ``None`` or an empty list selects
            the default ``[128, 64]``.
        dropout: Dropout probability after each hidden activation
            (defaults to 0.2).
    """

    def __init__(
        self,
        n_features: int,
        n_outputs: int,
        task: str = "binary",
        hidden_dims: "list[int] | None" = None,
        dropout: float = 0.2,
    ):
        super().__init__()
        # Any falsy value (None or an empty list) selects the default stack.
        hidden_dims = hidden_dims or [128, 64]
        dims = [n_features, *hidden_dims]
        backbone = []
        for in_d, out_d in zip(dims[:-1], dims[1:]):
            backbone.extend([nn.Linear(in_d, out_d), nn.ReLU(), nn.Dropout(dropout)])
        self.backbone = nn.Sequential(*backbone)
        self.head = nn.Linear(dims[-1], n_outputs)
        self.task = task

    def forward(self, x):
        """Return raw head outputs of shape (batch, n_outputs).

        Every task returns the same raw values, so no branching on
        ``self.task`` is needed; the loss function applies any sigmoid or
        softmax itself.
        """
        features = self.backbone(x)
        return self.head(features)
# Binary: patient readmitted? → (batch, 1) logit; train with BCEWithLogitsLoss
binary_model = FlexibleMLP(20, 1, task="binary")
# Multi-class: severity level 1–5? → (batch, 5) logits; train with
# CrossEntropyLoss (which expects class indices 0–4, so shift the 1–5 labels)
multiclass_model = FlexibleMLP(20, 5, task="multiclass")
# Regression: predict INR value → (batch, 1) unbounded value; train with MSELoss
regression_model = FlexibleMLP(20, 1, task="regression")
# Multi-label: which conditions present? → (batch, n_conditions) logits
multilabel_model = FlexibleMLP(20, 10, task="multilabel")
Training Template
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
def train_mlp(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    n_epochs: int = 50,
    lr: float = 3e-4,
) -> nn.Module:
    """Train a binary-classification MLP and return it with its best weights.

    Uses AdamW (weight decay 1e-4), BCEWithLogitsLoss, a cosine-annealed
    learning rate over ``n_epochs`` and gradient clipping at norm 1.0. After
    every epoch the mean validation loss per batch is computed; the weights
    from the epoch with the lowest validation loss are restored before
    returning.

    Args:
        model: Network producing one logit per sample, shape (batch, 1).
        train_loader: Yields ``(X, y)`` batches; ``y`` has shape (batch,)
            and holds 0/1 labels (any numeric dtype).
        val_loader: Same format as ``train_loader``, used for model selection.
        n_epochs: Number of passes over ``train_loader``.
        lr: Initial learning rate for AdamW.

    Returns:
        The model, on the selected device, loaded with the best weights. If
        no epoch improved the validation loss (e.g. ``n_epochs == 0`` or NaN
        losses), the model is returned as it stands after training.
    """
    import copy  # function-scope import, hoisted out of the epoch loop

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss()
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)

    def batch_loss(X: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """Move one batch to the device and return its BCE-with-logits loss."""
        X, y = X.to(device), y.to(device)
        # squeeze(-1) drops only the trailing logit axis. A bare squeeze()
        # would also drop the batch axis when the batch holds one sample,
        # and the loss would then reject the shape mismatch.
        logits = model(X).squeeze(-1)
        # The loss requires floating-point targets of the logits' dtype.
        return criterion(logits, y.to(logits.dtype))

    best_val_loss = float("inf")
    best_weights = None
    for epoch in range(n_epochs):
        # Training
        model.train()
        for X, y in train_loader:
            optimizer.zero_grad()
            loss = batch_loss(X, y)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X, y in val_loader:
                val_loss += batch_loss(X, y).item()
        val_loss /= len(val_loader)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Deep copy: state_dict() tensors alias the live parameters.
            best_weights = copy.deepcopy(model.state_dict())

        scheduler.step()
        if epoch % 10 == 0:
            print(f"Epoch {epoch:3d}: val_loss={val_loss:.4f}")

    # best_weights stays None when no epoch ran or no loss compared below inf.
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return model


# Interview Answer
"An MLP (Multi-Layer Perceptron) is a fully-connected feedforward network: input → hidden layers → output. Each layer computes Z = X @ W.T + b then applies a non-linear activation (ReLU for hidden layers). Parameter count: Linear(in, out) has in×out + out parameters. In PyTorch, build with nn.Sequential or subclass nn.Module. Key design choices: (1) Width (neurons per layer) vs depth (number of layers) — for tabular clinical data, a pyramid (wider early) with 2–4 layers typically works well; (2) Dropout in hidden layers for regularisation, plus BatchNorm to stabilise training; (3) Kaiming init for ReLU activations. Output layer has no activation — use BCEWithLogitsLoss for binary, CrossEntropyLoss for multi-class, MSE for regression. The full recipe: AdamW optimiser, cosine LR schedule, gradient clipping at norm 1.0, and restoring the best checkpoint (or early stopping) on validation loss."