Deep Learning for AI Interviews · Lesson 28 of 56

Multi-Layer Perceptron Architecture

What an MLP Is

Multi-Layer Perceptron (MLP) = Feedforward Neural Network

Structure:
  Input Layer  → receives features
  Hidden Layer(s) → learn intermediate representations
  Output Layer → produces predictions

Each layer: Z = X @ W.T + b → A = activation(Z)

"Multi-layer" means at least one hidden layer.
"Perceptron" refers to a single neuron (linear threshold unit).
"Fully connected" / "Dense" are synonyms for MLP layers.

Building an MLP in PyTorch

Python

import torch
import torch.nn as nn

# ── Approach 1: Sequential (simple) ──
# 20 input features → 128 → 64 → 1 output, built from a flat layer list.
# Each hidden block is Linear → ReLU → Dropout(0.3); the last Linear has no
# activation after it.
mlp_simple = nn.Sequential(*[
    nn.Linear(20, 128), nn.ReLU(), nn.Dropout(0.3),  # hidden block 1
    nn.Linear(128, 64), nn.ReLU(), nn.Dropout(0.3),  # hidden block 2
    nn.Linear(64, 1),                                # output head
])

# ── Approach 2: Custom Module (flexible) ──
class ClinicalMLP(nn.Module):
    """
    MLP for clinical tabular data (readmission prediction).

    Architecture: for every hidden width, Linear → BatchNorm1d → ReLU →
    Dropout, followed by one Linear output head. The head has no activation,
    so the model emits a raw logit for 30-day readmission.

    Args:
        n_features: number of input features per sample
            (default 20: age, vitals, lab values, meds).
        hidden_dims: hidden layer widths, in order. Any sequence of ints is
            accepted. ``None`` selects the default ``[128, 64, 32]``. An
            empty sequence yields a plain linear model (input → 1 logit).
        dropout: dropout probability applied after each hidden activation.

    Input:  tensor of shape (batch, n_features)
    Output: tensor of shape (batch, 1)
    """

    def __init__(
        self,
        n_features: int = 20,
        hidden_dims: "list[int] | None" = None,
        dropout: float = 0.3,
    ):
        super().__init__()

        # A None sentinel replaces the former list-literal default so no
        # mutable object is shared between calls; list() also lets callers
        # pass a tuple.
        hidden = [128, 64, 32] if hidden_dims is None else list(hidden_dims)
        dims = [n_features, *hidden]
        layers = []

        for in_dim, out_dim in zip(dims[:-1], dims[1:]):
            layers.extend([
                nn.Linear(in_dim, out_dim),
                nn.BatchNorm1d(out_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])

        # dims[-1] is the last hidden width, or n_features when there are no
        # hidden layers (hidden_dims[-1] would raise IndexError in that case).
        layers.append(nn.Linear(dims[-1], 1))  # output head
        self.net = nn.Sequential(*layers)
        self._init_weights()

    def _init_weights(self) -> None:
        """Apply Kaiming-normal weights and zero biases to every Linear layer."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.kaiming_normal_(module.weight, nonlinearity="relu")
                nn.init.zeros_(module.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits of shape (batch, 1) for inputs of shape (batch, n_features)."""
        return self.net(x)

# Build the model with every size written out explicitly.
model = ClinicalMLP(n_features=20, hidden_dims=[128, 64, 32], dropout=0.3)

# Inspect: push one random batch (32 samples × 20 features) through the model
# and compare input/output shapes.
X = torch.randn(32, 20)
out = model(X)
print(f"Input:  {X.shape}")    # prints torch.Size([32, 20])
print(f"Output: {out.shape}")  # prints torch.Size([32, 1])

Parameter Counting

Python

def count_parameters(model: nn.Module) -> dict:
    """Print a per-parameter table and return the parameter totals.

    One row (name, shape, element count) is printed for every parameter with
    ``requires_grad=True``; frozen parameters are left out of the table but
    still contribute to the total.

    Returns:
        ``{"total": <all parameters>, "trainable": <requires_grad only>}``
    """
    rule = "-" * 80
    total = 0
    trainable = 0

    print(f"{'Layer':40s} {'Shape':25s} {'Params':>10s}")
    print(rule)
    for name, param in model.named_parameters():
        size = param.numel()
        total += size
        if not param.requires_grad:
            continue  # frozen: counted in the total, not listed
        trainable += size
        print(f"{name:40s} {str(tuple(param.shape)):25s} {size:>10,}")
    print(rule)
    print(f"{'Total':40s} {'':25s} {total:>10,}")
    print(f"{'Trainable':40s} {'':25s} {trainable:>10,}")

    return {"total": total, "trainable": trainable}

# Formula: Linear(in, out) → in × out weights + out biases
# 20→128: 20×128 + 128 = 2,688
# 128→64: 128×64 + 64  = 8,256
# 64→32:  64×32 + 32   = 2,080
# 32→1:   32×1 + 1     = 33
# Total (linear params): 13,057
# BatchNorm1d(n) adds 2n learnable params (scale + shift):
#   2 × (128 + 64 + 32) = 448
# Expected grand total for the default model: 13,057 + 448 = 13,505

# Default configuration: 20 features → [128, 64, 32] → 1 logit.
model = ClinicalMLP()
stats = count_parameters(model)
print(f"\nTotal parameters: {stats['total']:,}")

Depth vs Width

Python

import torch
import torch.nn as nn

def make_mlp(
    n_features: int,
    architecture: str,
) -> nn.Module:
    """Build an MLP from one of several named depth/width profiles.

    Every hidden layer is Linear → ReLU → Dropout(0.2). The final Linear maps
    to a single output and has no activation after it.

    Args:
        n_features: width of the input layer.
        architecture: one of "shallow-wide", "deep-narrow", "standard",
            "pyramid", "inverse-pyramid".

    Raises:
        KeyError: if ``architecture`` is not one of the names above.
    """
    hidden_by_profile = {
        "shallow-wide":    [512, 512],
        "deep-narrow":     [64, 64, 64, 64, 64],
        "standard":        [128, 64, 32],
        "pyramid":         [256, 128, 64, 32],
        "inverse-pyramid": [32, 64, 128, 256],
    }
    widths = hidden_by_profile[architecture]

    stack = []
    fan_in = n_features
    for width in widths:
        stack += [nn.Linear(fan_in, width), nn.ReLU(), nn.Dropout(0.2)]
        fan_in = width
    stack.append(nn.Linear(fan_in, 1))  # output layer: no activation
    return nn.Sequential(*stack)

# Compare parameter budgets for 20 input features. "inverse-pyramid" is
# available in make_mlp but is not part of this comparison.
# Expected counts: shallow-wide 273,921 · deep-narrow 18,049 ·
#                  standard 13,057 · pyramid 48,641
for arch in ["shallow-wide", "deep-narrow", "standard", "pyramid"]:
    m = make_mlp(20, arch)
    n_params = sum(p.numel() for p in m.parameters())  # weights + biases
    print(f"{arch:20s}: {n_params:>8,} params")

# General rule: pyramid (wider early, narrower late) works well for tabular data
# Depth helps with complex feature interactions
# Width helps with representing many features simultaneously

MLP for Different Output Types

Python

import torch.nn as nn

class FlexibleMLP(nn.Module):
    """MLP with configurable output for different tasks.

    A shared backbone (Linear → ReLU → Dropout(0.2) per hidden width) feeds a
    single Linear head with ``n_outputs`` units. The head output is returned
    unchanged for every task; ``task`` is stored as metadata and only tells
    the caller which loss to pair with the model:

        "binary"     → BCEWithLogitsLoss
        "multiclass" → CrossEntropyLoss
        "regression" → MSELoss
        "multilabel" → BCEWithLogitsLoss per output

    Args:
        n_features: number of input features.
        n_outputs: number of units in the output head.
        task: task label (see table above).
        hidden_dims: hidden layer widths; ``None`` or an empty list falls
            back to ``[128, 64]``.
    """

    def __init__(
        self,
        n_features: int,
        n_outputs: int,
        task: str = "binary",
        hidden_dims: list[int] = None,
    ):
        super().__init__()
        if not hidden_dims:
            hidden_dims = [128, 64]

        sizes = [n_features] + hidden_dims
        blocks = []
        for fan_in, fan_out in zip(sizes, sizes[1:]):
            blocks += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.2)]

        self.backbone = nn.Sequential(*blocks)
        self.head = nn.Linear(sizes[-1], n_outputs)
        self.task = task

    def forward(self, x):
        # No output activation for any task: the matching loss (see class
        # docstring) consumes the raw head output.
        return self.head(self.backbone(x))

# Positional arguments below are (n_features, n_outputs); every model takes
# 20 input features and uses the default hidden widths.

# Binary: patient readmitted? → (batch, 1) logit
binary_model    = FlexibleMLP(20, 1, task="binary")

# Multi-class: severity level 1–5? → (batch, 5) logits, one per level
multiclass_model = FlexibleMLP(20, 5, task="multiclass")

# Regression: predict INR value → (batch, 1) value
regression_model = FlexibleMLP(20, 1, task="regression")

# Multi-label: which conditions present? → (batch, n_conditions) logits
# (n_conditions = 10 here)
multilabel_model = FlexibleMLP(20, 10, task="multilabel")

Training Template

Python

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

def train_mlp(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    n_epochs: int = 50,
    lr: float = 3e-4,
) -> nn.Module:
    """Train a binary classifier and return it with its best weights loaded.

    Recipe: AdamW (weight decay 1e-4), BCEWithLogitsLoss, cosine LR schedule
    over ``n_epochs``, gradient clipping at norm 1.0. After every epoch the
    mean validation loss is computed; the weights with the lowest validation
    loss are snapshotted and restored before returning. Training always runs
    for all ``n_epochs`` (there is no early exit).

    Args:
        model: network producing one logit per sample, e.g. shape (batch, 1).
        train_loader: yields ``(X, y)`` batches for optimisation.
        val_loader: yields ``(X, y)`` batches for model selection.
        n_epochs: number of passes over ``train_loader``.
        lr: initial learning rate for AdamW.

    Returns:
        The same model, moved to CUDA when available, carrying the weights of
        the best validation epoch. If no epoch produced a finite improvement,
        the weights from the last step are kept.

    Raises:
        ValueError: if ``val_loader`` yields no batches.
    """
    import copy  # imported once here instead of inside the epoch loop

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss()
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)

    def batch_loss(X: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """Compute the BCE-with-logits loss for one batch."""
        logits = model(X.to(device))
        # BCEWithLogitsLoss needs floating-point targets of the logits' shape.
        target = y.to(device=device, dtype=logits.dtype)
        # Reshape to the target's shape rather than calling .squeeze(): a bare
        # squeeze turns a (1, 1) output from a final batch of size 1 into a
        # 0-d tensor that no longer matches a (1,) target.
        return criterion(logits.reshape(target.shape), target)

    best_val_loss = float("inf")
    best_weights = None

    for epoch in range(n_epochs):
        # Training
        model.train()
        for X, y in train_loader:
            optimizer.zero_grad()
            loss = batch_loss(X, y)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0.0
        n_val_batches = 0
        with torch.no_grad():
            for X, y in val_loader:
                val_loss += batch_loss(X, y).item()
                n_val_batches += 1
        if n_val_batches == 0:
            raise ValueError("val_loader yielded no batches; cannot compute validation loss")
        val_loss /= n_val_batches

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Deep copy: state_dict() returns references to the live tensors,
            # which later optimizer steps would overwrite.
            best_weights = copy.deepcopy(model.state_dict())

        scheduler.step()

        if epoch % 10 == 0:
            print(f"Epoch {epoch:3d}: val_loss={val_loss:.4f}")

    # best_weights stays None when n_epochs == 0 or every validation loss was
    # NaN; keep the current weights in that case.
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return model

Interview Answer

"An MLP (Multi-Layer Perceptron) is a fully-connected feedforward network: input → hidden layers → output. Each layer computes Z = X @ W.T + b then applies a non-linear activation (ReLU for hidden layers). Parameter count: Linear(in, out) has in×out + out parameters. In PyTorch, build with nn.Sequential or subclass nn.Module. Key design choices: (1) Width (neurons per layer) vs depth (number of layers) — for tabular clinical data, a pyramid (wider early) with 2–4 layers typically works well; (2) Dropout in hidden layers for regularisation, often combined with BatchNorm to stabilise training; (3) Kaiming init for ReLU activations. Output layer has no activation — use BCEWithLogitsLoss for binary, CrossEntropyLoss for multi-class, MSE for regression. The full recipe: AdamW optimiser, cosine LR schedule, gradient clipping at norm 1.0, and model selection on validation loss (keep the best checkpoint; add a patience counter if you want true early stopping)."

The Loss Landscape and Local Minima

Next Lesson

Network Depth vs Width: Tradeoffs