MLP Architecture

What an MLP Is

Multi-Layer Perceptron (MLP) = Feedforward Neural Network

Structure:
  Input Layer  → receives features
  Hidden Layer(s) → learn intermediate representations
  Output Layer → produces predictions

Each layer: Z = X @ W.T + b → A = activation(Z)

"Multi-layer" means at least one hidden layer.
"Perceptron" refers to a single neuron (linear threshold unit).
"Fully connected" / "Dense" are synonyms for MLP layers.

Building an MLP in PyTorch

Python

import torch
import torch.nn as nn

# ── Approach 1: Sequential (simple) ──
# Layers run in the order listed: 20 input features → 128 → 64 → 1 output.
mlp_simple = nn.Sequential(
    nn.Linear(20, 128),   # 20 input features → 128 hidden units
    nn.ReLU(),
    nn.Dropout(0.3),      # drop probability 0.3
    nn.Linear(128, 64),   # 128 → 64 hidden units
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(64, 1),     # single raw output; no activation is applied here
)

# ── Approach 2: Custom Module (flexible) ──
class ClinicalMLP(nn.Module):
    """
    MLP for clinical tabular data (readmission prediction).
    Input: 20 features (age, vitals, lab values, meds)
    Output: logit for 30-day readmission
    """
    
    def __init__(
        self,
        n_features: int = 20,
        hidden_dims: list[int] = [128, 64, 32],
        dropout: float = 0.3,
    ):
        super().__init__()
        
        dims = [n_features] + hidden_dims
        layers = []
        
        for in_dim, out_dim in zip(dims[:-1], dims[1:]):
            layers.extend([
                nn.Linear(in_dim, out_dim),
                nn.BatchNorm1d(out_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
        
        layers.append(nn.Linear(hidden_dims[-1], 1))  # output head
        self.net = nn.Sequential(*layers)
        self._init_weights()
    
    def _init_weights(self) -> None:
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.kaiming_normal_(module.weight, nonlinearity="relu")
                nn.init.zeros_(module.bias)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

# Arguments spelled out explicitly; they match ClinicalMLP's defaults.
model = ClinicalMLP(n_features=20, hidden_dims=[128, 64, 32], dropout=0.3)

# Inspect: push a random batch of 32 samples × 20 features through the model.
# NOTE: a freshly built module is in training mode, so Dropout is active here.
X = torch.randn(32, 20)
out = model(X)
print(f"Input:  {X.shape}")    # torch.Size([32, 20])
print(f"Output: {out.shape}")  # torch.Size([32, 1])

Parameter Counting

Python

def count_parameters(model: nn.Module) -> dict:
    """Print a per-parameter table and return the parameter totals.

    Only parameters with ``requires_grad=True`` get a table row; frozen
    parameters still contribute to the ``total`` figure.

    Returns:
        ``{"total": <all parameters>, "trainable": <requires_grad only>}``
    """
    rule = "-" * 80
    n_total = 0
    n_trainable = 0

    print(f"{'Layer':40s} {'Shape':25s} {'Params':>10s}")
    print(rule)
    # One pass over the named parameters feeds both the table and the totals.
    for param_name, param in model.named_parameters():
        size = param.numel()
        n_total += size
        if not param.requires_grad:
            continue
        n_trainable += size
        print(f"{param_name:40s} {str(tuple(param.shape)):25s} {size:>10,}")
    print(rule)
    print(f"{'Total':40s} {'':25s} {n_total:>10,}")
    print(f"{'Trainable':40s} {'':25s} {n_trainable:>10,}")

    return {"total": n_total, "trainable": n_trainable}

# Formula: Linear(in, out) → in × out + out parameters (weights + biases)
# 20→128: 20×128 + 128 = 2,688
# 128→64: 128×64 + 64  = 8,256
# 64→32:  64×32 + 32   = 2,080
# 32→1:   32×1 + 1     = 33
# Total (linear params): 13,057
# BatchNorm1d(n) adds 2n learnable params (scale + shift):
#   2 × (128 + 64 + 32) = 448 → 13,505 overall for the default ClinicalMLP

model = ClinicalMLP()
stats = count_parameters(model)
print(f"\nTotal parameters: {stats['total']:,}")

Depth vs Width

Python

import torch
import torch.nn as nn

def make_mlp(
    n_features: int,
    architecture: str,
) -> nn.Module:
    """Build a single-output MLP from a named depth/width profile.

    Every hidden layer is Linear → ReLU → Dropout(0.2); the final Linear maps
    to one output and is left without an activation.

    Args:
        n_features: Number of input features.
        architecture: Profile name; one of the keys of the table below.

    Raises:
        KeyError: If ``architecture`` is not a known profile name.
    """
    # Hidden-layer widths for each named profile.
    hidden_by_profile = {
        "shallow-wide":    [512, 512],
        "deep-narrow":     [64, 64, 64, 64, 64],
        "standard":        [128, 64, 32],
        "pyramid":         [256, 128, 64, 32],
        "inverse-pyramid": [32, 64, 128, 256],
    }

    widths = [n_features, *hidden_by_profile[architecture]]

    modules = []
    # Hidden blocks: consecutive width pairs, each followed by ReLU + Dropout.
    for fan_in, fan_out in zip(widths, widths[1:]):
        modules += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(0.2)]
    # Output layer: last hidden width → 1, no activation.
    modules.append(nn.Linear(widths[-1], 1))
    return nn.Sequential(*modules)

# Compare total parameter counts across profiles for 20 input features.
# NOTE: "inverse-pyramid" is defined in make_mlp but is not part of this comparison.
for arch in ["shallow-wide", "deep-narrow", "standard", "pyramid"]:
    m = make_mlp(20, arch)
    n_params = sum(p.numel() for p in m.parameters())
    print(f"{arch:20s}: {n_params:>8,} params")

# General rule: pyramid (wider early, narrower late) works well for tabular data
# Depth helps with complex feature interactions
# Width helps with representing many features simultaneously

MLP for Different Output Types

Python

import torch.nn as nn

class FlexibleMLP(nn.Module):
    """MLP with configurable output for different tasks."""
    
    def __init__(
        self,
        n_features: int,
        n_outputs: int,
        task: str = "binary",
        hidden_dims: list[int] = None,
    ):
        super().__init__()
        hidden_dims = hidden_dims or [128, 64]
        
        dims = [n_features] + hidden_dims
        backbone = []
        for in_d, out_d in zip(dims[:-1], dims[1:]):
            backbone.extend([nn.Linear(in_d, out_d), nn.ReLU(), nn.Dropout(0.2)])
        self.backbone = nn.Sequential(*backbone)
        self.head = nn.Linear(hidden_dims[-1], n_outputs)
        self.task = task
    
    def forward(self, x):
        features = self.backbone(x)
        logits = self.head(features)
        
        if self.task == "binary":
            return logits           # use BCEWithLogitsLoss
        elif self.task == "multiclass":
            return logits           # use CrossEntropyLoss
        elif self.task == "regression":
            return logits           # use MSELoss
        elif self.task == "multilabel":
            return logits           # use BCEWithLogitsLoss per output
        return logits

# Binary: patient readmitted? → (batch, 1) logit
# Pair with BCEWithLogitsLoss.
binary_model    = FlexibleMLP(20, 1, task="binary")

# Multi-class: severity level 1–5? → (batch, 5) logits
# Pair with CrossEntropyLoss, which expects class indices 0–4 as targets.
multiclass_model = FlexibleMLP(20, 5, task="multiclass")

# Regression: predict INR value → (batch, 1) value
# Pair with MSELoss.
regression_model = FlexibleMLP(20, 1, task="regression")

# Multi-label: which conditions present? → (batch, n_conditions) logits
# Here n_conditions = 10; pair with BCEWithLogitsLoss (one 0/1 target per output).
multilabel_model = FlexibleMLP(20, 10, task="multilabel")

Training Template

Python

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

def train_mlp(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    n_epochs: int = 50,
    lr: float = 3e-4,
) -> nn.Module:
    """Train a binary classifier and return it with its best validation weights.

    Recipe: AdamW (weight decay 1e-4), BCEWithLogitsLoss, cosine LR annealing
    over ``n_epochs``, and gradient-norm clipping at 1.0. After each epoch the
    sample-weighted mean validation loss is computed, and the weights with the
    lowest value seen so far are kept and restored at the end.

    Args:
        model: Network producing one logit per sample, shaped ``(batch, 1)``
            or ``(batch,)``.
        train_loader: Yields ``(X, y)`` batches used for optimisation.
        val_loader: Yields ``(X, y)`` batches used for model selection.
        n_epochs: Number of full passes over ``train_loader``.
        lr: Initial learning rate for AdamW.

    Returns:
        The same model (moved to the selected device) carrying the weights of
        the best validation epoch. If no epoch produced a finite improvement
        (for example ``n_epochs=0``), the current weights are left in place.

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    import copy  # imported once per call instead of inside the epoch loop

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss()
    # max() only matters for the degenerate n_epochs=0 call; it keeps T_max positive.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max(n_epochs, 1))

    def batch_loss(X: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """Move one batch to the device and return its mean BCE loss."""
        X, y = X.to(device), y.to(device)
        logits = model(X)
        # Drop only the trailing singleton dim. A bare .squeeze() would also
        # collapse the batch dim of a one-sample batch and break the loss's
        # input/target shape check.
        if logits.dim() == y.dim() + 1 and logits.size(-1) == 1:
            logits = logits.squeeze(-1)
        # BCEWithLogitsLoss needs floating-point targets; integer 0/1 labels
        # are cast to the logits' dtype.
        return criterion(logits, y.to(logits.dtype))

    best_val_loss = float("inf")
    best_weights = None

    for epoch in range(n_epochs):
        # Training
        model.train()
        for X, y in train_loader:
            optimizer.zero_grad()
            loss = batch_loss(X, y)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # Validation: weight each batch by its size so that a smaller final
        # batch does not skew the epoch mean.
        model.eval()
        loss_sum = 0.0
        n_samples = 0
        with torch.no_grad():
            for X, y in val_loader:
                batch_size = X.shape[0]
                loss_sum += batch_loss(X, y).item() * batch_size
                n_samples += batch_size
        if n_samples == 0:
            raise ValueError("val_loader yielded no samples; cannot compute validation loss")
        val_loss = loss_sum / n_samples

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # deepcopy: state_dict() returns references to the live tensors.
            best_weights = copy.deepcopy(model.state_dict())

        scheduler.step()

        if epoch % 10 == 0:
            print(f"Epoch {epoch:3d}: val_loss={val_loss:.4f}")

    # best_weights stays None when no epoch ran or every validation loss was NaN.
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return model

Interview Answer

"An MLP (Multi-Layer Perceptron) is a fully-connected feedforward network: input → hidden layers → output. Each layer computes Z = X @ W.T + b then applies a non-linear activation (ReLU for hidden layers). Parameter count: Linear(in, out) has in×out + out parameters. In PyTorch, build with nn.Sequential or subclass nn.Module. Key design choices: (1) Width (neurons per layer) vs depth (number of layers) — for tabular clinical data, a pyramid (wider early) with 2–4 hidden layers typically works well; (2) Dropout in hidden layers for regularisation, optionally combined with BatchNorm to stabilise training; (3) Kaiming init for ReLU activations. The output layer has no activation — it returns raw logits (or raw values for regression) — so use BCEWithLogitsLoss for binary, CrossEntropyLoss for multi-class, MSE for regression. The full recipe: AdamW optimiser, cosine LR schedule, gradient clipping at norm 1.0, and restoring the checkpoint with the best validation loss (early stopping can be added on top)."

What an MLP Is

Building an MLP in PyTorch

Parameter Counting

Depth vs Width

MLP for Different Output Types

Training Template

Interview Answer

Enjoyed this article?

Leave a comment