Statistics & Math for AI/ML Interviews · Lesson 19 of 30
Bayes in AI Systems
Bayesian vs Frequentist Thinking
Frequentist:
Probability = long-run frequency of events
Parameters are fixed, unknown constants
Uncertainty comes from sampling variation
Tools: p-values, confidence intervals, MLE
Bayesian:
Probability = degree of belief
Parameters have distributions (they're uncertain)
Uncertainty comes from limited data and prior uncertainty
Tools: priors, posteriors, credible intervals, MCMC
Both are valid frameworks. Bayesian is more natural for:
- Incorporating prior knowledge
- Quantifying uncertainty about parameters
- Updating beliefs sequentially (online learning)
- Making decisions under uncertainty
Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
import numpy as np
# Gaussian Naive Bayes: P(feature | class) ~ Normal(μ, σ²)
# Assumes features are normally distributed within each class and, as the
# product in the formula below shows, that features are conditionally
# independent given the class (the "naive" part).
# NOTE: X_train, y_train and X_test are not defined in this snippet; they are
# assumed to be prepared by an earlier data-loading step.
gnb = GaussianNB(
# Class priors are supplied by hand instead of being left to the estimator.
# NOTE(review): presumably ordered like gnb.classes_ — confirm for your labels.
priors=[0.3, 0.7], # P(class=0) = 0.3, P(class=1) = 0.7
)
gnb.fit(X_train, y_train)
# Internally:
# P(class=k | x) ∝ P(class=k) × Π N(xᵢ; μᵢₖ, σᵢₖ²)
# gnb.theta_: learned μᵢₖ for each feature i and class k
# gnb.var_: learned σᵢₖ² for each feature i and class k
# One row of class probabilities per test sample; column k belongs to class k.
proba = gnb.predict_proba(X_test) # shape: (n_samples, n_classes)
print(f"P(class=0): {proba[0, 0]:.3f}, P(class=1): {proba[0, 1]:.3f}")
Bayesian Hyperparameter Optimisation
from bayes_opt import BayesianOptimization # pip install bayesian-optimization
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
def objective(n_estimators, max_depth, min_samples_split):
    """Score one random-forest configuration by mean 3-fold ROC AUC.

    Each hyperparameter is converted with ``int`` (truncation toward zero)
    before it is passed to the forest, so real-valued proposals are accepted.
    The data comes from the module-level ``X_train`` and ``y_train``.

    Returns:
        The mean of the three cross-validation ROC AUC scores.
    """
    forest_params = {
        "n_estimators": int(n_estimators),
        "max_depth": int(max_depth),
        "min_samples_split": int(min_samples_split),
    }
    # Fixed seed: the same configuration always yields the same forest.
    forest = RandomForestClassifier(random_state=42, **forest_params)
    fold_auc = cross_val_score(
        forest,
        X_train,
        y_train,
        cv=3,
        scoring="roc_auc",
    )
    return fold_auc.mean()
# Bayesian optimisation maintains a Gaussian Process surrogate model
# Prior: GP over the objective function surface
# Update: after each evaluation, posterior GP is updated with new (params, score) point
# Acquisition function: guides where to evaluate next (balance explore vs exploit)
optimizer = BayesianOptimization(
f=objective,
# Search ranges are continuous (low, high) pairs; objective() converts each
# proposed value to an integer with int() before building the forest.
pbounds={
"n_estimators": (50, 500),
"max_depth": (3, 20),
"min_samples_split": (2, 20),
},
# Fixed seed so the sequence of proposed points is repeatable.
random_state=42,
)
# 5 initial points, then 25 further iterations. Every evaluation of objective()
# runs a 3-fold cross-validation, so each point costs three forest fits.
optimizer.maximize(init_points=5, n_iter=25)
print(f"Best params: {optimizer.max['params']}")
Gaussian Processes
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel
# Gaussian Process: Bayesian regression that outputs a distribution over functions
# Prior: all smooth functions consistent with the kernel
# Posterior: functions consistent with the kernel AND the training data
# NOTE: X_train, y_train and X_test are not defined in this snippet; they are
# assumed to be prepared by an earlier step.
# Kernel = constant scale factor × RBF; 1.0 and length_scale=1.0 are the
# starting values handed to the regressor.
kernel = ConstantKernel(1.0) * RBF(length_scale=1.0)
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)
gp.fit(X_train, y_train)
# Predict: returns mean AND standard deviation (uncertainty)
y_pred_mean, y_pred_std = gp.predict(X_test, return_std=True)
# 95% credible interval for each prediction
# (mean ± 1.96 standard deviations covers the central 95% of a Gaussian)
lower = y_pred_mean - 1.96 * y_pred_std
upper = y_pred_mean + 1.96 * y_pred_std
# In clinical AI: high uncertainty → flag for human review
# The 0.3 cut-off is in the units of y and is illustrative; choose it per task.
# NOTE(review): boolean-mask indexing assumes X_test is a NumPy array — confirm.
uncertain_cases = X_test[y_pred_std > 0.3]
print(f"High-uncertainty cases: {len(uncertain_cases)} / {len(X_test)}")
Monte Carlo Dropout: Bayesian Neural Networks
import torch
import torch.nn as nn
class BayesianMLP(nn.Module):
    """Single-hidden-layer MLP with dropout before the output layer.

    Layer order inside ``self.net``: Linear → ReLU → Dropout → Linear.
    The forward pass returns the raw output of the last linear layer; no
    activation is applied to it here.
    """

    def __init__(self, d_in: int, d_hidden: int, d_out: int, dropout_p: float = 0.1):
        """Build the network.

        Args:
            d_in: Size of each input vector.
            d_hidden: Width of the hidden layer.
            d_out: Size of each output vector.
            dropout_p: Dropout probability applied to the hidden activations.
        """
        super().__init__()
        # The two linear layers are created in input-to-output order.
        hidden_layer = nn.Linear(d_in, d_hidden)
        output_layer = nn.Linear(d_hidden, d_out)
        self.net = nn.Sequential(
            hidden_layer,
            nn.ReLU(),
            nn.Dropout(dropout_p),
            output_layer,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the layer stack and return its output."""
        output = self.net(x)
        return output
def mc_dropout_predict(
    model: nn.Module,
    x: torch.Tensor,
    n_samples: int = 100,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Estimate predictive mean and spread with Monte Carlo dropout.

    Runs ``n_samples`` stochastic forward passes with dropout switched on,
    applies a sigmoid to every output, and summarises the passes element-wise.

    Only dropout layers are put in training mode; every other layer (for
    example batch normalisation) runs in eval mode, so its running statistics
    are not modified by inference. If the model has no dropout submodules
    (e.g. it calls functional dropout keyed on ``self.training``), the whole
    model is put in training mode as a fallback. In both cases the original
    train/eval state of every submodule is restored before returning, and no
    autograd graph is recorded.

    Args:
        model: Network to sample from.
        x: Input batch, passed unchanged to ``model``.
        n_samples: Number of stochastic forward passes. Must be at least 1;
            with a single pass the sample standard deviation is NaN.

    Returns:
        ``(mean, std)`` of the sigmoid outputs across the passes, each shaped
        like one ``model(x)`` output.

    Raises:
        ValueError: If ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")

    dropout_types = (
        nn.Dropout,
        nn.Dropout1d,
        nn.Dropout2d,
        nn.Dropout3d,
        nn.AlphaDropout,
        nn.FeatureAlphaDropout,
    )
    # Snapshot each submodule's mode so it can be restored exactly afterwards.
    previous_modes = {module: module.training for module in model.modules()}
    dropout_layers = [m for m in previous_modes if isinstance(m, dropout_types)]

    try:
        if dropout_layers:
            model.eval()
            for layer in dropout_layers:
                layer.train()  # keep dropout ACTIVE at inference
        else:
            # No dropout modules found: fall back to full training mode.
            model.train()

        # Inference only: skip autograd bookkeeping for all passes.
        with torch.no_grad():
            predictions = torch.stack([
                torch.sigmoid(model(x))
                for _ in range(n_samples)
            ])
    finally:
        for module, was_training in previous_modes.items():
            module.training = was_training

    mean = predictions.mean(dim=0)
    std = predictions.std(dim=0)
    return mean, std
# High std → high epistemic uncertainty → model doesn't know → flag for review
mean_pred, uncertainty = mc_dropout_predict(model, X_test_tensor)
Bayesian A/B Testing
from scipy.stats import beta
import numpy as np
class BayesianABTest:
    """Beta-Binomial model of a two-variant ("A" vs "B") conversion test.

    Each variant's conversion rate has a Beta distribution. Observing
    ``conversions`` successes in ``trials`` attempts adds the successes to
    that variant's alpha and the failures to its beta.

    Attributes are ``alpha_a``, ``beta_a``, ``alpha_b`` and ``beta_b``: the
    current Beta parameters of variants A and B.
    """

    _VARIANTS = ("A", "B")

    def __init__(self, alpha_prior: float = 1.0, beta_prior: float = 1.0):
        """Beta(1,1) = uniform prior over conversion rate.

        Args:
            alpha_prior: Prior alpha shared by both variants; must be > 0.
            beta_prior: Prior beta shared by both variants; must be > 0.

        Raises:
            ValueError: If either prior parameter is not strictly positive.
        """
        if alpha_prior <= 0 or beta_prior <= 0:
            raise ValueError(
                "Beta prior parameters must be > 0, "
                f"got alpha_prior={alpha_prior}, beta_prior={beta_prior}"
            )
        self.alpha_a = alpha_prior
        self.beta_a = beta_prior
        self.alpha_b = alpha_prior
        self.beta_b = beta_prior

    def update(self, variant: str, conversions: int, trials: int) -> None:
        """Fold a batch of observations into one variant's posterior.

        Args:
            variant: Exactly ``"A"`` or ``"B"``.
            conversions: Number of successes in the batch.
            trials: Number of attempts in the batch.

        Raises:
            ValueError: If ``variant`` is unknown, or unless
                ``0 <= conversions <= trials``.
        """
        # Reject unknown labels instead of silently crediting them to B.
        if variant not in self._VARIANTS:
            raise ValueError(f"variant must be 'A' or 'B', got {variant!r}")
        # Out-of-range counts would drive a Beta parameter negative.
        if not 0 <= conversions <= trials:
            raise ValueError(
                "expected 0 <= conversions <= trials, "
                f"got conversions={conversions}, trials={trials}"
            )
        failures = trials - conversions
        if variant == "A":
            self.alpha_a += conversions
            self.beta_a += failures
        else:
            self.alpha_b += conversions
            self.beta_b += failures

    def probability_b_beats_a(
        self,
        n_samples: int = 10_000,
        random_state: "int | np.random.Generator | None" = None,
    ) -> float:
        """P(B is better than A) by sampling from posteriors.

        Args:
            n_samples: Number of Monte Carlo draws per variant; must be >= 1.
            random_state: Seed or ``numpy.random.Generator`` for reproducible
                draws. ``None`` keeps SciPy's default random source.

        Returns:
            Fraction of paired draws in which B's rate exceeds A's.

        Raises:
            ValueError: If ``n_samples`` is smaller than 1.
        """
        if n_samples < 1:
            raise ValueError(f"n_samples must be >= 1, got {n_samples}")
        # One generator feeds both draws, so A and B samples are independent
        # (seeding each draw with the same integer would correlate them).
        rng = None if random_state is None else np.random.default_rng(random_state)
        samples_a = beta(self.alpha_a, self.beta_a).rvs(n_samples, random_state=rng)
        samples_b = beta(self.alpha_b, self.beta_b).rvs(n_samples, random_state=rng)
        return float((samples_b > samples_a).mean())
# Start both variants from the default Beta(1, 1) prior.
test = BayesianABTest()
# Each update adds conversions to alpha and (trials - conversions) to beta.
test.update("A", conversions=120, trials=1000) # 12% conversion → posterior Beta(121, 881)
test.update("B", conversions=135, trials=1000) # 13.5% conversion → posterior Beta(136, 866)
# Monte Carlo estimate from 10,000 draws per variant (the method's default).
p_b_wins = test.probability_b_beats_a()
print(f"P(B beats A) = {p_b_wins:.4f}") # ≈ 0.84 (Monte Carlo estimate; varies slightly between runs)
Interview Answer
"Bayesian thinking pervades ML: Naive Bayes classifiers apply Bayes' theorem directly; Bayesian hyperparameter optimisation uses Gaussian Processes to model the objective surface and guide the search (every evaluation updates the posterior over good hyperparameter regions); Gaussian Processes provide full predictive distributions for uncertainty-aware regression; MC Dropout approximates Bayesian neural networks by treating dropout at inference as sampling from an approximate weight posterior; and Bayesian A/B testing gives the probability that one variant beats another rather than a significant/not-significant verdict from a p-value threshold. The common thread: uncertainty is quantified explicitly as a probability distribution, not a binary pass/fail."