Machine Learning Foundations · Lesson 68 of 70
Data Drift and Concept Drift
Two Types of Drift
Data drift (covariate shift):
P(X) changes — the input distribution shifts
The relationship between X and Y stays the same
Example: A new hospital site joins the network; their patients are younger
and have fewer comorbidities than the training population.
Effect: Model predictions shift; calibration may be off; performance may drop.
Concept drift:
P(Y|X) changes — the relationship between inputs and outcome changes
The input distribution may be unchanged
Example: COVID-19 changes the readmission dynamics — patients who would have
been discharged normally are now kept longer, changing who gets readmitted.
Effect: Model becomes systematically wrong in a new direction.
Detecting Data Drift
Population Stability Index (PSI)
import numpy as np
def population_stability_index(expected: np.ndarray, actual: np.ndarray, bins: int = 10) -> float:
    """
    Compute the Population Stability Index (PSI) between two samples.

    PSI measures distribution shift between two populations.
    PSI < 0.10: no significant change
    PSI 0.10 - 0.25: minor change, monitor
    PSI > 0.25: major change, investigate

    Args:
        expected: Reference sample (e.g. training data). Multi-dimensional
            input is flattened.
        actual: Sample to compare against the reference. Multi-dimensional
            input is flattened.
        bins: Number of equal-width bins derived from ``expected``.

    Returns:
        The PSI as a plain Python float.

    Raises:
        ValueError: If either sample is empty (proportions are undefined).
    """
    # Flatten up front so the proportions below are divided by the number of
    # values actually histogrammed (np.histogram flattens its input, whereas
    # len() only reports the first dimension).
    expected = np.asarray(expected, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()
    if expected.size == 0 or actual.size == 0:
        raise ValueError("expected and actual must each contain at least one value")

    # Create bins from the expected distribution; open the outer edges so
    # values of `actual` outside the reference range are still counted.
    _, bin_edges = np.histogram(expected, bins=bins)
    bin_edges[0] = -np.inf
    bin_edges[-1] = np.inf

    expected_counts = np.histogram(expected, bins=bin_edges)[0]
    actual_counts = np.histogram(actual, bins=bin_edges)[0]

    # Convert to proportions (add small epsilon to avoid log(0))
    expected_pct = (expected_counts + 1e-6) / expected.size
    actual_pct = (actual_counts + 1e-6) / actual.size

    # PSI = Σ (actual% - expected%) × ln(actual% / expected%)
    psi = np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct))
    return float(psi)
# Clinical example: PSI of serum creatinine, training sample vs. current sample
creatinine_train = 0.5 + 5 * np.random.beta(2, 8, 1000)  # training population
creatinine_current = 0.5 + 5 * np.random.beta(3, 5, 500)  # current (shifted)

psi = population_stability_index(creatinine_train, creatinine_current)
print(f"PSI for serum_creatinine: {psi:.4f}")

# Walk the cutoffs from most to least severe; the first one exceeded wins.
verdict = "No significant shift ✓"
for cutoff, message in (
    (0.25, "Major distribution shift detected — investigate data pipeline and retrain"),
    (0.10, "Minor shift detected — monitor closely"),
):
    if psi > cutoff:
        verdict = message
        break
print(verdict)

# Kolmogorov-Smirnov Test
from scipy.stats import ks_2samp
import numpy as np
def detect_drift_ks(X_train: np.ndarray, X_new: np.ndarray,
                    feature_names: list, alpha: float = 0.01) -> list:
    """
    Run a two-sample KS test per feature column and report the drifted ones.

    Column ``i`` of ``X_train`` is compared with column ``i`` of ``X_new`` for
    every entry ``i`` of ``feature_names``. A feature is reported when its
    p-value is below ``alpha``.

    Returns:
        A list of dicts with keys ``"feature"``, ``"ks_stat"`` and
        ``"p_value"``, ordered by KS statistic from largest to smallest.
    """
    per_feature = (
        (name, ks_2samp(X_train[:, col], X_new[:, col]))
        for col, name in enumerate(feature_names)
    )
    flagged = [
        {"feature": name, "ks_stat": ks_stat, "p_value": p_value}
        for name, (ks_stat, p_value) in per_feature
        if p_value < alpha
    ]
    # Strongest shift first.
    flagged.sort(key=lambda row: row["ks_stat"], reverse=True)
    return flagged
# Report at most the five features with the largest KS statistic.
drifted_features = detect_drift_ks(X_train, X_production, feature_names, alpha=0.01)
if drifted_features:
    print("Features with significant drift:")
    strongest = drifted_features[:5]
    for entry in strongest:
        name = entry["feature"]
        ks_stat = entry["ks_stat"]
        p_value = entry["p_value"]
        print(f" {name}: KS={ks_stat:.3f}, p={p_value:.4f}")

# Detecting Concept Drift
# Concept drift is harder to detect: it requires ground-truth labels,
# which arrive with a delay (30 days for readmission models)
# Method 1: rolling AUC — watch for degradation in labeled recent predictions
from sklearn.metrics import roc_auc_score
from collections import deque
import numpy as np
class ConceptDriftDetector:
def __init__(self, window_size: int = 200):
self.predictions = deque(maxlen=window_size)
self.labels = deque(maxlen=window_size)
self.timestamps = deque(maxlen=window_size)
def add_labeled_prediction(self, prediction: float, label: int, timestamp):
self.predictions.append(prediction)
self.labels.append(label)
self.timestamps.append(timestamp)
def get_rolling_auc(self) -> float | None:
if len(self.labels) < 50:
return None
try:
return roc_auc_score(list(self.labels), list(self.predictions))
except Exception:
return None
def check_drift(self, baseline_auc: float, threshold: float = 0.05) -> dict:
rolling = self.get_rolling_auc()
if rolling is None:
return {"status": "insufficient_data"}
drop = baseline_auc - rolling
return {
"baseline_auc": baseline_auc,
"rolling_auc": rolling,
"drop": drop,
"drift_detected": drop > threshold,
}
# Method 2: monitor label rate
# If readmission rate changes substantially, concept drift is likely
def check_label_rate_drift(recent_labels: list, training_rate: float,
                           z_threshold: float = 3.0) -> bool:
    """
    Flag a significant change in the positive-label rate.

    Computes a z-score for the recent positive rate against ``training_rate``
    using the binomial standard deviation, prints the rates and the z-score,
    and reports whether the z-score exceeds ``z_threshold``.

    Args:
        recent_labels: Recent binary labels.
        training_rate: Positive rate observed in training, in [0, 1].
        z_threshold: Absolute z-score above which drift is reported.

    Returns:
        True if the rate change is significant. False when ``recent_labels``
        is empty, since no rate can be estimated.

    Raises:
        ValueError: If ``training_rate`` is outside [0, 1].
    """
    if not 0.0 <= training_rate <= 1.0:
        raise ValueError("training_rate must be between 0 and 1")

    n = len(recent_labels)
    if n == 0:
        # No labels yet: nothing to compare, and the rate would be NaN.
        return False

    recent_rate = float(np.mean(recent_labels))
    # Binomial standard deviation
    std = float(np.sqrt(training_rate * (1 - training_rate) / n))
    deviation = abs(recent_rate - training_rate)
    if std > 0:
        z = deviation / std
    else:
        # training_rate is exactly 0 or 1: the expected spread is zero, so any
        # deviation at all is infinitely surprising and none is no drift.
        z = float("inf") if deviation > 0 else 0.0

    print(f"Training rate: {training_rate:.3f}, Recent rate: {recent_rate:.3f}, z={z:.2f}")
    return bool(z > z_threshold)

# Gradual vs Sudden Drift
# Gradual drift: slow shift over weeks/months (seasonal patterns, demographic shift)
# → Detect with PSI on rolling windows
# → Response: retrain periodically, use sliding window training data
# Sudden drift: abrupt change (protocol change, new patient population, COVID-19)
# → Detect with CUSUM or control charts on prediction mean
# → Response: retrain immediately with recent data
class CUSUMDriftDetector:
    """
    Two-sided CUSUM (Cumulative Sum) control chart for detecting sudden drift.

    Maintains one cumulative sum for upward deviations from ``target_mean``
    and one for downward deviations. ``k`` and ``h`` are compared directly
    against those deviations, so they are in the units of the observations.
    """

    def __init__(self, target_mean: float, k: float = 0.5, h: float = 5.0):
        # Both cumulative sums start at zero.
        self.S_pos = 0.0
        self.S_neg = 0.0
        self.target_mean = target_mean
        self.k = k  # slack parameter
        self.h = h  # alert threshold

    def update(self, observation: float) -> bool:
        """Fold in one observation; return True if either sum exceeds ``h``."""
        deviation = observation - self.target_mean
        upward = self.S_pos + deviation - self.k
        downward = self.S_neg - deviation - self.k
        # Each sum is clamped at zero so it only accumulates excess beyond k.
        self.S_pos = upward if upward > 0 else 0
        self.S_neg = downward if downward > 0 else 0
        return max(self.S_pos, self.S_neg) > self.h
# Usage: monitor prediction mean (or AUC) for sudden shifts.
# k and h are compared directly against (observation - target_mean), so they
# must be in the units of the monitored statistic. With the class defaults
# (k=0.5, h=5.0) and daily mean probabilities around 0.15, the downward sum
# can never grow and the upward sum grows only when a daily mean exceeds 0.65,
# so realistic shifts would go undetected. Scale both by the day-to-day
# standard deviation of the statistic instead.
baseline_std = 0.02  # placeholder: estimate from a stable reference period
detector = CUSUMDriftDetector(
    target_mean=0.15,      # expected mean prediction
    k=0.5 * baseline_std,  # slack: tolerate deviations up to half a std
    h=5.0 * baseline_std,  # alert once accumulated excess passes five std
)
for day_prediction_mean in recent_daily_means:
    if detector.update(day_prediction_mean):
        print(f"CUSUM drift alert: mean={day_prediction_mean:.3f}")
        break

# Responding to Drift
# Decision matrix: for each drift scenario, what it looks like, how to
# respond, and the concrete action to take.
response_guide = {
    "data_drift_only": {
        "situation": "P(X) shifted, but model performance still OK",
        "response": "Monitor closely; recalibrate if needed; flag OOD predictions",
        "action": "Log warning, increase monitoring frequency",
    },
    "data_drift_with_perf_drop": {
        "situation": "P(X) shifted AND AUC dropped",
        "response": "Retrain on combined old+recent data, or just recent data",
        "action": "Schedule retraining, consider domain adaptation",
    },
    "concept_drift": {
        "situation": "P(Y|X) changed — model is systematically wrong",
        "response": "Retrain with recent labeled data (old data may be harmful)",
        "action": "Retrain on recent window only; validate new model before deployment",
    },
    "label_rate_drift": {
        "situation": "Positive rate in production changed significantly",
        "response": "Recalibrate threshold; check for labeling changes",
        "action": "Investigate ground truth pipeline; adjust threshold",
    },
}

# Print each scenario name followed by its fields, one per line.
for drift_type in response_guide:
    print(f"\n{drift_type}:")
    details = response_guide[drift_type]
    print("\n".join(f" {field}: {text}" for field, text in details.items()))

# Interview Answer Template
Q: What's the difference between data drift and concept drift?
Data drift (covariate shift) means the input distribution P(X) has changed — for example, a new hospital system starts using the model and their patients are older and sicker than the training population. The underlying relationship between features and readmission may be unchanged, but the model is now operating on inputs it hasn't seen. Concept drift means P(Y|X) has changed — the relationship between inputs and outcome has shifted. COVID-19 disrupting normal readmission patterns is a classic example: the same patient profile that predicted readmission before now has different dynamics. Data drift is detectable without labels (PSI, KS test on feature distributions), while concept drift requires ground truth labels and manifests as AUC degradation in rolling windows. The responses differ: for data drift, recalibration or retraining on mixed data may suffice; for concept drift, the old training data may actually be harmful and retraining on a recent window only is more appropriate.