Machine Learning Foundations · Lesson 68 of 70

Data Drift and Concept Drift

Two Types of Drift

Data drift (covariate shift):
  P(X) changes — the input distribution shifts
  P(Y|X) — the relationship between X and Y — stays the same
  
  Example: A new hospital site joins the network; their patients are younger
           and have fewer comorbidities than the training population.
  Effect:  Model predictions shift; calibration may be off; performance may drop.

Concept drift:
  P(Y|X) changes — the relationship between inputs and outcome changes
  The input distribution may be unchanged
  
  Example: COVID-19 changes the readmission dynamics — patients who would have
           been discharged normally are now kept longer, changing who gets readmitted.
  Effect:  Model becomes systematically wrong in a new direction.

Detecting Data Drift

Population Stability Index (PSI)

Python

import numpy as np

def population_stability_index(expected: np.ndarray, actual: np.ndarray, bins: int = 10) -> float:
    """
    Compute the Population Stability Index (PSI) between two samples.

    PSI measures distribution shift between a reference ("expected") sample
    and a comparison ("actual") sample of a single numeric feature.
    PSI < 0.10: no significant change
    PSI 0.10 - 0.25: minor change, monitor
    PSI > 0.25: major change, investigate

    Args:
        expected: Reference values (e.g. training data). Any shape; flattened.
        actual: Comparison values (e.g. current data). Any shape; flattened.
        bins: Number of equal-width bins, derived from the range of ``expected``.

    Returns:
        The PSI as a Python float. It is 0.0 when both samples fall into the
        bins in identical proportions.

    Raises:
        ValueError: If either sample is empty.
    """
    # np.histogram flattens its input, so the proportions below must be based
    # on the flattened size rather than on len() of the first axis.
    expected = np.asarray(expected).ravel()
    actual = np.asarray(actual).ravel()
    if expected.size == 0 or actual.size == 0:
        raise ValueError("expected and actual must each contain at least one value")

    # Create bins from the expected distribution, then open the outer edges so
    # that values outside the expected range still land in the first/last bin.
    _, bin_edges = np.histogram(expected, bins=bins)
    bin_edges[0] = -np.inf
    bin_edges[-1] = np.inf

    expected_counts = np.histogram(expected, bins=bin_edges)[0]
    actual_counts = np.histogram(actual, bins=bin_edges)[0]

    # Convert to proportions (add small epsilon to avoid log(0))
    expected_pct = (expected_counts + 1e-6) / expected.size
    actual_pct = (actual_counts + 1e-6) / actual.size

    # PSI = Σ (actual% - expected%) × ln(actual% / expected%)
    psi = np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct))
    return float(psi)

# Clinical example: serum creatinine distribution
# Two synthetic samples are drawn from scaled Beta distributions; the second
# uses different shape parameters to imitate a shifted population.
creatinine_train = 0.5 + 5 * np.random.beta(2, 8, 1000)     # training population
creatinine_current = 0.5 + 5 * np.random.beta(3, 5, 500)    # current (shifted)

psi = population_stability_index(creatinine_train, creatinine_current)
print(f"PSI for serum_creatinine: {psi:.4f}")

# Translate the score into one of the three rule-of-thumb bands.
if psi > 0.25:
    verdict = "Major distribution shift detected — investigate data pipeline and retrain"
elif psi > 0.10:
    verdict = "Minor shift detected — monitor closely"
else:
    verdict = "No significant shift ✓"
print(verdict)

Kolmogorov-Smirnov Test

Python

from scipy.stats import ks_2samp
import numpy as np

def detect_drift_ks(X_train: np.ndarray, X_new: np.ndarray,
                    feature_names: list, alpha: float = 0.01) -> list:
    """
    Screen each feature column for a distribution change using the
    two-sample Kolmogorov-Smirnov test.

    Column ``i`` of ``X_train`` is compared with column ``i`` of ``X_new``
    and reported under ``feature_names[i]``.

    Returns a list of dicts (keys ``feature``, ``ks_stat``, ``p_value``), one
    per feature whose p-value is below ``alpha``, ordered from the largest
    KS statistic to the smallest.
    """
    flagged = []
    for col, name in enumerate(feature_names):
        statistic, p_value = ks_2samp(X_train[:, col], X_new[:, col])
        # Keep only features whose shift is significant at level alpha.
        if not p_value < alpha:
            continue
        flagged.append(dict(feature=name, ks_stat=statistic, p_value=p_value))

    # Strongest drift first.
    flagged.sort(key=lambda row: row["ks_stat"], reverse=True)
    return flagged

# Screen production data against the training data and print up to five of
# the flagged features (detect_drift_ks returns them strongest-first).
drifted_features = detect_drift_ks(X_train, X_production, feature_names, alpha=0.01)
if drifted_features:
    print("Features with significant drift:")
    for entry in drifted_features[:5]:
        feature, ks_stat, p_value = entry["feature"], entry["ks_stat"], entry["p_value"]
        print(f"  {feature}: KS={ks_stat:.3f}, p={p_value:.4f}")

Detecting Concept Drift

Python

# Concept drift is harder to detect: it requires ground-truth labels,
# which arrive with a delay (e.g. 30 days for readmission models)

# Method 1: rolling AUC — watch for degradation in labeled recent predictions
from sklearn.metrics import roc_auc_score
from collections import deque
import numpy as np

class ConceptDriftDetector:
    def __init__(self, window_size: int = 200):
        self.predictions = deque(maxlen=window_size)
        self.labels      = deque(maxlen=window_size)
        self.timestamps  = deque(maxlen=window_size)

    def add_labeled_prediction(self, prediction: float, label: int, timestamp):
        self.predictions.append(prediction)
        self.labels.append(label)
        self.timestamps.append(timestamp)

    def get_rolling_auc(self) -> float | None:
        if len(self.labels) < 50:
            return None
        try:
            return roc_auc_score(list(self.labels), list(self.predictions))
        except Exception:
            return None

    def check_drift(self, baseline_auc: float, threshold: float = 0.05) -> dict:
        rolling = self.get_rolling_auc()
        if rolling is None:
            return {"status": "insufficient_data"}
        drop = baseline_auc - rolling
        return {
            "baseline_auc": baseline_auc,
            "rolling_auc":  rolling,
            "drop":         drop,
            "drift_detected": drop > threshold,
        }

# Method 2: monitor label rate
# If the readmission rate changes substantially, concept drift is one likely cause
# (a change in case mix or in how outcomes are recorded can also move the rate)
def check_label_rate_drift(recent_labels: list, training_rate: float,
                           z_threshold: float = 3.0) -> bool:
    """
    Test whether the recent positive-label rate differs from the training rate.

    The recent rate is compared with ``training_rate`` using a z-score based
    on the binomial standard deviation for ``len(recent_labels)`` samples.
    A one-line summary of the rates and the z-score is printed.

    Args:
        recent_labels: Recent ground-truth labels; their mean is the recent rate.
        training_rate: Positive rate observed in training, between 0 and 1.
        z_threshold: Drift is flagged when the absolute z-score exceeds this.

    Returns:
        True if the absolute z-score is greater than ``z_threshold``.

    Raises:
        ValueError: If ``recent_labels`` is empty or ``training_rate`` is
            outside [0, 1].
    """
    n = len(recent_labels)
    if n == 0:
        raise ValueError("recent_labels must contain at least one label")
    if not 0.0 <= training_rate <= 1.0:
        raise ValueError("training_rate must be between 0 and 1")

    recent_rate = float(np.mean(recent_labels))
    # Binomial standard deviation
    std = float(np.sqrt(training_rate * (1 - training_rate) / n))
    if std > 0:
        z = abs(recent_rate - training_rate) / std
    else:
        # training_rate is exactly 0 or 1, so the expected variance is zero:
        # any deviation at all is infinitely surprising, no deviation is none.
        z = 0.0 if recent_rate == training_rate else float("inf")
    print(f"Training rate: {training_rate:.3f}, Recent rate: {recent_rate:.3f}, z={z:.2f}")
    return bool(z > z_threshold)

Gradual vs Sudden Drift

Python

# Gradual drift: slow shift over weeks/months (seasonal patterns, demographic shift)
# → Detect with PSI on rolling windows
# → Response: retrain periodically, use sliding window training data

# Sudden drift: abrupt change (protocol change, new patient population, COVID-19)
# → Detect with CUSUM or control charts on prediction mean
# → Response: retrain immediately with recent data

class CUSUMDriftDetector:
    """
    CUSUM (Cumulative Sum) control chart for detecting sudden drift.

    Two one-sided sums are maintained: ``S_pos`` accumulates deviations above
    ``target_mean`` and ``S_neg`` accumulates deviations below it. On every
    update each sum is reduced by the slack ``k`` and floored at zero; drift
    is reported once either sum exceeds ``h``.

    Deviations are divided by ``sigma`` before accumulation, so ``k`` and
    ``h`` are expressed in units of ``sigma``. With the default ``sigma=1.0``
    they are in the raw units of the observations; for small-scale signals
    such as mean predicted probabilities, pass the in-control standard
    deviation of the monitored statistic so the default ``k`` and ``h`` are
    meaningful.
    """

    def __init__(self, target_mean: float, k: float = 0.5, h: float = 5.0,
                 sigma: float = 1.0):
        """
        Args:
            target_mean: Expected in-control value of the monitored statistic.
            k: Slack subtracted from each deviation, in units of ``sigma``.
            h: Alert threshold for the cumulative sums, in units of ``sigma``.
            sigma: Scale used to standardize deviations; must be positive.

        Raises:
            ValueError: If ``sigma`` is not positive.
        """
        if not sigma > 0:
            raise ValueError("sigma must be positive")
        self.target_mean = target_mean
        self.k = k       # slack parameter
        self.h = h       # alert threshold
        self.sigma = sigma
        self.S_pos = 0.0
        self.S_neg = 0.0

    def update(self, observation: float) -> bool:
        """Fold in one observation; returns True if drift is detected."""
        diff = (observation - self.target_mean) / self.sigma
        # Each side only accumulates the part of the deviation beyond the slack.
        self.S_pos = max(0.0, self.S_pos + diff - self.k)
        self.S_neg = max(0.0, self.S_neg - diff - self.k)
        return bool(self.S_pos > self.h or self.S_neg > self.h)

    def reset(self) -> None:
        """Clear both cumulative sums, e.g. after an alert has been handled."""
        self.S_pos = 0.0
        self.S_neg = 0.0

# Usage: monitor prediction mean (or AUC) for sudden shifts
# Daily means are fed in order and the scan stops at the first alert.
# NOTE(review): k and h are left at their defaults here; confirm they suit
# the scale of the daily means being monitored.
detector = CUSUMDriftDetector(target_mean=0.15)  # expected mean prediction
for day_prediction_mean in recent_daily_means:
    drift_flagged = detector.update(day_prediction_mean)
    if not drift_flagged:
        continue
    print(f"CUSUM drift alert: mean={day_prediction_mean:.3f}")
    break

Responding to Drift

Python

# Decision matrix for drift response
# One entry per drift scenario: what was observed, how to respond, and the
# concrete action to take.

response_guide = {
    "data_drift_only": dict(
        situation="P(X) shifted, but model performance still OK",
        response="Monitor closely; recalibrate if needed; flag OOD predictions",
        action="Log warning, increase monitoring frequency",
    ),
    "data_drift_with_perf_drop": dict(
        situation="P(X) shifted AND AUC dropped",
        response="Retrain on combined old+recent data, or just recent data",
        action="Schedule retraining, consider domain adaptation",
    ),
    "concept_drift": dict(
        situation="P(Y|X) changed — model is systematically wrong",
        response="Retrain with recent labeled data (old data may be harmful)",
        action="Retrain on recent window only; validate new model before deployment",
    ),
    "label_rate_drift": dict(
        situation="Positive rate in production changed significantly",
        response="Recalibrate threshold; check for labeling changes",
        action="Investigate ground truth pipeline; adjust threshold",
    ),
}

# Print each scenario as a heading followed by its indented fields.
for drift_type, info in response_guide.items():
    print(f"\n{drift_type}:")
    for field, text in info.items():
        print(f"  {field}: {text}")

Interview Answer Template

Q: What's the difference between data drift and concept drift?

Data drift (covariate shift) means the input distribution P(X) has changed — for example, a new hospital system starts using the model and their patients are older and sicker than the training population. The underlying relationship between features and readmission may be unchanged, but the model is now operating on inputs it hasn't seen. Concept drift means P(Y|X) has changed — the relationship between inputs and outcome has shifted. COVID-19 disrupting normal readmission patterns is a classic example: the same patient profile that predicted readmission before now has different dynamics. Data drift is detectable without labels (PSI, KS test on feature distributions), while concept drift requires ground truth labels and manifests as AUC degradation in rolling windows. The responses differ: for data drift, recalibration or retraining on mixed data may suffice; for concept drift, the old training data may actually be harmful and retraining on a recent window only is more appropriate.

Why Good Training Accuracy ≠ Good Production

Next Lesson

How to Debug a Bad ML Model Systematically