Random Search for Hyperparameter Tuning

Why Random Search Often Beats Grid Search

Grid search:   evaluates every combination
               → wastes trials on redundant combinations
               → covers the grid uniformly regardless of importance

Random search: samples combinations randomly from distributions
               → tries a fresh value of every hyperparameter on each trial, so it covers more of the space per trial
               → spends budget on diverse combinations
               → finds good solutions faster when some hyperparameters matter more than others

Key insight (Bergstra & Bengio, 2012):
  If 5 of 10 hyperparameters are important:
  - Grid search (10×10×10×10×10×10×10×10×10×10): 10^10 = 10 billion combos, most redundant
  - Random search (same budget): marginalizes out unimportant dimensions automatically

RandomizedSearchCV

Python

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, randint, loguniform
import numpy as np

# Two-step pipeline: standardize the features, then fit a gradient-boosted
# classifier. The search addresses the classifier's hyperparameters through
# the "model__<name>" keys below (step name + double underscore).
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", GradientBoostingClassifier(random_state=42)),
])

# Use distributions, not just discrete values.
# scipy conventions: randint(low, high) excludes `high`;
# uniform(loc, scale) spans [loc, loc + scale].
param_distributions = {
    "model__n_estimators":   randint(50, 500),         # random integer in [50, 500)
    "model__max_depth":      randint(2, 10),            # random integer in [2, 10)
    "model__learning_rate":  loguniform(0.005, 0.5),   # log-uniform: good for LR
    "model__subsample":      uniform(0.5, 0.5),         # uniform in [0.5, 1.0]
    "model__min_samples_leaf": randint(1, 30),          # random integer in [1, 30)
    "model__max_features":   uniform(0.3, 0.7),         # uniform in [0.3, 1.0]: fraction of features per split
}

# Shuffled, seeded stratified folds so every candidate is scored on the same splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions,
    n_iter=50,           # number of random combinations to try
    cv=cv,
    scoring="roc_auc",
    random_state=42,     # makes the sampled combinations reproducible
    n_jobs=-1,
    verbose=1,
    refit=True,          # exposes the winning configuration as best_estimator_
)

# NOTE: X_train / y_train are not defined in this snippet; they are assumed
# to be the training features and labels prepared earlier.
random_search.fit(X_train, y_train)

# Read the counts back from the fitted search instead of hard-coding them,
# so the message stays correct if n_iter or the CV splitter is changed.
n_candidates = len(random_search.cv_results_["params"])
n_folds = random_search.n_splits_

print(f"Best params: {random_search.best_params_}")
print(f"Best CV AUC: {random_search.best_score_:.3f}")
print(f"Total fits: {n_candidates} combinations × {n_folds} folds = {n_candidates * n_folds}")

Choosing Sampling Distributions

Python

from scipy.stats import uniform, loguniform, randint
import numpy as np

# Scale-type parameters (learning rate, regularization strength) matter in
# orders of magnitude (0.001 vs 0.01 vs 0.1), so sample them log-uniformly:
# each decade of the range is equally likely.
lr_dist = loguniform(0.001, 1.0)
lr_samples = lr_dist.rvs(size=10000)
print(f"loguniform(0.001, 1.0) samples: min={lr_samples.min():.4f}, max={lr_samples.max():.4f}")

# Count-type parameters (number of estimators, depth): randint(low, high)
# draws integers from [low, high); uniform is the continuous alternative.
n_est_dist = randint(50, 500)
n_est = n_est_dist.rvs(size=10)
print(f"randint(50, 500) samples: {n_est}")

# Fraction-type parameters (subsample, max_features): uniform(loc, scale)
# covers [loc, loc + scale], i.e. [0.5, 1.0] here.
subsample_dist = uniform(0.5, 0.5)
subsample = subsample_dist.rvs(size=10)
print(f"uniform(0.5, 0.5) samples: {subsample.round(2)}")

# Dropout rate: same idea, covering [0, 0.5].
dropout_dist = uniform(0, 0.5)
dropout = dropout_dist.rvs(size=10)
print(f"dropout uniform samples: {dropout.round(2)}")

Grid Search vs Random Search: Comparison

Python

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, loguniform
import time

# Grid search: explicit grid, exhaustive
grid_params = {
    "model__n_estimators":  [50, 100, 200, 400],
    "model__max_depth":     [3, 5, 7, 9],
    "model__learning_rate": [0.01, 0.05, 0.1, 0.2],
}
# Total: 4 × 4 × 4 = 64 combinations

# Random search: same budget and same ranges as the grid search.
# randint(low, high) excludes `high`, so the upper bounds are one past the
# largest grid value.
rand_params = {
    "model__n_estimators":  randint(50, 401),       # integers 50..400
    "model__max_depth":     randint(3, 10),         # integers 3..9
    "model__learning_rate": loguniform(0.01, 0.2),  # continuous, log-scaled
}
# 64 random combinations drawn from the same ranges, but free to land on any
# value in between instead of only the 4 grid points per parameter

# NOTE: `pipeline`, X_train and y_train are not defined in this snippet; they
# are assumed to come from the earlier sections.
# perf_counter() is the clock intended for measuring elapsed time: it is
# monotonic and unaffected by system clock adjustments.
t0 = time.perf_counter()
grid_search = GridSearchCV(pipeline, grid_params, cv=3, scoring="roc_auc", n_jobs=-1)
grid_search.fit(X_train, y_train)
t_grid = time.perf_counter() - t0

t0 = time.perf_counter()
rand_search = RandomizedSearchCV(pipeline, rand_params, n_iter=64, cv=3, scoring="roc_auc",
                                 random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)
t_rand = time.perf_counter() - t0

print(f"Grid search: AUC={grid_search.best_score_:.4f} ({t_grid:.1f}s)")
print(f"Random search: AUC={rand_search.best_score_:.4f} ({t_rand:.1f}s)")
# Often similar or better AUC with random search, sometimes faster

Budget-Controlled Tuning

Python

# Random search lets you set a budget explicitly
# Use n_iter based on how much compute you can afford

def tune_with_budget(pipeline, param_distributions, X, y, budget_minutes: float = 5, cv: int = 5):
    """
    Build a RandomizedSearchCV whose n_iter is estimated from a time budget.

    One cross-validated evaluation of ``pipeline`` (with its current
    hyperparameters) is timed on ``X``/``y`` using the same ``cv`` that the
    returned search uses. That duration is taken as the cost of one candidate,
    and ``n_iter`` is the number of such candidates that fit in the budget.

    The estimate is rough: sampled hyperparameters can be cheaper or more
    expensive to fit than the pipeline's current ones, and the returned search
    runs with ``n_jobs=-1`` while the probe runs sequentially. The probe itself
    costs about one candidate's time and is not deducted from the budget.

    Parameters:
        pipeline: estimator to tune.
        param_distributions: distributions passed to RandomizedSearchCV.
        X, y: training features and labels; used only to time the probe.
        budget_minutes: time budget for the search, in minutes.
        cv: cross-validation setting used for both the probe and the search.

    Returns:
        An unfitted RandomizedSearchCV (scoring="roc_auc", random_state=42,
        n_jobs=-1). At least 10 iterations are always requested, even when
        that exceeds the budget.
    """
    import time
    from sklearn.model_selection import RandomizedSearchCV, cross_val_score

    # Time one complete candidate: all `cv` fits on the real data. Probing a
    # small slice (or a different fold count) would give a number unrelated
    # to the cost of a search iteration.
    t0 = time.perf_counter()
    cross_val_score(pipeline, X, y, cv=cv, scoring="roc_auc")
    candidate_seconds = time.perf_counter() - t0

    budget_seconds = budget_minutes * 60
    # candidate_seconds already covers every fold, so it is not multiplied by
    # the fold count again; the tiny floor only guards against a zero reading.
    n_iter = max(10, int(budget_seconds / max(candidate_seconds, 1e-9)))
    print(f"Budget: {budget_minutes} min → n_iter={n_iter}")

    return RandomizedSearchCV(
        pipeline, param_distributions,
        n_iter=n_iter, cv=cv, scoring="roc_auc", random_state=42, n_jobs=-1
    )

# Size a search for a 3-minute budget, run it on the training data, and
# report the best cross-validated AUC.
# NOTE: pipeline, rand_params, X_train and y_train come from earlier snippets.
search = tune_with_budget(
    pipeline,
    rand_params,
    X_train,
    y_train,
    budget_minutes=3,
)
search.fit(X_train, y_train)
print(f"Best AUC: {search.best_score_:.3f}")

Warm Starting: Continuing from Prior Results

Python

import numpy as np

# Extend a random search by running another round with a different seed.
# Each round is a separate, independent search over the same distributions;
# the rounds are only compared at the end.

# Settings shared by both rounds; only random_state differs between them.
shared_kwargs = dict(n_iter=20, cv=5, scoring="roc_auc", n_jobs=-1)

# Round 1: quick search
search_r1 = RandomizedSearchCV(pipeline, rand_params, random_state=42, **shared_kwargs)
search_r1.fit(X_train, y_train)
print(f"Round 1 best: {search_r1.best_score_:.3f}")

# Round 2: same number of iterations, new seed (so new random points)
search_r2 = RandomizedSearchCV(pipeline, rand_params, random_state=99, **shared_kwargs)
search_r2.fit(X_train, y_train)
print(f"Round 2 best: {search_r2.best_score_:.3f}")

# Combined best: round 2 wins only with a strictly higher score, so round 1
# is kept on a tie.
if search_r2.best_score_ > search_r1.best_score_:
    best_overall = search_r2
else:
    best_overall = search_r1
print(f"Combined best: {best_overall.best_score_:.3f}")
print(f"Best params: {best_overall.best_params_}")

Random Search in Production

Python

# Practical tips for production hyperparameter tuning

# 1. Log the results (the 20 best-ranked trials are written here; drop
#    .head(20) to keep every trial)
import pandas as pd
from sklearn.metrics import roc_auc_score  # used by the held-out check in step 2

# NOTE: random_search is the fitted search from the RandomizedSearchCV
# section; X_val / y_val are assumed to be a held-out set the search never saw.
# cv_results_ becomes one row per sampled configuration, best-ranked first.
results_df = pd.DataFrame(random_search.cv_results_).sort_values("rank_test_score")
results_df[["params", "mean_test_score", "std_test_score"]].head(20).to_csv("search_results.csv")

# 2. Check for overfitting in the search
# If best CV AUC >> val AUC, the search overfit to the CV folds
# → Use a separate held-out val set to verify
# Column 1 of predict_proba is used as the positive-class score
# (assumes a binary target).
best_val_auc = roc_auc_score(y_val, random_search.best_estimator_.predict_proba(X_val)[:, 1])
print(f"CV AUC (tuning set): {random_search.best_score_:.3f}")
print(f"Val AUC (held out):  {best_val_auc:.3f}")

# 3. Report top 5 configurations (not just top 1)
# The best might be a fluke — nearby configs with similar AUC are more reliable
print("\nTop 5 configurations:")
for _, row in results_df.head(5).iterrows():
    print(f"  AUC={row['mean_test_score']:.3f} ± {row['std_test_score']:.3f} — {row['params']}")

Interview Answer Template

Q: When would you use random search instead of grid search?

Random search samples hyperparameter combinations randomly from distributions rather than evaluating every point on a fixed grid. The key insight is that many hyperparameters don't matter much for most problems — if only 3 of 10 hyperparameters are important, grid search wastes budget on irrelevant combinations while random search automatically marginalizes over them and explores the important dimensions more. Random search is preferred when the search space has more than 2–3 dimensions, when training is expensive, or when the important hyperparameters are unknown. It also allows using continuous distributions — loguniform for learning rates and regularization strength, uniform for fractions — rather than forcing discrete values. Grid search is still useful for small, well-understood spaces where exhaustive search is feasible. In practice, I use random search with n_iter = 50–200 for most problems and check whether the results plateau, then stop early if they do.

Random Search for Hyperparameter Tuning

Why Random Search Often Beats Grid Search

RandomizedSearchCV

Choosing Sampling Distributions

Grid Search vs Random Search: Comparison

Budget-Controlled Tuning

Warm Starting: Continuing from Prior Results

Random Search in Production

Interview Answer Template

Enjoyed this article?

Leave a comment