Using Bayesian Optimization to Tune XGBoost Models in R

Tuning machine learning models can be time-consuming and computationally expensive. This post shows how to use Bayesian optimization to efficiently find strong XGBoost hyperparameters – saving time and improving model performance.

Required Packages

# Load required packages
library(xgboost)                  # Gradient boosting
library(ParBayesianOptimization)  # Bayesian optimization
library(mlbench)                  # Boston Housing dataset
library(dplyr)                    # Data manipulation
library(recipes)                  # Preprocessing
library(rsample)                  # Train/test splitting

Data Preparation
We’ll use the Boston Housing dataset – a classic regression problem with both numeric and categorical variables.
# Load the Boston Housing dataset
data("BostonHousing2")
# Quick look at the data structure
str(BostonHousing2)
'data.frame': 506 obs. of 19 variables:
$ town : Factor w/ 92 levels "Arlington","Ashland",..: 54 77 77 46 46 46 69 69 69 69 ...
$ tract : int 2011 2021 2022 2031 2032 2033 2041 2042 2043 2044 ...
$ lon : num -71 -71 -70.9 -70.9 -70.9 ...
$ lat : num 42.3 42.3 42.3 42.3 42.3 ...
$ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
$ cmedv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 22.1 16.5 18.9 ...
$ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
$ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
$ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
$ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
$ rm : num 6.58 6.42 7.18 7 7.15 ...
$ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
$ dis : num 4.09 4.97 4.97 6.06 6.06 ...
$ rad : int 1 2 2 3 3 3 5 5 5 5 ...
$ tax : int 296 242 242 222 222 222 311 311 311 311 ...
$ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
$ b : num 397 397 393 395 397 ...
$ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
XGBoost requires numeric inputs, so we'll use the recipes package to transform our categorical variables:
# Create a recipe for preprocessing
rec <- recipe(cmedv ~ ., data = BostonHousing2) %>%
  # Collapse categories where population is < 3%
  step_other(town, chas, threshold = .03, other = "Other") %>%
  # Create dummy variables for all factor variables
  step_dummy(all_nominal_predictors())

# Train the recipe on the dataset
prep <- prep(rec, training = BostonHousing2)

# Create the final model matrix
model_df <- bake(prep, new_data = BostonHousing2)

# Check the column names after one-hot encoding
colnames(model_df)
[1] "tract" "lon" "lat"
[4] "medv" "crim" "zn"
[7] "indus" "nox" "rm"
[10] "age" "dis" "rad"
[13] "tax" "ptratio" "b"
[16] "lstat" "cmedv" "town_Boston.Savin.Hill"
[19] "town_Cambridge" "town_Lynn" "town_Newton"
[22] "town_Other" "chas_X1"
Next, we’ll split our data into training and testing sets:
# Create a 70/30 train-test split
splits   <- rsample::initial_split(model_df, prop = 0.7)
train_df <- rsample::training(splits)
test_df  <- rsample::testing(splits)

# Prepare the training data for XGBoost (drop both response columns)
X <- train_df %>%
  select(-medv, -cmedv) %>%
  as.matrix()

# Get the target variable
y <- train_df %>% pull(cmedv)

# Create cross-validation folds (held-out row indices for xgb.cv)
folds <- list(
  fold1 = as.integer(seq(1, nrow(X), by = 5)),
  fold2 = as.integer(seq(2, nrow(X), by = 5))
)
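The two interleaved folds above keep the example fast, but they only ever hold out part of the training data. If you prefer every row to be held out exactly once, a standard k-fold assignment works just as well with xgb.cv's folds argument. The sketch below is an optional alternative, not part of the original workflow; folds_5 and the seed are illustrative.

# Optional sketch: proper 5-fold indices so each row is held out once
set.seed(42)                         # illustrative seed
folds_5 <- split(
  sample(nrow(X)),                   # shuffled row indices
  rep(1:5, length.out = nrow(X))     # assign each index to one of 5 folds
)
# folds_5 can be passed as xgb.cv(folds = folds_5) in place of folds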
Setting Up Bayesian Optimization
Bayesian optimization requires two key components:
- An objective function that evaluates model performance
- The parameter bounds we want to explore
# Our objective function takes hyperparameters as inputs
obj_func <- function(eta, max_depth, min_child_weight, subsample, lambda, alpha) {

  param <- list(
    # Learning parameters
    eta = eta,                           # Learning rate
    max_depth = max_depth,               # Tree depth
    min_child_weight = min_child_weight, # Min observations per node
    subsample = subsample,               # Data subsampling
    lambda = lambda,                     # L2 regularization
    alpha = alpha,                       # L1 regularization
    booster = "gbtree",                  # Use tree model
    objective = "reg:squarederror",      # Regression task
    eval_metric = "mape"                 # Mean Absolute Percentage Error
  )

  xgbcv <- xgb.cv(params = param,
                  data = X,
                  label = y,
                  nrounds = 50,
                  folds = folds,
                  prediction = TRUE,
                  early_stopping_rounds = 5,
                  verbose = 0,
                  maximize = FALSE)

  lst <- list(
    # First element must be named "Score";
    # bayesOpt() maximizes, so we negate the error
    Score = -min(xgbcv$evaluation_log$test_mape_mean),
    # Number of rounds for the best performing model
    nrounds = xgbcv$best_iteration
  )

  return(lst)
}
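Before launching the full search, it can be worth a quick sanity check that the objective function runs end to end and returns a Score. This call is not part of the original workflow; the parameter values below are arbitrary.

# Quick sanity check with arbitrary (untuned) values:
# should return a list with a negative Score and an nrounds count
obj_func(eta = 0.1, max_depth = 6L, min_child_weight = 10,
         subsample = 0.8, lambda = 1, alpha = 1)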
# Define the search space for each parameter
bounds <- list(
  eta = c(0.001, 0.2),          # Learning rate range
  max_depth = c(1L, 10L),       # Tree depth range
  min_child_weight = c(1, 50),  # Min observations range
  subsample = c(0.1, 1),        # Subsampling range
  lambda = c(1, 10),            # L2 regularization range
  alpha = c(1, 10)              # L1 regularization range
)
Running Bayesian Optimization
Now we’ll run the optimization process to intelligently search for the best parameters:
set.seed(1234)
bayes_out <- bayesOpt(
  FUN = obj_func,                  # Our objective function
  bounds = bounds,                 # Parameter bounds
  initPoints = length(bounds) + 2, # Initial random points
  iters.n = 10,                    # Number of iterations
  verbose = 0                      # Suppress output
)

# View top results
bayes_out$scoreSummary[1:5, c(3:8, 13)]
eta max_depth min_child_weight subsample lambda alpha Score
<num> <num> <num> <num> <num> <num> <num>
1: 0.13392137 8 4.913332 0.2105925 4.721124 3.887629 -0.1292920
2: 0.19400811 2 25.454160 0.9594105 9.329695 3.173695 -0.1790158
3: 0.16079775 2 14.035652 0.5118349 1.229953 5.093530 -0.1662595
4: 0.08957707 4 12.534842 0.3844404 4.358837 1.788342 -0.1672395
5: 0.02876388 4 36.586761 0.8107181 6.137100 6.039125 -0.3320015
# Get the best parameters
best_params <- getBestPars(bayes_out)
data.frame(best_params)
eta max_depth min_child_weight subsample lambda alpha
1 0.1251447 10 1 1 1 5.905011
Training the Final Model
With the optimal hyperparameters identified, we can now train our final XGBoost model.
# Combine best params with base params
opt_params <- append(
  list(booster = "gbtree",
       objective = "reg:squarederror",
       eval_metric = "mae"),
  best_params
)

# Run cross-validation to determine optimal number of rounds
xgbcv <- xgb.cv(
  params = opt_params,
  data = X,
  label = y,
  nrounds = 100,
  folds = folds,
  prediction = TRUE,
  early_stopping_rounds = 5,
  verbose = 0,
  maximize = FALSE
)

# Get the optimal number of boosting rounds
nrounds <- xgbcv$best_iteration
# Fit the final XGBoost model
mdl <- xgboost(
  data = X,
  label = y,
  params = opt_params,
  maximize = FALSE,
  early_stopping_rounds = 5,
  nrounds = nrounds,
  verbose = 0
)

# Make predictions on the test set
actuals   <- test_df$cmedv
predicted <- test_df %>%
  select(all_of(mdl$feature_names)) %>%
  as.matrix() %>%
  predict(mdl, newdata = .)

# Evaluate performance using Mean Absolute Percentage Error (MAPE)
mape <- mean(abs(actuals - predicted) / actuals)
cat("MAPE on test set:", mape)
MAPE on test set: 0.006424492
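MAPE is convenient because it is scale-free, but it can be inflated when actual values are small. For a fuller picture, absolute-scale metrics are easy to compute from the same vectors; the lines below are a small optional addition using the actuals and predicted objects defined above.

# Complementary error metrics on the same test-set predictions
rmse <- sqrt(mean((actuals - predicted)^2))   # root mean squared error
mae  <- mean(abs(actuals - predicted))        # mean absolute error
cat("RMSE:", rmse, "| MAE:", mae)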
Why Bayesian Optimization
Bayesian optimization offers several key advantages over traditional grid search:
- Efficiency: Finds strong parameters in far fewer model fits than an exhaustive grid (see the comparison sketch after this list)
- Intelligence: Learns from previous evaluations to focus on promising areas
- Scalability: Remains efficient even with many hyperparameters
- Speed: Completes in a fraction of the time while achieving comparable or better results
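To make the efficiency point concrete: the Bayesian search above evaluated only length(bounds) + 2 + 10 = 18 parameter sets, while even a coarse grid over the same six hyperparameters explodes quickly. The grid below is purely illustrative; the specific breakpoints are arbitrary.

# Illustrative comparison: a coarse grid over the same six parameters
grid <- expand.grid(
  eta = c(0.01, 0.05, 0.1, 0.2),
  max_depth = c(2, 4, 6, 8, 10),
  min_child_weight = c(1, 10, 25, 50),
  subsample = c(0.25, 0.5, 0.75, 1),
  lambda = c(1, 5, 10),
  alpha = c(1, 5, 10)
)
nrow(grid)   # 2880 candidate models vs. 18 Bayesian evaluations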
This approach becomes increasingly valuable as model complexity grows. For production models, consider increasing the number of iterations (iters.n) to ensure a more thorough exploration of the parameter space.
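One convenient way to do that is to continue the existing run rather than starting over. The call below is a sketch based on ParBayesianOptimization's addIterations() helper; check the arguments against the documentation for your installed version.

# Sketch: extend the existing optimization with extra iterations
# (verify addIterations() arguments against your package version)
bayes_out <- addIterations(bayes_out, iters.n = 5, verbose = 0)
getBestPars(bayes_out)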
The ParBayesianOptimization package makes this powerful technique accessible to R users, allowing you to build better models with less computational overhead.