RStudio is an open-source integrated development environment (IDE) for the R programming language, designed for R development and data analysis. R is a popular language for statistical computing and data visualization, widely used in data science, statistics, and machine learning.
RStudio provides many powerful tools, including a code editor, a debugger, data-visualization tools, and integrated help documentation. For machine learning work, RStudio supports data preprocessing, model training, evaluation, and deployment.
R offers many packages and libraries for machine learning, such as caret, tidymodels, and tensorflow. These packages provide a rich set of algorithms and tools to help you build and deploy machine learning models.
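As a quick illustration of the caret workflow mentioned above, the sketch below fits a cross-validated GBM on R's built-in mtcars data; the dataset and settings here are placeholders for illustration only, not part of the analysis that follows.
library(caret)
# minimal caret sketch (illustration only): 5-fold CV GBM on built-in data
set.seed(123)
fit <- train(
  mpg ~ .,
  data = mtcars,
  method = "gbm",
  trControl = trainControl(method = "cv", number = 5),
  verbose = FALSE
)
fit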
library(rlang)     # low-level language tools used by the tidyverse
library(gbm)       # gradient boosting machines
library(xgboost)   # extreme gradient boosting
library(vip)       # variable importance plots
library(pdp)       # partial dependence plots
library(ggplot2)   # plotting
library(tidyverse) # data manipulation
library(recipes)   # feature engineering
rm(list = ls())    # start from a clean workspace
s <- Sys.time()    # record start time
# data
# prereqs-data
# ames data
#install.packages("AmesHousing")
library(AmesHousing)
ames <- AmesHousing::make_ames()
# split data
set.seed(123)
split <- rsample::initial_split(ames, strata = "Sale_Price")
ames_train <- rsample::training(split)
ames_test <- rsample::testing(split)
## GBM
## * very popular
## * very accurate
## * similar to random forests, but works quite differently
## First GBM model
# shrinkage: the learning rate; 0.1 is the default
# basic-gbm
tick <- proc.time()
set.seed(123)
ames_gbm <- gbm(
  formula = Sale_Price ~ .,
  data = ames_train,
  distribution = "gaussian", # or bernoulli, multinomial, etc.
  n.trees = 5000,
  shrinkage = 0.1,
  interaction.depth = 1,
  n.minobsinnode = 10,
  cv.folds = 5
)
tock <- proc.time() - tick;tock
# find index for n trees with minimum CV error
min_MSE <- which.min(ames_gbm$cv.error)
# get MSE and compute RMSE
sqrt(ames_gbm$cv.error[min_MSE])
# check for overfitting
gbm.perf(ames_gbm, method = "cv") # or "OOB"
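# gbm.perf() also returns the optimal number of trees; a minimal sketch (not
# part of the original workflow) of using it to score the holdout set:
best_iter <- gbm.perf(ames_gbm, method = "cv", plot.it = FALSE)
pred <- predict(ames_gbm, newdata = ames_test, n.trees = best_iter)
sqrt(mean((pred - ames_test$Sale_Price)^2)) # holdout RMSE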
## Tuning
## * start with the default learning rate of 0.1, or something in 0.05-0.2
## * make sure the number of trees is not overfitting
## * tune the learning rate
## * tune the tree-specific parameters
## * possibly lower the learning rate and increase the number of trees
## Fix the tree parameters:
## tree depth of 3
## default min obs
## set the learning rate to 0.01
## more CV folds give a better error estimate
## ~1 minute
## uses up all of the trees
# tune1
set.seed(123)
tick <- proc.time()
ames_gbm1 <- gbm(
  formula = Sale_Price ~ .,
  data = ames_train,
  distribution = "gaussian", # or bernoulli, multinomial, etc. #<<
  n.trees = 5000, #<<
  shrinkage = 0.01, #<<
  interaction.depth = 3, #<<
  n.minobsinnode = 10, #<<
  cv.folds = 10 #<<
)
tock <- proc.time() - tick;tock
# find index for n trees with minimum CV error
min_MSE <- which.min(ames_gbm1$cv.error)
# get MSE and compute RMSE
sqrt(ames_gbm1$cv.error[min_MSE])
gbm.perf(ames_gbm1, method = "cv")
## increase the number of trees
## try a learning rate of 0.001
## less accurate
## ~4 minutes
# tune2
set.seed(123)
tick <- proc.time()
ames_gbm1 <- gbm(
  formula = Sale_Price ~ .,
  data = ames_train,
  distribution = "gaussian", # or bernoulli, multinomial, etc. #<<
  n.trees = 20000, #<<
  shrinkage = 0.001, #<<
  interaction.depth = 3, #<<
  n.minobsinnode = 10, #<<
  cv.folds = 10 #<<
)
tock <- proc.time() - tick;tock
# find index for n trees with minimum CV error
min_MSE <- which.min(ames_gbm1$cv.error)
# get MSE and compute RMSE
sqrt(ames_gbm1$cv.error[min_MSE])
gbm.perf(ames_gbm1, method = "cv")
# tune the tree parameters: ~30 minutes
# assess 3 values for tree depth
# assess 3 values for min obs
# tune12
# use eval=FALSE or cache=TRUE when knitting, to avoid re-running
# search grid
hyper_grid <- expand.grid(
  n.trees = 6000,
  shrinkage = .01,
  interaction.depth = c(3, 5, 7), #<<
  n.minobsinnode = c(5, 10, 15) #<<
)
model_fit <- function(n.trees, shrinkage, interaction.depth, n.minobsinnode) {
  set.seed(123)
  m <- gbm(
    formula = Sale_Price ~ .,
    data = ames_train,
    distribution = "gaussian",
    n.trees = n.trees,
    shrinkage = shrinkage, #<<
    interaction.depth = interaction.depth, #<<
    n.minobsinnode = n.minobsinnode,
    cv.folds = 10
  )
  # compute RMSE
  sqrt(min(m$cv.error))
}
tick <- proc.time()
hyper_grid$rmse <- purrr::pmap_dbl(
  hyper_grid,
  ~ model_fit(
    n.trees = ..1,
    shrinkage = ..2,
    interaction.depth = ..3,
    n.minobsinnode = ..4
  )
)
tock <- proc.time() - tick;tock
arrange(hyper_grid, rmse)
## n.trees shrinkage interaction.depth n.minobsinnode rmse
## 1 6000 0.01 7 5 20683.65
## 2 6000 0.01 5 5 20746.36
## 3 6000 0.01 7 10 21114.62
## 4 6000 0.01 3 5 21126.33
## 5 6000 0.01 5 10 21228.68
## 6 6000 0.01 7 15 21246.73
## 7 6000 0.01 5 15 21272.91
## 8 6000 0.01 3 10 21767.08
## 9 6000 0.01 3 15 21862.35
# add stochasticity (bag.fraction)
# * try values in 0.5-0.8
# * may need to zoom in on a narrower range
# * ~10 minutes
# stochastic
bag_frac <- c(.5, .65, .8) #<<
tick <- proc.time()
for(i in bag_frac) {
  set.seed(123)
  m <- gbm(
    formula = Sale_Price ~ .,
    data = ames_train,
    distribution = "gaussian",
    n.trees = 6000,
    shrinkage = 0.01,
    interaction.depth = 7,
    n.minobsinnode = 5,
    bag.fraction = i, #<<
    cv.folds = 10
  )
  # compute RMSE
  print(sqrt(min(m$cv.error)))
}
tock <- proc.time() - tick;tock
## [1] 20683.65
## [1] 20398.93
## [1] 20287.76
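# A bag fraction of 0.8 wins. As a sketch (the original script moves straight
# on to XGBoost), a final GBM could combine the best settings found above:
set.seed(123)
ames_gbm_final <- gbm(
  formula = Sale_Price ~ .,
  data = ames_train,
  distribution = "gaussian",
  n.trees = 6000,
  shrinkage = 0.01,
  interaction.depth = 7,
  n.minobsinnode = 5,
  bag.fraction = 0.8,
  cv.folds = 10
)
gbm.perf(ames_gbm_final, method = "cv")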
## XGBoost
## XGBoost requires numeric input, e.g. one-hot encoding
## caret can one-hot encode automatically
## * step_other: lump low-frequency levels
## * step_integer: encode as ordered integers; improves speed and performance
# xgb-feature-prep
xgb_prep <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = .005) %>%
  step_integer(all_nominal()) %>%
  prep(training = ames_train, retain = TRUE) %>%
  juice()
X <- as.matrix(xgb_prep[setdiff(names(xgb_prep), "Sale_Price")])
Y <- xgb_prep$Sale_Price
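# The recipe above uses integer (label) encoding; a sketch of true one-hot
# encoding via recipes::step_dummy(), if preferred (much slower here -- see
# the ~30 min timing noted below):
xgb_prep_onehot <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = .005) %>%
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  prep(training = ames_train, retain = TRUE) %>%
  juice()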
# First XGBoost model
# takes only a few seconds
# early_stopping_rounds: stop when CV RMSE has not improved for 50 trees
# https://xgboost.readthedocs.io/en/latest/parameter.html
# default eta = 0.3
# basic-xgboost
tick <- proc.time()
set.seed(123)
ames_xgb <- xgb.cv(
  data = X,
  label = Y,
  nrounds = 5000,
  objective = "reg:squarederror",
  early_stopping_rounds = 50,
  nfold = 10,
  verbose = 0
)
tock <- proc.time() - tick;tock
ames_xgb$evaluation_log %>% tail()
## iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
## 1: 195 600.2057 55.89026 24382.85 1725.531
## 2: 196 591.9463 55.26891 24381.77 1725.488
## 3: 197 582.9737 53.65047 24382.20 1724.203
## 4: 198 572.7220 52.89840 24382.79 1723.236
## 5: 199 561.9686 49.76037 24382.41 1724.374
## 6: 200 554.3745 48.24520 24382.83 1723.857
# xgboost-eta
set.seed(123)
ames_xgb <- xgb.cv(
  data = X,
  label = Y,
  nrounds = 6000,
  objective = "reg:squarederror",
  early_stopping_rounds = 50,
  nfold = 10,
  verbose = 0,
  params = list(eta = .05) #<<
)
ames_xgb$evaluation_log %>% tail()
## iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
## 1: 1023 1034.059 51.96559 21879.08 1248.379
## 2: 1024 1032.371 52.17706 21879.89 1248.430
## 3: 1025 1030.903 52.01260 21879.53 1248.890
## 4: 1026 1029.302 51.47370 21879.29 1249.095
## 5: 1027 1027.053 50.80923 21879.43 1248.593
## 6: 1028 1025.107 50.38078 21879.46 1249.280
## - eta = .3 (default): 24,382 w/ 200 trees (< 1 min)
## - eta = .1: 22,333 w/ 398 trees (< 1 min)
## - eta = .05: 21,877 w/ 978 trees (1.5 min)
## - eta = .01: 22,094 w/ 2843 trees (4 min)
## - with one-hot encoding instead of integer encoding: ~30 min
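# With early stopping, the summary numbers above can be read directly off the
# CV object, e.g.:
ames_xgb$best_iteration
min(ames_xgb$evaluation_log$test_rmse_mean)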
# * learning rate = 0.05
# * 6000 trees
# * early stopping
# * tune:
#   - tree depth
#   - child weight
# ~30 minutes
# xgboost-tree-specific
# grid
hyper_grid <- expand.grid(
  eta = .05,
  max_depth = c(1, 3, 5, 7, 9), #<<
  min_child_weight = c(1, 3, 5, 7, 9), #<<
  rmse = 0 # a place to dump results
)
# grid search
tick <- proc.time()
for(i in seq_len(nrow(hyper_grid))) {
  set.seed(123)
  m <- xgb.cv(
    data = X,
    label = Y,
    nrounds = 6000,
    objective = "reg:squarederror",
    early_stopping_rounds = 50,
    nfold = 10,
    verbose = 0,
    params = list( #<<
      eta = hyper_grid$eta[i], #<<
      max_depth = hyper_grid$max_depth[i], #<<
      min_child_weight = hyper_grid$min_child_weight[i] #<<
    ) #<<
  )
  hyper_grid$rmse[i] <- min(m$evaluation_log$test_rmse_mean)
}
tock <- proc.time() - tick;tock
arrange(hyper_grid, rmse)
## eta max_depth min_child_weight rmse
## 1 0.05 3 3 20989.27
## 2 0.05 3 1 21062.92
## 3 0.05 3 5 21453.00
## 4 0.05 5 1 21685.04
## 5 0.05 5 3 21748.12
## 6 0.05 3 7 22058.70
## 7 0.05 7 1 22110.09
## 8 0.05 3 9 22181.64
## 9 0.05 5 5 22185.64
## 10 0.05 7 3 22468.97
## 11 0.05 9 1 22632.80
## 12 0.05 9 3 22664.75
## 13 0.05 7 5 22721.41
## 14 0.05 5 7 22801.02
## 15 0.05 7 7 22951.21
## 16 0.05 5 9 22970.42
## 17 0.05 9 5 22987.65
## 18 0.05 9 7 23212.10
## 19 0.05 1 1 23252.66
## 20 0.05 7 9 23452.08
## 21 0.05 9 9 23714.29
## 22 0.05 1 3 24147.17
## 23 0.05 1 5 25089.55
## 24 0.05 1 7 26534.20
## 25 0.05 1 9 26784.77
# * learning rate = 0.05
# * 6000 trees
# * early stopping
# * tree depth = 3
# * child weight = 3
# * tune the stochasticity:
#   - subsampling rows for each tree
#   - subsampling columns for each tree
# ~12 minutes
# xgb-stochastic-grid
# grid
hyper_grid <- expand.grid(
  eta = .05,
  max_depth = 3,
  min_child_weight = 3,
  subsample = c(.5, .65, .8, 1), #<<
  colsample_bytree = c(.5, .65, .8, 1), #<<
  rmse = 0 # a place to dump results
)
# grid search
tick <- proc.time()
for(i in seq_len(nrow(hyper_grid))) {
  set.seed(123)
  m <- xgb.cv(
    data = X,
    label = Y,
    nrounds = 6000,
    objective = "reg:squarederror",
    early_stopping_rounds = 50,
    nfold = 10,
    verbose = 0,
    params = list( #<<
      eta = hyper_grid$eta[i],
      max_depth = hyper_grid$max_depth[i],
      min_child_weight = hyper_grid$min_child_weight[i],
      subsample = hyper_grid$subsample[i], #<<
      colsample_bytree = hyper_grid$colsample_bytree[i] #<<
    ) #<<
  )
  hyper_grid$rmse[i] <- min(m$evaluation_log$test_rmse_mean)
}
tock <- proc.time() - tick;tock
arrange(hyper_grid, rmse)
## eta max_depth min_child_weight subsample colsample_bytree rmse
## 1 0.05 3 3 0.80 1.00 20732.22
## 2 0.05 3 3 0.50 1.00 20752.65
## 3 0.05 3 3 1.00 1.00 20989.27
## 4 0.05 3 3 0.65 1.00 21018.13
## 5 0.05 3 3 1.00 0.80 21425.00
## 6 0.05 3 3 0.80 0.80 21444.68
## 7 0.05 3 3 1.00 0.65 21597.04
## 8 0.05 3 3 0.65 0.80 21890.11
## 9 0.05 3 3 0.80 0.65 21907.55
## 10 0.05 3 3 0.65 0.65 22047.80
## 11 0.05 3 3 0.50 0.80 22211.92
## 12 0.05 3 3 0.50 0.65 22460.97
## 13 0.05 3 3 0.80 0.50 26071.94
## 14 0.05 3 3 1.00 0.50 26555.12
## 15 0.05 3 3 0.65 0.50 26577.24
## 16 0.05 3 3 0.50 0.50 26952.51
# xgb-regularize
hyper_grid <- expand.grid(
  eta = .05,
  max_depth = 3,
  min_child_weight = 3,
  subsample = .8,
  colsample_bytree = 1,
  #gamma = c(1, 100, 1000, 10000),
  #lambda = c(1e-2, 0.1, 1, 100, 1000, 10000),
  alpha = c(1e-2, 0.1, 1, 100, 1000, 10000), #<<
  rmse = 0 # a place to dump results
)
# grid search
tick <- proc.time()
for(i in seq_len(nrow(hyper_grid))) {
  set.seed(123)
  m <- xgb.cv(
    data = X,
    label = Y,
    nrounds = 6000,
    objective = "reg:squarederror",
    early_stopping_rounds = 50,
    nfold = 10,
    verbose = 0,
    params = list(
      eta = hyper_grid$eta[i],
      max_depth = hyper_grid$max_depth[i],
      min_child_weight = hyper_grid$min_child_weight[i],
      subsample = hyper_grid$subsample[i], #<<
      colsample_bytree = hyper_grid$colsample_bytree[i],
      #gamma = hyper_grid$gamma[i],
      #lambda = hyper_grid$lambda[i]#,
      alpha = hyper_grid$alpha[i] #<<
    )
  )
  hyper_grid$rmse[i] <- min(m$evaluation_log$test_rmse_mean)
}
tock <- proc.time() - tick;tock
arrange(hyper_grid, rmse)
##   eta max_depth min_child_weight subsample colsample_bytree alpha     rmse
## 1 0.05         3                3       0.8                1 1e+02 20581.31
## 2 0.05         3                3       0.8                1 1e+03 20605.30
## 3 0.05         3                3       0.8                1 1e+04 20615.19
## 4 0.05         3                3       0.8                1 1e-01 20732.23
## 5 0.05         3                3       0.8                1 1e-02 20732.23
## 6 0.05         3                3       0.8                1 1e+00 20732.23
# final-xgb-model
# parameter list
params <- list(
  eta = .05,
  max_depth = 3,
  min_child_weight = 3,
  subsample = .8,
  colsample_bytree = 1,
  alpha = 100
)
# final cv fit
set.seed(123)
tick <- proc.time()
final_cv <- xgb.cv(
  data = X,
  label = Y,
  nrounds = 6000,
  objective = "reg:squarederror",
  early_stopping_rounds = 50,
  nfold = 10,
  verbose = 0,
  params = params
)
tock <- proc.time() - tick;tock
final_cv$evaluation_log %>%
  ggplot(aes(x = iter, y = test_rmse_mean)) +
  geom_line()
# train final model
ames_final_xgb <- xgboost(
  data = X,
  label = Y,
  nrounds = final_cv$best_iteration,
  objective = "reg:squarederror",
  params = params,
  verbose = 0
)
# xgb-vip
vip::vip(ames_final_xgb, num_features = 25)
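# Alternative sketch: xgboost's built-in gain-based importance via
# xgb.importance() and xgb.plot.importance():
imp <- xgb.importance(model = ames_final_xgb)
xgb.plot.importance(imp, top_n = 25)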
# xgb-pdp
ames_final_xgb %>%
  pdp::partial(
    pred.var = "Gr_Liv_Area",
    n.trees = ames_final_xgb$niter,
    grid.resolution = 50,
    train = X
  ) %>%
  autoplot(rug = TRUE, train = X)
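# A possible extension (sketch): individual conditional expectation (ICE)
# curves for the same feature via pdp::partial(ice = TRUE), centered so the
# curves are comparable:
ames_final_xgb %>%
  pdp::partial(
    pred.var = "Gr_Liv_Area",
    ice = TRUE,
    center = TRUE,
    grid.resolution = 50,
    train = X
  ) %>%
  autoplot(alpha = 0.1, rug = TRUE, train = X)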
# top-bottom-vip
ames_vi <- vi(ames_final_xgb, feature_names = colnames(X))
feats <- c(head(ames_vi, n = 4)$Variable, tail(ames_vi, n = 4)$Variable)
pds <- lapply(feats, FUN = function(x) {
  pd <- cbind(x, pdp::partial(ames_final_xgb, pred.var = x, train = X))
  names(pd) <- c("xvar", "xval", "yhat")
  pd
})
pds <- do.call(rbind, pds)
pds$xvar <- factor(pds$xvar, levels = feats)
ggplot(pds, aes(x = xval, y = yhat)) +
  geom_line(size = 1.5) +
  geom_hline(yintercept = mean(ames$Sale_Price), linetype = 2, col = "red2") +
  facet_wrap(~ xvar, scales = "free_x", nrow = 2) +
  labs(x = "", y = "Partial dependence") +
  theme_light()
## predict on the test data
# recipe
test <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = .005) %>%
  step_integer(all_nominal()) %>%
  prep(training = ames_train) %>%
  bake(new_data = ames_test)
# obtain X for test data
xgtest <- xgb.DMatrix(as.matrix(test %>% select(-Sale_Price)))
# make predictions and compute squared errors
test <- test %>%
  mutate(prediction = predict(ames_final_xgb, xgtest)) %>%
  mutate(error = (prediction - Sale_Price)^2)
# plot predictions vs. actual sale prices
test %>% ggplot(aes(Sale_Price, prediction)) +
  geom_point()
# test RMSE
test %>%
  summarize(mean(error)) %>%
  sqrt()
save.image("6_boosting_CCB_cohort2.RData")
# total elapsed time
e1 <- Sys.time()
print(e1 - s)
Overall, RStudio is a powerful tool for machine learning work, helping users move more efficiently through data analysis and model development.