RStudio: Machine Learning Code Walkthrough (GBM and XGBoost)

        RStudio is an open-source integrated development environment (IDE) for the R programming language, built for development and data analysis. R is a popular language for statistical computing and data visualization, widely used in data science, statistics, and machine learning.

RStudio provides many powerful tools, including a code editor, a debugger, data-visualization tools, and integrated help documentation. For machine learning work, RStudio supports data preprocessing, model training, evaluation, and deployment.

R offers many packages for machine learning, such as caret, tidymodels, and tensorflow. These packages provide a rich collection of algorithms and tools for building and deploying machine learning models.
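As a small, hypothetical illustration of that ecosystem (separate from the boosting workflow below), fitting a cross-validated model with caret can be as short as this sketch; the dataset and method are chosen only for demonstration:

# illustrative only: a 5-fold cross-validated linear model on the built-in mtcars data
library(caret)
set.seed(123)
fit_demo <- train(
  mpg ~ .,
  data      = mtcars,
  method    = "lm",
  trControl = trainControl(method = "cv", number = 5)
)
fit_demo$results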

library(rlang)
library(gbm)        # gradient boosting machines
library(xgboost)    # XGBoost
library(vip)        # variable importance plots
library(pdp)        # partial dependence plots
library(ggplot2)
library(tidyverse)
library(recipes)    # feature engineering recipes
rm(list = ls())     # clear the workspace

s <- Sys.time()  # start the overall timer

# Data

# prereqs-data
# ames data
#install.packages("AmesHousing")
library(AmesHousing)
ames <- AmesHousing::make_ames()

# split data
set.seed(123)
split <- rsample::initial_split(ames, strata = "Sale_Price")
ames_train <- rsample::training(split)
ames_test  <- rsample::testing(split)

## GBM

## * Very popular
## * Very accurate
## * Similar in spirit to random forests, but fundamentally different (see the sketch below)
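
To make the "similar to random forests, but different" point concrete, here is a minimal hand-rolled sketch of the boosting idea using rpart stumps fit to residuals. It illustrates the concept only, not how the gbm package is implemented; the function name boost_sketch is hypothetical:

# Boosting in a nutshell (sketch): each new tree is fit to the residuals of the
# current ensemble, and its predictions are added back with a small learning rate.
library(rpart)

boost_sketch <- function(data, n_trees = 100, shrinkage = 0.1) {
  pred  <- rep(mean(data$Sale_Price), nrow(data))  # start from the mean prediction
  trees <- vector("list", n_trees)
  for (b in seq_len(n_trees)) {
    data$resid <- data$Sale_Price - pred           # residuals of the current ensemble
    trees[[b]] <- rpart(resid ~ . - Sale_Price, data = data,
                        maxdepth = 1, cp = 0)      # a shallow "stump"
    pred <- pred + shrinkage * predict(trees[[b]], newdata = data)
  }
  list(trees = trees, fitted = pred)
}

# e.g. fit_sketch <- boost_sketch(as.data.frame(ames_train), n_trees = 50)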

## First GBM model

# shrinkage: learning rate = 0.1 is the default

# basic-gbm
tick <- proc.time()
set.seed(123)
ames_gbm <- gbm(
  formula = Sale_Price ~ .,
  data = ames_train,
  distribution = "gaussian", # or bernoulli, multinomial, etc. 
  n.trees = 5000, 
  shrinkage = 0.1, 
  interaction.depth = 1, 
  n.minobsinnode = 10, 
  cv.folds = 5 
)  
tock <- proc.time() - tick;tock

# find index for n trees with minimum CV error
min_MSE <- which.min(ames_gbm$cv.error)

# get MSE and compute RMSE
sqrt(ames_gbm$cv.error[min_MSE])

# check for overfitting

gbm.perf(ames_gbm, method = "cv") # or "OOB"
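
gbm.perf() also returns the estimated optimal number of trees in addition to drawing the performance plot. As a minimal sketch (this step is not part of the original script), that value can be reused directly for prediction:

# capture the CV-optimal number of trees and reuse it for prediction (sketch)
best_n_trees <- gbm.perf(ames_gbm, method = "cv", plot.it = FALSE)
head(predict(ames_gbm, newdata = ames_test, n.trees = best_n_trees))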

## Tuning

## * Start with the default learning rate of 0.1, or something in the 0.05-0.2 range
## * Make sure the number of trees does not overfit
## * Tune the learning rate
## * Tune the tree-specific parameters
## * Possibly lower the learning rate and increase the number of trees

## Fix the tree parameters

## Tree depth of 3

## Default minimum observations per node

## Set the learning rate to 0.01

## More CV folds give a better error estimate

## ~1 minute

## Uses up all of the trees (the CV optimum hits the limit)

# tune1
set.seed(123)
tick <- proc.time()
ames_gbm1 <- gbm(
  formula = Sale_Price ~ .,
  data = ames_train,
  distribution = "gaussian", # or bernoulli, multinomial, etc. #<<
  n.trees = 5000, #<<
  shrinkage = 0.01, #<<
  interaction.depth = 3, #<<
  n.minobsinnode = 10, #<<
  cv.folds = 10 #<<
)
tock <- proc.time() - tick;tock

# find index for n trees with minimum CV error
min_MSE <- which.min(ames_gbm1$cv.error)

# get MSE and compute RMSE
sqrt(ames_gbm1$cv.error[min_MSE])

gbm.perf(ames_gbm1, method = "cv")


## Increase the number of trees

## Try a learning rate of 0.001

## Less accurate

## ~4 minutes

# tune2
set.seed(123)
tick <- proc.time()
ames_gbm1 <- gbm(
  formula = Sale_Price ~ .,
  data = ames_train,
  distribution = "gaussian", # or bernoulli, multinomial, etc. #<<
  n.trees = 20000, #<<
  shrinkage = 0.001, #<<
  interaction.depth = 3, #<<
  n.minobsinnode = 10, #<<
  cv.folds = 10 #<<
)
tock <- proc.time() - tick;tock

# find index for n trees with minimum CV error
min_MSE <- which.min(ames_gbm1$cv.error)

# get MSE and compute RMSE
sqrt(ames_gbm1$cv.error[min_MSE])

gbm.perf(ames_gbm1, method = "cv")




# Tune the tree parameters: ~30 minutes

# assess 3 values for tree depth

# assess 3 values for min obs


# tune12
# eval=FALSE or cache=TRUE (R Markdown chunk option, since this grid search is slow)
# search grid
hyper_grid <- expand.grid(
  n.trees = 6000,
  shrinkage = .01,
  interaction.depth = c(3, 5, 7), #<<
  n.minobsinnode = c(5, 10, 15) #<<
)

model_fit <- function(n.trees, shrinkage, interaction.depth, n.minobsinnode) {
  set.seed(123)
  m <- gbm(
    formula = Sale_Price ~ .,
    data = ames_train,
    distribution = "gaussian",
    n.trees = n.trees,
    shrinkage = shrinkage, #<<
    interaction.depth = interaction.depth, #<<
    n.minobsinnode = n.minobsinnode,
    cv.folds = 10
  )
  # compute RMSE
  sqrt(min(m$cv.error))
}

tick <- proc.time()
hyper_grid$rmse <- purrr::pmap_dbl(
  hyper_grid,
  ~ model_fit(
    n.trees = ..1,
    shrinkage = ..2,
    interaction.depth = ..3,
    n.minobsinnode = ..4
  )
)
tock <- proc.time() - tick;tock

arrange(hyper_grid, rmse)
##   n.trees shrinkage interaction.depth n.minobsinnode     rmse
## 1    6000      0.01                 7              5 20683.65
## 2    6000      0.01                 5              5 20746.36
## 3    6000      0.01                 7             10 21114.62
## 4    6000      0.01                 3              5 21126.33
## 5    6000      0.01                 5             10 21228.68
## 6    6000      0.01                 7             15 21246.73
## 7    6000      0.01                 5             15 21272.91
## 8    6000      0.01                 3             10 21767.08
## 9    6000      0.01                 3             15 21862.35


# Add stochasticity (stochastic gradient boosting)

# * try bag.fraction values of 0.5-0.8
# * may need to zoom in further
# * ~10 minutes


# stochastic
bag_frac <- c(.5, .65, .8) #<<

tick <- proc.time()
for(i in bag_frac) {
  set.seed(123)
  m <- gbm(
    formula = Sale_Price ~ .,
    data = ames_train,
    distribution = "gaussian",
    n.trees = 6000, 
    shrinkage = 0.01, 
    interaction.depth = 7, 
    n.minobsinnode = 5,
    bag.fraction = i, #<<
    cv.folds = 10 
  )
  # compute RMSE
  print(sqrt(min(m$cv.error)))
}

tock <- proc.time() - tick;tock

## [1] 20683.65
## [1] 20398.93
## [1] 20287.76

## XGBoost

## XGBoost requires an all-numeric feature matrix (e.g., one-hot or integer encoding)

## caret can one-hot encode automatically

## * step_other: lump low-frequency categorical levels into an "other" level

## * step_integer: encode categoricals as ordered integers, which improves speed and performance (a one-hot alternative is sketched below)

# xgb-feature-prep
xgb_prep <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = .005) %>%
  step_integer(all_nominal()) %>%
  prep(training = ames_train, retain = TRUE) %>%
  juice()

X <- as.matrix(xgb_prep[setdiff(names(xgb_prep), "Sale_Price")])
Y <- xgb_prep$Sale_Price


# First XGBoost model

# takes roughly a few seconds

# early_stopping_rounds: stop when the CV RMSE has not improved for 50 trees

# https://xgboost.readthedocs.io/en/latest/parameter.html

# default eta = 0.3

# basic-xgboost
tick <- proc.time()
set.seed(123)
ames_xgb <- xgb.cv(
  data = X,
  label = Y,
  nrounds = 5000,
  objective = "reg:squarederror",
  early_stopping_rounds = 50, 
  nfold = 10,
  verbose = 0
)  
tock <- proc.time() - tick;tock

ames_xgb$evaluation_log %>% tail()
##    iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
## 1:  195        600.2057       55.89026       24382.85      1725.531
## 2:  196        591.9463       55.26891       24381.77      1725.488
## 3:  197        582.9737       53.65047       24382.20      1724.203
## 4:  198        572.7220       52.89840       24382.79      1723.236
## 5:  199        561.9686       49.76037       24382.41      1724.374
## 6:  200        554.3745       48.24520       24382.83      1723.857


# xgboost-eta
set.seed(123)
ames_xgb <- xgb.cv(
  data = X,
  label = Y,
  nrounds = 6000,
  objective = "reg:squarederror",
  early_stopping_rounds = 50, 
  nfold = 10,
  verbose = 0,
  params = list(eta = .05) #<<
)  

ames_xgb$evaluation_log %>% tail()
##    iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
## 1: 1023        1034.059       51.96559       21879.08      1248.379
## 2: 1024        1032.371       52.17706       21879.89      1248.430
## 3: 1025        1030.903       52.01260       21879.53      1248.890
## 4: 1026        1029.302       51.47370       21879.29      1249.095
## 5: 1027        1027.053       50.80923       21879.43      1248.593
## 6: 1028        1025.107       50.38078       21879.46      1249.280

## - eta = .3 (default): RMSE 24,382 with 200 trees (< 1 min)
## - eta = .1: 22,333 with 398 trees (< 1 min)
## - eta = .05: 21,877 with 978 trees (~1.5 min)
## - eta = .01: 22,094 with 2,843 trees (~4 min)
## - with one-hot encoded features: ~30 min


# * Learning rate = 0.05
# * 6,000 trees
# * Early stopping
# * Tune:
# - tree depth
# - min child weight

# ~30 minutes

# xgboost-tree-specific
# grid
hyper_grid <- expand.grid(
  eta = .05,
  max_depth = c(1, 3, 5, 7, 9), #<<
  min_child_weight = c(1, 3, 5, 7, 9), #<<
  rmse = 0 # a place to dump results
)

# grid search
tick <- proc.time()
for(i in seq_len(nrow(hyper_grid))) {
  set.seed(123)
  m <- xgb.cv(
    data = X,
    label = Y,
    nrounds = 6000,
    objective = "reg:squarederror",
    early_stopping_rounds = 50, 
    nfold = 10,
    verbose = 0,
    params = list( #<<
      eta = hyper_grid$eta[i], #<<
      max_depth = hyper_grid$max_depth[i], #<<
      min_child_weight = hyper_grid$min_child_weight[i] #<<
    ) #<<
  )
  hyper_grid$rmse[i] <- min(m$evaluation_log$test_rmse_mean)
}
tock <- proc.time() - tick;tock

arrange(hyper_grid, rmse)
##     eta max_depth min_child_weight     rmse
## 1  0.05         3                3 20989.27
## 2  0.05         3                1 21062.92
## 3  0.05         3                5 21453.00
## 4  0.05         5                1 21685.04
## 5  0.05         5                3 21748.12
## 6  0.05         3                7 22058.70
## 7  0.05         7                1 22110.09
## 8  0.05         3                9 22181.64
## 9  0.05         5                5 22185.64
## 10 0.05         7                3 22468.97
## 11 0.05         9                1 22632.80
## 12 0.05         9                3 22664.75
## 13 0.05         7                5 22721.41
## 14 0.05         5                7 22801.02
## 15 0.05         7                7 22951.21
## 16 0.05         5                9 22970.42
## 17 0.05         9                5 22987.65
## 18 0.05         9                7 23212.10
## 19 0.05         1                1 23252.66
## 20 0.05         7                9 23452.08
## 21 0.05         9                9 23714.29
## 22 0.05         1                3 24147.17
## 23 0.05         1                5 25089.55
## 24 0.05         1                7 26534.20
## 25 0.05         1                9 26784.77

# * Learning rate = 0.05
# * 6,000 trees
# * Early stopping
# * Tree depth = 3
# * Min child weight = 3
# * Tune the stochastic components:
# - subsampling rows for each tree
# - subsampling columns for each tree

# ~12 minutes

# xgb-stochastic-grid
# grid
hyper_grid <- expand.grid(
  eta = .05,
  max_depth = 3, 
  min_child_weight = 3,
  subsample = c(.5, .65, .8, 1), #<<
  colsample_bytree = c(.5, .65, .8, 1), #<<
  rmse = 0 # a place to dump results
)

# grid search
tick <- proc.time()
for(i in seq_len(nrow(hyper_grid))) {
  set.seed(123)
  m <- xgb.cv(
    data = X,
    label = Y,
    nrounds = 6000,
    objective = "reg:squarederror",
    early_stopping_rounds = 50, 
    nfold = 10,
    verbose = 0,
    params = list( #<<
      eta = hyper_grid$eta[i],
      max_depth = hyper_grid$max_depth[i],
      min_child_weight = hyper_grid$min_child_weight[i],
      subsample = hyper_grid$subsample[i], #<<
      colsample_bytree = hyper_grid$colsample_bytree[i] #<<
    ) #<<
  )
  hyper_grid$rmse[i] <- min(m$evaluation_log$test_rmse_mean)
}
tock <- proc.time() - tick;tock

arrange(hyper_grid, rmse)
##     eta max_depth min_child_weight subsample colsample_bytree     rmse
## 1  0.05         3                3      0.80             1.00 20732.22
## 2  0.05         3                3      0.50             1.00 20752.65
## 3  0.05         3                3      1.00             1.00 20989.27
## 4  0.05         3                3      0.65             1.00 21018.13
## 5  0.05         3                3      1.00             0.80 21425.00
## 6  0.05         3                3      0.80             0.80 21444.68
## 7  0.05         3                3      1.00             0.65 21597.04
## 8  0.05         3                3      0.65             0.80 21890.11
## 9  0.05         3                3      0.80             0.65 21907.55
## 10 0.05         3                3      0.65             0.65 22047.80
## 11 0.05         3                3      0.50             0.80 22211.92
## 12 0.05         3                3      0.50             0.65 22460.97
## 13 0.05         3                3      0.80             0.50 26071.94
## 14 0.05         3                3      1.00             0.50 26555.12
## 15 0.05         3                3      0.65             0.50 26577.24
## 16 0.05         3                3      0.50             0.50 26952.51

# xgb-regularize
hyper_grid <- expand.grid(
  eta = .05,
  max_depth = 3, 
  min_child_weight = 3,
  subsample = .8, 
  colsample_bytree = 1,
  #gamma = c(1, 100, 1000, 10000),
  #lambda = c(1e-2, 0.1, 1, 100, 1000, 10000),
  alpha = c(1e-2, 0.1, 1, 100, 1000, 10000), #<<
  rmse = 0 # a place to dump results
)

# grid search
tick <- proc.time()
for(i in seq_len(nrow(hyper_grid))) {
  set.seed(123)
  m <- xgb.cv(
    data = X,
    label = Y,
    nrounds = 6000,
    objective = "reg:squarederror",
    early_stopping_rounds = 50, 
    nfold = 10,
    verbose = 0,
    params = list( 
      eta = hyper_grid$eta[i], 
      max_depth = hyper_grid$max_depth[i],
      min_child_weight = hyper_grid$min_child_weight[i],
      subsample = hyper_grid$subsample[i], #<<
      colsample_bytree = hyper_grid$colsample_bytree[i],
      #gamma = hyper_grid$gamma[i], 
      #lambda = hyper_grid$lambda[i]#, 
      alpha = hyper_grid$alpha[i] #<<
    ) 
  )
  hyper_grid$rmse[i] <- min(m$evaluation_log$test_rmse_mean)
}
tock <- proc.time() - tick;tock

arrange(hyper_grid, rmse)
##    eta max_depth min_child_weight subsample colsample_bytree alpha     rmse
## 1 0.05         3                3       0.8                1 1e+02 20581.31
## 2 0.05         3                3       0.8                1 1e+03 20605.30
## 3 0.05         3                3       0.8                1 1e+04 20615.19
## 4 0.05         3                3       0.8                1 1e-01 20732.23
## 5 0.05         3                3       0.8                1 1e-02 20732.23
## 6 0.05         3                3       0.8                1 1e+00 20732.23

# final-xgb-model
# parameter list
params <- list(
  eta = .05,
  max_depth = 3, 
  min_child_weight = 3,
  subsample = .8, 
  colsample_bytree = 1,
  alpha = 100
)

# final cv fit
set.seed(123)
tick <- proc.time()
final_cv <- xgb.cv(
  data = X,
  label = Y,
  nrounds = 6000,
  objective = "reg:squarederror",
  early_stopping_rounds = 50, 
  nfold = 10,
  verbose = 0,
  params = params 
) 
tock <- proc.time() - tick;tock


final_cv$evaluation_log %>%
  ggplot(aes(x=iter,y=test_rmse_mean)) +
  geom_line()

# train final model
ames_final_xgb <- xgboost(
  data = X,
  label = Y,
  nrounds = final_cv$best_iteration, 
  objective = "reg:squarederror",
  params = params, 
  verbose = 0
)
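
If the final model needs to be persisted for later use or deployment (as mentioned in the introduction), xgboost's own serialization functions can be used. A minimal sketch, with a hypothetical file name:

# save the trained booster to disk and reload it later (sketch)
xgb.save(ames_final_xgb, "ames_final_xgb.model")
ames_xgb_reloaded <- xgb.load("ames_final_xgb.model")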

# xgb-vip
vip::vip(ames_final_xgb, num_features = 25)


# xgb-pdp
ames_final_xgb %>%
  pdp::partial(
    pred.var = "Gr_Liv_Area", 
    n.trees = ames_final_xgb$niter, 
    grid.resolution = 50, 
    train = X
  ) %>%
  autoplot(rug = TRUE, train = X)

# top-bottom-vip
ames_vi <- vi(ames_final_xgb, feature_names = colnames(X))
feats <- c(head(ames_vi, n = 4)$Variable, tail(ames_vi, n = 4)$Variable)
pds <- lapply(feats, FUN = function(x) {
  pd <- cbind(x, pdp::partial(ames_final_xgb, pred.var = x, train = X))
  names(pd) <- c("xvar", "xval", "yhat")
  pd
})
pds <- do.call(rbind, pds)
pds$xvar <- factor(pds$xvar,levels=feats)
ggplot(pds, aes(x = xval, y = yhat)) +
  geom_line(size = 1.5) +
  geom_hline(yintercept = mean(ames$Sale_Price), linetype = 2, col = "red2") +
  facet_wrap( ~ xvar, scales = "free_x", nrow = 2) +
  labs(x = "", y = "Partial dependence") +
  theme_light()

## Predictions on the test data

# recipe
test <- recipe(Sale_Price ~ ., data = ames_train) %>%
  step_other(all_nominal(), threshold = .005) %>%
  step_integer(all_nominal()) %>%
  prep(training = ames_train) %>%
  bake(new_data = ames_test)

# obtain X for test data
xgtest <- xgb.DMatrix(as.matrix(test %>% select(-Sale_Price)))

# make prediction and calculate error
test <- test %>%
  mutate(prediction = predict(ames_final_xgb, xgtest)) %>%
  mutate(error = (prediction-Sale_Price)^2)

# plot
test %>% ggplot(aes(Sale_Price,prediction)) +
  geom_point()

# test error
test %>%
  summarize(mean(error)) %>%
  sqrt()

save.image("6_boosting_CCB_cohort2.RData")

# total elapsed time
e1<-Sys.time()
print(e1-s)

        Overall, RStudio is a powerful tool for machine learning work, helping users move more efficiently through data analysis and model development.
