library(tidyverse) # for graphing and data cleaning
library(tidymodels) # for modeling
library(stacks) # for stacking models
library(naniar) # for analyzing missing values
library(vip) # for variable importance plots
library(usemodels) # for tidymodels suggestions
library(xgboost) # for boosting - need to install, don't need to load
library(doParallel) # for parallel processing
library(lubridate) # for dates
library(moderndive) # for King County housing data
library(patchwork) # for combining plots nicely
library(rmarkdown) # for paged tables
theme_set(theme_minimal()) # Lisa's favorite theme
Lasso Model
pollution <- read.csv("data_standardized.csv")
pollution %>%
select(where(is.numeric)) %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "value") %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(vars(variable),
scales = "free",
nrow = 5)
pollution %>%
select(Breast_Surgery_Ct) %>%
ggplot(aes(x = Breast_Surgery_Ct)) +
geom_histogram()
for(i in 1:ncol(pollution)) {
median <- median(pollution[,i],na.rm = TRUE)
for (j in 1:nrow(pollution)){
if (is.na(pollution[j, i])){
pollution[j, i] <- median
}
}
}
pollution_mod <- pollution %>%
mutate(log_breast = log(Breast_Surgery_Ct, base = 10)) %>%
filter(log_breast != "-Inf") %>%
select(-Breast_Surgery_Ct) %>%
mutate(across(where(is.character), as.factor)) %>%
add_n_miss() %>%
filter(n_miss_all == 0) %>%
select(-n_miss_all)
set.seed(456)
pollution_split <- initial_split(pollution_mod, prop = .75)
pollution_training <- training(pollution_split)
pollution_testing <- testing(pollution_split)
pollution_recipe <- recipe(log_breast ~ ., data = pollution_training) %>%
step_rm(County,
asthma_er_avg,
Year,
All.causes..total.,
Alzheimer.s.disease,
Malignant.neoplasms,
Chronic.lower.respiratory.diseases,
Diabetes.mellitus,
Assault..homicide.,
Diseases.of.heart,
Essential.hypertension.and.hypertensive.renal.disease,
Accidents..unintentional.injuries.,
Chronic.liver.disease.and.cirrhosis,
Nephritis..nephrotic.syndrome.and.nephrosis,
Parkinson.s.disease,
Influenza.and.pneumonia,
Cerebrovascular.diseases,
Intentional.self.harm..suicide.,
FIPS,
cancer_incidence_rate,
asthma_deaths,
Bladder_Surgery_Ct,
Brain_Surgery_Ct,
# Breast_Surgery_Ct,
Colon_Surgery_Ct,
Esophagus_Surgery_Ct,
Liver_Surgery_Ct,
Lung_Surgery_Ct,
Pancreas_Surgery_Ct,
Prostate_Surgery_Ct,
Rectum_Surgery_Ct,
Stomach_Surgery_Ct) %>%
step_normalize(all_predictors(),
-all_nominal()) %>%
step_dummy(all_nominal(),
-all_outcomes())
pollution_recipe %>%
prep(pollution_training) %>%
juice()
## # A tibble: 110 × 33
## annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
## <dbl> <dbl> <dbl> <dbl>
## 1 0.791 1.99 -0.419 1.24
## 2 -0.757 -0.0177 -0.170 -0.416
## 3 0.0652 -0.0849 -0.170 0.159
## 4 1.03 -0.930 -0.170 -0.818
## 5 -0.816 0.169 -0.921 -0.952
## 6 -0.592 -0.0389 -0.818 -0.432
## 7 0.882 -0.0221 -0.170 -0.134
## 8 1.71 0.718 -0.170 -1.08
## 9 -1.03 -1.37 -0.170 -1.38
## 10 0.728 -0.710 -0.170 -0.715
## # … with 100 more rows, and 29 more variables: annual_mean_co <dbl>,
## # annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## # economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## # neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## # insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## # commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## # employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
pollution_lasso_mod <-
linear_reg(mixture = 1) %>%
set_engine("glmnet") %>%
set_args(penalty = tune()) %>%
set_mode("regression")
pollution_lasso_wf <-
workflow() %>%
add_recipe(pollution_recipe) %>%
add_model(pollution_lasso_mod)
penalty_grid <- grid_regular(penalty(),
levels = 20)
penalty_grid
## # A tibble: 20 × 1
## penalty
## <dbl>
## 1 1 e-10
## 2 3.36e-10
## 3 1.13e- 9
## 4 3.79e- 9
## 5 1.27e- 8
## 6 4.28e- 8
## 7 1.44e- 7
## 8 4.83e- 7
## 9 1.62e- 6
## 10 5.46e- 6
## 11 1.83e- 5
## 12 6.16e- 5
## 13 2.07e- 4
## 14 6.95e- 4
## 15 2.34e- 3
## 16 7.85e- 3
## 17 2.64e- 2
## 18 8.86e- 2
## 19 2.98e- 1
## 20 1 e+ 0
ctrl_grid <- control_stack_grid()
set.seed(456)
pollution_cv <- vfold_cv(pollution_training, v = 5)
pollution_lasso_tune <-
pollution_lasso_wf %>%
tune_grid(
resamples = pollution_cv,
grid = penalty_grid,
control = ctrl_grid
)
pollution_lasso_tune
## # Tuning results
## # 5-fold cross-validation
## # A tibble: 5 × 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [88/22]> Fold1 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>
## 2 <split [88/22]> Fold2 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>
## 3 <split [88/22]> Fold3 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>
## 4 <split [88/22]> Fold4 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>
## 5 <split [88/22]> Fold5 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>
pollution_lasso_tune %>%
select(id, .metrics) %>%
unnest(.metrics) %>%
filter(.metric == "rmse")
## # A tibble: 100 × 6
## id penalty .metric .estimator .estimate .config
## <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 Fold1 1 e-10 rmse standard 0.313 Preprocessor1_Model01
## 2 Fold1 3.36e-10 rmse standard 0.313 Preprocessor1_Model02
## 3 Fold1 1.13e- 9 rmse standard 0.313 Preprocessor1_Model03
## 4 Fold1 3.79e- 9 rmse standard 0.313 Preprocessor1_Model04
## 5 Fold1 1.27e- 8 rmse standard 0.313 Preprocessor1_Model05
## 6 Fold1 4.28e- 8 rmse standard 0.313 Preprocessor1_Model06
## 7 Fold1 1.44e- 7 rmse standard 0.313 Preprocessor1_Model07
## 8 Fold1 4.83e- 7 rmse standard 0.313 Preprocessor1_Model08
## 9 Fold1 1.62e- 6 rmse standard 0.313 Preprocessor1_Model09
## 10 Fold1 5.46e- 6 rmse standard 0.313 Preprocessor1_Model10
## # … with 90 more rows
pollution_lasso_tune %>%
collect_metrics() %>%
filter(.metric == "rmse") %>%
ggplot(aes(x = penalty, y = mean)) +
geom_point() +
geom_line() +
scale_x_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10",scales::math_format(10^.x))) +
labs(x = "penalty", y = "rmse")
pollution_lasso_tune %>%
show_best(metric = "rmse")
## # A tibble: 5 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00234 rmse standard 0.296 5 0.0179 Preprocessor1_Model15
## 2 0.00785 rmse standard 0.297 5 0.0163 Preprocessor1_Model16
## 3 0.000695 rmse standard 0.300 5 0.0177 Preprocessor1_Model14
## 4 0.000207 rmse standard 0.302 5 0.0179 Preprocessor1_Model13
## 5 0.0000616 rmse standard 0.302 5 0.0177 Preprocessor1_Model12
best_param <- pollution_lasso_tune %>%
select_best(metric = "rmse")
best_param
## # A tibble: 1 × 2
## penalty .config
## <dbl> <chr>
## 1 0.00234 Preprocessor1_Model15
one_se_param <- pollution_lasso_tune %>%
select_by_one_std_err(metric = "rmse", desc(penalty))
one_se_param
## # A tibble: 1 × 9
## penalty .metric .estimator mean n std_err .config .best .bound
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr> <dbl> <dbl>
## 1 0.00785 rmse standard 0.297 5 0.0163 Preprocessor1_Mod… 0.296 0.314
pollution_lasso_final_wf <- pollution_lasso_wf %>%
finalize_workflow(one_se_param)
pollution_lasso_final_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
##
## Main Arguments:
## penalty = 0.00784759970351461
## mixture = 1
##
## Computational engine: glmnet
pollution_lasso_final_mod <- pollution_lasso_final_wf %>%
fit(data = pollution_training)
pollution_lasso_final_mod %>%
pull_workflow_fit() %>%
tidy()
## # A tibble: 33 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) -3.33 0.00785
## 2 annual_mean_pm25 0.0139 0.00785
## 3 annual_mean_ozone -0.0164 0.00785
## 4 annual_mean_pb -0.00664 0.00785
## 5 annual_mean_pm10 0 0.00785
## 6 annual_mean_co -0.0354 0.00785
## 7 annual_mean_no2 0.0812 0.00785
## 8 annual_mean_so2 0.0203 0.00785
## 9 hpi2score 0 0.00785
## 10 economic 0 0.00785
## # … with 23 more rows
pollution_lasso_test <- pollution_lasso_final_wf %>%
last_fit(pollution_split)
# Metrics for model applied to test data
pollution_lasso_test %>%
collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.292 Preprocessor1_Model1
## 2 rsq standard 0.576 Preprocessor1_Model1
Xgboost model
use_xgboost(log_breast ~ ., data = pollution_training)
## xgboost_recipe <-
## recipe(formula = log_breast ~ ., data = pollution_training) %>%
## step_novel(all_nominal(), -all_outcomes()) %>%
## step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("regression") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(48676)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
boost_recipe <-
recipe(formula = log_breast ~ ., data = pollution_training) %>%
step_rm(County,
Year,
All.causes..total.,
Alzheimer.s.disease,
Malignant.neoplasms,
Chronic.lower.respiratory.diseases,
Diabetes.mellitus,
Diseases.of.heart,
Assault..homicide.,
Essential.hypertension.and.hypertensive.renal.disease,
Accidents..unintentional.injuries.,
Chronic.liver.disease.and.cirrhosis,
Nephritis..nephrotic.syndrome.and.nephrosis,
Parkinson.s.disease,
Influenza.and.pneumonia,
Cerebrovascular.diseases,
Intentional.self.harm..suicide.,
FIPS,
cancer_incidence_rate,
asthma_deaths,
Bladder_Surgery_Ct,
Brain_Surgery_Ct,
#Breast_Surgery_Ct,
Colon_Surgery_Ct,
Esophagus_Surgery_Ct,
Liver_Surgery_Ct,
Lung_Surgery_Ct,
Pancreas_Surgery_Ct,
Prostate_Surgery_Ct,
Rectum_Surgery_Ct,
asthma_er_avg,
Stomach_Surgery_Ct) %>%
step_novel(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors(),
-all_nominal())
boost_recipe %>%
prep() %>%
juice()
## # A tibble: 110 × 33
## annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
## <dbl> <dbl> <dbl> <dbl>
## 1 0.791 1.99 -0.419 1.24
## 2 -0.757 -0.0177 -0.170 -0.416
## 3 0.0652 -0.0849 -0.170 0.159
## 4 1.03 -0.930 -0.170 -0.818
## 5 -0.816 0.169 -0.921 -0.952
## 6 -0.592 -0.0389 -0.818 -0.432
## 7 0.882 -0.0221 -0.170 -0.134
## 8 1.71 0.718 -0.170 -1.08
## 9 -1.03 -1.37 -0.170 -1.38
## 10 0.728 -0.710 -0.170 -0.715
## # … with 100 more rows, and 29 more variables: annual_mean_co <dbl>,
## # annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## # economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## # neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## # insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## # commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## # employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
boost_spec <- boost_tree(
trees = 1000, # number of trees, T in the equations above
tree_depth = 2, # max number of splits in the tree
min_n = 5, # min points required for node to be further split
loss_reduction = 10^-5, # when to stop - smaller = more since it only has to get a little bit better
sample_size = 1, # proportion of training data to use
learn_rate = tune(), # lambda from the equations above
stop_iter = 50 # number of iterations w/o improvement b4 stopping
) %>%
set_engine("xgboost", colsample_bytree = 1) %>%
set_mode("regression")
boost_grid <- grid_regular(learn_rate(),
levels = 10)
boost_grid
## # A tibble: 10 × 1
## learn_rate
## <dbl>
## 1 0.0000000001
## 2 0.000000001
## 3 0.00000001
## 4 0.0000001
## 5 0.000001
## 6 0.00001
## 7 0.0001
## 8 0.001
## 9 0.01
## 10 0.1
boost_wf <- workflow() %>%
add_recipe(boost_recipe) %>%
add_model(boost_spec)
set.seed(456)
val_split <- validation_split(pollution_training,
prop = .6)
val_split
## # Validation Set Split (0.6/0.4)
## # A tibble: 1 × 2
## splits id
## <list> <chr>
## 1 <split [66/44]> validation
set.seed(456)
registerDoParallel()
boost_tune <- boost_wf %>%
tune_grid(
# resamples = val_split,
resamples = pollution_cv,
grid = boost_grid,
control = ctrl_grid
)
collect_metrics(boost_tune)
## # A tibble: 20 × 7
## learn_rate .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 rmse standard 3.85 5 0.0201 Preprocessor1_Model01
## 2 0.0000000001 rsq standard NaN 0 NA Preprocessor1_Model01
## 3 0.000000001 rmse standard 3.85 5 0.0201 Preprocessor1_Model02
## 4 0.000000001 rsq standard NaN 0 NA Preprocessor1_Model02
## 5 0.00000001 rmse standard 3.85 5 0.0201 Preprocessor1_Model03
## 6 0.00000001 rsq standard NaN 0 NA Preprocessor1_Model03
## 7 0.0000001 rmse standard 3.85 5 0.0201 Preprocessor1_Model04
## 8 0.0000001 rsq standard NaN 0 NA Preprocessor1_Model04
## 9 0.000001 rmse standard 3.84 5 0.0201 Preprocessor1_Model05
## 10 0.000001 rsq standard NaN 0 NA Preprocessor1_Model05
## 11 0.00001 rmse standard 3.81 5 0.0201 Preprocessor1_Model06
## 12 0.00001 rsq standard NaN 0 NA Preprocessor1_Model06
## 13 0.0001 rmse standard 3.49 5 0.0206 Preprocessor1_Model07
## 14 0.0001 rsq standard NaN 0 NA Preprocessor1_Model07
## 15 0.001 rmse standard 1.47 5 0.0246 Preprocessor1_Model08
## 16 0.001 rsq standard 0.188 5 0.0416 Preprocessor1_Model08
## 17 0.01 rmse standard 0.230 5 0.0295 Preprocessor1_Model09
## 18 0.01 rsq standard 0.664 5 0.0816 Preprocessor1_Model09
## 19 0.1 rmse standard 0.213 5 0.0350 Preprocessor1_Model10
## 20 0.1 rsq standard 0.704 5 0.104 Preprocessor1_Model10
collect_metrics(boost_tune) %>%
filter(.metric == "rmse") %>%
ggplot(aes(x = learn_rate, y = mean)) +
geom_point() +
geom_line() +
scale_x_log10() +
labs(y = "rmse") +
theme_minimal()
best_lr <- select_best(boost_tune, "rmse")
best_lr
## # A tibble: 1 × 2
## learn_rate .config
## <dbl> <chr>
## 1 0.1 Preprocessor1_Model10
# finalize workflow
final_boost_wf <- finalize_workflow(
boost_wf,
best_lr
)
# fit final
final_boost <- final_boost_wf %>%
fit(data = pollution_training)
final_boost %>%
pull_workflow_fit() %>%
vip(geom = "col")
# Use model on test data
test_preds <- pollution_testing %>%
bind_cols(predict(final_boost, new_data = pollution_testing))
# Compute test rmse
test_preds %>%
summarize(rmse = sqrt(mean((log_breast - .pred)^2))) %>%
pull(rmse)
## [1] 0.1397386
# Graph results
test_preds %>%
ggplot(aes(x = log_breast,
y = .pred)) +
geom_point(alpha = .5,
size = .5) +
geom_smooth(se = FALSE) +
geom_abline(slope = 1,
intercept = 0,
color = "darkred") +
labs(x = "Actual log(asthma)",
y = "Predicted log(asthma)")
Random Forest Model
pollution_rfrecipe <- recipe(log_breast ~ ., data = pollution_training) %>%
step_rm(County,
Year,
All.causes..total.,
Alzheimer.s.disease,
Malignant.neoplasms,
Chronic.lower.respiratory.diseases,
Diabetes.mellitus,
Assault..homicide.,
Diseases.of.heart,
asthma_er_avg,
Essential.hypertension.and.hypertensive.renal.disease,
Accidents..unintentional.injuries.,
Chronic.liver.disease.and.cirrhosis,
Nephritis..nephrotic.syndrome.and.nephrosis,
Parkinson.s.disease,
Influenza.and.pneumonia,
Cerebrovascular.diseases,
Intentional.self.harm..suicide.,
FIPS,
cancer_incidence_rate,
asthma_deaths,
Bladder_Surgery_Ct,
Brain_Surgery_Ct,
# Breast_Surgery_Ct,
Colon_Surgery_Ct,
Esophagus_Surgery_Ct,
Liver_Surgery_Ct,
Lung_Surgery_Ct,
Pancreas_Surgery_Ct,
Prostate_Surgery_Ct,
Rectum_Surgery_Ct,
Stomach_Surgery_Ct) %>%
step_normalize(all_predictors(),
-all_nominal()) %>%
step_dummy(all_nominal(),
-all_outcomes())
pollution_rfrecipe %>%
prep(pollution_training) %>%
juice()
## # A tibble: 110 × 33
## annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
## <dbl> <dbl> <dbl> <dbl>
## 1 0.791 1.99 -0.419 1.24
## 2 -0.757 -0.0177 -0.170 -0.416
## 3 0.0652 -0.0849 -0.170 0.159
## 4 1.03 -0.930 -0.170 -0.818
## 5 -0.816 0.169 -0.921 -0.952
## 6 -0.592 -0.0389 -0.818 -0.432
## 7 0.882 -0.0221 -0.170 -0.134
## 8 1.71 0.718 -0.170 -1.08
## 9 -1.03 -1.37 -0.170 -1.38
## 10 0.728 -0.710 -0.170 -0.715
## # … with 100 more rows, and 29 more variables: annual_mean_co <dbl>,
## # annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## # economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## # neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## # insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## # commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## # employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …
ranger_pollution <-
rand_forest(mtry = tune(),
min_n = tune(),
trees = 200) %>%
set_mode("regression") %>%
set_engine("ranger")
pollution_rfworkflow <-
workflow() %>%
add_recipe(pollution_rfrecipe) %>%
add_model(ranger_pollution)
# Set up penalty grid model
rf_penalty_grid <- grid_regular(finalize(mtry(),
pollution_training %>%
select(-log_breast)),
min_n(),
levels = 3)
set.seed(456)
pollution_cv <- vfold_cv(pollution_training, v = 5)
# Tune model
pollution_rfTUNE <-
pollution_rfworkflow %>%
tune_grid(
resamples = pollution_cv,
grid = rf_penalty_grid,
control = ctrl_grid
)
best_rmse <-
pollution_rfTUNE %>%
select_best(metric = "rmse")
best_rmse
## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 63 2 Preprocessor1_Model3
pol_rfFinal <- pollution_rfworkflow %>%
finalize_workflow(best_rmse) %>%
fit(data = pollution_training)
# OOB error (MSE)
pol_rfFinal$fit$fit$fit$prediction.error
## [1] 0.0387414
# OOB RMSE
sqrt(pol_rfFinal$fit$fit$fit$prediction.error)
## [1] 0.1968284
set.seed(456)
metric <- metric_set(rmse)
ctrl_res <- control_stack_resamples()
pollution_rfcv <- pollution_rfworkflow %>%
fit_resamples(pollution_cv,
metrics = metric,
control = ctrl_res)
collect_metrics(pollution_rfTUNE)
## # A tibble: 18 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 2 rmse standard 0.224 5 0.0333 Preprocessor1_Model1
## 2 1 2 rsq standard 0.688 5 0.0999 Preprocessor1_Model1
## 3 32 2 rmse standard 0.211 5 0.0311 Preprocessor1_Model2
## 4 32 2 rsq standard 0.692 5 0.0921 Preprocessor1_Model2
## 5 63 2 rmse standard 0.207 5 0.0336 Preprocessor1_Model3
## 6 63 2 rsq standard 0.693 5 0.0942 Preprocessor1_Model3
## 7 1 21 rmse standard 0.279 5 0.0228 Preprocessor1_Model4
## 8 1 21 rsq standard 0.545 5 0.0971 Preprocessor1_Model4
## 9 32 21 rmse standard 0.244 5 0.0251 Preprocessor1_Model5
## 10 32 21 rsq standard 0.605 5 0.109 Preprocessor1_Model5
## 11 63 21 rmse standard 0.245 5 0.0227 Preprocessor1_Model6
## 12 63 21 rsq standard 0.606 5 0.100 Preprocessor1_Model6
## 13 1 40 rmse standard 0.301 5 0.0219 Preprocessor1_Model7
## 14 1 40 rsq standard 0.491 5 0.102 Preprocessor1_Model7
## 15 32 40 rmse standard 0.264 5 0.0244 Preprocessor1_Model8
## 16 32 40 rsq standard 0.549 5 0.122 Preprocessor1_Model8
## 17 63 40 rmse standard 0.260 5 0.0241 Preprocessor1_Model9
## 18 63 40 rsq standard 0.569 5 0.120 Preprocessor1_Model9
Stacking
pollution_stack <-
stacks() %>%
add_candidates(boost_tune) %>%
add_candidates(pollution_lasso_tune) %>%
add_candidates(pollution_rfTUNE)
as_tibble(pollution_stack)
## # A tibble: 110 × 28
## log_breast boost_tune_1_01 boost_tune_1_03 boost_tune_1_04 boost_tune_1_05
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -3.36 0.5 0.500 0.500 0.496
## 2 -3.25 0.5 0.500 0.500 0.496
## 3 -3.25 0.5 0.500 0.500 0.496
## 4 -3.05 0.5 0.500 0.500 0.496
## 5 -2.76 0.5 0.500 0.500 0.496
## 6 -3.04 0.5 0.500 0.500 0.496
## 7 -3.97 0.5 0.500 0.500 0.496
## 8 -4.66 0.5 0.500 0.500 0.496
## 9 -3.03 0.5 0.500 0.500 0.496
## 10 -3.19 0.5 0.500 0.500 0.496
## # … with 100 more rows, and 23 more variables: boost_tune_1_06 <dbl>,
## # boost_tune_1_07 <dbl>, boost_tune_1_08 <dbl>, boost_tune_1_09 <dbl>,
## # boost_tune_1_10 <dbl>, pollution_lasso_tune_1_01 <dbl>,
## # pollution_lasso_tune_1_12 <dbl>, pollution_lasso_tune_1_13 <dbl>,
## # pollution_lasso_tune_1_14 <dbl>, pollution_lasso_tune_1_15 <dbl>,
## # pollution_lasso_tune_1_16 <dbl>, pollution_lasso_tune_1_17 <dbl>,
## # pollution_lasso_tune_1_18 <dbl>, pollution_lasso_tune_1_19 <dbl>, …
set.seed(456)
pollution_blend <-
pollution_stack %>%
blend_predictions()
pollution_blend
## # A tibble: 5 × 3
## member type weight
## <chr> <chr> <dbl>
## 1 pollution_rfTUNE_1_3 rand_forest 0.557
## 2 boost_tune_1_10 boost_tree 0.263
## 3 pollution_rfTUNE_1_1 rand_forest 0.204
## 4 pollution_lasso_tune_1_15 linear_reg 0.0776
## 5 pollution_rfTUNE_1_2 rand_forest 0.00987
pollution_blend$metrics %>%
filter(.metric == "rmse")
## # A tibble: 6 × 8
## penalty mixture .metric .estimator mean n std_err .config
## <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.000001 1 rmse standard 0.235 25 0.00831 Preprocessor1_Model1
## 2 0.00001 1 rmse standard 0.235 25 0.00831 Preprocessor1_Model2
## 3 0.0001 1 rmse standard 0.235 25 0.00831 Preprocessor1_Model3
## 4 0.001 1 rmse standard 0.234 25 0.00829 Preprocessor1_Model4
## 5 0.01 1 rmse standard 0.233 25 0.00813 Preprocessor1_Model5
## 6 0.1 1 rmse standard 0.249 25 0.00938 Preprocessor1_Model6
autoplot(pollution_blend)
autoplot(pollution_blend, type = "members")
autoplot(pollution_blend, type = "weights")
pollution_final_stack <- pollution_blend %>%
fit_members()
pollution_final_stack
## # A tibble: 5 × 3
## member type weight
## <chr> <chr> <dbl>
## 1 pollution_rfTUNE_1_3 rand_forest 0.557
## 2 boost_tune_1_10 boost_tree 0.263
## 3 pollution_rfTUNE_1_1 rand_forest 0.204
## 4 pollution_lasso_tune_1_15 linear_reg 0.0776
## 5 pollution_rfTUNE_1_2 rand_forest 0.00987
pollution_final_stack %>%
predict(new_data = pollution_testing) %>%
bind_cols(pollution_testing) %>%
ggplot(aes(x = log_breast,
y = .pred)) +
geom_point(alpha = .5,
size = .5) +
geom_smooth(se = FALSE) +
geom_abline(slope = 1,
intercept = 0,
color = "darkred") +
labs(x = "Actual log(breast cancer)",
y = "Predicted log(breast_cancer)")
library(Metrics)
data_rmse <- pollution_final_stack %>%
predict(new_data = pollution_testing) %>%
bind_cols(pollution_testing)
rmse(data_rmse$log_alzheimers, data_rmse$.pred)
## [1] NaN