library(tidyverse)         # for graphing and data cleaning
library(tidymodels)        # for modeling
library(stacks)            # for stacking models
library(naniar)            # for analyzing missing values
library(vip)               # for variable importance plots
library(usemodels)         # for tidymodels suggestions
library(xgboost)           # for boosting - need to install, don't need to load
library(doParallel)        # for parallel processing
library(lubridate)         # for dates
library(moderndive)        # for King County housing data
library(patchwork)         # for combining plots nicely
library(rmarkdown)         # for paged tables
theme_set(theme_minimal()) # Lisa's favorite theme

Lasso Model

pollution <- read.csv("data_standardized.csv")

pollution %>% 
  select(where(is.numeric)) %>% 
  pivot_longer(cols = everything(),
               names_to = "variable", 
               values_to = "value") %>% 
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(vars(variable), 
             scales = "free",
             nrow = 5)

pollution %>% 
  select(Breast_Surgery_Ct) %>% 
  ggplot(aes(x = Breast_Surgery_Ct)) +
  geom_histogram()

for(i in 1:ncol(pollution)) {
  median <- median(pollution[,i],na.rm = TRUE)
  for (j in 1:nrow(pollution)){
    if (is.na(pollution[j, i])){
      pollution[j, i] <- median
    }
  }
}

pollution_mod <- pollution %>%
  mutate(log_breast = log(Breast_Surgery_Ct, base = 10)) %>% 
  filter(log_breast != "-Inf") %>%
  select(-Breast_Surgery_Ct) %>% 
  mutate(across(where(is.character), as.factor)) %>% 
  add_n_miss() %>% 
  filter(n_miss_all == 0) %>% 
  select(-n_miss_all)

set.seed(456)
pollution_split <- initial_split(pollution_mod, prop = .75)
pollution_training <- training(pollution_split)
pollution_testing <- testing(pollution_split)

pollution_recipe <- recipe(log_breast ~ ., data = pollution_training) %>% 
  step_rm(County,
          asthma_er_avg,
          Year,
          All.causes..total.,
          Alzheimer.s.disease,
          Malignant.neoplasms,
          Chronic.lower.respiratory.diseases,
          Diabetes.mellitus,
          Assault..homicide.,
          Diseases.of.heart,
          Essential.hypertension.and.hypertensive.renal.disease,
          Accidents..unintentional.injuries.,
          Chronic.liver.disease.and.cirrhosis,
          Nephritis..nephrotic.syndrome.and.nephrosis,
          Parkinson.s.disease,
          Influenza.and.pneumonia,
          Cerebrovascular.diseases,
          Intentional.self.harm..suicide.,
          FIPS,
          cancer_incidence_rate,
          asthma_deaths,
          Bladder_Surgery_Ct,
          Brain_Surgery_Ct,
         # Breast_Surgery_Ct,
          Colon_Surgery_Ct,
          Esophagus_Surgery_Ct,
          Liver_Surgery_Ct,
          Lung_Surgery_Ct,
          Pancreas_Surgery_Ct,
          Prostate_Surgery_Ct,
          Rectum_Surgery_Ct,
          Stomach_Surgery_Ct) %>% 
  step_normalize(all_predictors(), 
                 -all_nominal()) %>% 
  step_dummy(all_nominal(), 
             -all_outcomes())

pollution_recipe %>% 
  prep(pollution_training) %>%
  juice()

## # A tibble: 110 × 33
##    annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
##               <dbl>             <dbl>          <dbl>            <dbl>
##  1           0.791             1.99           -0.419            1.24 
##  2          -0.757            -0.0177         -0.170           -0.416
##  3           0.0652           -0.0849         -0.170            0.159
##  4           1.03             -0.930          -0.170           -0.818
##  5          -0.816             0.169          -0.921           -0.952
##  6          -0.592            -0.0389         -0.818           -0.432
##  7           0.882            -0.0221         -0.170           -0.134
##  8           1.71              0.718          -0.170           -1.08 
##  9          -1.03             -1.37           -0.170           -1.38 
## 10           0.728            -0.710          -0.170           -0.715
## # … with 100 more rows, and 29 more variables: annual_mean_co <dbl>,
## #   annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## #   economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## #   neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## #   insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## #   commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## #   employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …

pollution_lasso_mod <- 
  linear_reg(mixture = 1) %>% 
  set_engine("glmnet") %>% 
  set_args(penalty = tune()) %>% 
  set_mode("regression")

pollution_lasso_wf <- 
  workflow() %>% 
  add_recipe(pollution_recipe) %>% 
  add_model(pollution_lasso_mod)

penalty_grid <- grid_regular(penalty(),
                             levels = 20)
penalty_grid

## # A tibble: 20 × 1
##     penalty
##       <dbl>
##  1 1   e-10
##  2 3.36e-10
##  3 1.13e- 9
##  4 3.79e- 9
##  5 1.27e- 8
##  6 4.28e- 8
##  7 1.44e- 7
##  8 4.83e- 7
##  9 1.62e- 6
## 10 5.46e- 6
## 11 1.83e- 5
## 12 6.16e- 5
## 13 2.07e- 4
## 14 6.95e- 4
## 15 2.34e- 3
## 16 7.85e- 3
## 17 2.64e- 2
## 18 8.86e- 2
## 19 2.98e- 1
## 20 1   e+ 0

ctrl_grid <- control_stack_grid()

set.seed(456)
pollution_cv <- vfold_cv(pollution_training, v = 5)

pollution_lasso_tune <- 
  pollution_lasso_wf %>% 
  tune_grid(
    resamples = pollution_cv,
    grid = penalty_grid,
    control = ctrl_grid
    )

pollution_lasso_tune

## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 × 5
##   splits          id    .metrics          .notes           .predictions      
##   <list>          <chr> <list>            <list>           <list>            
## 1 <split [88/22]> Fold1 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>
## 2 <split [88/22]> Fold2 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>
## 3 <split [88/22]> Fold3 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>
## 4 <split [88/22]> Fold4 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>
## 5 <split [88/22]> Fold5 <tibble [40 × 5]> <tibble [1 × 1]> <tibble [440 × 5]>

pollution_lasso_tune %>% 
  select(id, .metrics) %>% 
  unnest(.metrics) %>% 
  filter(.metric == "rmse")

## # A tibble: 100 × 6
##    id     penalty .metric .estimator .estimate .config              
##    <chr>    <dbl> <chr>   <chr>          <dbl> <chr>                
##  1 Fold1 1   e-10 rmse    standard       0.313 Preprocessor1_Model01
##  2 Fold1 3.36e-10 rmse    standard       0.313 Preprocessor1_Model02
##  3 Fold1 1.13e- 9 rmse    standard       0.313 Preprocessor1_Model03
##  4 Fold1 3.79e- 9 rmse    standard       0.313 Preprocessor1_Model04
##  5 Fold1 1.27e- 8 rmse    standard       0.313 Preprocessor1_Model05
##  6 Fold1 4.28e- 8 rmse    standard       0.313 Preprocessor1_Model06
##  7 Fold1 1.44e- 7 rmse    standard       0.313 Preprocessor1_Model07
##  8 Fold1 4.83e- 7 rmse    standard       0.313 Preprocessor1_Model08
##  9 Fold1 1.62e- 6 rmse    standard       0.313 Preprocessor1_Model09
## 10 Fold1 5.46e- 6 rmse    standard       0.313 Preprocessor1_Model10
## # … with 90 more rows

pollution_lasso_tune %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  ggplot(aes(x = penalty, y = mean)) +
  geom_point() +
  geom_line() +
  scale_x_log10(
   breaks = scales::trans_breaks("log10", function(x) 10^x),
   labels = scales::trans_format("log10",scales::math_format(10^.x))) +
  labs(x = "penalty", y = "rmse")

pollution_lasso_tune %>% 
  show_best(metric = "rmse")

## # A tibble: 5 × 7
##     penalty .metric .estimator  mean     n std_err .config              
##       <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1 0.00234   rmse    standard   0.296     5  0.0179 Preprocessor1_Model15
## 2 0.00785   rmse    standard   0.297     5  0.0163 Preprocessor1_Model16
## 3 0.000695  rmse    standard   0.300     5  0.0177 Preprocessor1_Model14
## 4 0.000207  rmse    standard   0.302     5  0.0179 Preprocessor1_Model13
## 5 0.0000616 rmse    standard   0.302     5  0.0177 Preprocessor1_Model12

best_param <- pollution_lasso_tune %>% 
  select_best(metric = "rmse")
best_param

## # A tibble: 1 × 2
##   penalty .config              
##     <dbl> <chr>                
## 1 0.00234 Preprocessor1_Model15

one_se_param <- pollution_lasso_tune %>% 
  select_by_one_std_err(metric = "rmse", desc(penalty))
one_se_param

## # A tibble: 1 × 9
##   penalty .metric .estimator  mean     n std_err .config            .best .bound
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>              <dbl>  <dbl>
## 1 0.00785 rmse    standard   0.297     5  0.0163 Preprocessor1_Mod… 0.296  0.314

pollution_lasso_final_wf <- pollution_lasso_wf %>% 
  finalize_workflow(one_se_param)
pollution_lasso_final_wf

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_rm()
## • step_normalize()
## • step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Main Arguments:
##   penalty = 0.00784759970351461
##   mixture = 1
## 
## Computational engine: glmnet

pollution_lasso_final_mod <- pollution_lasso_final_wf %>% 
  fit(data = pollution_training)

pollution_lasso_final_mod %>% 
  pull_workflow_fit() %>% 
  tidy()

## # A tibble: 33 × 3
##    term              estimate penalty
##    <chr>                <dbl>   <dbl>
##  1 (Intercept)       -3.33    0.00785
##  2 annual_mean_pm25   0.0139  0.00785
##  3 annual_mean_ozone -0.0164  0.00785
##  4 annual_mean_pb    -0.00664 0.00785
##  5 annual_mean_pm10   0       0.00785
##  6 annual_mean_co    -0.0354  0.00785
##  7 annual_mean_no2    0.0812  0.00785
##  8 annual_mean_so2    0.0203  0.00785
##  9 hpi2score          0       0.00785
## 10 economic           0       0.00785
## # … with 23 more rows

pollution_lasso_test <- pollution_lasso_final_wf %>% 
  last_fit(pollution_split)

# Metrics for model applied to test data
pollution_lasso_test %>% 
  collect_metrics()

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.292 Preprocessor1_Model1
## 2 rsq     standard       0.576 Preprocessor1_Model1

Xgboost model

use_xgboost(log_breast ~ ., data = pollution_training)

## xgboost_recipe <- 
##   recipe(formula = log_breast ~ ., data = pollution_training) %>% 
##   step_novel(all_nominal(), -all_outcomes()) %>% 
##   step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("regression") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(48676)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))

boost_recipe <- 
  recipe(formula = log_breast ~ ., data = pollution_training) %>% 
  step_rm(County,
          Year,
          All.causes..total.,
          Alzheimer.s.disease,
          Malignant.neoplasms,
          Chronic.lower.respiratory.diseases,
          Diabetes.mellitus,
          Diseases.of.heart,
          Assault..homicide.,
          Essential.hypertension.and.hypertensive.renal.disease,
          Accidents..unintentional.injuries.,
          Chronic.liver.disease.and.cirrhosis,
          Nephritis..nephrotic.syndrome.and.nephrosis,
          Parkinson.s.disease,
          Influenza.and.pneumonia,
          Cerebrovascular.diseases,
          Intentional.self.harm..suicide.,
          FIPS,
          cancer_incidence_rate,
          asthma_deaths,
          Bladder_Surgery_Ct,
          Brain_Surgery_Ct,
          #Breast_Surgery_Ct,
          Colon_Surgery_Ct,
          Esophagus_Surgery_Ct,
          Liver_Surgery_Ct,
          Lung_Surgery_Ct,
          Pancreas_Surgery_Ct,
          Prostate_Surgery_Ct,
          Rectum_Surgery_Ct,
          asthma_er_avg,
          Stomach_Surgery_Ct) %>% 
  step_novel(all_nominal_predictors()) %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE)  %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_predictors(), 
                 -all_nominal())

boost_recipe %>% 
  prep() %>% 
  juice()

## # A tibble: 110 × 33
##    annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
##               <dbl>             <dbl>          <dbl>            <dbl>
##  1           0.791             1.99           -0.419            1.24 
##  2          -0.757            -0.0177         -0.170           -0.416
##  3           0.0652           -0.0849         -0.170            0.159
##  4           1.03             -0.930          -0.170           -0.818
##  5          -0.816             0.169          -0.921           -0.952
##  6          -0.592            -0.0389         -0.818           -0.432
##  7           0.882            -0.0221         -0.170           -0.134
##  8           1.71              0.718          -0.170           -1.08 
##  9          -1.03             -1.37           -0.170           -1.38 
## 10           0.728            -0.710          -0.170           -0.715
## # … with 100 more rows, and 29 more variables: annual_mean_co <dbl>,
## #   annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## #   economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## #   neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## #   insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## #   commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## #   employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …

boost_spec <- boost_tree(
  trees = 1000,            # number of trees, T in the equations above
  tree_depth = 2,          # max number of splits in the tree
  min_n = 5,               # min points required for node to be further split
  loss_reduction = 10^-5,  # when to stop - smaller = more since it only has to get a little bit better 
  sample_size = 1,         # proportion of training data to use
  learn_rate = tune(),     # lambda from the equations above
  stop_iter = 50           # number of iterations w/o improvement b4 stopping
) %>% 
  set_engine("xgboost", colsample_bytree = 1) %>% 
  set_mode("regression")

boost_grid <- grid_regular(learn_rate(),
                           levels = 10)
boost_grid

## # A tibble: 10 × 1
##      learn_rate
##           <dbl>
##  1 0.0000000001
##  2 0.000000001 
##  3 0.00000001  
##  4 0.0000001   
##  5 0.000001    
##  6 0.00001     
##  7 0.0001      
##  8 0.001       
##  9 0.01        
## 10 0.1

boost_wf <- workflow() %>% 
  add_recipe(boost_recipe) %>%
  add_model(boost_spec)

set.seed(456)
val_split <- validation_split(pollution_training, 
                              prop = .6)
val_split

## # Validation Set Split (0.6/0.4)  
## # A tibble: 1 × 2
##   splits          id        
##   <list>          <chr>     
## 1 <split [66/44]> validation

set.seed(456)
registerDoParallel()

boost_tune <- boost_wf %>% 
  tune_grid(
  # resamples = val_split,
  resamples = pollution_cv,
  grid = boost_grid,
  control = ctrl_grid
)

collect_metrics(boost_tune)

## # A tibble: 20 × 7
##      learn_rate .metric .estimator    mean     n std_err .config              
##           <dbl> <chr>   <chr>        <dbl> <int>   <dbl> <chr>                
##  1 0.0000000001 rmse    standard     3.85      5  0.0201 Preprocessor1_Model01
##  2 0.0000000001 rsq     standard   NaN         0 NA      Preprocessor1_Model01
##  3 0.000000001  rmse    standard     3.85      5  0.0201 Preprocessor1_Model02
##  4 0.000000001  rsq     standard   NaN         0 NA      Preprocessor1_Model02
##  5 0.00000001   rmse    standard     3.85      5  0.0201 Preprocessor1_Model03
##  6 0.00000001   rsq     standard   NaN         0 NA      Preprocessor1_Model03
##  7 0.0000001    rmse    standard     3.85      5  0.0201 Preprocessor1_Model04
##  8 0.0000001    rsq     standard   NaN         0 NA      Preprocessor1_Model04
##  9 0.000001     rmse    standard     3.84      5  0.0201 Preprocessor1_Model05
## 10 0.000001     rsq     standard   NaN         0 NA      Preprocessor1_Model05
## 11 0.00001      rmse    standard     3.81      5  0.0201 Preprocessor1_Model06
## 12 0.00001      rsq     standard   NaN         0 NA      Preprocessor1_Model06
## 13 0.0001       rmse    standard     3.49      5  0.0206 Preprocessor1_Model07
## 14 0.0001       rsq     standard   NaN         0 NA      Preprocessor1_Model07
## 15 0.001        rmse    standard     1.47      5  0.0246 Preprocessor1_Model08
## 16 0.001        rsq     standard     0.188     5  0.0416 Preprocessor1_Model08
## 17 0.01         rmse    standard     0.230     5  0.0295 Preprocessor1_Model09
## 18 0.01         rsq     standard     0.664     5  0.0816 Preprocessor1_Model09
## 19 0.1          rmse    standard     0.213     5  0.0350 Preprocessor1_Model10
## 20 0.1          rsq     standard     0.704     5  0.104  Preprocessor1_Model10

collect_metrics(boost_tune) %>% 
  filter(.metric == "rmse") %>% 
  ggplot(aes(x = learn_rate, y = mean)) +
  geom_point() +
  geom_line() +
  scale_x_log10() +
  labs(y = "rmse") +
  theme_minimal()

best_lr <- select_best(boost_tune, "rmse")
best_lr

## # A tibble: 1 × 2
##   learn_rate .config              
##        <dbl> <chr>                
## 1        0.1 Preprocessor1_Model10

# finalize workflow
final_boost_wf <- finalize_workflow(
  boost_wf,
  best_lr
)

# fit final
final_boost <- final_boost_wf %>% 
  fit(data = pollution_training)

final_boost %>% 
  pull_workflow_fit() %>%
  vip(geom = "col")

# Use model on test data
test_preds <- pollution_testing %>% 
  bind_cols(predict(final_boost, new_data = pollution_testing)) 

# Compute test rmse
test_preds %>% 
  summarize(rmse = sqrt(mean((log_breast - .pred)^2))) %>% 
  pull(rmse)

## [1] 0.1397386

# Graph results
test_preds %>% 
  ggplot(aes(x = log_breast,
             y = .pred)) +
  geom_point(alpha = .5, 
             size = .5) +
  geom_smooth(se = FALSE) +
  geom_abline(slope = 1, 
              intercept = 0, 
              color = "darkred") +
  labs(x = "Actual log(asthma)", 
       y = "Predicted log(asthma)")

Random Forest Model

pollution_rfrecipe <- recipe(log_breast ~ ., data = pollution_training) %>% 
  step_rm(County,
          Year,
          All.causes..total.,
          Alzheimer.s.disease,
          Malignant.neoplasms,
          Chronic.lower.respiratory.diseases,
          Diabetes.mellitus,
          Assault..homicide.,
          Diseases.of.heart,
          asthma_er_avg,
          Essential.hypertension.and.hypertensive.renal.disease,
          Accidents..unintentional.injuries.,
          Chronic.liver.disease.and.cirrhosis,
          Nephritis..nephrotic.syndrome.and.nephrosis,
          Parkinson.s.disease,
          Influenza.and.pneumonia,
          Cerebrovascular.diseases,
          Intentional.self.harm..suicide.,
          FIPS,
          cancer_incidence_rate,
          asthma_deaths,
          Bladder_Surgery_Ct,
          Brain_Surgery_Ct,
         # Breast_Surgery_Ct,
          Colon_Surgery_Ct,
          Esophagus_Surgery_Ct,
          Liver_Surgery_Ct,
          Lung_Surgery_Ct,
          Pancreas_Surgery_Ct,
          Prostate_Surgery_Ct,
          Rectum_Surgery_Ct,
          Stomach_Surgery_Ct) %>% 
  step_normalize(all_predictors(), 
                 -all_nominal()) %>% 
  step_dummy(all_nominal(), 
             -all_outcomes())

pollution_rfrecipe %>% 
  prep(pollution_training) %>%
  juice()

## # A tibble: 110 × 33
##    annual_mean_pm25 annual_mean_ozone annual_mean_pb annual_mean_pm10
##               <dbl>             <dbl>          <dbl>            <dbl>
##  1           0.791             1.99           -0.419            1.24 
##  2          -0.757            -0.0177         -0.170           -0.416
##  3           0.0652           -0.0849         -0.170            0.159
##  4           1.03             -0.930          -0.170           -0.818
##  5          -0.816             0.169          -0.921           -0.952
##  6          -0.592            -0.0389         -0.818           -0.432
##  7           0.882            -0.0221         -0.170           -0.134
##  8           1.71              0.718          -0.170           -1.08 
##  9          -1.03             -1.37           -0.170           -1.38 
## 10           0.728            -0.710          -0.170           -0.715
## # … with 100 more rows, and 29 more variables: annual_mean_co <dbl>,
## #   annual_mean_no2 <dbl>, annual_mean_so2 <dbl>, hpi2score <dbl>,
## #   economic <dbl>, education <dbl>, housing <dbl>, healthcareaccess <dbl>,
## #   neighborhood <dbl>, pollution <dbl>, transportation <dbl>, social <dbl>,
## #   insured <dbl>, uncrowded <dbl>, homeownership <dbl>, automobile <dbl>,
## #   commute <dbl>, inpreschool <dbl>, inhighschool <dbl>, bachelorsed <dbl>,
## #   employed <dbl>, abovepoverty <dbl>, income <dbl>, tree_canopy <dbl>, …

ranger_pollution <- 
  rand_forest(mtry = tune(), 
              min_n = tune(), 
              trees = 200) %>% 
  set_mode("regression") %>% 
  set_engine("ranger")

pollution_rfworkflow <- 
  workflow() %>% 
  add_recipe(pollution_rfrecipe) %>% 
  add_model(ranger_pollution)

# Set up penalty grid model 
rf_penalty_grid <- grid_regular(finalize(mtry(),
                                         pollution_training %>%
                                           select(-log_breast)),
                                min_n(),
                                levels = 3)

set.seed(456) 

pollution_cv <- vfold_cv(pollution_training, v = 5) 

# Tune model
pollution_rfTUNE <- 
  pollution_rfworkflow %>% 
  tune_grid(
    resamples = pollution_cv,
    grid = rf_penalty_grid,
    control = ctrl_grid
  )

best_rmse <- 
  pollution_rfTUNE %>%
  select_best(metric = "rmse")

best_rmse

## # A tibble: 1 × 3
##    mtry min_n .config             
##   <int> <int> <chr>               
## 1    63     2 Preprocessor1_Model3

pol_rfFinal <- pollution_rfworkflow %>%
  finalize_workflow(best_rmse) %>%
  fit(data = pollution_training)

# OOB error (MSE)
pol_rfFinal$fit$fit$fit$prediction.error

## [1] 0.0387414

# OOB RMSE
sqrt(pol_rfFinal$fit$fit$fit$prediction.error)

## [1] 0.1968284

set.seed(456)

metric <- metric_set(rmse) 
ctrl_res <- control_stack_resamples() 

pollution_rfcv <- pollution_rfworkflow %>%
  fit_resamples(pollution_cv, 
               metrics = metric, 
               control = ctrl_res) 

collect_metrics(pollution_rfTUNE)

## # A tibble: 18 × 8
##     mtry min_n .metric .estimator  mean     n std_err .config             
##    <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
##  1     1     2 rmse    standard   0.224     5  0.0333 Preprocessor1_Model1
##  2     1     2 rsq     standard   0.688     5  0.0999 Preprocessor1_Model1
##  3    32     2 rmse    standard   0.211     5  0.0311 Preprocessor1_Model2
##  4    32     2 rsq     standard   0.692     5  0.0921 Preprocessor1_Model2
##  5    63     2 rmse    standard   0.207     5  0.0336 Preprocessor1_Model3
##  6    63     2 rsq     standard   0.693     5  0.0942 Preprocessor1_Model3
##  7     1    21 rmse    standard   0.279     5  0.0228 Preprocessor1_Model4
##  8     1    21 rsq     standard   0.545     5  0.0971 Preprocessor1_Model4
##  9    32    21 rmse    standard   0.244     5  0.0251 Preprocessor1_Model5
## 10    32    21 rsq     standard   0.605     5  0.109  Preprocessor1_Model5
## 11    63    21 rmse    standard   0.245     5  0.0227 Preprocessor1_Model6
## 12    63    21 rsq     standard   0.606     5  0.100  Preprocessor1_Model6
## 13     1    40 rmse    standard   0.301     5  0.0219 Preprocessor1_Model7
## 14     1    40 rsq     standard   0.491     5  0.102  Preprocessor1_Model7
## 15    32    40 rmse    standard   0.264     5  0.0244 Preprocessor1_Model8
## 16    32    40 rsq     standard   0.549     5  0.122  Preprocessor1_Model8
## 17    63    40 rmse    standard   0.260     5  0.0241 Preprocessor1_Model9
## 18    63    40 rsq     standard   0.569     5  0.120  Preprocessor1_Model9

Stacking

pollution_stack <- 
  stacks() %>% 
  add_candidates(boost_tune) %>%
  add_candidates(pollution_lasso_tune) %>% 
  add_candidates(pollution_rfTUNE)

as_tibble(pollution_stack)

## # A tibble: 110 × 28
##    log_breast boost_tune_1_01 boost_tune_1_03 boost_tune_1_04 boost_tune_1_05
##         <dbl>           <dbl>           <dbl>           <dbl>           <dbl>
##  1      -3.36             0.5           0.500           0.500           0.496
##  2      -3.25             0.5           0.500           0.500           0.496
##  3      -3.25             0.5           0.500           0.500           0.496
##  4      -3.05             0.5           0.500           0.500           0.496
##  5      -2.76             0.5           0.500           0.500           0.496
##  6      -3.04             0.5           0.500           0.500           0.496
##  7      -3.97             0.5           0.500           0.500           0.496
##  8      -4.66             0.5           0.500           0.500           0.496
##  9      -3.03             0.5           0.500           0.500           0.496
## 10      -3.19             0.5           0.500           0.500           0.496
## # … with 100 more rows, and 23 more variables: boost_tune_1_06 <dbl>,
## #   boost_tune_1_07 <dbl>, boost_tune_1_08 <dbl>, boost_tune_1_09 <dbl>,
## #   boost_tune_1_10 <dbl>, pollution_lasso_tune_1_01 <dbl>,
## #   pollution_lasso_tune_1_12 <dbl>, pollution_lasso_tune_1_13 <dbl>,
## #   pollution_lasso_tune_1_14 <dbl>, pollution_lasso_tune_1_15 <dbl>,
## #   pollution_lasso_tune_1_16 <dbl>, pollution_lasso_tune_1_17 <dbl>,
## #   pollution_lasso_tune_1_18 <dbl>, pollution_lasso_tune_1_19 <dbl>, …

set.seed(456)

pollution_blend <- 
  pollution_stack %>% 
  blend_predictions()

pollution_blend

## # A tibble: 5 × 3
##   member                    type         weight
##   <chr>                     <chr>         <dbl>
## 1 pollution_rfTUNE_1_3      rand_forest 0.557  
## 2 boost_tune_1_10           boost_tree  0.263  
## 3 pollution_rfTUNE_1_1      rand_forest 0.204  
## 4 pollution_lasso_tune_1_15 linear_reg  0.0776 
## 5 pollution_rfTUNE_1_2      rand_forest 0.00987

pollution_blend$metrics %>% 
  filter(.metric == "rmse")

## # A tibble: 6 × 8
##    penalty mixture .metric .estimator  mean     n std_err .config             
##      <dbl>   <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 0.000001       1 rmse    standard   0.235    25 0.00831 Preprocessor1_Model1
## 2 0.00001        1 rmse    standard   0.235    25 0.00831 Preprocessor1_Model2
## 3 0.0001         1 rmse    standard   0.235    25 0.00831 Preprocessor1_Model3
## 4 0.001          1 rmse    standard   0.234    25 0.00829 Preprocessor1_Model4
## 5 0.01           1 rmse    standard   0.233    25 0.00813 Preprocessor1_Model5
## 6 0.1            1 rmse    standard   0.249    25 0.00938 Preprocessor1_Model6

autoplot(pollution_blend)

autoplot(pollution_blend, type = "members")

autoplot(pollution_blend, type = "weights")

pollution_final_stack <- pollution_blend %>% 
  fit_members()

pollution_final_stack

## # A tibble: 5 × 3
##   member                    type         weight
##   <chr>                     <chr>         <dbl>
## 1 pollution_rfTUNE_1_3      rand_forest 0.557  
## 2 boost_tune_1_10           boost_tree  0.263  
## 3 pollution_rfTUNE_1_1      rand_forest 0.204  
## 4 pollution_lasso_tune_1_15 linear_reg  0.0776 
## 5 pollution_rfTUNE_1_2      rand_forest 0.00987

pollution_final_stack %>% 
  predict(new_data = pollution_testing) %>% 
  bind_cols(pollution_testing) %>% 
  ggplot(aes(x = log_breast, 
             y = .pred)) +
  geom_point(alpha = .5, 
             size = .5) +
  geom_smooth(se = FALSE) +
  geom_abline(slope = 1, 
              intercept = 0, 
              color = "darkred") +
  labs(x = "Actual log(breast cancer)", 
       y = "Predicted log(breast_cancer)")

library(Metrics)

data_rmse <- pollution_final_stack %>% 
  predict(new_data = pollution_testing) %>% 
  bind_cols(pollution_testing) 

rmse(data_rmse$log_alzheimers, data_rmse$.pred)

## [1] NaN

Breat Cancer Surgeries

Mia Rothberg

12/6/2021

Lasso Model

Xgboost model

Random Forest Model

Stacking