From 1db5260d5fbdc49b53366da3d4803d0b1638edfa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=98topepo=E2=80=99?=
Date: Fri, 14 Nov 2025 09:47:13 -0500
Subject: [PATCH 01/23] previous content from parsnip vignette

---
 learn/models/parsnip-predictions/index.qmd | 4757 ++++++++++++++++++++
 1 file changed, 4757 insertions(+)
 create mode 100644 learn/models/parsnip-predictions/index.qmd

diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd
new file mode 100644
index 00000000..2511285c
--- /dev/null
+++ b/learn/models/parsnip-predictions/index.qmd
@@ -0,0 +1,4757 @@
---
title: "Fitting and predicting with parsnip"
categories:
  - model fitting
  - parsnip
  - regression
  - classification
type: learn-subsection
weight: 1
description: |
  Examples that show how to fit and predict with different combinations of model, mode, and engine.
toc: true
toc-depth: 2
include-after-body: ../../../resources.html
---

```{r}
#| label: "setup"
#| include: false
#| message: false
#| warning: false
source(here::here("common.R"))
```

```{r}
#| label: "load"
#| include: false
library(tidymodels)

# Add everything here?

#' skip format
pkgs <- c("tidymodels", "agua", "baguette", "bonsai", "censored", "discrim",
          "multilevelmod", "plsmod", "poissonreg", "rules")
```


## Introduction

`r article_req_pkgs(pkgs)`

These examples show how to *fit* and *predict* with different combinations of model, mode, and engine. As a reminder, in parsnip,

- the **model type** differentiates basic modeling approaches, such as random forests, logistic regression, linear support vector machines, etc.,

- the **mode** denotes in what kind of modeling context it will be used (most commonly, classification or regression), and

- the computational **engine** indicates how the model is fit, such as with a specific R package implementation or even methods outside of R like Keras or Stan.

The following examples use consistent data sets throughout.

todo

- multilevel examples
- get automl working
- expand survival prediction tibbles
- keras3 updates
- use `
` for long model prints
- avoid subsection titles capitalizing the engine name (e.g., "CATBOOST") and text within backticks

```{r}
#| label: load-tm
library(tidymodels)
theme_set(theme_bw() + theme(legend.position = "top"))
```

# Classification Models

To demonstrate classification, let's make small training and test sets for a binary outcome. We'll center and scale the predictors since some models require them to be in the same units.

```{r}
#| label: bin-data
set.seed(207)
bin_split <-
  modeldata::two_class_dat |>
  rename(class = Class) |>
  initial_split(prop = 0.994, strata = class)
bin_split

bin_rec <-
  recipe(class ~ ., data = training(bin_split)) |>
  step_normalize(all_numeric_predictors()) |>
  prep()

bin_train <- bake(bin_rec, new_data = NULL)
bin_test <- bake(bin_rec, new_data = testing(bin_split))
```

For models that _only_ work with three or more classes, we'll simulate a data set:

```{r}
#| label: mtl-data
set.seed(1752)
mtl_data <-
  sim_multinomial(
    200,
    ~ -0.5 + 0.6 * abs(A),
    ~ ifelse(A > 0 & B > 0, 1.0 + 0.2 * A / B, - 2),
    ~ A + B - A * B)

mtl_split <- initial_split(mtl_data, prop = 0.967, strata = class)
mtl_split

# Predictors are in the same units
mtl_train <- training(mtl_split)
mtl_test <- testing(mtl_split)
```


## AutoML (`auto_ml()`)

## `h2o` Engine

This engine requires the agua extension package, so let's load this first:

```{r}
#| label: load-h2o-auto-ml-classification-agua
#| output: false
library(agua)

# and initialize a server
h2o_server <- agua::h2o_start()
```

We create a model specification via:

```{r}
#| label: spec-h2o-auto-ml-classification
#| eval: false
auto_ml_spec <- auto_ml() |>
  # We don't need to set the engine (since there is only one) but we'll set
  # a time limit
  set_engine("h2o", max_runtime_secs = 60 * 3) |>
  set_mode("classification")
```

Now we create the model fit object:

```{r}
#| label: fit-h2o-auto-ml-classification
#| eval: false
auto_ml_fit <- auto_ml_spec |> fit(class ~ ., data = bin_train)
auto_ml_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-h2o-auto-ml-classification
#| eval: false
predict(auto_ml_fit, type = "class", new_data = bin_test)
predict(auto_ml_fit, type = "prob", new_data = bin_test)
```

## Bagged MARS (`bag_mars()`)

## `earth` Engine

This engine requires the baguette extension package, so let's load this first:

```{r}
#| label: load-earth-bag-mars-classification-baguette
#| output: false
library(baguette)
```

We create a model specification via:

```{r}
#| label: spec-earth-bag-mars-classification
bag_mars_spec <- bag_mars() |>
  # We need to set the mode since this engine works with multiple modes
  # and earth is the default engine so there is no need to set that either.
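  # (Added note: bagging fits an ensemble of MARS models, each on a different
  # bootstrap sample of the training set, and averages their predictions.)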
+ set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-earth-bag-mars-classification +bag_mars_fit <- bag_mars_spec |> fit(class ~ ., data = bin_train) +bag_mars_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-earth-bag-mars-classification +predict(bag_mars_fit, type = "class", new_data = bin_test) +predict(bag_mars_fit, type = "prob", new_data = bin_test) +``` + +## Bagged Neural Networks (`bag_mlp()`) + +## `nnet` Engine + +This engine requires the baguette extension package, so let's load this first: + +```{r} +#| label: load-nnet-bag-mlp-classification-baguette +#| output: false +library(baguette) +``` + +We create a model specification via: + +```{r} +#| label: spec-nnet-bag-mlp-classification +bag_mlp_spec <- bag_mlp() |> + # We need to set the mode since this engine works with multiple modes + # and nnet is the default engine so there is no need to set that either. + set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-nnet-bag-mlp-classification +bag_mlp_fit <- bag_mlp_spec |> fit(class ~ ., data = bin_train) +bag_mlp_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-nnet-bag-mlp-classification +predict(bag_mlp_fit, type = "class", new_data = bin_test) +predict(bag_mlp_fit, type = "prob", new_data = bin_test) +``` + +## Bagged Decision Trees (`bag_tree()`) + +## `C5.0` Engine + +This engine requires the baguette extension package, so let's load this first: + +```{r} +#| label: load-C5.0-bag-tree-classification-baguette +#| output: false +library(baguette) +``` + +We create a model specification via: + +```{r} +#| label: spec-C5.0-bag-tree-classification +bag_tree_spec <- bag_tree() |> + set_mode("classification") |> + set_engine("C5.0") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-C5.0-bag-tree-classification +bag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train) +bag_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-C5.0-bag-tree-classification +predict(bag_tree_fit, type = "class", new_data = bin_test) +predict(bag_tree_fit, type = "prob", new_data = bin_test) +``` + +## `rpart` Engine + +This engine requires the baguette extension package, so let's load this first: + +```{r} +#| label: load-rpart-bag-tree-classification-baguette +#| output: false +library(baguette) +``` + +We create a model specification via: + +```{r} +#| label: spec-rpart-bag-tree-classification +bag_tree_spec <- bag_tree() |> + # We need to set the mode since this engine works with multiple modes + # and rpart is the default engine so there is no need to set that either. + set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-rpart-bag-tree-classification +bag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train) +bag_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-rpart-bag-tree-classification +predict(bag_tree_fit, type = "class", new_data = bin_test) +predict(bag_tree_fit, type = "prob", new_data = bin_test) +``` + +## Bayesian Additive Regression Trees (`bart()`) + +## `dbarts` Engine + +We create a model specification via: + +```{r} +#| label: spec-dbarts-bart-classification +bart_spec <- bart() |> + # We need to set the mode since this engine works with multiple modes + # and dbarts is the default engine so there is no need to set that either. 
+ set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-dbarts-bart-classification +bart_fit <- bart_spec |> fit(class ~ ., data = bin_train) +bart_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-dbarts-bart-classification +predict(bart_fit, type = "class", new_data = bin_test) +predict(bart_fit, type = "prob", new_data = bin_test) +predict(bart_fit, type = "conf_int", new_data = bin_test) +predict(bart_fit, type = "pred_int", new_data = bin_test) +``` + +## Boosted Decision Trees (`boost_tree()`) + +## `C5.0` Engine + +We create a model specification via: + +```{r} +#| label: spec-C5.0-boost-tree-classification +boost_tree_spec <- boost_tree() |> + set_mode("classification") |> + set_engine("C5.0") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-C5.0-boost-tree-classification +boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-C5.0-boost-tree-classification +predict(boost_tree_fit, type = "class", new_data = bin_test) +predict(boost_tree_fit, type = "prob", new_data = bin_test) +``` + +## `catboost` Engine + +This engine requires the bonsai extension package, so let's load this first: + +```{r} +#| label: load-catboost-boost-tree-classification-bonsai +#| output: false +library(bonsai) +``` + +We create a model specification via: + +```{r} +#| label: spec-catboost-boost-tree-classification +boost_tree_spec <- boost_tree() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("catboost") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-catboost-boost-tree-classification +boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-catboost-boost-tree-classification +predict(boost_tree_fit, type = "class", new_data = bin_test) +predict(boost_tree_fit, type = "prob", new_data = bin_test) +``` + +## `h2o` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-boost-tree-classification-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-boost-tree-classification +boost_tree_spec <- boost_tree() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("h2o_gbm") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-boost-tree-classification +boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-boost-tree-classification +predict(boost_tree_fit, type = "class", new_data = bin_test) +predict(boost_tree_fit, type = "prob", new_data = bin_test) +``` + +## `h2o_gbm` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-gbm-boost-tree-classification-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-gbm-boost-tree-classification +boost_tree_spec <- boost_tree() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("h2o_gbm") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-gbm-boost-tree-classification 
+boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-gbm-boost-tree-classification +predict(boost_tree_fit, type = "class", new_data = bin_test) +predict(boost_tree_fit, type = "prob", new_data = bin_test) +``` + +## `lightgbm` Engine + +This engine requires the bonsai extension package, so let's load this first: + +```{r} +#| label: load-lightgbm-boost-tree-classification-bonsai +#| output: false +library(bonsai) +``` + +We create a model specification via: + +```{r} +#| label: spec-lightgbm-boost-tree-classification +boost_tree_spec <- boost_tree() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("lightgbm") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-lightgbm-boost-tree-classification +boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-lightgbm-boost-tree-classification +predict(boost_tree_fit, type = "class", new_data = bin_test) +predict(boost_tree_fit, type = "prob", new_data = bin_test) +``` + +## `xgboost` Engine + +We create a model specification via: + +```{r} +#| label: spec-xgboost-boost-tree-classification +boost_tree_spec <- boost_tree() |> + # We need to set the mode since this engine works with multiple modes + # and xgboost is the default engine so there is no need to set that either. + set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-xgboost-boost-tree-classification +boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-xgboost-boost-tree-classification +predict(boost_tree_fit, type = "class", new_data = bin_test) +predict(boost_tree_fit, type = "prob", new_data = bin_test) +``` + +## C5 Rules (`C5_rules()`) + +## `C5.0` Engine + +This engine requires the rules extension package, so let's load this first: + +```{r} +#| label: load-C5.0-C5-rules-classification-rules +#| output: false +library(rules) +``` + +We create a model specification via: + +```{r} +#| label: spec-C5.0-C5-rules-classification +# This engine works with a single mode so no need to set that +# and C5.0 is the default engine so there is no need to set that either. 
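# (Added note: C5_rules() is the rule-based variant of the C5.0 model, where
# the boosted trees are collapsed into a set of classification rules.)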
+C5_rules_spec <- C5_rules() +``` + +Now we create the model fit object: + +```{r} +#| label: fit-C5.0-C5-rules-classification +C5_rules_fit <- C5_rules_spec |> fit(class ~ ., data = bin_train) +C5_rules_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-C5.0-C5-rules-classification +predict(C5_rules_fit, type = "class", new_data = bin_test) +predict(C5_rules_fit, type = "prob", new_data = bin_test) +``` + +## Decision Tree (`decision_tree()`) + +## `C5.0` Engine + +We create a model specification via: + +```{r} +#| label: spec-C5.0-decision-tree-classification +decision_tree_spec <- decision_tree() |> + set_mode("classification") |> + set_engine("C5.0") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-C5.0-decision-tree-classification +decision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train) +decision_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-C5.0-decision-tree-classification +predict(decision_tree_fit, type = "class", new_data = bin_test) +predict(decision_tree_fit, type = "prob", new_data = bin_test) +``` + +## `partykit` Engine + +This engine requires the bonsai extension package, so let's load this first: + +```{r} +#| label: load-partykit-decision-tree-classification-bonsai +#| output: false +library(bonsai) +``` + +We create a model specification via: + +```{r} +#| label: spec-partykit-decision-tree-classification +decision_tree_spec <- decision_tree() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("partykit") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-partykit-decision-tree-classification +decision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train) +decision_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-partykit-decision-tree-classification +predict(decision_tree_fit, type = "class", new_data = bin_test) +predict(decision_tree_fit, type = "prob", new_data = bin_test) +``` + +## `rpart` Engine + +We create a model specification via: + +```{r} +#| label: spec-rpart-decision-tree-classification +decision_tree_spec <- decision_tree() |> + # We need to set the mode since this engine works with multiple modes + # and rpart is the default engine so there is no need to set that either. + set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-rpart-decision-tree-classification +decision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train) +decision_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-rpart-decision-tree-classification +predict(decision_tree_fit, type = "class", new_data = bin_test) +predict(decision_tree_fit, type = "prob", new_data = bin_test) +``` + +## Flexible Discriminant Analysis (`discrim_flexible()`) + +## `earth` Engine + +This engine requires the discrim extension package, so let's load this first: + +```{r} +#| label: load-earth-discrim-flexible-classification-discrim +#| output: false +library(discrim) +``` + +We create a model specification via: + +```{r} +#| label: spec-earth-discrim-flexible-classification +# This engine works with a single mode so no need to set that +# and earth is the default engine so there is no need to set that either. 
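# (Added note: flexible discriminant analysis builds nonlinear class
# boundaries from MARS hinge functions, which is why earth is the engine.)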
discrim_flexible_spec <- discrim_flexible()
```

Now we create the model fit object:

```{r}
#| label: fit-earth-discrim-flexible-classification
discrim_flexible_fit <- discrim_flexible_spec |> fit(class ~ ., data = bin_train)
discrim_flexible_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-earth-discrim-flexible-classification
predict(discrim_flexible_fit, type = "class", new_data = bin_test)
predict(discrim_flexible_fit, type = "prob", new_data = bin_test)
```

## Linear Discriminant Analysis (`discrim_linear()`)

## `MASS` Engine

This engine requires the discrim extension package, so let's load this first:

```{r}
#| label: load-MASS-discrim-linear-classification-discrim
#| output: false
library(discrim)
```

We create a model specification via:

```{r}
#| label: spec-MASS-discrim-linear-classification
# This engine works with a single mode so no need to set that
# and MASS is the default engine so there is no need to set that either.
discrim_linear_spec <- discrim_linear()
```

Now we create the model fit object:

```{r}
#| label: fit-MASS-discrim-linear-classification
discrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)
discrim_linear_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-MASS-discrim-linear-classification
predict(discrim_linear_fit, type = "class", new_data = bin_test)
predict(discrim_linear_fit, type = "prob", new_data = bin_test)
```

## `mda` Engine

This engine requires the discrim extension package, so let's load this first:

```{r}
#| label: load-mda-discrim-linear-classification-discrim
#| output: false
library(discrim)
```

We create a model specification via:

```{r}
#| label: spec-mda-discrim-linear-classification
discrim_linear_spec <- discrim_linear() |>
  # This engine works with a single mode so no need to set that
  set_engine("mda")
```

Now we create the model fit object:

```{r}
#| label: fit-mda-discrim-linear-classification
discrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)
discrim_linear_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-mda-discrim-linear-classification
predict(discrim_linear_fit, type = "class", new_data = bin_test)
predict(discrim_linear_fit, type = "prob", new_data = bin_test)
```

## `sda` Engine

This engine requires the discrim extension package, so let's load this first:

```{r}
#| label: load-sda-discrim-linear-classification-discrim
#| output: false
library(discrim)
```

We create a model specification via:

```{r}
#| label: spec-sda-discrim-linear-classification
discrim_linear_spec <- discrim_linear() |>
  # This engine works with a single mode so no need to set that
  set_engine("sda")
```

Now we create the model fit object:

```{r}
#| label: fit-sda-discrim-linear-classification
discrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)
discrim_linear_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-sda-discrim-linear-classification
predict(discrim_linear_fit, type = "class", new_data = bin_test)
predict(discrim_linear_fit, type = "prob", new_data = bin_test)
```

## `sparsediscrim` Engine

This engine requires the discrim extension package, so let's load this first:

```{r}
#| label: load-sparsediscrim-discrim-linear-classification-discrim
#| output: false
library(discrim)
```

We create a model specification via:

```{r}
#| label: spec-sparsediscrim-discrim-linear-classification
discrim_linear_spec <- discrim_linear() |>
  # This engine works with a single mode so no need to set that
  set_engine("sparsediscrim")
```

Now we create the model fit object:

```{r}
#| label: fit-sparsediscrim-discrim-linear-classification
discrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)
discrim_linear_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-sparsediscrim-discrim-linear-classification
predict(discrim_linear_fit, type = "class", new_data = bin_test)
predict(discrim_linear_fit, type = "prob", new_data = bin_test)
```

## Quadratic Discriminant Analysis (`discrim_quad()`)

## `MASS` Engine

This engine requires the discrim extension package, so let's load this first:

```{r}
#| label: load-MASS-discrim-quad-classification-discrim
#| output: false
library(discrim)
```

We create a model specification via:

```{r}
#| label: spec-MASS-discrim-quad-classification
# This engine works with a single mode so no need to set that
# and MASS is the default engine so there is no need to set that either.
discrim_quad_spec <- discrim_quad()
```

Now we create the model fit object:

```{r}
#| label: fit-MASS-discrim-quad-classification
discrim_quad_fit <- discrim_quad_spec |> fit(class ~ ., data = bin_train)
discrim_quad_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-MASS-discrim-quad-classification
predict(discrim_quad_fit, type = "class", new_data = bin_test)
predict(discrim_quad_fit, type = "prob", new_data = bin_test)
```

## `sparsediscrim` Engine

This engine requires the discrim extension package, so let's load this first:

```{r}
#| label: load-sparsediscrim-discrim-quad-classification-discrim
#| output: false
library(discrim)
```

We create a model specification via:

```{r}
#| label: spec-sparsediscrim-discrim-quad-classification
discrim_quad_spec <- discrim_quad() |>
  # This engine works with a single mode so no need to set that
  set_engine("sparsediscrim")
```

Now we create the model fit object:

```{r}
#| label: fit-sparsediscrim-discrim-quad-classification
discrim_quad_fit <- discrim_quad_spec |> fit(class ~ ., data = bin_train)
discrim_quad_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-sparsediscrim-discrim-quad-classification
predict(discrim_quad_fit, type = "class", new_data = bin_test)
predict(discrim_quad_fit, type = "prob", new_data = bin_test)
```

## Regularized Discriminant Analysis (`discrim_regularized()`)

## `klaR` Engine

This engine requires the discrim extension package, so let's load this first:

```{r}
#| label: load-klaR-discrim-regularized-classification-discrim
#| output: false
library(discrim)
```

We create a model specification via:

```{r}
#| label: spec-klaR-discrim-regularized-classification
# This engine works with a single mode so no need to set that
# and klaR is the default engine so there is no need to set that either.
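# (Added note: regularized discriminant analysis blends LDA and QDA; the
# frac_common_cov and frac_identity arguments control the blend.)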
+discrim_regularized_spec <- discrim_regularized() +``` + +Now we create the model fit object: + +```{r} +#| label: fit-klaR-discrim-regularized-classification +discrim_regularized_fit <- discrim_regularized_spec |> fit(class ~ ., data = bin_train) +discrim_regularized_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-klaR-discrim-regularized-classification +predict(discrim_regularized_fit, type = "class", new_data = bin_test) +predict(discrim_regularized_fit, type = "prob", new_data = bin_test) +``` + +## Generalized Additive Models (`gen_additive_mod()`) + +## `mgcv` Engine + +We create a model specification via: + +```{r} +#| label: spec-mgcv-gen-additive-mod-classification +gen_additive_mod_spec <- gen_additive_mod() |> + # We need to set the mode since this engine works with multiple modes + # and mgcv is the default engine so there is no need to set that either. + set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-mgcv-gen-additive-mod-classification +gen_additive_mod_fit <- + gen_additive_mod_spec |> + fit(class ~ s(A) + s(B), data = bin_train) +gen_additive_mod_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-mgcv-gen-additive-mod-classification +predict(gen_additive_mod_fit, type = "class", new_data = bin_test) +predict(gen_additive_mod_fit, type = "prob", new_data = bin_test) +predict(gen_additive_mod_fit, type = "conf_int", new_data = bin_test) +``` + +## Logistic Regression (`logistic_reg()`) + +## `brulee` Engine + +We create a model specification via: + +```{r} +#| label: spec-brulee-logistic-reg-classification +logistic_reg_spec <- logistic_reg() |> + # This engine works with a single mode so no need to set that + set_engine("brulee") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-brulee-logistic-reg-classification +logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) +logistic_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-brulee-logistic-reg-classification +predict(logistic_reg_fit, type = "class", new_data = bin_test) +predict(logistic_reg_fit, type = "prob", new_data = bin_test) +``` + +## `gee` Engine + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-gee-logistic-reg-classification-multilevelmod +#| output: false +library(multilevelmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-gee-logistic-reg-classification +logistic_reg_spec <- logistic_reg() |> + # This engine works with a single mode so no need to set that + set_engine("gee") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-gee-logistic-reg-classification +#| eval: false +logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) +logistic_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-gee-logistic-reg-classification +#| eval: false +predict(logistic_reg_fit, type = "class", new_data = bin_test) +predict(logistic_reg_fit, type = "prob", new_data = bin_test) +``` + +## `glm` Engine + +We create a model specification via: + +```{r} +#| label: spec-glm-logistic-reg-classification +logistic_reg_spec <- logistic_reg() + # This engine works with a single mode so no need to set that + # and glm is the default engine so there is no need to set that either. 
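# Printing a specification is a quick way to check the model type, mode, and
# engine before fitting:
logistic_reg_spec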
+``` + +Now we create the model fit object: + +```{r} +#| label: fit-glm-logistic-reg-classification +logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) +logistic_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-glm-logistic-reg-classification +predict(logistic_reg_fit, type = "class", new_data = bin_test) +predict(logistic_reg_fit, type = "prob", new_data = bin_test) +predict(logistic_reg_fit, type = "conf_int", new_data = bin_test) +``` + +## `glmer` Engine + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-glmer-logistic-reg-classification-multilevelmod +#| output: false +library(multilevelmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-glmer-logistic-reg-classification +#| eval: false +logistic_reg_spec <- logistic_reg() |> + # This engine works with a single mode so no need to set that + set_engine("glmer") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-glmer-logistic-reg-classification +#| eval: false +logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) +logistic_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-glmer-logistic-reg-classification +predict(logistic_reg_fit, type = "class", new_data = bin_test) +predict(logistic_reg_fit, type = "prob", new_data = bin_test) +``` + +## `glmnet` Engine + +We create a model specification via: + +```{r} +#| label: spec-glmnet-logistic-reg-classification +logistic_reg_spec <- logistic_reg(penalty = 0.01) |> + # This engine works with a single mode so no need to set that + set_engine("glmnet") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-glmnet-logistic-reg-classification +logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) +logistic_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-glmnet-logistic-reg-classification +predict(logistic_reg_fit, type = "class", new_data = bin_test) +predict(logistic_reg_fit, type = "prob", new_data = bin_test) +``` + +## `h2o` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-logistic-reg-classification-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-logistic-reg-classification +logistic_reg_spec <- logistic_reg() |> + # This engine works with a single mode so no need to set that + set_engine("h2o") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-logistic-reg-classification +logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) +logistic_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-logistic-reg-classification +predict(logistic_reg_fit, type = "class", new_data = bin_test) +predict(logistic_reg_fit, type = "prob", new_data = bin_test) +``` + +## `keras` Engine + +We create a model specification via: + +```{r} +#| label: spec-keras-logistic-reg-classification +logistic_reg_spec <- logistic_reg() |> + # This engine works with a single mode so no need to set that + set_engine("keras") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-keras-logistic-reg-classification +#| eval: false +logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) +logistic_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-keras-logistic-reg-classification +#| eval: false 
+predict(logistic_reg_fit, type = "class", new_data = bin_test) +predict(logistic_reg_fit, type = "prob", new_data = bin_test) +``` + +## `LiblineaR` Engine + +We create a model specification via: + +```{r} +#| label: spec-LiblineaR-logistic-reg-classification +logistic_reg_spec <- logistic_reg() |> + # This engine works with a single mode so no need to set that + set_engine("LiblineaR") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-LiblineaR-logistic-reg-classification +logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) +logistic_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-LiblineaR-logistic-reg-classification +predict(logistic_reg_fit, type = "class", new_data = bin_test) +predict(logistic_reg_fit, type = "prob", new_data = bin_test) +``` + +## `stan` Engine + +We create a model specification via: + +```{r} +#| label: spec-stan-logistic-reg-classification +logistic_reg_spec <- logistic_reg() |> + # This engine works with a single mode so no need to set that + set_engine("stan") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-stan-logistic-reg-classification +logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) +logistic_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-stan-logistic-reg-classification +predict(logistic_reg_fit, type = "class", new_data = bin_test) +predict(logistic_reg_fit, type = "prob", new_data = bin_test) +predict(logistic_reg_fit, type = "conf_int", new_data = bin_test) +predict(logistic_reg_fit, type = "pred_int", new_data = bin_test) +``` + +## `stan_glmer` Engine + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-stan-glmer-logistic-reg-classification-multilevelmod +#| output: false +library(multilevelmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-stan-glmer-logistic-reg-classification +logistic_reg_spec <- logistic_reg() |> + # This engine works with a single mode so no need to set that + set_engine("stan_glmer") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-stan-glmer-logistic-reg-classification +#| eval: false +logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) +logistic_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-stan-glmer-logistic-reg-classification +#| eval: false +predict(logistic_reg_fit, type = "class", new_data = bin_test) +predict(logistic_reg_fit, type = "prob", new_data = bin_test) +predict(logistic_reg_fit, type = "conf_int", new_data = bin_test) +predict(logistic_reg_fit, type = "pred_int", new_data = bin_test) +``` + +## Multivariate Adaptive Regression Splines (`mars()`) + +## `earth` Engine + +We create a model specification via: + +```{r} +#| label: spec-earth-mars-classification +mars_spec <- mars() |> + # We need to set the mode since this engine works with multiple modes + # and earth is the default engine so there is no need to set that either. 
+ set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-earth-mars-classification +mars_fit <- mars_spec |> fit(class ~ ., data = bin_train) +mars_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-earth-mars-classification +predict(mars_fit, type = "class", new_data = bin_test) +predict(mars_fit, type = "prob", new_data = bin_test) +``` + +## Neural Networks (`mlp()`) + +## `brulee` Engine + +We create a model specification via: + +```{r} +#| label: spec-brulee-mlp-classification +mlp_spec <- mlp() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("brulee") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-brulee-mlp-classification +mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) +mlp_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-brulee-mlp-classification +predict(mlp_fit, type = "class", new_data = bin_test) +predict(mlp_fit, type = "prob", new_data = bin_test) +``` + +## `brulee_two_layer` Engine + +We create a model specification via: + +```{r} +#| label: spec-brulee-two-layer-mlp-classification +mlp_spec <- mlp() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("brulee_two_layer") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-brulee-two-layer-mlp-classification +mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) +mlp_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-brulee-two-layer-mlp-classification +predict(mlp_fit, type = "class", new_data = bin_test) +predict(mlp_fit, type = "prob", new_data = bin_test) +``` + +## `h2o` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-mlp-classification-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-mlp-classification +mlp_spec <- mlp() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("h2o") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-mlp-classification +mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) +mlp_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-mlp-classification +predict(mlp_fit, type = "class", new_data = bin_test) +predict(mlp_fit, type = "prob", new_data = bin_test) +``` + +## `keras` Engine + +We create a model specification via: + +```{r} +#| label: spec-keras-mlp-classification +mlp_spec <- mlp() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("keras") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-keras-mlp-classification +#| eval: false +mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) +mlp_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-keras-mlp-classification +#| eval: false +predict(mlp_fit, type = "class", new_data = bin_test) +predict(mlp_fit, type = "prob", new_data = bin_test) +``` + +## `nnet` Engine + +We create a model specification via: + +```{r} +#| label: spec-nnet-mlp-classification +mlp_spec <- mlp() |> + # We need to set the mode since this engine works with multiple modes + # and nnet is the default engine so there is no need to set that either. 
+ set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-nnet-mlp-classification +mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) +mlp_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-nnet-mlp-classification +predict(mlp_fit, type = "class", new_data = bin_test) +predict(mlp_fit, type = "prob", new_data = bin_test) +``` + +## Multinom Regression (`multinom_reg()`) + +## `brulee` Engine + +We create a model specification via: + +```{r} +#| label: spec-brulee-multinom-reg-classification +multinom_reg_spec <- multinom_reg() |> + # This engine works with a single mode so no need to set that + set_engine("brulee") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-brulee-multinom-reg-classification +multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train) +multinom_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-brulee-multinom-reg-classification +predict(multinom_reg_fit, type = "class", new_data = mtl_test) +predict(multinom_reg_fit, type = "prob", new_data = mtl_test) +``` + +## `glmnet` Engine + +We create a model specification via: + +```{r} +#| label: spec-glmnet-multinom-reg-classification +multinom_reg_spec <- multinom_reg(penalty = 0.01) |> + # This engine works with a single mode so no need to set that + set_engine("glmnet") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-glmnet-multinom-reg-classification +multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train) +multinom_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-glmnet-multinom-reg-classification +predict(multinom_reg_fit, type = "class", new_data = mtl_test) +predict(multinom_reg_fit, type = "prob", new_data = mtl_test) +``` + +## `h2o` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-multinom-reg-classification-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-multinom-reg-classification +multinom_reg_spec <- multinom_reg() |> + # This engine works with a single mode so no need to set that + set_engine("h2o") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-multinom-reg-classification +multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train) +multinom_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-multinom-reg-classification +predict(multinom_reg_fit, type = "class", new_data = mtl_test) +predict(multinom_reg_fit, type = "prob", new_data = mtl_test) +``` + +## `keras` Engine + +We create a model specification via: + +```{r} +#| label: spec-keras-multinom-reg-classification +multinom_reg_spec <- multinom_reg() |> + # This engine works with a single mode so no need to set that + set_engine("keras") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-keras-multinom-reg-classification +#| eval: false +multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train) +multinom_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-keras-multinom-reg-classification +#| eval: false +predict(multinom_reg_fit, type = "class", new_data = mtl_test) +predict(multinom_reg_fit, type = "prob", new_data = mtl_test) +``` + +## `nnet` Engine + +We create a model specification via: + +```{r} +#| label: spec-nnet-multinom-reg-classification +# This engine works with a single mode so 
# no need to set that
# and nnet is the default engine so there is no need to set that either.
multinom_reg_spec <- multinom_reg()
```

Now we create the model fit object:

```{r}
#| label: fit-nnet-multinom-reg-classification
multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)
multinom_reg_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-nnet-multinom-reg-classification
predict(multinom_reg_fit, type = "class", new_data = mtl_test)
predict(multinom_reg_fit, type = "prob", new_data = mtl_test)
```

## Naive Bayes (`naive_Bayes()`)

## `h2o` Engine

This engine requires the agua extension package, so let's load this first:

```{r}
#| label: load-h2o-naive-Bayes-classification-agua
#| output: false
library(agua)
```

We create a model specification via:

```{r}
#| label: spec-h2o-naive-Bayes-classification
naive_Bayes_spec <- naive_Bayes() |>
  # This engine works with a single mode so no need to set that
  set_engine("h2o")
```

Now we create the model fit object:

```{r}
#| label: fit-h2o-naive-Bayes-classification
naive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)
naive_Bayes_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-h2o-naive-Bayes-classification
predict(naive_Bayes_fit, type = "class", new_data = bin_test)
predict(naive_Bayes_fit, type = "prob", new_data = bin_test)
```

## `klaR` Engine

This engine requires the discrim extension package, so let's load this first:

```{r}
#| label: load-klaR-naive-Bayes-classification-discrim
#| output: false
library(discrim)
```

We create a model specification via:

```{r}
#| label: spec-klaR-naive-Bayes-classification
# This engine works with a single mode so no need to set that
# and klaR is the default engine so there is no need to set that either.
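# (Added note: naive Bayes treats the predictors as independent within each
# class; the smoothness and Laplace arguments tune its density estimates.)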
+naive_Bayes_spec <- naive_Bayes() +``` + +Now we create the model fit object: + +```{r} +#| label: fit-klaR-naive-Bayes-classification +naive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train) + +# No real print method +# naive_Bayes_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-klaR-naive-Bayes-classification +predict(naive_Bayes_fit, type = "class", new_data = bin_test) +predict(naive_Bayes_fit, type = "prob", new_data = bin_test) +``` + +## `naivebayes` Engine + +This engine requires the discrim extension package, so let's load this first: + +```{r} +#| label: load-naivebayes-naive-Bayes-classification-discrim +#| output: false +library(discrim) +``` + +We create a model specification via: + +```{r} +#| label: spec-naivebayes-naive-Bayes-classification +naive_Bayes_spec <- naive_Bayes() |> + # This engine works with a single mode so no need to set that + set_engine("naivebayes") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-naivebayes-naive-Bayes-classification +naive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train) +naive_Bayes_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-naivebayes-naive-Bayes-classification +predict(naive_Bayes_fit, type = "class", new_data = bin_test) +predict(naive_Bayes_fit, type = "prob", new_data = bin_test) +``` + +## K-Nearest Neighbors (`nearest_neighbor()`) + +## `kknn` Engine + +We create a model specification via: + +```{r} +#| label: spec-kknn-nearest-neighbor-classification +nearest_neighbor_spec <- nearest_neighbor() |> + # We need to set the mode since this engine works with multiple modes + # and kknn is the default engine so there is no need to set that either. + set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-kknn-nearest-neighbor-classification +nearest_neighbor_fit <- nearest_neighbor_spec |> fit(class ~ ., data = bin_train) +nearest_neighbor_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-kknn-nearest-neighbor-classification +predict(nearest_neighbor_fit, type = "class", new_data = bin_test) +predict(nearest_neighbor_fit, type = "prob", new_data = bin_test) +``` + +## Null Model (`null_model()`) + +## `parsnip` Engine + +We create a model specification via: + +```{r} +#| label: spec-parsnip-null-model-classification +null_model_spec <- null_model() |> + # We need to set the mode since this engine works with multiple modes + # and parsnip is the default engine so there is no need to set that either. + set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-parsnip-null-model-classification +null_model_fit <- null_model_spec |> fit(class ~ ., data = bin_train) +null_model_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-parsnip-null-model-classification +predict(null_model_fit, type = "class", new_data = bin_test) +predict(null_model_fit, type = "prob", new_data = bin_test) +``` + +## Partial Least Squares (`pls()`) + +## `mixOmics` Engine + +This engine requires the plsmod extension package, so let's load this first: + +```{r} +#| label: load-mixOmics-pls-classification-plsmod +#| output: false +library(plsmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-mixOmics-pls-classification +pls_spec <- pls() |> + # We need to set the mode since this engine works with multiple modes + # and mixOmics is the default engine so there is no need to set that either. 
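  # (Added note: for classification, the mixOmics engine fits a PLS
  # discriminant analysis, i.e. PLS-DA, model.)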
+ set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-mixOmics-pls-classification +pls_fit <- pls_spec |> fit(class ~ ., data = bin_train) +pls_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-mixOmics-pls-classification +predict(pls_fit, type = "class", new_data = bin_test) +predict(pls_fit, type = "prob", new_data = bin_test) +``` + +## Random Forests (`rand_forest()`) + +## `aorsf` Engine + +This engine requires the bonsai extension package, so let's load this first: + +```{r} +#| label: load-aorsf-rand-forest-classification-bonsai +#| output: false +library(bonsai) +``` + +We create a model specification via: + +```{r} +#| label: spec-aorsf-rand-forest-classification +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("aorsf") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-aorsf-rand-forest-classification +rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-aorsf-rand-forest-classification +predict(rand_forest_fit, type = "class", new_data = bin_test) +predict(rand_forest_fit, type = "prob", new_data = bin_test) +``` + +## `grf` Engine + +We create a model specification via: + +```{r} +#| label: spec-grf-rand-forest-classification +#| eval: false +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("grf") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-grf-rand-forest-classification +#| eval: false +rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-grf-rand-forest-classification +#| eval: false +predict(rand_forest_fit, type = "class", new_data = bin_test) +predict(rand_forest_fit, type = "prob", new_data = bin_test) +predict(rand_forest_fit, type = "conf_int", new_data = bin_test) +``` + +## `h2o` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-rand-forest-classification-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-rand-forest-classification +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("h2o") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-rand-forest-classification +rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-rand-forest-classification +predict(rand_forest_fit, type = "class", new_data = bin_test) +predict(rand_forest_fit, type = "prob", new_data = bin_test) +``` + +## `partykit` Engine + +This engine requires the bonsai extension package, so let's load this first: + +```{r} +#| label: load-partykit-rand-forest-classification-bonsai +#| output: false +library(bonsai) +``` + +We create a model specification via: + +```{r} +#| label: spec-partykit-rand-forest-classification +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("partykit") +``` + +Now we create the model fit 
object: + +```{r} +#| label: fit-partykit-rand-forest-classification +rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) + +# Too long to print +# rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-partykit-rand-forest-classification +predict(rand_forest_fit, type = "class", new_data = bin_test) +predict(rand_forest_fit, type = "prob", new_data = bin_test) +``` + +## `randomForest` Engine + +We create a model specification via: + +```{r} +#| label: spec-randomForest-rand-forest-classification +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("randomForest") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-randomForest-rand-forest-classification +rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-randomForest-rand-forest-classification +predict(rand_forest_fit, type = "class", new_data = bin_test) +predict(rand_forest_fit, type = "prob", new_data = bin_test) +``` + +## `ranger` Engine + +We create a model specification via: + +```{r} +#| label: spec-ranger-rand-forest-classification +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + # and ranger is the default engine so there is no need to set that either. + set_engine("ranger", keep.inbag = TRUE) |> + # However, we'll set the engine and use the keep.inbag=TRUE option so that we + # can produce interval predictions. This is not generally required. + set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-ranger-rand-forest-classification +rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-ranger-rand-forest-classification +predict(rand_forest_fit, type = "class", new_data = bin_test) +predict(rand_forest_fit, type = "prob", new_data = bin_test) +predict(rand_forest_fit, type = "conf_int", new_data = bin_test) +``` + +## Rule Fit (`rule_fit()`) + +## `h2o` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-rule-fit-classification-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-rule-fit-classification +rule_fit_spec <- rule_fit() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("h2o") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-rule-fit-classification +rule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train) +rule_fit_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-rule-fit-classification +predict(rule_fit_fit, type = "class", new_data = bin_test) +predict(rule_fit_fit, type = "prob", new_data = bin_test) +``` + +## `xrf` Engine + +This engine requires the rules extension package, so let's load this first: + +```{r} +#| label: load-xrf-rule-fit-classification-rules +#| output: false +library(rules) +``` + +We create a model specification via: + +```{r} +#| label: spec-xrf-rule-fit-classification +rule_fit_spec <- rule_fit() |> + # We need to set the mode since this engine works with multiple modes + # and xrf is the default engine so there is no need to set that either. 
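  # (Added note: RuleFit generates candidate rules from a boosted tree
  # ensemble and then fits a regularized model using those rules as predictors.)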
+ set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-xrf-rule-fit-classification +rule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train) +rule_fit_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-xrf-rule-fit-classification +predict(rule_fit_fit, type = "class", new_data = bin_test) +predict(rule_fit_fit, type = "prob", new_data = bin_test) +``` + +## Support Vector Machine (Linear Kernel) (`svm_linear()`) + +## `kernlab` Engine + +We create a model specification via: + +```{r} +#| label: spec-kernlab-svm-linear-classification +svm_linear_spec <- svm_linear() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("kernlab") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-kernlab-svm-linear-classification +svm_linear_fit <- svm_linear_spec |> fit(class ~ ., data = bin_train) +svm_linear_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-kernlab-svm-linear-classification +predict(svm_linear_fit, type = "class", new_data = bin_test) +predict(svm_linear_fit, type = "prob", new_data = bin_test) +``` + +## `LiblineaR` Engine + +We create a model specification via: + +```{r} +#| label: spec-LiblineaR-svm-linear-classification +svm_linear_spec <- svm_linear() |> + # We need to set the mode since this engine works with multiple modes + # and LiblineaR is the default engine so there is no need to set that either. + set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-LiblineaR-svm-linear-classification +svm_linear_fit <- svm_linear_spec |> fit(class ~ ., data = bin_train) +svm_linear_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-LiblineaR-svm-linear-classification +predict(svm_linear_fit, type = "class", new_data = bin_test) +``` + +## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) + +## `kernlab` Engine + +We create a model specification via: + +```{r} +#| label: spec-kernlab-svm-poly-classification +svm_poly_spec <- svm_poly() |> + # We need to set the mode since this engine works with multiple modes + # and kernlab is the default engine so there is no need to set that either. + set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-kernlab-svm-poly-classification +svm_poly_fit <- svm_poly_spec |> fit(class ~ ., data = bin_train) +svm_poly_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-kernlab-svm-poly-classification +predict(svm_poly_fit, type = "class", new_data = bin_test) +predict(svm_poly_fit, type = "prob", new_data = bin_test) +``` + +## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) + +## `kernlab` Engine + +We create a model specification via: + +```{r} +#| label: spec-kernlab-svm-rbf-classification +svm_rbf_spec <- svm_rbf() |> + # We need to set the mode since this engine works with multiple modes + # and kernlab is the default engine so there is no need to set that either. 
+ set_mode("classification")
```

Now we create the model fit object:

```{r}
#| label: fit-kernlab-svm-rbf-classification
svm_rbf_fit <- svm_rbf_spec |> fit(class ~ ., data = bin_train)
svm_rbf_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-kernlab-svm-rbf-classification
predict(svm_rbf_fit, type = "class", new_data = bin_test)
predict(svm_rbf_fit, type = "prob", new_data = bin_test)
```

## `liquidSVM` Engine

Note that this package is not on CRAN. You can try to install it from the read-only CRAN mirror on GitHub:

```{r}
#| label: install-liquidSVM
#| eval: false
pak::pak("cran/liquidSVM") # currently fails to install
```

We create a model specification via:

```{r}
#| label: spec-liquidSVM-svm-rbf-classification
svm_rbf_spec <- svm_rbf() |>
  # We need to set the mode since this engine works with multiple modes
  set_mode("classification") |>
  set_engine("liquidSVM")
```

Now we create the model fit object:

```{r}
#| label: fit-liquidSVM-svm-rbf-classification
#| eval: false
svm_rbf_fit <- svm_rbf_spec |> fit(class ~ ., data = bin_train)
svm_rbf_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-liquidSVM-svm-rbf-classification
#| eval: false
predict(svm_rbf_fit, type = "class", new_data = bin_test)
predict(svm_rbf_fit, type = "prob", new_data = bin_test)
```

# Regression Models

To demonstrate regression, we'll subset some data. make a training/test split, and stanrdize the predictors:

```{r}
#| label: reg-data
set.seed(938)
reg_split <- 
  modeldata::concrete |> 
  slice_sample(n = 100) |> 
  select(strength = compressive_strength, cement, age) |> 
  initial_split(prop = 0.95, strata = strength)
reg_split

reg_rec <- 
  recipe(strength ~ ., data = training(reg_split)) |> 
  step_normalize(all_numeric_predictors()) |> 
  prep()

reg_train <- bake(reg_rec, new_data = NULL)
reg_test <- bake(reg_rec, new_data = testing(reg_split))
```

We also have some models that are specific to integer count outcomes. 
The data for these are: + +```{r} +#| label: count-data +set.seed(207) +count_split <- + attrition |> + select(num_years = TotalWorkingYears, age = Age, income = MonthlyIncome) |> + initial_split(prop = 0.994) +count_split + +count_rec <- + recipe(num_years ~ ., data = training(count_split)) |> + step_normalize(all_numeric_predictors()) |> + prep() + +count_train <- bake(count_rec, new_data = NULL) +count_test <- bake(count_rec, new_data = testing(count_split)) +``` + + +## Auto Ml (`auto_ml()`) + +## `h2o` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-auto-ml-regression-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-auto-ml-regression +#| eval: false +auto_ml_spec <- auto_ml() |> + # We dont need to set the engine (since there is only one) but we'll set + # a time limit + set_engine("h2o", max_runtime_secs = 60 * 3) |> + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-auto-ml-regression +#| eval: false +auto_ml_fit <- auto_ml_spec |> fit(strength ~ ., data = reg_train) +auto_ml_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-auto-ml-regression +#| eval: false +predict(auto_ml_fit, new_data = reg_test) +``` + +## Bagged MARS (`bag_mars()`) + +## `earth` Engine + +This engine requires the baguette extension package, so let's load this first: + +```{r} +#| label: load-earth-bag-mars-regression-baguette +#| output: false +library(baguette) +``` + +We create a model specification via: + +```{r} +#| label: spec-earth-bag-mars-regression +bag_mars_spec <- bag_mars() |> + # We need to set the mode since this engine works with multiple modes + # and earth is the default engine so there is no need to set that either. + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-earth-bag-mars-regression +bag_mars_fit <- bag_mars_spec |> fit(strength ~ ., data = reg_train) +bag_mars_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-earth-bag-mars-regression +predict(bag_mars_fit, new_data = reg_test) +``` + +## Bagged Neural Networks (`bag_mlp()`) + +## `nnet` Engine + +This engine requires the baguette extension package, so let's load this first: + +```{r} +#| label: load-nnet-bag-mlp-regression-baguette +#| output: false +library(baguette) +``` + +We create a model specification via: + +```{r} +#| label: spec-nnet-bag-mlp-regression +bag_mlp_spec <- bag_mlp() |> + # We need to set the mode since this engine works with multiple modes + # and nnet is the default engine so there is no need to set that either. 
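  # bag_mlp() creates an ensemble of single-hidden-layer nnet models fit to
  # bootstrap samples; predictions are averaged across the ensemble.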
+ set_mode("regression")
```

Now we create the model fit object:

```{r}
#| label: fit-nnet-bag-mlp-regression
bag_mlp_fit <- bag_mlp_spec |> fit(strength ~ ., data = reg_train)
bag_mlp_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-nnet-bag-mlp-regression
predict(bag_mlp_fit, new_data = reg_test)
```

## Bagged Decision Trees (`bag_tree()`)

## `rpart` Engine

This engine requires the baguette extension package, so let's load this first:

```{r}
#| label: load-rpart-bag-tree-regression-baguette
#| output: false
library(baguette)
```

We create a model specification via:

```{r}
#| label: spec-rpart-bag-tree-regression
bag_tree_spec <- bag_tree() |>
  # We need to set the mode since this engine works with multiple modes
  # and rpart is the default engine so there is no need to set that either.
  set_mode("regression")
```

Now we create the model fit object:

```{r}
#| label: fit-rpart-bag-tree-regression
bag_tree_fit <- bag_tree_spec |> fit(strength ~ ., data = reg_train)
bag_tree_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-rpart-bag-tree-regression
predict(bag_tree_fit, new_data = reg_test)
```

## Bayesian Additive Regression Trees (`bart()`)

## `dbarts` Engine

We create a model specification via:

```{r}
#| label: spec-dbarts-bart-regression
bart_spec <- bart() |>
  # We need to set the mode since this engine works with multiple modes
  # and dbarts is the default engine so there is no need to set that either.
  set_mode("regression")
```

Now we create the model fit object:

```{r}
#| label: fit-dbarts-bart-regression
bart_fit <- bart_spec |> fit(strength ~ ., data = reg_train)
bart_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-dbarts-bart-regression
predict(bart_fit, new_data = reg_test)
predict(bart_fit, type = "conf_int", new_data = reg_test)
predict(bart_fit, type = "pred_int", new_data = reg_test)
```

## Boosted Decision Trees (`boost_tree()`)

## `catboost` Engine

This engine requires the bonsai extension package, so let's load this first:

```{r}
#| label: load-catboost-boost-tree-regression-bonsai
#| output: false
library(bonsai)
```

We create a model specification via:

```{r}
#| label: spec-catboost-boost-tree-regression
boost_tree_spec <- boost_tree() |>
  # We need to set the mode since this engine works with multiple modes
  set_mode("regression") |>
  set_engine("catboost")
```

Now we create the model fit object:

```{r}
#| label: fit-catboost-boost-tree-regression
boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)
boost_tree_fit
```

The holdout data can be predicted:

```{r}
#| label: predict-catboost-boost-tree-regression
predict(boost_tree_fit, new_data = reg_test)
```

## `h2o` Engine

This engine requires the agua extension package, so let's load this first:

```{r}
#| label: load-h2o-boost-tree-regression-agua
#| output: false
library(agua)
```

We create a model specification via:

```{r}
#| label: spec-h2o-boost-tree-regression
boost_tree_spec <- boost_tree() |>
  # We need to set the mode since this engine works with multiple modes
  set_mode("regression") |>
  set_engine("h2o")
```

Now we create the model fit object:

```{r}
#| label: fit-h2o-boost-tree-regression
boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)
boost_tree_fit
```

The holdout data can be predicted:

```{r}
#| label: 
predict-h2o-boost-tree-regression +predict(boost_tree_fit, new_data = reg_test) +``` + +## `h2o_gbm` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-gbm-boost-tree-regression-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-gbm-boost-tree-regression +boost_tree_spec <- boost_tree() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("h2o_gbm") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-gbm-boost-tree-regression +boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-gbm-boost-tree-regression +predict(boost_tree_fit, new_data = reg_test) +``` + +## `lightgbm` Engine + +This engine requires the bonsai extension package, so let's load this first: + +```{r} +#| label: load-lightgbm-boost-tree-regression-bonsai +#| output: false +library(bonsai) +``` + +We create a model specification via: + +```{r} +#| label: spec-lightgbm-boost-tree-regression +boost_tree_spec <- boost_tree() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("lightgbm") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-lightgbm-boost-tree-regression +boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-lightgbm-boost-tree-regression +predict(boost_tree_fit, new_data = reg_test) +``` + +## `xgboost` Engine + +We create a model specification via: + +```{r} +#| label: spec-xgboost-boost-tree-regression +boost_tree_spec <- boost_tree() |> + # We need to set the mode since this engine works with multiple modes + # and xgboost is the default engine so there is no need to set that either. + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-xgboost-boost-tree-regression +boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-xgboost-boost-tree-regression +predict(boost_tree_fit, new_data = reg_test) +``` + +## Cubist Rules (`cubist_rules()`) + +## `Cubist` Engine + +This engine requires the rules extension package, so let's load this first: + +```{r} +#| label: load-Cubist-cubist-rules-regression-rules +#| output: false +library(rules) +``` + +We create a model specification via: + +```{r} +#| label: spec-Cubist-cubist-rules-regression +# This engine works with a single mode so no need to set that +# and Cubist is the default engine so there is no need to set that either. 
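# Cubist is a rule-based model: each rule is paired with a linear regression
# model that is used for predictions when the rule's conditions apply.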
+cubist_rules_spec <- cubist_rules() +``` + +Now we create the model fit object: + +```{r} +#| label: fit-Cubist-cubist-rules-regression +cubist_rules_fit <- cubist_rules_spec |> fit(strength ~ ., data = reg_train) +cubist_rules_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-Cubist-cubist-rules-regression +predict(cubist_rules_fit, new_data = reg_test) +``` + +## Decision Tree (`decision_tree()`) + +## `partykit` Engine + +This engine requires the bonsai extension package, so let's load this first: + +```{r} +#| label: load-partykit-decision-tree-regression-bonsai +#| output: false +library(bonsai) +``` + +We create a model specification via: + +```{r} +#| label: spec-partykit-decision-tree-regression +decision_tree_spec <- decision_tree() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("partykit") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-partykit-decision-tree-regression +decision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train) +decision_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-partykit-decision-tree-regression +predict(decision_tree_fit, new_data = reg_test) +``` + +## `rpart` Engine + +We create a model specification via: + +```{r} +#| label: spec-rpart-decision-tree-regression +decision_tree_spec <- decision_tree() |> + # We need to set the mode since this engine works with multiple modes + # and rpart is the default engine so there is no need to set that either. + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-rpart-decision-tree-regression +decision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train) +decision_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-rpart-decision-tree-regression +predict(decision_tree_fit, new_data = reg_test) +``` + +## Generalized Additive Models (`gen_additive_mod()`) + +## `mgcv` Engine + +We create a model specification via: + +```{r} +#| label: spec-mgcv-gen-additive-mod-regression +gen_additive_mod_spec <- gen_additive_mod() |> + # We need to set the mode since this engine works with multiple modes + # and mgcv is the default engine so there is no need to set that either. 
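  # Smooth terms are declared in the fit() formula below via mgcv's s();
  # mgcv estimates the amount of smoothing during fitting.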
+ set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-mgcv-gen-additive-mod-regression +gen_additive_mod_fit <- + gen_additive_mod_spec |> + fit(strength ~ s(age) + s(cement), data = reg_train) +gen_additive_mod_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-mgcv-gen-additive-mod-regression +predict(gen_additive_mod_fit, new_data = reg_test) +predict(gen_additive_mod_fit, type = "conf_int", new_data = reg_test) +``` + +## Linear Reg (`linear_reg()`) + +## `brulee` Engine + +We create a model specification via: + +```{r} +#| label: spec-brulee-linear-reg-regression +linear_reg_spec <- linear_reg() |> + # This engine works with a single mode so no need to set that + set_engine("brulee") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-brulee-linear-reg-regression +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-brulee-linear-reg-regression +predict(linear_reg_fit, new_data = reg_test) +``` + +## `gee` Engine + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-gee-linear-reg-regression-multilevelmod +#| output: false +library(multilevelmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-gee-linear-reg-regression +linear_reg_spec <- linear_reg() |> + # This engine works with a single mode so no need to set that + set_engine("gee") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-gee-linear-reg-regression +#| eval: false +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-gee-linear-reg-regression +#| eval: false +predict(linear_reg_fit, new_data = reg_test) +``` + +## `glm` Engine + +We create a model specification via: + +```{r} +#| label: spec-glm-linear-reg-regression +linear_reg_spec <- linear_reg() |> + # This engine works with a single mode so no need to set that + set_engine("glm") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-glm-linear-reg-regression +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-glm-linear-reg-regression +predict(linear_reg_fit, new_data = reg_test) +predict(linear_reg_fit, type = "conf_int", new_data = reg_test) +``` + +## `glmer` Engine + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-glmer-linear-reg-regression-multilevelmod +#| output: false +library(multilevelmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-glmer-linear-reg-regression +#| eval: false +linear_reg_spec <- linear_reg() |> + # This engine works with a single mode so no need to set that + set_engine("glmer") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-glmer-linear-reg-regression +#| eval: false +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-glmer-linear-reg-regression +predict(linear_reg_fit, new_data = reg_test) +``` + +## `glmnet` Engine + +We create a model specification via: + +```{r} +#| label: spec-glmnet-linear-reg-regression +linear_reg_spec <- linear_reg(penalty = 0.01) |> + # This engine works with a single mode so no 
need to set that + set_engine("glmnet") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-glmnet-linear-reg-regression +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-glmnet-linear-reg-regression +predict(linear_reg_fit, new_data = reg_test) +``` + +## `gls` Engine + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-gls-linear-reg-regression-multilevelmod +#| output: false +library(multilevelmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-gls-linear-reg-regression +linear_reg_spec <- linear_reg() |> + # This engine works with a single mode so no need to set that + set_engine("gls") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-gls-linear-reg-regression +#| eval: false +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-gls-linear-reg-regression +#| eval: false +predict(linear_reg_fit, new_data = reg_test) +``` + +## `h2o` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-linear-reg-regression-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-linear-reg-regression +linear_reg_spec <- linear_reg() |> + # This engine works with a single mode so no need to set that + set_engine("h2o") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-linear-reg-regression +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-linear-reg-regression +predict(linear_reg_fit, new_data = reg_test) +``` + +## `keras` Engine + +We create a model specification via: + +```{r} +#| label: spec-keras-linear-reg-regression +linear_reg_spec <- linear_reg() |> + # This engine works with a single mode so no need to set that + set_engine("keras") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-keras-linear-reg-regression +#| eval: false +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-keras-linear-reg-regression +#| eval: false +predict(linear_reg_fit, new_data = reg_test) +``` + +## `lm` Engine + +We create a model specification via: + +```{r} +#| label: spec-lm-linear-reg-regression +# This engine works with a single mode so no need to set that +# and lm is the default engine so there is no need to set that either. 
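# lm() fits ordinary least squares, so the confidence and prediction
# intervals shown below come from standard normal-theory results.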
+linear_reg_spec <- linear_reg() +``` + +Now we create the model fit object: + +```{r} +#| label: fit-lm-linear-reg-regression +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-lm-linear-reg-regression +predict(linear_reg_fit, new_data = reg_test) +predict(linear_reg_fit, type = "conf_int", new_data = reg_test) +predict(linear_reg_fit, type = "pred_int", new_data = reg_test) +``` + +## `lme` Engine + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-lme-linear-reg-regression-multilevelmod +#| output: false +library(multilevelmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-lme-linear-reg-regression +linear_reg_spec <- linear_reg() |> + # This engine works with a single mode so no need to set that + set_engine("lme") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-lme-linear-reg-regression +#| eval: false +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-lme-linear-reg-regression +#| eval: false +predict(linear_reg_fit, new_data = reg_test) +``` + +## `lmer` Engine + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-lmer-linear-reg-regression-multilevelmod +#| output: false +library(multilevelmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-lmer-linear-reg-regression +linear_reg_spec <- linear_reg() |> + # This engine works with a single mode so no need to set that + set_engine("lmer") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-lmer-linear-reg-regression +#| eval: false +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-lmer-linear-reg-regression +#| eval: false +predict(linear_reg_fit, new_data = reg_test) +``` + +## `stan` Engine + +We create a model specification via: + +```{r} +#| label: spec-stan-linear-reg-regression +linear_reg_spec <- linear_reg() |> + # This engine works with a single mode so no need to set that + set_engine("stan") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-stan-linear-reg-regression +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-stan-linear-reg-regression +predict(linear_reg_fit, new_data = reg_test) +predict(linear_reg_fit, type = "conf_int", new_data = reg_test) +predict(linear_reg_fit, type = "pred_int", new_data = reg_test) +``` + +## `stan_glmer` Engine + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-stan-glmer-linear-reg-regression-multilevelmod +#| output: false +library(multilevelmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-stan-glmer-linear-reg-regression +linear_reg_spec <- linear_reg() |> + # This engine works with a single mode so no need to set that + set_engine("stan_glmer") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-stan-glmer-linear-reg-regression +#| eval: false +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: 
predict-stan-glmer-linear-reg-regression +#| eval: false +predict(linear_reg_fit, new_data = reg_test) +predict(linear_reg_fit, type = "pred_int", new_data = reg_test) +``` + +## Multivariate Adaptive Regression Splines (`mars()`) + +## `earth` Engine + +We create a model specification via: + +```{r} +#| label: spec-earth-mars-regression +mars_spec <- mars() |> + # We need to set the mode since this engine works with multiple modes + # and earth is the default engine so there is no need to set that either. + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-earth-mars-regression +mars_fit <- mars_spec |> fit(strength ~ ., data = reg_train) +mars_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-earth-mars-regression +predict(mars_fit, new_data = reg_test) +``` + +## Neural Networks (`mlp()`) + +## `brulee` Engine + +We create a model specification via: + +```{r} +#| label: spec-brulee-mlp-regression +mlp_spec <- mlp() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("brulee") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-brulee-mlp-regression +mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train) +mlp_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-brulee-mlp-regression +predict(mlp_fit, new_data = reg_test) +``` + +## `brulee_two_layer` Engine + +We create a model specification via: + +```{r} +#| label: spec-brulee-two-layer-mlp-regression +mlp_spec <- mlp() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("brulee_two_layer") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-brulee-two-layer-mlp-regression +mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train) +mlp_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-brulee-two-layer-mlp-regression +predict(mlp_fit, new_data = reg_test) +``` + +## `h2o` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-mlp-regression-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-mlp-regression +mlp_spec <- mlp() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("h2o") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-mlp-regression +mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train) +mlp_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-mlp-regression +predict(mlp_fit, new_data = reg_test) +``` + +## `keras` Engine + +We create a model specification via: + +```{r} +#| label: spec-keras-mlp-regression +mlp_spec <- mlp() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("keras") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-keras-mlp-regression +#| eval: false +mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train) +mlp_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-keras-mlp-regression +#| eval: false +predict(mlp_fit, new_data = reg_test) +``` + +## `nnet` Engine + +We create a model specification via: + +```{r} +#| label: spec-nnet-mlp-regression +mlp_spec <- mlp() |> + # We need to set the mode since this engine works with multiple modes + # and nnet is the default engine so there 
is no need to set that either. + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-nnet-mlp-regression +mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train) +mlp_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-nnet-mlp-regression +predict(mlp_fit, new_data = reg_test) +``` + +## K-Nearest Neighbors (`nearest_neighbor()`) + +## `kknn` Engine + +We create a model specification via: + +```{r} +#| label: spec-kknn-nearest-neighbor-regression +nearest_neighbor_spec <- nearest_neighbor() |> + # We need to set the mode since this engine works with multiple modes + # and kknn is the default engine so there is no need to set that either. + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-kknn-nearest-neighbor-regression +nearest_neighbor_fit <- nearest_neighbor_spec |> fit(strength ~ ., data = reg_train) +nearest_neighbor_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-kknn-nearest-neighbor-regression +predict(nearest_neighbor_fit, new_data = reg_test) +``` + +## Null Model (`null_model()`) + +## `parsnip` Engine + +We create a model specification via: + +```{r} +#| label: spec-parsnip-null-model-regression +null_model_spec <- null_model() |> + # We need to set the mode since this engine works with multiple modes + # and parsnip is the default engine so there is no need to set that either. + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-parsnip-null-model-regression +null_model_fit <- null_model_spec |> fit(strength ~ ., data = reg_train) +null_model_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-parsnip-null-model-regression +predict(null_model_fit, new_data = reg_test) +``` + +## Partial Least Squares (`pls()`) + +## `mixOmics` Engine + +This engine requires the plsmod extension package, so let's load this first: + +```{r} +#| label: load-mixOmics-pls-regression-plsmod +#| output: false +library(plsmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-mixOmics-pls-regression +pls_spec <- pls() |> + # We need to set the mode since this engine works with multiple modes + # and mixOmics is the default engine so there is no need to set that either. 
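  # Partial least squares finds latent components that maximize the covariance
  # between the predictors and the outcome, then regresses on those components.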
+ set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-mixOmics-pls-regression +pls_fit <- pls_spec |> fit(strength ~ ., data = reg_train) +pls_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-mixOmics-pls-regression +predict(pls_fit, new_data = reg_test) +``` + +## Poisson Reg (`poisson_reg()`) + +## `gee` Engine + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-gee-poisson-reg-regression-multilevelmod +#| output: false +library(multilevelmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-gee-poisson-reg-regression +poisson_reg_spec <- poisson_reg() |> + # This engine works with a single mode so no need to set that + set_engine("gee") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-gee-poisson-reg-regression +#| eval: false +poisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train) +poisson_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-gee-poisson-reg-regression +#| eval: false +predict(poisson_reg_fit, new_data = reg_test) +``` + +## `glm` Engine + +This engine requires the poissonreg extension package, so let's load this first: + +```{r} +#| label: load-glm-poisson-reg-regression-poissonreg +#| output: false +library(poissonreg) +``` + +We create a model specification via: + +```{r} +#| label: spec-glm-poisson-reg-regression +# This engine works with a single mode so no need to set that +# and glm is the default engine so there is no need to set that either. +poisson_reg_spec <- poisson_reg() +``` + +Now we create the model fit object: + +```{r} +#| label: fit-glm-poisson-reg-regression +poisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train) +poisson_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-glm-poisson-reg-regression +predict(poisson_reg_fit, new_data = count_test) +``` + +## `glmer` Engine + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-glmer-poisson-reg-regression-multilevelmod +#| output: false +library(multilevelmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-glmer-poisson-reg-regression +poisson_reg_spec <- poisson_reg() |> + # This engine works with a single mode so no need to set that + set_engine("glmer") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-glmer-poisson-reg-regression +#| eval: false +poisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train) +poisson_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-glmer-poisson-reg-regression +#| eval: false +predict(poisson_reg_fit, new_data = reg_test) +``` + +## `glmnet` Engine + +This engine requires the poissonreg extension package, so let's load this first: + +```{r} +#| label: load-glmnet-poisson-reg-regression-poissonreg +#| output: false +library(poissonreg) +``` + +We create a model specification via: + +```{r} +#| label: spec-glmnet-poisson-reg-regression +poisson_reg_spec <- poisson_reg(penalty = 0.01) |> + # This engine works with a single mode so no need to set that + set_engine("glmnet") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-glmnet-poisson-reg-regression +poisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train) +poisson_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: 
predict-glmnet-poisson-reg-regression +predict(poisson_reg_fit, new_data = count_test) +``` + +## `h2o` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-poisson-reg-regression-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-poisson-reg-regression +poisson_reg_spec <- poisson_reg() |> + # This engine works with a single mode so no need to set that + set_engine("h2o") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-poisson-reg-regression +poisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train) +poisson_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-poisson-reg-regression +predict(poisson_reg_fit, new_data = count_test) +``` + +## `hurdle` Engine + +This engine requires the poissonreg extension package, so let's load this first: + +```{r} +#| label: load-hurdle-poisson-reg-regression-poissonreg +#| output: false +library(poissonreg) +``` + +We create a model specification via: + +```{r} +#| label: spec-hurdle-poisson-reg-regression +poisson_reg_spec <- poisson_reg() |> + # This engine works with a single mode so no need to set that + set_engine("hurdle") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-hurdle-poisson-reg-regression +poisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train) +poisson_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-hurdle-poisson-reg-regression +predict(poisson_reg_fit, new_data = count_test) +``` + +## `stan` Engine + +This engine requires the poissonreg extension package, so let's load this first: + +```{r} +#| label: load-stan-poisson-reg-regression-poissonreg +#| output: false +library(poissonreg) +``` + +We create a model specification via: + +```{r} +#| label: spec-stan-poisson-reg-regression +poisson_reg_spec <- poisson_reg() |> + # This engine works with a single mode so no need to set that + set_engine("stan") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-stan-poisson-reg-regression +#| eval: false +poisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train) +poisson_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-stan-poisson-reg-regression +#| eval: false +predict(poisson_reg_fit, new_data = reg_test) +predict(poisson_reg_fit, type = "conf_int", new_data = reg_test) +predict(poisson_reg_fit, type = "pred_int", new_data = reg_test) +``` + +## `stan_glmer` Engine + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-stan-glmer-poisson-reg-regression-multilevelmod +#| output: false +library(multilevelmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-stan-glmer-poisson-reg-regression +poisson_reg_spec <- poisson_reg() |> + # This engine works with a single mode so no need to set that + set_engine("stan_glmer") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-stan-glmer-poisson-reg-regression +#| eval: false +poisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train) +poisson_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-stan-glmer-poisson-reg-regression +#| eval: false +predict(poisson_reg_fit, new_data = reg_test) +predict(poisson_reg_fit, type = "pred_int", new_data = reg_test) +``` + +## `zeroinfl` Engine + +This engine requires the poissonreg 
extension package, so let's load this first: + +```{r} +#| label: load-zeroinfl-poisson-reg-regression-poissonreg +#| output: false +library(poissonreg) +``` + +We create a model specification via: + +```{r} +#| label: spec-zeroinfl-poisson-reg-regression +poisson_reg_spec <- poisson_reg() |> + # This engine works with a single mode so no need to set that + set_engine("zeroinfl") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-zeroinfl-poisson-reg-regression +poisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train) +poisson_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-zeroinfl-poisson-reg-regression +predict(poisson_reg_fit, new_data = count_test) +``` + +## Random Forests (`rand_forest()`) + +## `aorsf` Engine + +This engine requires the bonsai extension package, so let's load this first: + +```{r} +#| label: load-aorsf-rand-forest-regression-bonsai +#| output: false +library(bonsai) +``` + +We create a model specification via: + +```{r} +#| label: spec-aorsf-rand-forest-regression +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("aorsf") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-aorsf-rand-forest-regression +rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-aorsf-rand-forest-regression +predict(rand_forest_fit, new_data = reg_test) +``` + +## `grf` Engine + +We create a model specification via: + +```{r} +#| label: spec-grf-rand-forest-regression +#| eval: false +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("grf") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-grf-rand-forest-regression +#| eval: false +rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-grf-rand-forest-regression +#| eval: false +predict(rand_forest_fit, new_data = reg_test) +predict(rand_forest_fit, type = "conf_int", new_data = reg_test) +``` + +## `h2o` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-rand-forest-regression-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-rand-forest-regression +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("h2o") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-rand-forest-regression +rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-rand-forest-regression +predict(rand_forest_fit, new_data = reg_test) +``` + +## `partykit` Engine + +This engine requires the bonsai extension package, so let's load this first: + +```{r} +#| label: load-partykit-rand-forest-regression-bonsai +#| output: false +library(bonsai) +``` + +We create a model specification via: + +```{r} +#| label: spec-partykit-rand-forest-regression +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + 
set_engine("partykit") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-partykit-rand-forest-regression +rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) + +# Too long to print +# rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-partykit-rand-forest-regression +predict(rand_forest_fit, new_data = reg_test) +``` + +## `randomForest` Engine + +We create a model specification via: + +```{r} +#| label: spec-randomForest-rand-forest-regression +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("randomForest") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-randomForest-rand-forest-regression +rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-randomForest-rand-forest-regression +predict(rand_forest_fit, new_data = reg_test) +``` + +## `ranger` Engine + +We create a model specification via: + +```{r} +#| label: spec-ranger-rand-forest-regression +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + # and ranger is the default engine so there is no need to set that either. + set_engine("ranger", keep.inbag = TRUE) |> + # However, we'll set the engine and use the keep.inbag=TRUE option so that we + # can produce interval predictions. This is not generally required. + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-ranger-rand-forest-regression +rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-ranger-rand-forest-regression +predict(rand_forest_fit, new_data = reg_test) +predict(rand_forest_fit, type = "conf_int", new_data = reg_test) +``` + +## Rule Fit (`rule_fit()`) + +## `h2o` Engine + +This engine requires the agua extension package, so let's load this first: + +```{r} +#| label: load-h2o-rule-fit-regression-agua +#| output: false +library(agua) +``` + +We create a model specification via: + +```{r} +#| label: spec-h2o-rule-fit-regression +rule_fit_spec <- rule_fit() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("h2o") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-h2o-rule-fit-regression +rule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train) +rule_fit_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-h2o-rule-fit-regression +predict(rule_fit_fit, new_data = reg_test) +``` + +## `xrf` Engine + +This engine requires the rules extension package, so let's load this first: + +```{r} +#| label: load-xrf-rule-fit-regression-rules +#| output: false +library(rules) +``` + +We create a model specification via: + +```{r} +#| label: spec-xrf-rule-fit-regression +rule_fit_spec <- rule_fit() |> + # We need to set the mode since this engine works with multiple modes + # and xrf is the default engine so there is no need to set that either. 
+ set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-xrf-rule-fit-regression +rule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train) +rule_fit_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-xrf-rule-fit-regression +predict(rule_fit_fit, new_data = reg_test) +``` + +## Support Vector Machine (Linear Kernel) (`svm_linear()`) + +## `kernlab` Engine + +We create a model specification via: + +```{r} +#| label: spec-kernlab-svm-linear-regression +svm_linear_spec <- svm_linear() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("kernlab") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-kernlab-svm-linear-regression +svm_linear_fit <- svm_linear_spec |> fit(strength ~ ., data = reg_train) +svm_linear_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-kernlab-svm-linear-regression +predict(svm_linear_fit, new_data = reg_test) +``` + +## `LiblineaR` Engine + +We create a model specification via: + +```{r} +#| label: spec-LiblineaR-svm-linear-regression +svm_linear_spec <- svm_linear() |> + # We need to set the mode since this engine works with multiple modes + # and LiblineaR is the default engine so there is no need to set that either. + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-LiblineaR-svm-linear-regression +svm_linear_fit <- svm_linear_spec |> fit(strength ~ ., data = reg_train) +svm_linear_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-LiblineaR-svm-linear-regression +predict(svm_linear_fit, new_data = reg_test) +``` + +## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) + +## `kernlab` Engine + +We create a model specification via: + +```{r} +#| label: spec-kernlab-svm-poly-regression +svm_poly_spec <- svm_poly() |> + # We need to set the mode since this engine works with multiple modes + # and kernlab is the default engine so there is no need to set that either. + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-kernlab-svm-poly-regression +svm_poly_fit <- svm_poly_spec |> fit(strength ~ ., data = reg_train) +svm_poly_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-kernlab-svm-poly-regression +predict(svm_poly_fit, new_data = reg_test) +``` + +## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) + +## `kernlab` Engine + +We create a model specification via: + +```{r} +#| label: spec-kernlab-svm-rbf-regression +svm_rbf_spec <- svm_rbf() |> + # We need to set the mode since this engine works with multiple modes + # and kernlab is the default engine so there is no need to set that either. 
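  # With a numeric outcome, kernlab fits epsilon-insensitive support vector
  # regression (its "eps-svr" type).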
+ set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-kernlab-svm-rbf-regression +svm_rbf_fit <- svm_rbf_spec |> fit(strength ~ ., data = reg_train) +svm_rbf_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-kernlab-svm-rbf-regression +predict(svm_rbf_fit, new_data = reg_test) +``` + +## `liquidSVM` Engine + +We create a model specification via: + +```{r} +#| label: spec-liquidSVM-svm-rbf-regression +#| eval: false +svm_rbf_spec <- svm_rbf() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("liquidSVM") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-liquidSVM-svm-rbf-regression +#| eval: false +svm_rbf_fit <- svm_rbf_spec |> fit(strength ~ ., data = reg_train) +svm_rbf_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-liquidSVM-svm-rbf-regression +predict(svm_rbf_fit, new_data = reg_test) +``` + +# Censored Regression Models + +Let's simulate a data set using the prodlim and survival packages: + +```{r} +#| label: cns-data +library(survival) +library(prodlim) + +set.seed(1000) +cns_data <- + SimSurv(250) |> + mutate(event_time = Surv(time, event)) |> + select(event_time, X1, X2) + +cns_split <- initial_split(cns_data, prop = 0.98) +cns_split +cns_train <- training(cns_split) +cns_test <- testing(cns_split) +``` + +For some types of predictions, we need the _evaluation time(s)_ for the predictions. We'll use these three times to demonstrate: + +```{r} +#| label: eval-times +eval_times <- c(1, 3, 5) +``` + +## Bagged Decision Trees (`bag_tree()`) + +## `rpart` Engine + +This engine requires the censored extension package, so let's load this first: + +```{r} +#| label: load-rpart-bag-tree-censored-regression-censored +#| output: false +library(censored) +``` + +We create a model specification via: + +```{r} +#| label: spec-rpart-bag-tree-censored-regression +bag_tree_spec <- bag_tree() |> + # We need to set the mode since this engine works with multiple modes + # and rpart is the default engine so there is no need to set that either. 
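  # Each ensemble member is an rpart survival tree fit to a bootstrap sample
  # of the training data.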
+ set_mode("censored regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-rpart-bag-tree-censored-regression +bag_tree_fit <- bag_tree_spec |> fit(event_time ~ ., data = cns_train) +bag_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-rpart-bag-tree-censored-regression +predict(bag_tree_fit, type = "time", new_data = cns_test) +predict(bag_tree_fit, type = "survival", new_data = cns_test, eval_time = eval_times) +``` + +## Boosted Decision Trees (`boost_tree()`) + +## `mboost` Engine + +This engine requires the censored extension package, so let's load this first: + +```{r} +#| label: load-mboost-boost-tree-censored-regression-censored +#| output: false +library(censored) +``` + +We create a model specification via: + +```{r} +#| label: spec-mboost-boost-tree-censored-regression +boost_tree_spec <- boost_tree() |> + set_mode("censored regression") |> + set_engine("mboost") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-mboost-boost-tree-censored-regression +boost_tree_fit <- boost_tree_spec |> fit(event_time ~ ., data = cns_train) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-mboost-boost-tree-censored-regression +predict(boost_tree_fit, type = "time", new_data = cns_test) +predict(boost_tree_fit, type = "survival", new_data = cns_test, eval_time = eval_times) +predict(boost_tree_fit, type = "linear_pred", new_data = cns_test) +``` + +## Decision Tree (`decision_tree()`) + +## `partykit` Engine + +This engine requires the censored extension package, so let's load this first: + +```{r} +#| label: load-partykit-decision-tree-censored-regression-censored +#| output: false +library(censored) +``` + +We create a model specification via: + +```{r} +#| label: spec-partykit-decision-tree-censored-regression +decision_tree_spec <- decision_tree() |> + # We need to set the mode since this engine works with multiple modes + set_mode("censored regression") |> + set_engine("partykit") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-partykit-decision-tree-censored-regression +decision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train) +decision_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-partykit-decision-tree-censored-regression +predict(decision_tree_fit, type = "time", new_data = cns_test) +predict(decision_tree_fit, type = "survival", new_data = cns_test, eval_time = eval_times) +``` + +## `rpart` Engine + +This engine requires the censored extension package, so let's load this first: + +```{r} +#| label: load-rpart-decision-tree-censored-regression-censored +#| output: false +library(censored) +``` + +We create a model specification via: + +```{r} +#| label: spec-rpart-decision-tree-censored-regression +decision_tree_spec <- decision_tree() |> + # We need to set the mode since this engine works with multiple modes + # and rpart is the default engine so there is no need to set that either. 
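  # Survival probabilities are estimated from the training observations that
  # fall into each terminal node of the tree.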
+ set_mode("censored regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-rpart-decision-tree-censored-regression +decision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train) +decision_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-rpart-decision-tree-censored-regression +predict(decision_tree_fit, type = "time", new_data = cns_test) +predict(decision_tree_fit, type = "survival", new_data = cns_test, eval_time = eval_times) +``` + +## Proportional Hazards (`proportional_hazards()`) + +## `glmnet` Engine + +This engine requires the censored extension package, so let's load this first: + +```{r} +#| label: load-glmnet-proportional-hazards-censored-regression-censored +#| output: false +library(censored) +``` + +We create a model specification via: + +```{r} +#| label: spec-glmnet-proportional-hazards-censored-regression +proportional_hazards_spec <- proportional_hazards(penalty = 0.01) |> + # This engine works with a single mode so no need to set that + set_engine("glmnet") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-glmnet-proportional-hazards-censored-regression +proportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train) +proportional_hazards_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-glmnet-proportional-hazards-censored-regression +predict(proportional_hazards_fit, type = "time", new_data = cns_test) +predict(proportional_hazards_fit, type = "survival", new_data = cns_test, eval_time = eval_times) +predict(proportional_hazards_fit, type = "linear_pred", new_data = cns_test) +``` + +## `survival` Engine + +This engine requires the censored extension package, so let's load this first: + +```{r} +#| label: load-survival-proportional-hazards-censored-regression-censored +#| output: false +library(censored) +``` + +We create a model specification via: + +```{r} +#| label: spec-survival-proportional-hazards-censored-regression +# This engine works with a single mode so no need to set that +# and survival is the default engine so there is no need to set that either. 
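# The survival engine uses survival::coxph(), a semi-parametric model that
# leaves the baseline hazard unspecified.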
+proportional_hazards_spec <- proportional_hazards() +``` + +Now we create the model fit object: + +```{r} +#| label: fit-survival-proportional-hazards-censored-regression +proportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train) +proportional_hazards_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-survival-proportional-hazards-censored-regression +predict(proportional_hazards_fit, type = "time", new_data = cns_test) +predict(proportional_hazards_fit, type = "survival", new_data = cns_test, eval_time = eval_times) +predict(proportional_hazards_fit, type = "linear_pred", new_data = cns_test) +``` + +## Random Forests (`rand_forest()`) + +## `aorsf` Engine + +This engine requires the censored extension package, so let's load this first: + +```{r} +#| label: load-aorsf-rand-forest-censored-regression-censored +#| output: false +library(censored) +``` + +We create a model specification via: + +```{r} +#| label: spec-aorsf-rand-forest-censored-regression +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + set_mode("censored regression") |> + set_engine("aorsf") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-aorsf-rand-forest-censored-regression +rand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-aorsf-rand-forest-censored-regression +predict(rand_forest_fit, type = "time", new_data = cns_test) +predict(rand_forest_fit, type = "survival", new_data = cns_test, eval_time = eval_times) +``` + +## `partykit` Engine + +This engine requires the censored extension package, so let's load this first: + +```{r} +#| label: load-partykit-rand-forest-censored-regression-censored +#| output: false +library(censored) +``` + +We create a model specification via: + +```{r} +#| label: spec-partykit-rand-forest-censored-regression +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + set_mode("censored regression") |> + set_engine("partykit") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-partykit-rand-forest-censored-regression +rand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train) + +# Too long to print +# rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-partykit-rand-forest-censored-regression +predict(rand_forest_fit, type = "time", new_data = cns_test) +predict(rand_forest_fit, type = "survival", new_data = cns_test, eval_time = eval_times) +``` + +## ParaMetric Survival Models (`survival_reg()`) + +## `flexsurv` Engine + +This engine requires the censored extension package, so let's load this first: + +```{r} +#| label: load-flexsurv-survival-reg-censored-regression-censored +#| output: false +library(censored) +``` + +We create a model specification via: + +```{r} +#| label: spec-flexsurv-survival-reg-censored-regression +survival_reg_spec <- survival_reg() |> + # This engine works with a single mode so no need to set that + set_engine("flexsurv") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-flexsurv-survival-reg-censored-regression +survival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train) +survival_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-flexsurv-survival-reg-censored-regression +predict(survival_reg_fit, type = "time", 
new_data = cns_test) +predict(survival_reg_fit, type = "survival", new_data = cns_test, eval_time = eval_times) +predict(survival_reg_fit, type = "hazard", new_data = cns_test, eval_time = eval_times) +predict(survival_reg_fit, type = "linear_pred", new_data = cns_test) +predict(survival_reg_fit, type = "quantile", new_data = cns_test) +``` + +## `flexsurvspline` Engine + +This engine requires the censored extension package, so let's load this first: + +```{r} +#| label: load-flexsurvspline-survival-reg-censored-regression-censored +#| output: false +library(censored) +``` + +We create a model specification via: + +```{r} +#| label: spec-flexsurvspline-survival-reg-censored-regression +survival_reg_spec <- survival_reg() |> + # This engine works with a single mode so no need to set that + set_engine("flexsurvspline") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-flexsurvspline-survival-reg-censored-regression +survival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train) +survival_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-flexsurvspline-survival-reg-censored-regression +predict(survival_reg_fit, type = "time", new_data = cns_test) +predict(survival_reg_fit, type = "survival", new_data = cns_test, eval_time = eval_times) +predict(survival_reg_fit, type = "hazard", new_data = cns_test, eval_time = eval_times) +predict(survival_reg_fit, type = "linear_pred", new_data = cns_test) +predict(survival_reg_fit, type = "quantile", new_data = cns_test) +``` + +## `survival` Engine + +This engine requires the censored extension package, so let's load this first: + +```{r} +#| label: load-survival-survival-reg-censored-regression-censored +#| output: false +library(censored) +``` + +We create a model specification via: + +```{r} +#| label: spec-survival-survival-reg-censored-regression +# This engine works with a single mode so no need to set that +# and survival is the default engine so there is no need to set that either. 
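# The survival engine uses survival::survreg(), a parametric accelerated
# failure time model (Weibull by default).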
+survival_reg_spec <- survival_reg() +``` + +Now we create the model fit object: + +```{r} +#| label: fit-survival-survival-reg-censored-regression +survival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train) +survival_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-survival-survival-reg-censored-regression +predict(survival_reg_fit, type = "time", new_data = cns_test) +predict(survival_reg_fit, type = "survival", new_data = cns_test, eval_time = eval_times) +predict(survival_reg_fit, type = "hazard", new_data = cns_test, eval_time = eval_times) +predict(survival_reg_fit, type = "linear_pred", new_data = cns_test) +predict(survival_reg_fit, type = "quantile", new_data = cns_test) +``` + +# Quantile Regression Models + +To demonstrate quantile regression, let's make a larger version of our regression data: + +```{r} +#| label: qnt-data +set.seed(938) +qnt_split <- + modeldata::concrete |> + slice_sample(n = 100) |> + select(strength = compressive_strength, cement, age) |> + initial_split(prop = 0.95, strata = strength) +qnt_split + +qnt_rec <- + recipe(strength ~ ., data = training(qnt_split)) |> + step_normalize(all_numeric_predictors()) |> + prep() + +qnt_train <- bake(qnt_rec, new_data = NULL) +qnt_test <- bake(qnt_rec, new_data = testing(qnt_split)) +``` + +We'll also predict these quantile levels: + +```{r} +#| label: qnt-lvls +qnt_lvls <- (1:3) / 4 +``` + + +## Linear Regression (`linear_reg()`) + +## `quantreg` Engine + +We create a model specification via: + +```{r} +#| label: spec-quantreg-linear-reg-quantile-regression +linear_reg_spec <- linear_reg() |> + set_engine("quantreg") |> + set_mode("quantile regression", quantile_levels = qnt_lvls) +``` + +Now we create the model fit object: + +```{r} +#| label: fit-quantreg-linear-reg-quantile-regression +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = qnt_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-quantreg-linear-reg-quantile-regression +predict(linear_reg_fit, type = "quantile", new_data = qnt_test) +``` + +## Random Forests (`rand_forest()`) + +## `grf` Engine + +We create a model specification via: + +```{r} +#| label: spec-grf-rand-forest-quantile-regression +#| eval: false +rand_forest_spec <- rand_forest() |> + set_mode("quantile regression", quantile_levels = qnt_lvls) |> + set_engine("grf") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-grf-rand-forest-quantile-regression +#| eval: false +rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = qnt_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-grf-rand-forest-quantile-regression +#| eval: false +predict(rand_forest_fit, type = "quantile", new_data = qnt_test) +``` + From 7223e112af0271d59dba42a766f32578e77f07ae Mon Sep 17 00:00:00 2001 From: Max Kuhn Date: Tue, 18 Nov 2025 08:50:58 -0500 Subject: [PATCH 02/23] Apply suggestions from code review Co-authored-by: Emil Hvitfeldt Co-authored-by: Hannah Frick --- learn/models/parsnip-predictions/index.qmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 2511285c..59eddd22 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -10,7 +10,7 @@ weight: 1 description: | Examples that show how to fit and predict with different combinations of model, mode, and engine. 
toc: true
-toc-depth: 2
+toc-depth: 3
 include-after-body: ../../../resources.html
 ---
 
@@ -88,7 +88,7 @@ bin_train <- bake(bin_rec, new_data = NULL)
 bin_test <- bake(bin_rec, new_data = testing(bin_split))
 ```
 
-For data sets that _only_ work for three or more classes, we'll simulate:
+For models that _only_ work for three or more classes, we'll simulate:
 
 ```{r}
 #| label: mtl-data
@@ -2276,7 +2276,7 @@ predict(svm_rbf_fit, type = "prob", new_data = bin_test)
 
 # Regression Models
 
-To demonstrate regression, we'll subset some data. make a training/test split, and stanrdize the predictors:
+To demonstrate regression, we'll subset some data, make a training/test split, and standardize the predictors:
 
 ```{r}
 #| label: reg-data
@@ -4549,7 +4549,7 @@ predict(rand_forest_fit, type = "time", new_data = cns_test)
 predict(rand_forest_fit, type = "survival", new_data = cns_test, eval_time = eval_times)
 ```
 
-## ParaMetric Survival Models (`survival_reg()`)
+## Parametric Survival Models (`survival_reg()`)
 
 ## `flexsurv` Engine
 

From b3e0d295f16e6fc5b00b62334f3ed6fd38d471d1 Mon Sep 17 00:00:00 2001
From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com>
Date: Wed, 19 Nov 2025 11:43:17 -0600
Subject: [PATCH 03/23] Adds additional needed packages

---
 installs.R | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/installs.R b/installs.R
index 08e4ecb0..9ac6f12d 100644
--- a/installs.R
+++ b/installs.R
@@ -76,7 +76,20 @@ packages <- c(
   "tune",
   "vip",
   "zoo",
-  "DT"
+  "DT",
+  "mars",
+  "earth",
+  "dbarts",
+  "catboost/catboost/catboost/R-package",
+  "sda",
+  "sparsediscrim",
+  "LiblineaR",
+  "naivebayes",
+  "xrf",
+  "pscl",
+  "coin",
+  "pec",
+  "flexsurv"
 )
 
 pak::pak(packages)

From 773384407a39cbeec7dd388bdf0ceb5de94b7f2e Mon Sep 17 00:00:00 2001
From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com>
Date: Wed, 19 Nov 2025 15:34:57 -0600
Subject: [PATCH 04/23] Adds Spark intro section, bin data setup section,
 random forest engine. It disables most of the code from running for now to
 make it easy to render test

---
 learn/models/parsnip-predictions/index.qmd | 72 ++++++++++++++++++++--
 1 file changed, 68 insertions(+), 4 deletions(-)

diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd
index 59eddd22..6a459c5f 100644
--- a/learn/models/parsnip-predictions/index.qmd
+++ b/learn/models/parsnip-predictions/index.qmd
@@ -12,6 +12,8 @@ description: |
 toc: true
 toc-depth: 3
 include-after-body: ../../../resources.html
+execute: 
+  eval: false
 ---
 
 ```{r}
@@ -19,21 +21,21 @@ include-after-body: ../../../resources.html
 #| include: false
 #| message: false
 #| warning: false
+#| eval: true
 source(here::here("common.R"))
 ```
 
 ```{r}
 #| label: "load"
 #| include: false
+#| eval: true
 library(tidymodels)
-
+library(sparklyr)
 # Add everything here?
 
 #' skip format
 pkgs <- c("tidymodels", "agua", "baguette", "bonsai", "censored", "discrim",
-          "multilevelmod", "plsmod", "poissonreg", "rules")
-
-
+          "multilevelmod", "plsmod", "poissonreg", "rules", "sparklyr")
 ```
 
 
@@ -66,6 +68,20 @@ library(tidymodels)
 theme_set(theme_bw() + theme(legend.position = "top"))
 ```
 
+### Apache Spark
+
+To use [Apache Spark](https://spark.apache.org/) as an engine, we will first 
+need a connection to a cluster. 
For this article, we will setup and use a +single-node Spark cluster running on a laptop: + +```{r} +#| label: spark-connect +#| eval: true +library(sparklyr) +sc <- spark_connect("local", version = "4.0.1") +``` + + # Classification Models To demonstrate classification, let's make a small training and test sets for a binary outcome. We'll center and scale the data since some models require the same units. @@ -108,6 +124,18 @@ mtl_train <- training(mtl_split) mtl_test <- testing(mtl_split) ``` +If using the **Apache Spark** engine, wil will need to identify the data source +and use that to create the splits. For this article, we will copy the +`two_class_dat` dataset into the Spark session. + + +```{r} +#| label: spark-bin-data +#| eval: true +tbl_two_class <- copy_to(sc, modeldata::two_class_dat) + +tbl_bin <- sdf_random_split(tbl_two_class, training = 0.994, test = 1-0.994, seed = 100) +``` ## Auto Ml (`auto_ml()`) @@ -2044,6 +2072,36 @@ predict(rand_forest_fit, type = "prob", new_data = bin_test) predict(rand_forest_fit, type = "conf_int", new_data = bin_test) ``` +## `spark` Engine + +We create a model specification via: + +```{r} +#| label: spec-spark-rand-forest-classification +#| eval: true +rand_forest_spec <- rand_forest() |> + set_mode("classification") |> + set_engine("spark") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-spark-rand-forest-classification +#| eval: true +rand_forest_fit <- rand_forest_spec |> fit(Class ~ ., data = tbl_bin$training) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| eval: true +#| label: predict-spark-rand-forest-classification +predict(rand_forest_fit, type = "class", new_data = tbl_bin$test) +predict(rand_forest_fit, type = "prob", new_data = tbl_bin$test) +``` + ## Rule Fit (`rule_fit()`) ## `h2o` Engine @@ -4755,3 +4813,9 @@ The holdout data can be predicted: predict(rand_forest_fit, type = "quantile", new_data = qnt_test) ``` +```{r} +#| label: spark-disconnect +#| include: false +spark_disconnect(sc) +``` + From 09285eab3e5586d33a44cbf5d0b6c1013324a221 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 19 Nov 2025 15:43:14 -0600 Subject: [PATCH 05/23] 'spark' engine for `boost_tree()` --- learn/models/parsnip-predictions/index.qmd | 31 ++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 6a459c5f..9384e87f 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -564,6 +564,37 @@ predict(boost_tree_fit, type = "class", new_data = bin_test) predict(boost_tree_fit, type = "prob", new_data = bin_test) ``` +## `spark` Engine + +We create a model specification via: + +```{r} +#| label: spec-spark-boost-tree-classification +#| eval: true +boost_tree_spec <- boost_tree() |> + set_mode("classification") |> + set_engine("spark") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-spark-boost-tree-classification +#| eval: true +boost_tree_fit <- boost_tree_spec |> fit(Class ~ ., data = tbl_bin$training) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-spark-boost-tree-classification +#| eval: true +predict(boost_tree_fit, type = "class", new_data = tbl_bin$test) +predict(boost_tree_fit, type = "prob", new_data = tbl_bin$test) +``` + + ## C5 Rules (`C5_rules()`) ## `C5.0` Engine From 5af8f7325386a9905e6a366f44ebad95583bc5e7 Mon Sep 17 00:00:00 
2001
From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com>
Date: Wed, 19 Nov 2025 15:53:11 -0600
Subject: [PATCH 06/23] 'spark' engine for `decision_tree()`

---
 learn/models/parsnip-predictions/index.qmd | 30 +++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd
index 9384e87f..b29f6564 100644
--- a/learn/models/parsnip-predictions/index.qmd
+++ b/learn/models/parsnip-predictions/index.qmd
@@ -725,6 +725,36 @@ predict(decision_tree_fit, type = "class", new_data = bin_test)
 predict(decision_tree_fit, type = "prob", new_data = bin_test)
 ```
 
+## `spark` Engine
+
+We create a model specification via:
+
+```{r}
+#| label: spec-spark-decision-tree-classification
+#| eval: true
+decision_tree_spec <- decision_tree() |>
+  set_mode("classification") |>
+  set_engine("spark")
+```
+
+Now we create the model fit object:
+
+```{r}
+#| label: fit-spark-decision-tree-classification
+#| eval: true
+decision_tree_fit <- decision_tree_spec |> fit(Class ~ ., data = tbl_bin$training)
+decision_tree_fit
+```
+
+The holdout data can be predicted:
+
+```{r}
+#| label: predict-spark-decision-tree-classification
+#| eval: true
+predict(decision_tree_fit, type = "class", new_data = tbl_bin$test)
+predict(decision_tree_fit, type = "prob", new_data = tbl_bin$test)
+```
+
 ## Flexible Discriminant Analysis (`discrim_flexible()`)
 
 ## `earth` Engine

From cc419de9d135d9756ff46b3b6ac318fce66b17e2 Mon Sep 17 00:00:00 2001
From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com>
Date: Wed, 19 Nov 2025 16:10:03 -0600
Subject: [PATCH 07/23] Adds Spark section to Regression Models, and 'spark'
 engine for `boost_tree()`

---
 learn/models/parsnip-predictions/index.qmd | 48 ++++++++++++++++++++--
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd
index b29f6564..57be121a 100644
--- a/learn/models/parsnip-predictions/index.qmd
+++ b/learn/models/parsnip-predictions/index.qmd
@@ -124,9 +124,9 @@ mtl_train <- training(mtl_split)
 mtl_test <- testing(mtl_split)
 ```
 
-If using the **Apache Spark** engine, wil will need to identify the data source
-and use that to create the splits. For this article, we will copy the
-`two_class_dat` dataset into the Spark session.
+If using the **Apache Spark** engine, we will need to identify the data source,
+and then use it to create the splits. For this article, we will copy the
+`two_class_dat` data set into the Spark session.
 
 
 ```{r}
@@ -2435,6 +2435,19 @@ count_train <- bake(count_rec, new_data = NULL)
 count_test <- bake(count_rec, new_data = testing(count_split))
 ```
 
+If using the **Apache Spark** engine, we will need to identify the data source,
+and then use it to create the splits. For this article, we will copy the
+`concrete` data set into the Spark session. 
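+Because this section reuses the connection opened at the start of the
+article, it can be worth confirming that `sc` is still alive before copying
+more data into it (a minimal guard, assuming the `sc` object created above):
+
+```{r}
+#| label: spark-check-connection
+#| eval: false
+# Stop early if the local Spark cluster is no longer running
+stopifnot(sparklyr::connection_is_open(sc))
+```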
+ +```{r} +#| label: spark-reg-data +#| eval: true + +tbl_concrete <- copy_to(sc, modeldata::concrete) + +tbl_reg <- sdf_random_split(tbl_concrete, training = 0.95, test = 0.05, seed = 100) +``` + ## Auto Ml (`auto_ml()`) @@ -2788,6 +2801,35 @@ The holdout data can be predicted: predict(boost_tree_fit, new_data = reg_test) ``` +## `spark` Engine + +We create a model specification via: + +```{r} +#| label: spec-spark-boost-tree-regression +#| eval: true +boost_tree_spec <- boost_tree() |> + set_mode("regression") |> + set_engine("spark") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-spark-boost-tree-regression +#| eval: true +boost_tree_fit <- boost_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-spark-boost-tree-regression +#| eval: true +predict(boost_tree_fit, new_data = tbl_reg$test) +``` + ## Cubist Rules (`cubist_rules()`) ## `Cubist` Engine From 1de5046ed2c62a9dfe065efecc471b569a806ddf Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 19 Nov 2025 16:42:14 -0600 Subject: [PATCH 08/23] Adds 'spark' engine to three other specs --- learn/models/parsnip-predictions/index.qmd | 89 ++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 57be121a..8511ce24 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -1392,6 +1392,36 @@ predict(logistic_reg_fit, type = "conf_int", new_data = bin_test) predict(logistic_reg_fit, type = "pred_int", new_data = bin_test) ``` +## `spark` Engine + +We create a model specification via: + +```{r} +#| label: spec-spark-logistic-reg-classification +#| eval: true +logistic_reg_spec <- logistic_reg() |> + set_engine("spark") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-spark-logistic-reg-classification +#| eval: true +logistic_reg_fit <- logistic_reg_spec |> fit(Class ~ ., data = tbl_bin$training) +logistic_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-spark-logistic-reg-classification +#| eval: false +predict(logistic_reg_fit, type = "class", new_data = tbl_bin$test) +predict(logistic_reg_fit, type = "prob", new_data = tbl_bin$test) +``` + + ## Multivariate Adaptive Regression Splines (`mars()`) ## `earth` Engine @@ -2930,6 +2960,37 @@ The holdout data can be predicted: predict(decision_tree_fit, new_data = reg_test) ``` +## `spark` Engine + +We create a model specification via: + +```{r} +#| label: spec-spark-decision-tree-regression +#| eval: true +decision_tree_spec <- decision_tree() |> + set_mode("regression") |> + set_engine("spark") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-spark-decision-tree-regression +#| eval: true +decision_tree_fit <- decision_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training) +decision_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-spark-decision-tree-regression +#| eval: false +predict(decision_tree_fit, new_data = tbl_reg$test) +``` + + + ## Generalized Additive Models (`gen_additive_mod()`) ## `mgcv` Engine @@ -3378,6 +3439,34 @@ predict(linear_reg_fit, new_data = reg_test) predict(linear_reg_fit, type = "pred_int", new_data = reg_test) ``` +## `spark` Engine + +We create a model specification via: + +```{r} +#| label: spec-spark-linear-reg-regression +#| 
eval: true +linear_reg_spec <- linear_reg() |> + set_engine("spark") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-spark-linear-reg-regression +#| eval: true +linear_reg_fit <- linear_reg_spec |> fit(compressive_strength ~ ., data = tbl_reg$training) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-spark-linear-reg-regression +#| eval: true +predict(linear_reg_fit, new_data = tbl_reg$test) +``` + ## Multivariate Adaptive Regression Splines (`mars()`) ## `earth` Engine From 9a5fff6561a470c2c23f39038a6d20beb4f8bf2f Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 19 Nov 2025 17:15:09 -0600 Subject: [PATCH 09/23] Adds multinomial data copy step and `multinom_reg()` engine --- learn/models/parsnip-predictions/index.qmd | 36 +++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 8511ce24..91ec31cb 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -126,7 +126,7 @@ mtl_test <- testing(mtl_split) If using the **Apache Spark** engine, we will need to identify the data source, and then use it to create the splits. For this article, we will copy the -`two_class_dat` data set into the Spark session. +`two_class_dat` and the `mtl_data` data sets into the Spark session. ```{r} @@ -135,8 +135,14 @@ and then use it to create the splits. For this article, we will copy the tbl_two_class <- copy_to(sc, modeldata::two_class_dat) tbl_bin <- sdf_random_split(tbl_two_class, training = 0.994, test = 1-0.994, seed = 100) + +tbl_sim_mtl <- copy_to(sc, mtl_data) + +tbl_mtl <- sdf_random_split(tbl_sim_mtl, training = 0.967, test = 1-0.967, seed = 100) ``` + + ## Auto Ml (`auto_ml()`) ## `h2o` Engine @@ -1751,6 +1757,34 @@ predict(multinom_reg_fit, type = "class", new_data = mtl_test) predict(multinom_reg_fit, type = "prob", new_data = mtl_test) ``` +## `spark` Engine + +We create a model specification via: + +```{r} +#| label: spec-spark-multinom-reg-classification +#| eval: true +multinom_reg_spec <- multinom_reg() |> + set_engine("spark") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-brulee-multinom-reg-classification +multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train) +multinom_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-brulee-multinom-reg-classification +predict(multinom_reg_fit, type = "class", new_data = mtl_test) +predict(multinom_reg_fit, type = "prob", new_data = mtl_test) +``` + + ## Naive Bayes (`naive_Bayes()`) ## `h2o` Engine From 4d8f97299094e3d1ede6ed60a65707bc5ae641af Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 19 Nov 2025 17:23:12 -0600 Subject: [PATCH 10/23] Completes engines --- learn/models/parsnip-predictions/index.qmd | 41 +++++++++++++++++++--- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 91ec31cb..229f6e3d 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -108,6 +108,7 @@ For models that _only_ work for three or more classes, we'll simulate: ```{r} #| label: mtl-data +#| eval: true set.seed(1752) mtl_data <- sim_multinomial( @@ -1771,17 +1772,18 @@ multinom_reg_spec <- multinom_reg() |> Now we create 
the model fit object: ```{r} -#| label: fit-brulee-multinom-reg-classification -multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train) +#| label: fit-spark-multinom-reg-classification +#| eval: true +multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = tbl_mtl$training) multinom_reg_fit ``` The holdout data can be predicted: ```{r} -#| label: predict-brulee-multinom-reg-classification -predict(multinom_reg_fit, type = "class", new_data = mtl_test) -predict(multinom_reg_fit, type = "prob", new_data = mtl_test) +#| label: predict-spark-multinom-reg-classification +predict(multinom_reg_fit, type = "class", new_data = tbl_mtl$test) +predict(multinom_reg_fit, type = "prob", new_data = tbl_mtl$test) ``` @@ -4289,6 +4291,35 @@ predict(rand_forest_fit, new_data = reg_test) predict(rand_forest_fit, type = "conf_int", new_data = reg_test) ``` +## `spark` Engine + +We create a model specification via: + +```{r} +#| label: spec-spark-rand-forest-regression +#| eval: true +rand_forest_spec <- rand_forest() |> + set_engine("spark") |> + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-spark-rand-forest-regression +#| eval: true +rand_forest_fit <- rand_forest_spec |> fit(compressive_strength ~ ., data = tbl_reg$training) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-spark-rand-forest-regression +#| eval: true +predict(rand_forest_fit, new_data = tbl_reg$test) +``` + ## Rule Fit (`rule_fit()`) ## `h2o` Engine From 0fa418de1c66a231e7432daccbf713873de4a8ff Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Wed, 19 Nov 2025 17:43:28 -0600 Subject: [PATCH 11/23] Adds run_spark flag, and includes frozen results to make it easier to preview --- .../index/execute-results/html.json | 15 +++++ learn/models/parsnip-predictions/index.qmd | 61 ++++++++++--------- 2 files changed, 48 insertions(+), 28 deletions(-) create mode 100644 _freeze/learn/models/parsnip-predictions/index/execute-results/html.json diff --git a/_freeze/learn/models/parsnip-predictions/index/execute-results/html.json b/_freeze/learn/models/parsnip-predictions/index/execute-results/html.json new file mode 100644 index 00000000..1cc55f8c --- /dev/null +++ b/_freeze/learn/models/parsnip-predictions/index/execute-results/html.json @@ -0,0 +1,15 @@ +{ + "hash": "3ab2d8887c852419f30d5a7b3fc00f6a", + "result": { + "engine": "knitr", + "markdown": "---\ntitle: \"Fitting and predicting with parsnip\"\ncategories:\n - model fitting\n - parsnip\n - regression\n - classification\ntype: learn-subsection\nweight: 1\ndescription: | \n Examples that show how to fit and predict with different combinations of model, mode, and engine.\ntoc: true\ntoc-depth: 3\ninclude-after-body: ../../../resources.html\nexecute: \n eval: true\n---\n\n\n\n\n\n\n## Introduction\n\nTo use code in this article, you will need to install the following packages: agua, baguette, bonsai, censored, discrim, multilevelmod, plsmod, poissonreg, rules, sparklyr, and tidymodels.\n\nThese examples show how to *fit* and *predict* with different combinations of model, mode, and engine. 
As a reminder, in parsnip, \n\n- the **model type** differentiates basic modeling approaches, such as random forests, logistic regression, linear support vector machines, etc.,\n\n- the **mode** denotes in what kind of modeling context it will be used (most commonly, classification or regression), and\n\n- the computational **engine** indicates how the model is fit, such as with a specific R package implementation or even methods outside of R like Keras or Stan.\n\nThe following examples use consistent data sets throughout. \n\ntodo \n\n- multielvel examples \n- get automl working\n- expand survival prediction tibbles\n- keras3 updates\n- use `
` for long model prints\n- avoid subsection titles capitalizing the engine name (e.g., \"CATBOOST\") and text within backticks\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(tidymodels)\ntheme_set(theme_bw() + theme(legend.position = \"top\"))\n```\n:::\n\n\n### Apache Spark\n\nTo use [Apache Spark](https://spark.apache.org/) as an engine, we will first \nneed a connection to a cluster. For this article, we will setup and use a \nsingle-node Spark cluster running on a laptop:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nsc <- spark_connect(\"local\", version = \"4.0.1\")\n```\n:::\n\n\n\n# Classification Models\n\nTo demonstrate classification, let's make a small training and test sets for a binary outcome. We'll center and scale the data since some models require the same units.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(207)\nbin_split <- \n\tmodeldata::two_class_dat |> \n\trename(class = Class) |> \n\tinitial_split(prop = 0.994, strata = class)\nbin_split\n#> \n#> <785/6/791>\n\nbin_rec <- \n recipe(class ~ ., data = training(bin_split)) |> \n step_normalize(all_numeric_predictors()) |> \n prep()\n\nbin_train <- bake(bin_rec, new_data = NULL)\nbin_test <- bake(bin_rec, new_data = testing(bin_split))\n```\n:::\n\n\nFor models that _only_ work for three or more classes, we'll simulate:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(1752)\nmtl_data <-\n sim_multinomial(\n 200,\n ~ -0.5 + 0.6 * abs(A),\n ~ ifelse(A > 0 & B > 0, 1.0 + 0.2 * A / B, - 2),\n ~ A + B - A * B)\n\nmtl_split <- initial_split(mtl_data, prop = 0.967, strata = class)\nmtl_split\n#> \n#> <192/8/200>\n\n# Predictors are in the same units\nmtl_train <- training(mtl_split)\nmtl_test <- testing(mtl_split)\n```\n:::\n\n\nIf using the **Apache Spark** engine, we will need to identify the data source, \nand then use it to create the splits. 
For this article, we will copy the \n`two_class_dat` and the `mtl_data` data sets into the Spark session.\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ntbl_two_class <- copy_to(sc, modeldata::two_class_dat)\n\ntbl_bin <- sdf_random_split(tbl_two_class, training = 0.994, test = 1-0.994, seed = 100)\n\ntbl_sim_mtl <- copy_to(sc, mtl_data)\n\ntbl_mtl <- sdf_random_split(tbl_sim_mtl, training = 0.967, test = 1-0.967, seed = 100)\n```\n:::\n\n\n\n\n## Auto Ml (`auto_ml()`) \n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n\n# and initialize a server\nh20_server <- agua::h2o_start()\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nauto_ml_spec <- auto_ml() |>\n # We dont need to set the engine (since there is only one) but we'll set\n # a time limit\n set_engine(\"h2o\", max_runtime_secs = 60 * 3) |> \n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nauto_ml_fit <- auto_ml_spec |> fit(class ~ ., data = bin_train)\nauto_ml_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(auto_ml_fit, type = \"class\", new_data = bin_test)\npredict(auto_ml_fit, type = \"prob\", new_data = bin_test)\n```\n:::\n\n\n## Bagged MARS (`bag_mars()`) \n\n## `earth` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_spec <- bag_mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_fit <- bag_mars_spec |> fit(class ~ ., data = bin_train)\n#> \n#> Attaching package: 'plotrix'\n#> The following object is masked from 'package:scales':\n#> \n#> rescale\n#> Warning: There was 1 warning in `dplyr::mutate()`.\n#> ℹ In argument: `model = iter(...)`.\n#> Caused by warning:\n#> ! 
glm.fit: fitted probabilities numerically 0 or 1 occurred\n#> Registered S3 method overwritten by 'butcher':\n#> method from \n#> as.character.dev_topic generics\nbag_mars_fit\n#> parsnip model object\n#> \n#> Bagged MARS (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 100 0 11\n#> 2 A 40.8 1.22 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mars_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_mars_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.444 0.556 \n#> 2 0.860 0.140 \n#> 3 0.458 0.542 \n#> 4 0.950 0.0497\n#> 5 0.941 0.0593\n#> 6 0.868 0.132\n```\n:::\n\n\n## Bagged Neural Networks (`bag_mlp()`) \n\n## `nnet` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_spec <- bag_mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_fit <- bag_mlp_spec |> fit(class ~ ., data = bin_train)\nbag_mlp_fit\n#> parsnip model object\n#> \n#> Bagged nnet (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 55.1 1.98 11\n#> 2 A 44.9 1.98 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.421 0.579\n#> 2 0.655 0.345\n#> 3 0.429 0.571\n#> 4 0.727 0.273\n#> 5 0.716 0.284\n#> 6 0.700 0.300\n```\n:::\n\n\n## Bagged Decision Trees (`bag_tree()`) \n\n## `C5.0` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged C5.0 (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 100 0 11\n#> 2 A 58.9 6.71 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 
\n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.450 0.550 \n#> 2 0.825 0.175 \n#> 3 0.322 0.678 \n#> 4 0.911 0.0892\n#> 5 0.911 0.0892\n#> 6 0.710 0.290\n```\n:::\n\n\n## `rpart` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged CART (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 275. 3.21 11\n#> 2 A 239. 4.04 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.0909 0.909\n#> 2 1 0 \n#> 3 0 1 \n#> 4 1 0 \n#> 5 0.727 0.273\n#> 6 1 0\n```\n:::\n\n\n## Bayesian Additive Regression Trees (`bart()`) \n\n## `dbarts` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_spec <- bart() |>\n # We need to set the mode since this engine works with multiple modes\n # and dbarts is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_fit <- bart_spec |> fit(class ~ ., data = bin_train)\nbart_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> `NULL`()\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bart_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bart_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.427 0.573\n#> 2 0.744 0.256\n#> 3 0.375 0.625\n#> 4 0.951 0.049\n#> 5 0.922 0.078\n#> 6 0.786 0.214\npredict(bart_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_lower_Class2 .pred_upper_Class1 .pred_upper_Class2\n#> \n#> 1 0.812 0.00247 0.998 0.188\n#> 2 0.785 0.0248 0.975 0.215\n#> 3 0.605 0.0713 0.929 0.395\n#> 4 0.561 0.102 0.898 0.439\n#> 5 0.251 0.340 0.660 0.749\n#> 6 0.200 0.416 0.584 0.800\npredict(bart_fit, type = \"pred_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_lower_Class2 .pred_upper_Class1 .pred_upper_Class2\n#> \n#> 1 0 0 1 1\n#> 2 0 0 1 1\n#> 3 0 0 1 1\n#> 4 0 0 1 1\n#> 5 0 0 1 1\n#> 6 0 0 1 1\n```\n:::\n\n\n## Boosted Decision Trees (`boost_tree()`) 
\n\n## `C5.0` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = 15, control = C50::C5.0Control(minCases\n#> = 2, sample = 0))\n#> \n#> Classification Tree\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Number of boosting iterations: 15 requested; 7 used due to early stopping\n#> Average tree size: 3.1 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.307 0.693\n#> 2 0.756 0.244\n#> 3 0.281 0.719\n#> 4 1 0 \n#> 5 1 0 \n#> 6 0.626 0.374\n```\n:::\n\n\n## `catboost` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"catboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> CatBoost model (1000 trees)\n#> Loss function: Logloss\n#> Fit to 2 feature(s)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.252 0.748 \n#> 2 0.839 0.161 \n#> 3 0.348 0.652 \n#> 4 0.997 0.00279\n#> 5 0.807 0.193 \n#> 6 0.884 0.116\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o_gbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: gbm\n#> Model ID: GBM_model_R_1763571327438_3826 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees 
model_size_in_bytes min_depth\n#> 1 50 50 25380 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 21 55 35.70000\n#> \n#> \n#> H2OBinomialMetrics: gbm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.007948832\n#> RMSE: 0.08915622\n#> LogLoss: 0.05942305\n#> Mean Per-Class Error: 0\n#> AUC: 1\n#> AUCPR: 1\n#> Gini: 1\n#> R^2: 0.9678452\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 434 0 0.000000 =0/434\n#> Class2 0 351 0.000000 =0/351\n#> Totals 434 351 0.000000 =0/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.598690 1.000000 200\n#> 2 max f2 0.598690 1.000000 200\n#> 3 max f0point5 0.598690 1.000000 200\n#> 4 max accuracy 0.598690 1.000000 200\n#> 5 max precision 0.998192 1.000000 0\n#> 6 max recall 0.598690 1.000000 200\n#> 7 max specificity 0.998192 1.000000 0\n#> 8 max absolute_mcc 0.598690 1.000000 200\n#> 9 max min_per_class_accuracy 0.598690 1.000000 200\n#> 10 max mean_per_class_accuracy 0.598690 1.000000 200\n#> 11 max tns 0.998192 434.000000 0\n#> 12 max fns 0.998192 349.000000 0\n#> 13 max fps 0.000831 434.000000 399\n#> 14 max tps 0.598690 351.000000 200\n#> 15 max tnr 0.998192 1.000000 0\n#> 16 max fnr 0.998192 0.994302 0\n#> 17 max fpr 0.000831 1.000000 399\n#> 18 max tpr 0.598690 1.000000 200\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.0496 0.950 \n#> 2 0.905 0.0953 \n#> 3 0.0738 0.926 \n#> 4 0.997 0.00273\n#> 5 0.979 0.0206 \n#> 6 0.878 0.122\n```\n:::\n\n\n## `h2o_gbm` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o_gbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: gbm\n#> Model ID: GBM_model_R_1763571327438_3878 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 25378 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 21 55 35.70000\n#> \n#> \n#> H2OBinomialMetrics: gbm\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.007948832\n#> RMSE: 0.08915622\n#> LogLoss: 0.05942305\n#> Mean Per-Class Error: 0\n#> AUC: 1\n#> AUCPR: 1\n#> Gini: 1\n#> R^2: 0.9678452\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 434 0 0.000000 =0/434\n#> Class2 0 351 0.000000 =0/351\n#> Totals 434 351 0.000000 =0/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.598690 1.000000 200\n#> 2 max f2 0.598690 1.000000 200\n#> 3 max f0point5 0.598690 1.000000 200\n#> 4 max accuracy 0.598690 1.000000 200\n#> 5 max precision 0.998192 1.000000 0\n#> 6 max recall 0.598690 1.000000 200\n#> 7 max specificity 0.998192 1.000000 0\n#> 8 max absolute_mcc 0.598690 1.000000 200\n#> 9 max min_per_class_accuracy 0.598690 1.000000 200\n#> 10 max mean_per_class_accuracy 0.598690 1.000000 200\n#> 11 max tns 0.998192 434.000000 0\n#> 12 max fns 0.998192 349.000000 0\n#> 13 max fps 0.000831 434.000000 399\n#> 14 max tps 0.598690 351.000000 200\n#> 15 max tnr 0.998192 1.000000 0\n#> 16 max fnr 0.998192 0.994302 0\n#> 17 max fpr 0.000831 1.000000 399\n#> 18 max tpr 0.598690 1.000000 200\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.0496 0.950 \n#> 2 0.905 0.0953 \n#> 3 0.0738 0.926 \n#> 4 0.997 0.00273\n#> 5 0.979 0.0206 \n#> 6 0.878 0.122\n```\n:::\n\n\n## `lightgbm` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"lightgbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> LightGBM Model (100 trees)\n#> Objective: binary\n#> Fitted to dataset with 2 columns\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.147 0.853 \n#> 2 0.930 0.0699\n#> 3 0.237 0.763 \n#> 4 0.990 0.0101\n#> 5 0.929 0.0714\n#> 6 0.956 0.0445\n```\n:::\n\n\n## `xgboost` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and xgboost is the default engine so there is no need to set that either.\n 
set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> ##### xgb.Booster\n#> raw: 40.4 Kb \n#> call:\n#> xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, \n#> colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, \n#> subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist, \n#> verbose = 0, nthread = 1, objective = \"binary:logistic\")\n#> params (as set within xgb.train):\n#> eta = \"0.3\", max_depth = \"6\", gamma = \"0\", colsample_bytree = \"1\", colsample_bynode = \"1\", min_child_weight = \"1\", subsample = \"1\", nthread = \"1\", objective = \"binary:logistic\", validate_parameters = \"TRUE\"\n#> xgb.attributes:\n#> niter\n#> callbacks:\n#> cb.evaluation.log()\n#> # of features: 2 \n#> niter: 15\n#> nfeatures : 2 \n#> evaluation_log:\n#> iter training_logloss\n#> \n#> 1 0.5546750\n#> 2 0.4719804\n#> --- ---\n#> 14 0.2587640\n#> 15 0.2528938\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.244 0.756 \n#> 2 0.770 0.230 \n#> 3 0.307 0.693 \n#> 4 0.944 0.0565\n#> 5 0.821 0.179 \n#> 6 0.938 0.0621\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(Class ~ ., data = tbl_bin$training)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> GBTClassificationModel: uid = gradient_boosted_trees__c61f3c19_30b0_416f_af47_e371c1aea2db, numTrees=20, numClasses=2, numFeatures=2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(boost_tree_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? 
x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.307 0.693 \n#> 2 0.292 0.708 \n#> 3 0.856 0.144 \n#> 4 0.192 0.808 \n#> 5 0.332 0.668 \n#> 6 0.952 0.0476\n#> 7 0.0865 0.914\n```\n:::\n\n\n\n## C5 Rules (`C5_rules()`) \n\n## `C5.0` Engine \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and C5.0 is the default engine so there is no need to set that either.\nC5_rules_spec <- C5_rules()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nC5_rules_fit <- C5_rules_spec |> fit(class ~ ., data = bin_train)\nC5_rules_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = trials, rules = TRUE, control\n#> = C50::C5.0Control(minCases = minCases, seed = sample.int(10^5,\n#> 1), earlyStopping = FALSE))\n#> \n#> Rule-Based Model\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Number of Rules: 4 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(C5_rules_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(C5_rules_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 1 0\n#> 2 1 0\n#> 3 0 1\n#> 4 1 0\n#> 5 1 0\n#> 6 1 0\n```\n:::\n\n\n## Decision Tree (`decision_tree()`) \n\n## `C5.0` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = 1, control = C50::C5.0Control(minCases =\n#> 2, sample = 0))\n#> \n#> Classification Tree\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Tree size: 4 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.732 0.268\n#> 2 0.846 0.154\n#> 3 0.236 0.764\n#> 4 0.846 0.154\n#> 5 0.846 0.154\n#> 6 0.846 0.154\n```\n:::\n\n\n## `partykit` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n 
set_mode(\"classification\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> class ~ A + B\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] B <= -0.06906\n#> | | [3] B <= -0.50486: Class1 (n = 291, err = 8.2%)\n#> | | [4] B > -0.50486\n#> | | | [5] A <= -0.07243: Class1 (n = 77, err = 45.5%)\n#> | | | [6] A > -0.07243: Class1 (n = 31, err = 6.5%)\n#> | [7] B > -0.06906\n#> | | [8] B <= 0.72938\n#> | | | [9] A <= 0.60196: Class2 (n = 145, err = 24.8%)\n#> | | | [10] A > 0.60196\n#> | | | | [11] B <= 0.44701: Class1 (n = 23, err = 4.3%)\n#> | | | | [12] B > 0.44701: Class1 (n = 26, err = 46.2%)\n#> | | [13] B > 0.72938: Class2 (n = 192, err = 12.5%)\n#> \n#> Number of inner nodes: 6\n#> Number of terminal nodes: 7\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.538 0.462 \n#> 2 0.935 0.0645\n#> 3 0.248 0.752 \n#> 4 0.918 0.0825\n#> 5 0.918 0.0825\n#> 6 0.935 0.0645\n```\n:::\n\n\n## `rpart` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> n= 785 \n#> \n#> node), split, n, loss, yval, (yprob)\n#> * denotes terminal node\n#> \n#> 1) root 785 351 Class1 (0.5528662 0.4471338) \n#> 2) B< -0.06526451 399 61 Class1 (0.8471178 0.1528822) *\n#> 3) B>=-0.06526451 386 96 Class2 (0.2487047 0.7512953) \n#> 6) B< 0.7339337 194 72 Class2 (0.3711340 0.6288660) \n#> 12) A>=0.6073948 49 13 Class1 (0.7346939 0.2653061) *\n#> 13) A< 0.6073948 145 36 Class2 (0.2482759 0.7517241) *\n#> 7) B>=0.7339337 192 24 Class2 (0.1250000 0.8750000) *\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.735 0.265\n#> 2 0.847 0.153\n#> 3 0.248 0.752\n#> 4 0.847 0.153\n#> 5 0.847 0.153\n#> 6 0.847 0.153\n```\n:::\n\n\n## `sparklyr` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(Class ~ ., data = 
tbl_bin$training)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> n= 784 \n#> \n#> node), split, n, loss, yval, (yprob)\n#> * denotes terminal node\n#> \n#> 1) root 784 350 Class1 (0.5535714 0.4464286) \n#> 2) B< 1.495535 401 62 Class1 (0.8453865 0.1546135) *\n#> 3) B>=1.495535 383 95 Class2 (0.2480418 0.7519582) \n#> 6) B< 2.079458 192 71 Class2 (0.3697917 0.6302083) \n#> 12) A>=2.572663 50 14 Class1 (0.7200000 0.2800000) *\n#> 13) A< 2.572663 142 35 Class2 (0.2464789 0.7535211) *\n#> 7) B>=2.079458 191 24 Class2 (0.1256545 0.8743455) *\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = tbl_bin$test)\n#> # A tibble: 7 × 1\n#> .pred_class\n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n#> 6 \n#> 7 \npredict(decision_tree_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # A tibble: 7 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.246 0.754\n#> 2 0.246 0.754\n#> 3 0.845 0.155\n#> 4 0.246 0.754\n#> 5 0.246 0.754\n#> 6 0.845 0.155\n#> 7 0.126 0.874\n```\n:::\n\n\n## Flexible Discriminant Analysis (`discrim_flexible()`) \n\n## `earth` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and earth is the default engine so there is no need to set that either.\ndiscrim_flexible_spec <- discrim_flexible()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_flexible_fit <- discrim_flexible_spec |> fit(class ~ ., data = bin_train)\ndiscrim_flexible_fit\n#> parsnip model object\n#> \n#> Call:\n#> mda::fda(formula = class ~ ., data = data, method = earth::earth)\n#> \n#> Dimension: 1 \n#> \n#> Percent Between-Group Variance Explained:\n#> v1 \n#> 100 \n#> \n#> Training Misclassification Error: 0.1707 ( N = 785 )\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_flexible_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_flexible_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.339 0.661 \n#> 2 0.848 0.152 \n#> 3 0.342 0.658 \n#> 4 0.964 0.0360\n#> 5 0.964 0.0360\n#> 6 0.875 0.125\n```\n:::\n\n\n## Linear Discriminant Analysis (`discrim_linear()`) \n\n## `MASS` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and MASS is the default engine so there is no need to set that either.\ndiscrim_linear_spec <- discrim_linear()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Call:\n#> lda(class ~ ., data = data)\n#> \n#> Prior probabilities of groups:\n#> Class1 Class2 
\n#> 0.5528662 0.4471338 \n#> \n#> Group means:\n#> A B\n#> Class1 -0.2982900 -0.5573140\n#> Class2 0.3688258 0.6891006\n#> \n#> Coefficients of linear discriminants:\n#> LD1\n#> A -0.6068479\n#> B 1.7079953\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.369 0.631 \n#> 2 0.868 0.132 \n#> 3 0.541 0.459 \n#> 4 0.984 0.0158\n#> 5 0.928 0.0718\n#> 6 0.854 0.146\n```\n:::\n\n\n## `mda` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_spec <- discrim_linear() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"mda\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Call:\n#> mda::fda(formula = class ~ ., data = data, method = mda::gen.ridge, \n#> keep.fitted = FALSE)\n#> \n#> Dimension: 1 \n#> \n#> Percent Between-Group Variance Explained:\n#> v1 \n#> 100 \n#> \n#> Degrees of Freedom (per dimension): 1.99423 \n#> \n#> Training Misclassification Error: 0.17707 ( N = 785 )\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.368 0.632 \n#> 2 0.867 0.133 \n#> 3 0.542 0.458 \n#> 4 0.984 0.0158\n#> 5 0.928 0.0718\n#> 6 0.853 0.147\n```\n:::\n\n\n## `sda` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_spec <- discrim_linear() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"sda\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> $regularization\n#> lambda lambda.var lambda.freqs \n#> 0.003136201 0.067551534 0.112819609 \n#> \n#> $freqs\n#> Class1 Class2 \n#> 0.5469019 0.4530981 \n#> \n#> $alpha\n#> Class1 Class2 \n#> -0.8934125 -1.2349286 \n#> \n#> $beta\n#> A B\n#> Class1 0.4565325 -1.298858\n#> Class2 -0.5510473 1.567757\n#> attr(,\"class\")\n#> [1] \"shrinkage\"\n#> \n#> attr(,\"class\")\n#> [1] \"sda\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # 
A tibble: 6 × 1\n#> .pred_class\n#> <fct> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> <dbl> <dbl>\n#> 1 0.366 0.634 \n#> 2 0.860 0.140 \n#> 3 0.536 0.464 \n#> 4 0.982 0.0176\n#> 5 0.923 0.0768\n#> 6 0.845 0.155\n```\n:::\n\n\n## `sparsediscrim` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_spec <- discrim_linear() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"sparsediscrim\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Diagonal LDA\n#> \n#> Sample Size: 785 \n#> Number of Features: 2 \n#> \n#> Classes and Prior Probabilities:\n#> Class1 (55.29%), Class2 (44.71%)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> <fct> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> <dbl> <dbl>\n#> 1 0.182 0.818 \n#> 2 0.755 0.245 \n#> 3 0.552 0.448 \n#> 4 0.996 0.00372\n#> 5 0.973 0.0274 \n#> 6 0.629 0.371\n```\n:::\n\n\n## Quadratic Discriminant Analysis (`discrim_quad()`) \n\n## `MASS` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_spec <- discrim_quad()\n # This engine works with a single mode so no need to set that\n # and MASS is the default engine so there is no need to set that either.\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_fit <- discrim_quad_spec |> fit(class ~ ., data = bin_train)\ndiscrim_quad_fit\n#> parsnip model object\n#> \n#> Call:\n#> qda(class ~ ., data = data)\n#> \n#> Prior probabilities of groups:\n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> Group means:\n#> A B\n#> Class1 -0.2982900 -0.5573140\n#> Class2 0.3688258 0.6891006\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_quad_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> <fct> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_quad_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> <dbl> <dbl>\n#> 1 0.340 0.660 \n#> 2 0.884 0.116 \n#> 3 0.500 0.500 \n#> 4 0.965 0.0349\n#> 5 0.895 0.105 \n#> 6 0.895 0.105\n```\n:::\n\n\n
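When both the class and probability predictions are wanted alongside the original columns, parsnip's `augment()` method is a compact alternative to two `predict()` calls; a minimal sketch:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# returns bin_test with .pred_class and the .pred_* probability columns appended\naugment(discrim_quad_fit, new_data = bin_test)\n```\n:::\n\n\n## `sparsediscrim` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification 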
via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_spec <- discrim_quad() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"sparsediscrim\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_fit <- discrim_quad_spec |> fit(class ~ ., data = bin_train)\ndiscrim_quad_fit\n#> parsnip model object\n#> \n#> Diagonal QDA\n#> \n#> Sample Size: 785 \n#> Number of Features: 2 \n#> \n#> Classes and Prior Probabilities:\n#> Class1 (55.29%), Class2 (44.71%)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_quad_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_quad_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.180 0.820 \n#> 2 0.750 0.250 \n#> 3 0.556 0.444 \n#> 4 0.994 0.00634\n#> 5 0.967 0.0328 \n#> 6 0.630 0.370\n```\n:::\n\n\n## Regularized Discriminant Analysis (`discrim_regularized()`) \n\n## `klaR` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and klaR is the default engine so there is no need to set that either.\ndiscrim_regularized_spec <- discrim_regularized()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_regularized_fit <- discrim_regularized_spec |> fit(class ~ ., data = bin_train)\ndiscrim_regularized_fit\n#> parsnip model object\n#> \n#> Call: \n#> rda(formula = class ~ ., data = data)\n#> \n#> Regularization parameters: \n#> gamma lambda \n#> 5.344614e-15 1.032850e-02 \n#> \n#> Prior probabilities of groups: \n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> Misclassification rate: \n#> apparent: 17.707 %\n#> cross-validated: 17.844 %\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_regularized_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_regularized_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.340 0.660 \n#> 2 0.884 0.116 \n#> 3 0.501 0.499 \n#> 4 0.965 0.0346\n#> 5 0.895 0.105 \n#> 6 0.895 0.105\n```\n:::\n\n\n## Generalized Additive Models (`gen_additive_mod()`) \n\n## `mgcv` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_spec <- gen_additive_mod() |>\n # We need to set the mode since this engine works with multiple modes\n # and mgcv is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_fit <- \n gen_additive_mod_spec |> \n fit(class ~ s(A) + s(B), data = bin_train)\ngen_additive_mod_fit\n#> parsnip model object\n#> \n#> \n#> Family: binomial \n#> Link function: 
logit \n#> \n#> Formula:\n#> class ~ s(A) + s(B)\n#> \n#> Estimated degrees of freedom:\n#> 2.76 4.22 total = 7.98 \n#> \n#> UBRE score: -0.153537\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(gen_additive_mod_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(gen_additive_mod_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.400 0.600 \n#> 2 0.826 0.174 \n#> 3 0.454 0.546 \n#> 4 0.975 0.0250\n#> 5 0.929 0.0711\n#> 6 0.829 0.171\npredict(gen_additive_mod_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0.304 0.504 0.496 0.696\n#> 2 0.739 0.889 0.111 0.261\n#> 3 0.364 0.546 0.454 0.636\n#> 4 0.846 0.996 0.00358 0.154\n#> 5 0.881 0.958 0.0416 0.119\n#> 6 0.735 0.894 0.106 0.265\n```\n:::\n\n\n## Logistic Regression (`logistic_reg()`) \n\n## `brulee` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Logistic regression\n#> \n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> batch size: 707 \n#> validation loss after 2 epochs: 0.375\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.409 0.591 \n#> 2 0.865 0.135 \n#> 3 0.544 0.456 \n#> 4 0.976 0.0239\n#> 5 0.909 0.0914\n#> 6 0.857 0.143\n```\n:::\n\n\n## `gee` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n```\n:::\n\n\n## `glm` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg()\n # This engine works with a single mode so no need to set that\n # and glm is the default engine so there is no need to set that 
either.\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = class ~ ., family = stats::binomial, data = data)\n#> \n#> Coefficients:\n#> (Intercept) A B \n#> -0.3563 -1.1250 2.8154 \n#> \n#> Degrees of Freedom: 784 Total (i.e. Null); 782 Residual\n#> Null Deviance:\t 1079 \n#> Residual Deviance: 666.9 \tAIC: 672.9\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.400 0.600 \n#> 2 0.862 0.138 \n#> 3 0.541 0.459 \n#> 4 0.977 0.0234\n#> 5 0.909 0.0905\n#> 6 0.853 0.147\npredict(logistic_reg_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0.339 0.465 0.535 0.661 \n#> 2 0.816 0.897 0.103 0.184 \n#> 3 0.493 0.588 0.412 0.507 \n#> 4 0.960 0.986 0.0137 0.0395\n#> 5 0.875 0.935 0.0647 0.125 \n#> 6 0.800 0.894 0.106 0.200\n```\n:::\n\n\n## `glmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.400 0.600 \n#> 2 0.862 0.138 \n#> 3 0.541 0.459 \n#> 4 0.977 0.0234\n#> 5 0.909 0.0905\n#> 6 0.853 0.147\n```\n:::\n\n\n## `glmnet` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"binomial\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.308300\n#> 2 1 4.75 0.280900\n#> 3 1 8.73 0.256000\n#> 4 1 12.10 0.233200\n#> 5 1 14.99 0.212500\n#> 6 1 17.46 0.193600\n#> 7 1 19.60 0.176400\n#> 8 1 21.45 0.160800\n#> 9 1 23.05 0.146500\n#> 10 1 24.44 0.133500\n#> 11 1 25.65 0.121600\n#> 12 1 26.70 0.110800\n#> 13 1 27.61 
0.101000\n#> 14 1 28.40 0.091990\n#> 15 1 29.08 0.083820\n#> 16 1 29.68 0.076370\n#> 17 1 30.19 0.069590\n#> 18 1 30.63 0.063410\n#> 19 1 31.00 0.057770\n#> 20 1 31.33 0.052640\n#> 21 1 31.61 0.047960\n#> 22 1 31.85 0.043700\n#> 23 1 32.05 0.039820\n#> 24 2 32.62 0.036280\n#> 25 2 33.41 0.033060\n#> 26 2 34.10 0.030120\n#> 27 2 34.68 0.027450\n#> 28 2 35.19 0.025010\n#> 29 2 35.63 0.022790\n#> 30 2 36.01 0.020760\n#> 31 2 36.33 0.018920\n#> 32 2 36.62 0.017240\n#> 33 2 36.86 0.015710\n#> 34 2 37.06 0.014310\n#> 35 2 37.24 0.013040\n#> 36 2 37.39 0.011880\n#> 37 2 37.52 0.010830\n#> 38 2 37.63 0.009864\n#> 39 2 37.72 0.008988\n#> 40 2 37.80 0.008189\n#> 41 2 37.86 0.007462\n#> 42 2 37.92 0.006799\n#> 43 2 37.97 0.006195\n#> 44 2 38.01 0.005644\n#> 45 2 38.04 0.005143\n#> 46 2 38.07 0.004686\n#> 47 2 38.10 0.004270\n#> 48 2 38.12 0.003891\n#> 49 2 38.13 0.003545\n#> 50 2 38.15 0.003230\n#> 51 2 38.16 0.002943\n#> 52 2 38.17 0.002682\n#> 53 2 38.18 0.002443\n#> 54 2 38.18 0.002226\n#> 55 2 38.19 0.002029\n#> 56 2 38.19 0.001848\n#> 57 2 38.20 0.001684\n#> 58 2 38.20 0.001534\n#> 59 2 38.20 0.001398\n#> 60 2 38.21 0.001274\n#> 61 2 38.21 0.001161\n#> 62 2 38.21 0.001058\n#> 63 2 38.21 0.000964\n#> 64 2 38.21 0.000878\n#> 65 2 38.21 0.000800\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.383 0.617 \n#> 2 0.816 0.184 \n#> 3 0.537 0.463 \n#> 4 0.969 0.0313\n#> 5 0.894 0.106 \n#> 6 0.797 0.203\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: glm\n#> Model ID: GLM_model_R_1763571327438_3930 \n#> GLM Model: summary\n#> family link regularization\n#> 1 binomial logit Elastic Net (alpha = 0.5, lambda = 6.162E-4 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 4\n#> training_frame\n#> 1 object_xtqmofwsbr\n#> \n#> Coefficients: glm coefficients\n#> names coefficients standardized_coefficients\n#> 1 Intercept -0.350788 -0.350788\n#> 2 A -1.084233 -1.084233\n#> 3 B 2.759366 2.759366\n#> \n#> H2OBinomialMetrics: glm\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.130451\n#> RMSE: 0.3611799\n#> LogLoss: 0.4248206\n#> Mean Per-Class Error: 0.1722728\n#> AUC: 0.8889644\n#> AUCPR: 0.8520865\n#> Gini: 0.7779288\n#> R^2: 0.4722968\n#> Residual Deviance: 666.9684\n#> AIC: 672.9684\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 350 84 0.193548 =84/434\n#> Class2 53 298 0.150997 =53/351\n#> Totals 403 382 0.174522 =137/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.411045 0.813097 213\n#> 2 max f2 0.229916 0.868991 279\n#> 3 max f0point5 0.565922 0.816135 166\n#> 4 max accuracy 0.503565 0.826752 185\n#> 5 max precision 0.997356 1.000000 0\n#> 6 max recall 0.009705 1.000000 395\n#> 7 max specificity 0.997356 1.000000 0\n#> 8 max absolute_mcc 0.411045 0.652014 213\n#> 9 max min_per_class_accuracy 0.454298 0.822581 201\n#> 10 max mean_per_class_accuracy 0.411045 0.827727 213\n#> 11 max tns 0.997356 434.000000 0\n#> 12 max fns 0.997356 349.000000 0\n#> 13 max fps 0.001723 434.000000 399\n#> 14 max tps 0.009705 351.000000 395\n#> 15 max tnr 0.997356 1.000000 0\n#> 16 max fnr 0.997356 0.994302 0\n#> 17 max fpr 0.001723 1.000000 399\n#> 18 max tpr 0.009705 1.000000 395\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.399 0.601 \n#> 2 0.857 0.143 \n#> 3 0.540 0.460 \n#> 4 0.976 0.0243\n#> 5 0.908 0.0925\n#> 6 0.848 0.152\n```\n:::\n\n\n## `keras` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n```\n:::\n\n\n## `LiblineaR` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"LiblineaR\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized logistic regression primal (L2R_LR)\"\n#> \n#> $Type\n#> [1] 0\n#> \n#> $W\n#> A B Bias\n#> [1,] 1.014233 -2.65166 0.3363362\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $ClassNames\n#> [1] Class1 Class2\n#> Levels: Class1 Class2\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: 
{.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.397 0.603 \n#> 2 0.847 0.153 \n#> 3 0.539 0.461 \n#> 4 0.973 0.0267\n#> 5 0.903 0.0974\n#> 6 0.837 0.163\n```\n:::\n\n\n## `stan` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> stan_glm\n#> family: binomial [logit]\n#> formula: class ~ .\n#> observations: 785\n#> predictors: 3\n#> ------\n#> Median MAD_SD\n#> (Intercept) -0.4 0.1 \n#> A -1.1 0.2 \n#> B 2.8 0.2 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.399 0.601 \n#> 2 0.860 0.140 \n#> 3 0.541 0.459 \n#> 4 0.977 0.0234\n#> 5 0.909 0.0906\n#> 6 0.852 0.148\npredict(logistic_reg_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0.338 0.463 0.537 0.662 \n#> 2 0.815 0.897 0.103 0.185 \n#> 3 0.493 0.588 0.412 0.507 \n#> 4 0.961 0.986 0.0135 0.0389\n#> 5 0.876 0.936 0.0643 0.124 \n#> 6 0.798 0.893 0.107 0.202\npredict(logistic_reg_fit, type = \"pred_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0 1 0 1\n#> 2 0 1 0 1\n#> 3 0 1 0 1\n#> 4 0 1 0 1\n#> 5 0 1 0 1\n#> 6 0 1 0 1\n```\n:::\n\n\n## `stan_glmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\npredict(logistic_reg_fit, type = \"conf_int\", new_data = bin_test)\npredict(logistic_reg_fit, type = \"pred_int\", new_data = bin_test)\n```\n:::\n\n\n## 
`spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(Class ~ ., data = tbl_bin$training)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> Coefficients:\n#> (Intercept) A B \n#> -3.731170 -1.214355 3.794186\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = tbl_bin$test)\npredict(logistic_reg_fit, type = \"prob\", new_data = tbl_bin$test)\n```\n:::\n\n\n\n## Multivariate Adaptive Regression Splines (`mars()`) \n\n## `earth` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_spec <- mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_fit <- mars_spec |> fit(class ~ ., data = bin_train)\nmars_fit\n#> parsnip model object\n#> \n#> GLM (family binomial, link logit):\n#> nulldev df dev df devratio AIC iters converged\n#> 1079.45 784 638.975 779 0.408 651 5 1\n#> \n#> Earth selected 6 of 13 terms, and 2 of 2 predictors\n#> Termination condition: Reached nk 21\n#> Importance: B, A\n#> Number of terms at each degree of interaction: 1 5 (additive model)\n#> Earth GCV 0.1342746 RSS 102.4723 GRSq 0.4582121 RSq 0.4719451\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mars_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mars_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.410 0.590 \n#> 2 0.794 0.206 \n#> 3 0.356 0.644 \n#> 4 0.927 0.0729\n#> 5 0.927 0.0729\n#> 6 0.836 0.164\n```\n:::\n\n\n## Neural Networks (`mlp()`) \n\n## `brulee` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> relu activation,\n#> 3 hidden units,\n#> 17 model parameters\n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 707 \n#> learn rate: 0.01 \n#> validation loss after 4 epochs: 0.508\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 
.pred_Class2\n#> \n#> 1 0.390 0.610\n#> 2 0.854 0.146\n#> 3 0.507 0.493\n#> 4 0.830 0.170\n#> 5 0.828 0.172\n#> 6 0.851 0.149\n```\n:::\n\n\n## `brulee_two_layer` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"brulee_two_layer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> c(relu,relu) activation,\n#> c(3,3) hidden units,\n#> 29 model parameters\n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 707 \n#> learn rate: 0.01 \n#> validation loss after 16 epochs: 0.307\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.411 0.589 \n#> 2 0.883 0.117 \n#> 3 0.520 0.480 \n#> 4 0.971 0.0293\n#> 5 0.938 0.0618\n#> 6 0.871 0.129\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: deeplearning\n#> Model ID: DeepLearning_model_R_1763571327438_3932 \n#> Status of Neuron Layers: predicting .outcome, 2-class classification, bernoulli distribution, CrossEntropy loss, 1,002 weights/biases, 16.9 KB, 7,850 training samples, mini-batch size 1\n#> layer units type dropout l1 l2 mean_rate rate_rms momentum\n#> 1 1 2 Input 0.00 % NA NA NA NA NA\n#> 2 2 200 Rectifier 0.00 % 0.000000 0.000000 0.008994 0.023584 0.000000\n#> 3 3 2 Softmax NA 0.000000 0.000000 0.002983 0.000548 0.000000\n#> mean_weight weight_rms mean_bias bias_rms\n#> 1 NA NA NA NA\n#> 2 0.006098 0.105669 0.492018 0.020146\n#> 3 0.033179 0.403317 -0.015716 0.023938\n#> \n#> \n#> H2OBinomialMetrics: deeplearning\n#> ** Reported on training data. 
**\n#> ** Metrics reported on full training frame **\n#> \n#> MSE: 0.130512\n#> RMSE: 0.3612645\n#> LogLoss: 0.4275074\n#> Mean Per-Class Error: 0.1685671\n#> AUC: 0.8893418\n#> AUCPR: 0.8486687\n#> Gini: 0.7786837\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 373 61 0.140553 =61/434\n#> Class2 69 282 0.196581 =69/351\n#> Totals 442 343 0.165605 =130/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.466071 0.812680 192\n#> 2 max f2 0.210358 0.870370 283\n#> 3 max f0point5 0.482168 0.819964 186\n#> 4 max accuracy 0.466071 0.834395 192\n#> 5 max precision 0.885661 0.950495 47\n#> 6 max recall 0.004683 1.000000 396\n#> 7 max specificity 0.991894 0.997696 0\n#> 8 max absolute_mcc 0.466071 0.664455 192\n#> 9 max min_per_class_accuracy 0.427673 0.823362 206\n#> 10 max mean_per_class_accuracy 0.466071 0.831433 192\n#> 11 max tns 0.991894 433.000000 0\n#> 12 max fns 0.991894 349.000000 0\n#> 13 max fps 0.000622 434.000000 399\n#> 14 max tps 0.004683 351.000000 396\n#> 15 max tnr 0.991894 0.997696 0\n#> 16 max fnr 0.991894 0.994302 0\n#> 17 max fpr 0.000622 1.000000 399\n#> 18 max tpr 0.004683 1.000000 396\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.469 0.531 \n#> 2 0.898 0.102 \n#> 3 0.581 0.419 \n#> 4 0.981 0.0191\n#> 5 0.919 0.0808\n#> 6 0.898 0.102\n```\n:::\n\n\n## `keras` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n```\n:::\n\n\n## `nnet` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> a 2-5-1 network with 21 weights\n#> inputs: A B \n#> output(s): class \n#> options were - entropy fitting\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, 
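\n # \"prob\" returns one .pred_* probability column per factor level\n 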
type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.418 0.582\n#> 2 0.658 0.342\n#> 3 0.406 0.594\n#> 4 0.725 0.275\n#> 5 0.714 0.286\n#> 6 0.633 0.367\n```\n:::\n\n\n## Multinom Regression (`multinom_reg()`) \n\n## `brulee` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Multinomial regression\n#> \n#> 192 samples, 2 features, 3 classes \n#> class weights one=1, two=1, three=1 \n#> weight decay: 0.001 \n#> batch size: 173 \n#> validation loss after 1 epoch: 0.816\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.133 0.207 0.660 \n#> 2 0.298 0.189 0.512 \n#> 3 0.346 0.206 0.448 \n#> 4 0.985 0.00158 0.0134\n#> 5 0.956 0.00343 0.0404\n#> 6 0.00328 0.742 0.254 \n#> 7 0.0570 0.411 0.532 \n#> 8 0.487 0.0488 0.465\n```\n:::\n\n\n## `glmnet` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"multinomial\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.219200\n#> 2 1 1.61 0.199700\n#> 3 2 3.90 0.181900\n#> 4 2 6.07 0.165800\n#> 5 2 7.93 0.151100\n#> 6 2 9.52 0.137600\n#> 7 2 10.90 0.125400\n#> 8 2 12.09 0.114300\n#> 9 2 13.13 0.104100\n#> 10 2 14.22 0.094870\n#> 11 2 15.28 0.086440\n#> 12 2 16.20 0.078760\n#> 13 2 16.99 0.071760\n#> 14 2 17.68 0.065390\n#> 15 2 18.28 0.059580\n#> 16 2 18.80 0.054290\n#> 17 2 19.24 0.049460\n#> 18 2 19.63 0.045070\n#> 19 2 19.96 0.041070\n#> 20 2 20.25 0.037420\n#> 21 2 20.49 0.034090\n#> 22 2 20.70 0.031070\n#> 23 2 20.88 0.028310\n#> 24 2 21.04 0.025790\n#> 25 2 21.17 0.023500\n#> 26 2 21.28 0.021410\n#> 27 2 21.38 0.019510\n#> 28 2 21.46 0.017780\n#> 29 2 21.53 0.016200\n#> 30 2 21.58 0.014760\n#> 31 2 21.63 0.013450\n#> 32 2 21.67 0.012250\n#> 33 2 21.71 0.011160\n#> 34 2 21.74 0.010170\n#> 35 2 21.77 0.009269\n#> 36 2 21.79 0.008445\n#> 37 2 21.82 0.007695\n#> 38 2 21.83 0.007011\n#> 39 2 21.85 0.006389\n#> 40 2 21.86 0.005821\n#> 41 2 21.87 0.005304\n#> 42 2 21.88 0.004833\n#> 43 2 21.89 0.004403\n#> 44 2 21.89 0.004012\n#> 45 2 21.90 0.003656\n#> 46 2 21.90 0.003331\n#> 47 2 21.91 0.003035\n#> 48 2 21.91 0.002765\n#> 49 2 21.91 0.002520\n#> 50 2 21.91 0.002296\n#> 51 2 21.92 0.002092\n#> 52 2 21.92 0.001906\n#> 53 2 21.92 0.001737\n#> 54 2 21.92 0.001582\n```\n:::\n\n\nThe holdout data can be 
predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.163 0.211 0.626 \n#> 2 0.318 0.185 0.496 \n#> 3 0.358 0.198 0.444 \n#> 4 0.976 0.00268 0.0217\n#> 5 0.940 0.00529 0.0544\n#> 6 0.00617 0.699 0.295 \n#> 7 0.0757 0.390 0.534 \n#> 8 0.506 0.0563 0.438\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OMultinomialModel: glm\n#> Model ID: GLM_model_R_1763571327438_3935 \n#> GLM Model: summary\n#> family link regularization\n#> 1 multinomial multinomial Elastic Net (alpha = 0.5, lambda = 4.372E-4 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 9 6 4\n#> training_frame\n#> 1 object_avyvxbooiq\n#> \n#> Coefficients: glm multinomial coefficients\n#> names coefs_class_0 coefs_class_1 coefs_class_2 std_coefs_class_0\n#> 1 Intercept -1.119482 -0.831434 -1.706488 -1.083442\n#> 2 A -1.119327 0.002894 0.750746 -1.029113\n#> 3 B -1.208210 0.078752 0.162842 -1.187423\n#> std_coefs_class_1 std_coefs_class_2\n#> 1 -0.819868 -1.830487\n#> 2 0.002661 0.690238\n#> 3 0.077397 0.160041\n#> \n#> H2OMultinomialMetrics: glm\n#> ** Reported on training data. 
**\n#> \n#> Training Set Metrics: \n#> =====================\n#> \n#> Extract training frame with `h2o.getFrame(\"object_avyvxbooiq\")`\n#> MSE: (Extract with `h2o.mse`) 0.2982118\n#> RMSE: (Extract with `h2o.rmse`) 0.5460878\n#> Logloss: (Extract with `h2o.logloss`) 0.822443\n#> Mean Per-Class Error: 0.4583896\n#> AUC: (Extract with `h2o.auc`) NaN\n#> AUCPR: (Extract with `h2o.aucpr`) NaN\n#> Null Deviance: (Extract with `h2o.nulldeviance`) 404.5036\n#> Residual Deviance: (Extract with `h2o.residual_deviance`) 315.8181\n#> R^2: (Extract with `h2o.r2`) 0.4682043\n#> AIC: (Extract with `h2o.aic`) NaN\n#> Confusion Matrix: Extract with `h2o.confusionMatrix(,train = TRUE)`)\n#> =========================================================================\n#> Confusion Matrix: Row labels: Actual class; Column labels: Predicted class\n#> one three two Error Rate\n#> one 59 18 1 0.2436 = 19 / 78\n#> three 19 52 5 0.3158 = 24 / 76\n#> two 7 24 7 0.8158 = 31 / 38\n#> Totals 85 94 13 0.3854 = 74 / 192\n#> \n#> Hit Ratio Table: Extract with `h2o.hit_ratio_table(,train = TRUE)`\n#> =======================================================================\n#> Top-3 Hit Ratios: \n#> k hit_ratio\n#> 1 1 0.614583\n#> 2 2 0.890625\n#> 3 3 1.000000\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_three .pred_two\n#> \n#> 1 0.146 0.641 0.213 \n#> 2 0.308 0.513 0.179 \n#> 3 0.350 0.460 0.190 \n#> 4 0.983 0.0158 0.00128\n#> 5 0.955 0.0422 0.00284\n#> 6 0.00329 0.244 0.752 \n#> 7 0.0599 0.527 0.413 \n#> 8 0.521 0.432 0.0469\n```\n:::\n\n\n## `keras` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n```\n:::\n\n\n## `nnet` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and nnet is the default engine so there is no need to set that either.\nmultinom_reg_spec <- multinom_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> nnet::multinom(formula = class ~ ., data = data, trace = FALSE)\n#> \n#> Coefficients:\n#> (Intercept) A B\n#> two -0.5868435 1.881920 1.379106\n#> three 0.2910810 1.129622 1.292802\n#> \n#> Residual Deviance: 315.8164 \n#> AIC: 327.8164\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, 
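\n # the predicted class is the level with the largest fitted probability\n 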
type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.145 0.213 0.641 \n#> 2 0.308 0.178 0.514 \n#> 3 0.350 0.189 0.461 \n#> 4 0.983 0.00123 0.0155\n#> 5 0.956 0.00275 0.0415\n#> 6 0.00318 0.754 0.243 \n#> 7 0.0591 0.414 0.527 \n#> 8 0.522 0.0465 0.431\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = tbl_mtl$training)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Formula: class ~ .\n#> \n#> Coefficients:\n#> (Intercept) A B\n#> one 0.05447853 -1.0569131 -0.9049194\n#> three 0.41207949 0.1458870 0.3959664\n#> two -0.46655802 0.9110261 0.5089529\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = tbl_mtl$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 one \n#> 2 one \n#> 3 three \n#> 4 three \n#> 5 three \n#> 6 three \n#> 7 three\npredict(multinom_reg_fit, type = \"prob\", new_data = tbl_mtl$test)\n#> # Source: SQL [?? x 3]\n#> # Database: spark_connection\n#> pred_one pred_three pred_two\n#> \n#> 1 0.910 0.0814 0.00904\n#> 2 0.724 0.233 0.0427 \n#> 3 0.124 0.620 0.256 \n#> 4 0.0682 0.610 0.322 \n#> 5 0.130 0.571 0.300 \n#> 6 0.115 0.549 0.336 \n#> 7 0.0517 0.524 0.424\n```\n:::\n\n\n\n## Naive Bayes (`naive_Bayes()`) \n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_spec <- naive_Bayes() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\nnaive_Bayes_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: naivebayes\n#> Model ID: NaiveBayes_model_R_1763571327438_3936 \n#> Model Summary: \n#> number_of_response_levels min_apriori_probability max_apriori_probability\n#> 1 2 0.44713 0.55287\n#> \n#> \n#> H2OBinomialMetrics: naivebayes\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.1737113\n#> RMSE: 0.4167869\n#> LogLoss: 0.5473431\n#> Mean Per-Class Error: 0.2356138\n#> AUC: 0.8377152\n#> AUCPR: 0.788608\n#> Gini: 0.6754303\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 274 160 0.368664 =160/434\n#> Class2 36 315 0.102564 =36/351\n#> Totals 310 475 0.249682 =196/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.175296 0.762712 286\n#> 2 max f2 0.133412 0.851119 306\n#> 3 max f0point5 0.497657 0.731343 183\n#> 4 max accuracy 0.281344 0.765605 248\n#> 5 max precision 0.999709 1.000000 0\n#> 6 max recall 0.020983 1.000000 390\n#> 7 max specificity 0.999709 1.000000 0\n#> 8 max absolute_mcc 0.280325 0.541898 249\n#> 9 max min_per_class_accuracy 0.398369 0.758065 215\n#> 10 max mean_per_class_accuracy 0.280325 0.771945 249\n#> 11 max tns 0.999709 434.000000 0\n#> 12 max fns 0.999709 347.000000 0\n#> 13 max fps 0.006522 434.000000 399\n#> 14 max tps 0.020983 351.000000 390\n#> 15 max tnr 0.999709 1.000000 0\n#> 16 max fnr 0.999709 0.988604 0\n#> 17 max fpr 0.006522 1.000000 399\n#> 18 max tpr 0.020983 1.000000 390\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class2\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.181 0.819 \n#> 2 0.750 0.250 \n#> 3 0.556 0.444 \n#> 4 0.994 0.00643\n#> 5 0.967 0.0331 \n#> 6 0.630 0.370\n```\n:::\n\n\n## `klaR` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and klaR is the default engine so there is no need to set that either.\nnaive_Bayes_spec <- naive_Bayes()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\n\n# No real print method\n# naive_Bayes_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.250 0.750 \n#> 2 0.593 0.407 \n#> 3 0.333 0.667 \n#> 4 0.993 0.00658\n#> 5 0.978 0.0223 \n#> 6 0.531 0.469\n```\n:::\n\n\n## `naivebayes` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_spec <- naive_Bayes() |> \n # This engine works with a single mode so no need to set that\n 
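# parsnip uses kernel density estimates for the numeric predictors here\n # (note usekernel = TRUE in the call printed below)\n 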
set_engine(\"naivebayes\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\nnaive_Bayes_fit\n#> parsnip model object\n#> \n#> \n#> ================================= Naive Bayes ==================================\n#> \n#> Call:\n#> naive_bayes.default(x = maybe_data_frame(x), y = y, usekernel = TRUE)\n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> Laplace smoothing: 0\n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> A priori probabilities: \n#> \n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> Tables: \n#> \n#> -------------------------------------------------------------------------------- \n#> :: A::Class1 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (434 obs.);\tBandwidth 'bw' = 0.2548\n#> \n#> x y \n#> Min. :-2.5638 Min. :0.0002915 \n#> 1st Qu.:-1.2013 1st Qu.:0.0506201 \n#> Median : 0.1612 Median :0.1619843 \n#> Mean : 0.1612 Mean :0.1831190 \n#> 3rd Qu.: 1.5237 3rd Qu.:0.2581668 \n#> Max. : 2.8862 Max. :0.5370762 \n#> -------------------------------------------------------------------------------- \n#> :: A::Class2 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (351 obs.);\tBandwidth 'bw' = 0.2596\n#> \n#> x y \n#> Min. :-2.5428 Min. :4.977e-05 \n#> 1st Qu.:-1.1840 1st Qu.:2.672e-02 \n#> Median : 0.1748 Median :2.239e-01 \n#> Mean : 0.1748 Mean :1.836e-01 \n#> 3rd Qu.: 1.5336 3rd Qu.:2.926e-01 \n#> Max. : 2.8924 Max. :3.740e-01 \n#> \n#> -------------------------------------------------------------------------------- \n#> :: B::Class1 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (434 obs.);\tBandwidth 'bw' = 0.1793\n#> \n#> x y \n#> Min. :-2.4501 Min. :5.747e-05 \n#> 1st Qu.:-1.0894 1st Qu.:1.424e-02 \n#> Median : 0.2713 Median :8.798e-02 \n#> Mean : 0.2713 Mean :1.834e-01 \n#> 3rd Qu.: 1.6320 3rd Qu.:2.758e-01 \n#> Max. : 2.9927 Max. :6.872e-01 \n#> \n#> -------------------------------------------------------------------------------- \n#> :: B::Class2 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (351 obs.);\tBandwidth 'bw' = 0.2309\n#> \n#> x y \n#> Min. :-2.4621 Min. :5.623e-05 \n#> 1st Qu.:-0.8979 1st Qu.:1.489e-02 \n#> Median : 0.6663 Median :7.738e-02 \n#> Mean : 0.6663 Mean :1.595e-01 \n#> 3rd Qu.: 2.2305 3rd Qu.:3.336e-01 \n#> Max. : 3.7948 Max. 
:4.418e-01 \n#> \n#> --------------------------------------------------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.249 0.751 \n#> 2 0.593 0.407 \n#> 3 0.332 0.668 \n#> 4 0.993 0.00674\n#> 5 0.978 0.0224 \n#> 6 0.532 0.468\n```\n:::\n\n\n## K-Nearest Neighbors (`nearest_neighbor()`) \n\n## `kknn` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_spec <- nearest_neighbor() |>\n # We need to set the mode since this engine works with multiple modes\n # and kknn is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_fit <- nearest_neighbor_spec |> fit(class ~ ., data = bin_train)\nnearest_neighbor_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> kknn::train.kknn(formula = class ~ ., data = data, ks = min_rows(5, data, 5))\n#> \n#> Type of response variable: nominal\n#> Minimal misclassification: 0.2101911\n#> Best kernel: optimal\n#> Best k: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(nearest_neighbor_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(nearest_neighbor_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.2 0.8 \n#> 2 0.72 0.28\n#> 3 0.32 0.68\n#> 4 1 0 \n#> 5 1 0 \n#> 6 1 0\n```\n:::\n\n\n## Null Model (`null_model()`) \n\n## `parsnip` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_spec <- null_model() |>\n # We need to set the mode since this engine works with multiple modes\n # and parsnip is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_fit <- null_model_spec |> fit(class ~ ., data = bin_train)\nnull_model_fit\n#> parsnip model object\n#> \n#> Null Regression Model\n#> Predicted Value: Class1\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(null_model_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(null_model_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.553 0.447\n#> 2 0.553 0.447\n#> 3 0.553 0.447\n#> 4 0.553 0.447\n#> 5 0.553 0.447\n#> 6 0.553 0.447\n```\n:::\n\n\n## Partial Least Squares (`pls()`) \n\n## `mixOmics` Engine \n\nThis engine requires the plsmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(plsmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\npls_spec <- pls() |>\n # We need to set the mode since this engine works with multiple modes\n # and mixOmics is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_fit <- pls_spec |> fit(class ~ ., data = bin_train)\npls_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> mixOmics::splsda(X = x, Y = y, ncomp = ncomp, keepX = keepX) \n#> \n#> sPLS-DA (regression mode) with 2 sPLS-DA components. \n#> You entered data X of dimensions: 785 2 \n#> You entered data Y with 2 classes. \n#> \n#> Selection of [2] [2] variables on each of the sPLS-DA components on the X data set. \n#> No Y variables can be selected. \n#> \n#> Main numerical outputs: \n#> -------------------- \n#> loading vectors: see object$loadings \n#> variates: see object$variates \n#> variable names: see object$names \n#> \n#> Functions to visualise samples: \n#> -------------------- \n#> plotIndiv, plotArrow, cim \n#> \n#> Functions to visualise variables: \n#> -------------------- \n#> plotVar, plotLoadings, network, cim \n#> \n#> Other functions: \n#> -------------------- \n#> selectVar, tune, perf, auc\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(pls_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(pls_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.462 0.538\n#> 2 0.631 0.369\n#> 3 0.512 0.488\n#> 4 0.765 0.235\n#> 5 0.675 0.325\n#> 6 0.624 0.376\n```\n:::\n\n\n## Random Forests (`rand_forest()`) \n\n## `aorsf` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random classification forest\n#> \n#> Linear combinations: Accelerated Logistic regression\n#> N observations: 785\n#> N classes: 2\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 24.166\n#> Min observations in leaf: 5\n#> OOB stat value: 0.87\n#> OOB stat type: AUC-ROC\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.199 0.801 \n#> 2 0.882 0.118 \n#> 3 0.361 0.639 \n#> 4 0.978 0.0220\n#> 5 0.936 0.0642\n#> 6 0.904 0.0957\n```\n:::\n\n\n## `grf` 
Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"grf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\npredict(rand_forest_fit, type = \"conf_int\", new_data = bin_test)\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: drf\n#> Model ID: DRF_model_R_1763571327438_3938 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 91643 13\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 20 16.38000 114 158 141.50000\n#> \n#> \n#> H2OBinomialMetrics: drf\n#> ** Reported on training data. 
**\n#> ** Metrics reported on Out-Of-Bag training samples **\n#> \n#> MSE: 0.1644052\n#> RMSE: 0.4054691\n#> LogLoss: 1.62537\n#> Mean Per-Class Error: 0.2084695\n#> AUC: 0.8379252\n#> AUCPR: 0.7897947\n#> Gini: 0.6758504\n#> R^2: 0.3349444\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 326 108 0.248848 =108/434\n#> Class2 59 292 0.168091 =59/351\n#> Totals 385 400 0.212739 =167/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.363636 0.777630 128\n#> 2 max f2 0.263158 0.827455 147\n#> 3 max f0point5 0.642857 0.762215 78\n#> 4 max accuracy 0.384615 0.787261 125\n#> 5 max precision 0.944444 0.876033 10\n#> 6 max recall 0.000000 1.000000 217\n#> 7 max specificity 1.000000 0.972350 0\n#> 8 max absolute_mcc 0.363636 0.579899 128\n#> 9 max min_per_class_accuracy 0.458333 0.780627 112\n#> 10 max mean_per_class_accuracy 0.363636 0.791530 128\n#> 11 max tns 1.000000 422.000000 0\n#> 12 max fns 1.000000 275.000000 0\n#> 13 max fps 0.000000 434.000000 217\n#> 14 max tps 0.000000 351.000000 217\n#> 15 max tnr 1.000000 0.972350 0\n#> 16 max fnr 1.000000 0.783476 0\n#> 17 max fpr 0.000000 1.000000 217\n#> 18 max tpr 0.000000 1.000000 217\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.12 0.88\n#> 2 0.88 0.12\n#> 3 0.11 0.89\n#> 4 1 0 \n#> 5 0.76 0.24\n#> 6 1 0\n```\n:::\n\n\n## `partykit` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\n\n# Too long to print\n# rand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.396 0.604 \n#> 2 0.804 0.196 \n#> 3 0.313 0.687 \n#> 4 0.966 0.0343\n#> 5 0.887 0.113 \n#> 6 0.931 0.0689\n```\n:::\n\n\n## `randomForest` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"randomForest\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> randomForest(x = maybe_data_frame(x), y = y) \n#> Type of random forest: classification\n#> Number of trees: 500\n#> No. of variables tried at each split: 1\n#> \n#> OOB estimate of error rate: 21.66%\n#> Confusion matrix:\n#> Class1 Class2 class.error\n#> Class1 348 86 0.1981567\n#> Class2 84 267 0.2393162\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.174 0.826\n#> 2 0.88 0.12 \n#> 3 0.112 0.888\n#> 4 1 0 \n#> 5 0.692 0.308\n#> 6 0.922 0.078\n```\n:::\n\n\n## `ranger` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n # and ranger is the default engine so there is no need to set that either.\n set_engine(\"ranger\", keep.inbag = TRUE) |> \n # However, we'll set the engine and use the keep.inbag=TRUE option so that we \n # can produce interval predictions. This is not generally required. \n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Ranger result\n#> \n#> Call:\n#> ranger::ranger(x = maybe_data_frame(x), y = y, keep.inbag = ~TRUE, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) \n#> \n#> Type: Probability estimation \n#> Number of trees: 500 \n#> Sample size: 785 \n#> Number of independent variables: 2 \n#> Mtry: 1 \n#> Target node size: 10 \n#> Variable importance mode: none \n#> Splitrule: gini \n#> OOB prediction error (Brier s.): 0.1486808\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.228 0.772 \n#> 2 0.828 0.172 \n#> 3 0.214 0.786 \n#> 4 0.942 0.0578\n#> 5 0.763 0.237 \n#> 6 0.900 0.100\npredict(rand_forest_fit, type = \"conf_int\", new_data = bin_test)\n#> Warning in rInfJack(x, inbag.counts): Sample size <=20, no calibration\n#> performed.\n#> Warning in rInfJack(x, inbag.counts): Sample size <=20, no calibration\n#> performed.\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0 0.510 0.490 1 \n#> 2 0.660 0.997 0.00288 0.340\n#> 3 0 0.461 0.539 1 \n#> 4 0.798 1 0 0.202\n#> 5 0.567 0.959 0.0408 0.433\n#> 6 0.745 1 0 0.255\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n set_mode(\"classification\") |>\n 
set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(Class ~ ., data = tbl_bin$training)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> RandomForestClassificationModel: uid=random_forest__3204ae4e_77ac_4f0c_b642_fef909ba5c81, numTrees=20, numClasses=2, numFeatures=2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(rand_forest_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.244 0.756 \n#> 2 0.249 0.751 \n#> 3 0.836 0.164 \n#> 4 0.227 0.773 \n#> 5 0.260 0.740 \n#> 6 0.962 0.0383\n#> 7 0.0937 0.906\n```\n:::\n\n\n## Rule Fit (`rule_fit()`) \n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: rulefit\n#> Model ID: RuleFit_model_R_1763571327438_3989 \n#> Rulefit Model Summary: \n#> family link regularization number_of_predictors_total\n#> 1 binomial logit Lasso (lambda = 0.03081 ) 2377\n#> number_of_active_predictors number_of_iterations rule_ensemble_size\n#> 1 4 5 2375\n#> number_of_trees number_of_internal_trees min_depth max_depth mean_depth\n#> 1 150 150 0 5 4.00000\n#> min_leaves max_leaves mean_leaves\n#> 1 0 31 15.83333\n#> \n#> \n#> H2OBinomialMetrics: rulefit\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.1422931\n#> RMSE: 0.3772176\n#> LogLoss: 0.4500322\n#> Mean Per-Class Error: 0.1867902\n#> AUC: 0.8764064\n#> AUCPR: 0.8338422\n#> Gini: 0.7528129\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 351 83 0.191244 =83/434\n#> Class2 64 287 0.182336 =64/351\n#> Totals 415 370 0.187261 =147/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.485283 0.796117 204\n#> 2 max f2 0.263811 0.861522 270\n#> 3 max f0point5 0.620200 0.799574 147\n#> 4 max accuracy 0.485283 0.812739 204\n#> 5 max precision 0.984770 1.000000 0\n#> 6 max recall 0.048801 1.000000 393\n#> 7 max specificity 0.984770 1.000000 0\n#> 8 max absolute_mcc 0.485283 0.623934 204\n#> 9 max min_per_class_accuracy 0.489555 0.808756 202\n#> 10 max mean_per_class_accuracy 0.485283 0.813210 204\n#> 11 max tns 0.984770 434.000000 0\n#> 12 max fns 0.984770 350.000000 0\n#> 13 max fps 0.037559 434.000000 399\n#> 14 max tps 0.048801 351.000000 393\n#> 15 max tnr 0.984770 1.000000 0\n#> 16 max fnr 0.984770 0.997151 0\n#> 17 max fpr 0.037559 1.000000 399\n#> 18 max tpr 0.048801 1.000000 393\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rule_fit_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.377 0.623 \n#> 2 0.737 0.263 \n#> 3 0.487 0.513 \n#> 4 0.956 0.0440\n#> 5 0.879 0.121 \n#> 6 0.693 0.307\n```\n:::\n\n\n## `xrf` Engine \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n # and xrf is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> An eXtreme RuleFit model of 358 rules.\n#> \n#> Original Formula:\n#> \n#> class ~ A + B\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rule_fit_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.419 0.581\n#> 2 0.651 0.349\n#> 3 0.506 0.494\n#> 4 0.891 0.109\n#> 5 0.805 0.195\n#> 6 0.616 0.384\n```\n:::\n\n\n## Support Vector Machine (Linear Kernel) (`svm_linear()`) \n\n## `kernlab` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n 
set_engine(\"kernlab\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(class ~ ., data = bin_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Linear (vanilla) kernel function. \n#> \n#> Number of Support Vectors : 357 \n#> \n#> Objective Function Value : -353.0043 \n#> Training error : 0.17707 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.403 0.597 \n#> 2 0.858 0.142 \n#> 3 0.540 0.460 \n#> 4 0.975 0.0254\n#> 5 0.905 0.0949\n#> 6 0.849 0.151\n```\n:::\n\n\n## `LiblineaR` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n # and LiblineaR is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(class ~ ., data = bin_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized L2-loss support vector classification dual (L2R_L2LOSS_SVC_DUAL)\"\n#> \n#> $Type\n#> [1] 1\n#> \n#> $W\n#> A B Bias\n#> [1,] 0.3641925 -0.9648581 0.1182515\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $ClassNames\n#> [1] Class1 Class2\n#> Levels: Class1 Class2\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\n```\n:::\n\n\n## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) \n\n## `kernlab` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_spec <- svm_poly() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_fit <- svm_poly_spec |> fit(class ~ ., data = bin_train)\n#> Setting default kernel parameters\nsvm_poly_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Polynomial kernel function. 
\n#> Hyperparameters : degree = 1 scale = 1 offset = 1 \n#> \n#> Number of Support Vectors : 357 \n#> \n#> Objective Function Value : -353.0043 \n#> Training error : 0.17707 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_poly_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> <fct> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_poly_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> <dbl> <dbl>\n#> 1 0.412 0.588 \n#> 2 0.863 0.137 \n#> 3 0.549 0.451 \n#> 4 0.976 0.0242\n#> 5 0.909 0.0912\n#> 6 0.855 0.145\n```\n:::\n\n\n## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) \n\n## `kernlab` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(class ~ ., data = bin_train)\nsvm_rbf_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Gaussian Radial Basis kernel function. \n#> Hyperparameter : sigma = 2.60157241724157 \n#> \n#> Number of Support Vectors : 338 \n#> \n#> Objective Function Value : -292.4523 \n#> Training error : 0.170701 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> <fct> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_rbf_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> <dbl> <dbl>\n#> 1 0.524 0.476\n#> 2 0.893 0.107\n#> 3 0.239 0.761\n#> 4 0.866 0.134\n#> 5 0.867 0.133\n#> 6 0.876 0.124\n```\n:::\n\n\n## `liquidSVM` Engine \n\nNote that this package is no longer on CRAN. You can attempt to install it from its read-only CRAN mirror on GitHub:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npak::pak(\"cran/liquidSVM\") # fails\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"liquidSVM\")\n#> Warning: The `engine` argument of `set_engine()` cannot be liquidSVM as of\n#> parsnip 0.1.6.\n#> ℹ The liquidSVM package is no longer available on CRAN.\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(class ~ ., data = bin_train)\nsvm_rbf_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, type = \"class\", new_data = bin_test)\npredict(svm_rbf_fit, type = \"prob\", new_data = bin_test)\n```\n:::\n\n\n# Regression Models\n\n\nTo demonstrate regression, we'll subset some data, 
make a training/test split, and standardize the predictors: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(938)\nreg_split <-\n modeldata::concrete |> \n slice_sample(n = 100) |> \n select(strength = compressive_strength, cement, age) |> \n initial_split(prop = 0.95, strata = strength)\nreg_split\n#> <Training/Testing/Total>\n#> <92/8/100>\n\nreg_rec <- \n recipe(strength ~ ., data = training(reg_split)) |> \n step_normalize(all_numeric_predictors()) |> \n prep()\n\nreg_train <- bake(reg_rec, new_data = NULL)\nreg_test <- bake(reg_rec, new_data = testing(reg_split))\n```\n:::\n\n\nWe also have some models that are specific to integer count outcomes. The data for these are:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(207)\ncount_split <-\n attrition |>\n select(num_years = TotalWorkingYears, age = Age, income = MonthlyIncome) |>\n initial_split(prop = 0.994)\ncount_split\n#> <Training/Testing/Total>\n#> <1461/9/1470>\n\ncount_rec <-\n recipe(num_years ~ ., data = training(count_split)) |>\n step_normalize(all_numeric_predictors()) |>\n prep()\n\ncount_train <- bake(count_rec, new_data = NULL)\ncount_test <- bake(count_rec, new_data = testing(count_split))\n```\n:::\n\n\nIf using the **Apache Spark** engine, we will need to identify the data source, \nand then use it to create the splits. For this article, we will copy the \n`concrete` data set into the Spark session.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ntbl_concrete <- copy_to(sc, modeldata::concrete)\n\ntbl_reg <- sdf_random_split(tbl_concrete, training = 0.95, test = 0.05, seed = 100)\n```\n:::
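\n\n\nThe code above assumes that a Spark connection object named `sc` already exists. If you need to create one, a minimal sketch using sparklyr (and assuming a local Spark installation) is:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\n\n# Connect to a local Spark instance; this is the `sc` object that\n# copy_to() uses above\nsc <- spark_connect(master = \"local\")\n```\n:::\n\n\n\n## AutoML (`auto_ml()`) \n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nauto_ml_spec <- auto_ml() |>\n # We don't need to set the engine (since there is only one) but we'll set\n # a time limit\n set_engine(\"h2o\", max_runtime_secs = 60 * 3) |> \n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nauto_ml_fit <- auto_ml_spec |> fit(strength ~ ., data = reg_train)\nauto_ml_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(auto_ml_fit, new_data = reg_test)\n```\n:::\n\n\n## Bagged MARS (`bag_mars()`) \n\n## `earth` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_spec <- bag_mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_fit <- bag_mars_spec |> fit(strength ~ ., data = reg_train)\nbag_mars_fit\n#> parsnip model object\n#> \n#> Bagged MARS (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> <chr> <dbl> <dbl> <int>\n#> 1 age 86.9 5.54 11\n#> 2 cement 76.6 5.73 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell 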
layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mars_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 21.5\n#> 2 41.3\n#> 3 27.3\n#> 4 56.6\n#> 5 35.9\n#> 6 36.5\n#> 7 38.5\n#> 8 38.2\n```\n:::\n\n\n## Bagged Neural Networks (`bag_mlp()`) \n\n## `nnet` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_spec <- bag_mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_fit <- bag_mlp_spec |> fit(strength ~ ., data = reg_train)\nbag_mlp_fit\n#> parsnip model object\n#> \n#> Bagged nnet (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 age 59.3 1.66 11\n#> 2 cement 40.7 1.66 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 22.7\n#> 2 42.0\n#> 3 27.8\n#> 4 76.0\n#> 5 37.3\n#> 6 39.0\n#> 7 35.9\n#> 8 42.4\n```\n:::\n\n\n## Bagged Decision Trees (`bag_tree()`) \n\n## `rpart` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit <- bag_tree_spec |> fit(strength ~ ., data = reg_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged CART (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 cement 17674. 1795. 11\n#> 2 age 12753. 489. 
11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> <dbl>\n#> 1 24.0\n#> 2 32.4\n#> 3 29.7\n#> 4 58.0\n#> 5 37.8\n#> 6 44.4\n#> 7 42.5\n#> 8 38.2\n```\n:::\n\n\n## Bayesian Additive Regression Trees (`bart()`) \n\n## `dbarts` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_spec <- bart() |>\n # We need to set the mode since this engine works with multiple modes\n # and dbarts is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_fit <- bart_spec |> fit(strength ~ ., data = reg_train)\nbart_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> `NULL`()\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bart_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> <dbl>\n#> 1 24.2\n#> 2 41.0\n#> 3 26.5\n#> 4 52.6\n#> 5 36.0\n#> 6 36.8\n#> 7 39.1\n#> 8 37.9\npredict(bart_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> <dbl> <dbl>\n#> 1 16.7 32.0\n#> 2 33.0 49.2\n#> 3 20.5 31.5\n#> 4 41.8 63.5\n#> 5 28.1 43.9\n#> 6 30.2 42.6\n#> 7 33.3 45.3\n#> 8 27.2 50.0\npredict(bart_fit, type = \"pred_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> <dbl> <dbl>\n#> 1 4.90 44.3\n#> 2 22.5 60.4\n#> 3 8.62 44.8\n#> 4 35.0 71.9\n#> 5 16.6 53.3\n#> 6 19.9 54.5\n#> 7 22.5 57.3\n#> 8 16.4 58.6\n```\n:::\n\n\n## Boosted Decision Trees (`boost_tree()`) \n\n## `catboost` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"catboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> CatBoost model (1000 trees)\n#> Loss function: RMSE\n#> Fit to 2 feature(s)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> <dbl>\n#> 1 25.3\n#> 2 33.9\n#> 3 28.1\n#> 4 60.7\n#> 5 35.4\n#> 6 38.2\n#> 7 43.3\n#> 8 29.8\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> 
\n#> H2ORegressionModel: gbm\n#> Model ID: GBM_model_R_1763571327438_4145 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 20476 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 14 43 27.92000\n#> \n#> \n#> H2ORegressionMetrics: gbm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.001563879\n#> RMSE: 0.03954591\n#> MAE: 0.02903684\n#> RMSLE: 0.001771464\n#> Mean Residual Deviance : 0.001563879\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.7\n#> 2 32.2\n#> 3 26.9\n#> 4 63.2\n#> 5 34.9\n#> 6 39.0\n#> 7 40.0\n#> 8 32.9\n```\n:::\n\n\n## `h2o_gbm` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o_gbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: gbm\n#> Model ID: GBM_model_R_1763571327438_4146 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 20476 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 14 43 27.92000\n#> \n#> \n#> H2ORegressionMetrics: gbm\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.001563879\n#> RMSE: 0.03954591\n#> MAE: 0.02903684\n#> RMSLE: 0.001771464\n#> Mean Residual Deviance : 0.001563879\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.7\n#> 2 32.2\n#> 3 26.9\n#> 4 63.2\n#> 5 34.9\n#> 6 39.0\n#> 7 40.0\n#> 8 32.9\n```\n:::\n\n\n## `lightgbm` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"lightgbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> LightGBM Model (100 trees)\n#> Objective: regression\n#> Fitted to dataset with 2 columns\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 20.6\n#> 2 42.5\n#> 3 27.0\n#> 4 49.2\n#> 5 43.7\n#> 6 38.3\n#> 7 41.1\n#> 8 36.9\n```\n:::\n\n\n## `xgboost` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and xgboost is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> ##### xgb.Booster\n#> raw: 35 Kb \n#> call:\n#> xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, \n#> colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, \n#> subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist, \n#> verbose = 0, nthread = 1, objective = \"reg:squarederror\")\n#> params (as set within xgb.train):\n#> eta = \"0.3\", max_depth = \"6\", gamma = \"0\", colsample_bytree = \"1\", colsample_bynode = \"1\", min_child_weight = \"1\", subsample = \"1\", nthread = \"1\", objective = \"reg:squarederror\", validate_parameters = \"TRUE\"\n#> xgb.attributes:\n#> niter\n#> callbacks:\n#> cb.evaluation.log()\n#> # of features: 2 \n#> niter: 15\n#> nfeatures : 2 \n#> evaluation_log:\n#> iter training_rmse\n#> \n#> 1 27.511751\n#> 2 20.726236\n#> --- ---\n#> 14 2.774394\n#> 15 2.632224\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 22.3\n#> 2 32.9\n#> 3 26.7\n#> 4 57.6\n#> 5 34.9\n#> 6 33.8\n#> 7 42.6\n#> 8 26.3\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n set_mode(\"regression\") |>\n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> GBTRegressionModel: uid=gradient_boosted_trees__d4414e35_351c_433f_958b_847ee38e9416, numTrees=20, numFeatures=8\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 20.8 \n#> 2 28.1 \n#> 3 15.5 \n#> 4 22.4 \n#> 5 9.37\n#> 6 40.1 \n#> 7 14.2 \n#> 8 32.1 \n#> 9 37.4 \n#> 10 49.5 \n#> # ℹ more rows\n```\n:::\n\n\n## Cubist Rules (`cubist_rules()`) \n\n## `Cubist` Engine \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and Cubist is the default engine so there is no need to set that either.\ncubist_rules_spec <- cubist_rules()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncubist_rules_fit <- cubist_rules_spec |> fit(strength ~ ., data = reg_train)\ncubist_rules_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> cubist.default(x = x, y = y, committees = 1)\n#> \n#> Number of samples: 92 \n#> Number of predictors: 2 \n#> \n#> Number of committees: 1 \n#> Number of rules: 2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(cubist_rules_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.2\n#> 2 46.3\n#> 3 23.6\n#> 4 54.4\n#> 5 32.7\n#> 6 37.8\n#> 7 38.8\n#> 8 38.6\n```\n:::\n\n\n## Decision Tree (`decision_tree()`) \n\n## `partykit` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> strength ~ cement + age\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] cement <= 0.72078\n#> | | [3] age <= -0.60316\n#> | | | [4] cement <= -0.38732: 11.141 (n = 12, err = 292.8)\n#> | | | [5] cement > -0.38732: 18.005 (n = 11, err = 401.5)\n#> | | [6] age > -0.60316\n#> | | | [7] cement <= 0.24945\n#> | | | | [8] age <= -0.2359: 28.756 (n = 24, err = 1450.6)\n#> | | | | [9] age > -0.2359: 39.014 (n = 11, err = 634.8)\n#> | | | [10] cement > 0.24945: 42.564 (n = 11, err = 1041.7)\n#> | [11] cement > 0.72078: 50.864 (n = 23, err = 5390.3)\n#> \n#> Number of inner nodes: 5\n#> Number of terminal nodes: 6\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> 
\n#> 1 18.0\n#> 2 39.0\n#> 3 28.8\n#> 4 50.9\n#> 5 50.9\n#> 6 42.6\n#> 7 42.6\n#> 8 50.9\n```\n:::\n\n\n## `rpart` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> n= 92 \n#> \n#> node), split, n, deviance, yval\n#> * denotes terminal node\n#> \n#> 1) root 92 26564.7400 33.57728 \n#> 2) cement< 0.7861846 69 12009.9000 27.81493 \n#> 4) age< -0.5419541 23 964.6417 14.42348 \n#> 8) cement< -0.3695209 12 292.7811 11.14083 *\n#> 9) cement>=-0.3695209 11 401.4871 18.00455 *\n#> 5) age>=-0.5419541 46 4858.3440 34.51065 \n#> 10) age< 0.008934354 32 2208.3040 31.16781 \n#> 20) cement< 0.311975 24 1450.6200 28.75583 *\n#> 21) cement>=0.311975 8 199.1900 38.40375 *\n#> 11) age>=0.008934354 14 1475.1130 42.15143 *\n#> 3) cement>=0.7861846 23 5390.3320 50.86435 \n#> 6) age< -0.5419541 7 390.4204 40.08429 *\n#> 7) age>=-0.5419541 16 3830.5510 55.58062 *\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 18.0\n#> 2 42.2\n#> 3 28.8\n#> 4 55.6\n#> 5 40.1\n#> 6 38.4\n#> 7 38.4\n#> 8 40.1\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n set_mode(\"regression\") |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> DecisionTreeRegressionModel: uid=decision_tree_regressor__224bd5f4_4a90_4afe_9056_f064491ee63e, depth=5, numNodes=63, numFeatures=8\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = tbl_reg$test)\n```\n:::\n\n\n\n\n## Generalized Additive Models (`gen_additive_mod()`) \n\n## `mgcv` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_spec <- gen_additive_mod() |>\n # We need to set the mode since this engine works with multiple modes\n # and mgcv is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_fit <- \n gen_additive_mod_spec |> \n fit(strength ~ s(age) + s(cement), data = reg_train)\ngen_additive_mod_fit\n#> parsnip model object\n#> \n#> \n#> Family: gaussian \n#> Link function: identity \n#> \n#> Formula:\n#> strength ~ s(age) + s(cement)\n#> \n#> Estimated degrees of freedom:\n#> 4.18 3.56 total = 8.74 \n#> \n#> GCV score: 108.4401\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(gen_additive_mod_fit, 
new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.1\n#> 2 41.2\n#> 3 26.7\n#> 4 55.9\n#> 5 35.2\n#> 6 37.1\n#> 7 38.5\n#> 8 39.6\npredict(gen_additive_mod_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 18.9 27.4\n#> 2 35.7 46.6\n#> 3 22.4 31.0\n#> 4 47.0 64.7\n#> 5 30.1 40.4\n#> 6 32.9 41.2\n#> 7 34.3 42.6\n#> 8 30.3 49.0\n```\n:::\n\n\n## Linear Reg (`linear_reg()`) \n\n## `brulee` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear regression\n#> \n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 \n#> batch size: 83 \n#> scaled validation loss after 1 epoch: 291\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 33.2\n#> 2 30.0\n#> 3 21.3\n#> 4 53.7\n#> 5 42.2\n#> 6 36.2\n#> 7 37.3\n#> 8 51.6\n```\n:::\n\n\n## `gee` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n```\n:::\n\n\n## `glm` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = strength ~ ., family = stats::gaussian, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) cement age \n#> 33.577 8.795 5.471 \n#> \n#> Degrees of Freedom: 91 Total (i.e. 
Null); 89 Residual\n#> Null Deviance:\t 26560 \n#> Residual Deviance: 15480 \tAIC: 740.6\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 28.8 35.4\n#> 2 27.1 33.5\n#> 3 17.3 25.9\n#> 4 44.6 58.1\n#> 5 35.6 45.0\n#> 6 32.3 38.3\n#> 7 33.2 39.4\n#> 8 41.6 56.0\n```\n:::\n\n\n## `glmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\n```\n:::\n\n\n## `glmnet` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"gaussian\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 9.5680\n#> 2 1 5.38 8.7180\n#> 3 1 9.85 7.9430\n#> 4 1 13.56 7.2380\n#> 5 1 16.64 6.5950\n#> 6 2 19.99 6.0090\n#> 7 2 23.68 5.4750\n#> 8 2 26.75 4.9890\n#> 9 2 29.29 4.5450\n#> 10 2 31.40 4.1420\n#> 11 2 33.15 3.7740\n#> 12 2 34.61 3.4380\n#> 13 2 35.82 3.1330\n#> 14 2 36.82 2.8550\n#> 15 2 37.65 2.6010\n#> 16 2 38.34 2.3700\n#> 17 2 38.92 2.1590\n#> 18 2 39.39 1.9680\n#> 19 2 39.79 1.7930\n#> 20 2 40.12 1.6340\n#> 21 2 40.39 1.4880\n#> 22 2 40.62 1.3560\n#> 23 2 40.80 1.2360\n#> 24 2 40.96 1.1260\n#> 25 2 41.09 1.0260\n#> 26 2 41.20 0.9348\n#> 27 2 41.29 0.8517\n#> 28 2 41.36 0.7761\n#> 29 2 41.42 0.7071\n#> 30 2 41.47 0.6443\n#> 31 2 41.52 0.5871\n#> 32 2 41.55 0.5349\n#> 33 2 41.58 0.4874\n#> 34 2 41.60 0.4441\n#> 35 2 41.63 0.4046\n#> 36 2 41.64 0.3687\n#> 37 2 41.66 0.3359\n#> 38 2 41.67 0.3061\n#> 39 2 41.68 0.2789\n#> 40 2 41.68 0.2541\n#> 41 2 41.69 0.2316\n#> 42 2 41.70 0.2110\n#> 43 2 41.70 0.1922\n#> 44 2 41.71 0.1752\n#> 45 2 41.71 0.1596\n#> 46 2 41.71 0.1454\n#> 47 2 41.71 0.1325\n#> 48 2 41.71 0.1207\n#> 49 2 41.72 0.1100\n#> 50 2 41.72 0.1002\n#> 51 2 41.72 0.0913\n#> 52 2 41.72 0.0832\n#> 53 2 41.72 0.0758\n#> 54 2 41.72 0.0691\n#> 55 2 41.72 0.0630\n#> 56 2 41.72 0.0574\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = 
reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> <dbl>\n#> 1 32.2\n#> 2 30.3\n#> 3 21.7\n#> 4 51.3\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.7\n```\n:::
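\n\n\nBecause glmnet fits an entire regularization path in one pass, predictions at several penalty values can also be requested at once with `multi_predict()`. A minimal sketch (the penalty values here are arbitrary):\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Returns a tibble with a .pred list-column that holds, for each row of\n# reg_test, one prediction per requested penalty value\nmulti_predict(linear_reg_fit, new_data = reg_test, penalty = c(0.01, 0.1))\n```\n:::\n\n\n## `gls` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gls\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: glm\n#> Model ID: GLM_model_R_1763571327438_4147 \n#> GLM Model: summary\n#> family link regularization\n#> 1 gaussian identity Elastic Net (alpha = 0.5, lambda = 0.01903 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 1\n#> training_frame\n#> 1 object_ftjflovkts\n#> \n#> Coefficients: glm coefficients\n#> names coefficients standardized_coefficients\n#> 1 Intercept 33.577283 33.577283\n#> 2 cement 8.708461 8.708461\n#> 3 age 5.422201 5.422201\n#> \n#> H2ORegressionMetrics: glm\n#> ** Reported on training data. **\n#> \n#> MSE: 168.2822\n#> RMSE: 12.97236\n#> MAE: 10.62672\n#> RMSLE: 0.4645554\n#> Mean Residual Deviance : 168.2822\n#> R^2 : 0.4171988\n#> Null Deviance :26564.74\n#> Null D.o.F. :91\n#> Residual Deviance :15481.96\n#> Residual D.o.F. 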
:89\n#> AIC :740.6438\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.7\n#> 4 51.2\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.7\n```\n:::\n\n\n## `keras` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n```\n:::\n\n\n## `lm` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and lm is the default engine so there is no need to set that either.\nlinear_reg_spec <- linear_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = strength ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) cement age \n#> 33.577 8.795 5.471\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 28.8 35.4\n#> 2 27.1 33.5\n#> 3 17.3 25.9\n#> 4 44.6 58.1\n#> 5 35.6 45.0\n#> 6 32.3 38.3\n#> 7 33.2 39.4\n#> 8 41.6 56.0\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 5.72 58.5\n#> 2 3.89 56.7\n#> 3 -4.94 48.2\n#> 4 24.3 78.5\n#> 5 13.7 67.0\n#> 6 8.95 61.7\n#> 7 9.89 62.7\n#> 8 21.6 76.0\n```\n:::\n\n\n## `lme` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"lme\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n```\n:::\n\n\n## `lmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() 
|> \n # This engine works with a single mode so no need to set that\n set_engine(\"lmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n```\n:::\n\n\n## `stan` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> stan_glm\n#> family: gaussian [identity]\n#> formula: strength ~ .\n#> observations: 92\n#> predictors: 3\n#> ------\n#> Median MAD_SD\n#> (Intercept) 33.6 1.4 \n#> cement 8.8 1.4 \n#> age 5.5 1.5 \n#> \n#> Auxiliary parameter(s):\n#> Median MAD_SD\n#> sigma 13.3 1.0 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 28.8 35.6\n#> 2 27.1 33.5\n#> 3 17.3 26.0\n#> 4 44.7 58.0\n#> 5 35.8 45.0\n#> 6 32.3 38.3\n#> 7 33.2 39.5\n#> 8 41.8 55.8\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 6.24 58.5\n#> 2 3.92 56.5\n#> 3 -4.87 48.0\n#> 4 24.2 78.2\n#> 5 14.3 68.1\n#> 6 8.85 61.7\n#> 7 10.8 62.6\n#> 8 22.3 75.6\n```\n:::\n\n\n## `stan_glmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_test)\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> Coefficients:\n#> 
(Intercept) cement blast_furnace_slag fly_ash \n#> -21.80239627 0.12003251 0.10399582 0.08747677 \n#> water superplasticizer coarse_aggregate fine_aggregate \n#> -0.15701342 0.28531613 0.01777782 0.02018358 \n#> age \n#> 0.11678247\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 16.5\n#> 2 19.7\n#> 3 26.1\n#> 4 23.6\n#> 5 24.2\n#> 6 29.1\n#> 7 21.3\n#> 8 24.2\n#> 9 33.9\n#> 10 57.7\n#> # ℹ more rows\n```\n:::\n\n\n## Multivariate Adaptive Regression Splines (`mars()`) \n\n## `earth` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_spec <- mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_fit <- mars_spec |> fit(strength ~ ., data = reg_train)\nmars_fit\n#> parsnip model object\n#> \n#> Selected 4 of 9 terms, and 2 of 2 predictors\n#> Termination condition: RSq changed by less than 0.001 at 9 terms\n#> Importance: age, cement\n#> Number of terms at each degree of interaction: 1 3 (additive model)\n#> GCV 113.532 RSS 8915.965 GRSq 0.6153128 RSq 0.6643684\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mars_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 22.0\n#> 2 43.1\n#> 3 28.1\n#> 4 58.0\n#> 5 33.8\n#> 6 34.9\n#> 7 36.3\n#> 8 43.5\n```\n:::\n\n\n## Neural Networks (`mlp()`) \n\n## `brulee` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> relu activation,\n#> 3 hidden units,\n#> 13 model parameters\n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 83 \n#> learn rate: 0.01 \n#> scaled validation loss after 6 epochs: 0.21\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 21.3\n#> 2 33.9\n#> 3 23.7\n#> 4 46.9\n#> 5 42.3\n#> 6 32.2\n#> 7 34.8\n#> 8 46.9\n```\n:::\n\n\n## `brulee_two_layer` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"brulee_two_layer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> c(relu,relu) activation,\n#> c(3,3) hidden units,\n#> 25 model parameters\n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 
\n#> dropout proportion: 0 \n#> batch size: 83 \n#> learn rate: 0.01 \n#> scaled validation loss after 21 epochs: 0.129\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.8\n#> 2 41.9\n#> 3 26.5\n#> 4 56.6\n#> 5 33.1\n#> 6 40.5\n#> 7 41.5\n#> 8 38.0\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: deeplearning\n#> Model ID: DeepLearning_model_R_1763571327438_4148 \n#> Status of Neuron Layers: predicting .outcome, regression, gaussian distribution, Quadratic loss, 801 weights/biases, 14.5 KB, 920 training samples, mini-batch size 1\n#> layer units type dropout l1 l2 mean_rate rate_rms momentum\n#> 1 1 2 Input 0.00 % NA NA NA NA NA\n#> 2 2 200 Rectifier 0.00 % 0.000000 0.000000 0.005416 0.010833 0.000000\n#> 3 3 1 Linear NA 0.000000 0.000000 0.000501 0.000097 0.000000\n#> mean_weight weight_rms mean_bias bias_rms\n#> 1 NA NA NA NA\n#> 2 -0.009259 0.111978 0.497921 0.008852\n#> 3 -0.003265 0.101694 0.014595 0.000000\n#> \n#> \n#> H2ORegressionMetrics: deeplearning\n#> ** Reported on training data. 
**\n#> ** Metrics reported on full training frame **\n#> \n#> MSE: 156.8178\n#> RMSE: 12.52269\n#> MAE: 9.742575\n#> RMSLE: 0.4096152\n#> Mean Residual Deviance : 156.8178\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 26.9\n#> 2 28.8\n#> 3 18.3\n#> 4 47.1\n#> 5 34.8\n#> 6 31.5\n#> 7 32.5\n#> 8 42.5\n```\n:::\n\n\n## `keras` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n```\n:::\n\n\n## `nnet` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> a 2-5-1 network with 21 weights\n#> inputs: cement age \n#> output(s): strength \n#> options were - linear output units\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 26.0\n#> 2 42.1\n#> 3 29.2\n#> 4 67.8\n#> 5 36.7\n#> 6 33.3\n#> 7 33.3\n#> 8 33.9\n```\n:::\n\n\n## K-Nearest Neighbors (`nearest_neighbor()`) \n\n## `kknn` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_spec <- nearest_neighbor() |>\n # We need to set the mode since this engine works with multiple modes\n # and kknn is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_fit <- nearest_neighbor_spec |> fit(strength ~ ., data = reg_train)\nnearest_neighbor_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> kknn::train.kknn(formula = strength ~ ., data = data, ks = min_rows(5, data, 5))\n#> \n#> Type of response variable: continuous\n#> minimal mean absolute error: 8.257735\n#> Minimal mean squared error: 115.8737\n#> Best kernel: optimal\n#> Best k: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(nearest_neighbor_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 16.3\n#> 2 35.7\n#> 3 27.5\n#> 4 56.7\n#> 5 42.6\n#> 6 41.7\n#> 7 41.2\n#> 8 50.2\n```\n:::\n\n\n## Null Model (`null_model()`) \n\n## `parsnip` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_spec <- null_model() |>\n # We need to set the mode since this engine works with multiple modes\n # and parsnip is the default engine 
so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_fit <- null_model_spec |> fit(strength ~ ., data = reg_train)\nnull_model_fit\n#> parsnip model object\n#> \n#> Null Classification Model\n#> Predicted Value: 33.57728\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(null_model_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 33.6\n#> 2 33.6\n#> 3 33.6\n#> 4 33.6\n#> 5 33.6\n#> 6 33.6\n#> 7 33.6\n#> 8 33.6\n```\n:::\n\n\n## Partial Least Squares (`pls()`) \n\n## `mixOmics` Engine \n\nThis engine requires the plsmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(plsmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_spec <- pls() |>\n # We need to set the mode since this engine works with multiple modes\n # and mixOmics is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_fit <- pls_spec |> fit(strength ~ ., data = reg_train)\npls_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> mixOmics::spls(X = x, Y = y, ncomp = ncomp, keepX = keepX) \n#> \n#> sPLS with a 'regression' mode with 2 sPLS components. \n#> You entered data X of dimensions: 92 2 \n#> You entered data Y of dimensions: 92 1 \n#> \n#> Selection of [2] [2] variables on each of the sPLS components on the X data set. \n#> Selection of [1] [1] variables on each of the sPLS components on the Y data set. 
\n#> \n#> Main numerical outputs: \n#> -------------------- \n#> loading vectors: see object$loadings \n#> variates: see object$variates \n#> variable names: see object$names \n#> \n#> Functions to visualise samples: \n#> -------------------- \n#> plotIndiv, plotArrow \n#> \n#> Functions to visualise variables: \n#> -------------------- \n#> plotVar, plotLoadings, network, cim\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(pls_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\n```\n:::\n\n\n## Poisson Regression (`poisson_reg()`) \n\n## `gee` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train)\npoisson_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = reg_test)\n```\n:::\n\n\n## `glm` Engine \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and glm is the default engine so there is no need to set that either.\npoisson_reg_spec <- poisson_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = num_years ~ ., family = stats::poisson, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) age income \n#> 2.2861 0.2804 0.2822 \n#> \n#> Degrees of Freedom: 1460 Total (i.e. 
Null); 1458 Residual\n#> Null Deviance:\t 7434 \n#> Residual Deviance: 2597 \tAIC: 8446\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.6 \n#> 2 6.66\n#> 3 11.8 \n#> 4 24.8 \n#> 5 26.6 \n#> 6 8.23\n#> 7 32.1 \n#> 8 4.86\n#> 9 28.3\n```\n:::\n\n\n## `glmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train)\npoisson_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = reg_test)\n```\n:::\n\n\n## `glmnet` Engine \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"poisson\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 5.9710\n#> 2 1 10.26 5.4400\n#> 3 1 18.31 4.9570\n#> 4 2 24.84 4.5170\n#> 5 2 32.06 4.1150\n#> 6 2 37.94 3.7500\n#> 7 2 42.73 3.4170\n#> 8 2 46.65 3.1130\n#> 9 2 49.87 2.8370\n#> 10 2 52.51 2.5850\n#> 11 2 54.69 2.3550\n#> 12 2 56.48 2.1460\n#> 13 2 57.96 1.9550\n#> 14 2 59.18 1.7810\n#> 15 2 60.19 1.6230\n#> 16 2 61.03 1.4790\n#> 17 2 61.72 1.3480\n#> 18 2 62.29 1.2280\n#> 19 2 62.76 1.1190\n#> 20 2 63.16 1.0190\n#> 21 2 63.48 0.9289\n#> 22 2 63.75 0.8463\n#> 23 2 63.98 0.7712\n#> 24 2 64.16 0.7026\n#> 25 2 64.31 0.6402\n#> 26 2 64.44 0.5833\n#> 27 2 64.55 0.5315\n#> 28 2 64.64 0.4843\n#> 29 2 64.71 0.4413\n#> 30 2 64.77 0.4021\n#> 31 2 64.82 0.3664\n#> 32 2 64.86 0.3338\n#> 33 2 64.90 0.3042\n#> 34 2 64.92 0.2771\n#> 35 2 64.95 0.2525\n#> 36 2 64.97 0.2301\n#> 37 2 64.98 0.2096\n#> 38 2 65.00 0.1910\n#> 39 2 65.01 0.1741\n#> 40 2 65.02 0.1586\n#> 41 2 65.03 0.1445\n#> 42 2 65.03 0.1317\n#> 43 2 65.04 0.1200\n#> 44 2 65.04 0.1093\n#> 45 2 65.05 0.0996\n#> 46 2 65.05 0.0907\n#> 47 2 65.05 0.0827\n#> 48 2 65.05 0.0753\n#> 49 2 65.06 0.0687\n#> 50 2 65.06 0.0625\n#> 51 2 65.06 0.0570\n#> 52 2 65.06 0.0519\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.4 \n#> 2 6.70\n#> 3 11.8 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.27\n#> 7 31.8 \n#> 8 4.91\n#> 9 28.1\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this 
first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: glm\n#> Model ID: GLM_model_R_1763571327438_4149 \n#> GLM Model: summary\n#> family link regularization\n#> 1 poisson log Elastic Net (alpha = 0.5, lambda = 0.01194 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 4\n#> training_frame\n#> 1 object_xqwzxdmwtf\n#> \n#> Coefficients: glm coefficients\n#> names coefficients standardized_coefficients\n#> 1 Intercept 2.286411 2.286411\n#> 2 age 0.279967 0.279967\n#> 3 income 0.281952 0.281952\n#> \n#> H2ORegressionMetrics: glm\n#> ** Reported on training data. **\n#> \n#> MSE: 18.40519\n#> RMSE: 4.290128\n#> MAE: 3.297048\n#> RMSLE: 0.467537\n#> Mean Residual Deviance : 1.777749\n#> R^2 : 0.6934292\n#> Null Deviance :7434.374\n#> Null D.o.F. :1460\n#> Residual Deviance :2597.291\n#> Residual D.o.F. :1458\n#> AIC :8445.967\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.6 \n#> 2 6.67\n#> 3 11.8 \n#> 4 24.8 \n#> 5 26.5 \n#> 6 8.24\n#> 7 32.0 \n#> 8 4.87\n#> 9 28.2\n```\n:::\n\n\n## `hurdle` Engine \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"hurdle\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> pscl::hurdle(formula = num_years ~ ., data = data)\n#> \n#> Count model coefficients (truncated poisson with log link):\n#> (Intercept) age income \n#> 2.2911 0.2749 0.2820 \n#> \n#> Zero hurdle model coefficients (binomial with logit link):\n#> (Intercept) age income \n#> 24.656 5.611 13.092\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.5 \n#> 2 6.74\n#> 3 11.9 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.32\n#> 7 31.9 \n#> 8 4.89\n#> 9 28.2\n```\n:::\n\n\n## `stan` Engine \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit 
object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train)\npoisson_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = reg_test)\npredict(poisson_reg_fit, type = \"conf_int\", new_data = reg_test)\npredict(poisson_reg_fit, type = \"pred_int\", new_data = reg_test)\n```\n:::\n\n\n## `stan_glmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train)\npoisson_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = reg_test)\npredict(poisson_reg_fit, type = \"pred_int\", new_data = reg_test)\n```\n:::\n\n\n## `zeroinfl` Engine \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"zeroinfl\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\n#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> pscl::zeroinfl(formula = num_years ~ ., data = data)\n#> \n#> Count model coefficients (poisson with log link):\n#> (Intercept) age income \n#> 2.2912 0.2748 0.2821 \n#> \n#> Zero-inflation model coefficients (binomial with logit link):\n#> (Intercept) age income \n#> -48.26 -18.22 -11.72\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.5 \n#> 2 6.74\n#> 3 11.9 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.31\n#> 7 31.9 \n#> 8 4.93\n#> 9 28.2\n```\n:::\n\n\n## Random Forests (`rand_forest()`) \n\n## `aorsf` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random regression forest\n#> \n#> Linear 
combinations: Accelerated Linear regression\n#> N observations: 92\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 13.968\n#> Min observations in leaf: 5\n#> OOB stat value: 0.59\n#> OOB stat type: RSQ\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 25.0\n#> 2 36.6\n#> 3 30.4\n#> 4 55.7\n#> 5 42.0\n#> 6 38.8\n#> 7 40.6\n#> 8 53.5\n```\n:::\n\n\n## `grf` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"grf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\npredict(rand_forest_fit, type = \"conf_int\", new_data = reg_test)\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: drf\n#> Model ID: DRF_model_R_1763571327438_4150 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 21666 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 13 8.90000 12 47 29.82000\n#> \n#> \n#> H2ORegressionMetrics: drf\n#> ** Reported on training data. 
**\n#> ** Metrics reported on Out-Of-Bag training samples **\n#> \n#> MSE: 90.66979\n#> RMSE: 9.522068\n#> MAE: 7.491973\n#> RMSLE: 0.3441902\n#> Mean Residual Deviance : 90.66979\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.8\n#> 2 34.6\n#> 3 29.1\n#> 4 56.9\n#> 5 36.7\n#> 6 36.3\n#> 7 39.6\n#> 8 29.3\n```\n:::\n\n\n## `partykit` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\n\n# Too long to print\n# rand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 16.8\n#> 2 38.2\n#> 3 28.4\n#> 4 49.9\n#> 5 48.5\n#> 6 36.3\n#> 7 38.5\n#> 8 48.6\n```\n:::\n\n\n## `randomForest` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"randomForest\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> randomForest(x = maybe_data_frame(x), y = y) \n#> Type of random forest: regression\n#> Number of trees: 500\n#> No. of variables tried at each split: 1\n#> \n#> Mean of squared residuals: 90.27832\n#> % Var explained: 68.73\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.6\n#> 2 36.6\n#> 3 28.3\n#> 4 57.2\n#> 5 38.5\n#> 6 35.0\n#> 7 38.8\n#> 8 35.1\n```\n:::\n\n\n## `ranger` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n # and ranger is the default engine so there is no need to set that either.\n set_engine(\"ranger\", keep.inbag = TRUE) |> \n # However, we'll set the engine and use the keep.inbag=TRUE option so that we \n # can produce interval predictions. This is not generally required. 
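\n # (keep.inbag = TRUE stores which rows each tree was fit on; ranger's\n # infinitesimal jackknife uses those counts to compute the standard errors\n # behind type = \"conf_int\" below.)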
\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Ranger result\n#> \n#> Call:\n#> ranger::ranger(x = maybe_data_frame(x), y = y, keep.inbag = ~TRUE, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1)) \n#> \n#> Type: Regression \n#> Number of trees: 500 \n#> Sample size: 92 \n#> Number of independent variables: 2 \n#> Mtry: 1 \n#> Target node size: 5 \n#> Variable importance mode: none \n#> Splitrule: variance \n#> OOB prediction error (MSE): 93.38443 \n#> R squared (OOB): 0.6801029\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.0\n#> 2 37.4\n#> 3 28.5\n#> 4 56.5\n#> 5 38.4\n#> 6 35.8\n#> 7 38.5\n#> 8 34.5\npredict(rand_forest_fit, type = \"conf_int\", new_data = reg_test)\n#> Warning in rInfJack(pred = result$predictions, inbag = inbag.counts, used.trees\n#> = 1:num.trees): Sample size <=20, no calibration performed.\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 20.4 27.6\n#> 2 31.3 43.6\n#> 3 24.2 32.7\n#> 4 44.5 68.4\n#> 5 33.4 43.4\n#> 6 31.3 40.4\n#> 7 35.5 41.4\n#> 8 27.0 42.0\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n set_engine(\"spark\") |> \n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> RandomForestRegressionModel: uid=random_forest__9f449384_cf84_4bcb_afa5_43e10c342627, numTrees=20, numFeatures=8\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? 
x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 27.1\n#> 2 28.6\n#> 3 25.9\n#> 4 29.6\n#> 5 16.4\n#> 6 34.5\n#> 7 19.2\n#> 8 30.1\n#> 9 37.5\n#> 10 44.2\n#> # ℹ more rows\n```\n:::\n\n\n## Rule Fit (`rule_fit()`) \n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: rulefit\n#> Model ID: RuleFit_model_R_1763571327438_4151 \n#> Rulefit Model Summary: \n#> family link regularization number_of_predictors_total\n#> 1 gaussian identity Lasso (lambda = 0.9516 ) 1783\n#> number_of_active_predictors number_of_iterations rule_ensemble_size\n#> 1 70 1 1781\n#> number_of_trees number_of_internal_trees min_depth max_depth mean_depth\n#> 1 150 150 0 5 4.00000\n#> min_leaves max_leaves mean_leaves\n#> 1 0 26 11.87333\n#> \n#> \n#> H2ORegressionMetrics: rulefit\n#> ** Reported on training data. **\n#> \n#> MSE: 91.07972\n#> RMSE: 9.54357\n#> MAE: 7.180123\n#> RMSLE: 0.3532356\n#> Mean Residual Deviance : 91.07972\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 27.0\n#> 2 36.1\n#> 3 26.8\n#> 4 49.8\n#> 5 42.2\n#> 6 34.7\n#> 7 39.4\n#> 8 40.8\n```\n:::\n\n\n## `xrf` Engine \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n # and xrf is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> An eXtreme RuleFit model of 179 rules.\n#> \n#> Original Formula:\n#> \n#> strength ~ cement + age\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 27.5\n#> 2 32.0\n#> 3 26.5\n#> 4 52.9\n#> 5 35.9\n#> 6 31.8\n#> 7 46.2\n#> 8 30.8\n```\n:::\n\n\n## Support Vector Machine (Linear Kernel) (`svm_linear()`) \n\n## `kernlab` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"kernlab\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(strength ~ ., 
data = reg_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Linear (vanilla) kernel function. \n#> \n#> Number of Support Vectors : 85 \n#> \n#> Objective Function Value : -47.4495 \n#> Training error : 0.606701\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.4\n#> 2 30.9\n#> 3 21.7\n#> 4 47.1\n#> 5 36.4\n#> 6 33.4\n#> 7 34.2\n#> 8 43.2\n```\n:::\n\n\n## `LiblineaR` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n # and LiblineaR is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(strength ~ ., data = reg_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized L2-loss support vector regression primal (L2R_L2LOSS_SVR)\"\n#> \n#> $Type\n#> [1] 11\n#> \n#> $W\n#> cement age Bias\n#> [1,] 8.665447 5.486263 33.34299\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 31.9\n#> 2 30.1\n#> 3 21.5\n#> 4 50.9\n#> 5 39.9\n#> 6 35.0\n#> 7 36.0\n#> 8 48.3\n```\n:::\n\n\n## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) \n\n## `kernlab` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_spec <- svm_poly() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_fit <- svm_poly_spec |> fit(strength ~ ., data = reg_train)\n#> Setting default kernel parameters\nsvm_poly_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Polynomial kernel function. 
\n#> Hyperparameters : degree = 1 scale = 1 offset = 1 \n#> \n#> Number of Support Vectors : 85 \n#> \n#> Objective Function Value : -47.4495 \n#> Training error : 0.606702\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_poly_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.4\n#> 2 30.9\n#> 3 21.7\n#> 4 47.1\n#> 5 36.4\n#> 6 33.4\n#> 7 34.2\n#> 8 43.2\n```\n:::\n\n\n## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) \n\n## `kernlab` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(strength ~ ., data = reg_train)\nsvm_rbf_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Gaussian Radial Basis kernel function. \n#> Hyperparameter : sigma = 2.50601403779482 \n#> \n#> Number of Support Vectors : 81 \n#> \n#> Objective Function Value : -29.5383 \n#> Training error : 0.206927\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 16.0\n#> 2 33.9\n#> 3 28.7\n#> 4 57.2\n#> 5 37.0\n#> 6 36.2\n#> 7 37.5\n#> 8 40.1\n```\n:::\n\n\n## `liquidSVM` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"liquidSVM\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(strength ~ ., data = reg_train)\nsvm_rbf_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, new_data = reg_test)\n```\n:::\n\n\n# Censored Regression Models\n\nLet's simulate a data set using the prodlim and survival packages: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(survival)\n#> \n#> Attaching package: 'survival'\n#> The following object is masked from 'package:future':\n#> \n#> cluster\nlibrary(prodlim)\n\nset.seed(1000)\ncns_data <- \n SimSurv(250) |> \n mutate(event_time = Surv(time, event)) |> \n select(event_time, X1, X2)\n\ncns_split <- initial_split(cns_data, prop = 0.98)\ncns_split\n#> \n#> <245/5/250>\ncns_train <- training(cns_split)\ncns_test <- testing(cns_split)\n```\n:::\n\n\nFor some types of predictions, we need to specify the _evaluation time(s)_ at which to compute them. 
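Predictions made with type = \"survival\" (or \"hazard\") return a nested `.pred` column with one row per evaluation time. As a sketch, with `surv_fit` standing in for any of the fitted models below, the nesting can be undone with tidyr:\n\n```r\n# Expand the nested predictions into one row per observation and evaluation\n# time; tidyr and dplyr are loaded with tidymodels\npredict(surv_fit, type = \"survival\", new_data = cns_test, eval_time = c(1, 3, 5)) |>\n  mutate(.row = row_number()) |>\n  tidyr::unnest(.pred)\n```\n\n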
We'll use these three times to demonstrate: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\neval_times <- c(1, 3, 5)\n```\n:::\n\n\n## Bagged Decision Trees (`bag_tree()`) \n\n## `rpart` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"censored regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit <- bag_tree_spec |> fit(event_time ~ ., data = cns_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> \n#> Bagging survival trees with 25 bootstrap replications \n#> \n#> Call: bagging.data.frame(formula = event_time ~ ., data = data, cp = ~0, \n#> minsplit = ~2)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.65\n#> 2 4.12\n#> 3 5.03\n#> 4 5.58\n#> 5 4.88\npredict(bag_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\n## Boosted Decision Trees (`boost_tree()`) \n\n## `mboost` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"censored regression\") |> \n set_engine(\"mboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(event_time ~ ., data = cns_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> \n#> \t Model-based Boosting\n#> \n#> Call:\n#> mboost::blackboost(formula = formula, data = data, family = family, control = mboost::boost_control(), tree_controls = partykit::ctree_control(teststat = \"quadratic\", testtype = \"Teststatistic\", mincriterion = 0, minsplit = 10, minbucket = 4, maxdepth = 2, saveinfo = FALSE))\n#> \n#> \n#> \t Cox Partial Likelihood \n#> \n#> Loss function: \n#> \n#> Number of boosting iterations: mstop = 100 \n#> Step size: 0.1 \n#> Offset: 0 \n#> Number of baselearners: 1\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 6.51\n#> 2 3.92\n#> 3 4.51\n#> 4 7.17\n#> 5 4.51\npredict(boost_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(boost_tree_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 0.00839\n#> 2 -1.14 \n#> 3 -0.823 \n#> 4 0.229 \n#> 5 -0.823\n```\n:::\n\n\n## Decision Tree (`decision_tree()`) \n\n## `partykit` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> event_time ~ X1 + X2\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] X2 <= -0.36159\n#> | | [3] X1 <= 0: 13.804 (n = 41)\n#> | | [4] X1 > 0: 8.073 (n = 47)\n#> | [5] X2 > -0.36159\n#> | | [6] X1 <= 0: 6.274 (n = 89)\n#> | | [7] X1 > 0\n#> | | | [8] X2 <= 0.56098: 5.111 (n = 39)\n#> | | | [9] X2 > 0.56098: 2.713 (n = 29)\n#> \n#> Number of inner nodes: 4\n#> Number of terminal nodes: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 6.27\n#> 2 5.11\n#> 3 6.27\n#> 4 6.27\n#> 5 6.27\npredict(decision_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\n## `rpart` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"censored regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> $rpart\n#> n= 245 \n#> \n#> node), split, n, deviance, yval\n#> * denotes terminal node\n#> \n#> 1) root 245 329.03530 1.0000000 \n#> 2) X2< -0.09937043 110 119.05180 0.5464982 \n#> 4) X2< -0.9419799 41 42.43138 0.3153769 \n#> 8) X1< 0.5 20 12.93725 0.1541742 *\n#> 9) X1>=0.5 21 23.29519 0.5656502 *\n#> 5) X2>=-0.9419799 69 67.76223 0.7336317 *\n#> 3) X2>=-0.09937043 135 157.14990 1.7319010 \n#> 6) X1< 0.5 79 66.30972 1.2572690 *\n#> 7) X1>=0.5 56 69.62652 3.0428230 \n#> 14) X2< 1.222057 44 40.33335 2.5072040 *\n#> 15) X2>=1.222057 12 17.95790 6.3934130 *\n#> \n#> $survfit\n#> \n#> Call: prodlim::prodlim(formula = form, data = data)\n#> Stratified Kaplan-Meier estimator for the conditional event time survival function\n#> Discrete predictor variable: rpartFactor (0.154174164904031, 0.565650228981439, 0.733631734872791, 1.25726850344687, 2.50720371146533, 6.39341334321542)\n#> \n#> Right-censored response of a survival model\n#> \n#> No.Observations: 245 \n#> \n#> Pattern:\n#> Freq\n#> event 161 \n#> right.censored 84 \n#> \n#> $levels\n#> [1] \"0.154174164904031\" \"0.565650228981439\" \"0.733631734872791\"\n#> [4] \"1.25726850344687\" \"2.50720371146533\" \"6.39341334321542\" \n#> \n#> attr(,\"class\")\n#> [1] \"pecRpart\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: 
{.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 1.26\n#> 2 2.51\n#> 3 1.26\n#> 4 1.26\n#> 5 1.26\npredict(decision_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\n## Proportional Hazards (`proportional_hazards()`) \n\n## `glmnet` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_spec <- proportional_hazards(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train)\nproportional_hazards_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = data_obj$x, y = data_obj$y, family = \"cox\", weights = weights, alpha = alpha, lambda = lambda) \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.39700\n#> 2 1 0.82 0.36170\n#> 3 1 1.51 0.32960\n#> 4 1 2.07 0.30030\n#> 5 1 2.54 0.27360\n#> 6 1 2.94 0.24930\n#> 7 2 3.28 0.22720\n#> 8 2 3.95 0.20700\n#> 9 2 4.50 0.18860\n#> 10 2 4.95 0.17180\n#> 11 2 5.33 0.15660\n#> 12 2 5.65 0.14270\n#> 13 2 5.91 0.13000\n#> 14 2 6.13 0.11840\n#> 15 2 6.31 0.10790\n#> 16 2 6.46 0.09833\n#> 17 2 6.58 0.08960\n#> 18 2 6.69 0.08164\n#> 19 2 6.77 0.07439\n#> 20 2 6.85 0.06778\n#> 21 2 6.91 0.06176\n#> 22 2 6.96 0.05627\n#> 23 2 7.00 0.05127\n#> 24 2 7.03 0.04672\n#> 25 2 7.06 0.04257\n#> 26 2 7.08 0.03879\n#> 27 2 7.10 0.03534\n#> 28 2 7.12 0.03220\n#> 29 2 7.13 0.02934\n#> 30 2 7.14 0.02673\n#> 31 2 7.15 0.02436\n#> 32 2 7.16 0.02219\n#> 33 2 7.17 0.02022\n#> 34 2 7.17 0.01843\n#> 35 2 7.18 0.01679\n#> 36 2 7.18 0.01530\n#> 37 2 7.18 0.01394\n#> 38 2 7.19 0.01270\n#> 39 2 7.19 0.01157\n#> 40 2 7.19 0.01054\n#> 41 2 7.19 0.00961\n#> 42 2 7.19 0.00875\n#> The training data has been saved for prediction.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(proportional_hazards_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.80\n#> 2 4.21\n#> 3 4.63\n#> 4 5.18\n#> 5 4.42\npredict(proportional_hazards_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(proportional_hazards_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -0.108\n#> 2 -1.43 \n#> 3 -1.23 \n#> 4 -0.993\n#> 5 -1.33\n```\n:::\n\n\n## `survival` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and survival is the default engine so there is no need to set that either.\nproportional_hazards_spec <- proportional_hazards()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train)\nproportional_hazards_fit\n#> parsnip model object\n#> \n#> Call:\n#> survival::coxph(formula = event_time ~ ., data = data, model = TRUE, \n#> x = TRUE)\n#> \n#> coef exp(coef) se(coef) z p\n#> X1 0.99547 2.70599 0.16799 5.926 3.11e-09\n#> X2 0.91398 2.49422 0.09566 9.555 < 2e-16\n#> \n#> Likelihood ratio test=106.8 on 2 df, p=< 2.2e-16\n#> n= 245, number of events= 161\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(proportional_hazards_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.87\n#> 2 4.16\n#> 3 4.62\n#> 4 5.19\n#> 5 4.41\npredict(proportional_hazards_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(proportional_hazards_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -0.111\n#> 2 -1.49 \n#> 3 -1.27 \n#> 4 -1.02 \n#> 5 -1.37\n```\n:::\n\n\n## Random Forests (`rand_forest()`) \n\n## `aorsf` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random survival forest\n#> \n#> Linear combinations: Accelerated Cox regression\n#> N observations: 245\n#> N events: 161\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 12.4\n#> Min observations in leaf: 5\n#> Min events in leaf: 1\n#> OOB stat value: 0.71\n#> OOB stat type: Harrell's C-index\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.98\n#> 2 3.96\n#> 3 4.39\n#> 4 5.53\n#> 5 4.26\npredict(rand_forest_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\n## `partykit` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train)\n\n# Too long to print\n# 
rand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.22\n#> 2 3.99\n#> 3 3.87\n#> 4 5.54\n#> 5 3.87\npredict(rand_forest_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\n## Parametric Survival Models (`survival_reg()`) \n\n## `flexsurv` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_spec <- survival_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"flexsurv\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)\nsurvival_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> flexsurv::flexsurvreg(formula = event_time ~ ., data = data, \n#> dist = \"weibull\")\n#> \n#> Estimates: \n#> data mean est L95% U95% se exp(est) L95% \n#> shape NA 2.11486 1.87774 2.38192 0.12832 NA NA\n#> scale NA 9.34809 8.38852 10.41743 0.51658 NA NA\n#> X1 0.46939 -0.46483 -0.61347 -0.31619 0.07584 0.62824 0.54147\n#> X2 -0.00874 -0.42229 -0.50641 -0.33817 0.04292 0.65554 0.60266\n#> U95% \n#> shape NA\n#> scale NA\n#> X1 0.72892\n#> X2 0.71307\n#> \n#> N = 245, Events: 161, Censored: 84\n#> Total time at risk: 1388.951\n#> Log-likelihood = -427.4387, df = 4\n#> AIC = 862.8774\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(survival_reg_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.87\n#> 2 4.13\n#> 3 4.61\n#> 4 5.16\n#> 5 4.40\npredict(survival_reg_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"hazard\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 2.18\n#> 2 1.54\n#> 3 1.65\n#> 4 1.76\n#> 5 1.60\npredict(survival_reg_fit, type = \"quantile\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_quantile\n#> \n#> 1 [7.47]\n#> 2 [3.92]\n#> 3 [4.37]\n#> 4 [4.9]\n#> 5 [4.18]\n```\n:::\n\n\n## `flexsurvspline` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_spec <- survival_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"flexsurvspline\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)\nsurvival_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> flexsurv::flexsurvspline(formula = event_time ~ ., data = data)\n#> \n#> 
Estimates: \n#> data mean est L95% U95% se exp(est) L95% \n#> gamma0 NA -4.72712 -5.31772 -4.13651 0.30134 NA NA\n#> gamma1 NA 2.11487 1.86338 2.36637 0.12832 NA NA\n#> X1 0.46939 0.98305 0.65928 1.30683 0.16519 2.67261 1.93340\n#> X2 -0.00874 0.89308 0.70943 1.07673 0.09370 2.44265 2.03283\n#> U95% \n#> gamma0 NA\n#> gamma1 NA\n#> X1 3.69444\n#> X2 2.93508\n#> \n#> N = 245, Events: 161, Censored: 84\n#> Total time at risk: 1388.951\n#> Log-likelihood = -427.4387, df = 4\n#> AIC = 862.8774\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(survival_reg_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.87\n#> 2 4.13\n#> 3 4.61\n#> 4 5.16\n#> 5 4.40\npredict(survival_reg_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"hazard\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -4.62\n#> 2 -3.26\n#> 3 -3.49\n#> 4 -3.73\n#> 5 -3.39\npredict(survival_reg_fit, type = \"quantile\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_quantile\n#> \n#> 1 [7.47]\n#> 2 [3.92]\n#> 3 [4.37]\n#> 4 [4.9]\n#> 5 [4.18]\n```\n:::\n\n\n## `survival` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and survival is the default engine so there is no need to set that either.\nsurvival_reg_spec <- survival_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)\nsurvival_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> survival::survreg(formula = event_time ~ ., data = data, model = TRUE)\n#> \n#> Coefficients:\n#> (Intercept) X1 X2 \n#> 2.2351722 -0.4648296 -0.4222887 \n#> \n#> Scale= 0.4728442 \n#> \n#> Loglik(model)= -427.4 Loglik(intercept only)= -481.3\n#> \tChisq= 107.73 on 2 degrees of freedom, p= <2e-16 \n#> n= 245\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(survival_reg_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 8.88\n#> 2 4.67\n#> 3 5.20\n#> 4 5.83\n#> 5 4.97\npredict(survival_reg_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"hazard\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 2.18\n#> 2 1.54\n#> 3 1.65\n#> 4 1.76\n#> 5 1.60\npredict(survival_reg_fit, type = \"quantile\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_quantile\n#> \n#> 1 [7.47]\n#> 2 [3.92]\n#> 3 [4.37]\n#> 4 [4.9]\n#> 5 [4.18]\n```\n:::\n\n\n# Quantile Regression Models\n\nTo demonstrate quantile 
regression, let's make a larger version of our regression data: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(938)\nqnt_split <-\n modeldata::concrete |> \n slice_sample(n = 100) |> \n select(strength = compressive_strength, cement, age) |> \n initial_split(prop = 0.95, strata = strength)\nqnt_split\n#> \n#> <92/8/100>\n\nqnt_rec <- \n recipe(strength ~ ., data = training(qnt_split)) |> \n step_normalize(all_numeric_predictors()) |> \n prep()\n\nqnt_train <- bake(qnt_rec, new_data = NULL)\nqnt_test <- bake(qnt_rec, new_data = testing(qnt_split))\n```\n:::\n\n\nWe'll also predict these quantile levels: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nqnt_lvls <- (1:3) / 4\n```\n:::\n\n\n\n## Linear Regression (`linear_reg()`) \n\n## `quantreg` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n set_engine(\"quantreg\") |> \n set_mode(\"quantile regression\", quantile_levels = qnt_lvls)\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = qnt_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> quantreg::rq(formula = strength ~ ., tau = quantile_levels, data = data)\n#> \n#> Coefficients:\n#> tau= 0.25 tau= 0.50 tau= 0.75\n#> (Intercept) 23.498029 33.265428 42.046031\n#> cement 6.635233 7.955658 8.181235\n#> age 5.566668 9.514832 7.110702\n#> \n#> Degrees of freedom: 92 total; 89 residual\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, type = \"quantile\", new_data = qnt_test)\n#> # A tibble: 8 × 1\n#> .pred_quantile\n#> \n#> 1 [29.2]\n#> 2 [31.5]\n#> 3 [21.4]\n#> 4 [48.3]\n#> 5 [36.6]\n#> 6 [33.8]\n#> 7 [34.6]\n#> 8 [43.8]\n```\n:::\n\n\n## Random Forests (`rand_forest()`) \n\n## `grf` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |> \n set_mode(\"quantile regression\", quantile_levels = qnt_lvls) |>\n set_engine(\"grf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = qnt_train)\nrand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"quantile\", new_data = qnt_test)\n```\n:::\n\n\n\n\n", + "supporting": [], + "filters": [ + "rmarkdown/pagebreak.lua" + ], + "includes": {}, + "engineDependencies": {}, + "preserve": {}, + "postProcess": true + } +} \ No newline at end of file diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 229f6e3d..f16b3ad4 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -13,7 +13,7 @@ toc: true toc-depth: 3 include-after-body: ../../../resources.html execute: - eval: false + eval: true --- ```{r} @@ -23,6 +23,9 @@ execute: #| warning: false #| eval: true source(here::here("common.R")) + +# Indicates to enable or not running Spark code +run_spark <- TRUE ``` ```{r} @@ -76,7 +79,7 @@ single-node Spark cluster running on a laptop: ```{r} #| label: spark-connect -#| eval: true +#| eval: !expr 'run_spark' library(sparklyr) sc <- spark_connect("local", version = 
"4.0.1") ``` @@ -132,7 +135,7 @@ and then use it to create the splits. For this article, we will copy the ```{r} #| label: spark-bin-data -#| eval: true +#| eval: !expr 'run_spark' tbl_two_class <- copy_to(sc, modeldata::two_class_dat) tbl_bin <- sdf_random_split(tbl_two_class, training = 0.994, test = 1-0.994, seed = 100) @@ -577,7 +580,7 @@ We create a model specification via: ```{r} #| label: spec-spark-boost-tree-classification -#| eval: true +#| eval: !expr 'run_spark' boost_tree_spec <- boost_tree() |> set_mode("classification") |> set_engine("spark") @@ -587,7 +590,7 @@ Now we create the model fit object: ```{r} #| label: fit-spark-boost-tree-classification -#| eval: true +#| eval: !expr 'run_spark' boost_tree_fit <- boost_tree_spec |> fit(Class ~ ., data = tbl_bin$training) boost_tree_fit ``` @@ -596,7 +599,7 @@ The holdout data can be predicted: ```{r} #| label: predict-spark-boost-tree-classification -#| eval: true +#| eval: !expr 'run_spark' predict(boost_tree_fit, type = "class", new_data = tbl_bin$test) predict(boost_tree_fit, type = "prob", new_data = tbl_bin$test) ``` @@ -738,7 +741,7 @@ We create a model specification via: ```{r} #| label: spec-spark-decision-tree-classification -#| eval: true +#| eval: !expr 'run_spark' decision_tree_spec <- decision_tree() |> set_mode("classification") ``` @@ -747,7 +750,7 @@ Now we create the model fit object: ```{r} #| label: fit-spark-decision-tree-classification -#| eval: true +#| eval: !expr 'run_spark' decision_tree_fit <- decision_tree_spec |> fit(Class ~ ., data = tbl_bin$training) decision_tree_fit ``` @@ -756,7 +759,7 @@ The holdout data can be predicted: ```{r} #| label: predict-spark-decision-tree-classification -#| eval: true +#| eval: !expr 'run_spark' predict(decision_tree_fit, type = "class", new_data = tbl_bin$test) predict(decision_tree_fit, type = "prob", new_data = tbl_bin$test) ``` @@ -1405,7 +1408,7 @@ We create a model specification via: ```{r} #| label: spec-spark-logistic-reg-classification -#| eval: true +#| eval: !expr 'run_spark' logistic_reg_spec <- logistic_reg() |> set_engine("spark") ``` @@ -1414,7 +1417,7 @@ Now we create the model fit object: ```{r} #| label: fit-spark-logistic-reg-classification -#| eval: true +#| eval: !expr 'run_spark' logistic_reg_fit <- logistic_reg_spec |> fit(Class ~ ., data = tbl_bin$training) logistic_reg_fit ``` @@ -1764,7 +1767,7 @@ We create a model specification via: ```{r} #| label: spec-spark-multinom-reg-classification -#| eval: true +#| eval: !expr 'run_spark' multinom_reg_spec <- multinom_reg() |> set_engine("spark") ``` @@ -1773,7 +1776,7 @@ Now we create the model fit object: ```{r} #| label: fit-spark-multinom-reg-classification -#| eval: true +#| eval: !expr 'run_spark' multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = tbl_mtl$training) multinom_reg_fit ``` @@ -1782,6 +1785,7 @@ The holdout data can be predicted: ```{r} #| label: predict-spark-multinom-reg-classification +#| eval: !expr 'run_spark' predict(multinom_reg_fit, type = "class", new_data = tbl_mtl$test) predict(multinom_reg_fit, type = "prob", new_data = tbl_mtl$test) ``` @@ -2204,7 +2208,7 @@ We create a model specification via: ```{r} #| label: spec-spark-rand-forest-classification -#| eval: true +#| eval: !expr 'run_spark' rand_forest_spec <- rand_forest() |> set_mode("classification") |> set_engine("spark") @@ -2214,7 +2218,7 @@ Now we create the model fit object: ```{r} #| label: fit-spark-rand-forest-classification -#| eval: true +#| eval: !expr 'run_spark' rand_forest_fit <- 
rand_forest_spec |> fit(Class ~ ., data = tbl_bin$training) rand_forest_fit ``` @@ -2222,7 +2226,7 @@ rand_forest_fit The holdout data can be predicted: ```{r} -#| eval: true +#| eval: !expr 'run_spark' #| label: predict-spark-rand-forest-classification predict(rand_forest_fit, type = "class", new_data = tbl_bin$test) predict(rand_forest_fit, type = "prob", new_data = tbl_bin$test) @@ -2507,7 +2511,7 @@ and then use it to create the splits. For this article, we will copy the ```{r} #| label: spark-reg-data -#| eval: true +#| eval: !expr 'run_spark' tbl_concrete <- copy_to(sc, modeldata::concrete) @@ -2873,7 +2877,7 @@ We create a model specification via: ```{r} #| label: spec-spark-boost-tree-regression -#| eval: true +#| eval: !expr 'run_spark' boost_tree_spec <- boost_tree() |> set_mode("regression") |> set_engine("spark") @@ -2883,7 +2887,7 @@ Now we create the model fit object: ```{r} #| label: fit-spark-boost-tree-regression -#| eval: true +#| eval: !expr 'run_spark' boost_tree_fit <- boost_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training) boost_tree_fit ``` @@ -2892,7 +2896,7 @@ The holdout data can be predicted: ```{r} #| label: predict-spark-boost-tree-regression -#| eval: true +#| eval: !expr 'run_spark' predict(boost_tree_fit, new_data = tbl_reg$test) ``` @@ -3002,7 +3006,7 @@ We create a model specification via: ```{r} #| label: spec-spark-decision-tree-regression -#| eval: true +#| eval: !expr 'run_spark' decision_tree_spec <- decision_tree() |> set_mode("regression") |> set_engine("spark") @@ -3012,7 +3016,7 @@ Now we create the model fit object: ```{r} #| label: fit-spark-decision-tree-regression -#| eval: true +#| eval: !expr 'run_spark' decision_tree_fit <- decision_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training) decision_tree_fit ``` @@ -3481,7 +3485,7 @@ We create a model specification via: ```{r} #| label: spec-spark-linear-reg-regression -#| eval: true +#| eval: !expr 'run_spark' linear_reg_spec <- linear_reg() |> set_engine("spark") ``` @@ -3490,7 +3494,7 @@ Now we create the model fit object: ```{r} #| label: fit-spark-linear-reg-regression -#| eval: true +#| eval: !expr 'run_spark' linear_reg_fit <- linear_reg_spec |> fit(compressive_strength ~ ., data = tbl_reg$training) linear_reg_fit ``` @@ -3499,7 +3503,7 @@ The holdout data can be predicted: ```{r} #| label: predict-spark-linear-reg-regression -#| eval: true +#| eval: !expr 'run_spark' predict(linear_reg_fit, new_data = tbl_reg$test) ``` @@ -4297,7 +4301,7 @@ We create a model specification via: ```{r} #| label: spec-spark-rand-forest-regression -#| eval: true +#| eval: !expr 'run_spark' rand_forest_spec <- rand_forest() |> set_engine("spark") |> set_mode("regression") @@ -4307,7 +4311,7 @@ Now we create the model fit object: ```{r} #| label: fit-spark-rand-forest-regression -#| eval: true +#| eval: !expr 'run_spark' rand_forest_fit <- rand_forest_spec |> fit(compressive_strength ~ ., data = tbl_reg$training) rand_forest_fit ``` @@ -4316,7 +4320,7 @@ The holdout data can be predicted: ```{r} #| label: predict-spark-rand-forest-regression -#| eval: true +#| eval: !expr 'run_spark' predict(rand_forest_fit, new_data = tbl_reg$test) ``` @@ -5072,6 +5076,7 @@ predict(rand_forest_fit, type = "quantile", new_data = qnt_test) ```{r} #| label: spark-disconnect #| include: false +#| eval: !expr 'run_spark' spark_disconnect(sc) ``` From ecea3cb5fa3b52bcc008bfb6ab825c8b7860d3a8 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 20 Nov 2025 09:15:44 -0500 Subject: [PATCH 12/23] Made 
sections for each external install --- learn/models/parsnip-predictions/index.qmd | 190 ++++++++++++++++++++- 1 file changed, 185 insertions(+), 5 deletions(-) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index f16b3ad4..29a66320 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -25,7 +25,8 @@ execute: source(here::here("common.R")) # Indicates to enable or not running Spark code -run_spark <- TRUE +run_spark <- FALSE +run_h2o <- TRUE ``` ```{r} @@ -42,7 +43,7 @@ pkgs <- c("tidymodels", "agua", "baguette", "bonsai", "censored", "discrim", ``` -## Introduction +# Introduction `r article_req_pkgs(pkgs)` @@ -64,6 +65,7 @@ todo - keras3 updates - use `
<details>` for long model prints +- avoid subsection titles capitalizing the engine name (e.g., "CATBOOST") and text within backticks +- set seeds when needed ```{r} #| label: load-tm library(tidymodels) theme_set(theme_bw() + theme(legend.position = "top")) ``` +## External Dependencies + +Some models available in parsnip rely on other computational frameworks for their computations. There may be some additional downloads for engines using **catboost**, **Spark**, **h2o**, **tensorflow**/**keras**, and **torch**. You can expand the sections below to get basic installation instructions. + +
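Before installing anything, it can help to check which engines parsnip currently has registered for a given model type. parsnip's own `show_engines()` helper lists them (engines that live in extension packages, such as bonsai or agua, appear once those packages are loaded):

```{r}
#| label: show-engines
#| eval: false
library(parsnip)

# Engines and modes registered for boosted trees
show_engines("boost_tree")
```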
+ +### catboost + +catboost is a popular boosting framework. Unfortunately, the R package is not available on CRAN. First, go to [https://github.com/catboost/catboost/releases/](https://github.com/catboost/catboost/releases/) and search for "`[R-package]`" to find the most recent release. + +The following code can be used to install and test the package (which requires the glue package to be installed): + +```{r} +#| label: catboost-install +#| eval: false +library(glue) + +# Put the current version number in this variable: +version_number <- "#.##" + +template <- "https://github.com/catboost/catboost/releases/download/v{version_number}/catboost-R-darwin-universal2-{version_number}.tgz" + +target_url <- glue::glue(template) +target_dest <- tempfile() +download.file(target_url, target_dest) + +if (grepl("^mac", .Platform$pkgType)) { + options <- "--no-staged-install" +} else { + options <- character(0) +} + +inst <- glue::glue("R CMD INSTALL {options} {target_dest}") +system(inst) +``` + +To test, fit an example model: + +```{r} +#| label: catboost-test +#| eval: false +library(catboost) + +train_pool_path <- system.file("extdata", "adult_train.1000", package = "catboost") +test_pool_path <- system.file("extdata", "adult_test.1000", package = "catboost") +cd_path <- system.file("extdata", "adult.cd", package = "catboost") +train_pool <- catboost.load_pool(train_pool_path, column_description = cd_path) +test_pool <- catboost.load_pool(test_pool_path, column_description = cd_path) +fit_params <- list( + iterations = 100, + loss_function = 'Logloss', + ignored_features = c(4, 9), + border_count = 32, + depth = 5, + learning_rate = 0.03, + l2_leaf_reg = 3.5, + train_dir = tempdir()) +fit_params +``` + ### Apache Spark -To use [Apache Spark](https://spark.apache.org/) as an engine, we will first -need a connection to a cluster. For this article, we will setup and use a -single-node Spark cluster running on a laptop: +To use [Apache Spark](https://spark.apache.org/) as an engine, we will first install Spark and then need a connection to a cluster. For this article, we will set up and use a single-node Spark cluster running on a laptop. + +First, install sparklyr: + +```{r} +#| label: sparklyr-install +#| eval: false +install.packages("sparklyr") +``` + +and then install the Spark backend. For example, you might use: + +```{r} +#| label: spark-install +#| eval: false +library(sparklyr) +spark_install(version = "4.0") +``` + +Once that is working, you can get ready to fit models using: ```{r} #| label: spark-connect @@ -84,6 +163,59 @@ library(sparklyr) sc <- spark_connect("local", version = "4.0.1") ``` +### h2o + +h2o.ai offers a Java-based, high-performance computing server for machine learning. This can be run locally or externally. There are general installation instructions at [https://docs.h2o.ai/](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html). There is a package on CRAN, but you can also install directly from [h2o](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html#install-in-r) via: + +```{r} +#| label: h2o-download +#| eval: false +install.packages( + "h2o", + type = "source", + repos = "http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R" +) +``` + +After installation is complete, you can start a local server via `h2o::h2o.init()`. + +The tidymodels [agua](https://agua.tidymodels.org/) package has some helpers and will also need to be installed.
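agua is on CRAN, so a standard installation works:

```{r}
#| label: agua-install
#| eval: false
install.packages("agua")
```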
You can use its `h2o_start()` function to start a server too: + +```{r} +#| label: h2o-init +#| eval: !expr 'run_h2o' +library(agua) +h2o_start() +``` + +### TensorFlow and Keras + +R's tensorflow and keras3 packages call Python directly. To enable this, you'll first have to install the keras3 R package: + +```{r} +#| label: keras-install +#| eval: false +install.packages("keras3") +``` + +Once that is done, use: + +```{r} +#| label: tf-install +#| eval: false +keras3::install_keras(backend = "tensorflow") +``` + +There are other options for installation. See [https://tensorflow.rstudio.com/install/index.html](https://tensorflow.rstudio.com/install/index.html) for more details. + +### Torch + +R's torch package is the low-level package that contains the framework. Once you have installed it, you will get this message the first time you load the package: + +> Additional software needs to be downloaded and installed for torch to work correctly. + +Choosing "Yes" will do the _one-time_ installation. + +
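To confirm that the one-time download succeeded, a minimal smoke test can help (both functions below are exported by the torch package):

```{r}
#| label: torch-check
#| eval: false
library(torch)

# TRUE once the additional software has been installed
torch_is_installed()

# A tiny tensor operation as a quick sanity check
torch_tensor(c(1, 2, 3)) + 1
```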
# Classification Models @@ -136,6 +268,9 @@ and then use it to create the splits. For this article, we will copy the ```{r} #| label: spark-bin-data #| eval: !expr 'run_spark' +library(sparklyr) +sc <- spark_connect("local", version = "4.0.1") + tbl_two_class <- copy_to(sc, modeldata::two_class_dat) tbl_bin <- sdf_random_split(tbl_two_class, training = 0.994, test = 1-0.994, seed = 100) @@ -452,6 +587,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-boost-tree-classification +#| eval: !expr 'run_h2o' boost_tree_spec <- boost_tree() |> # We need to set the mode since this engine works with multiple modes set_mode("classification") |> @@ -462,6 +598,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-boost-tree-classification +#| eval: !expr 'run_h2o' boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) boost_tree_fit ``` @@ -470,6 +607,7 @@ The holdout data can be predicted: ```{r} #| label: predict-h2o-boost-tree-classification +#| eval: !expr 'run_h2o' predict(boost_tree_fit, type = "class", new_data = bin_test) predict(boost_tree_fit, type = "prob", new_data = bin_test) ``` @@ -488,6 +626,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-gbm-boost-tree-classification +#| eval: !expr 'run_h2o' boost_tree_spec <- boost_tree() |> # We need to set the mode since this engine works with multiple modes set_mode("classification") |> @@ -498,6 +637,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-gbm-boost-tree-classification +#| eval: !expr 'run_h2o' boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) boost_tree_fit ``` @@ -506,6 +646,7 @@ The holdout data can be predicted: ```{r} #| label: predict-h2o-gbm-boost-tree-classification +#| eval: !expr 'run_h2o' predict(boost_tree_fit, type = "class", new_data = bin_test) predict(boost_tree_fit, type = "prob", new_data = bin_test) ``` @@ -1257,6 +1398,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-logistic-reg-classification +#| eval: !expr 'run_h2o' logistic_reg_spec <- logistic_reg() |> # This engine works with a single mode so no need to set that set_engine("h2o") @@ -1266,6 +1408,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-logistic-reg-classification +#| eval: !expr 'run_h2o' logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) logistic_reg_fit ``` @@ -1274,6 +1417,7 @@ The holdout data can be predicted: ```{r} #| label: predict-h2o-logistic-reg-classification +#| eval: !expr 'run_h2o' predict(logistic_reg_fit, type = "class", new_data = bin_test) predict(logistic_reg_fit, type = "prob", new_data = bin_test) ``` @@ -1534,6 +1678,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-mlp-classification +#| eval: !expr 'run_h2o' mlp_spec <- mlp() |> # We need to set the mode since this engine works with multiple modes set_mode("classification") |> @@ -1544,6 +1689,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-mlp-classification +#| eval: !expr 'run_h2o' mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) mlp_fit ``` @@ -1552,6 +1698,7 @@ The holdout data can be predicted: ```{r} #| label: predict-h2o-mlp-classification +#| eval: !expr 'run_h2o' predict(mlp_fit, type = "class", new_data = bin_test) predict(mlp_fit, type = "prob", new_data = bin_test) ``` @@ -1684,6 +1831,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-multinom-reg-classification +#| eval: !expr 'run_h2o' multinom_reg_spec <- multinom_reg() |> # This engine works 
with a single mode so no need to set that set_engine("h2o") @@ -1693,6 +1841,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-multinom-reg-classification +#| eval: !expr 'run_h2o' multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train) multinom_reg_fit ``` @@ -1701,6 +1850,7 @@ The holdout data can be predicted: ```{r} #| label: predict-h2o-multinom-reg-classification +#| eval: !expr 'run_h2o' predict(multinom_reg_fit, type = "class", new_data = mtl_test) predict(multinom_reg_fit, type = "prob", new_data = mtl_test) ``` @@ -1807,6 +1957,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-naive-Bayes-classification +#| eval: !expr 'run_h2o' naive_Bayes_spec <- naive_Bayes() |> # This engine works with a single mode so no need to set that set_engine("h2o") @@ -1816,6 +1967,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-naive-Bayes-classification +#| eval: !expr 'run_h2o' naive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train) naive_Bayes_fit ``` @@ -1824,6 +1976,7 @@ The holdout data can be predicted: ```{r} #| label: predict-h2o-naive-Bayes-classification +#| eval: !expr 'run_h2o' predict(naive_Bayes_fit, type = "class", new_data = bin_test) predict(naive_Bayes_fit, type = "prob", new_data = bin_test) ``` @@ -2082,6 +2235,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-rand-forest-classification +#| eval: !expr 'run_h2o' rand_forest_spec <- rand_forest() |> # We need to set the mode since this engine works with multiple modes set_mode("classification") |> @@ -2092,6 +2246,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-rand-forest-classification +#| eval: !expr 'run_h2o' rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) rand_forest_fit ``` @@ -2100,6 +2255,7 @@ The holdout data can be predicted: ```{r} #| label: predict-h2o-rand-forest-classification +#| eval: !expr 'run_h2o' predict(rand_forest_fit, type = "class", new_data = bin_test) predict(rand_forest_fit, type = "prob", new_data = bin_test) ``` @@ -2248,6 +2404,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-rule-fit-classification +#| eval: !expr 'run_h2o' rule_fit_spec <- rule_fit() |> # We need to set the mode since this engine works with multiple modes set_mode("classification") |> @@ -2258,6 +2415,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-rule-fit-classification +#| eval: !expr 'run_h2o' rule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train) rule_fit_fit ``` @@ -2266,6 +2424,7 @@ The holdout data can be predicted: ```{r} #| label: predict-h2o-rule-fit-classification +#| eval: !expr 'run_h2o' predict(rule_fit_fit, type = "class", new_data = bin_test) predict(rule_fit_fit, type = "prob", new_data = bin_test) ``` @@ -2513,6 +2672,9 @@ and then use it to create the splits. 
For this article, we will copy the #| label: spark-reg-data #| eval: !expr 'run_spark' +library(sparklyr) +sc <- spark_connect("local", version = "4.0.1") + tbl_concrete <- copy_to(sc, modeldata::concrete) tbl_reg <- sdf_random_split(tbl_concrete, training = 0.95, test = 0.05, seed = 100) @@ -2753,6 +2915,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-boost-tree-regression +#| eval: !expr 'run_h2o' boost_tree_spec <- boost_tree() |> # We need to set the mode since this engine works with multiple modes set_mode("regression") |> @@ -2788,6 +2951,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-gbm-boost-tree-regression +#| eval: !expr 'run_h2o' boost_tree_spec <- boost_tree() |> # We need to set the mode since this engine works with multiple modes set_mode("regression") |> @@ -2798,6 +2962,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-gbm-boost-tree-regression +#| eval: !expr 'run_h2o' boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train) boost_tree_fit ``` @@ -3266,6 +3431,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-linear-reg-regression +#| eval: !expr 'run_h2o' linear_reg_spec <- linear_reg() |> # This engine works with a single mode so no need to set that set_engine("h2o") @@ -3275,6 +3441,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-linear-reg-regression +#| eval: !expr 'run_h2o' linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) linear_reg_fit ``` @@ -3283,6 +3450,7 @@ The holdout data can be predicted: ```{r} #| label: predict-h2o-linear-reg-regression +#| eval: !expr 'run_h2o' predict(linear_reg_fit, new_data = reg_test) ``` @@ -3606,6 +3774,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-mlp-regression +#| eval: !expr 'run_h2o' mlp_spec <- mlp() |> # We need to set the mode since this engine works with multiple modes set_mode("regression") |> @@ -3616,6 +3785,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-mlp-regression +#| eval: !expr 'run_h2o' mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train) mlp_fit ``` @@ -3624,6 +3794,7 @@ The holdout data can be predicted: ```{r} #| label: predict-h2o-mlp-regression +#| eval: !expr 'run_h2o' predict(mlp_fit, new_data = reg_test) ``` @@ -3934,6 +4105,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-poisson-reg-regression +#| eval: !expr 'run_h2o' poisson_reg_spec <- poisson_reg() |> # This engine works with a single mode so no need to set that set_engine("h2o") @@ -3943,6 +4115,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-poisson-reg-regression +#| eval: !expr 'run_h2o' poisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train) poisson_reg_fit ``` @@ -3951,6 +4124,7 @@ The holdout data can be predicted: ```{r} #| label: predict-h2o-poisson-reg-regression +#| eval: !expr 'run_h2o' predict(poisson_reg_fit, new_data = count_test) ``` @@ -4179,6 +4353,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-rand-forest-regression +#| eval: !expr 'run_h2o' rand_forest_spec <- rand_forest() |> # We need to set the mode since this engine works with multiple modes set_mode("regression") |> @@ -4189,6 +4364,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-rand-forest-regression +#| eval: !expr 'run_h2o' rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) rand_forest_fit ``` @@ -4197,6 +4373,7 @@ The holdout data can be predicted: ```{r} #| label: 
predict-h2o-rand-forest-regression +#| eval: !expr 'run_h2o' predict(rand_forest_fit, new_data = reg_test) ``` @@ -4340,6 +4517,7 @@ We create a model specification via: ```{r} #| label: spec-h2o-rule-fit-regression +#| eval: !expr 'run_h2o' rule_fit_spec <- rule_fit() |> # We need to set the mode since this engine works with multiple modes set_mode("regression") |> @@ -4350,6 +4528,7 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-rule-fit-regression +#| eval: !expr 'run_h2o' rule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train) rule_fit_fit ``` @@ -4358,6 +4537,7 @@ The holdout data can be predicted: ```{r} #| label: predict-h2o-rule-fit-regression +#| eval: !expr 'run_h2o' predict(rule_fit_fit, new_data = reg_test) ``` From 38775811eb10bb8a587a7c3f96b62d4439d8d5cb Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 20 Nov 2025 11:17:08 -0600 Subject: [PATCH 13/23] Converts classification models to tabset panels --- learn/models/parsnip-predictions/index.qmd | 391 +++++++++++++-------- 1 file changed, 244 insertions(+), 147 deletions(-) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 29a66320..5eec057d 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -13,7 +13,7 @@ toc: true toc-depth: 3 include-after-body: ../../../resources.html execute: - eval: true + eval: false --- ```{r} @@ -26,7 +26,7 @@ source(here::here("common.R")) # Indicates to enable or not running Spark code run_spark <- FALSE -run_h2o <- TRUE +run_h2o <- FALSE ``` ```{r} @@ -284,7 +284,9 @@ tbl_mtl <- sdf_random_split(tbl_sim_mtl, training = 0.967, test = 1-0.967, seed ## Auto Ml (`auto_ml()`) -## `h2o` Engine +:::{.panel-tabset} + +## `h2o` This engine requires the agua extension package, so let's load this first: @@ -327,9 +329,13 @@ predict(auto_ml_fit, type = "class", new_data = bin_test) predict(auto_ml_fit, type = "prob", new_data = bin_test) ``` +::: + ## Bagged MARS (`bag_mars()`) -## `earth` Engine +:::{.panel-tabset} + +## `earth` This engine requires the baguette extension package, so let's load this first: @@ -365,9 +371,13 @@ predict(bag_mars_fit, type = "class", new_data = bin_test) predict(bag_mars_fit, type = "prob", new_data = bin_test) ``` +::: + ## Bagged Neural Networks (`bag_mlp()`) -## `nnet` Engine +:::{.panel-tabset} + +## `nnet` This engine requires the baguette extension package, so let's load this first: @@ -403,9 +413,13 @@ predict(bag_mlp_fit, type = "class", new_data = bin_test) predict(bag_mlp_fit, type = "prob", new_data = bin_test) ``` +::: + ## Bagged Decision Trees (`bag_tree()`) -## `C5.0` Engine +:::{.panel-tabset} + +## `C5.0` This engine requires the baguette extension package, so let's load this first: @@ -440,7 +454,7 @@ predict(bag_tree_fit, type = "class", new_data = bin_test) predict(bag_tree_fit, type = "prob", new_data = bin_test) ``` -## `rpart` Engine +## `rpart` This engine requires the baguette extension package, so let's load this first: @@ -476,9 +490,13 @@ predict(bag_tree_fit, type = "class", new_data = bin_test) predict(bag_tree_fit, type = "prob", new_data = bin_test) ``` +::: + ## Bayesian Additive Regression Trees (`bart()`) -## `dbarts` Engine +:::{.panel-tabset} + +## `dbarts` We create a model specification via: @@ -508,9 +526,13 @@ predict(bart_fit, type = "conf_int", new_data = bin_test) predict(bart_fit, type = "pred_int", new_data = bin_test) ``` +::: + ## Boosted 
Decision Trees (`boost_tree()`) -## `C5.0` Engine +:::{.panel-tabset} + +## `C5.0` We create a model specification via: @@ -537,7 +559,7 @@ predict(boost_tree_fit, type = "class", new_data = bin_test) predict(boost_tree_fit, type = "prob", new_data = bin_test) ``` -## `catboost` Engine +## `catboost` This engine requires the bonsai extension package, so let's load this first: @@ -573,7 +595,7 @@ predict(boost_tree_fit, type = "class", new_data = bin_test) predict(boost_tree_fit, type = "prob", new_data = bin_test) ``` -## `h2o` Engine +## `h2o` This engine requires the agua extension package, so let's load this first: @@ -612,7 +634,7 @@ predict(boost_tree_fit, type = "class", new_data = bin_test) predict(boost_tree_fit, type = "prob", new_data = bin_test) ``` -## `h2o_gbm` Engine +## `h2o_gbm` This engine requires the agua extension package, so let's load this first: @@ -651,7 +673,7 @@ predict(boost_tree_fit, type = "class", new_data = bin_test) predict(boost_tree_fit, type = "prob", new_data = bin_test) ``` -## `lightgbm` Engine +## `lightgbm` This engine requires the bonsai extension package, so let's load this first: @@ -687,7 +709,7 @@ predict(boost_tree_fit, type = "class", new_data = bin_test) predict(boost_tree_fit, type = "prob", new_data = bin_test) ``` -## `xgboost` Engine +## `xgboost` We create a model specification via: @@ -715,7 +737,7 @@ predict(boost_tree_fit, type = "class", new_data = bin_test) predict(boost_tree_fit, type = "prob", new_data = bin_test) ``` -## `spark` Engine +## `spark` We create a model specification via: @@ -745,10 +767,13 @@ predict(boost_tree_fit, type = "class", new_data = tbl_bin$test) predict(boost_tree_fit, type = "prob", new_data = tbl_bin$test) ``` +::: ## C5 Rules (`C5_rules()`) -## `C5.0` Engine +:::{.panel-tabset} + +## `C5.0` This engine requires the rules extension package, so let's load this first: @@ -783,9 +808,13 @@ predict(C5_rules_fit, type = "class", new_data = bin_test) predict(C5_rules_fit, type = "prob", new_data = bin_test) ``` +::: + ## Decision Tree (`decision_tree()`) -## `C5.0` Engine +:::{.panel-tabset} + +## `C5.0` We create a model specification via: @@ -812,7 +841,7 @@ predict(decision_tree_fit, type = "class", new_data = bin_test) predict(decision_tree_fit, type = "prob", new_data = bin_test) ``` -## `partykit` Engine +## `partykit` This engine requires the bonsai extension package, so let's load this first: @@ -848,7 +877,7 @@ predict(decision_tree_fit, type = "class", new_data = bin_test) predict(decision_tree_fit, type = "prob", new_data = bin_test) ``` -## `rpart` Engine +## `rpart` We create a model specification via: @@ -876,7 +905,7 @@ predict(decision_tree_fit, type = "class", new_data = bin_test) predict(decision_tree_fit, type = "prob", new_data = bin_test) ``` -## `sparklyr` Engine +## `spark` We create a model specification via: @@ -905,9 +934,13 @@ predict(decision_tree_fit, type = "class", new_data = tbl_bin$test) predict(decision_tree_fit, type = "prob", new_data = tbl_bin$test) ``` +::: + ## Flexible Discriminant Analysis (`discrim_flexible()`) -## `earth` Engine +:::{.panel-tabset} + +## `earth` This engine requires the discrim extension package, so let's load this first: @@ -942,9 +975,13 @@ predict(discrim_flexible_fit, type = "class", new_data = bin_test) predict(discrim_flexible_fit, type = "prob", new_data = bin_test) ``` +::: + ## Linear Discriminant Analysis (`discrim_linear()`) -## `MASS` Engine +:::{.panel-tabset} + +## `MASS` This engine requires the discrim extension package, so let's load 
this first: @@ -979,7 +1016,7 @@ predict(discrim_linear_fit, type = "class", new_data = bin_test) predict(discrim_linear_fit, type = "prob", new_data = bin_test) ``` -## `mda` Engine +## `mda` This engine requires the discrim extension package, so let's load this first: @@ -1014,7 +1051,7 @@ predict(discrim_linear_fit, type = "class", new_data = bin_test) predict(discrim_linear_fit, type = "prob", new_data = bin_test) ``` -## `sda` Engine +## `sda` This engine requires the discrim extension package, so let's load this first: @@ -1049,7 +1086,7 @@ predict(discrim_linear_fit, type = "class", new_data = bin_test) predict(discrim_linear_fit, type = "prob", new_data = bin_test) ``` -## `sparsediscrim` Engine +## `sparsediscrim` This engine requires the discrim extension package, so let's load this first: @@ -1084,9 +1121,13 @@ predict(discrim_linear_fit, type = "class", new_data = bin_test) predict(discrim_linear_fit, type = "prob", new_data = bin_test) ``` +::: + ## Quadratic Discriminant Analysis (`discrim_quad()`) -## `MASS` Engine +:::{.panel-tabset} + +## `MASS` This engine requires the discrim extension package, so let's load this first: @@ -1121,7 +1162,7 @@ predict(discrim_quad_fit, type = "class", new_data = bin_test) predict(discrim_quad_fit, type = "prob", new_data = bin_test) ``` -## `sparsediscrim` Engine +## `sparsediscrim` This engine requires the discrim extension package, so let's load this first: @@ -1156,9 +1197,13 @@ predict(discrim_quad_fit, type = "class", new_data = bin_test) predict(discrim_quad_fit, type = "prob", new_data = bin_test) ``` +::: + ## Regularized Discriminant Analysis (`discrim_regularized()`) -## `klaR` Engine +:::{.panel-tabset} + +## `klaR` This engine requires the discrim extension package, so let's load this first: @@ -1193,9 +1238,13 @@ predict(discrim_regularized_fit, type = "class", new_data = bin_test) predict(discrim_regularized_fit, type = "prob", new_data = bin_test) ``` +::: + ## Generalized Additive Models (`gen_additive_mod()`) -## `mgcv` Engine +:::{.panel-tabset} + +## `mgcv` We create a model specification via: @@ -1226,9 +1275,13 @@ predict(gen_additive_mod_fit, type = "prob", new_data = bin_test) predict(gen_additive_mod_fit, type = "conf_int", new_data = bin_test) ``` +::: + ## Logistic Regression (`logistic_reg()`) -## `brulee` Engine +:::{.panel-tabset} + +## `brulee` We create a model specification via: @@ -1255,7 +1308,7 @@ predict(logistic_reg_fit, type = "class", new_data = bin_test) predict(logistic_reg_fit, type = "prob", new_data = bin_test) ``` -## `gee` Engine +## `gee` This engine requires the multilevelmod extension package, so let's load this first: @@ -1292,7 +1345,7 @@ predict(logistic_reg_fit, type = "class", new_data = bin_test) predict(logistic_reg_fit, type = "prob", new_data = bin_test) ``` -## `glm` Engine +## `glm` We create a model specification via: @@ -1320,7 +1373,7 @@ predict(logistic_reg_fit, type = "prob", new_data = bin_test) predict(logistic_reg_fit, type = "conf_int", new_data = bin_test) ``` -## `glmer` Engine +## `glmer` This engine requires the multilevelmod extension package, so let's load this first: @@ -1357,7 +1410,7 @@ predict(logistic_reg_fit, type = "class", new_data = bin_test) predict(logistic_reg_fit, type = "prob", new_data = bin_test) ``` -## `glmnet` Engine +## `glmnet` We create a model specification via: @@ -1384,7 +1437,7 @@ predict(logistic_reg_fit, type = "class", new_data = bin_test) predict(logistic_reg_fit, type = "prob", new_data = bin_test) ``` -## `h2o` Engine +## `h2o` This 
engine requires the agua extension package, so let's load this first: @@ -1422,7 +1475,7 @@ predict(logistic_reg_fit, type = "class", new_data = bin_test) predict(logistic_reg_fit, type = "prob", new_data = bin_test) ``` -## `keras` Engine +## `keras` We create a model specification via: @@ -1451,7 +1504,7 @@ predict(logistic_reg_fit, type = "class", new_data = bin_test) predict(logistic_reg_fit, type = "prob", new_data = bin_test) ``` -## `LiblineaR` Engine +## `LiblineaR` We create a model specification via: @@ -1478,7 +1531,7 @@ predict(logistic_reg_fit, type = "class", new_data = bin_test) predict(logistic_reg_fit, type = "prob", new_data = bin_test) ``` -## `stan` Engine +## `stan` We create a model specification via: @@ -1507,7 +1560,7 @@ predict(logistic_reg_fit, type = "conf_int", new_data = bin_test) predict(logistic_reg_fit, type = "pred_int", new_data = bin_test) ``` -## `stan_glmer` Engine +## `stan_glmer` This engine requires the multilevelmod extension package, so let's load this first: @@ -1546,7 +1599,7 @@ predict(logistic_reg_fit, type = "conf_int", new_data = bin_test) predict(logistic_reg_fit, type = "pred_int", new_data = bin_test) ``` -## `spark` Engine +## `spark` We create a model specification via: @@ -1575,10 +1628,13 @@ predict(logistic_reg_fit, type = "class", new_data = tbl_bin$test) predict(logistic_reg_fit, type = "prob", new_data = tbl_bin$test) ``` +::: ## Multivariate Adaptive Regression Splines (`mars()`) -## `earth` Engine +:::{.panel-tabset} + +## `earth` We create a model specification via: @@ -1606,9 +1662,13 @@ predict(mars_fit, type = "class", new_data = bin_test) predict(mars_fit, type = "prob", new_data = bin_test) ``` +::: + ## Neural Networks (`mlp()`) -## `brulee` Engine +:::{.panel-tabset} + +## `brulee` We create a model specification via: @@ -1636,7 +1696,7 @@ predict(mlp_fit, type = "class", new_data = bin_test) predict(mlp_fit, type = "prob", new_data = bin_test) ``` -## `brulee_two_layer` Engine +## `brulee_two_layer` We create a model specification via: @@ -1664,7 +1724,7 @@ predict(mlp_fit, type = "class", new_data = bin_test) predict(mlp_fit, type = "prob", new_data = bin_test) ``` -## `h2o` Engine +## `h2o` This engine requires the agua extension package, so let's load this first: @@ -1703,7 +1763,7 @@ predict(mlp_fit, type = "class", new_data = bin_test) predict(mlp_fit, type = "prob", new_data = bin_test) ``` -## `keras` Engine +## `keras` We create a model specification via: @@ -1733,7 +1793,7 @@ predict(mlp_fit, type = "class", new_data = bin_test) predict(mlp_fit, type = "prob", new_data = bin_test) ``` -## `nnet` Engine +## `nnet` We create a model specification via: @@ -1761,9 +1821,13 @@ predict(mlp_fit, type = "class", new_data = bin_test) predict(mlp_fit, type = "prob", new_data = bin_test) ``` +::: + ## Multinom Regression (`multinom_reg()`) -## `brulee` Engine +:::{.panel-tabset} + +## `brulee` We create a model specification via: @@ -1790,7 +1854,7 @@ predict(multinom_reg_fit, type = "class", new_data = mtl_test) predict(multinom_reg_fit, type = "prob", new_data = mtl_test) ``` -## `glmnet` Engine +## `glmnet` We create a model specification via: @@ -1817,7 +1881,7 @@ predict(multinom_reg_fit, type = "class", new_data = mtl_test) predict(multinom_reg_fit, type = "prob", new_data = mtl_test) ``` -## `h2o` Engine +## `h2o` This engine requires the agua extension package, so let's load this first: @@ -1855,7 +1919,7 @@ predict(multinom_reg_fit, type = "class", new_data = mtl_test) predict(multinom_reg_fit, type = "prob", 
new_data = mtl_test) ``` -## `keras` Engine +## `keras` We create a model specification via: @@ -1884,7 +1948,7 @@ predict(multinom_reg_fit, type = "class", new_data = mtl_test) predict(multinom_reg_fit, type = "prob", new_data = mtl_test) ``` -## `nnet` Engine +## `nnet` We create a model specification via: @@ -1911,7 +1975,7 @@ predict(multinom_reg_fit, type = "class", new_data = mtl_test) predict(multinom_reg_fit, type = "prob", new_data = mtl_test) ``` -## `spark` Engine +## `spark` We create a model specification via: @@ -1940,10 +2004,13 @@ predict(multinom_reg_fit, type = "class", new_data = tbl_mtl$test) predict(multinom_reg_fit, type = "prob", new_data = tbl_mtl$test) ``` +::: ## Naive Bayes (`naive_Bayes()`) -## `h2o` Engine +:::{.panel-tabset} + +## `h2o` This engine requires the agua extension package, so let's load this first: @@ -1981,7 +2048,7 @@ predict(naive_Bayes_fit, type = "class", new_data = bin_test) predict(naive_Bayes_fit, type = "prob", new_data = bin_test) ``` -## `klaR` Engine +## `klaR` This engine requires the discrim extension package, so let's load this first: @@ -2018,7 +2085,7 @@ predict(naive_Bayes_fit, type = "class", new_data = bin_test) predict(naive_Bayes_fit, type = "prob", new_data = bin_test) ``` -## `naivebayes` Engine +## `naivebayes` This engine requires the discrim extension package, so let's load this first: @@ -2053,9 +2120,13 @@ predict(naive_Bayes_fit, type = "class", new_data = bin_test) predict(naive_Bayes_fit, type = "prob", new_data = bin_test) ``` +::: + ## K-Nearest Neighbors (`nearest_neighbor()`) -## `kknn` Engine +:::{.panel-tabset} + +## `kknn` We create a model specification via: @@ -2083,9 +2154,13 @@ predict(nearest_neighbor_fit, type = "class", new_data = bin_test) predict(nearest_neighbor_fit, type = "prob", new_data = bin_test) ``` +::: + ## Null Model (`null_model()`) -## `parsnip` Engine +:::{.panel-tabset} + +## `parsnip` We create a model specification via: @@ -2113,9 +2188,13 @@ predict(null_model_fit, type = "class", new_data = bin_test) predict(null_model_fit, type = "prob", new_data = bin_test) ``` +::: + ## Partial Least Squares (`pls()`) -## `mixOmics` Engine +:::{.panel-tabset} + +## `mixOmics` This engine requires the plsmod extension package, so let's load this first: @@ -2151,9 +2230,13 @@ predict(pls_fit, type = "class", new_data = bin_test) predict(pls_fit, type = "prob", new_data = bin_test) ``` +::: + ## Random Forests (`rand_forest()`) -## `aorsf` Engine +:::{.panel-tabset} + +## `aorsf` This engine requires the bonsai extension package, so let's load this first: @@ -2189,7 +2272,7 @@ predict(rand_forest_fit, type = "class", new_data = bin_test) predict(rand_forest_fit, type = "prob", new_data = bin_test) ``` -## `grf` Engine +## `grf` We create a model specification via: @@ -2221,7 +2304,7 @@ predict(rand_forest_fit, type = "prob", new_data = bin_test) predict(rand_forest_fit, type = "conf_int", new_data = bin_test) ``` -## `h2o` Engine +## `h2o` This engine requires the agua extension package, so let's load this first: @@ -2260,7 +2343,7 @@ predict(rand_forest_fit, type = "class", new_data = bin_test) predict(rand_forest_fit, type = "prob", new_data = bin_test) ``` -## `partykit` Engine +## `partykit` This engine requires the bonsai extension package, so let's load this first: @@ -2298,7 +2381,7 @@ predict(rand_forest_fit, type = "class", new_data = bin_test) predict(rand_forest_fit, type = "prob", new_data = bin_test) ``` -## `randomForest` Engine +## `randomForest` We create a model specification via: 
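The specification for this engine is not shown in the hunk above; following the pattern used for the article's other engines, it presumably looks like this (a sketch, with the chunk label inferred from the article's naming scheme):

```{r}
#| label: spec-randomForest-rand-forest-classification
#| eval: false
# A sketch following the article's pattern; not shown in the hunk above
rand_forest_spec <- rand_forest() |>
  # We need to set the mode since this engine works with multiple modes
  set_mode("classification") |>
  set_engine("randomForest")
```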
@@ -2326,7 +2409,7 @@ predict(rand_forest_fit, type = "class", new_data = bin_test) predict(rand_forest_fit, type = "prob", new_data = bin_test) ``` -## `ranger` Engine +## `ranger` We create a model specification via: @@ -2358,7 +2441,7 @@ predict(rand_forest_fit, type = "prob", new_data = bin_test) predict(rand_forest_fit, type = "conf_int", new_data = bin_test) ``` -## `spark` Engine +## `spark` We create a model specification via: @@ -2388,9 +2471,13 @@ predict(rand_forest_fit, type = "class", new_data = tbl_bin$test) predict(rand_forest_fit, type = "prob", new_data = tbl_bin$test) ``` +::: + ## Rule Fit (`rule_fit()`) +:::{.panel-tabset} + ## `h2o` This engine requires the agua extension package, so let's load this first: @@ -2429,7 +2516,7 @@ predict(rule_fit_fit, type = "class", new_data = bin_test) predict(rule_fit_fit, type = "prob", new_data = bin_test) ``` -## `xrf` Engine +## `xrf` This engine requires the rules extension package, so let's load this first: @@ -2467,7 +2554,11 @@ predict(rule_fit_fit, type = "prob", new_data = bin_test) +::: + ## Support Vector Machine (Linear Kernel) (`svm_linear()`) +:::{.panel-tabset} + -## `kernlab` Engine +## `kernlab` We create a model specification via: @@ -2495,7 +2582,7 @@ predict(svm_linear_fit, type = "class", new_data = bin_test) predict(svm_linear_fit, type = "prob", new_data = bin_test) ``` -## `LiblineaR` Engine +## `LiblineaR` We create a model specification via: @@ -2522,9 +2609,13 @@ The holdout data can be predicted: predict(svm_linear_fit, type = "class", new_data = bin_test) ``` +::: + ## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) +:::{.panel-tabset} + ## `kernlab` We create a model specification via: @@ -2552,9 +2643,13 @@ predict(svm_poly_fit, type = "class", new_data = bin_test) predict(svm_poly_fit, type = "prob", new_data = bin_test) ``` +::: + ## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) +:::{.panel-tabset} + ## `kernlab` We create a model specification via: @@ -2582,7 +2677,7 @@ predict(svm_rbf_fit, type = "class", new_data = bin_test) predict(svm_rbf_fit, type = "prob", new_data = bin_test) ``` -## `liquidSVM` Engine +## `liquidSVM` Note that this package is not on CRAN. 
You can install it via its : @@ -2620,6 +2715,8 @@ predict(svm_rbf_fit, type = "class", new_data = bin_test) predict(svm_rbf_fit, type = "prob", new_data = bin_test) ``` +::: + # Regression Models @@ -2683,7 +2780,7 @@ tbl_reg <- sdf_random_split(tbl_concrete, training = 0.95, test = 0.05, seed = 1 ## Auto Ml (`auto_ml()`) -## `h2o` Engine +## `h2o` This engine requires the agua extension package, so let's load this first: @@ -2724,7 +2821,7 @@ predict(auto_ml_fit, new_data = reg_test) ## Bagged MARS (`bag_mars()`) -## `earth` Engine +## `earth` This engine requires the baguette extension package, so let's load this first: @@ -2761,7 +2858,7 @@ predict(bag_mars_fit, new_data = reg_test) ## Bagged Neural Networks (`bag_mlp()`) -## `nnet` Engine +## `nnet` This engine requires the baguette extension package, so let's load this first: @@ -2798,7 +2895,7 @@ predict(bag_mlp_fit, new_data = reg_test) ## Bagged Decision Trees (`bag_tree()`) -## `rpart` Engine +## `rpart` This engine requires the baguette extension package, so let's load this first: @@ -2835,7 +2932,7 @@ predict(bag_tree_fit, new_data = reg_test) ## Bayesian Additive Regression Trees (`bart()`) -## `dbarts` Engine +## `dbarts` We create a model specification via: @@ -2866,7 +2963,7 @@ predict(bart_fit, type = "pred_int", new_data = reg_test) ## Boosted Decision Trees (`boost_tree()`) -## `catboost` Engine +## `catboost` This engine requires the bonsai extension package, so let's load this first: @@ -2901,7 +2998,7 @@ The holdout data can be predicted: predict(boost_tree_fit, new_data = reg_test) ``` -## `h2o` Engine +## `h2o` This engine requires the agua extension package, so let's load this first: @@ -2937,7 +3034,7 @@ The holdout data can be predicted: predict(boost_tree_fit, new_data = reg_test) ``` -## `h2o_gbm` Engine +## `h2o_gbm` This engine requires the agua extension package, so let's load this first: @@ -2974,7 +3071,7 @@ The holdout data can be predicted: predict(boost_tree_fit, new_data = reg_test) ``` -## `lightgbm` Engine +## `lightgbm` This engine requires the bonsai extension package, so let's load this first: @@ -3009,7 +3106,7 @@ The holdout data can be predicted: predict(boost_tree_fit, new_data = reg_test) ``` -## `xgboost` Engine +## `xgboost` We create a model specification via: @@ -3036,7 +3133,7 @@ The holdout data can be predicted: predict(boost_tree_fit, new_data = reg_test) ``` -## `spark` Engine +## `spark` We create a model specification via: @@ -3067,7 +3164,7 @@ predict(boost_tree_fit, new_data = tbl_reg$test) ## Cubist Rules (`cubist_rules()`) -## `Cubist` Engine +## `Cubist` This engine requires the rules extension package, so let's load this first: @@ -3103,7 +3200,7 @@ predict(cubist_rules_fit, new_data = reg_test) ## Decision Tree (`decision_tree()`) -## `partykit` Engine +## `partykit` This engine requires the bonsai extension package, so let's load this first: @@ -3138,7 +3235,7 @@ The holdout data can be predicted: predict(decision_tree_fit, new_data = reg_test) ``` -## `rpart` Engine +## `rpart` We create a model specification via: @@ -3165,7 +3262,7 @@ The holdout data can be predicted: predict(decision_tree_fit, new_data = reg_test) ``` -## `spark` Engine +## `spark` We create a model specification via: @@ -3198,7 +3295,7 @@ predict(decision_tree_fit, new_data = tbl_reg$test) ## Generalized Additive Models (`gen_additive_mod()`) -## `mgcv` Engine +## `mgcv` We create a model specification via: @@ -3230,7 +3327,7 @@ predict(gen_additive_mod_fit, type = "conf_int", new_data = reg_test) ## 
Linear Reg (`linear_reg()`) -## `brulee` Engine +## `brulee` We create a model specification via: @@ -3256,7 +3353,7 @@ The holdout data can be predicted: predict(linear_reg_fit, new_data = reg_test) ``` -## `gee` Engine +## `gee` This engine requires the multilevelmod extension package, so let's load this first: @@ -3292,7 +3389,7 @@ The holdout data can be predicted: predict(linear_reg_fit, new_data = reg_test) ``` -## `glm` Engine +## `glm` We create a model specification via: @@ -3319,7 +3416,7 @@ predict(linear_reg_fit, new_data = reg_test) predict(linear_reg_fit, type = "conf_int", new_data = reg_test) ``` -## `glmer` Engine +## `glmer` This engine requires the multilevelmod extension package, so let's load this first: @@ -3355,7 +3452,7 @@ The holdout data can be predicted: predict(linear_reg_fit, new_data = reg_test) ``` -## `glmnet` Engine +## `glmnet` We create a model specification via: @@ -3381,7 +3478,7 @@ The holdout data can be predicted: predict(linear_reg_fit, new_data = reg_test) ``` -## `gls` Engine +## `gls` This engine requires the multilevelmod extension package, so let's load this first: @@ -3417,7 +3514,7 @@ The holdout data can be predicted: predict(linear_reg_fit, new_data = reg_test) ``` -## `h2o` Engine +## `h2o` This engine requires the agua extension package, so let's load this first: @@ -3454,7 +3551,7 @@ The holdout data can be predicted: predict(linear_reg_fit, new_data = reg_test) ``` -## `keras` Engine +## `keras` We create a model specification via: @@ -3482,7 +3579,7 @@ The holdout data can be predicted: predict(linear_reg_fit, new_data = reg_test) ``` -## `lm` Engine +## `lm` We create a model specification via: @@ -3510,7 +3607,7 @@ predict(linear_reg_fit, type = "conf_int", new_data = reg_test) predict(linear_reg_fit, type = "pred_int", new_data = reg_test) ``` -## `lme` Engine +## `lme` This engine requires the multilevelmod extension package, so let's load this first: @@ -3546,7 +3643,7 @@ The holdout data can be predicted: predict(linear_reg_fit, new_data = reg_test) ``` -## `lmer` Engine +## `lmer` This engine requires the multilevelmod extension package, so let's load this first: @@ -3582,7 +3679,7 @@ The holdout data can be predicted: predict(linear_reg_fit, new_data = reg_test) ``` -## `stan` Engine +## `stan` We create a model specification via: @@ -3610,7 +3707,7 @@ predict(linear_reg_fit, type = "conf_int", new_data = reg_test) predict(linear_reg_fit, type = "pred_int", new_data = reg_test) ``` -## `stan_glmer` Engine +## `stan_glmer` This engine requires the multilevelmod extension package, so let's load this first: @@ -3647,7 +3744,7 @@ predict(linear_reg_fit, new_data = reg_test) predict(linear_reg_fit, type = "pred_int", new_data = reg_test) ``` -## `spark` Engine +## `spark` We create a model specification via: @@ -3677,7 +3774,7 @@ predict(linear_reg_fit, new_data = tbl_reg$test) ## Multivariate Adaptive Regression Splines (`mars()`) -## `earth` Engine +## `earth` We create a model specification via: @@ -3706,7 +3803,7 @@ predict(mars_fit, new_data = reg_test) ## Neural Networks (`mlp()`) -## `brulee` Engine +## `brulee` We create a model specification via: @@ -3733,7 +3830,7 @@ The holdout data can be predicted: predict(mlp_fit, new_data = reg_test) ``` -## `brulee_two_layer` Engine +## `brulee_two_layer` We create a model specification via: @@ -3760,7 +3857,7 @@ The holdout data can be predicted: predict(mlp_fit, new_data = reg_test) ``` -## `h2o` Engine +## `h2o` This engine requires the agua extension package, so let's load this 
first: @@ -3798,7 +3895,7 @@ The holdout data can be predicted: predict(mlp_fit, new_data = reg_test) ``` -## `keras` Engine +## `keras` We create a model specification via: @@ -3827,7 +3924,7 @@ The holdout data can be predicted: predict(mlp_fit, new_data = reg_test) ``` -## `nnet` Engine +## `nnet` We create a model specification via: @@ -3856,7 +3953,7 @@ predict(mlp_fit, new_data = reg_test) ## K-Nearest Neighbors (`nearest_neighbor()`) -## `kknn` Engine +## `kknn` We create a model specification via: @@ -3885,7 +3982,7 @@ predict(nearest_neighbor_fit, new_data = reg_test) ## Null Model (`null_model()`) -## `parsnip` Engine +## `parsnip` We create a model specification via: @@ -3914,7 +4011,7 @@ predict(null_model_fit, new_data = reg_test) ## Partial Least Squares (`pls()`) -## `mixOmics` Engine +## `mixOmics` This engine requires the plsmod extension package, so let's load this first: @@ -3951,7 +4048,7 @@ predict(pls_fit, new_data = reg_test) ## Poisson Reg (`poisson_reg()`) -## `gee` Engine +## `gee` This engine requires the multilevelmod extension package, so let's load this first: @@ -3987,7 +4084,7 @@ The holdout data can be predicted: predict(poisson_reg_fit, new_data = reg_test) ``` -## `glm` Engine +## `glm` This engine requires the poissonreg extension package, so let's load this first: @@ -4021,7 +4118,7 @@ The holdout data can be predicted: predict(poisson_reg_fit, new_data = count_test) ``` -## `glmer` Engine +## `glmer` This engine requires the multilevelmod extension package, so let's load this first: @@ -4057,7 +4154,7 @@ The holdout data can be predicted: predict(poisson_reg_fit, new_data = reg_test) ``` -## `glmnet` Engine +## `glmnet` This engine requires the poissonreg extension package, so let's load this first: @@ -4091,7 +4188,7 @@ The holdout data can be predicted: predict(poisson_reg_fit, new_data = count_test) ``` -## `h2o` Engine +## `h2o` This engine requires the agua extension package, so let's load this first: @@ -4128,7 +4225,7 @@ The holdout data can be predicted: predict(poisson_reg_fit, new_data = count_test) ``` -## `hurdle` Engine +## `hurdle` This engine requires the poissonreg extension package, so let's load this first: @@ -4162,7 +4259,7 @@ The holdout data can be predicted: predict(poisson_reg_fit, new_data = count_test) ``` -## `stan` Engine +## `stan` This engine requires the poissonreg extension package, so let's load this first: @@ -4200,7 +4297,7 @@ predict(poisson_reg_fit, type = "conf_int", new_data = reg_test) predict(poisson_reg_fit, type = "pred_int", new_data = reg_test) ``` -## `stan_glmer` Engine +## `stan_glmer` This engine requires the multilevelmod extension package, so let's load this first: @@ -4237,7 +4334,7 @@ predict(poisson_reg_fit, new_data = reg_test) predict(poisson_reg_fit, type = "pred_int", new_data = reg_test) ``` -## `zeroinfl` Engine +## `zeroinfl` This engine requires the poissonreg extension package, so let's load this first: @@ -4273,7 +4370,7 @@ predict(poisson_reg_fit, new_data = count_test) ## Random Forests (`rand_forest()`) -## `aorsf` Engine +## `aorsf` This engine requires the bonsai extension package, so let's load this first: @@ -4308,7 +4405,7 @@ The holdout data can be predicted: predict(rand_forest_fit, new_data = reg_test) ``` -## `grf` Engine +## `grf` We create a model specification via: @@ -4339,7 +4436,7 @@ predict(rand_forest_fit, new_data = reg_test) predict(rand_forest_fit, type = "conf_int", new_data = reg_test) ``` -## `h2o` Engine +## `h2o` This engine requires the agua extension package, 
so let's load this first: @@ -4377,7 +4474,7 @@ The holdout data can be predicted: predict(rand_forest_fit, new_data = reg_test) ``` -## `partykit` Engine +## `partykit` This engine requires the bonsai extension package, so let's load this first: @@ -4414,7 +4511,7 @@ The holdout data can be predicted: predict(rand_forest_fit, new_data = reg_test) ``` -## `randomForest` Engine +## `randomForest` We create a model specification via: @@ -4441,7 +4538,7 @@ The holdout data can be predicted: predict(rand_forest_fit, new_data = reg_test) ``` -## `ranger` Engine +## `ranger` We create a model specification via: @@ -4472,7 +4569,7 @@ predict(rand_forest_fit, new_data = reg_test) predict(rand_forest_fit, type = "conf_int", new_data = reg_test) ``` -## `spark` Engine +## `spark` We create a model specification via: @@ -4503,7 +4600,7 @@ predict(rand_forest_fit, new_data = tbl_reg$test) ## Rule Fit (`rule_fit()`) -## `h2o` Engine +## `h2o` This engine requires the agua extension package, so let's load this first: @@ -4541,7 +4638,7 @@ The holdout data can be predicted: predict(rule_fit_fit, new_data = reg_test) ``` -## `xrf` Engine +## `xrf` This engine requires the rules extension package, so let's load this first: @@ -4578,7 +4675,7 @@ predict(rule_fit_fit, new_data = reg_test) ## Support Vector Machine (Linear Kernel) (`svm_linear()`) -## `kernlab` Engine +## `kernlab` We create a model specification via: @@ -4605,7 +4702,7 @@ The holdout data can be predicted: predict(svm_linear_fit, new_data = reg_test) ``` -## `LiblineaR` Engine +## `LiblineaR` We create a model specification via: @@ -4634,7 +4731,7 @@ predict(svm_linear_fit, new_data = reg_test) ## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) -## `kernlab` Engine +## `kernlab` We create a model specification via: @@ -4663,7 +4760,7 @@ predict(svm_poly_fit, new_data = reg_test) ## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) -## `kernlab` Engine +## `kernlab` We create a model specification via: @@ -4690,7 +4787,7 @@ The holdout data can be predicted: predict(svm_rbf_fit, new_data = reg_test) ``` -## `liquidSVM` Engine +## `liquidSVM` We create a model specification via: @@ -4749,7 +4846,7 @@ eval_times <- c(1, 3, 5) ## Bagged Decision Trees (`bag_tree()`) -## `rpart` Engine +## `rpart` This engine requires the censored extension package, so let's load this first: @@ -4787,7 +4884,7 @@ predict(bag_tree_fit, type = "survival", new_data = cns_test, eval_time = eval_t ## Boosted Decision Trees (`boost_tree()`) -## `mboost` Engine +## `mboost` This engine requires the censored extension package, so let's load this first: @@ -4825,7 +4922,7 @@ predict(boost_tree_fit, type = "linear_pred", new_data = cns_test) ## Decision Tree (`decision_tree()`) -## `partykit` Engine +## `partykit` This engine requires the censored extension package, so let's load this first: @@ -4861,7 +4958,7 @@ predict(decision_tree_fit, type = "time", new_data = cns_test) predict(decision_tree_fit, type = "survival", new_data = cns_test, eval_time = eval_times) ``` -## `rpart` Engine +## `rpart` This engine requires the censored extension package, so let's load this first: @@ -4899,7 +4996,7 @@ predict(decision_tree_fit, type = "survival", new_data = cns_test, eval_time = e ## Proportional Hazards (`proportional_hazards()`) -## `glmnet` Engine +## `glmnet` This engine requires the censored extension package, so let's load this first: @@ -4935,7 +5032,7 @@ predict(proportional_hazards_fit, type = "survival", new_data = cns_test, eval_t 
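+# Predictions with type = "survival" come back as a nested tibble: the
+# .pred column holds one row per eval_time value for each row of new_data.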
predict(proportional_hazards_fit, type = "linear_pred", new_data = cns_test) ``` -## `survival` Engine +## `survival` This engine requires the censored extension package, so let's load this first: @@ -4973,7 +5070,7 @@ predict(proportional_hazards_fit, type = "linear_pred", new_data = cns_test) ## Random Forests (`rand_forest()`) -## `aorsf` Engine +## `aorsf` This engine requires the censored extension package, so let's load this first: @@ -5009,7 +5106,7 @@ predict(rand_forest_fit, type = "time", new_data = cns_test) predict(rand_forest_fit, type = "survival", new_data = cns_test, eval_time = eval_times) ``` -## `partykit` Engine +## `partykit` This engine requires the censored extension package, so let's load this first: @@ -5049,7 +5146,7 @@ predict(rand_forest_fit, type = "survival", new_data = cns_test, eval_time = eva ## Parametric Survival Models (`survival_reg()`) -## `flexsurv` Engine +## `flexsurv` This engine requires the censored extension package, so let's load this first: @@ -5087,7 +5184,7 @@ predict(survival_reg_fit, type = "linear_pred", new_data = cns_test) predict(survival_reg_fit, type = "quantile", new_data = cns_test) ``` -## `flexsurvspline` Engine +## `flexsurvspline` This engine requires the censored extension package, so let's load this first: @@ -5125,7 +5222,7 @@ predict(survival_reg_fit, type = "linear_pred", new_data = cns_test) predict(survival_reg_fit, type = "quantile", new_data = cns_test) ``` -## `survival` Engine +## `survival` This engine requires the censored extension package, so let's load this first: @@ -5196,7 +5293,7 @@ qnt_lvls <- (1:3) / 4 ## Linear Regression (`linear_reg()`) -## `quantreg` Engine +## `quantreg` We create a model specification via: @@ -5224,7 +5321,7 @@ predict(linear_reg_fit, type = "quantile", new_data = qnt_test) ## Random Forests (`rand_forest()`) -## `grf` Engine +## `grf` We create a model specification via: From e6ea950f2755384b9c165cf3468de65a6e88f5da Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Thu, 20 Nov 2025 11:35:27 -0600 Subject: [PATCH 14/23] Completes tabset panel setup --- learn/models/parsnip-predictions/index.qmd | 116 ++++++++++++++++++++- 1 file changed, 114 insertions(+), 2 deletions(-) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 5eec057d..5abd70c5 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -281,7 +281,6 @@ tbl_mtl <- sdf_random_split(tbl_sim_mtl, training = 0.967, test = 1-0.967, seed ``` - ## Auto Ml (`auto_ml()`) :::{.panel-tabset} @@ -2552,8 +2551,12 @@ predict(rule_fit_fit, type = "class", new_data = bin_test) predict(rule_fit_fit, type = "prob", new_data = bin_test) ``` +::: + ## Support Vector Machine (Linear Kernel) (`svm_linear()`) +:::{.panel-tabset} + ## `kernlab` We create a model specification via: @@ -2780,6 +2783,8 @@ tbl_reg <- sdf_random_split(tbl_concrete, training = 0.95, test = 0.05, seed = 1 ## Auto Ml (`auto_ml()`) +:::{.panel-tabset} + ## `h2o` This engine requires the agua extension package, so let's load this first: @@ -2819,8 +2824,12 @@ The holdout data can be predicted: predict(auto_ml_fit, new_data = reg_test) ``` +::: + ## Bagged MARS (`bag_mars()`) +:::{.panel-tabset} + ## `earth` This engine requires the baguette extension package, so let's load this first: @@ -2856,8 +2865,12 @@ The holdout data can be predicted: predict(bag_mars_fit, new_data = reg_test) ``` +::: + ## Bagged Neural Networks 
(`bag_mlp()`) +:::{.panel-tabset} + ## `nnet` This engine requires the baguette extension package, so let's load this first: @@ -2893,8 +2906,12 @@ The holdout data can be predicted: predict(bag_mlp_fit, new_data = reg_test) ``` +::: + ## Bagged Decision Trees (`bag_tree()`) +:::{.panel-tabset} + ## `rpart` This engine requires the baguette extension package, so let's load this first: @@ -2930,8 +2947,12 @@ The holdout data can be predicted: predict(bag_tree_fit, new_data = reg_test) ``` +::: + ## Bayesian Additive Regression Trees (`bart()`) +:::{.panel-tabset} + ## `dbarts` We create a model specification via: @@ -2961,8 +2982,12 @@ predict(bart_fit, type = "conf_int", new_data = reg_test) predict(bart_fit, type = "pred_int", new_data = reg_test) ``` +::: + ## Boosted Decision Trees (`boost_tree()`) +:::{.panel-tabset} + ## `catboost` This engine requires the bonsai extension package, so let's load this first: @@ -3162,8 +3187,12 @@ The holdout data can be predicted: predict(boost_tree_fit, new_data = tbl_reg$test) ``` +::: + ## Cubist Rules (`cubist_rules()`) +:::{.panel-tabset} + ## `Cubist` This engine requires the rules extension package, so let's load this first: @@ -3198,8 +3227,12 @@ The holdout data can be predicted: predict(cubist_rules_fit, new_data = reg_test) ``` +::: + ## Decision Tree (`decision_tree()`) +:::{.panel-tabset} + ## `partykit` This engine requires the bonsai extension package, so let's load this first: @@ -3291,10 +3324,12 @@ The holdout data can be predicted: predict(decision_tree_fit, new_data = tbl_reg$test) ``` - +::: ## Generalized Additive Models (`gen_additive_mod()`) +:::{.panel-tabset} + ## `mgcv` We create a model specification via: @@ -3325,8 +3360,12 @@ predict(gen_additive_mod_fit, new_data = reg_test) predict(gen_additive_mod_fit, type = "conf_int", new_data = reg_test) ``` +::: + ## Linear Reg (`linear_reg()`) +:::{.panel-tabset} + ## `brulee` We create a model specification via: @@ -3772,8 +3811,12 @@ The holdout data can be predicted: predict(linear_reg_fit, new_data = tbl_reg$test) ``` +::: + ## Multivariate Adaptive Regression Splines (`mars()`) +:::{.panel-tabset} + ## `earth` We create a model specification via: @@ -3801,8 +3844,12 @@ The holdout data can be predicted: predict(mars_fit, new_data = reg_test) ``` +::: + ## Neural Networks (`mlp()`) +:::{.panel-tabset} + ## `brulee` We create a model specification via: @@ -3951,8 +3998,12 @@ The holdout data can be predicted: predict(mlp_fit, new_data = reg_test) ``` +::: + ## K-Nearest Neighbors (`nearest_neighbor()`) +:::{.panel-tabset} + ## `kknn` We create a model specification via: @@ -4009,8 +4060,12 @@ The holdout data can be predicted: predict(null_model_fit, new_data = reg_test) ``` +::: + ## Partial Least Squares (`pls()`) +:::{.panel-tabset} + ## `mixOmics` This engine requires the plsmod extension package, so let's load this first: @@ -4046,8 +4101,12 @@ The holdout data can be predicted: predict(pls_fit, new_data = reg_test) ``` +::: + ## Poisson Reg (`poisson_reg()`) +:::{.panel-tabset} + ## `gee` This engine requires the multilevelmod extension package, so let's load this first: @@ -4368,8 +4427,12 @@ The holdout data can be predicted: predict(poisson_reg_fit, new_data = count_test) ``` +::: + ## Random Forests (`rand_forest()`) +:::{.panel-tabset} + ## `aorsf` This engine requires the bonsai extension package, so let's load this first: @@ -4598,8 +4661,12 @@ The holdout data can be predicted: predict(rand_forest_fit, new_data = tbl_reg$test) ``` +::: + ## Rule Fit 
(`rule_fit()`) +:::{.panel-tabset} + ## `h2o` This engine requires the agua extension package, so let's load this first: @@ -4673,8 +4740,12 @@ The holdout data can be predicted: predict(rule_fit_fit, new_data = reg_test) ``` +::: + ## Support Vector Machine (Linear Kernel) (`svm_linear()`) +:::{.panel-tabset} + ## `kernlab` We create a model specification via: @@ -4729,8 +4800,12 @@ The holdout data can be predicted: predict(svm_linear_fit, new_data = reg_test) ``` +::: + ## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) +:::{.panel-tabset} + ## `kernlab` We create a model specification via: @@ -4758,8 +4833,12 @@ The holdout data can be predicted: predict(svm_poly_fit, new_data = reg_test) ``` +::: + ## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) +:::{.panel-tabset} + ## `kernlab` We create a model specification via: @@ -4844,8 +4923,12 @@ For some types of predictions, we need the _evaluation time(s)_ for the predicti eval_times <- c(1, 3, 5) ``` +::: + ## Bagged Decision Trees (`bag_tree()`) +:::{.panel-tabset} + ## `rpart` This engine requires the censored extension package, so let's load this first: @@ -4882,8 +4965,12 @@ predict(bag_tree_fit, type = "time", new_data = cns_test) predict(bag_tree_fit, type = "survival", new_data = cns_test, eval_time = eval_times) ``` +::: + ## Boosted Decision Trees (`boost_tree()`) +:::{.panel-tabset} + ## `mboost` This engine requires the censored extension package, so let's load this first: @@ -4920,8 +5007,12 @@ predict(boost_tree_fit, type = "survival", new_data = cns_test, eval_time = eval predict(boost_tree_fit, type = "linear_pred", new_data = cns_test) ``` +::: + ## Decision Tree (`decision_tree()`) +:::{.panel-tabset} + ## `partykit` This engine requires the censored extension package, so let's load this first: @@ -4994,8 +5085,12 @@ predict(decision_tree_fit, type = "time", new_data = cns_test) predict(decision_tree_fit, type = "survival", new_data = cns_test, eval_time = eval_times) ``` +::: + ## Proportional Hazards (`proportional_hazards()`) +:::{.panel-tabset} + ## `glmnet` This engine requires the censored extension package, so let's load this first: @@ -5068,8 +5163,12 @@ predict(proportional_hazards_fit, type = "survival", new_data = cns_test, eval_t predict(proportional_hazards_fit, type = "linear_pred", new_data = cns_test) ``` +::: + ## Random Forests (`rand_forest()`) +:::{.panel-tabset} + ## `aorsf` This engine requires the censored extension package, so let's load this first: @@ -5144,8 +5243,12 @@ predict(rand_forest_fit, type = "time", new_data = cns_test) predict(rand_forest_fit, type = "survival", new_data = cns_test, eval_time = eval_times) ``` +::: + ## Parametric Survival Models (`survival_reg()`) +:::{.panel-tabset} + ## `flexsurv` This engine requires the censored extension package, so let's load this first: @@ -5290,9 +5393,12 @@ We'll also predict these quantile levels: qnt_lvls <- (1:3) / 4 ``` +::: ## Linear Regression (`linear_reg()`) +:::{.panel-tabset} + ## `quantreg` We create a model specification via: @@ -5319,8 +5425,12 @@ The holdout data can be predicted: predict(linear_reg_fit, type = "quantile", new_data = qnt_test) ``` +::: + ## Random Forests (`rand_forest()`) +:::{.panel-tabset} + ## `grf` We create a model specification via: @@ -5350,6 +5460,8 @@ The holdout data can be predicted: predict(rand_forest_fit, type = "quantile", new_data = qnt_test) ``` +::: + ```{r} #| label: spark-disconnect #| include: false From 2e5f083e6b517a3c844b4b1545cd06a37d9e9faa Mon Sep 17 
00:00:00 2001 From: topepo Date: Thu, 20 Nov 2025 12:51:18 -0500 Subject: [PATCH 15/23] pull sections out of tabsets --- learn/models/parsnip-predictions/index.qmd | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 5abd70c5..5f62e55b 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -4895,6 +4895,8 @@ The holdout data can be predicted: predict(svm_rbf_fit, new_data = reg_test) ``` +::: + # Censored Regression Models Let's simulate a data set using the prodlim and survival packages: @@ -4923,7 +4925,6 @@ For some types of predictions, we need the _evaluation time(s)_ for the predicti eval_times <- c(1, 3, 5) ``` -::: ## Bagged Decision Trees (`bag_tree()`) @@ -5363,6 +5364,8 @@ predict(survival_reg_fit, type = "linear_pred", new_data = cns_test) predict(survival_reg_fit, type = "quantile", new_data = cns_test) ``` +::: + # Quantile Regression Models To demonstrate quantile regression, let's make a larger version of our regression data: @@ -5393,8 +5396,6 @@ We'll also predict these quantile levels: qnt_lvls <- (1:3) / 4 ``` -::: - ## Linear Regression (`linear_reg()`) :::{.panel-tabset} From 0c4803685a96fcf0eb296ee12e44fb17c8a15261 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 20 Nov 2025 17:46:12 -0500 Subject: [PATCH 16/23] move default engine to top and remove automl (for now) --- learn/models/parsnip-predictions/index.qmd | 1244 ++++++++++---------- 1 file changed, 633 insertions(+), 611 deletions(-) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 5f62e55b..024922a0 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -12,8 +12,6 @@ description: | toc: true toc-depth: 3 include-after-body: ../../../resources.html -execute: - eval: false --- ```{r} @@ -25,8 +23,8 @@ source(here::here("common.R")) # Indicates whether or not to run Spark code -run_spark <- FALSE -run_h2o <- FALSE +run_spark <- TRUE +run_h2o <- TRUE ``` ```{r} @@ -60,10 +58,7 @@ The following examples use consistent data sets throughout. todo - multilevel examples - keras3 updates - use `
` for long model prints - avoid subsection titles capitalizing the engine name (e.g., "CATBOOST") and text within backticks - set seeds when needed @@ -160,7 +155,7 @@ Once that is working, you can get ready to fit models using: #| label: spark-connect #| eval: !expr 'run_spark' library(sparklyr) -sc <- spark_connect("local", version = "4.0.1") +sc <- spark_connect("local") ``` ### h2o @@ -269,7 +264,7 @@ and then use it to create the splits. For this article, we will copy the #| label: spark-bin-data #| eval: !expr 'run_spark' library(sparklyr) -sc <- spark_connect("local", version = "4.0.1") +sc <- spark_connect("local") tbl_two_class <- copy_to(sc, modeldata::two_class_dat) @@ -281,55 +276,6 @@ tbl_mtl <- sdf_random_split(tbl_sim_mtl, training = 0.967, test = 1-0.967, seed ``` -## Auto Ml (`auto_ml()`) - -:::{.panel-tabset} - -## `h2o` - -This engine requires the agua extension package, so let's load this first: - -```{r} -#| label: load-h2o-auto-ml-classification-agua -#| output: false -library(agua) - -# and initialize a server -h20_server <- agua::h2o_start() -``` - -We create a model specification via: - -```{r} -#| label: spec-h2o-auto-ml-classification -#| eval: false -auto_ml_spec <- auto_ml() |> - # We dont need to set the engine (since there is only one) but we'll set - # a time limit - set_engine("h2o", max_runtime_secs = 60 * 3) |> - set_mode("classification") -``` - -Now we create the model fit object: - -```{r} -#| label: fit-h2o-auto-ml-classification -#| eval: false -auto_ml_fit <- auto_ml_spec |> fit(class ~ ., data = bin_train) -auto_ml_fit -``` - -The holdout data can be predicted: - -```{r} -#| label: predict-h2o-auto-ml-classification -#| eval: false -predict(auto_ml_fit, type = "class", new_data = bin_test) -predict(auto_ml_fit, type = "prob", new_data = bin_test) -``` - -::: - ## Bagged MARS (`bag_mars()`) :::{.panel-tabset} @@ -418,12 +364,12 @@ predict(bag_mlp_fit, type = "prob", new_data = bin_test) :::{.panel-tabset} -## `C5.0` +## `rpart` This engine requires the baguette extension package, so let's load this first: ```{r} -#| label: load-C5.0-bag-tree-classification-baguette +#| label: load-rpart-bag-tree-classification-baguette #| output: false library(baguette) ``` @@ -431,16 +377,17 @@ library(baguette) We create a model specification via: ```{r} -#| label: spec-C5.0-bag-tree-classification -bag_tree_spec <- bag_tree() |> - set_mode("classification") |> - set_engine("C5.0") +#| label: spec-rpart-bag-tree-classification +bag_tree_spec <- bag_tree() |> + # We need to set the mode since this engine works with multiple modes + # and rpart is the default engine so there is no need to set that either. 
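+ # As an illustrative option, the number of bags could be passed as an
+ # engine argument, e.g., set_engine("rpart", times = 25L).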
+ set_mode("classification") ``` Now we create the model fit object: ```{r} -#| label: fit-C5.0-bag-tree-classification +#| label: fit-rpart-bag-tree-classification bag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train) bag_tree_fit ``` @@ -448,17 +395,17 @@ bag_tree_fit The holdout data can be predicted: ```{r} -#| label: predict-C5.0-bag-tree-classification +#| label: predict-rpart-bag-tree-classification predict(bag_tree_fit, type = "class", new_data = bin_test) predict(bag_tree_fit, type = "prob", new_data = bin_test) ``` -## `rpart` +## `C5.0` This engine requires the baguette extension package, so let's load this first: ```{r} -#| label: load-rpart-bag-tree-classification-baguette +#| label: load-C5.0-bag-tree-classification-baguette #| output: false library(baguette) ``` @@ -466,17 +413,16 @@ library(baguette) We create a model specification via: ```{r} -#| label: spec-rpart-bag-tree-classification -bag_tree_spec <- bag_tree() |> - # We need to set the mode since this engine works with multiple modes - # and rpart is the default engine so there is no need to set that either. - set_mode("classification") +#| label: spec-C5.0-bag-tree-classification +bag_tree_spec <- bag_tree() |> + set_mode("classification") |> + set_engine("C5.0") ``` Now we create the model fit object: ```{r} -#| label: fit-rpart-bag-tree-classification +#| label: fit-C5.0-bag-tree-classification bag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train) bag_tree_fit ``` @@ -484,7 +430,7 @@ bag_tree_fit The holdout data can be predicted: ```{r} -#| label: predict-rpart-bag-tree-classification +#| label: predict-C5.0-bag-tree-classification predict(bag_tree_fit, type = "class", new_data = bin_test) predict(bag_tree_fit, type = "prob", new_data = bin_test) ``` @@ -531,6 +477,34 @@ predict(bart_fit, type = "pred_int", new_data = bin_test) :::{.panel-tabset} +## `xgboost` + +We create a model specification via: + +```{r} +#| label: spec-xgboost-boost-tree-classification +boost_tree_spec <- boost_tree() |> + # We need to set the mode since this engine works with multiple modes + # and xgboost is the default engine so there is no need to set that either. + set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-xgboost-boost-tree-classification +boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-xgboost-boost-tree-classification +predict(boost_tree_fit, type = "class", new_data = bin_test) +predict(boost_tree_fit, type = "prob", new_data = bin_test) +``` + ## `C5.0` We create a model specification via: @@ -708,34 +682,6 @@ predict(boost_tree_fit, type = "class", new_data = bin_test) predict(boost_tree_fit, type = "prob", new_data = bin_test) ``` -## `xgboost` - -We create a model specification via: - -```{r} -#| label: spec-xgboost-boost-tree-classification -boost_tree_spec <- boost_tree() |> - # We need to set the mode since this engine works with multiple modes - # and xgboost is the default engine so there is no need to set that either. 
- set_mode("classification") -``` - -Now we create the model fit object: - -```{r} -#| label: fit-xgboost-boost-tree-classification -boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) -boost_tree_fit -``` - -The holdout data can be predicted: - -```{r} -#| label: predict-xgboost-boost-tree-classification -predict(boost_tree_fit, type = "class", new_data = bin_test) -predict(boost_tree_fit, type = "prob", new_data = bin_test) -``` - ## `spark` We create a model specification via: @@ -813,6 +759,34 @@ predict(C5_rules_fit, type = "prob", new_data = bin_test) :::{.panel-tabset} +## `rpart` + +We create a model specification via: + +```{r} +#| label: spec-rpart-decision-tree-classification +decision_tree_spec <- decision_tree() |> + # We need to set the mode since this engine works with multiple modes + # and rpart is the default engine so there is no need to set that either. + set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-rpart-decision-tree-classification +decision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train) +decision_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-rpart-decision-tree-classification +predict(decision_tree_fit, type = "class", new_data = bin_test) +predict(decision_tree_fit, type = "prob", new_data = bin_test) +``` + ## `C5.0` We create a model specification via: @@ -876,33 +850,6 @@ predict(decision_tree_fit, type = "class", new_data = bin_test) predict(decision_tree_fit, type = "prob", new_data = bin_test) ``` -## `rpart` - -We create a model specification via: - -```{r} -#| label: spec-rpart-decision-tree-classification -decision_tree_spec <- decision_tree() |> - # We need to set the mode since this engine works with multiple modes - # and rpart is the default engine so there is no need to set that either. - set_mode("classification") -``` - -Now we create the model fit object: - -```{r} -#| label: fit-rpart-decision-tree-classification -decision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train) -decision_tree_fit -``` - -The holdout data can be predicted: - -```{r} -#| label: predict-rpart-decision-tree-classification -predict(decision_tree_fit, type = "class", new_data = bin_test) -predict(decision_tree_fit, type = "prob", new_data = bin_test) -``` ## `spark` @@ -1280,21 +1227,21 @@ predict(gen_additive_mod_fit, type = "conf_int", new_data = bin_test) :::{.panel-tabset} -## `brulee` +## `glm` We create a model specification via: ```{r} -#| label: spec-brulee-logistic-reg-classification -logistic_reg_spec <- logistic_reg() |> +#| label: spec-glm-logistic-reg-classification +logistic_reg_spec <- logistic_reg() # This engine works with a single mode so no need to set that - set_engine("brulee") + # and glm is the default engine so there is no need to set that either. 
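+ # To preview the glm() call that parsnip will build at fit time, pipe the
+ # specification into translate(): logistic_reg_spec |> translate()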
``` Now we create the model fit object: ```{r} -#| label: fit-brulee-logistic-reg-classification +#| label: fit-glm-logistic-reg-classification logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) logistic_reg_fit ``` @@ -1302,35 +1249,27 @@ logistic_reg_fit The holdout data can be predicted: ```{r} -#| label: predict-brulee-logistic-reg-classification +#| label: predict-glm-logistic-reg-classification predict(logistic_reg_fit, type = "class", new_data = bin_test) predict(logistic_reg_fit, type = "prob", new_data = bin_test) +predict(logistic_reg_fit, type = "conf_int", new_data = bin_test) ``` -## `gee` - -This engine requires the multilevelmod extension package, so let's load this first: - -```{r} -#| label: load-gee-logistic-reg-classification-multilevelmod -#| output: false -library(multilevelmod) -``` +## `brulee` We create a model specification via: ```{r} -#| label: spec-gee-logistic-reg-classification +#| label: spec-brulee-logistic-reg-classification logistic_reg_spec <- logistic_reg() |> # This engine works with a single mode so no need to set that - set_engine("gee") + set_engine("brulee") ``` Now we create the model fit object: ```{r} -#| label: fit-gee-logistic-reg-classification -#| eval: false +#| label: fit-brulee-logistic-reg-classification logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) logistic_reg_fit ``` @@ -1338,27 +1277,35 @@ logistic_reg_fit The holdout data can be predicted: ```{r} -#| label: predict-gee-logistic-reg-classification -#| eval: false +#| label: predict-brulee-logistic-reg-classification predict(logistic_reg_fit, type = "class", new_data = bin_test) predict(logistic_reg_fit, type = "prob", new_data = bin_test) ``` -## `glm` +## `gee` + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-gee-logistic-reg-classification-multilevelmod +#| output: false +library(multilevelmod) +``` We create a model specification via: ```{r} -#| label: spec-glm-logistic-reg-classification -logistic_reg_spec <- logistic_reg() +#| label: spec-gee-logistic-reg-classification +logistic_reg_spec <- logistic_reg() |> # This engine works with a single mode so no need to set that - # and glm is the default engine so there is no need to set that either. + set_engine("gee") ``` Now we create the model fit object: ```{r} -#| label: fit-glm-logistic-reg-classification +#| label: fit-gee-logistic-reg-classification +#| eval: false logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) logistic_reg_fit ``` @@ -1366,10 +1313,10 @@ logistic_reg_fit The holdout data can be predicted: ```{r} -#| label: predict-glm-logistic-reg-classification +#| label: predict-gee-logistic-reg-classification +#| eval: false predict(logistic_reg_fit, type = "class", new_data = bin_test) predict(logistic_reg_fit, type = "prob", new_data = bin_test) -predict(logistic_reg_fit, type = "conf_int", new_data = bin_test) ``` ## `glmer` @@ -1667,22 +1614,22 @@ predict(mars_fit, type = "prob", new_data = bin_test) :::{.panel-tabset} -## `brulee` +## `nnet` We create a model specification via: ```{r} -#| label: spec-brulee-mlp-classification +#| label: spec-nnet-mlp-classification mlp_spec <- mlp() |> # We need to set the mode since this engine works with multiple modes - set_mode("classification") |> - set_engine("brulee") + # and nnet is the default engine so there is no need to set that either. 
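+ # Main arguments such as hidden_units, penalty, and epochs could also be
+ # set here, e.g., mlp(hidden_units = 10, epochs = 100).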
+ set_mode("classification") ``` Now we create the model fit object: ```{r} -#| label: fit-brulee-mlp-classification +#| label: fit-nnet-mlp-classification mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) mlp_fit ``` @@ -1690,27 +1637,27 @@ mlp_fit The holdout data can be predicted: ```{r} -#| label: predict-brulee-mlp-classification +#| label: predict-nnet-mlp-classification predict(mlp_fit, type = "class", new_data = bin_test) predict(mlp_fit, type = "prob", new_data = bin_test) ``` -## `brulee_two_layer` +## `brulee` We create a model specification via: ```{r} -#| label: spec-brulee-two-layer-mlp-classification +#| label: spec-brulee-mlp-classification mlp_spec <- mlp() |> # We need to set the mode since this engine works with multiple modes set_mode("classification") |> - set_engine("brulee_two_layer") + set_engine("brulee") ``` Now we create the model fit object: ```{r} -#| label: fit-brulee-two-layer-mlp-classification +#| label: fit-brulee-mlp-classification mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) mlp_fit ``` @@ -1718,7 +1665,35 @@ mlp_fit The holdout data can be predicted: ```{r} -#| label: predict-brulee-two-layer-mlp-classification +#| label: predict-brulee-mlp-classification +predict(mlp_fit, type = "class", new_data = bin_test) +predict(mlp_fit, type = "prob", new_data = bin_test) +``` + +## `brulee_two_layer` + +We create a model specification via: + +```{r} +#| label: spec-brulee-two-layer-mlp-classification +mlp_spec <- mlp() |> + # We need to set the mode since this engine works with multiple modes + set_mode("classification") |> + set_engine("brulee_two_layer") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-brulee-two-layer-mlp-classification +mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) +mlp_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-brulee-two-layer-mlp-classification predict(mlp_fit, type = "class", new_data = bin_test) predict(mlp_fit, type = "prob", new_data = bin_test) ``` @@ -1792,40 +1767,39 @@ predict(mlp_fit, type = "class", new_data = bin_test) predict(mlp_fit, type = "prob", new_data = bin_test) ``` +::: + +## Multinom Regression (`multinom_reg()`) + +:::{.panel-tabset} + ## `nnet` We create a model specification via: ```{r} -#| label: spec-nnet-mlp-classification -mlp_spec <- mlp() |> - # We need to set the mode since this engine works with multiple modes - # and nnet is the default engine so there is no need to set that either. - set_mode("classification") +#| label: spec-nnet-multinom-reg-classification +# This engine works with a single mode so no need to set that +# and nnet is the default engine so there is no need to set that either. 
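+# A bit of regularization could be requested via the penalty argument,
+# e.g., multinom_reg(penalty = 0.01).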
+multinom_reg_spec <- multinom_reg() ``` Now we create the model fit object: ```{r} -#| label: fit-nnet-mlp-classification -mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) -mlp_fit +#| label: fit-nnet-multinom-reg-classification +multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train) +multinom_reg_fit ``` The holdout data can be predicted: ```{r} -#| label: predict-nnet-mlp-classification -predict(mlp_fit, type = "class", new_data = bin_test) -predict(mlp_fit, type = "prob", new_data = bin_test) +#| label: predict-nnet-multinom-reg-classification +predict(multinom_reg_fit, type = "class", new_data = mtl_test) +predict(multinom_reg_fit, type = "prob", new_data = mtl_test) ``` -::: - -## Multinom Regression (`multinom_reg()`) - -:::{.panel-tabset} - ## `brulee` We create a model specification via: @@ -1947,33 +1921,6 @@ predict(multinom_reg_fit, type = "class", new_data = mtl_test) predict(multinom_reg_fit, type = "prob", new_data = mtl_test) ``` -## `nnet` - -We create a model specification via: - -```{r} -#| label: spec-nnet-multinom-reg-classification -# This engine works with a single mode so no need to set that -# and nnet is the default engine so there is no need to set that either. -multinom_reg_spec <- multinom_reg() -``` - -Now we create the model fit object: - -```{r} -#| label: fit-nnet-multinom-reg-classification -multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train) -multinom_reg_fit -``` - -The holdout data can be predicted: - -```{r} -#| label: predict-nnet-multinom-reg-classification -predict(multinom_reg_fit, type = "class", new_data = mtl_test) -predict(multinom_reg_fit, type = "prob", new_data = mtl_test) -``` - ## `spark` We create a model specification via: @@ -2071,9 +2018,6 @@ Now we create the model fit object: ```{r} #| label: fit-klaR-naive-Bayes-classification naive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train) - -# No real print method -# naive_Bayes_fit ``` The holdout data can be predicted: @@ -2235,6 +2179,38 @@ predict(pls_fit, type = "prob", new_data = bin_test) :::{.panel-tabset} +## `ranger` + +We create a model specification via: + +```{r} +#| label: spec-ranger-rand-forest-classification +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + # and ranger is the default engine so there is no need to set that either. + set_engine("ranger", keep.inbag = TRUE) |> + # However, we'll set the engine and use the keep.inbag=TRUE option so that we + # can produce interval predictions. This is not generally required. + set_mode("classification") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-ranger-rand-forest-classification +rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-ranger-rand-forest-classification +predict(rand_forest_fit, type = "class", new_data = bin_test) +predict(rand_forest_fit, type = "prob", new_data = bin_test) +predict(rand_forest_fit, type = "conf_int", new_data = bin_test) +``` + ## `aorsf` This engine requires the bonsai extension package, so let's load this first: @@ -2367,10 +2343,16 @@ Now we create the model fit object: ```{r} #| label: fit-partykit-rand-forest-classification rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) +``` + +The print method has a lot of output: -# Too long to print -# rand_forest_fit +
+```{r} +#| label: fit-partykit-rand-forest-classification-print +capture.output(print(rand_forest_fit))[1:100] |> cat(sep = "\n") ``` +
The holdout data can be predicted: @@ -2408,38 +2390,6 @@ predict(rand_forest_fit, type = "class", new_data = bin_test) predict(rand_forest_fit, type = "prob", new_data = bin_test) ``` -## `ranger` - -We create a model specification via: - -```{r} -#| label: spec-ranger-rand-forest-classification -rand_forest_spec <- rand_forest() |> - # We need to set the mode since this engine works with multiple modes - # and ranger is the default engine so there is no need to set that either. - set_engine("ranger", keep.inbag = TRUE) |> - # However, we'll set the engine and use the keep.inbag=TRUE option so that we - # can produce interval predictions. This is not generally required. - set_mode("classification") -``` - -Now we create the model fit object: - -```{r} -#| label: fit-ranger-rand-forest-classification -rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) -rand_forest_fit -``` - -The holdout data can be predicted: - -```{r} -#| label: predict-ranger-rand-forest-classification -predict(rand_forest_fit, type = "class", new_data = bin_test) -predict(rand_forest_fit, type = "prob", new_data = bin_test) -predict(rand_forest_fit, type = "conf_int", new_data = bin_test) -``` - ## `spark` We create a model specification via: @@ -2476,32 +2426,30 @@ predict(rand_forest_fit, type = "prob", new_data = tbl_bin$test) :::{.panel-tabset} -## `h2o` +## `xrf` -This engine requires the agua extension package, so let's load this first: +This engine requires the rules extension package, so let's load this first: ```{r} -#| label: load-h2o-rule-fit-classification-agua +#| label: load-xrf-rule-fit-classification-rules #| output: false -library(agua) +library(rules) ``` We create a model specification via: ```{r} -#| label: spec-h2o-rule-fit-classification -#| eval: !expr 'run_h2o' +#| label: spec-xrf-rule-fit-classification rule_fit_spec <- rule_fit() |> # We need to set the mode since this engine works with multiple modes - set_mode("classification") |> - set_engine("h2o") + # and xrf is the default engine so there is no need to set that either. + set_mode("classification") ``` Now we create the model fit object: ```{r} -#| label: fit-h2o-rule-fit-classification -#| eval: !expr 'run_h2o' +#| label: fit-xrf-rule-fit-classification rule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train) rule_fit_fit ``` @@ -2509,36 +2457,37 @@ rule_fit_fit The holdout data can be predicted: ```{r} -#| label: predict-h2o-rule-fit-classification -#| eval: !expr 'run_h2o' +#| label: predict-xrf-rule-fit-classification predict(rule_fit_fit, type = "class", new_data = bin_test) predict(rule_fit_fit, type = "prob", new_data = bin_test) ``` -## `xrf` +## `h2o` -This engine requires the rules extension package, so let's load this first: +This engine requires the agua extension package, so let's load this first: ```{r} -#| label: load-xrf-rule-fit-classification-rules +#| label: load-h2o-rule-fit-classification-agua #| output: false -library(rules) +library(agua) ``` We create a model specification via: ```{r} -#| label: spec-xrf-rule-fit-classification +#| label: spec-h2o-rule-fit-classification +#| eval: !expr 'run_h2o' rule_fit_spec <- rule_fit() |> # We need to set the mode since this engine works with multiple modes - # and xrf is the default engine so there is no need to set that either. 
- set_mode("classification") + set_mode("classification") |> + set_engine("h2o") ``` Now we create the model fit object: ```{r} -#| label: fit-xrf-rule-fit-classification +#| label: fit-h2o-rule-fit-classification +#| eval: !expr 'run_h2o' rule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train) rule_fit_fit ``` @@ -2546,7 +2495,8 @@ rule_fit_fit The holdout data can be predicted: ```{r} -#| label: predict-xrf-rule-fit-classification +#| label: predict-h2o-rule-fit-classification +#| eval: !expr 'run_h2o' predict(rule_fit_fit, type = "class", new_data = bin_test) predict(rule_fit_fit, type = "prob", new_data = bin_test) ``` @@ -2773,59 +2723,13 @@ and then use it to create the splits. For this article, we will copy the #| eval: !expr 'run_spark' library(sparklyr) -sc <- spark_connect("local", version = "4.0.1") +sc <- spark_connect("local") tbl_concrete <- copy_to(sc, modeldata::concrete) tbl_reg <- sdf_random_split(tbl_concrete, training = 0.95, test = 0.05, seed = 100) ``` - -## Auto Ml (`auto_ml()`) - -:::{.panel-tabset} - -## `h2o` - -This engine requires the agua extension package, so let's load this first: - -```{r} -#| label: load-h2o-auto-ml-regression-agua -#| output: false -library(agua) -``` - -We create a model specification via: - -```{r} -#| label: spec-h2o-auto-ml-regression -#| eval: false -auto_ml_spec <- auto_ml() |> - # We dont need to set the engine (since there is only one) but we'll set - # a time limit - set_engine("h2o", max_runtime_secs = 60 * 3) |> - set_mode("regression") -``` - -Now we create the model fit object: - -```{r} -#| label: fit-h2o-auto-ml-regression -#| eval: false -auto_ml_fit <- auto_ml_spec |> fit(strength ~ ., data = reg_train) -auto_ml_fit -``` - -The holdout data can be predicted: - -```{r} -#| label: predict-h2o-auto-ml-regression -#| eval: false -predict(auto_ml_fit, new_data = reg_test) -``` - -::: - ## Bagged MARS (`bag_mars()`) :::{.panel-tabset} @@ -2988,30 +2892,22 @@ predict(bart_fit, type = "pred_int", new_data = reg_test) :::{.panel-tabset} -## `catboost` - -This engine requires the bonsai extension package, so let's load this first: - -```{r} -#| label: load-catboost-boost-tree-regression-bonsai -#| output: false -library(bonsai) -``` +## `xgboost` We create a model specification via: ```{r} -#| label: spec-catboost-boost-tree-regression +#| label: spec-xgboost-boost-tree-regression boost_tree_spec <- boost_tree() |> # We need to set the mode since this engine works with multiple modes - set_mode("regression") |> - set_engine("catboost") + # and xgboost is the default engine so there is no need to set that either. 
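+ # Typical tuning arguments could be supplied here as well, e.g.,
+ # boost_tree(trees = 500, learn_rate = 0.05).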
+ set_mode("regression") ``` Now we create the model fit object: ```{r} -#| label: fit-catboost-boost-tree-regression +#| label: fit-xgboost-boost-tree-regression boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train) boost_tree_fit ``` @@ -3019,11 +2915,46 @@ boost_tree_fit The holdout data can be predicted: ```{r} -#| label: predict-catboost-boost-tree-regression +#| label: predict-xgboost-boost-tree-regression predict(boost_tree_fit, new_data = reg_test) ``` -## `h2o` +## `catboost` + +This engine requires the bonsai extension package, so let's load this first: + +```{r} +#| label: load-catboost-boost-tree-regression-bonsai +#| output: false +library(bonsai) +``` + +We create a model specification via: + +```{r} +#| label: spec-catboost-boost-tree-regression +boost_tree_spec <- boost_tree() |> + # We need to set the mode since this engine works with multiple modes + set_mode("regression") |> + set_engine("catboost") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-catboost-boost-tree-regression +boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train) +boost_tree_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-catboost-boost-tree-regression +predict(boost_tree_fit, new_data = reg_test) +``` + +## `h2o` This engine requires the agua extension package, so let's load this first: @@ -3131,33 +3062,6 @@ The holdout data can be predicted: predict(boost_tree_fit, new_data = reg_test) ``` -## `xgboost` - -We create a model specification via: - -```{r} -#| label: spec-xgboost-boost-tree-regression -boost_tree_spec <- boost_tree() |> - # We need to set the mode since this engine works with multiple modes - # and xgboost is the default engine so there is no need to set that either. - set_mode("regression") -``` - -Now we create the model fit object: - -```{r} -#| label: fit-xgboost-boost-tree-regression -boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train) -boost_tree_fit -``` - -The holdout data can be predicted: - -```{r} -#| label: predict-xgboost-boost-tree-regression -predict(boost_tree_fit, new_data = reg_test) -``` - ## `spark` We create a model specification via: @@ -3233,30 +3137,22 @@ predict(cubist_rules_fit, new_data = reg_test) :::{.panel-tabset} -## `partykit` - -This engine requires the bonsai extension package, so let's load this first: - -```{r} -#| label: load-partykit-decision-tree-regression-bonsai -#| output: false -library(bonsai) -``` +## `rpart` We create a model specification via: ```{r} -#| label: spec-partykit-decision-tree-regression +#| label: spec-rpart-decision-tree-regression decision_tree_spec <- decision_tree() |> # We need to set the mode since this engine works with multiple modes - set_mode("regression") |> - set_engine("partykit") + # and rpart is the default engine so there is no need to set that either. 
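+ # Main arguments such as cost_complexity and min_n could be set here,
+ # e.g., decision_tree(cost_complexity = 0.001, min_n = 10).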
+ set_mode("regression") ``` Now we create the model fit object: ```{r} -#| label: fit-partykit-decision-tree-regression +#| label: fit-rpart-decision-tree-regression decision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train) decision_tree_fit ``` @@ -3264,26 +3160,34 @@ decision_tree_fit The holdout data can be predicted: ```{r} -#| label: predict-partykit-decision-tree-regression +#| label: predict-rpart-decision-tree-regression predict(decision_tree_fit, new_data = reg_test) ``` -## `rpart` +## `partykit` + +This engine requires the bonsai extension package, so let's load this first: + +```{r} +#| label: load-partykit-decision-tree-regression-bonsai +#| output: false +library(bonsai) +``` We create a model specification via: ```{r} -#| label: spec-rpart-decision-tree-regression +#| label: spec-partykit-decision-tree-regression decision_tree_spec <- decision_tree() |> # We need to set the mode since this engine works with multiple modes - # and rpart is the default engine so there is no need to set that either. - set_mode("regression") + set_mode("regression") |> + set_engine("partykit") ``` Now we create the model fit object: ```{r} -#| label: fit-rpart-decision-tree-regression +#| label: fit-partykit-decision-tree-regression decision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train) decision_tree_fit ``` @@ -3291,7 +3195,7 @@ decision_tree_fit The holdout data can be predicted: ```{r} -#| label: predict-rpart-decision-tree-regression +#| label: predict-partykit-decision-tree-regression predict(decision_tree_fit, new_data = reg_test) ``` @@ -3366,6 +3270,34 @@ predict(gen_additive_mod_fit, type = "conf_int", new_data = reg_test) :::{.panel-tabset} +## `lm` + +We create a model specification via: + +```{r} +#| label: spec-lm-linear-reg-regression +# This engine works with a single mode so no need to set that +# and lm is the default engine so there is no need to set that either. +linear_reg_spec <- linear_reg() +``` + +Now we create the model fit object: + +```{r} +#| label: fit-lm-linear-reg-regression +linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-lm-linear-reg-regression +predict(linear_reg_fit, new_data = reg_test) +predict(linear_reg_fit, type = "conf_int", new_data = reg_test) +predict(linear_reg_fit, type = "pred_int", new_data = reg_test) +``` + ## `brulee` We create a model specification via: @@ -3618,34 +3550,6 @@ The holdout data can be predicted: predict(linear_reg_fit, new_data = reg_test) ``` -## `lm` - -We create a model specification via: - -```{r} -#| label: spec-lm-linear-reg-regression -# This engine works with a single mode so no need to set that -# and lm is the default engine so there is no need to set that either. 
-linear_reg_spec <- linear_reg() -``` - -Now we create the model fit object: - -```{r} -#| label: fit-lm-linear-reg-regression -linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) -linear_reg_fit -``` - -The holdout data can be predicted: - -```{r} -#| label: predict-lm-linear-reg-regression -predict(linear_reg_fit, new_data = reg_test) -predict(linear_reg_fit, type = "conf_int", new_data = reg_test) -predict(linear_reg_fit, type = "pred_int", new_data = reg_test) -``` - ## `lme` This engine requires the multilevelmod extension package, so let's load this first: @@ -3850,6 +3754,33 @@ predict(mars_fit, new_data = reg_test) :::{.panel-tabset} +## `nnet` + +We create a model specification via: + +```{r} +#| label: spec-nnet-mlp-regression +mlp_spec <- mlp() |> + # We need to set the mode since this engine works with multiple modes + # and nnet is the default engine so there is no need to set that either. + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-nnet-mlp-regression +mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train) +mlp_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-nnet-mlp-regression +predict(mlp_fit, new_data = reg_test) +``` + ## `brulee` We create a model specification via: @@ -3971,33 +3902,6 @@ The holdout data can be predicted: predict(mlp_fit, new_data = reg_test) ``` -## `nnet` - -We create a model specification via: - -```{r} -#| label: spec-nnet-mlp-regression -mlp_spec <- mlp() |> - # We need to set the mode since this engine works with multiple modes - # and nnet is the default engine so there is no need to set that either. - set_mode("regression") -``` - -Now we create the model fit object: - -```{r} -#| label: fit-nnet-mlp-regression -mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train) -mlp_fit -``` - -The holdout data can be predicted: - -```{r} -#| label: predict-nnet-mlp-regression -predict(mlp_fit, new_data = reg_test) -``` - ::: ## K-Nearest Neighbors (`nearest_neighbor()`) @@ -4107,42 +4011,6 @@ predict(pls_fit, new_data = reg_test) :::{.panel-tabset} -## `gee` - -This engine requires the multilevelmod extension package, so let's load this first: - -```{r} -#| label: load-gee-poisson-reg-regression-multilevelmod -#| output: false -library(multilevelmod) -``` - -We create a model specification via: - -```{r} -#| label: spec-gee-poisson-reg-regression -poisson_reg_spec <- poisson_reg() |> - # This engine works with a single mode so no need to set that - set_engine("gee") -``` - -Now we create the model fit object: - -```{r} -#| label: fit-gee-poisson-reg-regression -#| eval: false -poisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train) -poisson_reg_fit -``` - -The holdout data can be predicted: - -```{r} -#| label: predict-gee-poisson-reg-regression -#| eval: false -predict(poisson_reg_fit, new_data = reg_test) -``` - ## `glm` This engine requires the poissonreg extension package, so let's load this first: @@ -4177,12 +4045,12 @@ The holdout data can be predicted: predict(poisson_reg_fit, new_data = count_test) ``` -## `glmer` +## `gee` This engine requires the multilevelmod extension package, so let's load this first: ```{r} -#| label: load-glmer-poisson-reg-regression-multilevelmod +#| label: load-gee-poisson-reg-regression-multilevelmod #| output: false library(multilevelmod) ``` @@ -4190,10 +4058,46 @@ library(multilevelmod) We create a model specification via: ```{r} -#| label: spec-glmer-poisson-reg-regression +#| label: 
spec-gee-poisson-reg-regression poisson_reg_spec <- poisson_reg() |> # This engine works with a single mode so no need to set that - set_engine("glmer") + set_engine("gee") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-gee-poisson-reg-regression +#| eval: false +poisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train) +poisson_reg_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-gee-poisson-reg-regression +#| eval: false +predict(poisson_reg_fit, new_data = reg_test) +``` + +## `glmer` + +This engine requires the multilevelmod extension package, so let's load this first: + +```{r} +#| label: load-glmer-poisson-reg-regression-multilevelmod +#| output: false +library(multilevelmod) +``` + +We create a model specification via: + +```{r} +#| label: spec-glmer-poisson-reg-regression +poisson_reg_spec <- poisson_reg() |> + # This engine works with a single mode so no need to set that + set_engine("glmer") ``` Now we create the model fit object: @@ -4433,6 +4337,37 @@ predict(poisson_reg_fit, new_data = count_test) :::{.panel-tabset} +## `ranger` + +We create a model specification via: + +```{r} +#| label: spec-ranger-rand-forest-regression +rand_forest_spec <- rand_forest() |> + # We need to set the mode since this engine works with multiple modes + # and ranger is the default engine so there is no need to set that either. + set_engine("ranger", keep.inbag = TRUE) |> + # However, we'll set the engine and use the keep.inbag=TRUE option so that we + # can produce interval predictions. This is not generally required. + set_mode("regression") +``` + +Now we create the model fit object: + +```{r} +#| label: fit-ranger-rand-forest-regression +rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) +rand_forest_fit +``` + +The holdout data can be predicted: + +```{r} +#| label: predict-ranger-rand-forest-regression +predict(rand_forest_fit, new_data = reg_test) +predict(rand_forest_fit, type = "conf_int", new_data = reg_test) +``` + ## `aorsf` This engine requires the bonsai extension package, so let's load this first: @@ -4562,10 +4497,16 @@ Now we create the model fit object: ```{r} #| label: fit-partykit-rand-forest-regression rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) +``` + +The print method has a lot of output: -# Too long to print -# rand_forest_fit +
+```{r}
+#| label: fit-partykit-rand-forest-regression-print
+# head() avoids padding the output with NA lines if the print is shorter
+# than 100 lines
+head(capture.output(print(rand_forest_fit)), 100) |> cat(sep = "\n")
+```
+</details>
The holdout data can be predicted: @@ -4601,37 +4542,6 @@ The holdout data can be predicted: predict(rand_forest_fit, new_data = reg_test) ``` -## `ranger` - -We create a model specification via: - -```{r} -#| label: spec-ranger-rand-forest-regression -rand_forest_spec <- rand_forest() |> - # We need to set the mode since this engine works with multiple modes - # and ranger is the default engine so there is no need to set that either. - set_engine("ranger", keep.inbag = TRUE) |> - # However, we'll set the engine and use the keep.inbag=TRUE option so that we - # can produce interval predictions. This is not generally required. - set_mode("regression") -``` - -Now we create the model fit object: - -```{r} -#| label: fit-ranger-rand-forest-regression -rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) -rand_forest_fit -``` - -The holdout data can be predicted: - -```{r} -#| label: predict-ranger-rand-forest-regression -predict(rand_forest_fit, new_data = reg_test) -predict(rand_forest_fit, type = "conf_int", new_data = reg_test) -``` - ## `spark` We create a model specification via: @@ -4667,32 +4577,30 @@ predict(rand_forest_fit, new_data = tbl_reg$test) :::{.panel-tabset} -## `h2o` +## `xrf` -This engine requires the agua extension package, so let's load this first: +This engine requires the rules extension package, so let's load this first: ```{r} -#| label: load-h2o-rule-fit-regression-agua +#| label: load-xrf-rule-fit-regression-rules #| output: false -library(agua) +library(rules) ``` We create a model specification via: ```{r} -#| label: spec-h2o-rule-fit-regression -#| eval: !expr 'run_h2o' +#| label: spec-xrf-rule-fit-regression rule_fit_spec <- rule_fit() |> # We need to set the mode since this engine works with multiple modes - set_mode("regression") |> - set_engine("h2o") + # and xrf is the default engine so there is no need to set that either. + set_mode("regression") ``` Now we create the model fit object: ```{r} -#| label: fit-h2o-rule-fit-regression -#| eval: !expr 'run_h2o' +#| label: fit-xrf-rule-fit-regression rule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train) rule_fit_fit ``` @@ -4700,35 +4608,36 @@ rule_fit_fit The holdout data can be predicted: ```{r} -#| label: predict-h2o-rule-fit-regression -#| eval: !expr 'run_h2o' +#| label: predict-xrf-rule-fit-regression predict(rule_fit_fit, new_data = reg_test) ``` -## `xrf` +## `h2o` -This engine requires the rules extension package, so let's load this first: +This engine requires the agua extension package, so let's load this first: ```{r} -#| label: load-xrf-rule-fit-regression-rules +#| label: load-h2o-rule-fit-regression-agua #| output: false -library(rules) +library(agua) ``` We create a model specification via: ```{r} -#| label: spec-xrf-rule-fit-regression +#| label: spec-h2o-rule-fit-regression +#| eval: !expr 'run_h2o' rule_fit_spec <- rule_fit() |> # We need to set the mode since this engine works with multiple modes - # and xrf is the default engine so there is no need to set that either. 
- set_mode("regression") + set_mode("regression") |> + set_engine("h2o") ``` Now we create the model fit object: ```{r} -#| label: fit-xrf-rule-fit-regression +#| label: fit-h2o-rule-fit-regression +#| eval: !expr 'run_h2o' rule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train) rule_fit_fit ``` @@ -4736,7 +4645,8 @@ rule_fit_fit The holdout data can be predicted: ```{r} -#| label: predict-xrf-rule-fit-regression +#| label: predict-h2o-rule-fit-regression +#| eval: !expr 'run_h2o' predict(rule_fit_fit, new_data = reg_test) ``` @@ -4866,34 +4776,6 @@ The holdout data can be predicted: predict(svm_rbf_fit, new_data = reg_test) ``` -## `liquidSVM` - -We create a model specification via: - -```{r} -#| label: spec-liquidSVM-svm-rbf-regression -#| eval: false -svm_rbf_spec <- svm_rbf() |> - # We need to set the mode since this engine works with multiple modes - set_mode("regression") |> - set_engine("liquidSVM") -``` - -Now we create the model fit object: - -```{r} -#| label: fit-liquidSVM-svm-rbf-regression -#| eval: false -svm_rbf_fit <- svm_rbf_spec |> fit(strength ~ ., data = reg_train) -svm_rbf_fit -``` - -The holdout data can be predicted: - -```{r} -#| label: predict-liquidSVM-svm-rbf-regression -predict(svm_rbf_fit, new_data = reg_test) -``` ::: @@ -4966,6 +4848,16 @@ predict(bag_tree_fit, type = "time", new_data = cns_test) predict(bag_tree_fit, type = "survival", new_data = cns_test, eval_time = eval_times) ``` +Each row of the survival predictions has results for each evaluation time: + +```{r} +#| label: predict-rpart-bag-tree-censored-regression-slice +bag_tree_fit |> + predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> + slice(1) |> + pluck(".pred") +``` + ::: ## Boosted Decision Trees (`boost_tree()`) @@ -5008,18 +4900,28 @@ predict(boost_tree_fit, type = "survival", new_data = cns_test, eval_time = eval predict(boost_tree_fit, type = "linear_pred", new_data = cns_test) ``` +Each row of the survival predictions has results for each evaluation time: + +```{r} +#| label: predict-mboost-boost-tree-censored-regression-slice +boost_tree_fit |> + predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> + slice(1) |> + pluck(".pred") +``` + ::: ## Decision Tree (`decision_tree()`) :::{.panel-tabset} -## `partykit` +## `rpart` This engine requires the censored extension package, so let's load this first: ```{r} -#| label: load-partykit-decision-tree-censored-regression-censored +#| label: load-rpart-decision-tree-censored-regression-censored #| output: false library(censored) ``` @@ -5027,17 +4929,17 @@ library(censored) We create a model specification via: ```{r} -#| label: spec-partykit-decision-tree-censored-regression +#| label: spec-rpart-decision-tree-censored-regression decision_tree_spec <- decision_tree() |> # We need to set the mode since this engine works with multiple modes - set_mode("censored regression") |> - set_engine("partykit") + # and rpart is the default engine so there is no need to set that either. 
+ set_mode("censored regression") ``` Now we create the model fit object: ```{r} -#| label: fit-partykit-decision-tree-censored-regression +#| label: fit-rpart-decision-tree-censored-regression decision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train) decision_tree_fit ``` @@ -5045,17 +4947,27 @@ decision_tree_fit The holdout data can be predicted: ```{r} -#| label: predict-partykit-decision-tree-censored-regression +#| label: predict-rpart-decision-tree-censored-regression predict(decision_tree_fit, type = "time", new_data = cns_test) predict(decision_tree_fit, type = "survival", new_data = cns_test, eval_time = eval_times) ``` -## `rpart` +Each row of the survival predictions has results for each evaluation time: + +```{r} +#| label: predict-rpart-decision-tree-censored-regression-slice +decision_tree_fit |> + predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> + slice(1) |> + pluck(".pred") +``` + +## `partykit` This engine requires the censored extension package, so let's load this first: ```{r} -#| label: load-rpart-decision-tree-censored-regression-censored +#| label: load-partykit-decision-tree-censored-regression-censored #| output: false library(censored) ``` @@ -5063,17 +4975,17 @@ library(censored) We create a model specification via: ```{r} -#| label: spec-rpart-decision-tree-censored-regression +#| label: spec-partykit-decision-tree-censored-regression decision_tree_spec <- decision_tree() |> # We need to set the mode since this engine works with multiple modes - # and rpart is the default engine so there is no need to set that either. - set_mode("censored regression") + set_mode("censored regression") |> + set_engine("partykit") ``` Now we create the model fit object: ```{r} -#| label: fit-rpart-decision-tree-censored-regression +#| label: fit-partykit-decision-tree-censored-regression decision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train) decision_tree_fit ``` @@ -5081,23 +4993,33 @@ decision_tree_fit The holdout data can be predicted: ```{r} -#| label: predict-rpart-decision-tree-censored-regression +#| label: predict-partykit-decision-tree-censored-regression predict(decision_tree_fit, type = "time", new_data = cns_test) predict(decision_tree_fit, type = "survival", new_data = cns_test, eval_time = eval_times) ``` +Each row of the survival predictions has results for each evaluation time: + +```{r} +#| label: predict-partykit-decision-tree-censored-regression-slice +decision_tree_fit |> + predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> + slice(1) |> + pluck(".pred") +``` + ::: ## Proportional Hazards (`proportional_hazards()`) :::{.panel-tabset} -## `glmnet` +## `survival` This engine requires the censored extension package, so let's load this first: ```{r} -#| label: load-glmnet-proportional-hazards-censored-regression-censored +#| label: load-survival-proportional-hazards-censored-regression-censored #| output: false library(censored) ``` @@ -5105,16 +5027,16 @@ library(censored) We create a model specification via: ```{r} -#| label: spec-glmnet-proportional-hazards-censored-regression -proportional_hazards_spec <- proportional_hazards(penalty = 0.01) |> - # This engine works with a single mode so no need to set that - set_engine("glmnet") +#| label: spec-survival-proportional-hazards-censored-regression +# This engine works with a single mode so no need to set that +# and survival is the default engine so there is no need to set that either. 
+proportional_hazards_spec <- proportional_hazards() ``` Now we create the model fit object: ```{r} -#| label: fit-glmnet-proportional-hazards-censored-regression +#| label: fit-survival-proportional-hazards-censored-regression proportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train) proportional_hazards_fit ``` @@ -5122,18 +5044,28 @@ proportional_hazards_fit The holdout data can be predicted: ```{r} -#| label: predict-glmnet-proportional-hazards-censored-regression +#| label: predict-survival-proportional-hazards-censored-regression predict(proportional_hazards_fit, type = "time", new_data = cns_test) predict(proportional_hazards_fit, type = "survival", new_data = cns_test, eval_time = eval_times) predict(proportional_hazards_fit, type = "linear_pred", new_data = cns_test) ``` -## `survival` +Each row of the survival predictions has results for each evaluation time: + +```{r} +#| label: predict-survival-proportional-hazards-censored-regression-slice +proportional_hazards_fit |> + predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> + slice(1) |> + pluck(".pred") +``` + +## `glmnet` This engine requires the censored extension package, so let's load this first: ```{r} -#| label: load-survival-proportional-hazards-censored-regression-censored +#| label: load-glmnet-proportional-hazards-censored-regression-censored #| output: false library(censored) ``` @@ -5141,16 +5073,16 @@ library(censored) We create a model specification via: ```{r} -#| label: spec-survival-proportional-hazards-censored-regression -# This engine works with a single mode so no need to set that -# and survival is the default engine so there is no need to set that either. -proportional_hazards_spec <- proportional_hazards() +#| label: spec-glmnet-proportional-hazards-censored-regression +proportional_hazards_spec <- proportional_hazards(penalty = 0.01) |> + # This engine works with a single mode so no need to set that + set_engine("glmnet") ``` Now we create the model fit object: ```{r} -#| label: fit-survival-proportional-hazards-censored-regression +#| label: fit-glmnet-proportional-hazards-censored-regression proportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train) proportional_hazards_fit ``` @@ -5158,12 +5090,22 @@ proportional_hazards_fit The holdout data can be predicted: ```{r} -#| label: predict-survival-proportional-hazards-censored-regression +#| label: predict-glmnet-proportional-hazards-censored-regression predict(proportional_hazards_fit, type = "time", new_data = cns_test) predict(proportional_hazards_fit, type = "survival", new_data = cns_test, eval_time = eval_times) predict(proportional_hazards_fit, type = "linear_pred", new_data = cns_test) ``` +Each row of the survival predictions has results for each evaluation time: + +```{r} +#| label: predict-glmnet-proportional-hazards-censored-regression-slice +proportional_hazards_fit |> + predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> + slice(1) |> + pluck(".pred") +``` + ::: ## Random Forests (`rand_forest()`) @@ -5206,6 +5148,16 @@ predict(rand_forest_fit, type = "time", new_data = cns_test) predict(rand_forest_fit, type = "survival", new_data = cns_test, eval_time = eval_times) ``` +Each row of the survival predictions has results for each evaluation time: + +```{r} +#| label: predict-aorsf-rand-forest-censored-regression-slice +rand_forest_fit |> + predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> + slice(1) 
|> + pluck(".pred") +``` + ## `partykit` This engine requires the censored extension package, so let's load this first: @@ -5231,10 +5183,16 @@ Now we create the model fit object: ```{r} #| label: fit-partykit-rand-forest-censored-regression rand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train) +``` -# Too long to print -# rand_forest_fit +The print method has a lot of output: + +
+```{r}
+#| label: fit-partykit-rand-forest-censored-regression-print
+# head() avoids padding the output with NA lines if the print is shorter
+# than 100 lines
+head(capture.output(print(rand_forest_fit)), 100) |> cat(sep = "\n")
+```
+</details>
The holdout data can be predicted: @@ -5244,18 +5202,28 @@ predict(rand_forest_fit, type = "time", new_data = cns_test) predict(rand_forest_fit, type = "survival", new_data = cns_test, eval_time = eval_times) ``` +Each row of the survival predictions has results for each evaluation time: + +```{r} +#| label: predict-partykit-rand-forest-censored-regression-slice +rand_forest_fit |> + predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> + slice(1) |> + pluck(".pred") +``` + ::: ## Parametric Survival Models (`survival_reg()`) :::{.panel-tabset} -## `flexsurv` +## `survival` This engine requires the censored extension package, so let's load this first: ```{r} -#| label: load-flexsurv-survival-reg-censored-regression-censored +#| label: load-survival-survival-reg-censored-regression-censored #| output: false library(censored) ``` @@ -5263,16 +5231,16 @@ library(censored) We create a model specification via: ```{r} -#| label: spec-flexsurv-survival-reg-censored-regression -survival_reg_spec <- survival_reg() |> - # This engine works with a single mode so no need to set that - set_engine("flexsurv") +#| label: spec-survival-survival-reg-censored-regression +# This engine works with a single mode so no need to set that +# and survival is the default engine so there is no need to set that either. +survival_reg_spec <- survival_reg() ``` Now we create the model fit object: ```{r} -#| label: fit-flexsurv-survival-reg-censored-regression +#| label: fit-survival-survival-reg-censored-regression survival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train) survival_reg_fit ``` @@ -5280,7 +5248,7 @@ survival_reg_fit The holdout data can be predicted: ```{r} -#| label: predict-flexsurv-survival-reg-censored-regression +#| label: predict-survival-survival-reg-censored-regression predict(survival_reg_fit, type = "time", new_data = cns_test) predict(survival_reg_fit, type = "survival", new_data = cns_test, eval_time = eval_times) predict(survival_reg_fit, type = "hazard", new_data = cns_test, eval_time = eval_times) @@ -5288,12 +5256,22 @@ predict(survival_reg_fit, type = "linear_pred", new_data = cns_test) predict(survival_reg_fit, type = "quantile", new_data = cns_test) ``` -## `flexsurvspline` +Each row of the survival predictions has results for each evaluation time: + +```{r} +#| label: predict-survival-survival-reg-censored-regression-slice +survival_reg_fit |> + predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> + slice(1) |> + pluck(".pred") +``` + +## `flexsurv` This engine requires the censored extension package, so let's load this first: ```{r} -#| label: load-flexsurvspline-survival-reg-censored-regression-censored +#| label: load-flexsurv-survival-reg-censored-regression-censored #| output: false library(censored) ``` @@ -5301,16 +5279,16 @@ library(censored) We create a model specification via: ```{r} -#| label: spec-flexsurvspline-survival-reg-censored-regression +#| label: spec-flexsurv-survival-reg-censored-regression survival_reg_spec <- survival_reg() |> # This engine works with a single mode so no need to set that - set_engine("flexsurvspline") + set_engine("flexsurv") ``` Now we create the model fit object: ```{r} -#| label: fit-flexsurvspline-survival-reg-censored-regression +#| label: fit-flexsurv-survival-reg-censored-regression survival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train) survival_reg_fit ``` @@ -5318,7 +5296,7 @@ survival_reg_fit The holdout data can be predicted: ```{r} -#| label: 
predict-flexsurvspline-survival-reg-censored-regression +#| label: predict-flexsurv-survival-reg-censored-regression predict(survival_reg_fit, type = "time", new_data = cns_test) predict(survival_reg_fit, type = "survival", new_data = cns_test, eval_time = eval_times) predict(survival_reg_fit, type = "hazard", new_data = cns_test, eval_time = eval_times) @@ -5326,12 +5304,22 @@ predict(survival_reg_fit, type = "linear_pred", new_data = cns_test) predict(survival_reg_fit, type = "quantile", new_data = cns_test) ``` -## `survival` +Each row of the survival predictions has results for each evaluation time: + +```{r} +#| label: predict-flexsurv-survival-reg-censored-regression-slice +survival_reg_fit |> + predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> + slice(1) |> + pluck(".pred") +``` + +## `flexsurvspline` This engine requires the censored extension package, so let's load this first: ```{r} -#| label: load-survival-survival-reg-censored-regression-censored +#| label: load-flexsurvspline-survival-reg-censored-regression-censored #| output: false library(censored) ``` @@ -5339,16 +5327,16 @@ library(censored) We create a model specification via: ```{r} -#| label: spec-survival-survival-reg-censored-regression -# This engine works with a single mode so no need to set that -# and survival is the default engine so there is no need to set that either. -survival_reg_spec <- survival_reg() +#| label: spec-flexsurvspline-survival-reg-censored-regression +survival_reg_spec <- survival_reg() |> + # This engine works with a single mode so no need to set that + set_engine("flexsurvspline") ``` Now we create the model fit object: ```{r} -#| label: fit-survival-survival-reg-censored-regression +#| label: fit-flexsurvspline-survival-reg-censored-regression survival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train) survival_reg_fit ``` @@ -5356,7 +5344,7 @@ survival_reg_fit The holdout data can be predicted: ```{r} -#| label: predict-survival-survival-reg-censored-regression +#| label: predict-flexsurvspline-survival-reg-censored-regression predict(survival_reg_fit, type = "time", new_data = cns_test) predict(survival_reg_fit, type = "survival", new_data = cns_test, eval_time = eval_times) predict(survival_reg_fit, type = "hazard", new_data = cns_test, eval_time = eval_times) @@ -5364,6 +5352,16 @@ predict(survival_reg_fit, type = "linear_pred", new_data = cns_test) predict(survival_reg_fit, type = "quantile", new_data = cns_test) ``` +Each row of the survival predictions has results for each evaluation time: + +```{r} +#| label: predict-flexsurvspline-survival-reg-censored-regression-slice +survival_reg_fit |> + predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> + slice(1) |> + pluck(".pred") +``` + ::: # Quantile Regression Models @@ -5426,6 +5424,18 @@ The holdout data can be predicted: predict(linear_reg_fit, type = "quantile", new_data = qnt_test) ``` +Each row of predictions has a special vector class containing all of the quantile predictions: + +```{r} +#| label: predict-quantreg-linear-reg-quantile-regression-expand +linear_reg_fit |> + predict(type = "quantile", new_data = qnt_test)|> + slice(1) |> + pluck(".pred_quantile") |> + # Expand the results for each quantile level by converting to a tibble + as_tibble() +``` + ::: ## Random Forests (`rand_forest()`) @@ -5461,6 +5471,18 @@ The holdout data can be predicted: predict(rand_forest_fit, type = "quantile", new_data = qnt_test) ``` +Each row of predictions has a special 
vector class containing all of the quantile predictions: + +```{r} +#| label: predict-grf-rand-forest-quantile-regression-expand +rand_forest_fit |> + predict(type = "quantile", new_data = qnt_test)|> + slice(1) |> + pluck(".pred_quantile") |> + # Expand the results for each quantile level by converting to a tibble + as_tibble() +``` + ::: ```{r} From 581eb77e6815c018404c7eae7e718b9ad5aa539e Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 20 Nov 2025 18:06:37 -0500 Subject: [PATCH 17/23] random integers for random number seeds --- learn/models/parsnip-predictions/index.qmd | 139 ++++++++++++++++++++- 1 file changed, 138 insertions(+), 1 deletion(-) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 024922a0..8811bdd9 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -60,7 +60,6 @@ todo - multielvel examples - keras3 updates - avoid subsection titles capitalizing the engine name (e.g., "CATBOOST") and text within backticks -- set seeds when needed ```{r} #| label: load-tm @@ -304,6 +303,8 @@ Now we create the model fit object: ```{r} #| label: fit-earth-bag-mars-classification +# Set the random number seed to an integer for reproducibility: +set.seed(268) bag_mars_fit <- bag_mars_spec |> fit(class ~ ., data = bin_train) bag_mars_fit ``` @@ -346,6 +347,8 @@ Now we create the model fit object: ```{r} #| label: fit-nnet-bag-mlp-classification +# Set the random number seed to an integer for reproducibility: +set.seed(318) bag_mlp_fit <- bag_mlp_spec |> fit(class ~ ., data = bin_train) bag_mlp_fit ``` @@ -388,6 +391,8 @@ Now we create the model fit object: ```{r} #| label: fit-rpart-bag-tree-classification +# Set the random number seed to an integer for reproducibility: +set.seed(985) bag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train) bag_tree_fit ``` @@ -423,6 +428,8 @@ Now we create the model fit object: ```{r} #| label: fit-C5.0-bag-tree-classification +# Set the random number seed to an integer for reproducibility: +set.seed(937) bag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train) bag_tree_fit ``` @@ -457,6 +464,8 @@ Now we create the model fit object: ```{r} #| label: fit-dbarts-bart-classification +# Set the random number seed to an integer for reproducibility: +set.seed(217) bart_fit <- bart_spec |> fit(class ~ ., data = bin_train) bart_fit ``` @@ -493,6 +502,8 @@ Now we create the model fit object: ```{r} #| label: fit-xgboost-boost-tree-classification +# Set the random number seed to an integer for reproducibility: +set.seed(738) boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) boost_tree_fit ``` @@ -520,6 +531,8 @@ Now we create the model fit object: ```{r} #| label: fit-C5.0-boost-tree-classification +# Set the random number seed to an integer for reproducibility: +set.seed(984) boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) boost_tree_fit ``` @@ -556,6 +569,8 @@ Now we create the model fit object: ```{r} #| label: fit-catboost-boost-tree-classification +# Set the random number seed to an integer for reproducibility: +set.seed(644) boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) boost_tree_fit ``` @@ -594,6 +609,8 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-boost-tree-classification #| eval: !expr 'run_h2o' +# Set the random number seed to an integer for reproducibility: +set.seed(186) boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) 
boost_tree_fit ``` @@ -633,6 +650,8 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-gbm-boost-tree-classification #| eval: !expr 'run_h2o' +# Set the random number seed to an integer for reproducibility: +set.seed(724) boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) boost_tree_fit ``` @@ -670,6 +689,8 @@ Now we create the model fit object: ```{r} #| label: fit-lightgbm-boost-tree-classification +# Set the random number seed to an integer for reproducibility: +set.seed(906) boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train) boost_tree_fit ``` @@ -699,6 +720,8 @@ Now we create the model fit object: ```{r} #| label: fit-spark-boost-tree-classification #| eval: !expr 'run_spark' +# Set the random number seed to an integer for reproducibility: +set.seed(285) boost_tree_fit <- boost_tree_spec |> fit(Class ~ ., data = tbl_bin$training) boost_tree_fit ``` @@ -741,6 +764,8 @@ Now we create the model fit object: ```{r} #| label: fit-C5.0-C5-rules-classification +# Set the random number seed to an integer for reproducibility: +set.seed(93) C5_rules_fit <- C5_rules_spec |> fit(class ~ ., data = bin_train) C5_rules_fit ``` @@ -1270,6 +1295,8 @@ Now we create the model fit object: ```{r} #| label: fit-brulee-logistic-reg-classification +# Set the random number seed to an integer for reproducibility: +set.seed(466) logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) logistic_reg_fit ``` @@ -1437,6 +1464,8 @@ Now we create the model fit object: ```{r} #| label: fit-keras-logistic-reg-classification #| eval: false +# Set the random number seed to an integer for reproducibility: +set.seed(730) logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) logistic_reg_fit ``` @@ -1492,6 +1521,8 @@ Now we create the model fit object: ```{r} #| label: fit-stan-logistic-reg-classification +# Set the random number seed to an integer for reproducibility: +set.seed(96) logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) logistic_reg_fit ``` @@ -1530,6 +1561,8 @@ Now we create the model fit object: ```{r} #| label: fit-stan-glmer-logistic-reg-classification #| eval: false +# Set the random number seed to an integer for reproducibility: +set.seed(484) logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) logistic_reg_fit ``` @@ -1630,6 +1663,8 @@ Now we create the model fit object: ```{r} #| label: fit-nnet-mlp-classification +# Set the random number seed to an integer for reproducibility: +set.seed(839) mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) mlp_fit ``` @@ -1658,6 +1693,8 @@ Now we create the model fit object: ```{r} #| label: fit-brulee-mlp-classification +# Set the random number seed to an integer for reproducibility: +set.seed(38) mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) mlp_fit ``` @@ -1686,6 +1723,8 @@ Now we create the model fit object: ```{r} #| label: fit-brulee-two-layer-mlp-classification +# Set the random number seed to an integer for reproducibility: +set.seed(336) mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) mlp_fit ``` @@ -1724,6 +1763,8 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-mlp-classification #| eval: !expr 'run_h2o' +# Set the random number seed to an integer for reproducibility: +set.seed(306) mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) mlp_fit ``` @@ -1754,6 +1795,8 @@ Now we create the model fit object: ```{r} #| label: fit-keras-mlp-classification #| eval: false +# Set the random 
number seed to an integer for reproducibility: +set.seed(216) mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train) mlp_fit ``` @@ -1788,6 +1831,8 @@ Now we create the model fit object: ```{r} #| label: fit-nnet-multinom-reg-classification +# Set the random number seed to an integer for reproducibility: +set.seed(634) multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train) multinom_reg_fit ``` @@ -1815,6 +1860,8 @@ Now we create the model fit object: ```{r} #| label: fit-brulee-multinom-reg-classification +# Set the random number seed to an integer for reproducibility: +set.seed(837) multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train) multinom_reg_fit ``` @@ -2198,6 +2245,8 @@ Now we create the model fit object: ```{r} #| label: fit-ranger-rand-forest-classification +# Set the random number seed to an integer for reproducibility: +set.seed(841) rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) rand_forest_fit ``` @@ -2235,6 +2284,8 @@ Now we create the model fit object: ```{r} #| label: fit-aorsf-rand-forest-classification +# Set the random number seed to an integer for reproducibility: +set.seed(923) rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) rand_forest_fit ``` @@ -2265,6 +2316,8 @@ Now we create the model fit object: ```{r} #| label: fit-grf-rand-forest-classification #| eval: false +# Set the random number seed to an integer for reproducibility: +set.seed(546) rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) rand_forest_fit ``` @@ -2305,6 +2358,8 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-rand-forest-classification #| eval: !expr 'run_h2o' +# Set the random number seed to an integer for reproducibility: +set.seed(493) rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) rand_forest_fit ``` @@ -2342,6 +2397,8 @@ Now we create the model fit object: ```{r} #| label: fit-partykit-rand-forest-classification +# Set the random number seed to an integer for reproducibility: +set.seed(252) rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) ``` @@ -2378,6 +2435,8 @@ Now we create the model fit object: ```{r} #| label: fit-randomForest-rand-forest-classification +# Set the random number seed to an integer for reproducibility: +set.seed(726) rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) rand_forest_fit ``` @@ -2407,6 +2466,8 @@ Now we create the model fit object: ```{r} #| label: fit-spark-rand-forest-classification #| eval: !expr 'run_spark' +# Set the random number seed to an integer for reproducibility: +set.seed(693) rand_forest_fit <- rand_forest_spec |> fit(Class ~ ., data = tbl_bin$training) rand_forest_fit ``` @@ -2450,6 +2511,8 @@ Now we create the model fit object: ```{r} #| label: fit-xrf-rule-fit-classification +# Set the random number seed to an integer for reproducibility: +set.seed(95) rule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train) rule_fit_fit ``` @@ -2488,6 +2551,8 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-rule-fit-classification #| eval: !expr 'run_h2o' +# Set the random number seed to an integer for reproducibility: +set.seed(536) rule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train) rule_fit_fit ``` @@ -2758,6 +2823,8 @@ Now we create the model fit object: ```{r} #| label: fit-earth-bag-mars-regression +# Set the random number seed to an integer for reproducibility: +set.seed(147) bag_mars_fit <- bag_mars_spec |> fit(strength 
~ ., data = reg_train) bag_mars_fit ``` @@ -2799,6 +2866,8 @@ Now we create the model fit object: ```{r} #| label: fit-nnet-bag-mlp-regression +# Set the random number seed to an integer for reproducibility: +set.seed(324) bag_mlp_fit <- bag_mlp_spec |> fit(strength ~ ., data = reg_train) bag_mlp_fit ``` @@ -2840,6 +2909,8 @@ Now we create the model fit object: ```{r} #| label: fit-rpart-bag-tree-regression +# Set the random number seed to an integer for reproducibility: +set.seed(230) bag_tree_fit <- bag_tree_spec |> fit(strength ~ ., data = reg_train) bag_tree_fit ``` @@ -2873,6 +2944,8 @@ Now we create the model fit object: ```{r} #| label: fit-dbarts-bart-regression +# Set the random number seed to an integer for reproducibility: +set.seed(134) bart_fit <- bart_spec |> fit(strength ~ ., data = reg_train) bart_fit ``` @@ -2908,6 +2981,8 @@ Now we create the model fit object: ```{r} #| label: fit-xgboost-boost-tree-regression +# Set the random number seed to an integer for reproducibility: +set.seed(748) boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train) boost_tree_fit ``` @@ -2943,6 +3018,8 @@ Now we create the model fit object: ```{r} #| label: fit-catboost-boost-tree-regression +# Set the random number seed to an integer for reproducibility: +set.seed(557) boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train) boost_tree_fit ``` @@ -2979,6 +3056,8 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-boost-tree-regression +# Set the random number seed to an integer for reproducibility: +set.seed(720) boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train) boost_tree_fit ``` @@ -3016,6 +3095,8 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-gbm-boost-tree-regression #| eval: !expr 'run_h2o' +# Set the random number seed to an integer for reproducibility: +set.seed(90) boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train) boost_tree_fit ``` @@ -3051,6 +3132,8 @@ Now we create the model fit object: ```{r} #| label: fit-lightgbm-boost-tree-regression +# Set the random number seed to an integer for reproducibility: +set.seed(570) boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train) boost_tree_fit ``` @@ -3079,6 +3162,8 @@ Now we create the model fit object: ```{r} #| label: fit-spark-boost-tree-regression #| eval: !expr 'run_spark' +# Set the random number seed to an integer for reproducibility: +set.seed(620) boost_tree_fit <- boost_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training) boost_tree_fit ``` @@ -3120,6 +3205,8 @@ Now we create the model fit object: ```{r} #| label: fit-Cubist-cubist-rules-regression +# Set the random number seed to an integer for reproducibility: +set.seed(188) cubist_rules_fit <- cubist_rules_spec |> fit(strength ~ ., data = reg_train) cubist_rules_fit ``` @@ -3313,6 +3400,8 @@ Now we create the model fit object: ```{r} #| label: fit-brulee-linear-reg-regression +# Set the random number seed to an integer for reproducibility: +set.seed(1) linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) linear_reg_fit ``` @@ -3538,6 +3627,8 @@ Now we create the model fit object: ```{r} #| label: fit-keras-linear-reg-regression #| eval: false +# Set the random number seed to an integer for reproducibility: +set.seed(596) linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) linear_reg_fit ``` @@ -3637,6 +3728,8 @@ Now we create the model fit object: ```{r} #| label: fit-stan-linear-reg-regression +# 
Set the random number seed to an integer for reproducibility: +set.seed(357) linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) linear_reg_fit ``` @@ -3674,6 +3767,8 @@ Now we create the model fit object: ```{r} #| label: fit-stan-glmer-linear-reg-regression #| eval: false +# Set the random number seed to an integer for reproducibility: +set.seed(895) linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) linear_reg_fit ``` @@ -3770,6 +3865,8 @@ Now we create the model fit object: ```{r} #| label: fit-nnet-mlp-regression +# Set the random number seed to an integer for reproducibility: +set.seed(159) mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train) mlp_fit ``` @@ -3797,6 +3894,8 @@ Now we create the model fit object: ```{r} #| label: fit-brulee-mlp-regression +# Set the random number seed to an integer for reproducibility: +set.seed(407) mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train) mlp_fit ``` @@ -3824,6 +3923,8 @@ Now we create the model fit object: ```{r} #| label: fit-brulee-two-layer-mlp-regression +# Set the random number seed to an integer for reproducibility: +set.seed(585) mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train) mlp_fit ``` @@ -3861,6 +3962,8 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-mlp-regression #| eval: !expr 'run_h2o' +# Set the random number seed to an integer for reproducibility: +set.seed(93) mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train) mlp_fit ``` @@ -3890,6 +3993,8 @@ Now we create the model fit object: ```{r} #| label: fit-keras-mlp-regression #| eval: false +# Set the random number seed to an integer for reproducibility: +set.seed(879) mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train) mlp_fit ``` @@ -4105,6 +4210,8 @@ Now we create the model fit object: ```{r} #| label: fit-glmer-poisson-reg-regression #| eval: false +# Set the random number seed to an integer for reproducibility: +set.seed(826) poisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train) poisson_reg_fit ``` @@ -4246,6 +4353,8 @@ Now we create the model fit object: ```{r} #| label: fit-stan-poisson-reg-regression #| eval: false +# Set the random number seed to an integer for reproducibility: +set.seed(213) poisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train) poisson_reg_fit ``` @@ -4284,6 +4393,8 @@ Now we create the model fit object: ```{r} #| label: fit-stan-glmer-poisson-reg-regression #| eval: false +# Set the random number seed to an integer for reproducibility: +set.seed(690) poisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train) poisson_reg_fit ``` @@ -4356,6 +4467,8 @@ Now we create the model fit object: ```{r} #| label: fit-ranger-rand-forest-regression +# Set the random number seed to an integer for reproducibility: +set.seed(860) rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) rand_forest_fit ``` @@ -4392,6 +4505,8 @@ Now we create the model fit object: ```{r} #| label: fit-aorsf-rand-forest-regression +# Set the random number seed to an integer for reproducibility: +set.seed(47) rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) rand_forest_fit ``` @@ -4421,6 +4536,8 @@ Now we create the model fit object: ```{r} #| label: fit-grf-rand-forest-regression #| eval: false +# Set the random number seed to an integer for reproducibility: +set.seed(130) rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) rand_forest_fit ``` @@ -4460,6 +4577,8 @@ 
Now we create the model fit object: ```{r} #| label: fit-h2o-rand-forest-regression #| eval: !expr 'run_h2o' +# Set the random number seed to an integer for reproducibility: +set.seed(211) rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) rand_forest_fit ``` @@ -4496,6 +4615,8 @@ Now we create the model fit object: ```{r} #| label: fit-partykit-rand-forest-regression +# Set the random number seed to an integer for reproducibility: +set.seed(981) rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) ``` @@ -4531,6 +4652,8 @@ Now we create the model fit object: ```{r} #| label: fit-randomForest-rand-forest-regression +# Set the random number seed to an integer for reproducibility: +set.seed(793) rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) rand_forest_fit ``` @@ -4559,6 +4682,8 @@ Now we create the model fit object: ```{r} #| label: fit-spark-rand-forest-regression #| eval: !expr 'run_spark' +# Set the random number seed to an integer for reproducibility: +set.seed(157) rand_forest_fit <- rand_forest_spec |> fit(compressive_strength ~ ., data = tbl_reg$training) rand_forest_fit ``` @@ -4601,6 +4726,8 @@ Now we create the model fit object: ```{r} #| label: fit-xrf-rule-fit-regression +# Set the random number seed to an integer for reproducibility: +set.seed(431) rule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train) rule_fit_fit ``` @@ -4638,6 +4765,8 @@ Now we create the model fit object: ```{r} #| label: fit-h2o-rule-fit-regression #| eval: !expr 'run_h2o' +# Set the random number seed to an integer for reproducibility: +set.seed(236) rule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train) rule_fit_fit ``` @@ -4887,6 +5016,8 @@ Now we create the model fit object: ```{r} #| label: fit-mboost-boost-tree-censored-regression +# Set the random number seed to an integer for reproducibility: +set.seed(852) boost_tree_fit <- boost_tree_spec |> fit(event_time ~ ., data = cns_train) boost_tree_fit ``` @@ -5136,6 +5267,8 @@ Now we create the model fit object: ```{r} #| label: fit-aorsf-rand-forest-censored-regression +# Set the random number seed to an integer for reproducibility: +set.seed(2) rand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train) rand_forest_fit ``` @@ -5182,6 +5315,8 @@ Now we create the model fit object: ```{r} #| label: fit-partykit-rand-forest-censored-regression +# Set the random number seed to an integer for reproducibility: +set.seed(89) rand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train) ``` @@ -5459,6 +5594,8 @@ Now we create the model fit object: ```{r} #| label: fit-grf-rand-forest-quantile-regression #| eval: false +# Set the random number seed to an integer for reproducibility: +set.seed(435) rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = qnt_train) rand_forest_fit ``` From 55d97a6c6193fdf1a75b02a165ff41a9db84d783 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Thu, 20 Nov 2025 15:28:46 -0800 Subject: [PATCH 18/23] don't capitalize code in headers --- learn/models/parsnip-predictions/index.qmd | 3 +++ learn/models/parsnip-predictions/style.scss | 5 +++++ 2 files changed, 8 insertions(+) create mode 100644 learn/models/parsnip-predictions/style.scss diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 8811bdd9..545b96fe 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -12,6 +12,9 @@ 
description: | toc: true toc-depth: 3 include-after-body: ../../../resources.html +format: + html: + theme: ["style.scss"] --- ```{r} diff --git a/learn/models/parsnip-predictions/style.scss b/learn/models/parsnip-predictions/style.scss new file mode 100644 index 00000000..f9a0722b --- /dev/null +++ b/learn/models/parsnip-predictions/style.scss @@ -0,0 +1,5 @@ +/*-- scss:rules --*/ + +h2 code { + text-transform: none; +} \ No newline at end of file From 0bb5b72f1eec495dc1800cb6ea94fcf72d2beb43 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 20 Nov 2025 18:52:59 -0500 Subject: [PATCH 19/23] use dev parsnip to get the rf quantile results --- learn/models/parsnip-predictions/index.qmd | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 545b96fe..8cde171b 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -46,7 +46,7 @@ pkgs <- c("tidymodels", "agua", "baguette", "bonsai", "censored", "discrim", # Introduction -`r article_req_pkgs(pkgs)` +`r article_req_pkgs(pkgs)` There are numerous other "engine" packages that are required. If you use a model that is missing one or more installed packages, parsnip will prompt you to install them. These examples show how to *fit* and *predict* with different combinations of model, mode, and engine. As a reminder, in parsnip, @@ -62,7 +62,6 @@ todo - multielvel examples - keras3 updates -- avoid subsection titles capitalizing the engine name (e.g., "CATBOOST") and text within backticks ```{r} #| label: load-tm @@ -181,6 +180,7 @@ The tidymodels [agua](https://agua.tidymodels.org/) package has some helpers and ```{r} #| label: h2o-init #| eval: !expr 'run_h2o' +#| results: hide library(agua) h2o_start() ``` @@ -5586,17 +5586,15 @@ We create a model specification via: ```{r} #| label: spec-grf-rand-forest-quantile-regression -#| eval: false -rand_forest_spec <- rand_forest() |> - set_mode("quantile regression", quantile_levels = qnt_lvls) |> - set_engine("grf") +rand_forest_spec <- rand_forest() |> + set_engine("grf") |> + set_mode("quantile regression", quantile_levels = qnt_lvls) ``` Now we create the model fit object: ```{r} #| label: fit-grf-rand-forest-quantile-regression -#| eval: false # Set the random number seed to an integer for reproducibility: set.seed(435) rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = qnt_train) @@ -5607,7 +5605,6 @@ The holdout data can be predicted: ```{r} #| label: predict-grf-rand-forest-quantile-regression -#| eval: false predict(rand_forest_fit, type = "quantile", new_data = qnt_test) ``` From 5cb1f9232f0440cacf1c5c9fb5c90c80119cc99c Mon Sep 17 00:00:00 2001 From: topepo Date: Fri, 21 Nov 2025 14:57:38 -0500 Subject: [PATCH 20/23] multilevel regression model examples --- learn/models/parsnip-predictions/index.qmd | 241 ++++++++++++++------- 1 file changed, 158 insertions(+), 83 deletions(-) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index 8cde171b..bc23992a 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -28,6 +28,9 @@ source(here::here("common.R")) # Indicates to enable or not running Spark code run_spark <- TRUE run_h2o <- TRUE +run_keras <- TRUE +run_catboost <- rlang::is_installed("catboost") +run_grf <- rlang::is_installed("parsnip", version = "1.3.3.9000") ``` ```{r} @@ -40,15 +43,14 @@ library(sparklyr) #' 
skip format
pkgs <- c("tidymodels", "agua", "baguette", "bonsai", "censored", "discrim",
-          "multilevelmod", "plsmod", "poissonreg", "rules", "sparklyr")
+          "multilevelmod", "plsmod", "poissonreg", "rules", "sparklyr",
+          "HSAUR3", "lme4", "prodlim", "survival")


```

# Introduction

-`r article_req_pkgs(pkgs)` There are numerous other "engine" packages that are required. If you use a model that is missing one or more installed packages, parsnip will prompt you to install them.
-
-These examples show how to *fit* and *predict* with different combinations of model, mode, and engine. As a reminder, in parsnip,
+This page shows examples of how to *fit* and *predict* with different combinations of model, mode, and engine. As a reminder, in parsnip,

- the **model type** differentiates basic modeling approaches, such as random forests, logistic regression, linear support vector machines, etc.,

@@ -56,18 +58,9 @@ These examples show how to *fit* and *predict* with different combinations of mo

- the computational **engine** indicates how the model is fit, such as with a specific R package implementation or even methods outside of R like Keras or Stan.

-The following examples use consistent data sets throughout.
-
-todo
+We'll break the examples up by their mode. For each model, we'll show the different data sets used across its engines.

-- multielvel examples
-- keras3 updates
+`r article_req_pkgs(pkgs)` There are numerous other "engine" packages that are required. If you use a model that is missing one or more installed packages, parsnip will prompt you to install them. There are some packages that require non-standard installation or rely on external dependencies. We'll describe these next.

-```{r}
-#| label: load-tm
-library(tidymodels)
-theme_set(theme_bw() + theme(legend.position = "top"))
-```

## External Dependencies

@@ -181,6 +180,7 @@ The tidymodels [agua](https://agua.tidymodels.org/) package has some helpers and

```{r}
#| label: h2o-init
#| eval: !expr 'run_h2o'
+#| results: hide
library(agua)
h2o_start()
```

@@ -204,6 +197,15 @@ keras3::install_keras(backend = "tensorflow")

There are other options for installation. See [https://tensorflow.rstudio.com/install/index.html](https://tensorflow.rstudio.com/install/index.html) for more details.

+
+```{r}
+#| label: setup-keras
+#| eval: !expr 'run_keras'
+# Assumes you are going to use a virtual environment whose name contains
+# "tensorflow" (the pattern matched by grep() below)
+pve <- grep("tensorflow", reticulate::virtualenv_list(), value = TRUE)
+reticulate::use_virtualenv(pve)
+```
+
### Torch

R's torch package is the low-level package containing the framework. Once you have installed it, you will get this message the first time you load the package:

Choosing "Yes" will do the _one-time_ installation.
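+
+If the session is non-interactive (e.g., a build server), that prompt never appears; the same one-time download can be triggered manually. A minimal sketch, using torch's installer function:
+
+```{r}
+#| label: setup-torch
+#| eval: false
+# install_torch() fetches the libtorch binaries that the interactive
+# prompt would otherwise download
+torch::install_torch()
+```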
+To get started, let's load the tidymodels package:
+
todo

- multilevel examples

```{r}
#| label: load-tm
library(tidymodels)
theme_set(theme_bw() + theme(legend.position = "top"))
```

# Classification Models

To demonstrate classification, let's make small training and test sets for a binary outcome. We'll center and scale the data since some models require the predictors to be in the same units.

@@ -555,6 +569,7 @@ This engine requires the bonsai extension package, so let's load this first:

```{r}
#| label: load-catboost-boost-tree-classification-bonsai
#| output: false
+#| eval: !expr 'run_catboost'
library(bonsai)
```

@@ -572,6 +587,7 @@ Now we create the model fit object:

```{r}
#| label: fit-catboost-boost-tree-classification
+#| eval: !expr 'run_catboost'
# Set the random number seed to an integer for reproducibility:
set.seed(644)
boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)
@@ -582,6 +598,7 @@ The holdout data can be predicted:

```{r}
#| label: predict-catboost-boost-tree-classification
+#| eval: !expr 'run_catboost'
predict(boost_tree_fit, type = "class", new_data = bin_test)
predict(boost_tree_fit, type = "prob", new_data = bin_test)
```

@@ -1466,10 +1483,16 @@ Now we create the model fit object:

```{r}
#| label: fit-keras-logistic-reg-classification
-#| eval: false
+#| eval: !expr 'run_keras'
+#| results: hide
# Set the random number seed to an integer for reproducibility:
set.seed(730)
logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)
+```
+
+```{r}
+#| label: fit-keras-logistic-reg-classification-print
+#| eval: !expr 'run_keras'
logistic_reg_fit
```

@@ -1477,7 +1500,7 @@ The holdout data can be predicted:

```{r}
#| label: predict-keras-logistic-reg-classification
-#| eval: false
+#| eval: !expr 'run_keras'
predict(logistic_reg_fit, type = "class", new_data = bin_test)
predict(logistic_reg_fit, type = "prob", new_data = bin_test)
```

@@ -1797,10 +1820,16 @@ Now we create the model fit object:

```{r}
#| label: fit-keras-mlp-classification
-#| eval: false
+#| eval: !expr 'run_keras'
+#| results: hide
# Set the random number seed to an integer for reproducibility:
set.seed(216)
mlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)
+```
+
+```{r}
+#| label: fit-keras-mlp-classification-print
+#| eval: !expr 'run_keras'
mlp_fit
```

@@ -1808,7 +1837,7 @@ The holdout data can be predicted:

```{r}
#| label: predict-keras-mlp-classification
-#| eval: false
+#| eval: !expr 'run_keras'
predict(mlp_fit, type = "class", new_data = bin_test)
predict(mlp_fit, type = "prob", new_data = bin_test)
```

@@ -1957,8 +1986,14 @@ Now we create the model fit object:

```{r}
#| label: fit-keras-multinom-reg-classification
-#| eval: false
+#| eval: !expr 'run_keras'
+#| results: hide
multinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)
+```
+
+```{r}
+#| label: fit-keras-multinom-reg-classification-print
+#| eval: !expr 'run_keras'
multinom_reg_fit
```

@@ -1966,7 +2001,7 @@ The holdout data can be predicted:

```{r}
#| label: predict-keras-multinom-reg-classification
-#| eval: false
+#| eval: !expr 'run_keras'
predict(multinom_reg_fit, type = "class", new_data = mtl_test)
predict(multinom_reg_fit, type = "prob", new_data = mtl_test)
```

@@ -2307,7 +2342,7 @@ We create a model specification via:

```{r}
#| label: spec-grf-rand-forest-classification
-#| eval: false
+#| eval: !expr 'run_grf'
rand_forest_spec <- rand_forest() |>
  # We need to set the mode since this engine works with multiple modes
  set_mode("classification") |>
  set_engine("grf")
```

@@ -2318,7 +2353,7 @@
Now we create the model fit object: ```{r} #| label: fit-grf-rand-forest-classification -#| eval: false +#| eval: !expr 'run_grf' # Set the random number seed to an integer for reproducibility: set.seed(546) rand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train) @@ -2329,7 +2364,7 @@ The holdout data can be predicted: ```{r} #| label: predict-grf-rand-forest-classification -#| eval: false +#| eval: !expr 'run_grf' predict(rand_forest_fit, type = "class", new_data = bin_test) predict(rand_forest_fit, type = "prob", new_data = bin_test) predict(rand_forest_fit, type = "conf_int", new_data = bin_test) @@ -2782,6 +2817,24 @@ count_train <- bake(count_rec, new_data = NULL) count_test <- bake(count_rec, new_data = testing(count_split)) ``` +Finally, we have some models that handle hierarchical data, where some rows are statistically correlated with other rows. For these examples, we'll use a data set that models body weights as a function of time for several "subjects" (rats, actually). We'll split these data in a way where all rows for a specific subject are either in the training or test sets: + +```{r} +#| label: reg-hierarchical-data +set.seed(224) +reg_group_split <- + nlme::BodyWeight |> + # Get rid of some extra attributes added by the nlme package + as_tibble() |> + # Convert to an _unordered_ factor + mutate(Rat = factor(as.character(Rat))) |> + group_initial_split(group = Rat) +reg_group_train <- training(reg_group_split) +reg_group_test <- testing(reg_group_split) +``` + +There are `r length(unique(reg_group_train$Rat))` subjects in the training set and `r length(unique(reg_group_test$Rat))` in the test set. + If using the **Apache Spark** engine, we will need to identify the data source, and then use it to create the splits. For this article, we will copy the `concrete` data set into the Spark session. 
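+
+A minimal sketch of that setup (the connection options, table name, and split proportions here are assumptions, not values from this article):
+
+```{r}
+#| label: spark-data-sketch
+#| eval: false
+library(sparklyr)
+
+# Connect to a local Spark session and copy the concrete data into it
+sc <- spark_connect(master = "local")
+concrete_tbl <- copy_to(sc, concrete, name = "concrete", overwrite = TRUE)
+
+# sdf_random_split() returns a named list of Spark tables; these become the
+# tbl_reg$training and tbl_reg$test objects used by the Spark engines below
+tbl_reg <- sdf_random_split(concrete_tbl, training = 0.9, test = 0.1, seed = 355)
+```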
@@ -3004,6 +3057,7 @@ This engine requires the bonsai extension package, so let's load this first: ```{r} #| label: load-catboost-boost-tree-regression-bonsai #| output: false +#| eval: !expr 'run_catboost' library(bonsai) ``` @@ -3021,6 +3075,7 @@ Now we create the model fit object: ```{r} #| label: fit-catboost-boost-tree-regression +#| eval: !expr 'run_catboost' # Set the random number seed to an integer for reproducibility: set.seed(557) boost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train) @@ -3031,6 +3086,7 @@ The holdout data can be predicted: ```{r} #| label: predict-catboost-boost-tree-regression +#| eval: !expr 'run_catboost' predict(boost_tree_fit, new_data = reg_test) ``` @@ -3305,7 +3361,7 @@ Now we create the model fit object: ```{r} #| label: fit-spark-decision-tree-regression -#| eval: !expr 'run_spark' +#| eval: false decision_tree_fit <- decision_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training) decision_tree_fit ``` @@ -3439,8 +3495,9 @@ Now we create the model fit object: ```{r} #| label: fit-gee-linear-reg-regression -#| eval: false -linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit <- + linear_reg_spec |> + fit(weight ~ Time + Diet + id_var(Rat), data = reg_group_train) linear_reg_fit ``` @@ -3448,8 +3505,7 @@ The holdout data can be predicted: ```{r} #| label: predict-gee-linear-reg-regression -#| eval: false -predict(linear_reg_fit, new_data = reg_test) +predict(linear_reg_fit, new_data = reg_group_test) ``` ## `glm` @@ -3493,7 +3549,6 @@ We create a model specification via: ```{r} #| label: spec-glmer-linear-reg-regression -#| eval: false linear_reg_spec <- linear_reg() |> # This engine works with a single mode so no need to set that set_engine("glmer") @@ -3503,8 +3558,9 @@ Now we create the model fit object: ```{r} #| label: fit-glmer-linear-reg-regression -#| eval: false -linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit <- + linear_reg_spec |> + fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train) linear_reg_fit ``` @@ -3512,7 +3568,7 @@ The holdout data can be predicted: ```{r} #| label: predict-glmer-linear-reg-regression -predict(linear_reg_fit, new_data = reg_test) +predict(linear_reg_fit, new_data = reg_group_test) ``` ## `glmnet` @@ -3557,15 +3613,16 @@ We create a model specification via: #| label: spec-gls-linear-reg-regression linear_reg_spec <- linear_reg() |> # This engine works with a single mode so no need to set that - set_engine("gls") + # Also, nlme::gls() specifies the random effects outside of the formula so + # we set that as an engine parameter + set_engine("gls", correlation = nlme::corCompSymm(form = ~Time|Rat)) ``` Now we create the model fit object: ```{r} #| label: fit-gls-linear-reg-regression -#| eval: false -linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train) +linear_reg_fit <- linear_reg_spec |> fit(weight ~ Time + Diet, data = reg_group_train) linear_reg_fit ``` @@ -3573,8 +3630,7 @@ The holdout data can be predicted: ```{r} #| label: predict-gls-linear-reg-regression -#| eval: false -predict(linear_reg_fit, new_data = reg_test) +predict(linear_reg_fit, new_data = reg_group_test) ``` ## `h2o` @@ -3629,18 +3685,25 @@ Now we create the model fit object: ```{r} #| label: fit-keras-linear-reg-regression -#| eval: false +#| eval: !expr 'run_keras' +#| results: hide # Set the random number seed to an integer for reproducibility: set.seed(596) linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., 
                                         data = reg_train)
linear_reg_fit
```

+```{r}
+#| label: fit-keras-linear-reg-regression-print
+#| eval: !expr 'run_keras'
+linear_reg_fit
+```
+
The holdout data can be predicted:

```{r}
#| label: predict-keras-linear-reg-regression
-#| eval: false
+#| eval: !expr 'run_keras'
predict(linear_reg_fit, new_data = reg_test)
```

@@ -3659,16 +3722,17 @@ We create a model specification via:

```{r}
#| label: spec-lme-linear-reg-regression
linear_reg_spec <- linear_reg() |>
-  # This engine works with a single mode so no need to set that
-  set_engine("lme")
+  # This engine works with a single mode so no need to set that.
+  # nlme::lme() makes us set the random effects outside of the formula so we
+  # add it as an engine parameter.
+  set_engine("lme", random = ~ Time | Rat)
```

Now we create the model fit object:

```{r}
#| label: fit-lme-linear-reg-regression
-#| eval: false
-linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)
+linear_reg_fit <- linear_reg_spec |> fit(weight ~ Diet + Time, data = reg_group_train)
linear_reg_fit
```

@@ -3676,8 +3740,7 @@ The holdout data can be predicted:

```{r}
#| label: predict-lme-linear-reg-regression
-#| eval: false
-predict(linear_reg_fit, new_data = reg_test)
+predict(linear_reg_fit, new_data = reg_group_test)
```

## `lmer`

@@ -3703,8 +3766,9 @@ Now we create the model fit object:

```{r}
#| label: fit-lmer-linear-reg-regression
-#| eval: false
-linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)
+linear_reg_fit <-
+  linear_reg_spec |>
+  fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)
linear_reg_fit
```

@@ -3712,8 +3776,7 @@ The holdout data can be predicted:

```{r}
#| label: predict-lmer-linear-reg-regression
-#| eval: false
-predict(linear_reg_fit, new_data = reg_test)
+predict(linear_reg_fit, new_data = reg_group_test)
```

## `stan`

@@ -3733,7 +3796,7 @@ Now we create the model fit object:

#| label: fit-stan-linear-reg-regression
# Set the random number seed to an integer for reproducibility:
set.seed(357)
-linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)
+linear_reg_fit <- linear_reg_spec |> fit(weight ~ Diet + Time, data = reg_group_train)
linear_reg_fit
```

@@ -3741,9 +3804,9 @@ The holdout data can be predicted:

```{r}
#| label: predict-stan-linear-reg-regression
-predict(linear_reg_fit, new_data = reg_test)
-predict(linear_reg_fit, type = "conf_int", new_data = reg_test)
-predict(linear_reg_fit, type = "pred_int", new_data = reg_test)
+predict(linear_reg_fit, new_data = reg_group_test)
+predict(linear_reg_fit, type = "conf_int", new_data = reg_group_test)
+predict(linear_reg_fit, type = "pred_int", new_data = reg_group_test)
```

## `stan_glmer`

@@ -3769,10 +3832,11 @@ Now we create the model fit object:

```{r}
#| label: fit-stan-glmer-linear-reg-regression
-#| eval: false
# Set the random number seed to an integer for reproducibility:
set.seed(895)
-linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)
+linear_reg_fit <-
+  linear_reg_spec |>
+  fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)
linear_reg_fit
```

@@ -3780,9 +3844,8 @@ The holdout data can be predicted:

```{r}
#| label: predict-stan-glmer-linear-reg-regression
-#| eval: false
-predict(linear_reg_fit, new_data = reg_test)
-predict(linear_reg_fit, type = "pred_int", new_data = reg_test)
+predict(linear_reg_fit, new_data = reg_group_test)
+predict(linear_reg_fit, type = "pred_int", new_data = reg_group_test)
```

## `spark`

@@ -3995,18 +4058,24 @@ Now we create the model fit object:

```{r}
#| label: fit-keras-mlp-regression
-#| eval: false
+#| eval: !expr 'run_keras'
+#| results: hide
# Set the random number seed to an integer for reproducibility:
set.seed(879)
mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)
-mlp_fit
+```
+
+```{r}
+#| label: fit-keras-mlp-regression-print
+#| eval: !expr 'run_keras'
+mlp_fit
+```

The holdout data can be predicted:

```{r}
#| label: predict-keras-mlp-regression
-#| eval: false
+#| eval: !expr 'run_keras'
predict(mlp_fit, new_data = reg_test)
```

@@ -4177,7 +4246,9 @@ Now we create the model fit object:

```{r}
#| label: fit-gee-poisson-reg-regression
#| eval: false
-poisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train)
+poisson_reg_fit <-
+  poisson_reg_spec |>
+  fit(weight ~ Diet + Time + id_var(Rat), data = reg_group_train)
poisson_reg_fit
```

@@ -4185,8 +4256,8 @@ The holdout data can be predicted:

```{r}
#| label: predict-gee-poisson-reg-regression
-#| eval: false
-predict(poisson_reg_fit, new_data = reg_test)
+# Can't reproduce this:
+# predict(poisson_reg_fit, new_data = reg_group_test)
```

## `glmer`

@@ -4212,10 +4283,11 @@ Now we create the model fit object:

```{r}
#| label: fit-glmer-poisson-reg-regression
-#| eval: false
# Set the random number seed to an integer for reproducibility:
set.seed(826)
-poisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train)
+poisson_reg_fit <-
+  poisson_reg_spec |>
+  fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)
poisson_reg_fit
```

@@ -4223,8 +4295,7 @@ The holdout data can be predicted:

```{r}
#| label: predict-glmer-poisson-reg-regression
-#| eval: false
-predict(poisson_reg_fit, new_data = reg_test)
+predict(poisson_reg_fit, new_data = reg_group_test)
```

## `glmnet`

@@ -4355,10 +4426,11 @@ Now we create the model fit object:

```{r}
#| label: fit-stan-poisson-reg-regression
-#| eval: false
# Set the random number seed to an integer for reproducibility:
set.seed(213)
-poisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train)
+poisson_reg_fit <-
+  poisson_reg_spec |>
+  fit(weight ~ Diet + Time, data = reg_group_train)
poisson_reg_fit
```

@@ -4366,10 +4438,9 @@ The holdout data can be predicted:

```{r}
#| label: predict-stan-poisson-reg-regression
-#| eval: false
-predict(poisson_reg_fit, new_data = reg_test)
-predict(poisson_reg_fit, type = "conf_int", new_data = reg_test)
-predict(poisson_reg_fit, type = "pred_int", new_data = reg_test)
+predict(poisson_reg_fit, new_data = reg_group_test)
+predict(poisson_reg_fit, type = "conf_int", new_data = reg_group_test)
+predict(poisson_reg_fit, type = "pred_int", new_data = reg_group_test)
```

## `stan_glmer`

@@ -4395,10 +4466,11 @@ Now we create the model fit object:

```{r}
#| label: fit-stan-glmer-poisson-reg-regression
-#| eval: false
# Set the random number seed to an integer for reproducibility:
set.seed(690)
-poisson_reg_fit <- poisson_reg_spec |> fit(strength ~ ., data = reg_train)
+poisson_reg_fit <-
+  poisson_reg_spec |>
+  fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)
poisson_reg_fit
```

@@ -4406,9 +4478,8 @@ The holdout data can be predicted:

```{r}
#| label: predict-stan-glmer-poisson-reg-regression
-#| eval: false
-predict(poisson_reg_fit, new_data = reg_test)
-predict(poisson_reg_fit, type = "pred_int", new_data = reg_test)
+predict(poisson_reg_fit, new_data = reg_group_test)
+predict(poisson_reg_fit, type = "pred_int", new_data = reg_group_test)
```

## `zeroinfl`

@@ -4527,7 +4598,7 @@ We create a model specification via:

```{r}
#| label: 
spec-grf-rand-forest-regression -#| eval: false +#| eval: !expr 'run_grf' rand_forest_spec <- rand_forest() |> # We need to set the mode since this engine works with multiple modes set_mode("regression") |> @@ -4538,7 +4609,7 @@ Now we create the model fit object: ```{r} #| label: fit-grf-rand-forest-regression -#| eval: false +#| eval: !expr 'run_grf' # Set the random number seed to an integer for reproducibility: set.seed(130) rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train) @@ -4549,7 +4620,7 @@ The holdout data can be predicted: ```{r} #| label: predict-grf-rand-forest-regression -#| eval: false +#| eval: !expr 'run_grf' predict(rand_forest_fit, new_data = reg_test) predict(rand_forest_fit, type = "conf_int", new_data = reg_test) ``` @@ -5586,6 +5657,7 @@ We create a model specification via: ```{r} #| label: spec-grf-rand-forest-quantile-regression +#| eval: !expr 'run_grf' rand_forest_spec <- rand_forest() |> set_engine("grf") |> set_mode("quantile regression", quantile_levels = qnt_lvls) @@ -5595,6 +5667,7 @@ Now we create the model fit object: ```{r} #| label: fit-grf-rand-forest-quantile-regression +#| eval: !expr 'run_grf' # Set the random number seed to an integer for reproducibility: set.seed(435) rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = qnt_train) @@ -5605,6 +5678,7 @@ The holdout data can be predicted: ```{r} #| label: predict-grf-rand-forest-quantile-regression +#| eval: !expr 'run_grf' predict(rand_forest_fit, type = "quantile", new_data = qnt_test) ``` @@ -5612,6 +5686,7 @@ Each row of predictions has a special vector class containing all of the quantil ```{r} #| label: predict-grf-rand-forest-quantile-regression-expand +#| eval: !expr 'run_grf' rand_forest_fit |> predict(type = "quantile", new_data = qnt_test)|> slice(1) |> From d3f771276db2fddfbdc7005d1d1ae454669cbf4b Mon Sep 17 00:00:00 2001 From: topepo Date: Fri, 21 Nov 2025 15:41:01 -0500 Subject: [PATCH 21/23] hierarchical classification example + small updates --- learn/models/parsnip-predictions/index.qmd | 129 ++++++++------------- 1 file changed, 48 insertions(+), 81 deletions(-) diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd index bc23992a..4722d177 100644 --- a/learn/models/parsnip-predictions/index.qmd +++ b/learn/models/parsnip-predictions/index.qmd @@ -60,7 +60,7 @@ This page shows examples of how to *fit* and *predict* with different combinatio We'll break the examples up by their mode. For each model, we'll show different data sets used across the different engines. -`r article_req_pkgs(pkgs)` There are numerous other "engine" packages that are required. If you use a model that is missing one or more installed packages, parsnip will prompt you to install them. There are some packages that require non-standard installation or rely on external dependencies. We'll descrine these next. +`r article_req_pkgs(pkgs)` There are numerous other "engine" packages that are required. If you use a model that is missing one or more installed packages, parsnip will prompt you to install them. There are some packages that require non-standard installation or rely on external dependencies. We'll describe these next. ## External Dependencies @@ -124,7 +124,7 @@ fit_params ### Apache Spark -To use [Apache Spark](https://spark.apache.org/) as an engine, we will first install Spark and then need a connection to a cluster. For this article, we will setup and use a single-node Spark cluster running on a laptop. 
+To use [Apache Spark](https://spark.apache.org/) as an engine, we will first install Spark and then connect to a cluster. For this article, we will set up and use a single-node Spark cluster running on a laptop.

To install, first install sparklyr:

@@ -154,7 +154,7 @@ sc <- spark_connect("local")

### h2o

-h2o.ai offers a Java based high performance computing server for machine learning. This can be run locally or externally. There are general installation instructions at [https://docs.h2o.ai/](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html). There is a package on CRAN but you can also install directly from [h2o](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html#install-in-r) via:
+h2o.ai offers a Java-based high-performance computing server for machine learning. This can be run locally or externally. There are general installation instructions at [https://docs.h2o.ai/](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html). There is a package on CRAN, but you can also install directly from [h2o](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html#install-in-r) via:

```{r}
#| label: h2o-download
@@ -168,7 +168,7 @@ install.packages(

After installation is complete, you can start a local server via `h2o::h2o.init()`.

-The tidymodels [agua](https://agua.tidymodels.org/) package has some helpers and will also need to be installed. You can use its function to start a server too:
+The tidymodels [agua](https://agua.tidymodels.org/) package contains some helpers and will also need to be installed. You can use its `h2o_start()` function to start a server too:

```{r}
#| label: h2o-init
@@ -197,7 +197,6 @@ keras3::install_keras(backend = "tensorflow")

There are other options for installation. See [https://tensorflow.rstudio.com/install/index.html](https://tensorflow.rstudio.com/install/index.html) for more details.

-
```{r}
#| label: setup-keras
#| eval: !expr 'run_keras'
@@ -218,10 +217,6 @@ Choosing "Yes" will do the _one-time_ installation.

To get started, let's load the tidymodels package:

-todo
-
-- multielvel examples
-
```{r}
#| label: load-tm
library(tidymodels)
@@ -271,10 +266,21 @@ mtl_train <- training(mtl_split)
mtl_test <- testing(mtl_split)
```

-If using the **Apache Spark** engine, we will need to identify the data source,
-and then use it to create the splits. For this article, we will copy the
-`two_class_dat` and the `mtl_data` data sets into the Spark session.
+Finally, we have some models that handle hierarchical data, where some rows are statistically correlated with other rows. For these examples, we'll use data from a clinical trial where patients were followed over time. The outcome is binary. The data are in the HSAUR3 package. We'll split these data so that all rows for a given subject fall in either the training set or the test set:
+
+```{r}
+#| label: cls-hierarchical-data
+set.seed(72)
+cls_group_split <-
+  HSAUR3::toenail |>
+  group_initial_split(group = patientID)
+cls_group_train <- training(cls_group_split)
+cls_group_test <- testing(cls_group_split)
+```
+
+There are `r length(unique(cls_group_train$patientID))` subjects in the training set and `r length(unique(cls_group_test$patientID))` in the test set.
+
+If using the **Apache Spark** engine, we will need to identify the data source and then use it to create the splits. For this article, we will copy the `two_class_dat` and the `mtl_data` data sets into the Spark session.
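+
+If you are running these chunks interactively, it can be worth first verifying that the connection from the setup section is still alive before copying data over; a quick check might look like this (`connection_is_open()` is sparklyr's helper for exactly that):
+
+```{r}
+#| label: spark-connection-check
+#| eval: false
+# Returns TRUE while the local Spark connection `sc` is still usable
+sparklyr::connection_is_open(sc)
+```
+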
```{r} #| label: spark-bin-data @@ -1337,6 +1343,7 @@ This engine requires the multilevelmod extension package, so let's load this fir #| label: load-gee-logistic-reg-classification-multilevelmod #| output: false library(multilevelmod) + ``` We create a model specification via: @@ -1352,8 +1359,9 @@ Now we create the model fit object: ```{r} #| label: fit-gee-logistic-reg-classification -#| eval: false -logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) +logistic_reg_fit <- + logistic_reg_spec |> + fit(outcome ~ treatment * visit + id_var(patientID), data = cls_group_train) logistic_reg_fit ``` @@ -1361,9 +1369,8 @@ The holdout data can be predicted: ```{r} #| label: predict-gee-logistic-reg-classification -#| eval: false -predict(logistic_reg_fit, type = "class", new_data = bin_test) -predict(logistic_reg_fit, type = "prob", new_data = bin_test) +predict(logistic_reg_fit, type = "class", new_data = cls_group_test) +predict(logistic_reg_fit, type = "prob", new_data = cls_group_test) ``` ## `glmer` @@ -1380,7 +1387,6 @@ We create a model specification via: ```{r} #| label: spec-glmer-logistic-reg-classification -#| eval: false logistic_reg_spec <- logistic_reg() |> # This engine works with a single mode so no need to set that set_engine("glmer") @@ -1390,8 +1396,9 @@ Now we create the model fit object: ```{r} #| label: fit-glmer-logistic-reg-classification -#| eval: false -logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) +logistic_reg_fit <- + logistic_reg_spec |> + fit(outcome ~ treatment * visit + (1 | patientID), data = cls_group_train) logistic_reg_fit ``` @@ -1399,8 +1406,8 @@ The holdout data can be predicted: ```{r} #| label: predict-glmer-logistic-reg-classification -predict(logistic_reg_fit, type = "class", new_data = bin_test) -predict(logistic_reg_fit, type = "prob", new_data = bin_test) +predict(logistic_reg_fit, type = "class", new_data = cls_group_test) +predict(logistic_reg_fit, type = "prob", new_data = cls_group_test) ``` ## `glmnet` @@ -1549,18 +1556,20 @@ Now we create the model fit object: #| label: fit-stan-logistic-reg-classification # Set the random number seed to an integer for reproducibility: set.seed(96) -logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) -logistic_reg_fit +logistic_reg_fit <- + logistic_reg_spec |> + fit(outcome ~ treatment * visit, data = cls_group_train) +logistic_reg_fit |> print(digits = 3) ``` The holdout data can be predicted: ```{r} #| label: predict-stan-logistic-reg-classification -predict(logistic_reg_fit, type = "class", new_data = bin_test) -predict(logistic_reg_fit, type = "prob", new_data = bin_test) -predict(logistic_reg_fit, type = "conf_int", new_data = bin_test) -predict(logistic_reg_fit, type = "pred_int", new_data = bin_test) +predict(logistic_reg_fit, type = "class", new_data = cls_group_test) +predict(logistic_reg_fit, type = "prob", new_data = cls_group_test) +predict(logistic_reg_fit, type = "conf_int", new_data = cls_group_test) +predict(logistic_reg_fit, type = "pred_int", new_data = cls_group_test) ``` ## `stan_glmer` @@ -1586,22 +1595,22 @@ Now we create the model fit object: ```{r} #| label: fit-stan-glmer-logistic-reg-classification -#| eval: false # Set the random number seed to an integer for reproducibility: set.seed(484) -logistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train) -logistic_reg_fit +logistic_reg_fit <- + logistic_reg_spec |> + fit(outcome ~ treatment * visit + (1 | patientID), data = cls_group_train) 
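+  # Above, (1 | patientID) is lme4-style notation for a per-patient random
+  # intercept, so the repeated visits on a given patient share a baseline.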
+logistic_reg_fit |> print(digits = 3)
```

The holdout data can be predicted:

```{r}
#| label: predict-stan-glmer-logistic-reg-classification
-#| eval: false
-predict(logistic_reg_fit, type = "class", new_data = bin_test)
-predict(logistic_reg_fit, type = "prob", new_data = bin_test)
-predict(logistic_reg_fit, type = "conf_int", new_data = bin_test)
-predict(logistic_reg_fit, type = "pred_int", new_data = bin_test)
+predict(logistic_reg_fit, type = "class", new_data = cls_group_test)
+predict(logistic_reg_fit, type = "prob", new_data = cls_group_test)
+predict(logistic_reg_fit, type = "conf_int", new_data = cls_group_test)
+predict(logistic_reg_fit, type = "pred_int", new_data = cls_group_test)
```

## `spark`

@@ -1628,7 +1637,7 @@ The holdout data can be predicted:

```{r}
#| label: predict-spark-logistic-reg-classification
-#| eval: false
+#| eval: !expr 'run_spark'
predict(logistic_reg_fit, type = "class", new_data = tbl_bin$test)
predict(logistic_reg_fit, type = "prob", new_data = tbl_bin$test)
```

@@ -2733,49 +2742,10 @@ predict(svm_rbf_fit, type = "class", new_data = bin_test)
predict(svm_rbf_fit, type = "prob", new_data = bin_test)
```

-## `liquidSVM`
-
-Note that this package is not on CRAN. You can install it via its :
-
-```{r}
-#| label: install-liquidSVM
-#| eval: false
-pak::pak("cran/liquidSVM") # fails
-```
-
-We create a model specification via:
-
-```{r}
-#| label: spec-liquidSVM-svm-rbf-classification
-svm_rbf_spec <- svm_rbf() |>
-  # We need to set the mode since this engine works with multiple modes
-  set_mode("classification") |>
-  set_engine("liquidSVM")
-```
-
-Now we create the model fit object:
-
-```{r}
-#| label: fit-liquidSVM-svm-rbf-classification
-#| eval: false
-svm_rbf_fit <- svm_rbf_spec |> fit(class ~ ., data = bin_train)
-svm_rbf_fit
-```
-
-The holdout data can be predicted:
-
-```{r}
-#| label: predict-liquidSVM-svm-rbf-classification
-#| eval: false
-predict(svm_rbf_fit, type = "class", new_data = bin_test)
-predict(svm_rbf_fit, type = "prob", new_data = bin_test)
-```
-
:::

# Regression Models

-
To demonstrate regression, we'll subset some data, make a training/test split, and standardize the predictors:

```{r}
@@ -2797,7 +2767,7 @@ reg_train <- bake(reg_rec, new_data = NULL)
reg_test <- bake(reg_rec, new_data = testing(reg_split))
```

-We also have some models that are specific to integer count outcomes. The data for these are:
+We also have models that are specifically designed for integer count outcomes. The data for these are:

```{r}
#| label: count-data
@@ -2835,9 +2805,7 @@ reg_group_test <- testing(reg_group_split)

There are `r length(unique(reg_group_train$Rat))` subjects in the training set and `r length(unique(reg_group_test$Rat))` in the test set.

-If using the **Apache Spark** engine, we will need to identify the data source,
-and then use it to create the splits. For this article, we will copy the
-`concrete` data set into the Spark session.
+If using the **Apache Spark** engine, we will need to identify the data source and then use it to create the splits. For this article, we will copy the `concrete` data set into the Spark session.
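+
+If the connection from the classification examples has since been closed, it can be re-opened before copying the data; a minimal sketch, reusing the `sc` object from the setup section:
+
+```{r}
+#| label: spark-reconnect
+#| eval: false
+# Reconnect only when the earlier local connection is no longer open
+if (!sparklyr::connection_is_open(sc)) {
+  sc <- sparklyr::spark_connect(master = "local")
+}
+```
+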
```{r} #| label: spark-reg-data @@ -4245,7 +4213,6 @@ Now we create the model fit object: ```{r} #| label: fit-gee-poisson-reg-regression -#| eval: false poisson_reg_fit <- poisson_reg_spec |> fit(weight ~ Diet + Time + id_var(Rat), data = reg_group_train) From 4f2afc287fae41fbae18c7ee05f2d8cca14fd343 Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Tue, 25 Nov 2025 13:56:04 -0600 Subject: [PATCH 22/23] Updates freeze, adds needed packages to install.R --- .../index/execute-results/html.json | 8 +++++--- installs.R | 10 +++++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/_freeze/learn/models/parsnip-predictions/index/execute-results/html.json b/_freeze/learn/models/parsnip-predictions/index/execute-results/html.json index 1cc55f8c..2236c48a 100644 --- a/_freeze/learn/models/parsnip-predictions/index/execute-results/html.json +++ b/_freeze/learn/models/parsnip-predictions/index/execute-results/html.json @@ -1,9 +1,11 @@ { - "hash": "3ab2d8887c852419f30d5a7b3fc00f6a", + "hash": "dff776495a405e30e27b4af8898dacff", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Fitting and predicting with parsnip\"\ncategories:\n - model fitting\n - parsnip\n - regression\n - classification\ntype: learn-subsection\nweight: 1\ndescription: | \n Examples that show how to fit and predict with different combinations of model, mode, and engine.\ntoc: true\ntoc-depth: 3\ninclude-after-body: ../../../resources.html\nexecute: \n eval: true\n---\n\n\n\n\n\n\n## Introduction\n\nTo use code in this article, you will need to install the following packages: agua, baguette, bonsai, censored, discrim, multilevelmod, plsmod, poissonreg, rules, sparklyr, and tidymodels.\n\nThese examples show how to *fit* and *predict* with different combinations of model, mode, and engine. As a reminder, in parsnip, \n\n- the **model type** differentiates basic modeling approaches, such as random forests, logistic regression, linear support vector machines, etc.,\n\n- the **mode** denotes in what kind of modeling context it will be used (most commonly, classification or regression), and\n\n- the computational **engine** indicates how the model is fit, such as with a specific R package implementation or even methods outside of R like Keras or Stan.\n\nThe following examples use consistent data sets throughout. \n\ntodo \n\n- multielvel examples \n- get automl working\n- expand survival prediction tibbles\n- keras3 updates\n- use `
` for long model prints\n- avoid subsection titles capitalizing the engine name (e.g., \"CATBOOST\") and text within backticks\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(tidymodels)\ntheme_set(theme_bw() + theme(legend.position = \"top\"))\n```\n:::\n\n\n### Apache Spark\n\nTo use [Apache Spark](https://spark.apache.org/) as an engine, we will first \nneed a connection to a cluster. For this article, we will setup and use a \nsingle-node Spark cluster running on a laptop:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nsc <- spark_connect(\"local\", version = \"4.0.1\")\n```\n:::\n\n\n\n# Classification Models\n\nTo demonstrate classification, let's make a small training and test sets for a binary outcome. We'll center and scale the data since some models require the same units.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(207)\nbin_split <- \n\tmodeldata::two_class_dat |> \n\trename(class = Class) |> \n\tinitial_split(prop = 0.994, strata = class)\nbin_split\n#> \n#> <785/6/791>\n\nbin_rec <- \n recipe(class ~ ., data = training(bin_split)) |> \n step_normalize(all_numeric_predictors()) |> \n prep()\n\nbin_train <- bake(bin_rec, new_data = NULL)\nbin_test <- bake(bin_rec, new_data = testing(bin_split))\n```\n:::\n\n\nFor models that _only_ work for three or more classes, we'll simulate:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(1752)\nmtl_data <-\n sim_multinomial(\n 200,\n ~ -0.5 + 0.6 * abs(A),\n ~ ifelse(A > 0 & B > 0, 1.0 + 0.2 * A / B, - 2),\n ~ A + B - A * B)\n\nmtl_split <- initial_split(mtl_data, prop = 0.967, strata = class)\nmtl_split\n#> \n#> <192/8/200>\n\n# Predictors are in the same units\nmtl_train <- training(mtl_split)\nmtl_test <- testing(mtl_split)\n```\n:::\n\n\nIf using the **Apache Spark** engine, we will need to identify the data source, \nand then use it to create the splits. 
For this article, we will copy the \n`two_class_dat` and the `mtl_data` data sets into the Spark session.\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ntbl_two_class <- copy_to(sc, modeldata::two_class_dat)\n\ntbl_bin <- sdf_random_split(tbl_two_class, training = 0.994, test = 1-0.994, seed = 100)\n\ntbl_sim_mtl <- copy_to(sc, mtl_data)\n\ntbl_mtl <- sdf_random_split(tbl_sim_mtl, training = 0.967, test = 1-0.967, seed = 100)\n```\n:::\n\n\n\n\n## Auto Ml (`auto_ml()`) \n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n\n# and initialize a server\nh20_server <- agua::h2o_start()\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nauto_ml_spec <- auto_ml() |>\n # We dont need to set the engine (since there is only one) but we'll set\n # a time limit\n set_engine(\"h2o\", max_runtime_secs = 60 * 3) |> \n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nauto_ml_fit <- auto_ml_spec |> fit(class ~ ., data = bin_train)\nauto_ml_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(auto_ml_fit, type = \"class\", new_data = bin_test)\npredict(auto_ml_fit, type = \"prob\", new_data = bin_test)\n```\n:::\n\n\n## Bagged MARS (`bag_mars()`) \n\n## `earth` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_spec <- bag_mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_fit <- bag_mars_spec |> fit(class ~ ., data = bin_train)\n#> \n#> Attaching package: 'plotrix'\n#> The following object is masked from 'package:scales':\n#> \n#> rescale\n#> Warning: There was 1 warning in `dplyr::mutate()`.\n#> ℹ In argument: `model = iter(...)`.\n#> Caused by warning:\n#> ! 
glm.fit: fitted probabilities numerically 0 or 1 occurred\n#> Registered S3 method overwritten by 'butcher':\n#> method from \n#> as.character.dev_topic generics\nbag_mars_fit\n#> parsnip model object\n#> \n#> Bagged MARS (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 100 0 11\n#> 2 A 40.8 1.22 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mars_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_mars_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.444 0.556 \n#> 2 0.860 0.140 \n#> 3 0.458 0.542 \n#> 4 0.950 0.0497\n#> 5 0.941 0.0593\n#> 6 0.868 0.132\n```\n:::\n\n\n## Bagged Neural Networks (`bag_mlp()`) \n\n## `nnet` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_spec <- bag_mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_fit <- bag_mlp_spec |> fit(class ~ ., data = bin_train)\nbag_mlp_fit\n#> parsnip model object\n#> \n#> Bagged nnet (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 55.1 1.98 11\n#> 2 A 44.9 1.98 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.421 0.579\n#> 2 0.655 0.345\n#> 3 0.429 0.571\n#> 4 0.727 0.273\n#> 5 0.716 0.284\n#> 6 0.700 0.300\n```\n:::\n\n\n## Bagged Decision Trees (`bag_tree()`) \n\n## `C5.0` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged C5.0 (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 100 0 11\n#> 2 A 58.9 6.71 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 
\n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.450 0.550 \n#> 2 0.825 0.175 \n#> 3 0.322 0.678 \n#> 4 0.911 0.0892\n#> 5 0.911 0.0892\n#> 6 0.710 0.290\n```\n:::\n\n\n## `rpart` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged CART (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 275. 3.21 11\n#> 2 A 239. 4.04 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.0909 0.909\n#> 2 1 0 \n#> 3 0 1 \n#> 4 1 0 \n#> 5 0.727 0.273\n#> 6 1 0\n```\n:::\n\n\n## Bayesian Additive Regression Trees (`bart()`) \n\n## `dbarts` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_spec <- bart() |>\n # We need to set the mode since this engine works with multiple modes\n # and dbarts is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_fit <- bart_spec |> fit(class ~ ., data = bin_train)\nbart_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> `NULL`()\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bart_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bart_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.427 0.573\n#> 2 0.744 0.256\n#> 3 0.375 0.625\n#> 4 0.951 0.049\n#> 5 0.922 0.078\n#> 6 0.786 0.214\npredict(bart_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_lower_Class2 .pred_upper_Class1 .pred_upper_Class2\n#> \n#> 1 0.812 0.00247 0.998 0.188\n#> 2 0.785 0.0248 0.975 0.215\n#> 3 0.605 0.0713 0.929 0.395\n#> 4 0.561 0.102 0.898 0.439\n#> 5 0.251 0.340 0.660 0.749\n#> 6 0.200 0.416 0.584 0.800\npredict(bart_fit, type = \"pred_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_lower_Class2 .pred_upper_Class1 .pred_upper_Class2\n#> \n#> 1 0 0 1 1\n#> 2 0 0 1 1\n#> 3 0 0 1 1\n#> 4 0 0 1 1\n#> 5 0 0 1 1\n#> 6 0 0 1 1\n```\n:::\n\n\n## Boosted Decision Trees (`boost_tree()`) 
\n\n## `C5.0` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = 15, control = C50::C5.0Control(minCases\n#> = 2, sample = 0))\n#> \n#> Classification Tree\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Number of boosting iterations: 15 requested; 7 used due to early stopping\n#> Average tree size: 3.1 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.307 0.693\n#> 2 0.756 0.244\n#> 3 0.281 0.719\n#> 4 1 0 \n#> 5 1 0 \n#> 6 0.626 0.374\n```\n:::\n\n\n## `catboost` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"catboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> CatBoost model (1000 trees)\n#> Loss function: Logloss\n#> Fit to 2 feature(s)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.252 0.748 \n#> 2 0.839 0.161 \n#> 3 0.348 0.652 \n#> 4 0.997 0.00279\n#> 5 0.807 0.193 \n#> 6 0.884 0.116\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o_gbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: gbm\n#> Model ID: GBM_model_R_1763571327438_3826 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees 
model_size_in_bytes min_depth\n#> 1 50 50 25380 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 21 55 35.70000\n#> \n#> \n#> H2OBinomialMetrics: gbm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.007948832\n#> RMSE: 0.08915622\n#> LogLoss: 0.05942305\n#> Mean Per-Class Error: 0\n#> AUC: 1\n#> AUCPR: 1\n#> Gini: 1\n#> R^2: 0.9678452\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 434 0 0.000000 =0/434\n#> Class2 0 351 0.000000 =0/351\n#> Totals 434 351 0.000000 =0/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.598690 1.000000 200\n#> 2 max f2 0.598690 1.000000 200\n#> 3 max f0point5 0.598690 1.000000 200\n#> 4 max accuracy 0.598690 1.000000 200\n#> 5 max precision 0.998192 1.000000 0\n#> 6 max recall 0.598690 1.000000 200\n#> 7 max specificity 0.998192 1.000000 0\n#> 8 max absolute_mcc 0.598690 1.000000 200\n#> 9 max min_per_class_accuracy 0.598690 1.000000 200\n#> 10 max mean_per_class_accuracy 0.598690 1.000000 200\n#> 11 max tns 0.998192 434.000000 0\n#> 12 max fns 0.998192 349.000000 0\n#> 13 max fps 0.000831 434.000000 399\n#> 14 max tps 0.598690 351.000000 200\n#> 15 max tnr 0.998192 1.000000 0\n#> 16 max fnr 0.998192 0.994302 0\n#> 17 max fpr 0.000831 1.000000 399\n#> 18 max tpr 0.598690 1.000000 200\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.0496 0.950 \n#> 2 0.905 0.0953 \n#> 3 0.0738 0.926 \n#> 4 0.997 0.00273\n#> 5 0.979 0.0206 \n#> 6 0.878 0.122\n```\n:::\n\n\n## `h2o_gbm` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o_gbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: gbm\n#> Model ID: GBM_model_R_1763571327438_3878 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 25378 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 21 55 35.70000\n#> \n#> \n#> H2OBinomialMetrics: gbm\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.007948832\n#> RMSE: 0.08915622\n#> LogLoss: 0.05942305\n#> Mean Per-Class Error: 0\n#> AUC: 1\n#> AUCPR: 1\n#> Gini: 1\n#> R^2: 0.9678452\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 434 0 0.000000 =0/434\n#> Class2 0 351 0.000000 =0/351\n#> Totals 434 351 0.000000 =0/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.598690 1.000000 200\n#> 2 max f2 0.598690 1.000000 200\n#> 3 max f0point5 0.598690 1.000000 200\n#> 4 max accuracy 0.598690 1.000000 200\n#> 5 max precision 0.998192 1.000000 0\n#> 6 max recall 0.598690 1.000000 200\n#> 7 max specificity 0.998192 1.000000 0\n#> 8 max absolute_mcc 0.598690 1.000000 200\n#> 9 max min_per_class_accuracy 0.598690 1.000000 200\n#> 10 max mean_per_class_accuracy 0.598690 1.000000 200\n#> 11 max tns 0.998192 434.000000 0\n#> 12 max fns 0.998192 349.000000 0\n#> 13 max fps 0.000831 434.000000 399\n#> 14 max tps 0.598690 351.000000 200\n#> 15 max tnr 0.998192 1.000000 0\n#> 16 max fnr 0.998192 0.994302 0\n#> 17 max fpr 0.000831 1.000000 399\n#> 18 max tpr 0.598690 1.000000 200\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.0496 0.950 \n#> 2 0.905 0.0953 \n#> 3 0.0738 0.926 \n#> 4 0.997 0.00273\n#> 5 0.979 0.0206 \n#> 6 0.878 0.122\n```\n:::\n\n\n## `lightgbm` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"lightgbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> LightGBM Model (100 trees)\n#> Objective: binary\n#> Fitted to dataset with 2 columns\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.147 0.853 \n#> 2 0.930 0.0699\n#> 3 0.237 0.763 \n#> 4 0.990 0.0101\n#> 5 0.929 0.0714\n#> 6 0.956 0.0445\n```\n:::\n\n\n## `xgboost` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and xgboost is the default engine so there is no need to set that either.\n 
set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> ##### xgb.Booster\n#> raw: 40.4 Kb \n#> call:\n#> xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, \n#> colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, \n#> subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist, \n#> verbose = 0, nthread = 1, objective = \"binary:logistic\")\n#> params (as set within xgb.train):\n#> eta = \"0.3\", max_depth = \"6\", gamma = \"0\", colsample_bytree = \"1\", colsample_bynode = \"1\", min_child_weight = \"1\", subsample = \"1\", nthread = \"1\", objective = \"binary:logistic\", validate_parameters = \"TRUE\"\n#> xgb.attributes:\n#> niter\n#> callbacks:\n#> cb.evaluation.log()\n#> # of features: 2 \n#> niter: 15\n#> nfeatures : 2 \n#> evaluation_log:\n#> iter training_logloss\n#> \n#> 1 0.5546750\n#> 2 0.4719804\n#> --- ---\n#> 14 0.2587640\n#> 15 0.2528938\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.244 0.756 \n#> 2 0.770 0.230 \n#> 3 0.307 0.693 \n#> 4 0.944 0.0565\n#> 5 0.821 0.179 \n#> 6 0.938 0.0621\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(Class ~ ., data = tbl_bin$training)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> GBTClassificationModel: uid = gradient_boosted_trees__c61f3c19_30b0_416f_af47_e371c1aea2db, numTrees=20, numClasses=2, numFeatures=2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(boost_tree_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? 
x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.307 0.693 \n#> 2 0.292 0.708 \n#> 3 0.856 0.144 \n#> 4 0.192 0.808 \n#> 5 0.332 0.668 \n#> 6 0.952 0.0476\n#> 7 0.0865 0.914\n```\n:::\n\n\n\n## C5 Rules (`C5_rules()`) \n\n## `C5.0` Engine \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and C5.0 is the default engine so there is no need to set that either.\nC5_rules_spec <- C5_rules()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nC5_rules_fit <- C5_rules_spec |> fit(class ~ ., data = bin_train)\nC5_rules_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = trials, rules = TRUE, control\n#> = C50::C5.0Control(minCases = minCases, seed = sample.int(10^5,\n#> 1), earlyStopping = FALSE))\n#> \n#> Rule-Based Model\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Number of Rules: 4 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(C5_rules_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(C5_rules_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 1 0\n#> 2 1 0\n#> 3 0 1\n#> 4 1 0\n#> 5 1 0\n#> 6 1 0\n```\n:::\n\n\n## Decision Tree (`decision_tree()`) \n\n## `C5.0` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = 1, control = C50::C5.0Control(minCases =\n#> 2, sample = 0))\n#> \n#> Classification Tree\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Tree size: 4 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.732 0.268\n#> 2 0.846 0.154\n#> 3 0.236 0.764\n#> 4 0.846 0.154\n#> 5 0.846 0.154\n#> 6 0.846 0.154\n```\n:::\n\n\n## `partykit` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n 
set_mode(\"classification\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> class ~ A + B\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] B <= -0.06906\n#> | | [3] B <= -0.50486: Class1 (n = 291, err = 8.2%)\n#> | | [4] B > -0.50486\n#> | | | [5] A <= -0.07243: Class1 (n = 77, err = 45.5%)\n#> | | | [6] A > -0.07243: Class1 (n = 31, err = 6.5%)\n#> | [7] B > -0.06906\n#> | | [8] B <= 0.72938\n#> | | | [9] A <= 0.60196: Class2 (n = 145, err = 24.8%)\n#> | | | [10] A > 0.60196\n#> | | | | [11] B <= 0.44701: Class1 (n = 23, err = 4.3%)\n#> | | | | [12] B > 0.44701: Class1 (n = 26, err = 46.2%)\n#> | | [13] B > 0.72938: Class2 (n = 192, err = 12.5%)\n#> \n#> Number of inner nodes: 6\n#> Number of terminal nodes: 7\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.538 0.462 \n#> 2 0.935 0.0645\n#> 3 0.248 0.752 \n#> 4 0.918 0.0825\n#> 5 0.918 0.0825\n#> 6 0.935 0.0645\n```\n:::\n\n\n## `rpart` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> n= 785 \n#> \n#> node), split, n, loss, yval, (yprob)\n#> * denotes terminal node\n#> \n#> 1) root 785 351 Class1 (0.5528662 0.4471338) \n#> 2) B< -0.06526451 399 61 Class1 (0.8471178 0.1528822) *\n#> 3) B>=-0.06526451 386 96 Class2 (0.2487047 0.7512953) \n#> 6) B< 0.7339337 194 72 Class2 (0.3711340 0.6288660) \n#> 12) A>=0.6073948 49 13 Class1 (0.7346939 0.2653061) *\n#> 13) A< 0.6073948 145 36 Class2 (0.2482759 0.7517241) *\n#> 7) B>=0.7339337 192 24 Class2 (0.1250000 0.8750000) *\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.735 0.265\n#> 2 0.847 0.153\n#> 3 0.248 0.752\n#> 4 0.847 0.153\n#> 5 0.847 0.153\n#> 6 0.847 0.153\n```\n:::\n\n\n## `sparklyr` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(Class ~ ., data = 
tbl_bin$training)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> n= 784 \n#> \n#> node), split, n, loss, yval, (yprob)\n#> * denotes terminal node\n#> \n#> 1) root 784 350 Class1 (0.5535714 0.4464286) \n#> 2) B< 1.495535 401 62 Class1 (0.8453865 0.1546135) *\n#> 3) B>=1.495535 383 95 Class2 (0.2480418 0.7519582) \n#> 6) B< 2.079458 192 71 Class2 (0.3697917 0.6302083) \n#> 12) A>=2.572663 50 14 Class1 (0.7200000 0.2800000) *\n#> 13) A< 2.572663 142 35 Class2 (0.2464789 0.7535211) *\n#> 7) B>=2.079458 191 24 Class2 (0.1256545 0.8743455) *\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = tbl_bin$test)\n#> # A tibble: 7 × 1\n#> .pred_class\n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n#> 6 \n#> 7 \npredict(decision_tree_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # A tibble: 7 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.246 0.754\n#> 2 0.246 0.754\n#> 3 0.845 0.155\n#> 4 0.246 0.754\n#> 5 0.246 0.754\n#> 6 0.845 0.155\n#> 7 0.126 0.874\n```\n:::\n\n\n## Flexible Discriminant Analysis (`discrim_flexible()`) \n\n## `earth` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and earth is the default engine so there is no need to set that either.\ndiscrim_flexible_spec <- discrim_flexible()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_flexible_fit <- discrim_flexible_spec |> fit(class ~ ., data = bin_train)\ndiscrim_flexible_fit\n#> parsnip model object\n#> \n#> Call:\n#> mda::fda(formula = class ~ ., data = data, method = earth::earth)\n#> \n#> Dimension: 1 \n#> \n#> Percent Between-Group Variance Explained:\n#> v1 \n#> 100 \n#> \n#> Training Misclassification Error: 0.1707 ( N = 785 )\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_flexible_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_flexible_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.339 0.661 \n#> 2 0.848 0.152 \n#> 3 0.342 0.658 \n#> 4 0.964 0.0360\n#> 5 0.964 0.0360\n#> 6 0.875 0.125\n```\n:::\n\n\n## Linear Discriminant Analysis (`discrim_linear()`) \n\n## `MASS` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and MASS is the default engine so there is no need to set that either.\ndiscrim_linear_spec <- discrim_linear()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Call:\n#> lda(class ~ ., data = data)\n#> \n#> Prior probabilities of groups:\n#> Class1 Class2 
\n#> 0.5528662 0.4471338 \n#> \n#> Group means:\n#> A B\n#> Class1 -0.2982900 -0.5573140\n#> Class2 0.3688258 0.6891006\n#> \n#> Coefficients of linear discriminants:\n#> LD1\n#> A -0.6068479\n#> B 1.7079953\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.369 0.631 \n#> 2 0.868 0.132 \n#> 3 0.541 0.459 \n#> 4 0.984 0.0158\n#> 5 0.928 0.0718\n#> 6 0.854 0.146\n```\n:::\n\n\n## `mda` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_spec <- discrim_linear() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"mda\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Call:\n#> mda::fda(formula = class ~ ., data = data, method = mda::gen.ridge, \n#> keep.fitted = FALSE)\n#> \n#> Dimension: 1 \n#> \n#> Percent Between-Group Variance Explained:\n#> v1 \n#> 100 \n#> \n#> Degrees of Freedom (per dimension): 1.99423 \n#> \n#> Training Misclassification Error: 0.17707 ( N = 785 )\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.368 0.632 \n#> 2 0.867 0.133 \n#> 3 0.542 0.458 \n#> 4 0.984 0.0158\n#> 5 0.928 0.0718\n#> 6 0.853 0.147\n```\n:::\n\n\n## `sda` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_spec <- discrim_linear() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"sda\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> $regularization\n#> lambda lambda.var lambda.freqs \n#> 0.003136201 0.067551534 0.112819609 \n#> \n#> $freqs\n#> Class1 Class2 \n#> 0.5469019 0.4530981 \n#> \n#> $alpha\n#> Class1 Class2 \n#> -0.8934125 -1.2349286 \n#> \n#> $beta\n#> A B\n#> Class1 0.4565325 -1.298858\n#> Class2 -0.5510473 1.567757\n#> attr(,\"class\")\n#> [1] \"shrinkage\"\n#> \n#> attr(,\"class\")\n#> [1] \"sda\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # 
A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.366 0.634 \n#> 2 0.860 0.140 \n#> 3 0.536 0.464 \n#> 4 0.982 0.0176\n#> 5 0.923 0.0768\n#> 6 0.845 0.155\n```\n:::\n\n\n## `sparsediscrim` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_spec <- discrim_linear() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"sparsediscrim\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Diagonal LDA\n#> \n#> Sample Size: 785 \n#> Number of Features: 2 \n#> \n#> Classes and Prior Probabilities:\n#> Class1 (55.29%), Class2 (44.71%)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.182 0.818 \n#> 2 0.755 0.245 \n#> 3 0.552 0.448 \n#> 4 0.996 0.00372\n#> 5 0.973 0.0274 \n#> 6 0.629 0.371\n```\n:::\n\n\n## Quadratic Discriminant Analysis (`discrim_quad()`) \n\n## `MASS` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and MASS is the default engine so there is no need to set that either.\ndiscrim_quad_spec <- discrim_quad()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_fit <- discrim_quad_spec |> fit(class ~ ., data = bin_train)\ndiscrim_quad_fit\n#> parsnip model object\n#> \n#> Call:\n#> qda(class ~ ., data = data)\n#> \n#> Prior probabilities of groups:\n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> Group means:\n#> A B\n#> Class1 -0.2982900 -0.5573140\n#> Class2 0.3688258 0.6891006\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_quad_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_quad_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.340 0.660 \n#> 2 0.884 0.116 \n#> 3 0.500 0.500 \n#> 4 0.965 0.0349\n#> 5 0.895 0.105 \n#> 6 0.895 0.105\n```\n:::\n\n\n
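The linear and quadratic models differ only in whether each class gets its own covariance matrix, so their class probabilities can be usefully compared. A sketch (not evaluated in this rendering) that puts the MASS holdout probabilities from the two models side by side:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Not run here: refit the MASS LDA model and compare its holdout\n# probabilities for Class1 with those from the QDA fit above\nlda_probs <- discrim_linear() |>\n fit(class ~ ., data = bin_train) |>\n predict(type = \"prob\", new_data = bin_test)\nqda_probs <- predict(discrim_quad_fit, type = \"prob\", new_data = bin_test)\ntibble(lda = lda_probs$.pred_Class1, qda = qda_probs$.pred_Class1)\n```\n:::\n\n\n## `sparsediscrim` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification 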
via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_spec <- discrim_quad() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"sparsediscrim\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_fit <- discrim_quad_spec |> fit(class ~ ., data = bin_train)\ndiscrim_quad_fit\n#> parsnip model object\n#> \n#> Diagonal QDA\n#> \n#> Sample Size: 785 \n#> Number of Features: 2 \n#> \n#> Classes and Prior Probabilities:\n#> Class1 (55.29%), Class2 (44.71%)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_quad_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_quad_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.180 0.820 \n#> 2 0.750 0.250 \n#> 3 0.556 0.444 \n#> 4 0.994 0.00634\n#> 5 0.967 0.0328 \n#> 6 0.630 0.370\n```\n:::\n\n\n## Regularized Discriminant Analysis (`discrim_regularized()`) \n\n## `klaR` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and klaR is the default engine so there is no need to set that either.\ndiscrim_regularized_spec <- discrim_regularized()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_regularized_fit <- discrim_regularized_spec |> fit(class ~ ., data = bin_train)\ndiscrim_regularized_fit\n#> parsnip model object\n#> \n#> Call: \n#> rda(formula = class ~ ., data = data)\n#> \n#> Regularization parameters: \n#> gamma lambda \n#> 5.344614e-15 1.032850e-02 \n#> \n#> Prior probabilities of groups: \n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> Misclassification rate: \n#> apparent: 17.707 %\n#> cross-validated: 17.844 %\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_regularized_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_regularized_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.340 0.660 \n#> 2 0.884 0.116 \n#> 3 0.501 0.499 \n#> 4 0.965 0.0346\n#> 5 0.895 0.105 \n#> 6 0.895 0.105\n```\n:::\n\n\n## Generalized Additive Models (`gen_additive_mod()`) \n\n## `mgcv` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_spec <- gen_additive_mod() |>\n # We need to set the mode since this engine works with multiple modes\n # and mgcv is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_fit <- \n gen_additive_mod_spec |> \n fit(class ~ s(A) + s(B), data = bin_train)\ngen_additive_mod_fit\n#> parsnip model object\n#> \n#> \n#> Family: binomial \n#> Link function: 
logit \n#> \n#> Formula:\n#> class ~ s(A) + s(B)\n#> \n#> Estimated degrees of freedom:\n#> 2.76 4.22 total = 7.98 \n#> \n#> UBRE score: -0.153537\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(gen_additive_mod_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(gen_additive_mod_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.400 0.600 \n#> 2 0.826 0.174 \n#> 3 0.454 0.546 \n#> 4 0.975 0.0250\n#> 5 0.929 0.0711\n#> 6 0.829 0.171\npredict(gen_additive_mod_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0.304 0.504 0.496 0.696\n#> 2 0.739 0.889 0.111 0.261\n#> 3 0.364 0.546 0.454 0.636\n#> 4 0.846 0.996 0.00358 0.154\n#> 5 0.881 0.958 0.0416 0.119\n#> 6 0.735 0.894 0.106 0.265\n```\n:::\n\n\n
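Because the model formula is passed straight to mgcv, the smooth terms accept the usual mgcv options. A sketch (not evaluated here, with illustrative values) that mixes a smooth and a linear term:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Not run here: k sets the basis size for the smooth on A, B enters\n# linearly, and adjust_deg_free nudges the fit toward smoother terms\ngen_additive_mod(adjust_deg_free = 1.5) |>\n set_mode(\"classification\") |>\n fit(class ~ s(A, k = 10) + B, data = bin_train)\n```\n:::\n\n\n## Logistic Regression (`logistic_reg()`) \n\n## `brulee` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Logistic regression\n#> \n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> batch size: 707 \n#> validation loss after 2 epochs: 0.375\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.409 0.591 \n#> 2 0.865 0.135 \n#> 3 0.544 0.456 \n#> 4 0.976 0.0239\n#> 5 0.909 0.0914\n#> 6 0.857 0.143\n```\n:::\n\n\n## `gee` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n```\n:::\n\n\n## `glm` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and glm is the default engine so there is no need to set that 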
either.\nlogistic_reg_spec <- logistic_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = class ~ ., family = stats::binomial, data = data)\n#> \n#> Coefficients:\n#> (Intercept) A B \n#> -0.3563 -1.1250 2.8154 \n#> \n#> Degrees of Freedom: 784 Total (i.e. Null); 782 Residual\n#> Null Deviance:\t 1079 \n#> Residual Deviance: 666.9 \tAIC: 672.9\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.400 0.600 \n#> 2 0.862 0.138 \n#> 3 0.541 0.459 \n#> 4 0.977 0.0234\n#> 5 0.909 0.0905\n#> 6 0.853 0.147\npredict(logistic_reg_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0.339 0.465 0.535 0.661 \n#> 2 0.816 0.897 0.103 0.184 \n#> 3 0.493 0.588 0.412 0.507 \n#> 4 0.960 0.986 0.0137 0.0395\n#> 5 0.875 0.935 0.0647 0.125 \n#> 6 0.800 0.894 0.106 0.200\n```\n:::\n\n\n
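Since parsnip retains the underlying `glm` object, the usual coefficient summaries are one step away. A sketch (not evaluated in this rendering):\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Not run here: tidy() gives the coefficient table for the parsnip fit,\n# and the raw glm object is available via extract_fit_engine()\ntidy(logistic_reg_fit)\nlogistic_reg_fit |> extract_fit_engine() |> summary()\n```\n:::\n\n\n## `glmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.400 0.600 \n#> 2 0.862 0.138 \n#> 3 0.541 0.459 \n#> 4 0.977 0.0234\n#> 5 0.909 0.0905\n#> 6 0.853 0.147\n```\n:::\n\n\n## `glmnet` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"binomial\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.308300\n#> 2 1 4.75 0.280900\n#> 3 1 8.73 0.256000\n#> 4 1 12.10 0.233200\n#> 5 1 14.99 0.212500\n#> 6 1 17.46 0.193600\n#> 7 1 19.60 0.176400\n#> 8 1 21.45 0.160800\n#> 9 1 23.05 0.146500\n#> 10 1 24.44 0.133500\n#> 11 1 25.65 0.121600\n#> 12 1 26.70 0.110800\n#> 13 1 27.61 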
0.101000\n#> 14 1 28.40 0.091990\n#> 15 1 29.08 0.083820\n#> 16 1 29.68 0.076370\n#> 17 1 30.19 0.069590\n#> 18 1 30.63 0.063410\n#> 19 1 31.00 0.057770\n#> 20 1 31.33 0.052640\n#> 21 1 31.61 0.047960\n#> 22 1 31.85 0.043700\n#> 23 1 32.05 0.039820\n#> 24 2 32.62 0.036280\n#> 25 2 33.41 0.033060\n#> 26 2 34.10 0.030120\n#> 27 2 34.68 0.027450\n#> 28 2 35.19 0.025010\n#> 29 2 35.63 0.022790\n#> 30 2 36.01 0.020760\n#> 31 2 36.33 0.018920\n#> 32 2 36.62 0.017240\n#> 33 2 36.86 0.015710\n#> 34 2 37.06 0.014310\n#> 35 2 37.24 0.013040\n#> 36 2 37.39 0.011880\n#> 37 2 37.52 0.010830\n#> 38 2 37.63 0.009864\n#> 39 2 37.72 0.008988\n#> 40 2 37.80 0.008189\n#> 41 2 37.86 0.007462\n#> 42 2 37.92 0.006799\n#> 43 2 37.97 0.006195\n#> 44 2 38.01 0.005644\n#> 45 2 38.04 0.005143\n#> 46 2 38.07 0.004686\n#> 47 2 38.10 0.004270\n#> 48 2 38.12 0.003891\n#> 49 2 38.13 0.003545\n#> 50 2 38.15 0.003230\n#> 51 2 38.16 0.002943\n#> 52 2 38.17 0.002682\n#> 53 2 38.18 0.002443\n#> 54 2 38.18 0.002226\n#> 55 2 38.19 0.002029\n#> 56 2 38.19 0.001848\n#> 57 2 38.20 0.001684\n#> 58 2 38.20 0.001534\n#> 59 2 38.20 0.001398\n#> 60 2 38.21 0.001274\n#> 61 2 38.21 0.001161\n#> 62 2 38.21 0.001058\n#> 63 2 38.21 0.000964\n#> 64 2 38.21 0.000878\n#> 65 2 38.21 0.000800\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.383 0.617 \n#> 2 0.816 0.184 \n#> 3 0.537 0.463 \n#> 4 0.969 0.0313\n#> 5 0.894 0.106 \n#> 6 0.797 0.203\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: glm\n#> Model ID: GLM_model_R_1763571327438_3930 \n#> GLM Model: summary\n#> family link regularization\n#> 1 binomial logit Elastic Net (alpha = 0.5, lambda = 6.162E-4 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 4\n#> training_frame\n#> 1 object_xtqmofwsbr\n#> \n#> Coefficients: glm coefficients\n#> names coefficients standardized_coefficients\n#> 1 Intercept -0.350788 -0.350788\n#> 2 A -1.084233 -1.084233\n#> 3 B 2.759366 2.759366\n#> \n#> H2OBinomialMetrics: glm\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.130451\n#> RMSE: 0.3611799\n#> LogLoss: 0.4248206\n#> Mean Per-Class Error: 0.1722728\n#> AUC: 0.8889644\n#> AUCPR: 0.8520865\n#> Gini: 0.7779288\n#> R^2: 0.4722968\n#> Residual Deviance: 666.9684\n#> AIC: 672.9684\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 350 84 0.193548 =84/434\n#> Class2 53 298 0.150997 =53/351\n#> Totals 403 382 0.174522 =137/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.411045 0.813097 213\n#> 2 max f2 0.229916 0.868991 279\n#> 3 max f0point5 0.565922 0.816135 166\n#> 4 max accuracy 0.503565 0.826752 185\n#> 5 max precision 0.997356 1.000000 0\n#> 6 max recall 0.009705 1.000000 395\n#> 7 max specificity 0.997356 1.000000 0\n#> 8 max absolute_mcc 0.411045 0.652014 213\n#> 9 max min_per_class_accuracy 0.454298 0.822581 201\n#> 10 max mean_per_class_accuracy 0.411045 0.827727 213\n#> 11 max tns 0.997356 434.000000 0\n#> 12 max fns 0.997356 349.000000 0\n#> 13 max fps 0.001723 434.000000 399\n#> 14 max tps 0.009705 351.000000 395\n#> 15 max tnr 0.997356 1.000000 0\n#> 16 max fnr 0.997356 0.994302 0\n#> 17 max fpr 0.001723 1.000000 399\n#> 18 max tpr 0.009705 1.000000 395\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.399 0.601 \n#> 2 0.857 0.143 \n#> 3 0.540 0.460 \n#> 4 0.976 0.0243\n#> 5 0.908 0.0925\n#> 6 0.848 0.152\n```\n:::\n\n\n## `keras` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n```\n:::\n\n\n## `LiblineaR` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"LiblineaR\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized logistic regression primal (L2R_LR)\"\n#> \n#> $Type\n#> [1] 0\n#> \n#> $W\n#> A B Bias\n#> [1,] 1.014233 -2.65166 0.3363362\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $ClassNames\n#> [1] Class1 Class2\n#> Levels: Class1 Class2\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: 
{.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.397 0.603 \n#> 2 0.847 0.153 \n#> 3 0.539 0.461 \n#> 4 0.973 0.0267\n#> 5 0.903 0.0974\n#> 6 0.837 0.163\n```\n:::\n\n\n## `stan` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> stan_glm\n#> family: binomial [logit]\n#> formula: class ~ .\n#> observations: 785\n#> predictors: 3\n#> ------\n#> Median MAD_SD\n#> (Intercept) -0.4 0.1 \n#> A -1.1 0.2 \n#> B 2.8 0.2 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.399 0.601 \n#> 2 0.860 0.140 \n#> 3 0.541 0.459 \n#> 4 0.977 0.0234\n#> 5 0.909 0.0906\n#> 6 0.852 0.148\npredict(logistic_reg_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0.338 0.463 0.537 0.662 \n#> 2 0.815 0.897 0.103 0.185 \n#> 3 0.493 0.588 0.412 0.507 \n#> 4 0.961 0.986 0.0135 0.0389\n#> 5 0.876 0.936 0.0643 0.124 \n#> 6 0.798 0.893 0.107 0.202\npredict(logistic_reg_fit, type = \"pred_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0 1 0 1\n#> 2 0 1 0 1\n#> 3 0 1 0 1\n#> 4 0 1 0 1\n#> 5 0 1 0 1\n#> 6 0 1 0 1\n```\n:::\n\n\n## `stan_glmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\npredict(logistic_reg_fit, type = \"conf_int\", new_data = bin_test)\npredict(logistic_reg_fit, type = \"pred_int\", new_data = bin_test)\n```\n:::\n\n\n## 
`spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(Class ~ ., data = tbl_bin$training)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> Coefficients:\n#> (Intercept) A B \n#> -3.731170 -1.214355 3.794186\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = tbl_bin$test)\npredict(logistic_reg_fit, type = \"prob\", new_data = tbl_bin$test)\n```\n:::\n\n\n\n## Multivariate Adaptive Regression Splines (`mars()`) \n\n## `earth` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_spec <- mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_fit <- mars_spec |> fit(class ~ ., data = bin_train)\nmars_fit\n#> parsnip model object\n#> \n#> GLM (family binomial, link logit):\n#> nulldev df dev df devratio AIC iters converged\n#> 1079.45 784 638.975 779 0.408 651 5 1\n#> \n#> Earth selected 6 of 13 terms, and 2 of 2 predictors\n#> Termination condition: Reached nk 21\n#> Importance: B, A\n#> Number of terms at each degree of interaction: 1 5 (additive model)\n#> Earth GCV 0.1342746 RSS 102.4723 GRSq 0.4582121 RSq 0.4719451\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mars_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mars_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.410 0.590 \n#> 2 0.794 0.206 \n#> 3 0.356 0.644 \n#> 4 0.927 0.0729\n#> 5 0.927 0.0729\n#> 6 0.836 0.164\n```\n:::\n\n\n## Neural Networks (`mlp()`) \n\n## `brulee` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> relu activation,\n#> 3 hidden units,\n#> 17 model parameters\n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 707 \n#> learn rate: 0.01 \n#> validation loss after 4 epochs: 0.508\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 
.pred_Class2\n#> \n#> 1 0.390 0.610\n#> 2 0.854 0.146\n#> 3 0.507 0.493\n#> 4 0.830 0.170\n#> 5 0.828 0.172\n#> 6 0.851 0.149\n```\n:::\n\n\n## `brulee_two_layer` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"brulee_two_layer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> c(relu,relu) activation,\n#> c(3,3) hidden units,\n#> 29 model parameters\n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 707 \n#> learn rate: 0.01 \n#> validation loss after 16 epochs: 0.307\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.411 0.589 \n#> 2 0.883 0.117 \n#> 3 0.520 0.480 \n#> 4 0.971 0.0293\n#> 5 0.938 0.0618\n#> 6 0.871 0.129\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: deeplearning\n#> Model ID: DeepLearning_model_R_1763571327438_3932 \n#> Status of Neuron Layers: predicting .outcome, 2-class classification, bernoulli distribution, CrossEntropy loss, 1,002 weights/biases, 16.9 KB, 7,850 training samples, mini-batch size 1\n#> layer units type dropout l1 l2 mean_rate rate_rms momentum\n#> 1 1 2 Input 0.00 % NA NA NA NA NA\n#> 2 2 200 Rectifier 0.00 % 0.000000 0.000000 0.008994 0.023584 0.000000\n#> 3 3 2 Softmax NA 0.000000 0.000000 0.002983 0.000548 0.000000\n#> mean_weight weight_rms mean_bias bias_rms\n#> 1 NA NA NA NA\n#> 2 0.006098 0.105669 0.492018 0.020146\n#> 3 0.033179 0.403317 -0.015716 0.023938\n#> \n#> \n#> H2OBinomialMetrics: deeplearning\n#> ** Reported on training data. 
**\n#> ** Metrics reported on full training frame **\n#> \n#> MSE: 0.130512\n#> RMSE: 0.3612645\n#> LogLoss: 0.4275074\n#> Mean Per-Class Error: 0.1685671\n#> AUC: 0.8893418\n#> AUCPR: 0.8486687\n#> Gini: 0.7786837\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 373 61 0.140553 =61/434\n#> Class2 69 282 0.196581 =69/351\n#> Totals 442 343 0.165605 =130/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.466071 0.812680 192\n#> 2 max f2 0.210358 0.870370 283\n#> 3 max f0point5 0.482168 0.819964 186\n#> 4 max accuracy 0.466071 0.834395 192\n#> 5 max precision 0.885661 0.950495 47\n#> 6 max recall 0.004683 1.000000 396\n#> 7 max specificity 0.991894 0.997696 0\n#> 8 max absolute_mcc 0.466071 0.664455 192\n#> 9 max min_per_class_accuracy 0.427673 0.823362 206\n#> 10 max mean_per_class_accuracy 0.466071 0.831433 192\n#> 11 max tns 0.991894 433.000000 0\n#> 12 max fns 0.991894 349.000000 0\n#> 13 max fps 0.000622 434.000000 399\n#> 14 max tps 0.004683 351.000000 396\n#> 15 max tnr 0.991894 0.997696 0\n#> 16 max fnr 0.991894 0.994302 0\n#> 17 max fpr 0.000622 1.000000 399\n#> 18 max tpr 0.004683 1.000000 396\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.469 0.531 \n#> 2 0.898 0.102 \n#> 3 0.581 0.419 \n#> 4 0.981 0.0191\n#> 5 0.919 0.0808\n#> 6 0.898 0.102\n```\n:::\n\n\n## `keras` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n```\n:::\n\n\n## `nnet` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> a 2-5-1 network with 21 weights\n#> inputs: A B \n#> output(s): class \n#> options were - entropy fitting\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, 
type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.418 0.582\n#> 2 0.658 0.342\n#> 3 0.406 0.594\n#> 4 0.725 0.275\n#> 5 0.714 0.286\n#> 6 0.633 0.367\n```\n:::\n\n\n## Multinom Regression (`multinom_reg()`) \n\n## `brulee` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Multinomial regression\n#> \n#> 192 samples, 2 features, 3 classes \n#> class weights one=1, two=1, three=1 \n#> weight decay: 0.001 \n#> batch size: 173 \n#> validation loss after 1 epoch: 0.816\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.133 0.207 0.660 \n#> 2 0.298 0.189 0.512 \n#> 3 0.346 0.206 0.448 \n#> 4 0.985 0.00158 0.0134\n#> 5 0.956 0.00343 0.0404\n#> 6 0.00328 0.742 0.254 \n#> 7 0.0570 0.411 0.532 \n#> 8 0.487 0.0488 0.465\n```\n:::\n\n\n## `glmnet` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"multinomial\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.219200\n#> 2 1 1.61 0.199700\n#> 3 2 3.90 0.181900\n#> 4 2 6.07 0.165800\n#> 5 2 7.93 0.151100\n#> 6 2 9.52 0.137600\n#> 7 2 10.90 0.125400\n#> 8 2 12.09 0.114300\n#> 9 2 13.13 0.104100\n#> 10 2 14.22 0.094870\n#> 11 2 15.28 0.086440\n#> 12 2 16.20 0.078760\n#> 13 2 16.99 0.071760\n#> 14 2 17.68 0.065390\n#> 15 2 18.28 0.059580\n#> 16 2 18.80 0.054290\n#> 17 2 19.24 0.049460\n#> 18 2 19.63 0.045070\n#> 19 2 19.96 0.041070\n#> 20 2 20.25 0.037420\n#> 21 2 20.49 0.034090\n#> 22 2 20.70 0.031070\n#> 23 2 20.88 0.028310\n#> 24 2 21.04 0.025790\n#> 25 2 21.17 0.023500\n#> 26 2 21.28 0.021410\n#> 27 2 21.38 0.019510\n#> 28 2 21.46 0.017780\n#> 29 2 21.53 0.016200\n#> 30 2 21.58 0.014760\n#> 31 2 21.63 0.013450\n#> 32 2 21.67 0.012250\n#> 33 2 21.71 0.011160\n#> 34 2 21.74 0.010170\n#> 35 2 21.77 0.009269\n#> 36 2 21.79 0.008445\n#> 37 2 21.82 0.007695\n#> 38 2 21.83 0.007011\n#> 39 2 21.85 0.006389\n#> 40 2 21.86 0.005821\n#> 41 2 21.87 0.005304\n#> 42 2 21.88 0.004833\n#> 43 2 21.89 0.004403\n#> 44 2 21.89 0.004012\n#> 45 2 21.90 0.003656\n#> 46 2 21.90 0.003331\n#> 47 2 21.91 0.003035\n#> 48 2 21.91 0.002765\n#> 49 2 21.91 0.002520\n#> 50 2 21.91 0.002296\n#> 51 2 21.92 0.002092\n#> 52 2 21.92 0.001906\n#> 53 2 21.92 0.001737\n#> 54 2 21.92 0.001582\n```\n:::\n\n\nThe holdout data can be 
predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.163 0.211 0.626 \n#> 2 0.318 0.185 0.496 \n#> 3 0.358 0.198 0.444 \n#> 4 0.976 0.00268 0.0217\n#> 5 0.940 0.00529 0.0544\n#> 6 0.00617 0.699 0.295 \n#> 7 0.0757 0.390 0.534 \n#> 8 0.506 0.0563 0.438\n```\n:::\n\n\n
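Because glmnet estimates the entire penalty path in a single pass, predictions at several penalty values can come from this one model fit. A sketch (not evaluated in this rendering) using `multi_predict()` with illustrative penalty values:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Not run here: one fitted glmnet object, class predictions at three penalties\nmulti_predict(\n multinom_reg_fit,\n new_data = mtl_test,\n penalty = c(0.001, 0.01, 0.1),\n type = \"class\"\n)\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OMultinomialModel: glm\n#> Model ID: GLM_model_R_1763571327438_3935 \n#> GLM Model: summary\n#> family link regularization\n#> 1 multinomial multinomial Elastic Net (alpha = 0.5, lambda = 4.372E-4 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 9 6 4\n#> training_frame\n#> 1 object_avyvxbooiq\n#> \n#> Coefficients: glm multinomial coefficients\n#> names coefs_class_0 coefs_class_1 coefs_class_2 std_coefs_class_0\n#> 1 Intercept -1.119482 -0.831434 -1.706488 -1.083442\n#> 2 A -1.119327 0.002894 0.750746 -1.029113\n#> 3 B -1.208210 0.078752 0.162842 -1.187423\n#> std_coefs_class_1 std_coefs_class_2\n#> 1 -0.819868 -1.830487\n#> 2 0.002661 0.690238\n#> 3 0.077397 0.160041\n#> \n#> H2OMultinomialMetrics: glm\n#> ** Reported on training data. 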
**\n#> \n#> Training Set Metrics: \n#> =====================\n#> \n#> Extract training frame with `h2o.getFrame(\"object_avyvxbooiq\")`\n#> MSE: (Extract with `h2o.mse`) 0.2982118\n#> RMSE: (Extract with `h2o.rmse`) 0.5460878\n#> Logloss: (Extract with `h2o.logloss`) 0.822443\n#> Mean Per-Class Error: 0.4583896\n#> AUC: (Extract with `h2o.auc`) NaN\n#> AUCPR: (Extract with `h2o.aucpr`) NaN\n#> Null Deviance: (Extract with `h2o.nulldeviance`) 404.5036\n#> Residual Deviance: (Extract with `h2o.residual_deviance`) 315.8181\n#> R^2: (Extract with `h2o.r2`) 0.4682043\n#> AIC: (Extract with `h2o.aic`) NaN\n#> Confusion Matrix: Extract with `h2o.confusionMatrix(,train = TRUE)`)\n#> =========================================================================\n#> Confusion Matrix: Row labels: Actual class; Column labels: Predicted class\n#> one three two Error Rate\n#> one 59 18 1 0.2436 = 19 / 78\n#> three 19 52 5 0.3158 = 24 / 76\n#> two 7 24 7 0.8158 = 31 / 38\n#> Totals 85 94 13 0.3854 = 74 / 192\n#> \n#> Hit Ratio Table: Extract with `h2o.hit_ratio_table(,train = TRUE)`\n#> =======================================================================\n#> Top-3 Hit Ratios: \n#> k hit_ratio\n#> 1 1 0.614583\n#> 2 2 0.890625\n#> 3 3 1.000000\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_three .pred_two\n#> \n#> 1 0.146 0.641 0.213 \n#> 2 0.308 0.513 0.179 \n#> 3 0.350 0.460 0.190 \n#> 4 0.983 0.0158 0.00128\n#> 5 0.955 0.0422 0.00284\n#> 6 0.00329 0.244 0.752 \n#> 7 0.0599 0.527 0.413 \n#> 8 0.521 0.432 0.0469\n```\n:::\n\n\n## `keras` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n```\n:::\n\n\n## `nnet` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and nnet is the default engine so there is no need to set that either.\nmultinom_reg_spec <- multinom_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> nnet::multinom(formula = class ~ ., data = data, trace = FALSE)\n#> \n#> Coefficients:\n#> (Intercept) A B\n#> two -0.5868435 1.881920 1.379106\n#> three 0.2910810 1.129622 1.292802\n#> \n#> Residual Deviance: 315.8164 \n#> AIC: 327.8164\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, 
type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.145 0.213 0.641 \n#> 2 0.308 0.178 0.514 \n#> 3 0.350 0.189 0.461 \n#> 4 0.983 0.00123 0.0155\n#> 5 0.956 0.00275 0.0415\n#> 6 0.00318 0.754 0.243 \n#> 7 0.0591 0.414 0.527 \n#> 8 0.522 0.0465 0.431\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = tbl_mtl$training)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Formula: class ~ .\n#> \n#> Coefficients:\n#> (Intercept) A B\n#> one 0.05447853 -1.0569131 -0.9049194\n#> three 0.41207949 0.1458870 0.3959664\n#> two -0.46655802 0.9110261 0.5089529\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = tbl_mtl$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 one \n#> 2 one \n#> 3 three \n#> 4 three \n#> 5 three \n#> 6 three \n#> 7 three\npredict(multinom_reg_fit, type = \"prob\", new_data = tbl_mtl$test)\n#> # Source: SQL [?? x 3]\n#> # Database: spark_connection\n#> pred_one pred_three pred_two\n#> \n#> 1 0.910 0.0814 0.00904\n#> 2 0.724 0.233 0.0427 \n#> 3 0.124 0.620 0.256 \n#> 4 0.0682 0.610 0.322 \n#> 5 0.130 0.571 0.300 \n#> 6 0.115 0.549 0.336 \n#> 7 0.0517 0.524 0.424\n```\n:::\n\n\n\n## Naive Bayes (`naive_Bayes()`) \n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_spec <- naive_Bayes() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\nnaive_Bayes_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: naivebayes\n#> Model ID: NaiveBayes_model_R_1763571327438_3936 \n#> Model Summary: \n#> number_of_response_levels min_apriori_probability max_apriori_probability\n#> 1 2 0.44713 0.55287\n#> \n#> \n#> H2OBinomialMetrics: naivebayes\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.1737113\n#> RMSE: 0.4167869\n#> LogLoss: 0.5473431\n#> Mean Per-Class Error: 0.2356138\n#> AUC: 0.8377152\n#> AUCPR: 0.788608\n#> Gini: 0.6754303\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 274 160 0.368664 =160/434\n#> Class2 36 315 0.102564 =36/351\n#> Totals 310 475 0.249682 =196/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.175296 0.762712 286\n#> 2 max f2 0.133412 0.851119 306\n#> 3 max f0point5 0.497657 0.731343 183\n#> 4 max accuracy 0.281344 0.765605 248\n#> 5 max precision 0.999709 1.000000 0\n#> 6 max recall 0.020983 1.000000 390\n#> 7 max specificity 0.999709 1.000000 0\n#> 8 max absolute_mcc 0.280325 0.541898 249\n#> 9 max min_per_class_accuracy 0.398369 0.758065 215\n#> 10 max mean_per_class_accuracy 0.280325 0.771945 249\n#> 11 max tns 0.999709 434.000000 0\n#> 12 max fns 0.999709 347.000000 0\n#> 13 max fps 0.006522 434.000000 399\n#> 14 max tps 0.020983 351.000000 390\n#> 15 max tnr 0.999709 1.000000 0\n#> 16 max fnr 0.999709 0.988604 0\n#> 17 max fpr 0.006522 1.000000 399\n#> 18 max tpr 0.020983 1.000000 390\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class2\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.181 0.819 \n#> 2 0.750 0.250 \n#> 3 0.556 0.444 \n#> 4 0.994 0.00643\n#> 5 0.967 0.0331 \n#> 6 0.630 0.370\n```\n:::\n\n\n## `klaR` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and klaR is the default engine so there is no need to set that either.\nnaive_Bayes_spec <- naive_Bayes()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\n\n# No real print method\n# naive_Bayes_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.250 0.750 \n#> 2 0.593 0.407 \n#> 3 0.333 0.667 \n#> 4 0.993 0.00658\n#> 5 0.978 0.0223 \n#> 6 0.531 0.469\n```\n:::\n\n\n
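The main arguments of `naive_Bayes()` control the kernel density estimates and the Laplace correction. A sketch (not evaluated here) with illustrative values:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Not run here: smoothness scales the kernel bandwidths and Laplace adds\n# a correction for sparse counts\nnaive_Bayes(smoothness = 1.2, Laplace = 1) |>\n fit(class ~ ., data = bin_train)\n```\n:::\n\n\n## `naivebayes` Engine \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_spec <- naive_Bayes() |> \n # This engine works with a single mode so no need to set that\n 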
set_engine(\"naivebayes\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\nnaive_Bayes_fit\n#> parsnip model object\n#> \n#> \n#> ================================= Naive Bayes ==================================\n#> \n#> Call:\n#> naive_bayes.default(x = maybe_data_frame(x), y = y, usekernel = TRUE)\n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> Laplace smoothing: 0\n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> A priori probabilities: \n#> \n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> Tables: \n#> \n#> -------------------------------------------------------------------------------- \n#> :: A::Class1 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (434 obs.);\tBandwidth 'bw' = 0.2548\n#> \n#> x y \n#> Min. :-2.5638 Min. :0.0002915 \n#> 1st Qu.:-1.2013 1st Qu.:0.0506201 \n#> Median : 0.1612 Median :0.1619843 \n#> Mean : 0.1612 Mean :0.1831190 \n#> 3rd Qu.: 1.5237 3rd Qu.:0.2581668 \n#> Max. : 2.8862 Max. :0.5370762 \n#> -------------------------------------------------------------------------------- \n#> :: A::Class2 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (351 obs.);\tBandwidth 'bw' = 0.2596\n#> \n#> x y \n#> Min. :-2.5428 Min. :4.977e-05 \n#> 1st Qu.:-1.1840 1st Qu.:2.672e-02 \n#> Median : 0.1748 Median :2.239e-01 \n#> Mean : 0.1748 Mean :1.836e-01 \n#> 3rd Qu.: 1.5336 3rd Qu.:2.926e-01 \n#> Max. : 2.8924 Max. :3.740e-01 \n#> \n#> -------------------------------------------------------------------------------- \n#> :: B::Class1 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (434 obs.);\tBandwidth 'bw' = 0.1793\n#> \n#> x y \n#> Min. :-2.4501 Min. :5.747e-05 \n#> 1st Qu.:-1.0894 1st Qu.:1.424e-02 \n#> Median : 0.2713 Median :8.798e-02 \n#> Mean : 0.2713 Mean :1.834e-01 \n#> 3rd Qu.: 1.6320 3rd Qu.:2.758e-01 \n#> Max. : 2.9927 Max. :6.872e-01 \n#> \n#> -------------------------------------------------------------------------------- \n#> :: B::Class2 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (351 obs.);\tBandwidth 'bw' = 0.2309\n#> \n#> x y \n#> Min. :-2.4621 Min. :5.623e-05 \n#> 1st Qu.:-0.8979 1st Qu.:1.489e-02 \n#> Median : 0.6663 Median :7.738e-02 \n#> Mean : 0.6663 Mean :1.595e-01 \n#> 3rd Qu.: 2.2305 3rd Qu.:3.336e-01 \n#> Max. : 3.7948 Max. 
:4.418e-01 \n#> \n#> --------------------------------------------------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.249 0.751 \n#> 2 0.593 0.407 \n#> 3 0.332 0.668 \n#> 4 0.993 0.00674\n#> 5 0.978 0.0224 \n#> 6 0.532 0.468\n```\n:::\n\n\n## K-Nearest Neighbors (`nearest_neighbor()`) \n\n## `kknn` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_spec <- nearest_neighbor() |>\n # We need to set the mode since this engine works with multiple modes\n # and kknn is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_fit <- nearest_neighbor_spec |> fit(class ~ ., data = bin_train)\nnearest_neighbor_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> kknn::train.kknn(formula = class ~ ., data = data, ks = min_rows(5, data, 5))\n#> \n#> Type of response variable: nominal\n#> Minimal misclassification: 0.2101911\n#> Best kernel: optimal\n#> Best k: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(nearest_neighbor_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(nearest_neighbor_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.2 0.8 \n#> 2 0.72 0.28\n#> 3 0.32 0.68\n#> 4 1 0 \n#> 5 1 0 \n#> 6 1 0\n```\n:::\n\n\n
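The number of neighbors and the distance weighting kernel are the main arguments here. A sketch (not evaluated in this rendering) with illustrative values:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Not run here: more neighbors and a triangular kernel\nnearest_neighbor(neighbors = 11, weight_func = \"triangular\") |>\n set_mode(\"classification\") |>\n fit(class ~ ., data = bin_train)\n```\n:::\n\n\n## Null Model (`null_model()`) \n\n## `parsnip` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_spec <- null_model() |>\n # We need to set the mode since this engine works with multiple modes\n # and parsnip is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_fit <- null_model_spec |> fit(class ~ ., data = bin_train)\nnull_model_fit\n#> parsnip model object\n#> \n#> Null Regression Model\n#> Predicted Value: Class1\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(null_model_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(null_model_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.553 0.447\n#> 2 0.553 0.447\n#> 3 0.553 0.447\n#> 4 0.553 0.447\n#> 5 0.553 0.447\n#> 6 0.553 0.447\n```\n:::\n\n\n## Partial Least Squares (`pls()`) \n\n## `mixOmics` Engine \n\nThis engine requires the plsmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(plsmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell 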
layout-align=\"center\"}\n\n```{.r .cell-code}\npls_spec <- pls() |>\n # We need to set the mode since this engine works with multiple modes\n # and mixOmics is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_fit <- pls_spec |> fit(class ~ ., data = bin_train)\npls_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> mixOmics::splsda(X = x, Y = y, ncomp = ncomp, keepX = keepX) \n#> \n#> sPLS-DA (regression mode) with 2 sPLS-DA components. \n#> You entered data X of dimensions: 785 2 \n#> You entered data Y with 2 classes. \n#> \n#> Selection of [2] [2] variables on each of the sPLS-DA components on the X data set. \n#> No Y variables can be selected. \n#> \n#> Main numerical outputs: \n#> -------------------- \n#> loading vectors: see object$loadings \n#> variates: see object$variates \n#> variable names: see object$names \n#> \n#> Functions to visualise samples: \n#> -------------------- \n#> plotIndiv, plotArrow, cim \n#> \n#> Functions to visualise variables: \n#> -------------------- \n#> plotVar, plotLoadings, network, cim \n#> \n#> Other functions: \n#> -------------------- \n#> selectVar, tune, perf, auc\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(pls_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(pls_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.462 0.538\n#> 2 0.631 0.369\n#> 3 0.512 0.488\n#> 4 0.765 0.235\n#> 5 0.675 0.325\n#> 6 0.624 0.376\n```\n:::\n\n\n## Random Forests (`rand_forest()`) \n\n## `aorsf` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random classification forest\n#> \n#> Linear combinations: Accelerated Logistic regression\n#> N observations: 785\n#> N classes: 2\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 24.166\n#> Min observations in leaf: 5\n#> OOB stat value: 0.87\n#> OOB stat type: AUC-ROC\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.199 0.801 \n#> 2 0.882 0.118 \n#> 3 0.361 0.639 \n#> 4 0.978 0.0220\n#> 5 0.936 0.0642\n#> 6 0.904 0.0957\n```\n:::\n\n\n## `grf` 
Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"grf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\npredict(rand_forest_fit, type = \"conf_int\", new_data = bin_test)\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: drf\n#> Model ID: DRF_model_R_1763571327438_3938 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 91643 13\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 20 16.38000 114 158 141.50000\n#> \n#> \n#> H2OBinomialMetrics: drf\n#> ** Reported on training data. 
**\n#> ** Metrics reported on Out-Of-Bag training samples **\n#> \n#> MSE: 0.1644052\n#> RMSE: 0.4054691\n#> LogLoss: 1.62537\n#> Mean Per-Class Error: 0.2084695\n#> AUC: 0.8379252\n#> AUCPR: 0.7897947\n#> Gini: 0.6758504\n#> R^2: 0.3349444\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 326 108 0.248848 =108/434\n#> Class2 59 292 0.168091 =59/351\n#> Totals 385 400 0.212739 =167/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.363636 0.777630 128\n#> 2 max f2 0.263158 0.827455 147\n#> 3 max f0point5 0.642857 0.762215 78\n#> 4 max accuracy 0.384615 0.787261 125\n#> 5 max precision 0.944444 0.876033 10\n#> 6 max recall 0.000000 1.000000 217\n#> 7 max specificity 1.000000 0.972350 0\n#> 8 max absolute_mcc 0.363636 0.579899 128\n#> 9 max min_per_class_accuracy 0.458333 0.780627 112\n#> 10 max mean_per_class_accuracy 0.363636 0.791530 128\n#> 11 max tns 1.000000 422.000000 0\n#> 12 max fns 1.000000 275.000000 0\n#> 13 max fps 0.000000 434.000000 217\n#> 14 max tps 0.000000 351.000000 217\n#> 15 max tnr 1.000000 0.972350 0\n#> 16 max fnr 1.000000 0.783476 0\n#> 17 max fpr 0.000000 1.000000 217\n#> 18 max tpr 0.000000 1.000000 217\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.12 0.88\n#> 2 0.88 0.12\n#> 3 0.11 0.89\n#> 4 1 0 \n#> 5 0.76 0.24\n#> 6 1 0\n```\n:::\n\n\n## `partykit` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\n\n# Too long to print\n# rand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.396 0.604 \n#> 2 0.804 0.196 \n#> 3 0.313 0.687 \n#> 4 0.966 0.0343\n#> 5 0.887 0.113 \n#> 6 0.931 0.0689\n```\n:::\n\n\n## `randomForest` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"randomForest\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> randomForest(x = maybe_data_frame(x), y = y) \n#> Type of random forest: classification\n#> Number of trees: 500\n#> No. of variables tried at each split: 1\n#> \n#> OOB estimate of error rate: 21.66%\n#> Confusion matrix:\n#> Class1 Class2 class.error\n#> Class1 348 86 0.1981567\n#> Class2 84 267 0.2393162\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.174 0.826\n#> 2 0.88 0.12 \n#> 3 0.112 0.888\n#> 4 1 0 \n#> 5 0.692 0.308\n#> 6 0.922 0.078\n```\n:::\n\n\n## `ranger` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n # and ranger is the default engine so there is no need to set that either.\n set_engine(\"ranger\", keep.inbag = TRUE) |> \n # However, we'll set the engine and use the keep.inbag=TRUE option so that we \n # can produce interval predictions. This is not generally required. \n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Ranger result\n#> \n#> Call:\n#> ranger::ranger(x = maybe_data_frame(x), y = y, keep.inbag = ~TRUE, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) \n#> \n#> Type: Probability estimation \n#> Number of trees: 500 \n#> Sample size: 785 \n#> Number of independent variables: 2 \n#> Mtry: 1 \n#> Target node size: 10 \n#> Variable importance mode: none \n#> Splitrule: gini \n#> OOB prediction error (Brier s.): 0.1486808\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.228 0.772 \n#> 2 0.828 0.172 \n#> 3 0.214 0.786 \n#> 4 0.942 0.0578\n#> 5 0.763 0.237 \n#> 6 0.900 0.100\npredict(rand_forest_fit, type = \"conf_int\", new_data = bin_test)\n#> Warning in rInfJack(x, inbag.counts): Sample size <=20, no calibration\n#> performed.\n#> Warning in rInfJack(x, inbag.counts): Sample size <=20, no calibration\n#> performed.\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0 0.510 0.490 1 \n#> 2 0.660 0.997 0.00288 0.340\n#> 3 0 0.461 0.539 1 \n#> 4 0.798 1 0 0.202\n#> 5 0.567 0.959 0.0408 0.433\n#> 6 0.745 1 0 0.255\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n set_mode(\"classification\") |>\n 
set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(Class ~ ., data = tbl_bin$training)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> RandomForestClassificationModel: uid=random_forest__3204ae4e_77ac_4f0c_b642_fef909ba5c81, numTrees=20, numClasses=2, numFeatures=2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(rand_forest_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.244 0.756 \n#> 2 0.249 0.751 \n#> 3 0.836 0.164 \n#> 4 0.227 0.773 \n#> 5 0.260 0.740 \n#> 6 0.962 0.0383\n#> 7 0.0937 0.906\n```\n:::\n\n\n## Rule Fit (`rule_fit()`) \n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: rulefit\n#> Model ID: RuleFit_model_R_1763571327438_3989 \n#> Rulefit Model Summary: \n#> family link regularization number_of_predictors_total\n#> 1 binomial logit Lasso (lambda = 0.03081 ) 2377\n#> number_of_active_predictors number_of_iterations rule_ensemble_size\n#> 1 4 5 2375\n#> number_of_trees number_of_internal_trees min_depth max_depth mean_depth\n#> 1 150 150 0 5 4.00000\n#> min_leaves max_leaves mean_leaves\n#> 1 0 31 15.83333\n#> \n#> \n#> H2OBinomialMetrics: rulefit\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.1422931\n#> RMSE: 0.3772176\n#> LogLoss: 0.4500322\n#> Mean Per-Class Error: 0.1867902\n#> AUC: 0.8764064\n#> AUCPR: 0.8338422\n#> Gini: 0.7528129\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 351 83 0.191244 =83/434\n#> Class2 64 287 0.182336 =64/351\n#> Totals 415 370 0.187261 =147/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.485283 0.796117 204\n#> 2 max f2 0.263811 0.861522 270\n#> 3 max f0point5 0.620200 0.799574 147\n#> 4 max accuracy 0.485283 0.812739 204\n#> 5 max precision 0.984770 1.000000 0\n#> 6 max recall 0.048801 1.000000 393\n#> 7 max specificity 0.984770 1.000000 0\n#> 8 max absolute_mcc 0.485283 0.623934 204\n#> 9 max min_per_class_accuracy 0.489555 0.808756 202\n#> 10 max mean_per_class_accuracy 0.485283 0.813210 204\n#> 11 max tns 0.984770 434.000000 0\n#> 12 max fns 0.984770 350.000000 0\n#> 13 max fps 0.037559 434.000000 399\n#> 14 max tps 0.048801 351.000000 393\n#> 15 max tnr 0.984770 1.000000 0\n#> 16 max fnr 0.984770 0.997151 0\n#> 17 max fpr 0.037559 1.000000 399\n#> 18 max tpr 0.048801 1.000000 393\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rule_fit_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.377 0.623 \n#> 2 0.737 0.263 \n#> 3 0.487 0.513 \n#> 4 0.956 0.0440\n#> 5 0.879 0.121 \n#> 6 0.693 0.307\n```\n:::\n\n\n## `xrf` Engine \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n # and xrf is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> An eXtreme RuleFit model of 358 rules.\n#> \n#> Original Formula:\n#> \n#> class ~ A + B\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rule_fit_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.419 0.581\n#> 2 0.651 0.349\n#> 3 0.506 0.494\n#> 4 0.891 0.109\n#> 5 0.805 0.195\n#> 6 0.616 0.384\n```\n:::\n\n\n## Support Vector Machine (Linear Kernel) (`svm_linear()`) \n\n## `kernlab` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n 
set_engine(\"kernlab\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(class ~ ., data = bin_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Linear (vanilla) kernel function. \n#> \n#> Number of Support Vectors : 357 \n#> \n#> Objective Function Value : -353.0043 \n#> Training error : 0.17707 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.403 0.597 \n#> 2 0.858 0.142 \n#> 3 0.540 0.460 \n#> 4 0.975 0.0254\n#> 5 0.905 0.0949\n#> 6 0.849 0.151\n```\n:::\n\n\n## `LiblineaR` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n # and LiblineaR is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(class ~ ., data = bin_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized L2-loss support vector classification dual (L2R_L2LOSS_SVC_DUAL)\"\n#> \n#> $Type\n#> [1] 1\n#> \n#> $W\n#> A B Bias\n#> [1,] 0.3641925 -0.9648581 0.1182515\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $ClassNames\n#> [1] Class1 Class2\n#> Levels: Class1 Class2\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\n```\n:::\n\n\n## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) \n\n## `kernlab` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_spec <- svm_poly() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_fit <- svm_poly_spec |> fit(class ~ ., data = bin_train)\n#> Setting default kernel parameters\nsvm_poly_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Polynomial kernel function. 
\n#> Hyperparameters : degree = 1 scale = 1 offset = 1 \n#> \n#> Number of Support Vectors : 357 \n#> \n#> Objective Function Value : -353.0043 \n#> Training error : 0.17707 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_poly_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_poly_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.412 0.588 \n#> 2 0.863 0.137 \n#> 3 0.549 0.451 \n#> 4 0.976 0.0242\n#> 5 0.909 0.0912\n#> 6 0.855 0.145\n```\n:::\n\n\n## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) \n\n## `kernlab` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(class ~ ., data = bin_train)\nsvm_rbf_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Gaussian Radial Basis kernel function. \n#> Hyperparameter : sigma = 2.60157241724157 \n#> \n#> Number of Support Vectors : 338 \n#> \n#> Objective Function Value : -292.4523 \n#> Training error : 0.170701 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_rbf_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.524 0.476\n#> 2 0.893 0.107\n#> 3 0.239 0.761\n#> 4 0.866 0.134\n#> 5 0.867 0.133\n#> 6 0.876 0.124\n```\n:::\n\n\n## `liquidSVM` Engine \n\nNote that this package is no longer on CRAN. You can attempt to install it from the read-only CRAN mirror on GitHub:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npak::pak(\"cran/liquidSVM\") # fails\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"liquidSVM\")\n#> Warning: The `engine` argument of `set_engine()` cannot be liquidSVM as of\n#> parsnip 0.1.6.\n#> ℹ The liquidSVM package is no longer available on CRAN.\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(class ~ ., data = bin_train)\nsvm_rbf_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, type = \"class\", new_data = bin_test)\npredict(svm_rbf_fit, type = \"prob\", new_data = bin_test)\n```\n:::\n\n\n
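Since engine availability changes over time (liquidSVM being a case in point), it can be worth checking which engines are currently registered for a model type before settling on one; parsnip's `show_engines()` lists them (output not shown):\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# List the engines (and their modes) currently registered for svm_rbf()\nshow_engines(\"svm_rbf\")\n```\n:::\n\n\n# Regression Models\n\n\nTo demonstrate regression, we'll subset some data, 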
make a training/test split, and standardize the predictors: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(938)\nreg_split <-\n modeldata::concrete |> \n slice_sample(n = 100) |> \n select(strength = compressive_strength, cement, age) |> \n initial_split(prop = 0.95, strata = strength)\nreg_split\n#> \n#> <92/8/100>\n\nreg_rec <- \n recipe(strength ~ ., data = training(reg_split)) |> \n step_normalize(all_numeric_predictors()) |> \n prep()\n\nreg_train <- bake(reg_rec, new_data = NULL)\nreg_test <- bake(reg_rec, new_data = testing(reg_split))\n```\n:::\n\n\nWe also have some models that are specific to integer count outcomes. The data for these are:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(207)\ncount_split <-\n attrition |>\n select(num_years = TotalWorkingYears, age = Age, income = MonthlyIncome) |>\n initial_split(prop = 0.994)\ncount_split\n#> \n#> <1461/9/1470>\n\ncount_rec <-\n recipe(num_years ~ ., data = training(count_split)) |>\n step_normalize(all_numeric_predictors()) |>\n prep()\n\ncount_train <- bake(count_rec, new_data = NULL)\ncount_test <- bake(count_rec, new_data = testing(count_split))\n```\n:::\n\n\nIf using the **Apache Spark** engine, we will need to identify the data source, \nand then use it to create the splits. For this article, we will copy the \n`concrete` data set into the Spark session.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ntbl_concrete <- copy_to(sc, modeldata::concrete)\n\ntbl_reg <- sdf_random_split(tbl_concrete, training = 0.95, test = 0.05, seed = 100)\n```\n:::\n\n\n\n## AutoML (`auto_ml()`) \n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nauto_ml_spec <- auto_ml() |>\n # We don't need to set the engine (since there is only one) but we'll set\n # a time limit\n set_engine(\"h2o\", max_runtime_secs = 60 * 3) |> \n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nauto_ml_fit <- auto_ml_spec |> fit(strength ~ ., data = reg_train)\nauto_ml_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(auto_ml_fit, new_data = reg_test)\n```\n:::\n\n\n## Bagged MARS (`bag_mars()`) \n\n## `earth` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_spec <- bag_mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_fit <- bag_mars_spec |> fit(strength ~ ., data = reg_train)\nbag_mars_fit\n#> parsnip model object\n#> \n#> Bagged MARS (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 age 86.9 5.54 11\n#> 2 cement 76.6 5.73 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mars_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 21.5\n#> 2 41.3\n#> 3 27.3\n#> 4 56.6\n#> 5 35.9\n#> 6 36.5\n#> 7 38.5\n#> 8 38.2\n```\n:::\n\n\n## Bagged Neural Networks (`bag_mlp()`) \n\n## `nnet` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_spec <- bag_mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_fit <- bag_mlp_spec |> fit(strength ~ ., data = reg_train)\nbag_mlp_fit\n#> parsnip model object\n#> \n#> Bagged nnet (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 age 59.3 1.66 11\n#> 2 cement 40.7 1.66 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 22.7\n#> 2 42.0\n#> 3 27.8\n#> 4 76.0\n#> 5 37.3\n#> 6 39.0\n#> 7 35.9\n#> 8 42.4\n```\n:::\n\n\n## Bagged Decision Trees (`bag_tree()`) \n\n## `rpart` Engine \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit <- bag_tree_spec |> fit(strength ~ ., data = reg_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged CART (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 cement 17674. 1795. 11\n#> 2 age 12753. 489. 
11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.0\n#> 2 32.4\n#> 3 29.7\n#> 4 58.0\n#> 5 37.8\n#> 6 44.4\n#> 7 42.5\n#> 8 38.2\n```\n:::\n\n\n## Bayesian Additive Regression Trees (`bart()`) \n\n## `dbarts` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_spec <- bart() |>\n # We need to set the mode since this engine works with multiple modes\n # and dbarts is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_fit <- bart_spec |> fit(strength ~ ., data = reg_train)\nbart_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> `NULL`()\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bart_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.2\n#> 2 41.0\n#> 3 26.5\n#> 4 52.6\n#> 5 36.0\n#> 6 36.8\n#> 7 39.1\n#> 8 37.9\npredict(bart_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 16.7 32.0\n#> 2 33.0 49.2\n#> 3 20.5 31.5\n#> 4 41.8 63.5\n#> 5 28.1 43.9\n#> 6 30.2 42.6\n#> 7 33.3 45.3\n#> 8 27.2 50.0\npredict(bart_fit, type = \"pred_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 4.90 44.3\n#> 2 22.5 60.4\n#> 3 8.62 44.8\n#> 4 35.0 71.9\n#> 5 16.6 53.3\n#> 6 19.9 54.5\n#> 7 22.5 57.3\n#> 8 16.4 58.6\n```\n:::\n\n\n## Boosted Decision Trees (`boost_tree()`) \n\n## `catboost` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"catboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> CatBoost model (1000 trees)\n#> Loss function: RMSE\n#> Fit to 2 feature(s)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 25.3\n#> 2 33.9\n#> 3 28.1\n#> 4 60.7\n#> 5 35.4\n#> 6 38.2\n#> 7 43.3\n#> 8 29.8\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> 
\n#> H2ORegressionModel: gbm\n#> Model ID: GBM_model_R_1763571327438_4145 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 20476 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 14 43 27.92000\n#> \n#> \n#> H2ORegressionMetrics: gbm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.001563879\n#> RMSE: 0.03954591\n#> MAE: 0.02903684\n#> RMSLE: 0.001771464\n#> Mean Residual Deviance : 0.001563879\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.7\n#> 2 32.2\n#> 3 26.9\n#> 4 63.2\n#> 5 34.9\n#> 6 39.0\n#> 7 40.0\n#> 8 32.9\n```\n:::\n\n\n## `h2o_gbm` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o_gbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: gbm\n#> Model ID: GBM_model_R_1763571327438_4146 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 20476 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 14 43 27.92000\n#> \n#> \n#> H2ORegressionMetrics: gbm\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.001563879\n#> RMSE: 0.03954591\n#> MAE: 0.02903684\n#> RMSLE: 0.001771464\n#> Mean Residual Deviance : 0.001563879\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.7\n#> 2 32.2\n#> 3 26.9\n#> 4 63.2\n#> 5 34.9\n#> 6 39.0\n#> 7 40.0\n#> 8 32.9\n```\n:::\n\n\n## `lightgbm` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"lightgbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> LightGBM Model (100 trees)\n#> Objective: regression\n#> Fitted to dataset with 2 columns\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 20.6\n#> 2 42.5\n#> 3 27.0\n#> 4 49.2\n#> 5 43.7\n#> 6 38.3\n#> 7 41.1\n#> 8 36.9\n```\n:::\n\n\n## `xgboost` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and xgboost is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> ##### xgb.Booster\n#> raw: 35 Kb \n#> call:\n#> xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, \n#> colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, \n#> subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist, \n#> verbose = 0, nthread = 1, objective = \"reg:squarederror\")\n#> params (as set within xgb.train):\n#> eta = \"0.3\", max_depth = \"6\", gamma = \"0\", colsample_bytree = \"1\", colsample_bynode = \"1\", min_child_weight = \"1\", subsample = \"1\", nthread = \"1\", objective = \"reg:squarederror\", validate_parameters = \"TRUE\"\n#> xgb.attributes:\n#> niter\n#> callbacks:\n#> cb.evaluation.log()\n#> # of features: 2 \n#> niter: 15\n#> nfeatures : 2 \n#> evaluation_log:\n#> iter training_rmse\n#> \n#> 1 27.511751\n#> 2 20.726236\n#> --- ---\n#> 14 2.774394\n#> 15 2.632224\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 22.3\n#> 2 32.9\n#> 3 26.7\n#> 4 57.6\n#> 5 34.9\n#> 6 33.8\n#> 7 42.6\n#> 8 26.3\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n set_mode(\"regression\") |>\n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> GBTRegressionModel: uid=gradient_boosted_trees__d4414e35_351c_433f_958b_847ee38e9416, numTrees=20, numFeatures=8\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 20.8 \n#> 2 28.1 \n#> 3 15.5 \n#> 4 22.4 \n#> 5 9.37\n#> 6 40.1 \n#> 7 14.2 \n#> 8 32.1 \n#> 9 37.4 \n#> 10 49.5 \n#> # ℹ more rows\n```\n:::\n\n\n## Cubist Rules (`cubist_rules()`) \n\n## `Cubist` Engine \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and Cubist is the default engine so there is no need to set that either.\ncubist_rules_spec <- cubist_rules()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncubist_rules_fit <- cubist_rules_spec |> fit(strength ~ ., data = reg_train)\ncubist_rules_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> cubist.default(x = x, y = y, committees = 1)\n#> \n#> Number of samples: 92 \n#> Number of predictors: 2 \n#> \n#> Number of committees: 1 \n#> Number of rules: 2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(cubist_rules_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.2\n#> 2 46.3\n#> 3 23.6\n#> 4 54.4\n#> 5 32.7\n#> 6 37.8\n#> 7 38.8\n#> 8 38.6\n```\n:::\n\n\n## Decision Tree (`decision_tree()`) \n\n## `partykit` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> strength ~ cement + age\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] cement <= 0.72078\n#> | | [3] age <= -0.60316\n#> | | | [4] cement <= -0.38732: 11.141 (n = 12, err = 292.8)\n#> | | | [5] cement > -0.38732: 18.005 (n = 11, err = 401.5)\n#> | | [6] age > -0.60316\n#> | | | [7] cement <= 0.24945\n#> | | | | [8] age <= -0.2359: 28.756 (n = 24, err = 1450.6)\n#> | | | | [9] age > -0.2359: 39.014 (n = 11, err = 634.8)\n#> | | | [10] cement > 0.24945: 42.564 (n = 11, err = 1041.7)\n#> | [11] cement > 0.72078: 50.864 (n = 23, err = 5390.3)\n#> \n#> Number of inner nodes: 5\n#> Number of terminal nodes: 6\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> 
\n#> 1 18.0\n#> 2 39.0\n#> 3 28.8\n#> 4 50.9\n#> 5 50.9\n#> 6 42.6\n#> 7 42.6\n#> 8 50.9\n```\n:::\n\n\n## `rpart` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> n= 92 \n#> \n#> node), split, n, deviance, yval\n#> * denotes terminal node\n#> \n#> 1) root 92 26564.7400 33.57728 \n#> 2) cement< 0.7861846 69 12009.9000 27.81493 \n#> 4) age< -0.5419541 23 964.6417 14.42348 \n#> 8) cement< -0.3695209 12 292.7811 11.14083 *\n#> 9) cement>=-0.3695209 11 401.4871 18.00455 *\n#> 5) age>=-0.5419541 46 4858.3440 34.51065 \n#> 10) age< 0.008934354 32 2208.3040 31.16781 \n#> 20) cement< 0.311975 24 1450.6200 28.75583 *\n#> 21) cement>=0.311975 8 199.1900 38.40375 *\n#> 11) age>=0.008934354 14 1475.1130 42.15143 *\n#> 3) cement>=0.7861846 23 5390.3320 50.86435 \n#> 6) age< -0.5419541 7 390.4204 40.08429 *\n#> 7) age>=-0.5419541 16 3830.5510 55.58062 *\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 18.0\n#> 2 42.2\n#> 3 28.8\n#> 4 55.6\n#> 5 40.1\n#> 6 38.4\n#> 7 38.4\n#> 8 40.1\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n set_mode(\"regression\") |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> DecisionTreeRegressionModel: uid=decision_tree_regressor__224bd5f4_4a90_4afe_9056_f064491ee63e, depth=5, numNodes=63, numFeatures=8\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = tbl_reg$test)\n```\n:::\n\n\n\n\n## Generalized Additive Models (`gen_additive_mod()`) \n\n## `mgcv` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_spec <- gen_additive_mod() |>\n # We need to set the mode since this engine works with multiple modes\n # and mgcv is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_fit <- \n gen_additive_mod_spec |> \n fit(strength ~ s(age) + s(cement), data = reg_train)\ngen_additive_mod_fit\n#> parsnip model object\n#> \n#> \n#> Family: gaussian \n#> Link function: identity \n#> \n#> Formula:\n#> strength ~ s(age) + s(cement)\n#> \n#> Estimated degrees of freedom:\n#> 4.18 3.56 total = 8.74 \n#> \n#> GCV score: 108.4401\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(gen_additive_mod_fit, 
new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.1\n#> 2 41.2\n#> 3 26.7\n#> 4 55.9\n#> 5 35.2\n#> 6 37.1\n#> 7 38.5\n#> 8 39.6\npredict(gen_additive_mod_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 18.9 27.4\n#> 2 35.7 46.6\n#> 3 22.4 31.0\n#> 4 47.0 64.7\n#> 5 30.1 40.4\n#> 6 32.9 41.2\n#> 7 34.3 42.6\n#> 8 30.3 49.0\n```\n:::\n\n\n
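The print method above shows that mgcv selected the amount of smoothing with its default GCV criterion. Engine-specific options can be passed through `set_engine()`; as a sketch, the same specification with REML smoothness selection would look like this (assuming mgcv's `method` argument; not evaluated here):\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Hypothetical variant: have mgcv choose smoothness by REML instead of GCV\ngen_additive_mod_reml_spec <- gen_additive_mod() |>\n  set_engine(\"mgcv\", method = \"REML\") |>\n  set_mode(\"regression\")\n```\n:::\n\n\n## Linear Regression (`linear_reg()`) \n\n## `brulee` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear regression\n#> \n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 \n#> batch size: 83 \n#> scaled validation loss after 1 epoch: 291\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 33.2\n#> 2 30.0\n#> 3 21.3\n#> 4 53.7\n#> 5 42.2\n#> 6 36.2\n#> 7 37.3\n#> 8 51.6\n```\n:::\n\n\n## `gee` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n```\n:::\n\n\n## `glm` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = strength ~ ., family = stats::gaussian, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) cement age \n#> 33.577 8.795 5.471 \n#> \n#> Degrees of Freedom: 91 Total (i.e. 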
Null); 89 Residual\n#> Null Deviance:\t 26560 \n#> Residual Deviance: 15480 \tAIC: 740.6\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 28.8 35.4\n#> 2 27.1 33.5\n#> 3 17.3 25.9\n#> 4 44.6 58.1\n#> 5 35.6 45.0\n#> 6 32.3 38.3\n#> 7 33.2 39.4\n#> 8 41.6 56.0\n```\n:::\n\n\n## `glmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\n```\n:::\n\n\n## `glmnet` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"gaussian\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 9.5680\n#> 2 1 5.38 8.7180\n#> 3 1 9.85 7.9430\n#> 4 1 13.56 7.2380\n#> 5 1 16.64 6.5950\n#> 6 2 19.99 6.0090\n#> 7 2 23.68 5.4750\n#> 8 2 26.75 4.9890\n#> 9 2 29.29 4.5450\n#> 10 2 31.40 4.1420\n#> 11 2 33.15 3.7740\n#> 12 2 34.61 3.4380\n#> 13 2 35.82 3.1330\n#> 14 2 36.82 2.8550\n#> 15 2 37.65 2.6010\n#> 16 2 38.34 2.3700\n#> 17 2 38.92 2.1590\n#> 18 2 39.39 1.9680\n#> 19 2 39.79 1.7930\n#> 20 2 40.12 1.6340\n#> 21 2 40.39 1.4880\n#> 22 2 40.62 1.3560\n#> 23 2 40.80 1.2360\n#> 24 2 40.96 1.1260\n#> 25 2 41.09 1.0260\n#> 26 2 41.20 0.9348\n#> 27 2 41.29 0.8517\n#> 28 2 41.36 0.7761\n#> 29 2 41.42 0.7071\n#> 30 2 41.47 0.6443\n#> 31 2 41.52 0.5871\n#> 32 2 41.55 0.5349\n#> 33 2 41.58 0.4874\n#> 34 2 41.60 0.4441\n#> 35 2 41.63 0.4046\n#> 36 2 41.64 0.3687\n#> 37 2 41.66 0.3359\n#> 38 2 41.67 0.3061\n#> 39 2 41.68 0.2789\n#> 40 2 41.68 0.2541\n#> 41 2 41.69 0.2316\n#> 42 2 41.70 0.2110\n#> 43 2 41.70 0.1922\n#> 44 2 41.71 0.1752\n#> 45 2 41.71 0.1596\n#> 46 2 41.71 0.1454\n#> 47 2 41.71 0.1325\n#> 48 2 41.71 0.1207\n#> 49 2 41.72 0.1100\n#> 50 2 41.72 0.1002\n#> 51 2 41.72 0.0913\n#> 52 2 41.72 0.0832\n#> 53 2 41.72 0.0758\n#> 54 2 41.72 0.0691\n#> 55 2 41.72 0.0630\n#> 56 2 41.72 0.0574\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = 
reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.2\n#> 2 30.3\n#> 3 21.7\n#> 4 51.3\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.7\n```\n:::\n\n\n
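Because glmnet estimates an entire path of penalty values in one fit (as the path printed above shows), parsnip's `multi_predict()` can produce predictions at several penalty values without refitting. A small sketch (output not shown):\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Predictions at a few penalty values from the single glmnet fit\nmulti_predict(linear_reg_fit, new_data = reg_test, penalty = c(0.01, 0.1))\n```\n:::\n\n\n## `gls` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gls\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: glm\n#> Model ID: GLM_model_R_1763571327438_4147 \n#> GLM Model: summary\n#> family link regularization\n#> 1 gaussian identity Elastic Net (alpha = 0.5, lambda = 0.01903 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 1\n#> training_frame\n#> 1 object_ftjflovkts\n#> \n#> Coefficients: glm coefficients\n#> names coefficients standardized_coefficients\n#> 1 Intercept 33.577283 33.577283\n#> 2 cement 8.708461 8.708461\n#> 3 age 5.422201 5.422201\n#> \n#> H2ORegressionMetrics: glm\n#> ** Reported on training data. **\n#> \n#> MSE: 168.2822\n#> RMSE: 12.97236\n#> MAE: 10.62672\n#> RMSLE: 0.4645554\n#> Mean Residual Deviance : 168.2822\n#> R^2 : 0.4171988\n#> Null Deviance :26564.74\n#> Null D.o.F. :91\n#> Residual Deviance :15481.96\n#> Residual D.o.F. 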
:89\n#> AIC :740.6438\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.7\n#> 4 51.2\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.7\n```\n:::\n\n\n## `keras` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n```\n:::\n\n\n## `lm` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and lm is the default engine so there is no need to set that either.\nlinear_reg_spec <- linear_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = strength ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) cement age \n#> 33.577 8.795 5.471\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 28.8 35.4\n#> 2 27.1 33.5\n#> 3 17.3 25.9\n#> 4 44.6 58.1\n#> 5 35.6 45.0\n#> 6 32.3 38.3\n#> 7 33.2 39.4\n#> 8 41.6 56.0\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 5.72 58.5\n#> 2 3.89 56.7\n#> 3 -4.94 48.2\n#> 4 24.3 78.5\n#> 5 13.7 67.0\n#> 6 8.95 61.7\n#> 7 9.89 62.7\n#> 8 21.6 76.0\n```\n:::\n\n\n
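In addition to `predict()`, fitted parsnip models have an `augment()` method that binds the prediction columns onto `new_data` (and, for regression, adds residuals when the outcome column is present). For example (output not shown):\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Returns reg_test plus .pred (and .resid, since strength is in reg_test)\naugment(linear_reg_fit, new_data = reg_test)\n```\n:::\n\n\n## `lme` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"lme\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n```\n:::\n\n\n## `lmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() 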
|> \n # This engine works with a single mode so no need to set that\n set_engine(\"lmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n```\n:::\n\n\n## `stan` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> stan_glm\n#> family: gaussian [identity]\n#> formula: strength ~ .\n#> observations: 92\n#> predictors: 3\n#> ------\n#> Median MAD_SD\n#> (Intercept) 33.6 1.4 \n#> cement 8.8 1.4 \n#> age 5.5 1.5 \n#> \n#> Auxiliary parameter(s):\n#> Median MAD_SD\n#> sigma 13.3 1.0 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 28.8 35.6\n#> 2 27.1 33.5\n#> 3 17.3 26.0\n#> 4 44.7 58.0\n#> 5 35.8 45.0\n#> 6 32.3 38.3\n#> 7 33.2 39.5\n#> 8 41.8 55.8\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 6.24 58.5\n#> 2 3.92 56.5\n#> 3 -4.87 48.0\n#> 4 24.2 78.2\n#> 5 14.3 68.1\n#> 6 8.85 61.7\n#> 7 10.8 62.6\n#> 8 22.3 75.6\n```\n:::\n\n\n## `stan_glmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_test)\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> Coefficients:\n#> 
(Intercept) cement blast_furnace_slag fly_ash \n#> -21.80239627 0.12003251 0.10399582 0.08747677 \n#> water superplasticizer coarse_aggregate fine_aggregate \n#> -0.15701342 0.28531613 0.01777782 0.02018358 \n#> age \n#> 0.11678247\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 16.5\n#> 2 19.7\n#> 3 26.1\n#> 4 23.6\n#> 5 24.2\n#> 6 29.1\n#> 7 21.3\n#> 8 24.2\n#> 9 33.9\n#> 10 57.7\n#> # ℹ more rows\n```\n:::\n\n\n## Multivariate Adaptive Regression Splines (`mars()`) \n\n## `earth` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_spec <- mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_fit <- mars_spec |> fit(strength ~ ., data = reg_train)\nmars_fit\n#> parsnip model object\n#> \n#> Selected 4 of 9 terms, and 2 of 2 predictors\n#> Termination condition: RSq changed by less than 0.001 at 9 terms\n#> Importance: age, cement\n#> Number of terms at each degree of interaction: 1 3 (additive model)\n#> GCV 113.532 RSS 8915.965 GRSq 0.6153128 RSq 0.6643684\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mars_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 22.0\n#> 2 43.1\n#> 3 28.1\n#> 4 58.0\n#> 5 33.8\n#> 6 34.9\n#> 7 36.3\n#> 8 43.5\n```\n:::\n\n\n## Neural Networks (`mlp()`) \n\n## `brulee` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> relu activation,\n#> 3 hidden units,\n#> 13 model parameters\n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 83 \n#> learn rate: 0.01 \n#> scaled validation loss after 6 epochs: 0.21\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 21.3\n#> 2 33.9\n#> 3 23.7\n#> 4 46.9\n#> 5 42.3\n#> 6 32.2\n#> 7 34.8\n#> 8 46.9\n```\n:::\n\n\n## `brulee_two_layer` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"brulee_two_layer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> c(relu,relu) activation,\n#> c(3,3) hidden units,\n#> 25 model parameters\n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 
\n#> dropout proportion: 0 \n#> batch size: 83 \n#> learn rate: 0.01 \n#> scaled validation loss after 21 epochs: 0.129\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.8\n#> 2 41.9\n#> 3 26.5\n#> 4 56.6\n#> 5 33.1\n#> 6 40.5\n#> 7 41.5\n#> 8 38.0\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: deeplearning\n#> Model ID: DeepLearning_model_R_1763571327438_4148 \n#> Status of Neuron Layers: predicting .outcome, regression, gaussian distribution, Quadratic loss, 801 weights/biases, 14.5 KB, 920 training samples, mini-batch size 1\n#> layer units type dropout l1 l2 mean_rate rate_rms momentum\n#> 1 1 2 Input 0.00 % NA NA NA NA NA\n#> 2 2 200 Rectifier 0.00 % 0.000000 0.000000 0.005416 0.010833 0.000000\n#> 3 3 1 Linear NA 0.000000 0.000000 0.000501 0.000097 0.000000\n#> mean_weight weight_rms mean_bias bias_rms\n#> 1 NA NA NA NA\n#> 2 -0.009259 0.111978 0.497921 0.008852\n#> 3 -0.003265 0.101694 0.014595 0.000000\n#> \n#> \n#> H2ORegressionMetrics: deeplearning\n#> ** Reported on training data. 
**\n#> ** Metrics reported on full training frame **\n#> \n#> MSE: 156.8178\n#> RMSE: 12.52269\n#> MAE: 9.742575\n#> RMSLE: 0.4096152\n#> Mean Residual Deviance : 156.8178\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 26.9\n#> 2 28.8\n#> 3 18.3\n#> 4 47.1\n#> 5 34.8\n#> 6 31.5\n#> 7 32.5\n#> 8 42.5\n```\n:::\n\n\n## `keras` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n```\n:::\n\n\n## `nnet` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> a 2-5-1 network with 21 weights\n#> inputs: cement age \n#> output(s): strength \n#> options were - linear output units\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 26.0\n#> 2 42.1\n#> 3 29.2\n#> 4 67.8\n#> 5 36.7\n#> 6 33.3\n#> 7 33.3\n#> 8 33.9\n```\n:::\n\n\n## K-Nearest Neighbors (`nearest_neighbor()`) \n\n## `kknn` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_spec <- nearest_neighbor() |>\n # We need to set the mode since this engine works with multiple modes\n # and kknn is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_fit <- nearest_neighbor_spec |> fit(strength ~ ., data = reg_train)\nnearest_neighbor_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> kknn::train.kknn(formula = strength ~ ., data = data, ks = min_rows(5, data, 5))\n#> \n#> Type of response variable: continuous\n#> minimal mean absolute error: 8.257735\n#> Minimal mean squared error: 115.8737\n#> Best kernel: optimal\n#> Best k: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(nearest_neighbor_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 16.3\n#> 2 35.7\n#> 3 27.5\n#> 4 56.7\n#> 5 42.6\n#> 6 41.7\n#> 7 41.2\n#> 8 50.2\n```\n:::\n\n\n## Null Model (`null_model()`) \n\n## `parsnip` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_spec <- null_model() |>\n # We need to set the mode since this engine works with multiple modes\n # and parsnip is the default engine 
so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_fit <- null_model_spec |> fit(strength ~ ., data = reg_train)\nnull_model_fit\n#> parsnip model object\n#> \n#> Null Classification Model\n#> Predicted Value: 33.57728\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(null_model_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 33.6\n#> 2 33.6\n#> 3 33.6\n#> 4 33.6\n#> 5 33.6\n#> 6 33.6\n#> 7 33.6\n#> 8 33.6\n```\n:::\n\n\n## Partial Least Squares (`pls()`) \n\n## `mixOmics` Engine \n\nThis engine requires the plsmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(plsmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_spec <- pls() |>\n # We need to set the mode since this engine works with multiple modes\n # and mixOmics is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_fit <- pls_spec |> fit(strength ~ ., data = reg_train)\npls_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> mixOmics::spls(X = x, Y = y, ncomp = ncomp, keepX = keepX) \n#> \n#> sPLS with a 'regression' mode with 2 sPLS components. \n#> You entered data X of dimensions: 92 2 \n#> You entered data Y of dimensions: 92 1 \n#> \n#> Selection of [2] [2] variables on each of the sPLS components on the X data set. \n#> Selection of [1] [1] variables on each of the sPLS components on the Y data set. 
\n#> \n#> Main numerical outputs: \n#> -------------------- \n#> loading vectors: see object$loadings \n#> variates: see object$variates \n#> variable names: see object$names \n#> \n#> Functions to visualise samples: \n#> -------------------- \n#> plotIndiv, plotArrow \n#> \n#> Functions to visualise variables: \n#> -------------------- \n#> plotVar, plotLoadings, network, cim\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(pls_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\n```\n:::\n\n\n## Poisson Regression (`poisson_reg()`) \n\n## `gee` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Since this is a Poisson model, we fit to the count data. The gee engine\n# also expects a grouping variable declared via id_var() in the formula;\n# these data do not have one, so this call is schematic.\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n```\n:::\n\n\n## `glm` Engine \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and glm is the default engine so there is no need to set that either.\npoisson_reg_spec <- poisson_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = num_years ~ ., family = stats::poisson, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) age income \n#> 2.2861 0.2804 0.2822 \n#> \n#> Degrees of Freedom: 1460 Total (i.e. Null); 1458 Residual\n#> Null Deviance:\t 7434 \n#> Residual Deviance: 2597 \tAIC: 8446\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.6 \n#> 2 6.66\n#> 3 11.8 \n#> 4 24.8 \n#> 5 26.6 \n#> 6 8.23\n#> 7 32.1 \n#> 8 4.86\n#> 9 28.3\n```\n:::
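\n\n\nSince the glm engine uses a log link, exponentiated coefficients can be read as rate ratios. As a quick aside, here is one way to compute them with broom's `tidy()` method (the `rate_ratio` column is our own addition, not part of the standard output):\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# exp(coef) is the multiplicative change in the expected count for a\n# one-unit increase in a predictor\ntidy(poisson_reg_fit) |> \n mutate(rate_ratio = exp(estimate))\n```\n:::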
\n\n\n## `glmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Since this is a Poisson model, we fit to the count data. Note that glmer\n# models also need at least one random-effects term in the formula (e.g.\n# num_years ~ income + (1 | group)), so this call is schematic.\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n```\n:::\n\n\n## `glmnet` Engine \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"poisson\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 5.9710\n#> 2 1 10.26 5.4400\n#> 3 1 18.31 4.9570\n#> 4 2 24.84 4.5170\n#> 5 2 32.06 4.1150\n#> 6 2 37.94 3.7500\n#> 7 2 42.73 3.4170\n#> 8 2 46.65 3.1130\n#> 9 2 49.87 2.8370\n#> 10 2 52.51 2.5850\n#> 11 2 54.69 2.3550\n#> 12 2 56.48 2.1460\n#> 13 2 57.96 1.9550\n#> 14 2 59.18 1.7810\n#> 15 2 60.19 1.6230\n#> 16 2 61.03 1.4790\n#> 17 2 61.72 1.3480\n#> 18 2 62.29 1.2280\n#> 19 2 62.76 1.1190\n#> 20 2 63.16 1.0190\n#> 21 2 63.48 0.9289\n#> 22 2 63.75 0.8463\n#> 23 2 63.98 0.7712\n#> 24 2 64.16 0.7026\n#> 25 2 64.31 0.6402\n#> 26 2 64.44 0.5833\n#> 27 2 64.55 0.5315\n#> 28 2 64.64 0.4843\n#> 29 2 64.71 0.4413\n#> 30 2 64.77 0.4021\n#> 31 2 64.82 0.3664\n#> 32 2 64.86 0.3338\n#> 33 2 64.90 0.3042\n#> 34 2 64.92 0.2771\n#> 35 2 64.95 0.2525\n#> 36 2 64.97 0.2301\n#> 37 2 64.98 0.2096\n#> 38 2 65.00 0.1910\n#> 39 2 65.01 0.1741\n#> 40 2 65.02 0.1586\n#> 41 2 65.03 0.1445\n#> 42 2 65.03 0.1317\n#> 43 2 65.04 0.1200\n#> 44 2 65.04 0.1093\n#> 45 2 65.05 0.0996\n#> 46 2 65.05 0.0907\n#> 47 2 65.05 0.0827\n#> 48 2 65.05 0.0753\n#> 49 2 65.06 0.0687\n#> 50 2 65.06 0.0625\n#> 51 2 65.06 0.0570\n#> 52 2 65.06 0.0519\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.4 \n#> 2 6.70\n#> 3 11.8 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.27\n#> 7 31.8 \n#> 8 4.91\n#> 9 28.1\n```\n:::
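\n\n\nThe predictions above use the penalty value from the model specification (0.01). To get predictions at several penalty values along the same fitted regularization path, a quick sketch using parsnip's `multi_predict()` (the penalty values here are arbitrary):\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# One row per observation; the .pred column holds a nested tibble with one\n# prediction per requested penalty value.\nmulti_predict(poisson_reg_fit, new_data = count_test, penalty = c(0.01, 0.1))\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this 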
first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: glm\n#> Model ID: GLM_model_R_1763571327438_4149 \n#> GLM Model: summary\n#> family link regularization\n#> 1 poisson log Elastic Net (alpha = 0.5, lambda = 0.01194 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 4\n#> training_frame\n#> 1 object_xqwzxdmwtf\n#> \n#> Coefficients: glm coefficients\n#> names coefficients standardized_coefficients\n#> 1 Intercept 2.286411 2.286411\n#> 2 age 0.279967 0.279967\n#> 3 income 0.281952 0.281952\n#> \n#> H2ORegressionMetrics: glm\n#> ** Reported on training data. **\n#> \n#> MSE: 18.40519\n#> RMSE: 4.290128\n#> MAE: 3.297048\n#> RMSLE: 0.467537\n#> Mean Residual Deviance : 1.777749\n#> R^2 : 0.6934292\n#> Null Deviance :7434.374\n#> Null D.o.F. :1460\n#> Residual Deviance :2597.291\n#> Residual D.o.F. :1458\n#> AIC :8445.967\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.6 \n#> 2 6.67\n#> 3 11.8 \n#> 4 24.8 \n#> 5 26.5 \n#> 6 8.24\n#> 7 32.0 \n#> 8 4.87\n#> 9 28.2\n```\n:::\n\n\n## `hurdle` Engine \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"hurdle\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> pscl::hurdle(formula = num_years ~ ., data = data)\n#> \n#> Count model coefficients (truncated poisson with log link):\n#> (Intercept) age income \n#> 2.2911 0.2749 0.2820 \n#> \n#> Zero hurdle model coefficients (binomial with logit link):\n#> (Intercept) age income \n#> 24.656 5.611 13.092\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.5 \n#> 2 6.74\n#> 3 11.9 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.32\n#> 7 31.9 \n#> 8 4.89\n#> 9 28.2\n```\n:::\n\n\n## `stan` Engine \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit 
object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Since this is a Poisson model, we fit to the count data\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\npredict(poisson_reg_fit, type = \"conf_int\", new_data = count_test)\npredict(poisson_reg_fit, type = \"pred_int\", new_data = count_test)\n```\n:::\n\n\n## `stan_glmer` Engine \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Since this is a Poisson model, we fit to the count data. Note that\n# stan_glmer models also need at least one random-effects term in the\n# formula (e.g. num_years ~ income + (1 | group)), so this call is schematic.\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\npredict(poisson_reg_fit, type = \"pred_int\", new_data = count_test)\n```\n:::\n\n\n## `zeroinfl` Engine \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"zeroinfl\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\n#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> pscl::zeroinfl(formula = num_years ~ ., data = data)\n#> \n#> Count model coefficients (poisson with log link):\n#> (Intercept) age income \n#> 2.2912 0.2748 0.2821 \n#> \n#> Zero-inflation model coefficients (binomial with logit link):\n#> (Intercept) age income \n#> -48.26 -18.22 -11.72\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.5 \n#> 2 6.74\n#> 3 11.9 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.31\n#> 7 31.9 \n#> 8 4.93\n#> 9 28.2\n```\n:::
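\n\n\nHere, `predict()` returns the unconditional expected count. pscl also offers component-level predictions; a minimal sketch that reaches the underlying engine object, assuming pscl's engine-level predict types (here, `\"zero\"` for the probability of the structural-zero component):\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Engine-level predictions bypass parsnip, so the new data must match what\n# pscl::zeroinfl() saw at fit time.\nzeroinfl_engine <- extract_fit_engine(poisson_reg_fit)\npredict(zeroinfl_engine, newdata = count_test, type = \"zero\")\n```\n:::\n\n\n## Random Forests (`rand_forest()`) \n\n## `aorsf` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random regression forest\n#> \n#> Linear 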
combinations: Accelerated Linear regression\n#> N observations: 92\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 13.968\n#> Min observations in leaf: 5\n#> OOB stat value: 0.59\n#> OOB stat type: RSQ\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 25.0\n#> 2 36.6\n#> 3 30.4\n#> 4 55.7\n#> 5 42.0\n#> 6 38.8\n#> 7 40.6\n#> 8 53.5\n```\n:::\n\n\n## `grf` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"grf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\npredict(rand_forest_fit, type = \"conf_int\", new_data = reg_test)\n```\n:::\n\n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: drf\n#> Model ID: DRF_model_R_1763571327438_4150 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 21666 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 13 8.90000 12 47 29.82000\n#> \n#> \n#> H2ORegressionMetrics: drf\n#> ** Reported on training data. 
**\n#> ** Metrics reported on Out-Of-Bag training samples **\n#> \n#> MSE: 90.66979\n#> RMSE: 9.522068\n#> MAE: 7.491973\n#> RMSLE: 0.3441902\n#> Mean Residual Deviance : 90.66979\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.8\n#> 2 34.6\n#> 3 29.1\n#> 4 56.9\n#> 5 36.7\n#> 6 36.3\n#> 7 39.6\n#> 8 29.3\n```\n:::\n\n\n## `partykit` Engine \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\n\n# Too long to print\n# rand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 16.8\n#> 2 38.2\n#> 3 28.4\n#> 4 49.9\n#> 5 48.5\n#> 6 36.3\n#> 7 38.5\n#> 8 48.6\n```\n:::\n\n\n## `randomForest` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"randomForest\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> randomForest(x = maybe_data_frame(x), y = y) \n#> Type of random forest: regression\n#> Number of trees: 500\n#> No. of variables tried at each split: 1\n#> \n#> Mean of squared residuals: 90.27832\n#> % Var explained: 68.73\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.6\n#> 2 36.6\n#> 3 28.3\n#> 4 57.2\n#> 5 38.5\n#> 6 35.0\n#> 7 38.8\n#> 8 35.1\n```\n:::\n\n\n## `ranger` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n # and ranger is the default engine so there is no need to set that either.\n set_engine(\"ranger\", keep.inbag = TRUE) |> \n # However, we'll set the engine and use the keep.inbag=TRUE option so that we \n # can produce interval predictions. This is not generally required. 
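\n # (The intervals use an infinitesimal jackknife variance estimate that\n # needs the per-tree in-bag counts, which is what keep.inbag = TRUE stores.)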
\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Ranger result\n#> \n#> Call:\n#> ranger::ranger(x = maybe_data_frame(x), y = y, keep.inbag = ~TRUE, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1)) \n#> \n#> Type: Regression \n#> Number of trees: 500 \n#> Sample size: 92 \n#> Number of independent variables: 2 \n#> Mtry: 1 \n#> Target node size: 5 \n#> Variable importance mode: none \n#> Splitrule: variance \n#> OOB prediction error (MSE): 93.38443 \n#> R squared (OOB): 0.6801029\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.0\n#> 2 37.4\n#> 3 28.5\n#> 4 56.5\n#> 5 38.4\n#> 6 35.8\n#> 7 38.5\n#> 8 34.5\npredict(rand_forest_fit, type = \"conf_int\", new_data = reg_test)\n#> Warning in rInfJack(pred = result$predictions, inbag = inbag.counts, used.trees\n#> = 1:num.trees): Sample size <=20, no calibration performed.\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 20.4 27.6\n#> 2 31.3 43.6\n#> 3 24.2 32.7\n#> 4 44.5 68.4\n#> 5 33.4 43.4\n#> 6 31.3 40.4\n#> 7 35.5 41.4\n#> 8 27.0 42.0\n```\n:::\n\n\n## `spark` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n set_engine(\"spark\") |> \n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> RandomForestRegressionModel: uid=random_forest__9f449384_cf84_4bcb_afa5_43e10c342627, numTrees=20, numFeatures=8\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? 
x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 27.1\n#> 2 28.6\n#> 3 25.9\n#> 4 29.6\n#> 5 16.4\n#> 6 34.5\n#> 7 19.2\n#> 8 30.1\n#> 9 37.5\n#> 10 44.2\n#> # ℹ more rows\n```\n:::\n\n\n## Rule Fit (`rule_fit()`) \n\n## `h2o` Engine \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: rulefit\n#> Model ID: RuleFit_model_R_1763571327438_4151 \n#> Rulefit Model Summary: \n#> family link regularization number_of_predictors_total\n#> 1 gaussian identity Lasso (lambda = 0.9516 ) 1783\n#> number_of_active_predictors number_of_iterations rule_ensemble_size\n#> 1 70 1 1781\n#> number_of_trees number_of_internal_trees min_depth max_depth mean_depth\n#> 1 150 150 0 5 4.00000\n#> min_leaves max_leaves mean_leaves\n#> 1 0 26 11.87333\n#> \n#> \n#> H2ORegressionMetrics: rulefit\n#> ** Reported on training data. **\n#> \n#> MSE: 91.07972\n#> RMSE: 9.54357\n#> MAE: 7.180123\n#> RMSLE: 0.3532356\n#> Mean Residual Deviance : 91.07972\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 27.0\n#> 2 36.1\n#> 3 26.8\n#> 4 49.8\n#> 5 42.2\n#> 6 34.7\n#> 7 39.4\n#> 8 40.8\n```\n:::\n\n\n## `xrf` Engine \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n # and xrf is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> An eXtreme RuleFit model of 179 rules.\n#> \n#> Original Formula:\n#> \n#> strength ~ cement + age\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 27.5\n#> 2 32.0\n#> 3 26.5\n#> 4 52.9\n#> 5 35.9\n#> 6 31.8\n#> 7 46.2\n#> 8 30.8\n```\n:::\n\n\n## Support Vector Machine (Linear Kernel) (`svm_linear()`) \n\n## `kernlab` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"kernlab\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(strength ~ ., 
data = reg_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Linear (vanilla) kernel function. \n#> \n#> Number of Support Vectors : 85 \n#> \n#> Objective Function Value : -47.4495 \n#> Training error : 0.606701\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.4\n#> 2 30.9\n#> 3 21.7\n#> 4 47.1\n#> 5 36.4\n#> 6 33.4\n#> 7 34.2\n#> 8 43.2\n```\n:::\n\n\n## `LiblineaR` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n # and LiblineaR is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(strength ~ ., data = reg_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized L2-loss support vector regression primal (L2R_L2LOSS_SVR)\"\n#> \n#> $Type\n#> [1] 11\n#> \n#> $W\n#> cement age Bias\n#> [1,] 8.665447 5.486263 33.34299\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 31.9\n#> 2 30.1\n#> 3 21.5\n#> 4 50.9\n#> 5 39.9\n#> 6 35.0\n#> 7 36.0\n#> 8 48.3\n```\n:::\n\n\n## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) \n\n## `kernlab` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_spec <- svm_poly() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_fit <- svm_poly_spec |> fit(strength ~ ., data = reg_train)\n#> Setting default kernel parameters\nsvm_poly_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Polynomial kernel function. 
\n#> Hyperparameters : degree = 1 scale = 1 offset = 1 \n#> \n#> Number of Support Vectors : 85 \n#> \n#> Objective Function Value : -47.4495 \n#> Training error : 0.606702\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_poly_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.4\n#> 2 30.9\n#> 3 21.7\n#> 4 47.1\n#> 5 36.4\n#> 6 33.4\n#> 7 34.2\n#> 8 43.2\n```\n:::\n\n\n## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) \n\n## `kernlab` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(strength ~ ., data = reg_train)\nsvm_rbf_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Gaussian Radial Basis kernel function. \n#> Hyperparameter : sigma = 2.50601403779482 \n#> \n#> Number of Support Vectors : 81 \n#> \n#> Objective Function Value : -29.5383 \n#> Training error : 0.206927\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 16.0\n#> 2 33.9\n#> 3 28.7\n#> 4 57.2\n#> 5 37.0\n#> 6 36.2\n#> 7 37.5\n#> 8 40.1\n```\n:::\n\n\n## `liquidSVM` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"liquidSVM\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(strength ~ ., data = reg_train)\nsvm_rbf_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 16.0\n#> 2 33.9\n#> 3 28.7\n#> 4 57.2\n#> 5 37.0\n#> 6 36.2\n#> 7 37.5\n#> 8 40.1\n```\n:::\n\n\n# Censored Regression Models\n\nLet's simulate a data set using the prodlim and survival packages: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(survival)\n#> \n#> Attaching package: 'survival'\n#> The following object is masked from 'package:future':\n#> \n#> cluster\nlibrary(prodlim)\n\nset.seed(1000)\ncns_data <- \n SimSurv(250) |> \n mutate(event_time = Surv(time, event)) |> \n select(event_time, X1, X2)\n\ncns_split <- initial_split(cns_data, prop = 0.98)\ncns_split\n#> \n#> <245/5/250>\ncns_train <- training(cns_split)\ncns_test <- testing(cns_split)\n```\n:::\n\n\nFor some types of predictions, we need the _evaluation time(s)_ for the predictions. 
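For survival predictions, each evaluation time is a point at which the probability of surviving beyond that time is estimated. 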
We'll use these three times to demonstrate: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\neval_times <- c(1, 3, 5)\n```\n:::\n\n\n## Bagged Decision Trees (`bag_tree()`) \n\n## `rpart` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"censored regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit <- bag_tree_spec |> fit(event_time ~ ., data = cns_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> \n#> Bagging survival trees with 25 bootstrap replications \n#> \n#> Call: bagging.data.frame(formula = event_time ~ ., data = data, cp = ~0, \n#> minsplit = ~2)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.65\n#> 2 4.12\n#> 3 5.03\n#> 4 5.58\n#> 5 4.88\npredict(bag_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\n## Boosted Decision Trees (`boost_tree()`) \n\n## `mboost` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"censored regression\") |> \n set_engine(\"mboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit <- boost_tree_spec |> fit(event_time ~ ., data = cns_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> \n#> \t Model-based Boosting\n#> \n#> Call:\n#> mboost::blackboost(formula = formula, data = data, family = family, control = mboost::boost_control(), tree_controls = partykit::ctree_control(teststat = \"quadratic\", testtype = \"Teststatistic\", mincriterion = 0, minsplit = 10, minbucket = 4, maxdepth = 2, saveinfo = FALSE))\n#> \n#> \n#> \t Cox Partial Likelihood \n#> \n#> Loss function: \n#> \n#> Number of boosting iterations: mstop = 100 \n#> Step size: 0.1 \n#> Offset: 0 \n#> Number of baselearners: 1\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 6.51\n#> 2 3.92\n#> 3 4.51\n#> 4 7.17\n#> 5 4.51\npredict(boost_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(boost_tree_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 0.00839\n#> 2 -1.14 \n#> 3 -0.823 \n#> 4 0.229 \n#> 5 -0.823\n```\n:::\n\n\n## Decision Tree (`decision_tree()`) \n\n## `partykit` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> event_time ~ X1 + X2\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] X2 <= -0.36159\n#> | | [3] X1 <= 0: 13.804 (n = 41)\n#> | | [4] X1 > 0: 8.073 (n = 47)\n#> | [5] X2 > -0.36159\n#> | | [6] X1 <= 0: 6.274 (n = 89)\n#> | | [7] X1 > 0\n#> | | | [8] X2 <= 0.56098: 5.111 (n = 39)\n#> | | | [9] X2 > 0.56098: 2.713 (n = 29)\n#> \n#> Number of inner nodes: 4\n#> Number of terminal nodes: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 6.27\n#> 2 5.11\n#> 3 6.27\n#> 4 6.27\n#> 5 6.27\npredict(decision_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\n## `rpart` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"censored regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> $rpart\n#> n= 245 \n#> \n#> node), split, n, deviance, yval\n#> * denotes terminal node\n#> \n#> 1) root 245 329.03530 1.0000000 \n#> 2) X2< -0.09937043 110 119.05180 0.5464982 \n#> 4) X2< -0.9419799 41 42.43138 0.3153769 \n#> 8) X1< 0.5 20 12.93725 0.1541742 *\n#> 9) X1>=0.5 21 23.29519 0.5656502 *\n#> 5) X2>=-0.9419799 69 67.76223 0.7336317 *\n#> 3) X2>=-0.09937043 135 157.14990 1.7319010 \n#> 6) X1< 0.5 79 66.30972 1.2572690 *\n#> 7) X1>=0.5 56 69.62652 3.0428230 \n#> 14) X2< 1.222057 44 40.33335 2.5072040 *\n#> 15) X2>=1.222057 12 17.95790 6.3934130 *\n#> \n#> $survfit\n#> \n#> Call: prodlim::prodlim(formula = form, data = data)\n#> Stratified Kaplan-Meier estimator for the conditional event time survival function\n#> Discrete predictor variable: rpartFactor (0.154174164904031, 0.565650228981439, 0.733631734872791, 1.25726850344687, 2.50720371146533, 6.39341334321542)\n#> \n#> Right-censored response of a survival model\n#> \n#> No.Observations: 245 \n#> \n#> Pattern:\n#> Freq\n#> event 161 \n#> right.censored 84 \n#> \n#> $levels\n#> [1] \"0.154174164904031\" \"0.565650228981439\" \"0.733631734872791\"\n#> [4] \"1.25726850344687\" \"2.50720371146533\" \"6.39341334321542\" \n#> \n#> attr(,\"class\")\n#> [1] \"pecRpart\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: 
{.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 1.26\n#> 2 2.51\n#> 3 1.26\n#> 4 1.26\n#> 5 1.26\npredict(decision_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\n## Proportional Hazards (`proportional_hazards()`) \n\n## `glmnet` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_spec <- proportional_hazards(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train)\nproportional_hazards_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = data_obj$x, y = data_obj$y, family = \"cox\", weights = weights, alpha = alpha, lambda = lambda) \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.39700\n#> 2 1 0.82 0.36170\n#> 3 1 1.51 0.32960\n#> 4 1 2.07 0.30030\n#> 5 1 2.54 0.27360\n#> 6 1 2.94 0.24930\n#> 7 2 3.28 0.22720\n#> 8 2 3.95 0.20700\n#> 9 2 4.50 0.18860\n#> 10 2 4.95 0.17180\n#> 11 2 5.33 0.15660\n#> 12 2 5.65 0.14270\n#> 13 2 5.91 0.13000\n#> 14 2 6.13 0.11840\n#> 15 2 6.31 0.10790\n#> 16 2 6.46 0.09833\n#> 17 2 6.58 0.08960\n#> 18 2 6.69 0.08164\n#> 19 2 6.77 0.07439\n#> 20 2 6.85 0.06778\n#> 21 2 6.91 0.06176\n#> 22 2 6.96 0.05627\n#> 23 2 7.00 0.05127\n#> 24 2 7.03 0.04672\n#> 25 2 7.06 0.04257\n#> 26 2 7.08 0.03879\n#> 27 2 7.10 0.03534\n#> 28 2 7.12 0.03220\n#> 29 2 7.13 0.02934\n#> 30 2 7.14 0.02673\n#> 31 2 7.15 0.02436\n#> 32 2 7.16 0.02219\n#> 33 2 7.17 0.02022\n#> 34 2 7.17 0.01843\n#> 35 2 7.18 0.01679\n#> 36 2 7.18 0.01530\n#> 37 2 7.18 0.01394\n#> 38 2 7.19 0.01270\n#> 39 2 7.19 0.01157\n#> 40 2 7.19 0.01054\n#> 41 2 7.19 0.00961\n#> 42 2 7.19 0.00875\n#> The training data has been saved for prediction.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(proportional_hazards_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.80\n#> 2 4.21\n#> 3 4.63\n#> 4 5.18\n#> 5 4.42\npredict(proportional_hazards_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(proportional_hazards_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -0.108\n#> 2 -1.43 \n#> 3 -1.23 \n#> 4 -0.993\n#> 5 -1.33\n```\n:::\n\n\n## `survival` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and survival is the default engine so there is no need to set that either.\nproportional_hazards_spec <- proportional_hazards()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train)\nproportional_hazards_fit\n#> parsnip model object\n#> \n#> Call:\n#> survival::coxph(formula = event_time ~ ., data = data, model = TRUE, \n#> x = TRUE)\n#> \n#> coef exp(coef) se(coef) z p\n#> X1 0.99547 2.70599 0.16799 5.926 3.11e-09\n#> X2 0.91398 2.49422 0.09566 9.555 < 2e-16\n#> \n#> Likelihood ratio test=106.8 on 2 df, p=< 2.2e-16\n#> n= 245, number of events= 161\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(proportional_hazards_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.87\n#> 2 4.16\n#> 3 4.62\n#> 4 5.19\n#> 5 4.41\npredict(proportional_hazards_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(proportional_hazards_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -0.111\n#> 2 -1.49 \n#> 3 -1.27 \n#> 4 -1.02 \n#> 5 -1.37\n```\n:::\n\n\n## Random Forests (`rand_forest()`) \n\n## `aorsf` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random survival forest\n#> \n#> Linear combinations: Accelerated Cox regression\n#> N observations: 245\n#> N events: 161\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 12.4\n#> Min observations in leaf: 5\n#> Min events in leaf: 1\n#> OOB stat value: 0.71\n#> OOB stat type: Harrell's C-index\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.98\n#> 2 3.96\n#> 3 4.39\n#> 4 5.53\n#> 5 4.26\npredict(rand_forest_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\n## `partykit` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train)\n\n# Too long to print\n# 
rand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.22\n#> 2 3.99\n#> 3 3.87\n#> 4 5.54\n#> 5 3.87\npredict(rand_forest_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\n## Parametric Survival Models (`survival_reg()`) \n\n## `flexsurv` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_spec <- survival_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"flexsurv\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)\nsurvival_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> flexsurv::flexsurvreg(formula = event_time ~ ., data = data, \n#> dist = \"weibull\")\n#> \n#> Estimates: \n#> data mean est L95% U95% se exp(est) L95% \n#> shape NA 2.11486 1.87774 2.38192 0.12832 NA NA\n#> scale NA 9.34809 8.38852 10.41743 0.51658 NA NA\n#> X1 0.46939 -0.46483 -0.61347 -0.31619 0.07584 0.62824 0.54147\n#> X2 -0.00874 -0.42229 -0.50641 -0.33817 0.04292 0.65554 0.60266\n#> U95% \n#> shape NA\n#> scale NA\n#> X1 0.72892\n#> X2 0.71307\n#> \n#> N = 245, Events: 161, Censored: 84\n#> Total time at risk: 1388.951\n#> Log-likelihood = -427.4387, df = 4\n#> AIC = 862.8774\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(survival_reg_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.87\n#> 2 4.13\n#> 3 4.61\n#> 4 5.16\n#> 5 4.40\npredict(survival_reg_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"hazard\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 2.18\n#> 2 1.54\n#> 3 1.65\n#> 4 1.76\n#> 5 1.60\npredict(survival_reg_fit, type = \"quantile\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_quantile\n#> \n#> 1 [7.47]\n#> 2 [3.92]\n#> 3 [4.37]\n#> 4 [4.9]\n#> 5 [4.18]\n```\n:::\n\n\n## `flexsurvspline` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_spec <- survival_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"flexsurvspline\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)\nsurvival_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> flexsurv::flexsurvspline(formula = event_time ~ ., data = data)\n#> \n#> 
Estimates: \n#> data mean est L95% U95% se exp(est) L95% \n#> gamma0 NA -4.72712 -5.31772 -4.13651 0.30134 NA NA\n#> gamma1 NA 2.11487 1.86338 2.36637 0.12832 NA NA\n#> X1 0.46939 0.98305 0.65928 1.30683 0.16519 2.67261 1.93340\n#> X2 -0.00874 0.89308 0.70943 1.07673 0.09370 2.44265 2.03283\n#> U95% \n#> gamma0 NA\n#> gamma1 NA\n#> X1 3.69444\n#> X2 2.93508\n#> \n#> N = 245, Events: 161, Censored: 84\n#> Total time at risk: 1388.951\n#> Log-likelihood = -427.4387, df = 4\n#> AIC = 862.8774\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(survival_reg_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.87\n#> 2 4.13\n#> 3 4.61\n#> 4 5.16\n#> 5 4.40\npredict(survival_reg_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"hazard\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -4.62\n#> 2 -3.26\n#> 3 -3.49\n#> 4 -3.73\n#> 5 -3.39\npredict(survival_reg_fit, type = \"quantile\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_quantile\n#> \n#> 1 [7.47]\n#> 2 [3.92]\n#> 3 [4.37]\n#> 4 [4.9]\n#> 5 [4.18]\n```\n:::\n\n\n## `survival` Engine \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and survival is the default engine so there is no need to set that either.\nsurvival_reg_spec <- survival_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)\nsurvival_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> survival::survreg(formula = event_time ~ ., data = data, model = TRUE)\n#> \n#> Coefficients:\n#> (Intercept) X1 X2 \n#> 2.2351722 -0.4648296 -0.4222887 \n#> \n#> Scale= 0.4728442 \n#> \n#> Loglik(model)= -427.4 Loglik(intercept only)= -481.3\n#> \tChisq= 107.73 on 2 degrees of freedom, p= <2e-16 \n#> n= 245\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(survival_reg_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 8.88\n#> 2 4.67\n#> 3 5.20\n#> 4 5.83\n#> 5 4.97\npredict(survival_reg_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"hazard\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 2.18\n#> 2 1.54\n#> 3 1.65\n#> 4 1.76\n#> 5 1.60\npredict(survival_reg_fit, type = \"quantile\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_quantile\n#> \n#> 1 [7.47]\n#> 2 [3.92]\n#> 3 [4.37]\n#> 4 [4.9]\n#> 5 [4.18]\n```\n:::\n\n\n# Quantile Regression Models\n\nTo demonstrate quantile 
regression, let's make a larger version of our regression data: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(938)\nqnt_split <-\n modeldata::concrete |> \n slice_sample(n = 100) |> \n select(strength = compressive_strength, cement, age) |> \n initial_split(prop = 0.95, strata = strength)\nqnt_split\n#> \n#> <92/8/100>\n\nqnt_rec <- \n recipe(strength ~ ., data = training(qnt_split)) |> \n step_normalize(all_numeric_predictors()) |> \n prep()\n\nqnt_train <- bake(qnt_rec, new_data = NULL)\nqnt_test <- bake(qnt_rec, new_data = testing(qnt_split))\n```\n:::\n\n\nWe'll also predict these quantile levels: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nqnt_lvls <- (1:3) / 4\n```\n:::\n\n\n\n## Linear Regression (`linear_reg()`) \n\n## `quantreg` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n set_engine(\"quantreg\") |> \n set_mode(\"quantile regression\", quantile_levels = qnt_lvls)\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = qnt_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> quantreg::rq(formula = strength ~ ., tau = quantile_levels, data = data)\n#> \n#> Coefficients:\n#> tau= 0.25 tau= 0.50 tau= 0.75\n#> (Intercept) 23.498029 33.265428 42.046031\n#> cement 6.635233 7.955658 8.181235\n#> age 5.566668 9.514832 7.110702\n#> \n#> Degrees of freedom: 92 total; 89 residual\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, type = \"quantile\", new_data = qnt_test)\n#> # A tibble: 8 × 1\n#> .pred_quantile\n#> \n#> 1 [29.2]\n#> 2 [31.5]\n#> 3 [21.4]\n#> 4 [48.3]\n#> 5 [36.6]\n#> 6 [33.8]\n#> 7 [34.6]\n#> 8 [43.8]\n```\n:::\n\n\n## Random Forests (`rand_forest()`) \n\n## `grf` Engine \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |> \n set_mode(\"quantile regression\", quantile_levels = qnt_lvls) |>\n set_engine(\"grf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = qnt_train)\nrand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"quantile\", new_data = qnt_test)\n```\n:::\n\n\n\n\n", - "supporting": [], + "markdown": "---\ntitle: \"Fitting and predicting with parsnip\"\ncategories:\n - model fitting\n - parsnip\n - regression\n - classification\ntype: learn-subsection\nweight: 1\ndescription: | \n Examples that show how to fit and predict with different combinations of model, mode, and engine.\ntoc: true\ntoc-depth: 3\ninclude-after-body: ../../../resources.html\nformat:\n html:\n theme: [\"style.scss\"]\n---\n\n\n\n\n\n\n# Introduction\n\nThis page shows examples of how to *fit* and *predict* with different combinations of model, mode, and engine. 
As a reminder, in parsnip, \n\n- the **model type** differentiates basic modeling approaches, such as random forests, logistic regression, linear support vector machines, etc.,\n\n- the **mode** denotes in what kind of modeling context it will be used (most commonly, classification or regression), and\n\n- the computational **engine** indicates how the model is fit, such as with a specific R package implementation or even methods outside of R like Keras or Stan.\n\nWe'll break the examples up by their mode. For each model, we'll show the data sets used across the different engines. \n\nTo use the code in this article, you will need to install the following packages: agua, baguette, bonsai, censored, discrim, HSAUR3, lme4, multilevelmod, plsmod, poissonreg, prodlim, rules, sparklyr, survival, and tidymodels. Numerous other \"engine\" packages are required as well. If you use a model whose engine packages are not installed, parsnip will prompt you to install them. Some packages require non-standard installation or rely on external dependencies; we'll describe these next. \n\n## External Dependencies\n\nSome models available in parsnip rely on computational frameworks outside of R. There may be additional downloads for engines using **catboost**, **Spark**, **h2o**, **tensorflow**/**keras**, and **torch**. You can expand the sections below to get basic installation instructions.\n\n
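\n\nBefore stepping through the individual frameworks, note that you can ask a model specification which packages it needs. This is a minimal sketch that uses parsnip's `required_pkgs()` function; the `boost_tree()`/xgboost combination is only an illustration:\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(parsnip)\n\n# List the packages needed to fit and predict with this specification:\nboost_tree() |>\n set_mode(\"classification\") |>\n set_engine(\"xgboost\") |>\n required_pkgs()\n```\n:::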
\n\n### catboost\n\ncatboost is a popular boosting framework. Unfortunately, the R package is not available on CRAN. First, go to [https://github.com/catboost/catboost/releases/](https://github.com/catboost/catboost/releases/) and search for \"`[R-package]`\" to find the most recent release. \n\nThe following code can be used to install and test the package (which requires the glue package to be installed): \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(glue)\n\n# Put the current version number in this variable: \nversion <- \"#.##\"\n\n# Note that this file name is for macOS; pick the release asset that matches\n# your platform.\ntemplate <- \"https://github.com/catboost/catboost/releases/download/v{version}/catboost-R-darwin-universal2-{version}.tgz\"\n\ntarget_url <- glue::glue(template)\ntarget_dest <- tempfile()\ndownload.file(target_url, target_dest)\n\nif (grepl(\"^mac\", .Platform$pkgType)) {\n options <- \"--no-staged-install\"\n} else {\n options <- \"\"\n}\n\ninst <- glue::glue(\"R CMD INSTALL {options} {target_dest}\")\nsystem(inst)\n```\n:::\n\n\nTo test, fit an example model: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(catboost)\n\ntrain_pool_path <- system.file(\"extdata\", \"adult_train.1000\", package = \"catboost\")\ntest_pool_path <- system.file(\"extdata\", \"adult_test.1000\", package = \"catboost\")\ncd_path <- system.file(\"extdata\", \"adult.cd\", package = \"catboost\")\ntrain_pool <- catboost.load_pool(train_pool_path, column_description = cd_path)\ntest_pool <- catboost.load_pool(test_pool_path, column_description = cd_path)\nfit_params <- list(\n iterations = 100,\n loss_function = 'Logloss',\n ignored_features = c(4, 9),\n border_count = 32,\n depth = 5,\n learning_rate = 0.03,\n l2_leaf_reg = 3.5,\n train_dir = tempdir())\n\n# Fit a small model with these options:\nmodel <- catboost.train(train_pool, test_pool, fit_params)\n```\n:::\n\n\n### Apache Spark\n\nTo use [Apache Spark](https://spark.apache.org/) as an engine, we first need to install Spark and then create a connection to a cluster. For this article, we will set up and use a single-node Spark cluster running on a laptop.\n\nFirst, install sparklyr:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ninstall.packages(\"sparklyr\")\n```\n:::\n\n\nand then install the Spark backend. For example, you might use: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nspark_install(version = \"4.0\")\n```\n:::\n\n\nOnce that is working, you can get ready to fit models using: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nsc <- spark_connect(\"local\")\n#> Warning in sprintf(version$pattern, version$spark, version$hadoop): 2 arguments\n#> not used by format 'spark-4.1.0-preview3-bin-hadoop3'\n```\n:::\n\n\n### h2o \n\nh2o.ai offers a Java-based high-performance computing server for machine learning. This can be run locally or externally. There are general installation instructions at [https://docs.h2o.ai/](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html). There is a package on CRAN, but you can also install directly from [h2o](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html#install-in-r) via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ninstall.packages(\n \"h2o\",\n type = \"source\",\n repos = \"http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R\"\n)\n```\n:::\n\n\nAfter installation is complete, you can start a local server via `h2o::h2o.init()`. \n\nThe tidymodels [agua](https://agua.tidymodels.org/) package contains some helpers and will also need to be installed. 
You can use its `h2o_start()` function to start a server too:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n#> \n#> Attaching package: 'agua'\n#> The following object is masked from 'package:workflowsets':\n#> \n#> rank_results\nh2o_start()\n#> Warning: JAVA not found, H2O may take minutes trying to connect.\n#> Warning in h2o.clusterInfo(): \n#> Your H2O cluster version is (1 year, 11 months and 4 days) old. There may be a newer version available.\n#> Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html\n```\n:::\n\n\n### Tensorflow and Keras\n\nR's tensorflow and keras3 packages call Python directly. To enable this, you'll first have to install the keras3 R package: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ninstall.packages(\"keras3\")\n```\n:::\n\n\nOnce that is done, use: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nkeras3::install_keras(backend = \"tensorflow\")\n```\n:::\n\n\nThere are other options for installation. See [https://tensorflow.rstudio.com/install/index.html](https://tensorflow.rstudio.com/install/index.html) for more details. If the installation used a Python virtual environment, you can point reticulate to it: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Assumes you are going to use a virtual environment with \"tensorflow\" in its\n# name: \npve <- grep(\"tensorflow\", reticulate::virtualenv_list(), value = TRUE)\nreticulate::use_virtualenv(pve)\n```\n:::\n\n\n### Torch\n\nR's torch package is the low-level package that contains the framework. Once you have installed it, you will get this message the first time you load the package: \n\n> Additional software needs to be downloaded and installed for torch to work correctly.\n\nChoosing \"Yes\" will do the _one-time_ installation. \n\n
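\n\nIf you want to verify that this one-time download worked, a quick sanity check (assuming only that the torch package is installed) is to create a small tensor:\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(torch)\n\n# If the backend is in place, this prints a 1-D float tensor:\ntorch_tensor(c(1, 2, 3))\n```\n:::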
\n\nTo get started, let's load the tidymodels package: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(tidymodels)\ntheme_set(theme_bw() + theme(legend.position = \"top\"))\n```\n:::\n\n\n# Classification Models\n\nTo demonstrate classification, let's make small training and test sets for a binary outcome. We'll center and scale the data since some models require the predictors to be in the same units.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(207)\nbin_split <- \n\tmodeldata::two_class_dat |> \n\trename(class = Class) |> \n\tinitial_split(prop = 0.994, strata = class)\nbin_split\n#> <Training/Testing/Total>\n#> <785/6/791>\n\nbin_rec <- \n recipe(class ~ ., data = training(bin_split)) |> \n step_normalize(all_numeric_predictors()) |> \n prep()\n\nbin_train <- bake(bin_rec, new_data = NULL)\nbin_test <- bake(bin_rec, new_data = testing(bin_split))\n```\n:::\n\n\nFor models that _only_ work for three or more classes, we'll simulate:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(1752)\nmtl_data <-\n sim_multinomial(\n 200,\n ~ -0.5 + 0.6 * abs(A),\n ~ ifelse(A > 0 & B > 0, 1.0 + 0.2 * A / B, - 2),\n ~ A + B - A * B)\n\nmtl_split <- initial_split(mtl_data, prop = 0.967, strata = class)\nmtl_split\n#> <Training/Testing/Total>\n#> <192/8/200>\n\n# Predictors are in the same units\nmtl_train <- training(mtl_split)\nmtl_test <- testing(mtl_split)\n```\n:::\n\n\nFinally, we have some models that handle hierarchical data, where some rows are statistically correlated with other rows. For these examples, we'll use data from a clinical trial where patients were followed over time. The outcome is binary. The data are in the HSAUR3 package. We'll split these data so that all rows for a specific subject fall in either the training set or the test set: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(72)\ncls_group_split <- \n HSAUR3::toenail |> \n group_initial_split(group = patientID)\ncls_group_train <- training(cls_group_split)\ncls_group_test <- testing(cls_group_split)\n```\n:::\n\n\nThere are 219 subjects in the training set and 75 in the test set. \n\nIf using the **Apache Spark** engine, we will need to identify the data source and then use it to create the splits. 
For this article, we will copy the `two_class_dat` and the `mtl_data` data sets into the Spark session.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nsc <- spark_connect(\"local\")\n#> Re-using existing Spark connection to local\n\ntbl_two_class <- copy_to(sc, modeldata::two_class_dat)\n\ntbl_bin <- sdf_random_split(tbl_two_class, training = 0.994, test = 1-0.994, seed = 100)\n\ntbl_sim_mtl <- copy_to(sc, mtl_data)\n\ntbl_mtl <- sdf_random_split(tbl_sim_mtl, training = 0.967, test = 1-0.967, seed = 100)\n```\n:::\n\n\n\n## Bagged MARS (`bag_mars()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_spec <- bag_mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(268)\nbag_mars_fit <- bag_mars_spec |> fit(class ~ ., data = bin_train)\n#> \n#> Attaching package: 'plotrix'\n#> The following object is masked from 'package:scales':\n#> \n#> rescale\n#> Registered S3 method overwritten by 'butcher':\n#> method from \n#> as.character.dev_topic generics\nbag_mars_fit\n#> parsnip model object\n#> \n#> Bagged MARS (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 100 0 11\n#> 2 A 40.4 1.60 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mars_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_mars_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.452 0.548 \n#> 2 0.854 0.146 \n#> 3 0.455 0.545 \n#> 4 0.968 0.0316\n#> 5 0.939 0.0610\n#> 6 0.872 0.128\n```\n:::\n\n\n:::\n\n## Bagged Neural Networks (`bag_mlp()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_spec <- bag_mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(318)\nbag_mlp_fit <- bag_mlp_spec |> fit(class ~ ., data = bin_train)\nbag_mlp_fit\n#> parsnip model object\n#> \n#> Bagged nnet (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 A 52.1 2.16 11\n#> 2 B 47.9 2.16 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.439 0.561\n#> 2 0.676 0.324\n#> 3 0.428 0.572\n#> 4 0.727 0.273\n#> 5 0.709 0.291\n#> 6 0.660 0.340\n```\n:::\n\n\n:::\n\n## Bagged Decision Trees (`bag_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(985)\nbag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged CART (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 271. 4.35 11\n#> 2 A 237. 5.58 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0 1 \n#> 2 1 0 \n#> 3 0.0909 0.909 \n#> 4 1 0 \n#> 5 0.727 0.273 \n#> 6 0.909 0.0909\n```\n:::\n\n\n## `C5.0` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(937)\nbag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged C5.0 (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 100 0 11\n#> 2 A 48.7 7.33 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.269 0.731\n#> 2 0.863 0.137\n#> 3 0.259 0.741\n#> 4 0.897 0.103\n#> 5 0.897 0.103\n#> 6 0.870 0.130\n```\n:::\n\n\n:::\n\n## Bayesian Additive Regression Trees 
(`bart()`) \n\n:::{.panel-tabset}\n\n## `dbarts` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_spec <- bart() |>\n # We need to set the mode since this engine works with multiple modes\n # and dbarts is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(217)\nbart_fit <- bart_spec |> fit(class ~ ., data = bin_train)\nbart_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> `NULL`()\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bart_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bart_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.439 0.561\n#> 2 0.734 0.266\n#> 3 0.34 0.66 \n#> 4 0.957 0.043\n#> 5 0.931 0.069\n#> 6 0.782 0.218\npredict(bart_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_lower_Class2 .pred_upper_Class1 .pred_upper_Class2\n#> \n#> 1 0.815 0.00280 0.997 0.185\n#> 2 0.781 0.0223 0.978 0.219\n#> 3 0.558 0.0702 0.930 0.442\n#> 4 0.540 0.105 0.895 0.460\n#> 5 0.239 0.345 0.655 0.761\n#> 6 0.195 0.469 0.531 0.805\npredict(bart_fit, type = \"pred_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_lower_Class2 .pred_upper_Class1 .pred_upper_Class2\n#> \n#> 1 0 0 1 1\n#> 2 0 0 1 1\n#> 3 0 0 1 1\n#> 4 0 0 1 1\n#> 5 0 0 1 1\n#> 6 0 0 1 1\n```\n:::\n\n\n:::\n\n## Boosted Decision Trees (`boost_tree()`) \n\n:::{.panel-tabset}\n\n## `xgboost` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and xgboost is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(738)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> ##### xgb.Booster\n#> raw: 40.4 Kb \n#> call:\n#> xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, \n#> colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, \n#> subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist, \n#> verbose = 0, nthread = 1, objective = \"binary:logistic\")\n#> params (as set within xgb.train):\n#> eta = \"0.3\", max_depth = \"6\", gamma = \"0\", colsample_bytree = \"1\", colsample_bynode = \"1\", min_child_weight = \"1\", subsample = \"1\", nthread = \"1\", objective = \"binary:logistic\", validate_parameters = \"TRUE\"\n#> xgb.attributes:\n#> niter\n#> callbacks:\n#> cb.evaluation.log()\n#> # of features: 2 \n#> niter: 15\n#> nfeatures : 2 \n#> evaluation_log:\n#> iter training_logloss\n#> \n#> 1 0.5546750\n#> 2 0.4719804\n#> --- ---\n#> 14 0.2587640\n#> 15 0.2528938\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", 
new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.244 0.756 \n#> 2 0.770 0.230 \n#> 3 0.307 0.693 \n#> 4 0.944 0.0565\n#> 5 0.821 0.179 \n#> 6 0.938 0.0621\n```\n:::\n\n\n## `C5.0` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(984)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = 15, control = C50::C5.0Control(minCases\n#> = 2, sample = 0))\n#> \n#> Classification Tree\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Number of boosting iterations: 15 requested; 7 used due to early stopping\n#> Average tree size: 3.1 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.307 0.693\n#> 2 0.756 0.244\n#> 3 0.281 0.719\n#> 4 1 0 \n#> 5 1 0 \n#> 6 0.626 0.374\n```\n:::\n\n\n## `catboost` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"catboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(644)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> CatBoost model (1000 trees)\n#> Loss function: Logloss\n#> Fit to 2 feature(s)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.291 0.709 \n#> 2 0.836 0.164 \n#> 3 0.344 0.656 \n#> 4 0.998 0.00245\n#> 5 0.864 0.136 \n#> 6 0.902 0.0983\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We 
need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(186)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: gbm\n#> Model ID: GBM_model_R_1763571327438_5073 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 25379 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 21 55 35.70000\n#> \n#> \n#> H2OBinomialMetrics: gbm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.007948832\n#> RMSE: 0.08915622\n#> LogLoss: 0.05942305\n#> Mean Per-Class Error: 0\n#> AUC: 1\n#> AUCPR: 1\n#> Gini: 1\n#> R^2: 0.9678452\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 434 0 0.000000 =0/434\n#> Class2 0 351 0.000000 =0/351\n#> Totals 434 351 0.000000 =0/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.598690 1.000000 200\n#> 2 max f2 0.598690 1.000000 200\n#> 3 max f0point5 0.598690 1.000000 200\n#> 4 max accuracy 0.598690 1.000000 200\n#> 5 max precision 0.998192 1.000000 0\n#> 6 max recall 0.598690 1.000000 200\n#> 7 max specificity 0.998192 1.000000 0\n#> 8 max absolute_mcc 0.598690 1.000000 200\n#> 9 max min_per_class_accuracy 0.598690 1.000000 200\n#> 10 max mean_per_class_accuracy 0.598690 1.000000 200\n#> 11 max tns 0.998192 434.000000 0\n#> 12 max fns 0.998192 349.000000 0\n#> 13 max fps 0.000831 434.000000 399\n#> 14 max tps 0.598690 351.000000 200\n#> 15 max tnr 0.998192 1.000000 0\n#> 16 max fnr 0.998192 0.994302 0\n#> 17 max fpr 0.000831 1.000000 399\n#> 18 max tpr 0.598690 1.000000 200\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.0496 0.950 \n#> 2 0.905 0.0953 \n#> 3 0.0738 0.926 \n#> 4 0.997 0.00273\n#> 5 0.979 0.0206 \n#> 6 0.878 0.122\n```\n:::\n\n\n## `h2o_gbm` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o_gbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(724)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: gbm\n#> Model ID: 
GBM_model_R_1763571327438_5125 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 25379 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 21 55 35.70000\n#> \n#> \n#> H2OBinomialMetrics: gbm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.007948832\n#> RMSE: 0.08915622\n#> LogLoss: 0.05942305\n#> Mean Per-Class Error: 0\n#> AUC: 1\n#> AUCPR: 1\n#> Gini: 1\n#> R^2: 0.9678452\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 434 0 0.000000 =0/434\n#> Class2 0 351 0.000000 =0/351\n#> Totals 434 351 0.000000 =0/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.598690 1.000000 200\n#> 2 max f2 0.598690 1.000000 200\n#> 3 max f0point5 0.598690 1.000000 200\n#> 4 max accuracy 0.598690 1.000000 200\n#> 5 max precision 0.998192 1.000000 0\n#> 6 max recall 0.598690 1.000000 200\n#> 7 max specificity 0.998192 1.000000 0\n#> 8 max absolute_mcc 0.598690 1.000000 200\n#> 9 max min_per_class_accuracy 0.598690 1.000000 200\n#> 10 max mean_per_class_accuracy 0.598690 1.000000 200\n#> 11 max tns 0.998192 434.000000 0\n#> 12 max fns 0.998192 349.000000 0\n#> 13 max fps 0.000831 434.000000 399\n#> 14 max tps 0.598690 351.000000 200\n#> 15 max tnr 0.998192 1.000000 0\n#> 16 max fnr 0.998192 0.994302 0\n#> 17 max fpr 0.000831 1.000000 399\n#> 18 max tpr 0.598690 1.000000 200\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.0496 0.950 \n#> 2 0.905 0.0953 \n#> 3 0.0738 0.926 \n#> 4 0.997 0.00273\n#> 5 0.979 0.0206 \n#> 6 0.878 0.122\n```\n:::\n\n\n## `lightgbm` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"lightgbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(906)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> LightGBM Model (100 trees)\n#> Objective: binary\n#> Fitted to dataset with 2 columns\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.147 0.853 \n#> 2 0.930 0.0699\n#> 3 0.237 0.763 \n#> 4 0.990 0.0101\n#> 5 0.929 0.0714\n#> 6 
0.956 0.0445\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(285)\nboost_tree_fit <- boost_tree_spec |> fit(Class ~ ., data = tbl_bin$training)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> GBTClassificationModel: uid = gradient_boosted_trees__254e29b6_2f3f_43c5_b7d4_b4473d59cf31, numTrees=20, numClasses=2, numFeatures=2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(boost_tree_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.307 0.693 \n#> 2 0.292 0.708 \n#> 3 0.856 0.144 \n#> 4 0.192 0.808 \n#> 5 0.332 0.668 \n#> 6 0.952 0.0476\n#> 7 0.0865 0.914\n```\n:::\n\n\n:::\n\n## C5 Rules (`C5_rules()`) \n\n:::{.panel-tabset}\n\n## `C5.0` \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and C5.0 is the default engine so there is no need to set that either.\nC5_rules_spec <- C5_rules()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(93)\nC5_rules_fit <- C5_rules_spec |> fit(class ~ ., data = bin_train)\nC5_rules_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = trials, rules = TRUE, control\n#> = C50::C5.0Control(minCases = minCases, seed = sample.int(10^5,\n#> 1), earlyStopping = FALSE))\n#> \n#> Rule-Based Model\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Number of Rules: 4 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(C5_rules_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(C5_rules_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 1 0\n#> 2 1 0\n#> 3 0 1\n#> 4 1 0\n#> 5 1 0\n#> 6 1 0\n```\n:::\n\n\n:::\n\n## Decision Tree (`decision_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> n= 785 \n#> \n#> node), split, n, loss, yval, (yprob)\n#> * denotes terminal node\n#> \n#> 1) root 785 351 Class1 (0.5528662 0.4471338) \n#> 2) B< -0.06526451 399 61 Class1 (0.8471178 0.1528822) *\n#> 3) B>=-0.06526451 386 96 Class2 (0.2487047 0.7512953) \n#> 6) B< 0.7339337 194 72 Class2 (0.3711340 0.6288660) \n#> 12) A>=0.6073948 49 13 Class1 (0.7346939 0.2653061) *\n#> 13) A< 0.6073948 145 36 Class2 (0.2482759 0.7517241) *\n#> 7) B>=0.7339337 192 24 Class2 (0.1250000 0.8750000) *\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.735 0.265\n#> 2 0.847 0.153\n#> 3 0.248 0.752\n#> 4 0.847 0.153\n#> 5 0.847 0.153\n#> 6 0.847 0.153\n```\n:::\n\n\n## `C5.0` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = 1, control = C50::C5.0Control(minCases =\n#> 2, sample = 0))\n#> \n#> Classification Tree\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Tree size: 4 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.732 0.268\n#> 2 0.846 0.154\n#> 3 0.236 0.764\n#> 4 0.846 0.154\n#> 5 0.846 0.154\n#> 6 0.846 0.154\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> class ~ A + B\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] B <= -0.06906\n#> | | [3] B <= -0.50486: Class1 (n = 291, err = 8.2%)\n#> | | [4] B > -0.50486\n#> | | | [5] A <= -0.07243: Class1 (n = 77, err = 45.5%)\n#> | | | [6] A > -0.07243: Class1 (n = 31, err = 6.5%)\n#> | [7] B > -0.06906\n#> | | [8] B <= 
0.72938\n#> | | | [9] A <= 0.60196: Class2 (n = 145, err = 24.8%)\n#> | | | [10] A > 0.60196\n#> | | | | [11] B <= 0.44701: Class1 (n = 23, err = 4.3%)\n#> | | | | [12] B > 0.44701: Class1 (n = 26, err = 46.2%)\n#> | | [13] B > 0.72938: Class2 (n = 192, err = 12.5%)\n#> \n#> Number of inner nodes: 6\n#> Number of terminal nodes: 7\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.538 0.462 \n#> 2 0.935 0.0645\n#> 3 0.248 0.752 \n#> 4 0.918 0.0825\n#> 5 0.918 0.0825\n#> 6 0.935 0.0645\n```\n:::\n\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n set_mode(\"classification\") |>\n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(Class ~ ., data = tbl_bin$training)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> n= 784 \n#> \n#> node), split, n, loss, yval, (yprob)\n#> * denotes terminal node\n#> \n#> 1) root 784 350 Class1 (0.5535714 0.4464286) \n#> 2) B< 1.495535 401 62 Class1 (0.8453865 0.1546135) *\n#> 3) B>=1.495535 383 95 Class2 (0.2480418 0.7519582) \n#> 6) B< 2.079458 192 71 Class2 (0.3697917 0.6302083) \n#> 12) A>=2.572663 50 14 Class1 (0.7200000 0.2800000) *\n#> 13) A< 2.572663 142 35 Class2 (0.2464789 0.7535211) *\n#> 7) B>=2.079458 191 24 Class2 (0.1256545 0.8743455) *\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = tbl_bin$test)\n#> # A tibble: 7 × 1\n#> .pred_class\n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n#> 6 \n#> 7 \npredict(decision_tree_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # A tibble: 7 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.246 0.754\n#> 2 0.246 0.754\n#> 3 0.845 0.155\n#> 4 0.246 0.754\n#> 5 0.246 0.754\n#> 6 0.845 0.155\n#> 7 0.126 0.874\n```\n:::\n\n\n:::\n\n## Flexible Discriminant Analysis (`discrim_flexible()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and earth is the default engine so there is no need to set that either.\ndiscrim_flexible_spec <- discrim_flexible()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_flexible_fit <- discrim_flexible_spec |> fit(class ~ ., data = bin_train)\ndiscrim_flexible_fit\n#> parsnip model object\n#> \n#> Call:\n#> mda::fda(formula = class ~ ., data = data, method = earth::earth)\n#> \n#> Dimension: 1 \n#> \n#> Percent Between-Group Variance Explained:\n#> v1 \n#> 100 \n#> \n#> Training Misclassification Error: 0.1707 ( N = 785 )\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_flexible_fit, type = 
\"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_flexible_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.339 0.661 \n#> 2 0.848 0.152 \n#> 3 0.342 0.658 \n#> 4 0.964 0.0360\n#> 5 0.964 0.0360\n#> 6 0.875 0.125\n```\n:::\n\n\n:::\n\n## Linear Discriminant Analysis (`discrim_linear()`) \n\n:::{.panel-tabset}\n\n## `MASS` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and MASS is the default engine so there is no need to set that either.\ndiscrim_linear_spec <- discrim_linear()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Call:\n#> lda(class ~ ., data = data)\n#> \n#> Prior probabilities of groups:\n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> Group means:\n#> A B\n#> Class1 -0.2982900 -0.5573140\n#> Class2 0.3688258 0.6891006\n#> \n#> Coefficients of linear discriminants:\n#> LD1\n#> A -0.6068479\n#> B 1.7079953\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.369 0.631 \n#> 2 0.868 0.132 \n#> 3 0.541 0.459 \n#> 4 0.984 0.0158\n#> 5 0.928 0.0718\n#> 6 0.854 0.146\n```\n:::\n\n\n## `mda` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_spec <- discrim_linear() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"mda\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Call:\n#> mda::fda(formula = class ~ ., data = data, method = mda::gen.ridge, \n#> keep.fitted = FALSE)\n#> \n#> Dimension: 1 \n#> \n#> Percent Between-Group Variance Explained:\n#> v1 \n#> 100 \n#> \n#> Degrees of Freedom (per dimension): 1.99423 \n#> \n#> Training Misclassification Error: 0.17707 ( N = 785 )\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.368 0.632 \n#> 2 0.867 0.133 \n#> 3 0.542 0.458 \n#> 4 0.984 
0.0158\n#> 5 0.928 0.0718\n#> 6 0.853 0.147\n```\n:::\n\n\n## `sda` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_spec <- discrim_linear() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"sda\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> $regularization\n#> lambda lambda.var lambda.freqs \n#> 0.003136201 0.067551534 0.112819609 \n#> \n#> $freqs\n#> Class1 Class2 \n#> 0.5469019 0.4530981 \n#> \n#> $alpha\n#> Class1 Class2 \n#> -0.8934125 -1.2349286 \n#> \n#> $beta\n#> A B\n#> Class1 0.4565325 -1.298858\n#> Class2 -0.5510473 1.567757\n#> attr(,\"class\")\n#> [1] \"shrinkage\"\n#> \n#> attr(,\"class\")\n#> [1] \"sda\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.366 0.634 \n#> 2 0.860 0.140 \n#> 3 0.536 0.464 \n#> 4 0.982 0.0176\n#> 5 0.923 0.0768\n#> 6 0.845 0.155\n```\n:::\n\n\n## `sparsediscrim` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_spec <- discrim_linear() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"sparsediscrim\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Diagonal LDA\n#> \n#> Sample Size: 785 \n#> Number of Features: 2 \n#> \n#> Classes and Prior Probabilities:\n#> Class1 (55.29%), Class2 (44.71%)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.182 0.818 \n#> 2 0.755 0.245 \n#> 3 0.552 0.448 \n#> 4 0.996 0.00372\n#> 5 0.973 0.0274 \n#> 6 0.629 0.371\n```\n:::\n\n\n:::\n\n## Quadratic Discriminant Analysis (`discrim_quad()`) \n\n:::{.panel-tabset}\n\n## `MASS` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_spec <- discrim_quad()\n # This engine works with a single mode so no need to set that\n # and MASS is 
the default engine so there is no need to set that either.\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_fit <- discrim_quad_spec |> fit(class ~ ., data = bin_train)\ndiscrim_quad_fit\n#> parsnip model object\n#> \n#> Call:\n#> qda(class ~ ., data = data)\n#> \n#> Prior probabilities of groups:\n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> Group means:\n#> A B\n#> Class1 -0.2982900 -0.5573140\n#> Class2 0.3688258 0.6891006\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_quad_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_quad_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.340 0.660 \n#> 2 0.884 0.116 \n#> 3 0.500 0.500 \n#> 4 0.965 0.0349\n#> 5 0.895 0.105 \n#> 6 0.895 0.105\n```\n:::\n\n\n## `sparsediscrim` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_spec <- discrim_quad() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"sparsediscrim\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_fit <- discrim_quad_spec |> fit(class ~ ., data = bin_train)\ndiscrim_quad_fit\n#> parsnip model object\n#> \n#> Diagonal QDA\n#> \n#> Sample Size: 785 \n#> Number of Features: 2 \n#> \n#> Classes and Prior Probabilities:\n#> Class1 (55.29%), Class2 (44.71%)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_quad_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_quad_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.180 0.820 \n#> 2 0.750 0.250 \n#> 3 0.556 0.444 \n#> 4 0.994 0.00634\n#> 5 0.967 0.0328 \n#> 6 0.630 0.370\n```\n:::\n\n\n:::\n\n## Regularized Discriminant Analysis (`discrim_regularized()`) \n\n:::{.panel-tabset}\n\n## `klaR` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and klaR is the default engine so there is no need to set that either.\ndiscrim_regularized_spec <- discrim_regularized()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_regularized_fit <- discrim_regularized_spec |> fit(class ~ ., data = bin_train)\ndiscrim_regularized_fit\n#> parsnip model object\n#> \n#> Call: \n#> rda(formula = class ~ ., data = data)\n#> \n#> Regularization parameters: \n#> gamma lambda \n#> 0.0005969518 0.0131575746 \n#> \n#> Prior probabilities of groups: \n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> Misclassification rate: \n#> apparent: 
17.707 %\n#> cross-validated: 17.682 %\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_regularized_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_regularized_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.340 0.660 \n#> 2 0.883 0.117 \n#> 3 0.501 0.499 \n#> 4 0.965 0.0346\n#> 5 0.895 0.105 \n#> 6 0.894 0.106\n```\n:::\n\n\n:::\n\n## Generalized Additive Models (`gen_additive_mod()`) \n\n:::{.panel-tabset}\n\n## `mgcv` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_spec <- gen_additive_mod() |>\n # We need to set the mode since this engine works with multiple modes\n # and mgcv is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_fit <- \n gen_additive_mod_spec |> \n fit(class ~ s(A) + s(B), data = bin_train)\ngen_additive_mod_fit\n#> parsnip model object\n#> \n#> \n#> Family: binomial \n#> Link function: logit \n#> \n#> Formula:\n#> class ~ s(A) + s(B)\n#> \n#> Estimated degrees of freedom:\n#> 2.76 4.22 total = 7.98 \n#> \n#> UBRE score: -0.153537\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(gen_additive_mod_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(gen_additive_mod_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.400 0.600 \n#> 2 0.826 0.174 \n#> 3 0.454 0.546 \n#> 4 0.975 0.0250\n#> 5 0.929 0.0711\n#> 6 0.829 0.171\npredict(gen_additive_mod_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0.304 0.504 0.496 0.696\n#> 2 0.739 0.889 0.111 0.261\n#> 3 0.364 0.546 0.454 0.636\n#> 4 0.846 0.996 0.00358 0.154\n#> 5 0.881 0.958 0.0416 0.119\n#> 6 0.735 0.894 0.106 0.265\n```\n:::\n\n\n:::\n\n## Logistic Regression (`logistic_reg()`) \n\n:::{.panel-tabset}\n\n## `glm` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg()\n # This engine works with a single mode so no need to set that\n # and glm is the default engine so there is no need to set that either.\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = class ~ ., family = stats::binomial, data = data)\n#> \n#> Coefficients:\n#> (Intercept) A B \n#> -0.3563 -1.1250 2.8154 \n#> \n#> Degrees of Freedom: 784 Total (i.e. 
Null); 782 Residual\n#> Null Deviance:\t 1079 \n#> Residual Deviance: 666.9 \tAIC: 672.9\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.400 0.600 \n#> 2 0.862 0.138 \n#> 3 0.541 0.459 \n#> 4 0.977 0.0234\n#> 5 0.909 0.0905\n#> 6 0.853 0.147\npredict(logistic_reg_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0.339 0.465 0.535 0.661 \n#> 2 0.816 0.897 0.103 0.184 \n#> 3 0.493 0.588 0.412 0.507 \n#> 4 0.960 0.986 0.0137 0.0395\n#> 5 0.875 0.935 0.0647 0.125 \n#> 6 0.800 0.894 0.106 0.200\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(466)\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Logistic regression\n#> \n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> batch size: 707 \n#> validation loss after 1 epoch: 0.283\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.412 0.588 \n#> 2 0.854 0.146 \n#> 3 0.537 0.463 \n#> 4 0.971 0.0294\n#> 5 0.896 0.104 \n#> 6 0.848 0.152\n```\n:::\n\n\n## `gee` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- \n logistic_reg_spec |> \n fit(outcome ~ treatment * visit + id_var(patientID), data = cls_group_train)\n#> Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27\n#> running glm to get initial regression estimate\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> \n#> GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA\n#> gee S-function, version 4.13 modified 98/01/27 (1998) \n#> \n#> Model:\n#> Link: Logit \n#> Variance to Mean Relation: Binomial \n#> Correlation Structure: Independent \n#> \n#> Call:\n#> gee::gee(formula = outcome ~ treatment + visit, id = data$patientID, \n#> data = data, family = binomial)\n#> \n#> Number of observations : 1433 \n#> \n#> 
Maximum cluster size : 7 \n#> \n#> \n#> Coefficients:\n#> (Intercept) treatmentterbinafine visit \n#> -0.06853546 -0.25700680 -0.35646522 \n#> \n#> Estimated Scale Parameter: 0.9903994\n#> Number of Iterations: 1\n#> \n#> Working Correlation[1:4,1:4]\n#> [,1] [,2] [,3] [,4]\n#> [1,] 1 0 0 0\n#> [2,] 0 1 0 0\n#> [3,] 0 0 1 0\n#> [4,] 0 0 0 1\n#> \n#> \n#> Returned Error Value:\n#> [1] 0\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = cls_group_test)\n#> # A tibble: 475 × 1\n#> .pred_class \n#> \n#> 1 none or mild\n#> 2 none or mild\n#> 3 none or mild\n#> 4 none or mild\n#> 5 none or mild\n#> 6 none or mild\n#> 7 none or mild\n#> 8 none or mild\n#> 9 none or mild\n#> 10 none or mild\n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"prob\", new_data = cls_group_test)\n#> # A tibble: 475 × 2\n#> `.pred_none or mild` `.pred_moderate or severe`\n#> \n#> 1 0.664 0.336 \n#> 2 0.739 0.261 \n#> 3 0.801 0.199 \n#> 4 0.852 0.148 \n#> 5 0.892 0.108 \n#> 6 0.922 0.0784\n#> 7 0.944 0.0562\n#> 8 0.605 0.395 \n#> 9 0.686 0.314 \n#> 10 0.757 0.243 \n#> # ℹ 465 more rows\n```\n:::\n\n\n## `glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- \n logistic_reg_spec |> \n fit(outcome ~ treatment * visit + (1 | patientID), data = cls_group_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Generalized linear mixed model fit by maximum likelihood (Laplace\n#> Approximation) [glmerMod]\n#> Family: binomial ( logit )\n#> Formula: outcome ~ treatment * visit + (1 | patientID)\n#> Data: data\n#> AIC BIC logLik -2*log(L) df.resid \n#> 863.8271 890.1647 -426.9135 853.8271 1428 \n#> Random effects:\n#> Groups Name Std.Dev.\n#> patientID (Intercept) 8.35 \n#> Number of obs: 1433, groups: patientID, 219\n#> Fixed Effects:\n#> (Intercept) treatmentterbinafine \n#> -4.57420 -0.51193 \n#> visit treatmentterbinafine:visit \n#> -0.98725 -0.00112\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = cls_group_test)\n#> # A tibble: 475 × 1\n#> .pred_class \n#> \n#> 1 none or mild\n#> 2 none or mild\n#> 3 none or mild\n#> 4 none or mild\n#> 5 none or mild\n#> 6 none or mild\n#> 7 none or mild\n#> 8 none or mild\n#> 9 none or mild\n#> 10 none or mild\n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"prob\", new_data = cls_group_test)\n#> # A tibble: 475 × 2\n#> `.pred_none or mild` `.pred_moderate or severe`\n#> \n#> 1 0.998 0.00230 \n#> 2 0.999 0.000856 \n#> 3 1.000 0.000319 \n#> 4 1.000 0.000119 \n#> 5 1.000 0.0000441 \n#> 6 1.000 0.0000164 \n#> 7 1.000 0.00000612\n#> 8 0.996 0.00383 \n#> 9 0.999 0.00143 \n#> 10 0.999 0.000533 \n#> # ℹ 465 more rows\n```\n:::\n\n\n## `glmnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg(penalty = 0.01) |> \n # This engine works with a single mode so 
no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"binomial\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.308300\n#> 2 1 4.75 0.280900\n#> 3 1 8.73 0.256000\n#> 4 1 12.10 0.233200\n#> 5 1 14.99 0.212500\n#> 6 1 17.46 0.193600\n#> 7 1 19.60 0.176400\n#> 8 1 21.45 0.160800\n#> 9 1 23.05 0.146500\n#> 10 1 24.44 0.133500\n#> 11 1 25.65 0.121600\n#> 12 1 26.70 0.110800\n#> 13 1 27.61 0.101000\n#> 14 1 28.40 0.091990\n#> 15 1 29.08 0.083820\n#> 16 1 29.68 0.076370\n#> 17 1 30.19 0.069590\n#> 18 1 30.63 0.063410\n#> 19 1 31.00 0.057770\n#> 20 1 31.33 0.052640\n#> 21 1 31.61 0.047960\n#> 22 1 31.85 0.043700\n#> 23 1 32.05 0.039820\n#> 24 2 32.62 0.036280\n#> 25 2 33.41 0.033060\n#> 26 2 34.10 0.030120\n#> 27 2 34.68 0.027450\n#> 28 2 35.19 0.025010\n#> 29 2 35.63 0.022790\n#> 30 2 36.01 0.020760\n#> 31 2 36.33 0.018920\n#> 32 2 36.62 0.017240\n#> 33 2 36.86 0.015710\n#> 34 2 37.06 0.014310\n#> 35 2 37.24 0.013040\n#> 36 2 37.39 0.011880\n#> 37 2 37.52 0.010830\n#> 38 2 37.63 0.009864\n#> 39 2 37.72 0.008988\n#> 40 2 37.80 0.008189\n#> 41 2 37.86 0.007462\n#> 42 2 37.92 0.006799\n#> 43 2 37.97 0.006195\n#> 44 2 38.01 0.005644\n#> 45 2 38.04 0.005143\n#> 46 2 38.07 0.004686\n#> 47 2 38.10 0.004270\n#> 48 2 38.12 0.003891\n#> 49 2 38.13 0.003545\n#> 50 2 38.15 0.003230\n#> 51 2 38.16 0.002943\n#> 52 2 38.17 0.002682\n#> 53 2 38.18 0.002443\n#> 54 2 38.18 0.002226\n#> 55 2 38.19 0.002029\n#> 56 2 38.19 0.001848\n#> 57 2 38.20 0.001684\n#> 58 2 38.20 0.001534\n#> 59 2 38.20 0.001398\n#> 60 2 38.21 0.001274\n#> 61 2 38.21 0.001161\n#> 62 2 38.21 0.001058\n#> 63 2 38.21 0.000964\n#> 64 2 38.21 0.000878\n#> 65 2 38.21 0.000800\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.383 0.617 \n#> 2 0.816 0.184 \n#> 3 0.537 0.463 \n#> 4 0.969 0.0313\n#> 5 0.894 0.106 \n#> 6 0.797 0.203\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: glm\n#> Model ID: GLM_model_R_1763571327438_5177 \n#> GLM Model: summary\n#> family link regularization\n#> 1 binomial logit Elastic Net (alpha = 0.5, lambda = 6.162E-4 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 4\n#> training_frame\n#> 1 object_zkelygexok\n#> \n#> Coefficients: glm coefficients\n#> 
names coefficients standardized_coefficients\n#> 1 Intercept -0.350788 -0.350788\n#> 2 A -1.084233 -1.084233\n#> 3 B 2.759366 2.759366\n#> \n#> H2OBinomialMetrics: glm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.130451\n#> RMSE: 0.3611799\n#> LogLoss: 0.4248206\n#> Mean Per-Class Error: 0.1722728\n#> AUC: 0.8889644\n#> AUCPR: 0.8520865\n#> Gini: 0.7779288\n#> R^2: 0.4722968\n#> Residual Deviance: 666.9684\n#> AIC: 672.9684\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 350 84 0.193548 =84/434\n#> Class2 53 298 0.150997 =53/351\n#> Totals 403 382 0.174522 =137/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.411045 0.813097 213\n#> 2 max f2 0.229916 0.868991 279\n#> 3 max f0point5 0.565922 0.816135 166\n#> 4 max accuracy 0.503565 0.826752 185\n#> 5 max precision 0.997356 1.000000 0\n#> 6 max recall 0.009705 1.000000 395\n#> 7 max specificity 0.997356 1.000000 0\n#> 8 max absolute_mcc 0.411045 0.652014 213\n#> 9 max min_per_class_accuracy 0.454298 0.822581 201\n#> 10 max mean_per_class_accuracy 0.411045 0.827727 213\n#> 11 max tns 0.997356 434.000000 0\n#> 12 max fns 0.997356 349.000000 0\n#> 13 max fps 0.001723 434.000000 399\n#> 14 max tps 0.009705 351.000000 395\n#> 15 max tnr 0.997356 1.000000 0\n#> 16 max fnr 0.997356 0.994302 0\n#> 17 max fpr 0.001723 1.000000 399\n#> 18 max tpr 0.009705 1.000000 395\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.399 0.601 \n#> 2 0.857 0.143 \n#> 3 0.540 0.460 \n#> 4 0.976 0.0243\n#> 5 0.908 0.0925\n#> 6 0.848 0.152\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(730)\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\n```\n:::\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Model: \"sequential\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense (Dense) (None, 1) 3 \n#> dense_1 (Dense) (None, 2) 4 \n#> ================================================================================\n#> Total params: 7 (28.00 Byte)\n#> Trainable params: 7 (28.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> ________________________________________________________________________________\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> 1/1 - 0s - 
92ms/epoch - 92ms/step\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class2\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> 1/1 - 0s - 6ms/epoch - 6ms/step\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.214 0.786 \n#> 2 0.633 0.367 \n#> 3 0.584 0.416 \n#> 4 0.990 0.00975\n#> 5 0.955 0.0449 \n#> 6 0.477 0.523\n```\n:::\n\n\n## `LiblineaR` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"LiblineaR\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized logistic regression primal (L2R_LR)\"\n#> \n#> $Type\n#> [1] 0\n#> \n#> $W\n#> A B Bias\n#> [1,] 1.014233 -2.65166 0.3363362\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $ClassNames\n#> [1] Class1 Class2\n#> Levels: Class1 Class2\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.397 0.603 \n#> 2 0.847 0.153 \n#> 3 0.539 0.461 \n#> 4 0.973 0.0267\n#> 5 0.903 0.0974\n#> 6 0.837 0.163\n```\n:::\n\n\n## `stan` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(96)\nlogistic_reg_fit <- \n logistic_reg_spec |> \n fit(outcome ~ treatment * visit, data = cls_group_train)\nlogistic_reg_fit |> print(digits = 3)\n#> parsnip model object\n#> \n#> stan_glm\n#> family: binomial [logit]\n#> formula: outcome ~ treatment * visit\n#> observations: 1433\n#> predictors: 4\n#> ------\n#> Median MAD_SD\n#> (Intercept) -0.137 0.187\n#> treatmentterbinafine -0.108 0.264\n#> visit -0.335 0.050\n#> treatmentterbinafine:visit -0.048 0.073\n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = cls_group_test)\n#> # A tibble: 475 × 1\n#> .pred_class \n#> \n#> 1 none or mild\n#> 2 none or mild\n#> 3 none or mild\n#> 4 none or mild\n#> 5 none or mild\n#> 6 none or mild\n#> 7 none or mild\n#> 8 none or mild\n#> 9 none or mild\n#> 10 none or mild\n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"prob\", new_data = cls_group_test)\n#> # A tibble: 475 × 2\n#> `.pred_none or mild` `.pred_moderate or severe`\n#> \n#> 1 0.652 0.348 \n#> 2 0.734 0.266 \n#> 3 0.802 0.198 \n#> 4 0.856 0.144 \n#> 5 0.898 0.102 \n#> 6 
0.928 0.0721\n#> 7 0.950 0.0502\n#> 8 0.617 0.383 \n#> 9 0.692 0.308 \n#> 10 0.759 0.241 \n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"conf_int\", new_data = cls_group_test)\n#> # A tibble: 475 × 4\n#> `.pred_lower_none or mild` `.pred_upper_none or mild` .pred_lower_moderate …¹\n#> \n#> 1 0.583 0.715 0.285 \n#> 2 0.689 0.776 0.224 \n#> 3 0.771 0.832 0.168 \n#> 4 0.827 0.883 0.117 \n#> 5 0.868 0.924 0.0761\n#> 6 0.899 0.952 0.0482\n#> 7 0.922 0.970 0.0302\n#> 8 0.547 0.683 0.317 \n#> 9 0.644 0.736 0.264 \n#> 10 0.723 0.791 0.209 \n#> # ℹ 465 more rows\n#> # ℹ abbreviated name: ¹​`.pred_lower_moderate or severe`\n#> # ℹ 1 more variable: `.pred_upper_moderate or severe` \npredict(logistic_reg_fit, type = \"pred_int\", new_data = cls_group_test)\n#> # A tibble: 475 × 4\n#> `.pred_lower_none or mild` `.pred_upper_none or mild` .pred_lower_moderate …¹\n#> \n#> 1 0 1 0\n#> 2 0 1 0\n#> 3 0 1 0\n#> 4 0 1 0\n#> 5 0 1 0\n#> 6 0 1 0\n#> 7 0 1 0\n#> 8 0 1 0\n#> 9 0 1 0\n#> 10 0 1 0\n#> # ℹ 465 more rows\n#> # ℹ abbreviated name: ¹​`.pred_lower_moderate or severe`\n#> # ℹ 1 more variable: `.pred_upper_moderate or severe` \n```\n:::\n\n\n## `stan_glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(484)\nlogistic_reg_fit <- \n logistic_reg_spec |> \n fit(outcome ~ treatment * visit + (1 | patientID), data = cls_group_train)\nlogistic_reg_fit |> print(digits = 3)\n#> parsnip model object\n#> \n#> stan_glmer\n#> family: binomial [logit]\n#> formula: outcome ~ treatment * visit + (1 | patientID)\n#> observations: 1433\n#> ------\n#> Median MAD_SD\n#> (Intercept) -0.628 0.585\n#> treatmentterbinafine -0.686 0.821\n#> visit -0.830 0.105\n#> treatmentterbinafine:visit -0.023 0.143\n#> \n#> Error terms:\n#> Groups Name Std.Dev.\n#> patientID (Intercept) 4.376 \n#> Num. 
levels: patientID 219 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = cls_group_test)\n#> # A tibble: 475 × 1\n#> .pred_class \n#> \n#> 1 none or mild\n#> 2 none or mild\n#> 3 none or mild\n#> 4 none or mild\n#> 5 none or mild\n#> 6 none or mild\n#> 7 none or mild\n#> 8 none or mild\n#> 9 none or mild\n#> 10 none or mild\n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"prob\", new_data = cls_group_test)\n#> # A tibble: 475 × 2\n#> `.pred_none or mild` `.pred_moderate or severe`\n#> \n#> 1 0.671 0.329 \n#> 2 0.730 0.270 \n#> 3 0.796 0.204 \n#> 4 0.847 0.153 \n#> 5 0.882 0.118 \n#> 6 0.909 0.0908\n#> 7 0.934 0.0655\n#> 8 0.613 0.387 \n#> 9 0.681 0.319 \n#> 10 0.744 0.256 \n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"conf_int\", new_data = cls_group_test)\n#> # A tibble: 475 × 4\n#> `.pred_lower_none or mild` `.pred_upper_none or mild` .pred_lower_moderate …¹\n#> \n#> 1 0.00184 1.000 0.0000217 \n#> 2 0.00417 1.000 0.00000942 \n#> 3 0.00971 1.000 0.00000412 \n#> 4 0.0214 1.000 0.00000169 \n#> 5 0.0465 1.000 0.000000706\n#> 6 0.101 1.000 0.000000300\n#> 7 0.203 1.000 0.000000120\n#> 8 0.000923 1.000 0.0000440 \n#> 9 0.00196 1.000 0.0000175 \n#> 10 0.00447 1.000 0.00000724 \n#> # ℹ 465 more rows\n#> # ℹ abbreviated name: ¹​`.pred_lower_moderate or severe`\n#> # ℹ 1 more variable: `.pred_upper_moderate or severe` \npredict(logistic_reg_fit, type = \"pred_int\", new_data = cls_group_test)\n#> # A tibble: 475 × 4\n#> `.pred_lower_none or mild` `.pred_upper_none or mild` .pred_lower_moderate …¹\n#> \n#> 1 0 1 0\n#> 2 0 1 0\n#> 3 0 1 0\n#> 4 0 1 0\n#> 5 0 1 0\n#> 6 0 1 0\n#> 7 0 1 0\n#> 8 0 1 0\n#> 9 0 1 0\n#> 10 0 1 0\n#> # ℹ 465 more rows\n#> # ℹ abbreviated name: ¹​`.pred_lower_moderate or severe`\n#> # ℹ 1 more variable: `.pred_upper_moderate or severe` \n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(Class ~ ., data = tbl_bin$training)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> Coefficients:\n#> (Intercept) A B \n#> -3.731170 -1.214355 3.794186\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(logistic_reg_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? 
x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.130 0.870\n#> 2 0.262 0.738\n#> 3 0.787 0.213\n#> 4 0.279 0.721\n#> 5 0.498 0.502\n#> 6 0.900 0.100\n#> 7 0.161 0.839\n```\n:::\n\n\n:::\n\n## Multivariate Adaptive Regression Splines (`mars()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_spec <- mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_fit <- mars_spec |> fit(class ~ ., data = bin_train)\nmars_fit\n#> parsnip model object\n#> \n#> GLM (family binomial, link logit):\n#> nulldev df dev df devratio AIC iters converged\n#> 1079.45 784 638.975 779 0.408 651 5 1\n#> \n#> Earth selected 6 of 13 terms, and 2 of 2 predictors\n#> Termination condition: Reached nk 21\n#> Importance: B, A\n#> Number of terms at each degree of interaction: 1 5 (additive model)\n#> Earth GCV 0.1342746 RSS 102.4723 GRSq 0.4582121 RSq 0.4719451\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mars_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mars_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.410 0.590 \n#> 2 0.794 0.206 \n#> 3 0.356 0.644 \n#> 4 0.927 0.0729\n#> 5 0.927 0.0729\n#> 6 0.836 0.164\n```\n:::\n\n\n:::\n\n## Neural Networks (`mlp()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(839)\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> a 2-5-1 network with 21 weights\n#> inputs: A B \n#> output(s): class \n#> options were - entropy fitting\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.390 0.610\n#> 2 0.685 0.315\n#> 3 0.433 0.567\n#> 4 0.722 0.278\n#> 5 0.720 0.280\n#> 6 0.684 0.316\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(38)\nmlp_fit <- 
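\n  # NOTE: brulee fits this network via torch; setting the R seed above is\n  # what makes this particular fit reproducible from run to run.\n  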
mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> relu activation,\n#> 3 hidden units,\n#> 17 model parameters\n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 707 \n#> learn rate: 0.01 \n#> validation loss after 5 epochs: 0.427\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.387 0.613 \n#> 2 0.854 0.146 \n#> 3 0.540 0.460 \n#> 4 0.941 0.0589\n#> 5 0.882 0.118 \n#> 6 0.842 0.158\n```\n:::\n\n\n## `brulee_two_layer` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"brulee_two_layer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(336)\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> c(relu,relu) activation,\n#> c(3,3) hidden units,\n#> 29 model parameters\n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 707 \n#> learn rate: 0.01 \n#> validation loss after 17 epochs: 0.405\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.392 0.608 \n#> 2 0.835 0.165 \n#> 3 0.440 0.560 \n#> 4 0.938 0.0620\n#> 5 0.938 0.0620\n#> 6 0.848 0.152\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(306)\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: deeplearning\n#> Model ID: DeepLearning_model_R_1763571327438_5179 \n#> Status of Neuron Layers: predicting .outcome, 2-class classification, bernoulli distribution, CrossEntropy loss, 1,002 weights/biases, 16.9 KB, 7,850 training samples, mini-batch size 1\n#> layer units type dropout l1 l2 mean_rate rate_rms momentum\n#> 1 1 2 Input 0.00 % NA NA NA NA NA\n#> 2 2 200 Rectifier 0.00 % 
0.000000 0.000000 0.006954 0.012998 0.000000\n#> 3 3 2 Softmax NA 0.000000 0.000000 0.003180 0.000140 0.000000\n#> mean_weight weight_rms mean_bias bias_rms\n#> 1 NA NA NA NA\n#> 2 0.001014 0.103167 0.490217 0.023645\n#> 3 -0.003600 0.402544 0.019355 0.013006\n#> \n#> \n#> H2OBinomialMetrics: deeplearning\n#> ** Reported on training data. **\n#> ** Metrics reported on full training frame **\n#> \n#> MSE: 0.1724685\n#> RMSE: 0.4152933\n#> LogLoss: 0.5401076\n#> Mean Per-Class Error: 0.1731524\n#> AUC: 0.8892926\n#> AUCPR: 0.8518107\n#> Gini: 0.7785852\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 348 86 0.198157 =86/434\n#> Class2 52 299 0.148148 =52/351\n#> Totals 400 385 0.175796 =138/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.719329 0.812500 153\n#> 2 max f2 0.540433 0.869565 213\n#> 3 max f0point5 0.836246 0.815873 105\n#> 4 max accuracy 0.793925 0.825478 126\n#> 5 max precision 0.998841 1.000000 0\n#> 6 max recall 0.026905 1.000000 393\n#> 7 max specificity 0.998841 1.000000 0\n#> 8 max absolute_mcc 0.719329 0.650150 153\n#> 9 max min_per_class_accuracy 0.761683 0.820513 139\n#> 10 max mean_per_class_accuracy 0.719329 0.826848 153\n#> 11 max tns 0.998841 434.000000 0\n#> 12 max fns 0.998841 349.000000 0\n#> 13 max fps 0.004356 434.000000 399\n#> 14 max tps 0.026905 351.000000 393\n#> 15 max tnr 0.998841 1.000000 0\n#> 16 max fnr 0.998841 0.994302 0\n#> 17 max fpr 0.004356 1.000000 399\n#> 18 max tpr 0.026905 1.000000 393\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.149 0.851 \n#> 2 0.639 0.361 \n#> 3 0.237 0.763 \n#> 4 0.924 0.0763\n#> 5 0.739 0.261 \n#> 6 0.623 0.377\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(216)\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\n```\n:::\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit\n#> parsnip model object\n#> \n#> Model: \"sequential_1\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense_2 (Dense) (None, 5) 15 \n#> dense_3 (Dense) (None, 2) 12 \n#> ================================================================================\n#> Total params: 27 (108.00 Byte)\n#> Trainable params: 27 (108.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> ________________________________________________________________________________\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> 1/1 - 0s - 42ms/epoch - 42ms/step\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class2\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> 1/1 - 0s - 6ms/epoch - 6ms/step\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.313 0.687\n#> 2 0.578 0.422\n#> 3 0.503 0.497\n#> 4 0.894 0.106\n#> 5 0.869 0.131\n#> 6 0.470 0.530\n```\n:::\n\n\n:::\n\n## Multinom Regression (`multinom_reg()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and nnet is the default engine so there is no need to set that either.\nmultinom_reg_spec <- multinom_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(634)\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> nnet::multinom(formula = class ~ ., data = data, trace = FALSE)\n#> \n#> Coefficients:\n#> (Intercept) A B\n#> two -0.5868435 1.881920 1.379106\n#> three 0.2910810 1.129622 1.292802\n#> \n#> Residual Deviance: 315.8164 \n#> AIC: 327.8164\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.145 0.213 0.641 \n#> 2 0.308 0.178 0.514 \n#> 3 0.350 0.189 0.461 \n#> 4 0.983 0.00123 0.0155\n#> 5 0.956 0.00275 0.0415\n#> 6 0.00318 0.754 0.243 \n#> 7 0.0591 0.414 0.527 \n#> 8 0.522 0.0465 0.431\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(837)\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Multinomial regression\n#> \n#> 192 samples, 2 features, 3 classes \n#> class weights one=1, two=1, three=1 \n#> weight decay: 0.001 \n#> batch size: 173 \n#> validation loss after 1 epoch: 0.953\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 three\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.131 0.190 0.679 \n#> 2 0.303 0.174 0.523 \n#> 3 0.358 0.192 0.449 \n#> 4 0.983 0.00125 0.0154\n#> 5 0.948 0.00275 0.0491\n#> 6 0.00344 0.796 0.200 \n#> 7 0.0611 0.420 0.518 \n#> 8 0.443 0.0390 
0.518\n```\n:::\n\n\n## `glmnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"multinomial\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.219200\n#> 2 1 1.61 0.199700\n#> 3 2 3.90 0.181900\n#> 4 2 6.07 0.165800\n#> 5 2 7.93 0.151100\n#> 6 2 9.52 0.137600\n#> 7 2 10.90 0.125400\n#> 8 2 12.09 0.114300\n#> 9 2 13.13 0.104100\n#> 10 2 14.22 0.094870\n#> 11 2 15.28 0.086440\n#> 12 2 16.20 0.078760\n#> 13 2 16.99 0.071760\n#> 14 2 17.68 0.065390\n#> 15 2 18.28 0.059580\n#> 16 2 18.80 0.054290\n#> 17 2 19.24 0.049460\n#> 18 2 19.63 0.045070\n#> 19 2 19.96 0.041070\n#> 20 2 20.25 0.037420\n#> 21 2 20.49 0.034090\n#> 22 2 20.70 0.031070\n#> 23 2 20.88 0.028310\n#> 24 2 21.04 0.025790\n#> 25 2 21.17 0.023500\n#> 26 2 21.28 0.021410\n#> 27 2 21.38 0.019510\n#> 28 2 21.46 0.017780\n#> 29 2 21.53 0.016200\n#> 30 2 21.58 0.014760\n#> 31 2 21.63 0.013450\n#> 32 2 21.67 0.012250\n#> 33 2 21.71 0.011160\n#> 34 2 21.74 0.010170\n#> 35 2 21.77 0.009269\n#> 36 2 21.79 0.008445\n#> 37 2 21.82 0.007695\n#> 38 2 21.83 0.007011\n#> 39 2 21.85 0.006389\n#> 40 2 21.86 0.005821\n#> 41 2 21.87 0.005304\n#> 42 2 21.88 0.004833\n#> 43 2 21.89 0.004403\n#> 44 2 21.89 0.004012\n#> 45 2 21.90 0.003656\n#> 46 2 21.90 0.003331\n#> 47 2 21.91 0.003035\n#> 48 2 21.91 0.002765\n#> 49 2 21.91 0.002520\n#> 50 2 21.91 0.002296\n#> 51 2 21.92 0.002092\n#> 52 2 21.92 0.001906\n#> 53 2 21.92 0.001737\n#> 54 2 21.92 0.001582\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.163 0.211 0.626 \n#> 2 0.318 0.185 0.496 \n#> 3 0.358 0.198 0.444 \n#> 4 0.976 0.00268 0.0217\n#> 5 0.940 0.00529 0.0544\n#> 6 0.00617 0.699 0.295 \n#> 7 0.0757 0.390 0.534 \n#> 8 0.506 0.0563 0.438\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OMultinomialModel: glm\n#> Model ID: GLM_model_R_1763571327438_5182 \n#> GLM Model: summary\n#> family link regularization\n#> 1 multinomial multinomial Elastic Net (alpha = 0.5, lambda = 4.372E-4 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 
9 6 4\n#> training_frame\n#> 1 object_jbhwnlsrno\n#> \n#> Coefficients: glm multinomial coefficients\n#> names coefs_class_0 coefs_class_1 coefs_class_2 std_coefs_class_0\n#> 1 Intercept -1.119482 -0.831434 -1.706488 -1.083442\n#> 2 A -1.119327 0.002894 0.750746 -1.029113\n#> 3 B -1.208210 0.078752 0.162842 -1.187423\n#> std_coefs_class_1 std_coefs_class_2\n#> 1 -0.819868 -1.830487\n#> 2 0.002661 0.690238\n#> 3 0.077397 0.160041\n#> \n#> H2OMultinomialMetrics: glm\n#> ** Reported on training data. **\n#> \n#> Training Set Metrics: \n#> =====================\n#> \n#> Extract training frame with `h2o.getFrame(\"object_jbhwnlsrno\")`\n#> MSE: (Extract with `h2o.mse`) 0.2982118\n#> RMSE: (Extract with `h2o.rmse`) 0.5460878\n#> Logloss: (Extract with `h2o.logloss`) 0.822443\n#> Mean Per-Class Error: 0.4583896\n#> AUC: (Extract with `h2o.auc`) NaN\n#> AUCPR: (Extract with `h2o.aucpr`) NaN\n#> Null Deviance: (Extract with `h2o.nulldeviance`) 404.5036\n#> Residual Deviance: (Extract with `h2o.residual_deviance`) 315.8181\n#> R^2: (Extract with `h2o.r2`) 0.4682043\n#> AIC: (Extract with `h2o.aic`) NaN\n#> Confusion Matrix: Extract with `h2o.confusionMatrix(,train = TRUE)`)\n#> =========================================================================\n#> Confusion Matrix: Row labels: Actual class; Column labels: Predicted class\n#> one three two Error Rate\n#> one 59 18 1 0.2436 = 19 / 78\n#> three 19 52 5 0.3158 = 24 / 76\n#> two 7 24 7 0.8158 = 31 / 38\n#> Totals 85 94 13 0.3854 = 74 / 192\n#> \n#> Hit Ratio Table: Extract with `h2o.hit_ratio_table(,train = TRUE)`\n#> =======================================================================\n#> Top-3 Hit Ratios: \n#> k hit_ratio\n#> 1 1 0.614583\n#> 2 2 0.890625\n#> 3 3 1.000000\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_three .pred_two\n#> \n#> 1 0.146 0.641 0.213 \n#> 2 0.308 0.513 0.179 \n#> 3 0.350 0.460 0.190 \n#> 4 0.983 0.0158 0.00128\n#> 5 0.955 0.0422 0.00284\n#> 6 0.00329 0.244 0.752 \n#> 7 0.0599 0.527 0.413 \n#> 8 0.521 0.432 0.0469\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\n```\n:::\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Model: \"sequential_2\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense_4 (Dense) (None, 1) 3 \n#> dense_5 (Dense) (None, 3) 6 \n#> ================================================================================\n#> Total params: 9 (36.00 Byte)\n#> Trainable params: 9 (36.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> 
________________________________________________________________________________\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> 1/1 - 0s - 43ms/epoch - 43ms/step\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 one \n#> 4 one \n#> 5 one \n#> 6 three \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> 1/1 - 0s - 6ms/epoch - 6ms/step\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.262 0.342 0.396 \n#> 2 0.335 0.326 0.338 \n#> 3 0.352 0.322 0.326 \n#> 4 0.749 0.159 0.0919\n#> 5 0.680 0.194 0.126 \n#> 6 0.0924 0.335 0.573 \n#> 7 0.203 0.349 0.448 \n#> 8 0.417 0.303 0.280\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = tbl_mtl$training)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Formula: class ~ .\n#> \n#> Coefficients:\n#> (Intercept) A B\n#> one 0.05447853 -1.0569131 -0.9049194\n#> three 0.41207949 0.1458870 0.3959664\n#> two -0.46655802 0.9110261 0.5089529\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = tbl_mtl$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 one \n#> 2 one \n#> 3 three \n#> 4 three \n#> 5 three \n#> 6 three \n#> 7 three\npredict(multinom_reg_fit, type = \"prob\", new_data = tbl_mtl$test)\n#> # Source: SQL [?? x 3]\n#> # Database: spark_connection\n#> pred_one pred_three pred_two\n#> \n#> 1 0.910 0.0814 0.00904\n#> 2 0.724 0.233 0.0427 \n#> 3 0.124 0.620 0.256 \n#> 4 0.0682 0.610 0.322 \n#> 5 0.130 0.571 0.300 \n#> 6 0.115 0.549 0.336 \n#> 7 0.0517 0.524 0.424\n```\n:::\n\n\n:::\n\n## Naive Bayes (`naive_Bayes()`) \n\n:::{.panel-tabset}\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_spec <- naive_Bayes() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\nnaive_Bayes_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: naivebayes\n#> Model ID: NaiveBayes_model_R_1763571327438_5183 \n#> Model Summary: \n#> number_of_response_levels min_apriori_probability max_apriori_probability\n#> 1 2 0.44713 0.55287\n#> \n#> \n#> H2OBinomialMetrics: naivebayes\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.1737113\n#> RMSE: 0.4167869\n#> LogLoss: 0.5473431\n#> Mean Per-Class Error: 0.2356138\n#> AUC: 0.8377152\n#> AUCPR: 0.788608\n#> Gini: 0.6754303\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 274 160 0.368664 =160/434\n#> Class2 36 315 0.102564 =36/351\n#> Totals 310 475 0.249682 =196/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.175296 0.762712 286\n#> 2 max f2 0.133412 0.851119 306\n#> 3 max f0point5 0.497657 0.731343 183\n#> 4 max accuracy 0.281344 0.765605 248\n#> 5 max precision 0.999709 1.000000 0\n#> 6 max recall 0.020983 1.000000 390\n#> 7 max specificity 0.999709 1.000000 0\n#> 8 max absolute_mcc 0.280325 0.541898 249\n#> 9 max min_per_class_accuracy 0.398369 0.758065 215\n#> 10 max mean_per_class_accuracy 0.280325 0.771945 249\n#> 11 max tns 0.999709 434.000000 0\n#> 12 max fns 0.999709 347.000000 0\n#> 13 max fps 0.006522 434.000000 399\n#> 14 max tps 0.020983 351.000000 390\n#> 15 max tnr 0.999709 1.000000 0\n#> 16 max fnr 0.999709 0.988604 0\n#> 17 max fpr 0.006522 1.000000 399\n#> 18 max tpr 0.020983 1.000000 390\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class2\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.181 0.819 \n#> 2 0.750 0.250 \n#> 3 0.556 0.444 \n#> 4 0.994 0.00643\n#> 5 0.967 0.0331 \n#> 6 0.630 0.370\n```\n:::\n\n\n## `klaR` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and klaR is the default engine so there is no need to set that either.\nnaive_Bayes_spec <- naive_Bayes()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.250 0.750 \n#> 2 0.593 0.407 \n#> 3 0.333 0.667 \n#> 4 0.993 0.00658\n#> 5 0.978 0.0223 \n#> 6 0.531 0.469\n```\n:::\n\n\n## `naivebayes` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_spec <- naive_Bayes() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"naivebayes\")\n```\n:::\n\n\nNow we create the model fit 
object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\nnaive_Bayes_fit\n#> parsnip model object\n#> \n#> \n#> ================================= Naive Bayes ==================================\n#> \n#> Call:\n#> naive_bayes.default(x = maybe_data_frame(x), y = y, usekernel = TRUE)\n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> Laplace smoothing: 0\n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> A priori probabilities: \n#> \n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> Tables: \n#> \n#> -------------------------------------------------------------------------------- \n#> :: A::Class1 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (434 obs.);\tBandwidth 'bw' = 0.2548\n#> \n#> x y \n#> Min. :-2.5638 Min. :0.0002915 \n#> 1st Qu.:-1.2013 1st Qu.:0.0506201 \n#> Median : 0.1612 Median :0.1619843 \n#> Mean : 0.1612 Mean :0.1831190 \n#> 3rd Qu.: 1.5237 3rd Qu.:0.2581668 \n#> Max. : 2.8862 Max. :0.5370762 \n#> -------------------------------------------------------------------------------- \n#> :: A::Class2 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (351 obs.);\tBandwidth 'bw' = 0.2596\n#> \n#> x y \n#> Min. :-2.5428 Min. :4.977e-05 \n#> 1st Qu.:-1.1840 1st Qu.:2.672e-02 \n#> Median : 0.1748 Median :2.239e-01 \n#> Mean : 0.1748 Mean :1.836e-01 \n#> 3rd Qu.: 1.5336 3rd Qu.:2.926e-01 \n#> Max. : 2.8924 Max. :3.740e-01 \n#> \n#> -------------------------------------------------------------------------------- \n#> :: B::Class1 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (434 obs.);\tBandwidth 'bw' = 0.1793\n#> \n#> x y \n#> Min. :-2.4501 Min. :5.747e-05 \n#> 1st Qu.:-1.0894 1st Qu.:1.424e-02 \n#> Median : 0.2713 Median :8.798e-02 \n#> Mean : 0.2713 Mean :1.834e-01 \n#> 3rd Qu.: 1.6320 3rd Qu.:2.758e-01 \n#> Max. : 2.9927 Max. :6.872e-01 \n#> \n#> -------------------------------------------------------------------------------- \n#> :: B::Class2 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (351 obs.);\tBandwidth 'bw' = 0.2309\n#> \n#> x y \n#> Min. :-2.4621 Min. :5.623e-05 \n#> 1st Qu.:-0.8979 1st Qu.:1.489e-02 \n#> Median : 0.6663 Median :7.738e-02 \n#> Mean : 0.6663 Mean :1.595e-01 \n#> 3rd Qu.: 2.2305 3rd Qu.:3.336e-01 \n#> Max. : 3.7948 Max. 
:4.418e-01 \n#> \n#> --------------------------------------------------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.249 0.751 \n#> 2 0.593 0.407 \n#> 3 0.332 0.668 \n#> 4 0.993 0.00674\n#> 5 0.978 0.0224 \n#> 6 0.532 0.468\n```\n:::\n\n\n:::\n\n## K-Nearest Neighbors (`nearest_neighbor()`) \n\n:::{.panel-tabset}\n\n## `kknn` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_spec <- nearest_neighbor() |>\n # We need to set the mode since this engine works with multiple modes\n # and kknn is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_fit <- nearest_neighbor_spec |> fit(class ~ ., data = bin_train)\nnearest_neighbor_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> kknn::train.kknn(formula = class ~ ., data = data, ks = min_rows(5, data, 5))\n#> \n#> Type of response variable: nominal\n#> Minimal misclassification: 0.2101911\n#> Best kernel: optimal\n#> Best k: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(nearest_neighbor_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(nearest_neighbor_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.2 0.8 \n#> 2 0.72 0.28\n#> 3 0.32 0.68\n#> 4 1 0 \n#> 5 1 0 \n#> 6 1 0\n```\n:::\n\n\n:::\n\n## Null Model (`null_model()`) \n\n:::{.panel-tabset}\n\n## `parsnip` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_spec <- null_model() |>\n # We need to set the mode since this engine works with multiple modes\n # and parsnip is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_fit <- null_model_spec |> fit(class ~ ., data = bin_train)\nnull_model_fit\n#> parsnip model object\n#> \n#> Null Regression Model\n#> Predicted Value: Class1\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(null_model_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(null_model_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.553 0.447\n#> 2 0.553 0.447\n#> 3 0.553 0.447\n#> 4 0.553 0.447\n#> 5 0.553 0.447\n#> 6 0.553 0.447\n```\n:::\n\n\n:::\n\n## Partial Least Squares (`pls()`) \n\n:::{.panel-tabset}\n\n## `mixOmics` \n\nThis engine requires the plsmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r 
.cell-code}\nlibrary(plsmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_spec <- pls() |>\n # We need to set the mode since this engine works with multiple modes\n # and mixOmics is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_fit <- pls_spec |> fit(class ~ ., data = bin_train)\npls_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> mixOmics::splsda(X = x, Y = y, ncomp = ncomp, keepX = keepX) \n#> \n#> sPLS-DA (regression mode) with 2 sPLS-DA components. \n#> You entered data X of dimensions: 785 2 \n#> You entered data Y with 2 classes. \n#> \n#> Selection of [2] [2] variables on each of the sPLS-DA components on the X data set. \n#> No Y variables can be selected. \n#> \n#> Main numerical outputs: \n#> -------------------- \n#> loading vectors: see object$loadings \n#> variates: see object$variates \n#> variable names: see object$names \n#> \n#> Functions to visualise samples: \n#> -------------------- \n#> plotIndiv, plotArrow, cim \n#> \n#> Functions to visualise variables: \n#> -------------------- \n#> plotVar, plotLoadings, network, cim \n#> \n#> Other functions: \n#> -------------------- \n#> selectVar, tune, perf, auc\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(pls_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(pls_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.462 0.538\n#> 2 0.631 0.369\n#> 3 0.512 0.488\n#> 4 0.765 0.235\n#> 5 0.675 0.325\n#> 6 0.624 0.376\n```\n:::\n\n\n:::\n\n## Random Forests (`rand_forest()`) \n\n:::{.panel-tabset}\n\n## `ranger` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n # and ranger is the default engine so there is no need to set that either.\n set_engine(\"ranger\", keep.inbag = TRUE) |> \n # However, we'll set the engine and use the keep.inbag=TRUE option so that we \n # can produce interval predictions. This is not generally required. 
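\n # (The keep.inbag values record which rows went into each tree; the ranger\n # engine needs these for the infinitesimal jackknife intervals used by\n # type = \"conf_int\" below.)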
\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(841)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Ranger result\n#> \n#> Call:\n#> ranger::ranger(x = maybe_data_frame(x), y = y, keep.inbag = ~TRUE, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) \n#> \n#> Type: Probability estimation \n#> Number of trees: 500 \n#> Sample size: 785 \n#> Number of independent variables: 2 \n#> Mtry: 1 \n#> Target node size: 10 \n#> Variable importance mode: none \n#> Splitrule: gini \n#> OOB prediction error (Brier s.): 0.1477679\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.220 0.780 \n#> 2 0.837 0.163 \n#> 3 0.220 0.780 \n#> 4 0.951 0.0485\n#> 5 0.785 0.215 \n#> 6 0.913 0.0868\npredict(rand_forest_fit, type = \"conf_int\", new_data = bin_test)\n#> Warning in rInfJack(x, inbag.counts): Sample size <=20, no calibration\n#> performed.\n#> Warning in rInfJack(x, inbag.counts): Sample size <=20, no calibration\n#> performed.\n#> Warning in sqrt(infjack): NaNs produced\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0 0.477 0.523 1 \n#> 2 0.604 1 0 0.396\n#> 3 0.01000 0.431 0.569 0.990\n#> 4 0.846 1 0 0.154\n#> 5 0.469 1 0 0.531\n#> 6 NaN NaN NaN NaN\n```\n:::\n\n\n## `aorsf` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(923)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random classification forest\n#> \n#> Linear combinations: Accelerated Logistic regression\n#> N observations: 785\n#> N classes: 2\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 24.092\n#> Min observations in leaf: 5\n#> OOB stat value: 0.87\n#> OOB stat type: AUC-ROC\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.189 0.811 \n#> 2 0.870 
0.130 \n#> 3 0.346 0.654 \n#> 4 0.979 0.0206\n#> 5 0.940 0.0599\n#> 6 0.899 0.101\n```\n:::\n\n\n## `grf` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"grf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(546)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\npredict(rand_forest_fit, type = \"conf_int\", new_data = bin_test)\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(493)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: drf\n#> Model ID: DRF_model_R_1763571327438_5185 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 92621 12\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 20 16.60000 126 166 143.08000\n#> \n#> \n#> H2OBinomialMetrics: drf\n#> ** Reported on training data. 
**\n#> ** Metrics reported on Out-Of-Bag training samples **\n#> \n#> MSE: 0.164699\n#> RMSE: 0.4058312\n#> LogLoss: 1.506369\n#> Mean Per-Class Error: 0.200195\n#> AUC: 0.8389854\n#> AUCPR: 0.7931927\n#> Gini: 0.6779708\n#> R^2: 0.3337559\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 327 107 0.246544 =107/434\n#> Class2 54 297 0.153846 =54/351\n#> Totals 381 404 0.205096 =161/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.363636 0.786755 125\n#> 2 max f2 0.238095 0.832435 148\n#> 3 max f0point5 0.421053 0.760108 115\n#> 4 max accuracy 0.363636 0.794904 125\n#> 5 max precision 1.000000 0.890244 0\n#> 6 max recall 0.000000 1.000000 208\n#> 7 max specificity 1.000000 0.979263 0\n#> 8 max absolute_mcc 0.363636 0.596505 125\n#> 9 max min_per_class_accuracy 0.450000 0.785714 110\n#> 10 max mean_per_class_accuracy 0.363636 0.799805 125\n#> 11 max tns 1.000000 425.000000 0\n#> 12 max fns 1.000000 278.000000 0\n#> 13 max fps 0.000000 434.000000 208\n#> 14 max tps 0.000000 351.000000 208\n#> 15 max tnr 1.000000 0.979263 0\n#> 16 max fnr 1.000000 0.792023 0\n#> 17 max fpr 0.000000 1.000000 208\n#> 18 max tpr 0.000000 1.000000 208\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.12 0.88 \n#> 2 0.94 0.0600\n#> 3 0.175 0.825 \n#> 4 1 0 \n#> 5 0.78 0.22 \n#> 6 0.92 0.0800\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(252)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\n```\n:::\n\n\nThe print method has a lot of output: \n\n
\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncapture.output(print(rand_forest_fit))[1:100] |> cat(sep = \"\\n\")\n#> parsnip model object\n#> \n#> $nodes\n#> $nodes[[1]]\n#> [1] root\n#> | [2] V3 <= -0.06906\n#> | | [3] V3 <= -0.61707\n#> | | | [4] V3 <= -0.83314\n#> | | | | [5] V3 <= -0.99048\n#> | | | | | [6] V3 <= -1.29863\n#> | | | | | | [7] V2 <= -0.93951 *\n#> | | | | | | [8] V2 > -0.93951 *\n#> | | | | | [9] V3 > -1.29863\n#> | | | | | | [10] V3 <= -1.21418 *\n#> | | | | | | [11] V3 > -1.21418\n#> | | | | | | | [12] V2 <= -1.13676 *\n#> | | | | | | | [13] V2 > -1.13676\n#> | | | | | | | | [14] V3 <= -1.14373 *\n#> | | | | | | | | [15] V3 > -1.14373 *\n#> | | | | [16] V3 > -0.99048\n#> | | | | | [17] V2 <= -1.10136 *\n#> | | | | | [18] V2 > -1.10136 *\n#> | | | [19] V3 > -0.83314\n#> | | | | [20] V3 <= -0.68684\n#> | | | | | [21] V2 <= -0.62666 *\n#> | | | | | [22] V2 > -0.62666 *\n#> | | | | [23] V3 > -0.68684 *\n#> | | [24] V3 > -0.61707\n#> | | | [25] V2 <= -0.10774\n#> | | | | [26] V3 <= -0.35574\n#> | | | | | [27] V3 <= -0.41085\n#> | | | | | | [28] V3 <= -0.52674 *\n#> | | | | | | [29] V3 > -0.52674 *\n#> | | | | | [30] V3 > -0.41085 *\n#> | | | | [31] V3 > -0.35574\n#> | | | | | [32] V3 <= -0.17325 *\n#> | | | | | [33] V3 > -0.17325 *\n#> | | | [34] V2 > -0.10774\n#> | | | | [35] V3 <= -0.38428 *\n#> | | | | [36] V3 > -0.38428 *\n#> | [37] V3 > -0.06906\n#> | | [38] V3 <= 0.54852\n#> | | | [39] V2 <= 0.53027\n#> | | | | [40] V2 <= 0.21749\n#> | | | | | [41] V3 <= 0.09376 *\n#> | | | | | [42] V3 > 0.09376\n#> | | | | | | [43] V3 <= 0.28687\n#> | | | | | | | [44] V3 <= 0.17513 *\n#> | | | | | | | [45] V3 > 0.17513 *\n#> | | | | | | [46] V3 > 0.28687 *\n#> | | | | [47] V2 > 0.21749 *\n#> | | | [48] V2 > 0.53027 *\n#> | | [49] V3 > 0.54852\n#> | | | [50] V2 <= 1.99786\n#> | | | | [51] V3 <= 1.02092\n#> | | | | | [52] V2 <= 0.5469\n#> | | | | | | [53] V3 <= 0.83487\n#> | | | | | | | [54] V2 <= 0.36626 *\n#> | | | | | | | [55] V2 > 0.36626 *\n#> | | | | | | [56] V3 > 0.83487 *\n#> | | | | | [57] V2 > 0.5469\n#> | | | | | | [58] V3 <= 0.62673 *\n#> | | | | | | [59] V3 > 0.62673 *\n#> | | | | [60] V3 > 1.02092\n#> | | | | | [61] V3 <= 1.29539\n#> | | | | | | [62] V3 <= 1.2241 *\n#> | | | | | | [63] V3 > 1.2241 *\n#> | | | | | [64] V3 > 1.29539\n#> | | | | | | [65] V3 <= 2.01809 *\n#> | | | | | | [66] V3 > 2.01809 *\n#> | | | [67] V2 > 1.99786 *\n#> \n#> $nodes[[2]]\n#> [1] root\n#> | [2] V3 <= -0.00054\n#> | | [3] V3 <= -0.58754\n#> | | | [4] V3 <= -0.83314\n#> | | | | [5] V2 <= -1.15852\n#> | | | | | [6] V2 <= -1.76192 *\n#> | | | | | [7] V2 > -1.76192 *\n#> | | | | [8] V2 > -1.15852\n#> | | | | | [9] V3 <= -1.21418\n#> | | | | | | [10] V3 <= -1.32176 *\n#> | | | | | | [11] V3 > -1.32176 *\n#> | | | | | [12] V3 > -1.21418\n#> | | | | | | [13] V2 <= -1.08164 *\n#> | | | | | | [14] V2 > -1.08164\n#> | | | | | | | [15] V3 <= -1.14373 *\n#> | | | | | | | [16] V3 > -1.14373 *\n#> | | | [17] V3 > -0.83314\n#> | | | | [18] V2 <= -0.51524\n#> | | | | | [19] V3 <= -0.66041\n#> | | | | | | [20] V3 <= -0.70885 *\n#> | | | | | | [21] V3 > -0.70885 *\n#> | | | | | [22] V3 > -0.66041 *\n#> | | | | [23] V2 > -0.51524 *\n#> | | [24] V3 > -0.58754\n#> | | | [25] V2 <= -0.07243\n#> | | | | [26] V3 <= -0.31247\n#> | | | | | [27] V2 <= -0.98014 *\n```\n:::\n\n
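\nIf you want to work with the underlying partykit forest directly, you can pull it out of the parsnip wrapper with `extract_fit_engine()`. Here is a minimal sketch (not run here), assuming that `partykit::varimp()` accepts the `cforest` object this engine produces:\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Pull the underlying cforest object out of the parsnip fit\ncforest_obj <- extract_fit_engine(rand_forest_fit)\n\n# Permutation-based variable importance; this can be slow for large forests\npartykit::varimp(cforest_obj)\n```\n:::\n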
\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.375 0.625 \n#> 2 0.813 0.187 \n#> 3 0.284 0.716 \n#> 4 0.963 0.0365\n#> 5 0.892 0.108 \n#> 6 0.922 0.0785\n```\n:::\n\n\n## `randomForest` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"randomForest\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(726)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> randomForest(x = maybe_data_frame(x), y = y) \n#> Type of random forest: classification\n#> Number of trees: 500\n#> No. of variables tried at each split: 1\n#> \n#> OOB estimate of error rate: 21.53%\n#> Confusion matrix:\n#> Class1 Class2 class.error\n#> Class1 349 85 0.1958525\n#> Class2 84 267 0.2393162\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.162 0.838\n#> 2 0.848 0.152\n#> 3 0.108 0.892\n#> 4 1 0 \n#> 5 0.74 0.26 \n#> 6 0.91 0.09\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n set_mode(\"classification\") |>\n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(693)\nrand_forest_fit <- rand_forest_spec |> fit(Class ~ ., data = tbl_bin$training)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> RandomForestClassificationModel: uid=random_forest__ffe2aceb_0ffa_4c2c_9cac_0d7e0f09c9f5, numTrees=20, numClasses=2, numFeatures=2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(rand_forest_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? 
x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.315 0.685 \n#> 2 0.241 0.759 \n#> 3 0.732 0.268 \n#> 4 0.235 0.765 \n#> 5 0.259 0.741 \n#> 6 0.933 0.0674\n#> 7 0.0968 0.903\n```\n:::\n\n\n:::\n\n## Rule Fit (`rule_fit()`) \n\n:::{.panel-tabset}\n\n## `xrf` \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n # and xrf is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(95)\nrule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> An eXtreme RuleFit model of 358 rules.\n#> \n#> Original Formula:\n#> \n#> class ~ A + B\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rule_fit_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.419 0.581\n#> 2 0.651 0.349\n#> 3 0.506 0.494\n#> 4 0.891 0.109\n#> 5 0.805 0.195\n#> 6 0.616 0.384\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(536)\nrule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: rulefit\n#> Model ID: RuleFit_model_R_1763571327438_5236 \n#> Rulefit Model Summary: \n#> family link regularization number_of_predictors_total\n#> 1 binomial logit Lasso (lambda = 0.03081 ) 2329\n#> number_of_active_predictors number_of_iterations rule_ensemble_size\n#> 1 3 4 2327\n#> number_of_trees number_of_internal_trees min_depth max_depth mean_depth\n#> 1 150 150 0 5 4.00000\n#> min_leaves max_leaves mean_leaves\n#> 1 0 29 15.51333\n#> \n#> \n#> H2OBinomialMetrics: rulefit\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.1411478\n#> RMSE: 0.3756964\n#> LogLoss: 0.4472749\n#> Mean Per-Class Error: 0.1850933\n#> AUC: 0.8779327\n#> AUCPR: 0.8372496\n#> Gini: 0.7558654\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 350 84 0.193548 =84/434\n#> Class2 62 289 0.176638 =62/351\n#> Totals 412 373 0.185987 =146/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.499611 0.798343 199\n#> 2 max f2 0.226927 0.861169 285\n#> 3 max f0point5 0.626200 0.803634 144\n#> 4 max accuracy 0.523044 0.815287 191\n#> 5 max precision 0.980574 1.000000 0\n#> 6 max recall 0.052101 1.000000 394\n#> 7 max specificity 0.980574 1.000000 0\n#> 8 max absolute_mcc 0.523044 0.627478 191\n#> 9 max min_per_class_accuracy 0.512020 0.813364 196\n#> 10 max mean_per_class_accuracy 0.499611 0.814907 199\n#> 11 max tns 0.980574 434.000000 0\n#> 12 max fns 0.980574 350.000000 0\n#> 13 max fps 0.043433 434.000000 399\n#> 14 max tps 0.052101 351.000000 394\n#> 15 max tnr 0.980574 1.000000 0\n#> 16 max fnr 0.980574 0.997151 0\n#> 17 max fpr 0.043433 1.000000 399\n#> 18 max tpr 0.052101 1.000000 394\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rule_fit_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.393 0.607 \n#> 2 0.739 0.261 \n#> 3 0.455 0.545 \n#> 4 0.956 0.0442\n#> 5 0.882 0.118 \n#> 6 0.693 0.307\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Linear Kernel) (`svm_linear()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"kernlab\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(class ~ ., data = bin_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Linear (vanilla) kernel function. 
\n#> \n#> Number of Support Vectors : 357 \n#> \n#> Objective Function Value : -353.0043 \n#> Training error : 0.17707 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.404 0.596 \n#> 2 0.858 0.142 \n#> 3 0.541 0.459 \n#> 4 0.975 0.0254\n#> 5 0.905 0.0950\n#> 6 0.850 0.150\n```\n:::\n\n\n## `LiblineaR` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n # and LiblineaR is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(class ~ ., data = bin_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized L2-loss support vector classification dual (L2R_L2LOSS_SVC_DUAL)\"\n#> \n#> $Type\n#> [1] 1\n#> \n#> $W\n#> A B Bias\n#> [1,] 0.3641766 -0.9648797 0.1182725\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $ClassNames\n#> [1] Class1 Class2\n#> Levels: Class1 Class2\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_spec <- svm_poly() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_fit <- svm_poly_spec |> fit(class ~ ., data = bin_train)\n#> Setting default kernel parameters\nsvm_poly_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Polynomial kernel function. 
\n#> Hyperparameters : degree = 1 scale = 1 offset = 1 \n#> \n#> Number of Support Vectors : 357 \n#> \n#> Objective Function Value : -353.0043 \n#> Training error : 0.17707 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_poly_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> <fct> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_poly_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> <dbl> <dbl>\n#> 1 0.399 0.601 \n#> 2 0.861 0.139 \n#> 3 0.538 0.462 \n#> 4 0.976 0.0237\n#> 5 0.908 0.0917\n#> 6 0.853 0.147\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(class ~ ., data = bin_train)\nsvm_rbf_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Gaussian Radial Basis kernel function. \n#> Hyperparameter : sigma = 1.9107071282545 \n#> \n#> Number of Support Vectors : 335 \n#> \n#> Objective Function Value : -296.4885 \n#> Training error : 0.173248 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> <fct> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_rbf_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> <dbl> <dbl>\n#> 1 0.547 0.453\n#> 2 0.871 0.129\n#> 3 0.260 0.740\n#> 4 0.861 0.139\n#> 5 0.863 0.137\n#> 6 0.863 0.137\n```\n:::\n\n\n:::\n\n# Regression Models\n\nTo demonstrate regression, we'll subset some data, make a training/test split, and standardize the predictors: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(938)\nreg_split <-\n modeldata::concrete |> \n slice_sample(n = 100) |> \n select(strength = compressive_strength, cement, age) |> \n initial_split(prop = 0.95, strata = strength)\nreg_split\n#> <Training/Testing/Total>\n#> <92/8/100>\n\nreg_rec <- \n recipe(strength ~ ., data = training(reg_split)) |> \n step_normalize(all_numeric_predictors()) |> \n prep()\n\nreg_train <- bake(reg_rec, new_data = NULL)\nreg_test <- bake(reg_rec, new_data = testing(reg_split))\n```\n:::\n\n\nWe also have models that are specifically designed for integer count outcomes. 
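\n\nAs a preview of what a count-outcome specification looks like, here is a minimal sketch (not fit here) that uses the poissonreg extension package; `glm` is its default engine:\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n\n# Poisson regression for integer counts; this model has a single mode,\n# so only the engine needs to be stated (and glm is the default anyway)\npoisson_reg_spec <- poisson_reg() |> \n set_engine(\"glm\")\n```\n:::\n\n\n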
The data for these are:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(207)\ncount_split <-\n attrition |>\n select(num_years = TotalWorkingYears, age = Age, income = MonthlyIncome) |>\n initial_split(prop = 0.994)\ncount_split\n#> <Training/Testing/Total>\n#> <1461/9/1470>\n\ncount_rec <-\n recipe(num_years ~ ., data = training(count_split)) |>\n step_normalize(all_numeric_predictors()) |>\n prep()\n\ncount_train <- bake(count_rec, new_data = NULL)\ncount_test <- bake(count_rec, new_data = testing(count_split))\n```\n:::\n\n\nFinally, we have some models that handle hierarchical data, where some rows are statistically correlated with other rows. For these examples, we'll use a data set that models body weights as a function of time for several \"subjects\" (rats, actually). We'll split these data in a way where all rows for a specific subject are either in the training or test sets: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(224)\nreg_group_split <- \n nlme::BodyWeight |> \n # Get rid of some extra attributes added by the nlme package\n as_tibble() |> \n # Convert to an _unordered_ factor\n mutate(Rat = factor(as.character(Rat))) |> \n group_initial_split(group = Rat)\nreg_group_train <- training(reg_group_split)\nreg_group_test <- testing(reg_group_split)\n```\n:::\n\n\nThere are 12 subjects in the training set and 4 in the test set. \n\nIf using the **Apache Spark** engine, we will need to identify the data source, and then use it to create the splits. For this article, we will copy the `concrete` data set into the Spark session.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nsc <- spark_connect(\"local\")\n#> Re-using existing Spark connection to local\n\ntbl_concrete <- copy_to(sc, modeldata::concrete)\n\ntbl_reg <- sdf_random_split(tbl_concrete, training = 0.95, test = 0.05, seed = 100)\n```\n:::\n\n\n## Bagged MARS (`bag_mars()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_spec <- bag_mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(147)\nbag_mars_fit <- bag_mars_spec |> fit(strength ~ ., data = reg_train)\nbag_mars_fit\n#> parsnip model object\n#> \n#> Bagged MARS (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> <chr> <dbl> <dbl> <int>\n#> 1 age 93.1 4.61 11\n#> 2 cement 69.4 4.95 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mars_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> <dbl>\n#> 1 22.4\n#> 2 41.9\n#> 3 26.7\n#> 4 56.6\n#> 5 36.4\n#> 6 36.2\n#> 7 37.8\n#> 8 37.7\n```\n:::\n\n\n:::\n\n## Bagged Neural Networks (`bag_mlp()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model 
specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_spec <- bag_mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(324)\nbag_mlp_fit <- bag_mlp_spec |> fit(strength ~ ., data = reg_train)\nbag_mlp_fit\n#> parsnip model object\n#> \n#> Bagged nnet (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 age 55.9 2.96 11\n#> 2 cement 44.1 2.96 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 19.9\n#> 2 39.1\n#> 3 28.3\n#> 4 68.8\n#> 5 44.1\n#> 6 36.3\n#> 7 40.8\n#> 8 37.0\n```\n:::\n\n\n:::\n\n## Bagged Decision Trees (`bag_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(230)\nbag_tree_fit <- bag_tree_spec |> fit(strength ~ ., data = reg_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged CART (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 cement 16621. 1392. 11\n#> 2 age 12264. 710. 
11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.0\n#> 2 33.0\n#> 3 29.6\n#> 4 54.2\n#> 5 36.2\n#> 6 39.4\n#> 7 40.7\n#> 8 46.5\n```\n:::\n\n\n:::\n\n## Bayesian Additive Regression Trees (`bart()`) \n\n:::{.panel-tabset}\n\n## `dbarts` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_spec <- bart() |>\n # We need to set the mode since this engine works with multiple modes\n # and dbarts is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(134)\nbart_fit <- bart_spec |> fit(strength ~ ., data = reg_train)\nbart_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> `NULL`()\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bart_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.2\n#> 2 40.9\n#> 3 26.0\n#> 4 52.0\n#> 5 36.5\n#> 6 36.7\n#> 7 39.0\n#> 8 37.8\npredict(bart_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 17.0 32.4\n#> 2 33.0 48.9\n#> 3 20.1 31.5\n#> 4 42.0 62.5\n#> 5 28.5 44.5\n#> 6 30.3 42.3\n#> 7 33.1 45.3\n#> 8 26.3 48.8\npredict(bart_fit, type = \"pred_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 5.00 41.8\n#> 2 19.9 60.5\n#> 3 7.37 44.3\n#> 4 32.4 72.1\n#> 5 15.7 56.4\n#> 6 18.9 56.8\n#> 7 21.2 57.2\n#> 8 17.2 58.5\n```\n:::\n\n\n:::\n\n## Boosted Decision Trees (`boost_tree()`) \n\n:::{.panel-tabset}\n\n## `xgboost` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and xgboost is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(748)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> ##### xgb.Booster\n#> raw: 35 Kb \n#> call:\n#> xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, \n#> colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, \n#> subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist, \n#> verbose = 0, nthread = 1, objective = \"reg:squarederror\")\n#> params (as set within xgb.train):\n#> eta = \"0.3\", max_depth = \"6\", gamma = \"0\", colsample_bytree = \"1\", colsample_bynode = \"1\", min_child_weight = \"1\", subsample = \"1\", nthread = \"1\", objective = \"reg:squarederror\", validate_parameters = \"TRUE\"\n#> xgb.attributes:\n#> niter\n#> callbacks:\n#> cb.evaluation.log()\n#> # of features: 2 \n#> niter: 15\n#> nfeatures : 2 \n#> evaluation_log:\n#> iter training_rmse\n#> \n#> 1 27.511751\n#> 2 20.726236\n#> --- ---\n#> 14 2.774394\n#> 15 2.632224\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 
1\n#> .pred\n#> \n#> 1 22.3\n#> 2 32.9\n#> 3 26.7\n#> 4 57.6\n#> 5 34.9\n#> 6 33.8\n#> 7 42.6\n#> 8 26.3\n```\n:::\n\n\n## `catboost` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"catboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(557)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> CatBoost model (1000 trees)\n#> Loss function: RMSE\n#> Fit to 2 feature(s)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 26.6\n#> 2 33.9\n#> 3 27.8\n#> 4 60.6\n#> 5 34.7\n#> 6 36.3\n#> 7 43.6\n#> 8 29.3\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o_gbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(720)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: gbm\n#> Model ID: GBM_model_R_1763571327438_5392 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 20472 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 14 43 27.92000\n#> \n#> \n#> H2ORegressionMetrics: gbm\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.001563879\n#> RMSE: 0.03954591\n#> MAE: 0.02903684\n#> RMSLE: 0.001771464\n#> Mean Residual Deviance : 0.001563879\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.7\n#> 2 32.2\n#> 3 26.9\n#> 4 63.2\n#> 5 34.9\n#> 6 39.0\n#> 7 40.0\n#> 8 32.9\n```\n:::\n\n\n## `h2o_gbm` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o_gbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(90)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: gbm\n#> Model ID: GBM_model_R_1763571327438_5393 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 20473 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 14 43 27.92000\n#> \n#> \n#> H2ORegressionMetrics: gbm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.001563879\n#> RMSE: 0.03954591\n#> MAE: 0.02903684\n#> RMSLE: 0.001771464\n#> Mean Residual Deviance : 0.001563879\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.7\n#> 2 32.2\n#> 3 26.9\n#> 4 63.2\n#> 5 34.9\n#> 6 39.0\n#> 7 40.0\n#> 8 32.9\n```\n:::\n\n\n## `lightgbm` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"lightgbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(570)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> LightGBM Model (100 trees)\n#> Objective: regression\n#> Fitted to dataset with 2 columns\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 20.6\n#> 2 42.5\n#> 3 27.0\n#> 4 49.2\n#> 5 43.7\n#> 6 38.3\n#> 7 41.1\n#> 8 36.9\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n set_mode(\"regression\") |>\n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for 
reproducibility: \nset.seed(620)\nboost_tree_fit <- boost_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> GBTRegressionModel: uid=gradient_boosted_trees__1965cfeb_e7de_44f1_a524_4ebd0e873064, numTrees=20, numFeatures=8\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 20.8 \n#> 2 28.1 \n#> 3 15.5 \n#> 4 22.4 \n#> 5 9.37\n#> 6 40.1 \n#> 7 14.2 \n#> 8 32.1 \n#> 9 37.4 \n#> 10 49.5 \n#> # ℹ more rows\n```\n:::\n\n\n:::\n\n## Cubist Rules (`cubist_rules()`) \n\n:::{.panel-tabset}\n\n## `Cubist` \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and Cubist is the default engine so there is no need to set that either.\ncubist_rules_spec <- cubist_rules()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(188)\ncubist_rules_fit <- cubist_rules_spec |> fit(strength ~ ., data = reg_train)\ncubist_rules_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> cubist.default(x = x, y = y, committees = 1)\n#> \n#> Number of samples: 92 \n#> Number of predictors: 2 \n#> \n#> Number of committees: 1 \n#> Number of rules: 2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(cubist_rules_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.2\n#> 2 46.3\n#> 3 23.6\n#> 4 54.4\n#> 5 32.7\n#> 6 37.8\n#> 7 38.8\n#> 8 38.6\n```\n:::\n\n\n:::\n\n## Decision Tree (`decision_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> n= 92 \n#> \n#> node), split, n, deviance, yval\n#> * denotes terminal node\n#> \n#> 1) root 92 26564.7400 33.57728 \n#> 2) cement< 0.7861846 69 12009.9000 27.81493 \n#> 4) age< -0.5419541 23 964.6417 14.42348 \n#> 8) cement< -0.3695209 12 292.7811 11.14083 *\n#> 9) cement>=-0.3695209 11 401.4871 18.00455 *\n#> 5) age>=-0.5419541 46 4858.3440 34.51065 \n#> 10) age< 0.008934354 32 2208.3040 31.16781 \n#> 20) cement< 0.311975 24 1450.6200 28.75583 *\n#> 21) cement>=0.311975 8 199.1900 38.40375 *\n#> 11) age>=0.008934354 14 1475.1130 42.15143 *\n#> 3) cement>=0.7861846 23 5390.3320 50.86435 \n#> 6) age< -0.5419541 7 390.4204 40.08429 *\n#> 7) age>=-0.5419541 16 3830.5510 55.58062 *\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = 
reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 18.0\n#> 2 42.2\n#> 3 28.8\n#> 4 55.6\n#> 5 40.1\n#> 6 38.4\n#> 7 38.4\n#> 8 40.1\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> strength ~ cement + age\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] cement <= 0.72078\n#> | | [3] age <= -0.60316\n#> | | | [4] cement <= -0.38732: 11.141 (n = 12, err = 292.8)\n#> | | | [5] cement > -0.38732: 18.005 (n = 11, err = 401.5)\n#> | | [6] age > -0.60316\n#> | | | [7] cement <= 0.24945\n#> | | | | [8] age <= -0.2359: 28.756 (n = 24, err = 1450.6)\n#> | | | | [9] age > -0.2359: 39.014 (n = 11, err = 634.8)\n#> | | | [10] cement > 0.24945: 42.564 (n = 11, err = 1041.7)\n#> | [11] cement > 0.72078: 50.864 (n = 23, err = 5390.3)\n#> \n#> Number of inner nodes: 5\n#> Number of terminal nodes: 6\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 18.0\n#> 2 39.0\n#> 3 28.8\n#> 4 50.9\n#> 5 50.9\n#> 6 42.6\n#> 7 42.6\n#> 8 50.9\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n set_mode(\"regression\") |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\ndecision_tree_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = tbl_reg$test)\n```\n:::\n\n\n:::\n\n## Generalized Additive Models (`gen_additive_mod()`) \n\n:::{.panel-tabset}\n\n## `mgcv` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_spec <- gen_additive_mod() |>\n # We need to set the mode since this engine works with multiple modes\n # and mgcv is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_fit <- \n gen_additive_mod_spec |> \n fit(strength ~ s(age) + s(cement), data = reg_train)\ngen_additive_mod_fit\n#> parsnip model object\n#> \n#> \n#> Family: gaussian \n#> Link function: identity \n#> \n#> Formula:\n#> strength ~ s(age) + s(cement)\n#> \n#> Estimated degrees of freedom:\n#> 4.18 3.56 total = 8.74 \n#> \n#> GCV score: 108.4401\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(gen_additive_mod_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.1\n#> 2 41.2\n#> 3 
26.7\n#> 4 55.9\n#> 5 35.2\n#> 6 37.1\n#> 7 38.5\n#> 8 39.6\npredict(gen_additive_mod_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 18.9 27.4\n#> 2 35.7 46.6\n#> 3 22.4 31.0\n#> 4 47.0 64.7\n#> 5 30.1 40.4\n#> 6 32.9 41.2\n#> 7 34.3 42.6\n#> 8 30.3 49.0\n```\n:::\n\n\n:::\n\n## Linear Reg (`linear_reg()`) \n\n:::{.panel-tabset}\n\n## `lm` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and lm is the default engine so there is no need to set that either.\nlinear_reg_spec <- linear_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = strength ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) cement age \n#> 33.577 8.795 5.471\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 28.8 35.4\n#> 2 27.1 33.5\n#> 3 17.3 25.9\n#> 4 44.6 58.1\n#> 5 35.6 45.0\n#> 6 32.3 38.3\n#> 7 33.2 39.4\n#> 8 41.6 56.0\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 5.72 58.5\n#> 2 3.89 56.7\n#> 3 -4.94 48.2\n#> 4 24.3 78.5\n#> 5 13.7 67.0\n#> 6 8.95 61.7\n#> 7 9.89 62.7\n#> 8 21.6 76.0\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(1)\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear regression\n#> \n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 \n#> batch size: 83 \n#> scaled validation loss after 1 epoch: 235\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.1\n#> 3 21.6\n#> 4 51.2\n#> 5 40.3\n#> 6 35.2\n#> 7 36.2\n#> 8 48.7\n```\n:::\n\n\n## `gee` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- \n linear_reg_spec |> \n fit(weight ~ Time + Diet + id_var(Rat), data = reg_group_train)\n#> Beginning Cgee S-function, @(#) 
geeformula.q 4.13 98/01/27\n#> running glm to get initial regression estimate\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA\n#> gee S-function, version 4.13 modified 98/01/27 (1998) \n#> \n#> Model:\n#> Link: Identity \n#> Variance to Mean Relation: Gaussian \n#> Correlation Structure: Independent \n#> \n#> Call:\n#> gee::gee(formula = weight ~ Time + Diet, id = data$Rat, data = data, \n#> family = gaussian)\n#> \n#> Number of observations : 132 \n#> \n#> Maximum cluster size : 11 \n#> \n#> \n#> Coefficients:\n#> (Intercept) Time Diet2 Diet3 \n#> 245.410439 0.549192 185.621212 259.287879 \n#> \n#> Estimated Scale Parameter: 272.1604\n#> Number of Iterations: 1\n#> \n#> Working Correlation[1:4,1:4]\n#> [,1] [,2] [,3] [,4]\n#> [1,] 1 0 0 0\n#> [2,] 0 1 0 0\n#> [3,] 0 0 1 0\n#> [4,] 0 0 0 1\n#> \n#> \n#> Returned Error Value:\n#> [1] 0\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `glm` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = strength ~ ., family = stats::gaussian, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) cement age \n#> 33.577 8.795 5.471 \n#> \n#> Degrees of Freedom: 91 Total (i.e. 
Null); 89 Residual\n#> Null Deviance:\t 26560 \n#> Residual Deviance: 15480 \tAIC: 740.6\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 28.8 35.4\n#> 2 27.1 33.5\n#> 3 17.3 25.9\n#> 4 44.6 58.1\n#> 5 35.6 45.0\n#> 6 32.3 38.3\n#> 7 33.2 39.4\n#> 8 41.6 56.0\n```\n:::\n\n\n## `glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- \n linear_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\n#> Warning in lme4::glmer(formula = weight ~ Diet + Time + (1 | Rat), data = data,\n#> : calling glmer() with family=gaussian (identity link) as a shortcut to lmer()\n#> is deprecated; please call lmer() directly\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear mixed model fit by REML ['lmerMod']\n#> Formula: weight ~ Diet + Time + (1 | Rat)\n#> Data: data\n#> REML criterion at convergence: 955.6549\n#> Random effects:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 16.331 \n#> Residual 8.117 \n#> Number of obs: 132, groups: Rat, 12\n#> Fixed Effects:\n#> (Intercept) Diet2 Diet3 Time \n#> 245.4104 185.6212 259.2879 0.5492\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `glmnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"gaussian\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 9.5680\n#> 2 1 5.38 8.7180\n#> 3 1 9.85 7.9430\n#> 4 1 13.56 7.2380\n#> 5 1 16.64 6.5950\n#> 6 2 19.99 6.0090\n#> 7 2 23.68 5.4750\n#> 8 2 26.75 4.9890\n#> 9 2 29.29 4.5450\n#> 10 2 31.40 4.1420\n#> 11 2 33.15 3.7740\n#> 12 2 34.61 3.4380\n#> 13 2 35.82 3.1330\n#> 14 2 36.82 2.8550\n#> 15 2 37.65 2.6010\n#> 16 2 38.34 2.3700\n#> 17 2 38.92 2.1590\n#> 18 2 39.39 1.9680\n#> 19 2 39.79 1.7930\n#> 20 2 40.12 1.6340\n#> 21 2 40.39 1.4880\n#> 22 2 40.62 1.3560\n#> 23 2 40.80 1.2360\n#> 24 2 40.96 1.1260\n#> 25 2 41.09 1.0260\n#> 26 2 41.20 0.9348\n#> 27 2 41.29 0.8517\n#> 28 2 41.36 0.7761\n#> 29 2 41.42 0.7071\n#> 30 2 41.47 0.6443\n#> 31 2 41.52 0.5871\n#> 32 2 41.55 
0.5349\n#> 33 2 41.58 0.4874\n#> 34 2 41.60 0.4441\n#> 35 2 41.63 0.4046\n#> 36 2 41.64 0.3687\n#> 37 2 41.66 0.3359\n#> 38 2 41.67 0.3061\n#> 39 2 41.68 0.2789\n#> 40 2 41.68 0.2541\n#> 41 2 41.69 0.2316\n#> 42 2 41.70 0.2110\n#> 43 2 41.70 0.1922\n#> 44 2 41.71 0.1752\n#> 45 2 41.71 0.1596\n#> 46 2 41.71 0.1454\n#> 47 2 41.71 0.1325\n#> 48 2 41.71 0.1207\n#> 49 2 41.72 0.1100\n#> 50 2 41.72 0.1002\n#> 51 2 41.72 0.0913\n#> 52 2 41.72 0.0832\n#> 53 2 41.72 0.0758\n#> 54 2 41.72 0.0691\n#> 55 2 41.72 0.0630\n#> 56 2 41.72 0.0574\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.2\n#> 2 30.3\n#> 3 21.7\n#> 4 51.3\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.7\n```\n:::\n\n\n## `gls` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n # Also, nlme::gls() specifies the random effects outside of the formula so\n # we set that as an engine parameter\n set_engine(\"gls\", correlation = nlme::corCompSymm(form = ~Time|Rat))\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(weight ~ Time + Diet, data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Generalized least squares fit by REML\n#> Model: weight ~ Time + Diet \n#> Data: data \n#> Log-restricted-likelihood: -477.8274\n#> \n#> Coefficients:\n#> (Intercept) Time Diet2 Diet3 \n#> 245.410439 0.549192 185.621212 259.287879 \n#> \n#> Correlation Structure: Compound symmetry\n#> Formula: ~Time | Rat \n#> Parameter estimate(s):\n#> Rho \n#> 0.8019221 \n#> Degrees of freedom: 132 total; 128 residual\n#> Residual standard error: 18.23695\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: glm\n#> Model ID: GLM_model_R_1763571327438_5394 \n#> GLM Model: summary\n#> family link regularization\n#> 1 gaussian identity Elastic Net (alpha = 0.5, lambda = 0.01903 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 1\n#> training_frame\n#> 1 object_ujvnjgioue\n#> \n#> Coefficients: glm coefficients\n#> names coefficients 
standardized_coefficients\n#> 1 Intercept 33.577283 33.577283\n#> 2 cement 8.708461 8.708461\n#> 3 age 5.422201 5.422201\n#> \n#> H2ORegressionMetrics: glm\n#> ** Reported on training data. **\n#> \n#> MSE: 168.2822\n#> RMSE: 12.97236\n#> MAE: 10.62672\n#> RMSLE: 0.4645554\n#> Mean Residual Deviance : 168.2822\n#> R^2 : 0.4171988\n#> Null Deviance :26564.74\n#> Null D.o.F. :91\n#> Residual Deviance :15481.96\n#> Residual D.o.F. :89\n#> AIC :740.6438\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.7\n#> 4 51.2\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.7\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(596)\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Model: \"sequential_3\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense_6 (Dense) (None, 1) 3 \n#> dense_7 (Dense) (None, 1) 2 \n#> ================================================================================\n#> Total params: 5 (20.00 Byte)\n#> Trainable params: 5 (20.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> ________________________________________________________________________________\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> 1/1 - 0s - 41ms/epoch - 41ms/step\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 0.157 \n#> 2 -0.000594\n#> 3 -0.0677 \n#> 4 0.414 \n#> 5 0.290 \n#> 6 0.154 \n#> 7 0.170 \n#> 8 0.443\n```\n:::\n\n\n## `lme` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that. \n # nlme::lme() makes us set the random effects outside of the formula so we\n # add it as an engine parameter. 
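 # The fixed effects still go in the formula passed to fit() below.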
\n set_engine(\"lme\", random = ~ Time | Rat)\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(weight ~ Diet + Time, data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear mixed-effects model fit by REML\n#> Data: data \n#> Log-restricted-likelihood: -426.5662\n#> Fixed: weight ~ Diet + Time \n#> (Intercept) Diet2 Diet3 Time \n#> 240.483603 199.723140 264.893298 0.549192 \n#> \n#> Random effects:\n#> Formula: ~Time | Rat\n#> Structure: General positive-definite, Log-Cholesky parametrization\n#> StdDev Corr \n#> (Intercept) 25.2657397 (Intr)\n#> Time 0.3411097 -0.816\n#> Residual 4.5940697 \n#> \n#> Number of Observations: 132\n#> Number of Groups: 12\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 241.\n#> 2 245.\n#> 3 249.\n#> 4 253.\n#> 5 256.\n#> 6 260.\n#> 7 264.\n#> 8 265.\n#> 9 268.\n#> 10 272.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `lmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"lmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- \n linear_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear mixed model fit by REML ['lmerMod']\n#> Formula: weight ~ Diet + Time + (1 | Rat)\n#> Data: data\n#> REML criterion at convergence: 955.6549\n#> Random effects:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 16.331 \n#> Residual 8.117 \n#> Number of obs: 132, groups: Rat, 12\n#> Fixed Effects:\n#> (Intercept) Diet2 Diet3 Time \n#> 245.4104 185.6212 259.2879 0.5492\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `stan` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(357)\nlinear_reg_fit <- linear_reg_spec |> fit(weight ~ Diet + Time, data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> stan_glm\n#> family: gaussian [identity]\n#> formula: weight ~ Diet + Time\n#> observations: 132\n#> predictors: 4\n#> ------\n#> Median MAD_SD\n#> (Intercept) 245.3 3.3 \n#> Diet2 185.6 3.6 \n#> Diet3 259.3 3.4 \n#> Time 0.6 0.1 \n#> \n#> Auxiliary parameter(s):\n#> Median MAD_SD\n#> sigma 16.6 1.0 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see 
?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 240. 252.\n#> 2 244. 255.\n#> 3 249. 258.\n#> 4 253. 262.\n#> 5 257. 265.\n#> 6 261. 269.\n#> 7 265. 273.\n#> 8 265. 274.\n#> 9 268. 278.\n#> 10 271. 282.\n#> # ℹ 34 more rows\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 213. 278.\n#> 2 216. 282.\n#> 3 220. 287.\n#> 4 224. 290.\n#> 5 228. 292.\n#> 6 230. 297.\n#> 7 236. 301.\n#> 8 236. 302.\n#> 9 240. 305.\n#> 10 244. 310.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `stan_glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(895)\nlinear_reg_fit <- \n linear_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> stan_glmer\n#> family: gaussian [identity]\n#> formula: weight ~ Diet + Time + (1 | Rat)\n#> observations: 132\n#> ------\n#> Median MAD_SD\n#> (Intercept) 245.6 6.8 \n#> Diet2 185.7 11.5 \n#> Diet3 259.2 11.5 \n#> Time 0.5 0.0 \n#> \n#> Auxiliary parameter(s):\n#> Median MAD_SD\n#> sigma 8.2 0.5 \n#> \n#> Error terms:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 17.2 \n#> Residual 8.2 \n#> Num. levels: Rat 12 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 258.\n#> 5 262.\n#> 6 266.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 205. 285.\n#> 2 211. 289.\n#> 3 214. 292.\n#> 4 218. 295.\n#> 5 221. 300.\n#> 6 225. 303.\n#> 7 230. 307.\n#> 8 230. 309.\n#> 9 233. 312.\n#> 10 237. 
314.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> Coefficients:\n#> (Intercept) cement blast_furnace_slag fly_ash \n#> -21.80239627 0.12003251 0.10399582 0.08747677 \n#> water superplasticizer coarse_aggregate fine_aggregate \n#> -0.15701342 0.28531613 0.01777782 0.02018358 \n#> age \n#> 0.11678247\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 16.5\n#> 2 19.7\n#> 3 26.1\n#> 4 23.6\n#> 5 24.2\n#> 6 29.1\n#> 7 21.3\n#> 8 24.2\n#> 9 33.9\n#> 10 57.7\n#> # ℹ more rows\n```\n:::\n\n\n:::\n\n## Multivariate Adaptive Regression Splines (`mars()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_spec <- mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_fit <- mars_spec |> fit(strength ~ ., data = reg_train)\nmars_fit\n#> parsnip model object\n#> \n#> Selected 4 of 9 terms, and 2 of 2 predictors\n#> Termination condition: RSq changed by less than 0.001 at 9 terms\n#> Importance: age, cement\n#> Number of terms at each degree of interaction: 1 3 (additive model)\n#> GCV 113.532 RSS 8915.965 GRSq 0.6153128 RSq 0.6643684\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mars_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 22.0\n#> 2 43.1\n#> 3 28.1\n#> 4 58.0\n#> 5 33.8\n#> 6 34.9\n#> 7 36.3\n#> 8 43.5\n```\n:::\n\n\n:::\n\n## Neural Networks (`mlp()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(159)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> a 2-5-1 network with 21 weights\n#> inputs: cement age \n#> output(s): strength \n#> options were - linear output units\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 14.8\n#> 2 38.5\n#> 3 32.0\n#> 4 63.6\n#> 5 43.5\n#> 6 42.7\n#> 7 42.3\n#> 8 33.1\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() 
|>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(407)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> relu activation,\n#> 3 hidden units,\n#> 13 model parameters\n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 83 \n#> learn rate: 0.01 \n#> scaled validation loss after 9 epochs: 0.189\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.1\n#> 2 39.4\n#> 3 26.9\n#> 4 56.4\n#> 5 32.9\n#> 6 37.2\n#> 7 38.4\n#> 8 40.1\n```\n:::\n\n\n## `brulee_two_layer` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"brulee_two_layer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(585)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> c(relu,relu) activation,\n#> c(3,3) hidden units,\n#> 25 model parameters\n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 83 \n#> learn rate: 0.01 \n#> scaled validation loss after 3 epochs: 0.379\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.5\n#> 2 32.6\n#> 3 24.6\n#> 4 50.5\n#> 5 46.7\n#> 6 33.8\n#> 7 37.0\n#> 8 50.5\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(93)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: deeplearning\n#> Model ID: DeepLearning_model_R_1763571327438_5395 \n#> Status of Neuron Layers: predicting .outcome, regression, gaussian distribution, Quadratic loss, 801 weights/biases, 14.5 KB, 920 training samples, mini-batch size 1\n#> layer units type dropout l1 l2 mean_rate rate_rms momentum\n#> 1 1 2 Input 0.00 % NA NA NA NA NA\n#> 2 2 200 Rectifier 0.00 % 0.000000 0.000000 0.012666 0.031575 0.000000\n#> 3 3 1 Linear NA 0.000000 0.000000 0.000613 0.000166 0.000000\n#> mean_weight weight_rms mean_bias bias_rms\n#> 1 NA NA NA NA\n#> 2 -0.003107 0.098394 0.499664 0.001157\n#> 3 
-0.000248 0.098163 0.000245 0.000000
#> 
#> 
#> H2ORegressionMetrics: deeplearning
#> ** Reported on training data. **
#> ** Metrics reported on full training frame **
#> 
#> MSE: 173.8723
#> RMSE: 13.18606
#> MAE: 10.40789
#> RMSLE: 0.48563
#> Mean Residual Deviance : 173.8723
```
:::


The holdout data can be predicted:


::: {.cell layout-align="center"}

```{.r .cell-code}
predict(mlp_fit, new_data = reg_test)
#> # A tibble: 8 × 1
#> .pred
#> 
#> 1 31.1
#> 2 31.6
#> 3 25.1
#> 4 44.1
#> 5 36.3
#> 6 33.9
#> 7 34.5
#> 8 41.5
```
:::


## `keras` 

We create a model specification via:


::: {.cell layout-align="center"}

```{.r .cell-code}
mlp_spec <- mlp() |>
 # We need to set the mode since this engine works with multiple modes
 set_mode("regression") |>
 set_engine("keras")
```
:::


Now we create the model fit object:


::: {.cell layout-align="center"}

```{.r .cell-code}
# Set the random number seed to an integer for reproducibility: 
set.seed(879)
mlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)
```
:::



::: {.cell layout-align="center"}

```{.r .cell-code}
mlp_fit
```
:::


The holdout data can be predicted:


::: {.cell layout-align="center"}

```{.r .cell-code}
predict(mlp_fit, new_data = reg_test)
#> 1/1 - 0s - 42ms/epoch - 42ms/step
#> # A tibble: 8 × 1
#> .pred
#> 
#> 1 -0.386
#> 2 -0.337
#> 3 -0.299
#> 4 -0.279
#> 5 -0.385
#> 6 -0.374
#> 7 -0.373
#> 8 -0.342
```
:::


:::

## K-Nearest Neighbors (`nearest_neighbor()`) 

:::{.panel-tabset}

## `kknn` 

We create a model specification via:


::: {.cell layout-align="center"}

```{.r .cell-code}
nearest_neighbor_spec <- nearest_neighbor() |>
 # We need to set the mode since this engine works with multiple modes
 # and kknn is the default engine so there is no need to set that either.
 set_mode("regression")
```
:::


Now we create the model fit object:


::: {.cell layout-align="center"}

```{.r .cell-code}
nearest_neighbor_fit <- nearest_neighbor_spec |> fit(strength ~ ., data = reg_train)
nearest_neighbor_fit
#> parsnip model object
#> 
#> 
#> Call:
#> kknn::train.kknn(formula = strength ~ ., data = data, ks = min_rows(5, data, 5))
#> 
#> Type of response variable: continuous
#> minimal mean absolute error: 8.257735
#> Minimal mean squared error: 115.8737
#> Best kernel: optimal
#> Best k: 5
```
:::


The holdout data can be predicted:


::: {.cell layout-align="center"}

```{.r .cell-code}
predict(nearest_neighbor_fit, new_data = reg_test)
#> # A tibble: 8 × 1
#> .pred
#> 
#> 1 16.3
#> 2 35.7
#> 3 27.5
#> 4 56.7
#> 5 42.6
#> 6 41.7
#> 7 41.2
#> 8 50.2
```
:::


:::

## Null Model (`null_model()`) 

:::{.panel-tabset}

## `parsnip` 

We create a model specification via:


::: {.cell layout-align="center"}

```{.r .cell-code}
null_model_spec <- null_model() |>
 # We need to set the mode since this engine works with multiple modes
 # and parsnip is the default engine so there is no need to set that either.
 set_mode("regression")
```
:::


Now we create the model fit object:


::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_fit <- null_model_spec |> fit(strength ~ ., data = reg_train)\nnull_model_fit\n#> parsnip model object\n#> \n#> Null Classification Model\n#> Predicted Value: 33.57728\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(null_model_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 33.6\n#> 2 33.6\n#> 3 33.6\n#> 4 33.6\n#> 5 33.6\n#> 6 33.6\n#> 7 33.6\n#> 8 33.6\n```\n:::\n\n\n:::\n\n## Partial Least Squares (`pls()`) \n\n:::{.panel-tabset}\n\n## `mixOmics` \n\nThis engine requires the plsmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(plsmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_spec <- pls() |>\n # We need to set the mode since this engine works with multiple modes\n # and mixOmics is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_fit <- pls_spec |> fit(strength ~ ., data = reg_train)\npls_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> mixOmics::spls(X = x, Y = y, ncomp = ncomp, keepX = keepX) \n#> \n#> sPLS with a 'regression' mode with 2 sPLS components. \n#> You entered data X of dimensions: 92 2 \n#> You entered data Y of dimensions: 92 1 \n#> \n#> Selection of [2] [2] variables on each of the sPLS components on the X data set. \n#> Selection of [1] [1] variables on each of the sPLS components on the Y data set. \n#> \n#> Main numerical outputs: \n#> -------------------- \n#> loading vectors: see object$loadings \n#> variates: see object$variates \n#> variable names: see object$names \n#> \n#> Functions to visualise samples: \n#> -------------------- \n#> plotIndiv, plotArrow \n#> \n#> Functions to visualise variables: \n#> -------------------- \n#> plotVar, plotLoadings, network, cim\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(pls_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\n```\n:::\n\n\n:::\n\n## Poisson Reg (`poisson_reg()`) \n\n:::{.panel-tabset}\n\n## `glm` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and glm is the default engine so there is no need to set that either.\npoisson_reg_spec <- poisson_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = num_years ~ ., family = stats::poisson, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) age income \n#> 2.2861 0.2804 0.2822 \n#> \n#> Degrees of Freedom: 1460 Total (i.e. 
Null); 1458 Residual\n#> Null Deviance:\t 7434 \n#> Residual Deviance: 2597 \tAIC: 8446\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.6 \n#> 2 6.66\n#> 3 11.8 \n#> 4 24.8 \n#> 5 26.6 \n#> 6 8.23\n#> 7 32.1 \n#> 8 4.86\n#> 9 28.3\n```\n:::\n\n\n## `gee` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- \n poisson_reg_spec |> \n fit(weight ~ Diet + Time + id_var(Rat), data = reg_group_train)\n#> Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27\n#> running glm to get initial regression estimate\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA\n#> gee S-function, version 4.13 modified 98/01/27 (1998) \n#> \n#> Model:\n#> Link: Logarithm \n#> Variance to Mean Relation: Poisson \n#> Correlation Structure: Independent \n#> \n#> Call:\n#> gee::gee(formula = weight ~ Diet + Time, id = data$Rat, data = data, \n#> family = stats::poisson)\n#> \n#> Number of observations : 132 \n#> \n#> Maximum cluster size : 11 \n#> \n#> \n#> Coefficients:\n#> (Intercept) Diet2 Diet3 Time \n#> 5.525683187 0.532717136 0.684495610 0.001467487 \n#> \n#> Estimated Scale Parameter: 0.6879328\n#> Number of Iterations: 1\n#> \n#> Working Correlation[1:4,1:4]\n#> [,1] [,2] [,3] [,4]\n#> [1,] 1 0 0 0\n#> [2,] 0 1 0 0\n#> [3,] 0 0 1 0\n#> [4,] 0 0 0 1\n#> \n#> \n#> Returned Error Value:\n#> [1] 0\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Can't reproduce this:\n# predict(poisson_reg_fit, new_data = reg_group_test)\n```\n:::\n\n\n## `glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(826)\npoisson_reg_fit <- \n poisson_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\n#> Warning in checkConv(attr(opt, \"derivs\"), opt$par, ctrl = control$checkConv, :\n#> Model failed to converge with max|grad| = 0.00394285 (tol = 0.002, component 1)\n#> Warning in checkConv(attr(opt, \"derivs\"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue\n#> - Rescale variables?\npoisson_reg_fit\n#> parsnip model object\n#> \n#> Generalized linear mixed model fit by maximum likelihood (Laplace\n#> Approximation) [glmerMod]\n#> Family: poisson ( log )\n#> Formula: weight ~ Diet + Time + (1 | Rat)\n#> Data: data\n#> AIC BIC logLik -2*log(L) df.resid 
\n#> 1079.1349 1093.5489 -534.5675 1069.1349 127 \n#> Random effects:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 0.03683 \n#> Number of obs: 132, groups: Rat, 12\n#> Fixed Effects:\n#> (Intercept) Diet2 Diet3 Time \n#> 5.524796 0.533446 0.684637 0.001467 \n#> optimizer (Nelder_Mead) convergence code: 0 (OK) ; 0 optimizer warnings; 2 lme4 warnings\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 251.\n#> 2 254.\n#> 3 256.\n#> 4 259.\n#> 5 262.\n#> 6 264.\n#> 7 267.\n#> 8 268.\n#> 9 270.\n#> 10 273.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `glmnet` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"poisson\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 5.9710\n#> 2 1 10.26 5.4400\n#> 3 1 18.31 4.9570\n#> 4 2 24.84 4.5170\n#> 5 2 32.06 4.1150\n#> 6 2 37.94 3.7500\n#> 7 2 42.73 3.4170\n#> 8 2 46.65 3.1130\n#> 9 2 49.87 2.8370\n#> 10 2 52.51 2.5850\n#> 11 2 54.69 2.3550\n#> 12 2 56.48 2.1460\n#> 13 2 57.96 1.9550\n#> 14 2 59.18 1.7810\n#> 15 2 60.19 1.6230\n#> 16 2 61.03 1.4790\n#> 17 2 61.72 1.3480\n#> 18 2 62.29 1.2280\n#> 19 2 62.76 1.1190\n#> 20 2 63.16 1.0190\n#> 21 2 63.48 0.9289\n#> 22 2 63.75 0.8463\n#> 23 2 63.98 0.7712\n#> 24 2 64.16 0.7026\n#> 25 2 64.31 0.6402\n#> 26 2 64.44 0.5833\n#> 27 2 64.55 0.5315\n#> 28 2 64.64 0.4843\n#> 29 2 64.71 0.4413\n#> 30 2 64.77 0.4021\n#> 31 2 64.82 0.3664\n#> 32 2 64.86 0.3338\n#> 33 2 64.90 0.3042\n#> 34 2 64.92 0.2771\n#> 35 2 64.95 0.2525\n#> 36 2 64.97 0.2301\n#> 37 2 64.98 0.2096\n#> 38 2 65.00 0.1910\n#> 39 2 65.01 0.1741\n#> 40 2 65.02 0.1586\n#> 41 2 65.03 0.1445\n#> 42 2 65.03 0.1317\n#> 43 2 65.04 0.1200\n#> 44 2 65.04 0.1093\n#> 45 2 65.05 0.0996\n#> 46 2 65.05 0.0907\n#> 47 2 65.05 0.0827\n#> 48 2 65.05 0.0753\n#> 49 2 65.06 0.0687\n#> 50 2 65.06 0.0625\n#> 51 2 65.06 0.0570\n#> 52 2 65.06 0.0519\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.4 \n#> 2 6.70\n#> 3 11.8 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.27\n#> 7 31.8 \n#> 8 4.91\n#> 9 28.1\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = 
count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: glm\n#> Model ID: GLM_model_R_1763571327438_5396 \n#> GLM Model: summary\n#> family link regularization\n#> 1 poisson log Elastic Net (alpha = 0.5, lambda = 0.01194 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 4\n#> training_frame\n#> 1 object_kyirzmfbti\n#> \n#> Coefficients: glm coefficients\n#> names coefficients standardized_coefficients\n#> 1 Intercept 2.286411 2.286411\n#> 2 age 0.279967 0.279967\n#> 3 income 0.281952 0.281952\n#> \n#> H2ORegressionMetrics: glm\n#> ** Reported on training data. **\n#> \n#> MSE: 18.40519\n#> RMSE: 4.290128\n#> MAE: 3.297048\n#> RMSLE: 0.467537\n#> Mean Residual Deviance : 1.777749\n#> R^2 : 0.6934292\n#> Null Deviance :7434.374\n#> Null D.o.F. :1460\n#> Residual Deviance :2597.291\n#> Residual D.o.F. :1458\n#> AIC :8445.967\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.6 \n#> 2 6.67\n#> 3 11.8 \n#> 4 24.8 \n#> 5 26.5 \n#> 6 8.24\n#> 7 32.0 \n#> 8 4.87\n#> 9 28.2\n```\n:::\n\n\n## `hurdle` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"hurdle\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> pscl::hurdle(formula = num_years ~ ., data = data)\n#> \n#> Count model coefficients (truncated poisson with log link):\n#> (Intercept) age income \n#> 2.2911 0.2749 0.2820 \n#> \n#> Zero hurdle model coefficients (binomial with logit link):\n#> (Intercept) age income \n#> 24.656 5.611 13.092\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.5 \n#> 2 6.74\n#> 3 11.9 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.32\n#> 7 31.9 \n#> 8 4.89\n#> 9 28.2\n```\n:::\n\n\n## `stan` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(213)\npoisson_reg_fit <- \n poisson_reg_spec |> \n fit(weight ~ Diet + Time, data = reg_group_train)\n#> \n#> SAMPLING FOR MODEL 'count' NOW (CHAIN 1).\n#> Chain 1: \n#> Chain 1: Gradient evaluation took 8.9e-05 seconds\n#> Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.89 seconds.\n#> Chain 1: Adjust your expectations accordingly!\n#> Chain 1: \n#> 
Chain 1: \n#> Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)\n#> Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)\n#> Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)\n#> Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)\n#> Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)\n#> Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)\n#> Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)\n#> Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)\n#> Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)\n#> Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)\n#> Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)\n#> Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)\n#> Chain 1: \n#> Chain 1: Elapsed Time: 0.034 seconds (Warm-up)\n#> Chain 1: 0.035 seconds (Sampling)\n#> Chain 1: 0.069 seconds (Total)\n#> Chain 1: \n#> \n#> SAMPLING FOR MODEL 'count' NOW (CHAIN 2).\n#> Chain 2: \n#> Chain 2: Gradient evaluation took 6e-06 seconds\n#> Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.06 seconds.\n#> Chain 2: Adjust your expectations accordingly!\n#> Chain 2: \n#> Chain 2: \n#> Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)\n#> Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)\n#> Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)\n#> Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)\n#> Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)\n#> Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)\n#> Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)\n#> Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)\n#> Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)\n#> Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)\n#> Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)\n#> Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)\n#> Chain 2: \n#> Chain 2: Elapsed Time: 0.035 seconds (Warm-up)\n#> Chain 2: 0.034 seconds (Sampling)\n#> Chain 2: 0.069 seconds (Total)\n#> Chain 2: \n#> \n#> SAMPLING FOR MODEL 'count' NOW (CHAIN 3).\n#> Chain 3: \n#> Chain 3: Gradient evaluation took 5e-06 seconds\n#> Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.05 seconds.\n#> Chain 3: Adjust your expectations accordingly!\n#> Chain 3: \n#> Chain 3: \n#> Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)\n#> Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)\n#> Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)\n#> Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)\n#> Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)\n#> Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)\n#> Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)\n#> Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)\n#> Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)\n#> Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)\n#> Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)\n#> Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)\n#> Chain 3: \n#> Chain 3: Elapsed Time: 0.033 seconds (Warm-up)\n#> Chain 3: 0.035 seconds (Sampling)\n#> Chain 3: 0.068 seconds (Total)\n#> Chain 3: \n#> \n#> SAMPLING FOR MODEL 'count' NOW (CHAIN 4).\n#> Chain 4: \n#> Chain 4: Gradient evaluation took 5e-06 seconds\n#> Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.05 seconds.\n#> Chain 4: Adjust your expectations accordingly!\n#> Chain 4: \n#> Chain 4: \n#> Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)\n#> Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)\n#> Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)\n#> Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)\n#> Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)\n#> Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)\n#> Chain 4: Iteration: 1001 / 
2000 [ 50%] (Sampling)\n#> Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)\n#> Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)\n#> Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)\n#> Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)\n#> Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)\n#> Chain 4: \n#> Chain 4: Elapsed Time: 0.035 seconds (Warm-up)\n#> Chain 4: 0.036 seconds (Sampling)\n#> Chain 4: 0.071 seconds (Total)\n#> Chain 4:\npoisson_reg_fit\n#> parsnip model object\n#> \n#> stan_glm\n#> family: poisson [log]\n#> formula: weight ~ Diet + Time\n#> observations: 132\n#> predictors: 4\n#> ------\n#> Median MAD_SD\n#> (Intercept) 5.5 0.0 \n#> Diet2 0.5 0.0 \n#> Diet3 0.7 0.0 \n#> Time 0.0 0.0 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 5.53\n#> 2 5.54\n#> 3 5.55\n#> 4 5.56\n#> 5 5.57\n#> 6 5.58\n#> 7 5.59\n#> 8 5.59\n#> 9 5.60\n#> 10 5.61\n#> # ℹ 34 more rows\npredict(poisson_reg_fit, type = \"conf_int\", new_data = reg_group_test)\n#> Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 246. 257.\n#> 2 249. 259.\n#> 3 252. 261.\n#> 4 255. 263.\n#> 5 258. 266.\n#> 6 261. 269.\n#> 7 263. 272.\n#> 8 264. 272.\n#> 9 266. 275.\n#> 10 268. 278.\n#> # ℹ 34 more rows\npredict(poisson_reg_fit, type = \"pred_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 220 284\n#> 2 222 286\n#> 3 225 288\n#> 4 228 291\n#> 5 230 296\n#> 6 232 297\n#> 7 235 300\n#> 8 236 300\n#> 9 238 303\n#> 10 241 306\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `stan_glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(690)\npoisson_reg_fit <- \n poisson_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> stan_glmer\n#> family: poisson [log]\n#> formula: weight ~ Diet + Time + (1 | Rat)\n#> observations: 132\n#> ------\n#> Median MAD_SD\n#> (Intercept) 5.5 0.0 \n#> Diet2 0.5 0.0 \n#> Diet3 0.7 0.0 \n#> Time 0.0 0.0 \n#> \n#> Error terms:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 0.054 \n#> Num. 
levels: Rat 12 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 251.\n#> 2 254.\n#> 3 256.\n#> 4 259.\n#> 5 261.\n#> 6 264.\n#> 7 267.\n#> 8 268.\n#> 9 270.\n#> 10 272.\n#> # ℹ 34 more rows\npredict(poisson_reg_fit, type = \"pred_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 210. 294 \n#> 2 213 298 \n#> 3 214 301 \n#> 4 217 304 \n#> 5 220 306 \n#> 6 222 309 \n#> 7 223 313.\n#> 8 225 315 \n#> 9 226 317.\n#> 10 229 320 \n#> # ℹ 34 more rows\n```\n:::\n\n\n## `zeroinfl` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"zeroinfl\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\n#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> pscl::zeroinfl(formula = num_years ~ ., data = data)\n#> \n#> Count model coefficients (poisson with log link):\n#> (Intercept) age income \n#> 2.2912 0.2748 0.2821 \n#> \n#> Zero-inflation model coefficients (binomial with logit link):\n#> (Intercept) age income \n#> -48.26 -18.22 -11.72\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.5 \n#> 2 6.74\n#> 3 11.9 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.31\n#> 7 31.9 \n#> 8 4.93\n#> 9 28.2\n```\n:::\n\n\n:::\n\n## Random Forests (`rand_forest()`) \n\n:::{.panel-tabset}\n\n## `ranger` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n # and ranger is the default engine so there is no need to set that either.\n set_engine(\"ranger\", keep.inbag = TRUE) |> \n # However, we'll set the engine and use the keep.inbag=TRUE option so that we \n # can produce interval predictions. This is not generally required. 
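 # (The intervals are computed from ranger's infinitesimal jackknife standard errors.)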
\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(860)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Ranger result\n#> \n#> Call:\n#> ranger::ranger(x = maybe_data_frame(x), y = y, keep.inbag = ~TRUE, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1)) \n#> \n#> Type: Regression \n#> Number of trees: 500 \n#> Sample size: 92 \n#> Number of independent variables: 2 \n#> Mtry: 1 \n#> Target node size: 5 \n#> Variable importance mode: none \n#> Splitrule: variance \n#> OOB prediction error (MSE): 92.94531 \n#> R squared (OOB): 0.6816071\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.6\n#> 2 36.9\n#> 3 28.4\n#> 4 56.5\n#> 5 38.6\n#> 6 36.5\n#> 7 38.7\n#> 8 34.4\npredict(rand_forest_fit, type = \"conf_int\", new_data = reg_test)\n#> Warning in rInfJack(pred = result$predictions, inbag = inbag.counts, used.trees\n#> = 1:num.trees): Sample size <=20, no calibration performed.\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 18.1 29.1\n#> 2 32.6 41.1\n#> 3 24.0 32.9\n#> 4 45.4 67.7\n#> 5 33.0 44.3\n#> 6 32.0 41.0\n#> 7 35.1 42.3\n#> 8 28.4 40.3\n```\n:::\n\n\n## `aorsf` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(47)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random regression forest\n#> \n#> Linear combinations: Accelerated Linear regression\n#> N observations: 92\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 13.994\n#> Min observations in leaf: 5\n#> OOB stat value: 0.59\n#> OOB stat type: RSQ\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 25.2\n#> 2 36.4\n#> 3 29.7\n#> 4 55.5\n#> 5 42.3\n#> 6 38.5\n#> 7 40.7\n#> 8 52.7\n```\n:::\n\n\n## `grf` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"grf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(130)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n```\n:::\n\n\nThe 
holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\npredict(rand_forest_fit, type = \"conf_int\", new_data = reg_test)\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(211)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: drf\n#> Model ID: DRF_model_R_1763571327438_5397 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 22316 7\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 14 9.04000 14 43 30.86000\n#> \n#> \n#> H2ORegressionMetrics: drf\n#> ** Reported on training data. **\n#> ** Metrics reported on Out-Of-Bag training samples **\n#> \n#> MSE: 89.19785\n#> RMSE: 9.444462\n#> MAE: 7.597463\n#> RMSLE: 0.3303384\n#> Mean Residual Deviance : 89.19785\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.9\n#> 2 36.4\n#> 3 28.1\n#> 4 56.8\n#> 5 39.0\n#> 6 37.8\n#> 7 37.4\n#> 8 31.8\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(981)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\n```\n:::\n\n\nThe print method has a lot of output: \n\n
\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncapture.output(print(rand_forest_fit))[1:100] |> cat(sep = \"\\n\")\n#> parsnip model object\n#> \n#> $nodes\n#> $nodes[[1]]\n#> [1] root\n#> | [2] V2 <= 0.31678\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -0.89134 *\n#> | | | [6] V2 > -0.89134 *\n#> | [7] V2 > 0.31678\n#> | | [8] V3 <= -0.60316 *\n#> | | [9] V3 > -0.60316 *\n#> \n#> $nodes[[2]]\n#> [1] root\n#> | [2] V2 <= 0.62459\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -1.16452 *\n#> | | | [6] V2 > -1.16452\n#> | | | | [7] V3 <= -0.2359 *\n#> | | | | [8] V3 > -0.2359 *\n#> | [9] V2 > 0.62459 *\n#> \n#> $nodes[[3]]\n#> [1] root\n#> | [2] V2 <= 0.34564\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -1.19338 *\n#> | | | [6] V2 > -1.19338 *\n#> | [7] V2 > 0.34564\n#> | | [8] V2 <= 1.21134 *\n#> | | [9] V2 > 1.21134 *\n#> \n#> $nodes[[4]]\n#> [1] root\n#> | [2] V2 <= 0.34564\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V3 <= 0.25377 *\n#> | | | [6] V3 > 0.25377 *\n#> | [7] V2 > 0.34564\n#> | | [8] V3 <= -0.60316 *\n#> | | [9] V3 > -0.60316 *\n#> \n#> $nodes[[5]]\n#> [1] root\n#> | [2] V2 <= 0.62459\n#> | | [3] V3 <= -0.48074 *\n#> | | [4] V3 > -0.48074\n#> | | | [5] V2 <= -1.12604 *\n#> | | | [6] V2 > -1.12604\n#> | | | | [7] V3 <= -0.2359 *\n#> | | | | [8] V3 > -0.2359 *\n#> | [9] V2 > 0.62459 *\n#> \n#> $nodes[[6]]\n#> [1] root\n#> | [2] V2 <= 0.72078\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -0.84517 *\n#> | | | [6] V2 > -0.84517 *\n#> | [7] V2 > 0.72078 *\n#> \n#> $nodes[[7]]\n#> [1] root\n#> | [2] V2 <= 0.72078\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V3 <= -0.2359\n#> | | | | [6] V2 <= 0.24945 *\n#> | | | | [7] V2 > 0.24945 *\n#> | | | [8] V3 > -0.2359 *\n#> | [9] V2 > 0.72078 *\n#> \n#> $nodes[[8]]\n#> [1] root\n#> | [2] V2 <= 0.72078\n#> | | [3] V3 <= -0.48074 *\n#> | | [4] V3 > -0.48074\n#> | | | [5] V3 <= -0.2359 *\n#> | | | [6] V3 > -0.2359 *\n#> | [7] V2 > 0.72078 *\n#> \n#> $nodes[[9]]\n#> [1] root\n#> | [2] V2 <= 0.62459\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -0.23149\n#> | | | | [6] V2 <= -1.09526 *\n#> | | | | [7] V2 > -1.09526 *\n#> | | | [8] V2 > -0.23149 *\n#> | [9] V2 > 0.62459 *\n#> \n#> $nodes[[10]]\n#> [1] root\n```\n:::\n\n
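\n\nIf you would rather work with the ensemble programmatically than scan the long print, one option is to pull out the engine-level object with parsnip's `extract_fit_engine()`. A minimal sketch, using the `$nodes` element shown in the print above:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Pull the underlying partykit object out of the parsnip wrapper\npartykit_obj <- extract_fit_engine(rand_forest_fit)\n\n# For example, count the trees in the ensemble\nlength(partykit_obj$nodes)\n```\n:::\n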
\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 16.3\n#> 2 37.7\n#> 3 28.5\n#> 4 50.6\n#> 5 49.2\n#> 6 36.1\n#> 7 38.6\n#> 8 49.7\n```\n:::\n\n\n## `randomForest` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"randomForest\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(793)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> randomForest(x = maybe_data_frame(x), y = y) \n#> Type of random forest: regression\n#> Number of trees: 500\n#> No. of variables tried at each split: 1\n#> \n#> Mean of squared residuals: 90.38475\n#> % Var explained: 68.7\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.5\n#> 2 36.8\n#> 3 28.6\n#> 4 58.0\n#> 5 38.3\n#> 6 35.4\n#> 7 38.1\n#> 8 33.7\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n set_engine(\"spark\") |> \n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(157)\nrand_forest_fit <- rand_forest_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> RandomForestRegressionModel: uid=random_forest__5a153ba4_7b1f_4072_9e7c_6a00b51132e0, numTrees=20, numFeatures=8\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? 
x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 28.2\n#> 2 29.6\n#> 3 23.0\n#> 4 28.2\n#> 5 15.2\n#> 6 35.3\n#> 7 18.6\n#> 8 31.9\n#> 9 36.3\n#> 10 45.4\n#> # ℹ more rows\n```\n:::\n\n\n:::\n\n## Rule Fit (`rule_fit()`) \n\n:::{.panel-tabset}\n\n## `xrf` \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n # and xrf is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(431)\nrule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> An eXtreme RuleFit model of 179 rules.\n#> \n#> Original Formula:\n#> \n#> strength ~ cement + age\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 27.5\n#> 2 32.0\n#> 3 26.5\n#> 4 52.9\n#> 5 35.9\n#> 6 31.8\n#> 7 46.2\n#> 8 30.8\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(236)\nrule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: rulefit\n#> Model ID: RuleFit_model_R_1763571327438_5398 \n#> Rulefit Model Summary: \n#> family link regularization number_of_predictors_total\n#> 1 gaussian identity Lasso (lambda = 0.9516 ) 1917\n#> number_of_active_predictors number_of_iterations rule_ensemble_size\n#> 1 51 1 1915\n#> number_of_trees number_of_internal_trees min_depth max_depth mean_depth\n#> 1 150 150 0 5 4.00000\n#> min_leaves max_leaves mean_leaves\n#> 1 0 28 12.76667\n#> \n#> \n#> H2ORegressionMetrics: rulefit\n#> ** Reported on training data. 
**\n#> \n#> MSE: 90.45501\n#> RMSE: 9.510784\n#> MAE: 7.15224\n#> RMSLE: 0.3531064\n#> Mean Residual Deviance : 90.45501\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 26.9\n#> 2 35.5\n#> 3 26.9\n#> 4 50.1\n#> 5 42.1\n#> 6 34.5\n#> 7 39.3\n#> 8 40.8\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Linear Kernel) (`svm_linear()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"kernlab\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(strength ~ ., data = reg_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Linear (vanilla) kernel function. \n#> \n#> Number of Support Vectors : 85 \n#> \n#> Objective Function Value : -47.4495 \n#> Training error : 0.606701\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.4\n#> 2 30.9\n#> 3 21.7\n#> 4 47.1\n#> 5 36.4\n#> 6 33.4\n#> 7 34.2\n#> 8 43.2\n```\n:::\n\n\n## `LiblineaR` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n # and LiblineaR is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(strength ~ ., data = reg_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized L2-loss support vector regression primal (L2R_L2LOSS_SVR)\"\n#> \n#> $Type\n#> [1] 11\n#> \n#> $W\n#> cement age Bias\n#> [1,] 8.665447 5.486263 33.34299\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 31.9\n#> 2 30.1\n#> 3 21.5\n#> 4 50.9\n#> 5 39.9\n#> 6 35.0\n#> 7 36.0\n#> 8 48.3\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_spec <- svm_poly() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_fit <- svm_poly_spec |> fit(strength ~ ., data = reg_train)\n#> Setting default kernel parameters\nsvm_poly_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr 
(regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Polynomial kernel function. \n#> Hyperparameters : degree = 1 scale = 1 offset = 1 \n#> \n#> Number of Support Vectors : 85 \n#> \n#> Objective Function Value : -47.4495 \n#> Training error : 0.606702\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_poly_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.4\n#> 2 30.9\n#> 3 21.7\n#> 4 47.1\n#> 5 36.4\n#> 6 33.4\n#> 7 34.2\n#> 8 43.2\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(strength ~ ., data = reg_train)\nsvm_rbf_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Gaussian Radial Basis kernel function. \n#> Hyperparameter : sigma = 0.850174270140177 \n#> \n#> Number of Support Vectors : 79 \n#> \n#> Objective Function Value : -33.0277 \n#> Training error : 0.28361\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 20.0\n#> 2 41.3\n#> 3 26.0\n#> 4 53.5\n#> 5 35.2\n#> 6 34.7\n#> 7 36.2\n#> 8 42.3\n```\n:::\n\n\n\n:::\n\n# Censored Regression Models\n\nLet's simulate a data set using the prodlim and survival packages: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(survival)\n#> \n#> Attaching package: 'survival'\n#> The following object is masked from 'package:future':\n#> \n#> cluster\nlibrary(prodlim)\n\nset.seed(1000)\ncns_data <- \n SimSurv(250) |> \n mutate(event_time = Surv(time, event)) |> \n select(event_time, X1, X2)\n\ncns_split <- initial_split(cns_data, prop = 0.98)\ncns_split\n#> \n#> <245/5/250>\ncns_train <- training(cns_split)\ncns_test <- testing(cns_split)\n```\n:::\n\n\nFor some types of predictions, we need the _evaluation time(s)_ for the predictions. 
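These are the time points at which dynamic predictions, such as the survival probability (and, for some engines, the hazard), are evaluated for each new observation. 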
We'll use these three times to demonstrate: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\neval_times <- c(1, 3, 5)\n```\n:::\n\n\n\n## Bagged Decision Trees (`bag_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"censored regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit <- bag_tree_spec |> fit(event_time ~ ., data = cns_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> \n#> Bagging survival trees with 25 bootstrap replications \n#> \n#> Call: bagging.data.frame(formula = event_time ~ ., data = data, cp = ~0, \n#> minsplit = ~2)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.65\n#> 2 4.12\n#> 3 5.03\n#> 4 5.58\n#> 5 4.88\npredict(bag_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.993\n#> 2 3 0.864\n#> 3 5 0.638\n```\n:::\n\n\n:::\n\n## Boosted Decision Trees (`boost_tree()`) \n\n:::{.panel-tabset}\n\n## `mboost` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"censored regression\") |> \n set_engine(\"mboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(852)\nboost_tree_fit <- boost_tree_spec |> fit(event_time ~ ., data = cns_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> \n#> \t Model-based Boosting\n#> \n#> Call:\n#> mboost::blackboost(formula = formula, data = data, family = family, control = mboost::boost_control(), tree_controls = partykit::ctree_control(teststat = \"quadratic\", testtype = \"Teststatistic\", mincriterion = 0, minsplit = 10, minbucket = 4, maxdepth = 2, saveinfo = FALSE))\n#> \n#> \n#> \t Cox Partial Likelihood \n#> \n#> Loss function: \n#> \n#> Number of boosting iterations: mstop = 100 \n#> Step size: 0.1 \n#> Offset: 0 \n#> Number of baselearners: 1\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 6.51\n#> 2 3.92\n#> 3 4.51\n#> 4 7.17\n#> 5 
4.51\npredict(boost_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(boost_tree_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 0.00839\n#> 2 -1.14 \n#> 3 -0.823 \n#> 4 0.229 \n#> 5 -0.823\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.982\n#> 2 3 0.877\n#> 3 5 0.657\n```\n:::\n\n\n:::\n\n## Decision Tree (`decision_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"censored regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> $rpart\n#> n= 245 \n#> \n#> node), split, n, deviance, yval\n#> * denotes terminal node\n#> \n#> 1) root 245 329.03530 1.0000000 \n#> 2) X2< -0.09937043 110 119.05180 0.5464982 \n#> 4) X2< -0.9419799 41 42.43138 0.3153769 \n#> 8) X1< 0.5 20 12.93725 0.1541742 *\n#> 9) X1>=0.5 21 23.29519 0.5656502 *\n#> 5) X2>=-0.9419799 69 67.76223 0.7336317 *\n#> 3) X2>=-0.09937043 135 157.14990 1.7319010 \n#> 6) X1< 0.5 79 66.30972 1.2572690 *\n#> 7) X1>=0.5 56 69.62652 3.0428230 \n#> 14) X2< 1.222057 44 40.33335 2.5072040 *\n#> 15) X2>=1.222057 12 17.95790 6.3934130 *\n#> \n#> $survfit\n#> \n#> Call: prodlim::prodlim(formula = form, data = data)\n#> Stratified Kaplan-Meier estimator for the conditional event time survival function\n#> Discrete predictor variable: rpartFactor (0.154174164904031, 0.565650228981439, 0.733631734872791, 1.25726850344687, 2.50720371146533, 6.39341334321542)\n#> \n#> Right-censored response of a survival model\n#> \n#> No.Observations: 245 \n#> \n#> Pattern:\n#> Freq\n#> event 161 \n#> right.censored 84 \n#> \n#> $levels\n#> [1] \"0.154174164904031\" \"0.565650228981439\" \"0.733631734872791\"\n#> [4] \"1.25726850344687\" \"2.50720371146533\" \"6.39341334321542\" \n#> \n#> attr(,\"class\")\n#> [1] \"pecRpart\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 1.26\n#> 2 2.51\n#> 3 1.26\n#> 4 1.26\n#> 5 1.26\npredict(decision_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n 
slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.987\n#> 2 3 0.854\n#> 3 5 0.634\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> event_time ~ X1 + X2\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] X2 <= -0.36159\n#> | | [3] X1 <= 0: 13.804 (n = 41)\n#> | | [4] X1 > 0: 8.073 (n = 47)\n#> | [5] X2 > -0.36159\n#> | | [6] X1 <= 0: 6.274 (n = 89)\n#> | | [7] X1 > 0\n#> | | | [8] X2 <= 0.56098: 5.111 (n = 39)\n#> | | | [9] X2 > 0.56098: 2.713 (n = 29)\n#> \n#> Number of inner nodes: 4\n#> Number of terminal nodes: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 6.27\n#> 2 5.11\n#> 3 6.27\n#> 4 6.27\n#> 5 6.27\npredict(decision_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.989\n#> 2 3 0.871\n#> 3 5 0.649\n```\n:::\n\n\n:::\n\n## Proportional Hazards (`proportional_hazards()`) \n\n:::{.panel-tabset}\n\n## `survival` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and survival is the default engine so there is no need to set that either.\nproportional_hazards_spec <- proportional_hazards()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train)\nproportional_hazards_fit\n#> parsnip model object\n#> \n#> Call:\n#> survival::coxph(formula = event_time ~ ., data = data, model = TRUE, \n#> x = TRUE)\n#> \n#> coef exp(coef) se(coef) z p\n#> X1 0.99547 2.70599 0.16799 5.926 3.11e-09\n#> X2 0.91398 2.49422 0.09566 9.555 < 2e-16\n#> \n#> Likelihood ratio test=106.8 on 2 df, p=< 2.2e-16\n#> n= 245, number of events= 161\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(proportional_hazards_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.87\n#> 2 
4.16\n#> 3 4.62\n#> 4 5.19\n#> 5 4.41\npredict(proportional_hazards_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(proportional_hazards_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -0.111\n#> 2 -1.49 \n#> 3 -1.27 \n#> 4 -1.02 \n#> 5 -1.37\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.985\n#> 2 3 0.909\n#> 3 5 0.750\n```\n:::\n\n\n## `glmnet` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_spec <- proportional_hazards(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train)\nproportional_hazards_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = data_obj$x, y = data_obj$y, family = \"cox\", weights = weights, alpha = alpha, lambda = lambda) \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.39700\n#> 2 1 0.82 0.36170\n#> 3 1 1.51 0.32960\n#> 4 1 2.07 0.30030\n#> 5 1 2.54 0.27360\n#> 6 1 2.94 0.24930\n#> 7 2 3.28 0.22720\n#> 8 2 3.95 0.20700\n#> 9 2 4.50 0.18860\n#> 10 2 4.95 0.17180\n#> 11 2 5.33 0.15660\n#> 12 2 5.65 0.14270\n#> 13 2 5.91 0.13000\n#> 14 2 6.13 0.11840\n#> 15 2 6.31 0.10790\n#> 16 2 6.46 0.09833\n#> 17 2 6.58 0.08960\n#> 18 2 6.69 0.08164\n#> 19 2 6.77 0.07439\n#> 20 2 6.85 0.06778\n#> 21 2 6.91 0.06176\n#> 22 2 6.96 0.05627\n#> 23 2 7.00 0.05127\n#> 24 2 7.03 0.04672\n#> 25 2 7.06 0.04257\n#> 26 2 7.08 0.03879\n#> 27 2 7.10 0.03534\n#> 28 2 7.12 0.03220\n#> 29 2 7.13 0.02934\n#> 30 2 7.14 0.02673\n#> 31 2 7.15 0.02436\n#> 32 2 7.16 0.02219\n#> 33 2 7.17 0.02022\n#> 34 2 7.17 0.01843\n#> 35 2 7.18 0.01679\n#> 36 2 7.18 0.01530\n#> 37 2 7.18 0.01394\n#> 38 2 7.19 0.01270\n#> 39 2 7.19 0.01157\n#> 40 2 7.19 0.01054\n#> 41 2 7.19 0.00961\n#> 42 2 7.19 0.00875\n#> The training data has been saved for prediction.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(proportional_hazards_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.80\n#> 2 4.21\n#> 3 4.63\n#> 4 5.18\n#> 5 4.42\npredict(proportional_hazards_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(proportional_hazards_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -0.108\n#> 2 -1.43 \n#> 3 -1.23 \n#> 4 -0.993\n#> 5 -1.33\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit |> \n predict(type = \"survival\", new_data = cns_test, 
eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.984\n#> 2 3 0.906\n#> 3 5 0.743\n```\n:::\n\n\n:::\n\n## Random Forests (`rand_forest()`) \n\n:::{.panel-tabset}\n\n## `aorsf` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(2)\nrand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random survival forest\n#> \n#> Linear combinations: Accelerated Cox regression\n#> N observations: 245\n#> N events: 161\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 12.85\n#> Min observations in leaf: 5\n#> Min events in leaf: 1\n#> OOB stat value: 0.70\n#> OOB stat type: Harrell's C-index\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.93\n#> 2 3.85\n#> 3 4.41\n#> 4 5.43\n#> 5 4.34\npredict(rand_forest_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.999\n#> 2 3 0.873\n#> 3 5 0.627\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(89)\nrand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train)\n```\n:::\n\n\nThe print method has a lot of output: \n\n
\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncapture.output(print(rand_forest_fit))[1:100] |> cat(sep = \"\\n\")\n#> parsnip model object\n#> \n#> $nodes\n#> $nodes[[1]]\n#> [1] root\n#> | [2] V3 <= -0.16072\n#> | | [3] V2 <= 0\n#> | | | [4] V3 <= -1.68226 *\n#> | | | [5] V3 > -1.68226\n#> | | | | [6] V3 <= -0.65952 *\n#> | | | | [7] V3 > -0.65952 *\n#> | | [8] V2 > 0\n#> | | | [9] V3 <= -0.98243 *\n#> | | | [10] V3 > -0.98243\n#> | | | | [11] V3 <= -0.67216 *\n#> | | | | [12] V3 > -0.67216 *\n#> | [13] V3 > -0.16072\n#> | | [14] V2 <= 0\n#> | | | [15] V3 <= 0.95981\n#> | | | | [16] V3 <= 0.3117\n#> | | | | | [17] V3 <= 0.09688 *\n#> | | | | | [18] V3 > 0.09688 *\n#> | | | | [19] V3 > 0.3117\n#> | | | | | [20] V3 <= 0.40845 *\n#> | | | | | [21] V3 > 0.40845 *\n#> | | | [22] V3 > 0.95981 *\n#> | | [23] V2 > 0\n#> | | | [24] V3 <= 0.56098 *\n#> | | | [25] V3 > 0.56098 *\n#> \n#> $nodes[[2]]\n#> [1] root\n#> | [2] V3 <= -0.36618\n#> | | [3] V2 <= 0\n#> | | | [4] V3 <= -1.19881 *\n#> | | | [5] V3 > -1.19881 *\n#> | | [6] V2 > 0\n#> | | | [7] V3 <= -1.18263 *\n#> | | | [8] V3 > -1.18263\n#> | | | | [9] V3 <= -0.55449 *\n#> | | | | [10] V3 > -0.55449 *\n#> | [11] V3 > -0.36618\n#> | | [12] V2 <= 0\n#> | | | [13] V3 <= 0.3117\n#> | | | | [14] V3 <= -0.01851 *\n#> | | | | [15] V3 > -0.01851 *\n#> | | | [16] V3 > 0.3117\n#> | | | | [17] V3 <= 0.85976 *\n#> | | | | [18] V3 > 0.85976 *\n#> | | [19] V2 > 0\n#> | | | [20] V3 <= -0.04369 *\n#> | | | [21] V3 > -0.04369\n#> | | | | [22] V3 <= 0.56098 *\n#> | | | | [23] V3 > 0.56098\n#> | | | | | [24] V3 <= 1.22094 *\n#> | | | | | [25] V3 > 1.22094 *\n#> \n#> $nodes[[3]]\n#> [1] root\n#> | [2] V3 <= -0.46092\n#> | | [3] V2 <= 0\n#> | | | [4] V3 <= -1.65465 *\n#> | | | [5] V3 > -1.65465 *\n#> | | [6] V2 > 0\n#> | | | [7] V3 <= -1.36941 *\n#> | | | [8] V3 > -1.36941\n#> | | | | [9] V3 <= -0.83366 *\n#> | | | | [10] V3 > -0.83366 *\n#> | [11] V3 > -0.46092\n#> | | [12] V2 <= 0\n#> | | | [13] V3 <= -0.01851 *\n#> | | | [14] V3 > -0.01851\n#> | | | | [15] V3 <= 0.22967 *\n#> | | | | [16] V3 > 0.22967\n#> | | | | | [17] V3 <= 0.95368\n#> | | | | | | [18] V3 <= 0.68292 *\n#> | | | | | | [19] V3 > 0.68292 *\n#> | | | | | [20] V3 > 0.95368 *\n#> | | [21] V2 > 0\n#> | | | [22] V3 <= 0.15595 *\n#> | | | [23] V3 > 0.15595\n#> | | | | [24] V3 <= 0.51117 *\n#> | | | | [25] V3 > 0.51117 *\n#> \n#> $nodes[[4]]\n#> [1] root\n#> | [2] V3 <= -0.10421\n#> | | [3] V2 <= 0\n#> | | | [4] V3 <= -0.96818 *\n#> | | | [5] V3 > -0.96818\n#> | | | | [6] V3 <= -0.64682 *\n#> | | | | [7] V3 > -0.64682 *\n#> | | [8] V2 > 0\n#> | | | [9] V3 <= -0.83366 *\n#> | | | [10] V3 > -0.83366 *\n#> | [11] V3 > -0.10421\n#> | | [12] V2 <= 0\n#> | | | [13] V3 <= 0.14347 *\n#> | | | [14] V3 > 0.14347\n#> | | | | [15] V3 <= 1.20345\n```\n:::\n\n
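\n\n(As with the regression forest earlier, `extract_fit_engine()` can pull out the underlying partykit ensemble here as well if you would rather inspect individual trees than read the printed summary.)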
\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.22\n#> 2 4.12\n#> 3 3.87\n#> 4 4.82\n#> 5 3.87\npredict(rand_forest_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 1 \n#> 2 3 0.870\n#> 3 5 0.594\n```\n:::\n\n\n:::\n\n## Parametric Survival Models (`survival_reg()`) \n\n:::{.panel-tabset}\n\n## `survival` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and survival is the default engine so there is no need to set that either.\nsurvival_reg_spec <- survival_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)\nsurvival_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> survival::survreg(formula = event_time ~ ., data = data, model = TRUE)\n#> \n#> Coefficients:\n#> (Intercept) X1 X2 \n#> 2.2351722 -0.4648296 -0.4222887 \n#> \n#> Scale= 0.4728442 \n#> \n#> Loglik(model)= -427.4 Loglik(intercept only)= -481.3\n#> \tChisq= 107.73 on 2 degrees of freedom, p= <2e-16 \n#> n= 245\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(survival_reg_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 8.88\n#> 2 4.67\n#> 3 5.20\n#> 4 5.83\n#> 5 4.97\npredict(survival_reg_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"hazard\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 2.18\n#> 2 1.54\n#> 3 1.65\n#> 4 1.76\n#> 5 1.60\npredict(survival_reg_fit, type = \"quantile\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_quantile\n#> \n#> 1 [7.47]\n#> 2 [3.92]\n#> 3 [4.37]\n#> 4 [4.9]\n#> 5 [4.18]\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.990\n#> 2 3 0.904\n#> 3 5 0.743\n```\n:::\n\n\n## `flexsurv` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model 
specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_spec <- survival_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"flexsurv\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)\nsurvival_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> flexsurv::flexsurvreg(formula = event_time ~ ., data = data, \n#> dist = \"weibull\")\n#> \n#> Estimates: \n#> data mean est L95% U95% se exp(est) L95% \n#> shape NA 2.11486 1.87774 2.38192 0.12832 NA NA\n#> scale NA 9.34809 8.38852 10.41743 0.51658 NA NA\n#> X1 0.46939 -0.46483 -0.61347 -0.31619 0.07584 0.62824 0.54147\n#> X2 -0.00874 -0.42229 -0.50641 -0.33817 0.04292 0.65554 0.60266\n#> U95% \n#> shape NA\n#> scale NA\n#> X1 0.72892\n#> X2 0.71307\n#> \n#> N = 245, Events: 161, Censored: 84\n#> Total time at risk: 1388.951\n#> Log-likelihood = -427.4387, df = 4\n#> AIC = 862.8774\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(survival_reg_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.87\n#> 2 4.13\n#> 3 4.61\n#> 4 5.16\n#> 5 4.40\npredict(survival_reg_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"hazard\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 2.18\n#> 2 1.54\n#> 3 1.65\n#> 4 1.76\n#> 5 1.60\npredict(survival_reg_fit, type = \"quantile\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_quantile\n#> \n#> 1 [7.47]\n#> 2 [3.92]\n#> 3 [4.37]\n#> 4 [4.9]\n#> 5 [4.18]\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.990\n#> 2 3 0.904\n#> 3 5 0.743\n```\n:::\n\n\n## `flexsurvspline` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_spec <- survival_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"flexsurvspline\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)\nsurvival_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> flexsurv::flexsurvspline(formula = event_time ~ ., data = data)\n#> \n#> Estimates: \n#> data mean est L95% U95% se exp(est) L95% \n#> gamma0 NA -4.72712 -5.31772 -4.13651 0.30134 NA NA\n#> gamma1 NA 2.11487 1.86338 2.36637 0.12832 NA NA\n#> X1 0.46939 0.98305 0.65928 1.30683 0.16519 2.67261 1.93340\n#> X2 -0.00874 0.89308 0.70943 1.07673 0.09370 2.44265 2.03283\n#> U95% \n#> gamma0 NA\n#> gamma1 NA\n#> X1 3.69444\n#> X2 
2.93508\n#> \n#> N = 245, Events: 161, Censored: 84\n#> Total time at risk: 1388.951\n#> Log-likelihood = -427.4387, df = 4\n#> AIC = 862.8774\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(survival_reg_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.87\n#> 2 4.13\n#> 3 4.61\n#> 4 5.16\n#> 5 4.40\npredict(survival_reg_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"hazard\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -4.62\n#> 2 -3.26\n#> 3 -3.49\n#> 4 -3.73\n#> 5 -3.39\npredict(survival_reg_fit, type = \"quantile\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_quantile\n#> \n#> 1 [7.47]\n#> 2 [3.92]\n#> 3 [4.37]\n#> 4 [4.9]\n#> 5 [4.18]\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.990\n#> 2 3 0.904\n#> 3 5 0.743\n```\n:::\n\n\n:::\n\n# Quantile Regression Models\n\nTo demonstrate quantile regression, let's make a larger version of our regression data: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(938)\nqnt_split <-\n modeldata::concrete |> \n slice_sample(n = 100) |> \n select(strength = compressive_strength, cement, age) |> \n initial_split(prop = 0.95, strata = strength)\nqnt_split\n#> \n#> <92/8/100>\n\nqnt_rec <- \n recipe(strength ~ ., data = training(qnt_split)) |> \n step_normalize(all_numeric_predictors()) |> \n prep()\n\nqnt_train <- bake(qnt_rec, new_data = NULL)\nqnt_test <- bake(qnt_rec, new_data = testing(qnt_split))\n```\n:::\n\n\nWe'll also predict these quantile levels: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nqnt_lvls <- (1:3) / 4\n```\n:::\n\n\n## Linear Regression (`linear_reg()`) \n\n:::{.panel-tabset}\n\n## `quantreg` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n set_engine(\"quantreg\") |> \n set_mode(\"quantile regression\", quantile_levels = qnt_lvls)\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = qnt_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> quantreg::rq(formula = strength ~ ., tau = quantile_levels, data = data)\n#> \n#> Coefficients:\n#> tau= 0.25 tau= 0.50 tau= 0.75\n#> (Intercept) 23.498029 33.265428 42.046031\n#> cement 6.635233 7.955658 8.181235\n#> age 5.566668 9.514832 7.110702\n#> \n#> Degrees of freedom: 92 total; 89 residual\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, type = \"quantile\", new_data = qnt_test)\n#> # A tibble: 8 × 1\n#> .pred_quantile\n#> \n#> 1 [29.2]\n#> 2 [31.5]\n#> 3 [21.4]\n#> 4 [48.3]\n#> 5 [36.6]\n#> 6 [33.8]\n#> 7 [34.6]\n#> 8 [43.8]\n```\n:::\n\n\nEach row of predictions 
has a special vector class containing all of the quantile predictions: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit |> \n predict(type = \"quantile\", new_data = qnt_test)|> \n slice(1) |> \n pluck(\".pred_quantile\") |> \n # Expand the results for each quantile level by converting to a tibble\n as_tibble()\n#> # A tibble: 3 × 3\n#> .pred_quantile .quantile_levels .row\n#> \n#> 1 21.5 0.25 1\n#> 2 29.2 0.5 1\n#> 3 39.5 0.75 1\n```\n:::\n\n\n:::\n\n## Random Forests (`rand_forest()`) \n\n:::{.panel-tabset}\n\n## `grf` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n set_engine(\"grf\") |> \n set_mode(\"quantile regression\", quantile_levels = qnt_lvls)\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(435)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = qnt_train)\nrand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"quantile\", new_data = qnt_test)\n```\n:::\n\n\nEach row of predictions has a special vector class containing all of the quantile predictions: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit |> \n predict(type = \"quantile\", new_data = qnt_test)|> \n slice(1) |> \n pluck(\".pred_quantile\") |> \n # Expand the results for each quantile level by converting to a tibble\n as_tibble()\n```\n:::\n\n\n:::\n\n\n\n", + "supporting": [ + "index_files" + ], "filters": [ "rmarkdown/pagebreak.lua" ], diff --git a/installs.R b/installs.R index 9ac6f12d..62b584c8 100644 --- a/installs.R +++ b/installs.R @@ -89,7 +89,15 @@ packages <- c( "pscl", "coin", "pec", - "flexsurv" + "flexsurv", + "agua", + "bonsai", + "multilevelmod", + "sparklyr", + "HSAUR3", + "lme4", + "survival", + "gee" ) pak::pak(packages) From 82052586bec57163d9113caf2a0f2a0f1d5594fe Mon Sep 17 00:00:00 2001 From: Edgar Ruiz <77294576+edgararuiz@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:43:03 -0600 Subject: [PATCH 23/23] Installs parsnip from branch, adds missing set_engine() and re-enables the other example that was affected by the issue --- .../parsnip-predictions/index/execute-results/html.json | 4 ++-- learn/models/parsnip-predictions/index.qmd | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/_freeze/learn/models/parsnip-predictions/index/execute-results/html.json b/_freeze/learn/models/parsnip-predictions/index/execute-results/html.json index 2236c48a..9099464d 100644 --- a/_freeze/learn/models/parsnip-predictions/index/execute-results/html.json +++ b/_freeze/learn/models/parsnip-predictions/index/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "dff776495a405e30e27b4af8898dacff", + "hash": "0fa75413e84db534cedd43cb05c12d53", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Fitting and predicting with parsnip\"\ncategories:\n - model fitting\n - parsnip\n - regression\n - classification\ntype: learn-subsection\nweight: 1\ndescription: | \n Examples that show how to fit and predict with different combinations of model, mode, and engine.\ntoc: true\ntoc-depth: 3\ninclude-after-body: ../../../resources.html\nformat:\n html:\n theme: [\"style.scss\"]\n---\n\n\n\n\n\n\n# Introduction\n\nThis page shows examples of how to *fit* and *predict* with 
different combinations of model, mode, and engine. As a reminder, in parsnip, \n\n- the **model type** differentiates basic modeling approaches, such as random forests, logistic regression, linear support vector machines, etc.,\n\n- the **mode** denotes in what kind of modeling context it will be used (most commonly, classification or regression), and\n\n- the computational **engine** indicates how the model is fit, such as with a specific R package implementation or even methods outside of R like Keras or Stan.\n\nWe'll break the examples up by their mode. For each model, we'll show the data sets used across its different engines. \n\nTo use code in this article, you will need to install the following packages: agua, baguette, bonsai, censored, discrim, HSAUR3, lme4, multilevelmod, plsmod, poissonreg, prodlim, rules, sparklyr, survival, and tidymodels. There are numerous other \"engine\" packages that are required. If you use a model that requires one or more packages that are not installed, parsnip will prompt you to install them. There are some packages that require non-standard installation or rely on external dependencies. We'll describe these next. \n\n## External Dependencies\n\nSome models available in parsnip rely on other computational frameworks. There may be some additional downloads for engines using **catboost**, **Spark**, **h2o**, **tensorflow**/**keras**, and **torch**. You can expand the sections below to get basic installation instructions.\n\n<details>
\n\n### catboost\n\ncatboost is a popular boosting framework. Unfortunately, the R package is not available on CRAN. First, go to [https://github.com/catboost/catboost/releases/](https://github.com/catboost/catboost/releases/) and search for \"`[R-package]`\" to find the most recent release. \n\nThe following code can be used to install and test the package (which requires the glue package to be installed): \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(glue)\n\n# Put the current version number in this variable: \nversion <- \"#.##\"\n\n# Note that this template points at the macOS binary; substitute the artifact\n# name for your platform.\ntemplate <- \"https://github.com/catboost/catboost/releases/download/v{version}/catboost-R-darwin-universal2-{version}.tgz\"\n\ntarget_url <- glue::glue(template)\ntarget_dest <- tempfile()\ndownload.file(target_url, target_dest)\n\nif (grepl(\"^mac\", .Platform$pkgType)) {\n options <- \"--no-staged-install\"\n} else {\n options <- character(0)\n}\n\ninst <- glue::glue(\"R CMD INSTALL {options} {target_dest}\")\nsystem(inst)\n```\n:::\n\n\nTo test, fit an example model: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(catboost)\n\ntrain_pool_path <- system.file(\"extdata\", \"adult_train.1000\", package = \"catboost\")\ntest_pool_path <- system.file(\"extdata\", \"adult_test.1000\", package = \"catboost\")\ncd_path <- system.file(\"extdata\", \"adult.cd\", package = \"catboost\")\ntrain_pool <- catboost.load_pool(train_pool_path, column_description = cd_path)\ntest_pool <- catboost.load_pool(test_pool_path, column_description = cd_path)\nfit_params <- list(\n iterations = 100,\n loss_function = 'Logloss',\n ignored_features = c(4, 9),\n border_count = 32,\n depth = 5,\n learning_rate = 0.03,\n l2_leaf_reg = 3.5,\n train_dir = tempdir())\n\n# Fit a small example model to confirm that the installation works\nmodel <- catboost.train(train_pool, test_pool, fit_params)\n```\n:::\n\n\n### Apache Spark\n\nTo use [Apache Spark](https://spark.apache.org/) as an engine, we will first install Spark and then need a connection to a cluster. For this article, we will set up and use a single-node Spark cluster running on a laptop.\n\nTo install, first install sparklyr:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ninstall.packages(\"sparklyr\")\n```\n:::\n\n\nand then install the Spark backend. For example, you might use: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nspark_install(version = \"4.0\")\n```\n:::\n\n\nOnce that is working, you can get ready to fit models using: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nsc <- spark_connect(\"local\")\n#> Warning in sprintf(version$pattern, version$spark, version$hadoop): 2 arguments\n#> not used by format 'spark-4.1.0-preview3-bin-hadoop3'\n```\n:::\n\n\n### h2o \n\nh2o.ai offers a Java-based high-performance computing server for machine learning. This can be run locally or externally. There are general installation instructions at [https://docs.h2o.ai/](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html). There is a package on CRAN, but you can also install directly from [h2o](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html#install-in-r) via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ninstall.packages(\n \"h2o\",\n type = \"source\",\n repos = \"http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R\"\n)\n```\n:::\n\n\nAfter installation is complete, you can start a local server via `h2o::h2o.init()`. \n\nThe tidymodels [agua](https://agua.tidymodels.org/) package contains some helpers and will also need to be installed. 
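agua is on CRAN, so a plain `install.packages(\"agua\")` is all that is needed. 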
You can use its `h2o_start()` function to start a server too:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n#> \n#> Attaching package: 'agua'\n#> The following object is masked from 'package:workflowsets':\n#> \n#> rank_results\nh2o_start()\n#> Warning: JAVA not found, H2O may take minutes trying to connect.\n#> Warning in h2o.clusterInfo(): \n#> Your H2O cluster version is (1 year, 11 months and 4 days) old. There may be a newer version available.\n#> Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html\n```\n:::\n\n\n### Tensorflow and Keras\n\nR's tensorflow and keras3 packages call Python directly. To enable this, you'll first have to install the keras3 R package: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ninstall.packages(\"keras3\")\n```\n:::\n\n\nOnce that is done, use: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nkeras3::install_keras(backend = \"tensorflow\")\n```\n:::\n\n\nThere are other options for installation. See [https://tensorflow.rstudio.com/install/index.html](https://tensorflow.rstudio.com/install/index.html) for more details. If you maintain multiple Python environments, you can point reticulate at the one that contains your TensorFlow installation: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Assumes you are going to use a virtual environment with \"tensorflow\" in its name\npve <- grep(\"tensorflow\", reticulate::virtualenv_list(), value = TRUE)\nreticulate::use_virtualenv(pve)\n```\n:::\n\n\n### Torch\n\nR's torch package is the low-level package containing the framework. Once you have installed it, you will get this message the first time you load the package: \n\n> Additional software needs to be downloaded and installed for torch to work correctly.\n\nChoosing \"Yes\" will do the _one-time_ installation. \n\n</details>
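\n\nIf you prefer to trigger that download non-interactively (for example, on a continuous integration machine), the same step is exposed as a function; a minimal sketch:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# One-time download of the additional libraries that torch needs\ntorch::install_torch()\n```\n:::\n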


To get started, let's load the tidymodels package: 


::: {.cell layout-align="center"}

```{.r .cell-code}
library(tidymodels)
theme_set(theme_bw() + theme(legend.position = "top"))
```
:::


# Classification Models

To demonstrate classification, let's make small training and test sets for a binary outcome. We'll center and scale the data since some models require the predictors to be in the same units.


::: {.cell layout-align="center"}

```{.r .cell-code}
set.seed(207)
bin_split <- 
  modeldata::two_class_dat |> 
  rename(class = Class) |> 
  initial_split(prop = 0.994, strata = class)
bin_split
#> 
#> <785/6/791>

bin_rec <- 
  recipe(class ~ ., data = training(bin_split)) |> 
  step_normalize(all_numeric_predictors()) |> 
  prep()

bin_train <- bake(bin_rec, new_data = NULL)
bin_test <- bake(bin_rec, new_data = testing(bin_split))
```
:::


For models that _only_ work for three or more classes, we'll simulate:


::: {.cell layout-align="center"}

```{.r .cell-code}
set.seed(1752)
mtl_data <-
  sim_multinomial(
    200,
    ~ -0.5 + 0.6 * abs(A),
    ~ ifelse(A > 0 & B > 0, 1.0 + 0.2 * A / B, - 2),
    ~ A + B - A * B)

mtl_split <- initial_split(mtl_data, prop = 0.967, strata = class)
mtl_split
#> 
#> <192/8/200>

# Predictors are in the same units
mtl_train <- training(mtl_split)
mtl_test <- testing(mtl_split)
```
:::


Finally, we have some models that handle hierarchical data, where some rows are statistically correlated with other rows. For these examples, we'll use data from a clinical trial where patients were followed over time. The outcome is binary. The data are in the HSAUR3 package. We'll split these data in a way where all rows for a specific subject are either in the training set or the test set: 


::: {.cell layout-align="center"}

```{.r .cell-code}
set.seed(72)
cls_group_split <- 
  HSAUR3::toenail |> 
  group_initial_split(group = patientID)
cls_group_train <- training(cls_group_split)
cls_group_test <- testing(cls_group_split)
```
:::


There are 219 subjects in the training set and 75 in the test set. 

If using the **Apache Spark** engine, we will need to identify the data source and then use it to create the splits.
For this article, we will copy the `two_class_dat` and the `mtl_data` data sets into the Spark session.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nsc <- spark_connect(\"local\")\n#> Re-using existing Spark connection to local\n\ntbl_two_class <- copy_to(sc, modeldata::two_class_dat)\n\ntbl_bin <- sdf_random_split(tbl_two_class, training = 0.994, test = 1-0.994, seed = 100)\n\ntbl_sim_mtl <- copy_to(sc, mtl_data)\n\ntbl_mtl <- sdf_random_split(tbl_sim_mtl, training = 0.967, test = 1-0.967, seed = 100)\n```\n:::\n\n\n\n## Bagged MARS (`bag_mars()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_spec <- bag_mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(268)\nbag_mars_fit <- bag_mars_spec |> fit(class ~ ., data = bin_train)\n#> \n#> Attaching package: 'plotrix'\n#> The following object is masked from 'package:scales':\n#> \n#> rescale\n#> Registered S3 method overwritten by 'butcher':\n#> method from \n#> as.character.dev_topic generics\nbag_mars_fit\n#> parsnip model object\n#> \n#> Bagged MARS (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 100 0 11\n#> 2 A 40.4 1.60 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mars_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_mars_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.452 0.548 \n#> 2 0.854 0.146 \n#> 3 0.455 0.545 \n#> 4 0.968 0.0316\n#> 5 0.939 0.0610\n#> 6 0.872 0.128\n```\n:::\n\n\n:::\n\n## Bagged Neural Networks (`bag_mlp()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_spec <- bag_mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(318)\nbag_mlp_fit <- bag_mlp_spec |> fit(class ~ ., data = bin_train)\nbag_mlp_fit\n#> parsnip model object\n#> \n#> Bagged nnet (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 A 52.1 2.16 11\n#> 2 B 47.9 2.16 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.439 0.561\n#> 2 0.676 0.324\n#> 3 0.428 0.572\n#> 4 0.727 0.273\n#> 5 0.709 0.291\n#> 6 0.660 0.340\n```\n:::\n\n\n:::\n\n## Bagged Decision Trees (`bag_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(985)\nbag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged CART (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 271. 4.35 11\n#> 2 A 237. 5.58 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0 1 \n#> 2 1 0 \n#> 3 0.0909 0.909 \n#> 4 1 0 \n#> 5 0.727 0.273 \n#> 6 0.909 0.0909\n```\n:::\n\n\n## `C5.0` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(937)\nbag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged C5.0 (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 100 0 11\n#> 2 A 48.7 7.33 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.269 0.731\n#> 2 0.863 0.137\n#> 3 0.259 0.741\n#> 4 0.897 0.103\n#> 5 0.897 0.103\n#> 6 0.870 0.130\n```\n:::\n\n\n:::\n\n## Bayesian Additive Regression Trees 
(`bart()`) \n\n:::{.panel-tabset}\n\n## `dbarts` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_spec <- bart() |>\n # We need to set the mode since this engine works with multiple modes\n # and dbarts is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(217)\nbart_fit <- bart_spec |> fit(class ~ ., data = bin_train)\nbart_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> `NULL`()\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bart_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bart_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.439 0.561\n#> 2 0.734 0.266\n#> 3 0.34 0.66 \n#> 4 0.957 0.043\n#> 5 0.931 0.069\n#> 6 0.782 0.218\npredict(bart_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_lower_Class2 .pred_upper_Class1 .pred_upper_Class2\n#> \n#> 1 0.815 0.00280 0.997 0.185\n#> 2 0.781 0.0223 0.978 0.219\n#> 3 0.558 0.0702 0.930 0.442\n#> 4 0.540 0.105 0.895 0.460\n#> 5 0.239 0.345 0.655 0.761\n#> 6 0.195 0.469 0.531 0.805\npredict(bart_fit, type = \"pred_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_lower_Class2 .pred_upper_Class1 .pred_upper_Class2\n#> \n#> 1 0 0 1 1\n#> 2 0 0 1 1\n#> 3 0 0 1 1\n#> 4 0 0 1 1\n#> 5 0 0 1 1\n#> 6 0 0 1 1\n```\n:::\n\n\n:::\n\n## Boosted Decision Trees (`boost_tree()`) \n\n:::{.panel-tabset}\n\n## `xgboost` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and xgboost is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(738)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> ##### xgb.Booster\n#> raw: 40.4 Kb \n#> call:\n#> xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, \n#> colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, \n#> subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist, \n#> verbose = 0, nthread = 1, objective = \"binary:logistic\")\n#> params (as set within xgb.train):\n#> eta = \"0.3\", max_depth = \"6\", gamma = \"0\", colsample_bytree = \"1\", colsample_bynode = \"1\", min_child_weight = \"1\", subsample = \"1\", nthread = \"1\", objective = \"binary:logistic\", validate_parameters = \"TRUE\"\n#> xgb.attributes:\n#> niter\n#> callbacks:\n#> cb.evaluation.log()\n#> # of features: 2 \n#> niter: 15\n#> nfeatures : 2 \n#> evaluation_log:\n#> iter training_logloss\n#> \n#> 1 0.5546750\n#> 2 0.4719804\n#> --- ---\n#> 14 0.2587640\n#> 15 0.2528938\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", 
new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.244 0.756 \n#> 2 0.770 0.230 \n#> 3 0.307 0.693 \n#> 4 0.944 0.0565\n#> 5 0.821 0.179 \n#> 6 0.938 0.0621\n```\n:::\n\n\n## `C5.0` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(984)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = 15, control = C50::C5.0Control(minCases\n#> = 2, sample = 0))\n#> \n#> Classification Tree\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Number of boosting iterations: 15 requested; 7 used due to early stopping\n#> Average tree size: 3.1 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.307 0.693\n#> 2 0.756 0.244\n#> 3 0.281 0.719\n#> 4 1 0 \n#> 5 1 0 \n#> 6 0.626 0.374\n```\n:::\n\n\n## `catboost` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"catboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(644)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> CatBoost model (1000 trees)\n#> Loss function: Logloss\n#> Fit to 2 feature(s)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.291 0.709 \n#> 2 0.836 0.164 \n#> 3 0.344 0.656 \n#> 4 0.998 0.00245\n#> 5 0.864 0.136 \n#> 6 0.902 0.0983\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We 
need to set the mode since this engine works with multiple modes
  set_mode("classification") |>
  set_engine("h2o")
```
:::


Now we create the model fit object:


::: {.cell layout-align="center"}

```{.r .cell-code}
# Set the random number seed to an integer for reproducibility: 
set.seed(186)
boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)
boost_tree_fit
#> parsnip model object
#> 
#> Model Details:
#> ==============
#> 
#> H2OBinomialModel: gbm
#> Model ID: GBM_model_R_1763571327438_5073 
#> Model Summary: 
#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth
#> 1 50 50 25379 6
#> max_depth mean_depth min_leaves max_leaves mean_leaves
#> 1 6 6.00000 21 55 35.70000
#> 
#> 
#> H2OBinomialMetrics: gbm
#> ** Reported on training data. **
#> 
#> MSE: 0.007948832
#> RMSE: 0.08915622
#> LogLoss: 0.05942305
#> Mean Per-Class Error: 0
#> AUC: 1
#> AUCPR: 1
#> Gini: 1
#> R^2: 0.9678452
#> 
#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
#> Class1 Class2 Error Rate
#> Class1 434 0 0.000000 =0/434
#> Class2 0 351 0.000000 =0/351
#> Totals 434 351 0.000000 =0/785
#> 
#> Maximum Metrics: Maximum metrics at their respective thresholds
#> metric threshold value idx
#> 1 max f1 0.598690 1.000000 200
#> 2 max f2 0.598690 1.000000 200
#> 3 max f0point5 0.598690 1.000000 200
#> 4 max accuracy 0.598690 1.000000 200
#> 5 max precision 0.998192 1.000000 0
#> 6 max recall 0.598690 1.000000 200
#> 7 max specificity 0.998192 1.000000 0
#> 8 max absolute_mcc 0.598690 1.000000 200
#> 9 max min_per_class_accuracy 0.598690 1.000000 200
#> 10 max mean_per_class_accuracy 0.598690 1.000000 200
#> 11 max tns 0.998192 434.000000 0
#> 12 max fns 0.998192 349.000000 0
#> 13 max fps 0.000831 434.000000 399
#> 14 max tps 0.598690 351.000000 200
#> 15 max tnr 0.998192 1.000000 0
#> 16 max fnr 0.998192 0.994302 0
#> 17 max fpr 0.000831 1.000000 399
#> 18 max tpr 0.598690 1.000000 200
#> 
#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`
```
:::


The holdout data can be predicted:


::: {.cell layout-align="center"}

```{.r .cell-code}
predict(boost_tree_fit, type = "class", new_data = bin_test)
#> # A tibble: 6 × 1
#> .pred_class
#> 
#> 1 Class2 
#> 2 Class1 
#> 3 Class2 
#> 4 Class1 
#> 5 Class1 
#> 6 Class1
predict(boost_tree_fit, type = "prob", new_data = bin_test)
#> # A tibble: 6 × 2
#> .pred_Class1 .pred_Class2
#> 
#> 1 0.0496 0.950 
#> 2 0.905 0.0953 
#> 3 0.0738 0.926 
#> 4 0.997 0.00273
#> 5 0.979 0.0206 
#> 6 0.878 0.122
```
:::


## `h2o_gbm` 

This engine requires the agua extension package, so let's load this first:


::: {.cell layout-align="center"}

```{.r .cell-code}
library(agua)
```
:::


We create a model specification via:


::: {.cell layout-align="center"}

```{.r .cell-code}
boost_tree_spec <- boost_tree() |>
  # We need to set the mode since this engine works with multiple modes
  set_mode("classification") |>
  set_engine("h2o_gbm")
```
:::


Now we create the model fit object:


::: {.cell layout-align="center"}

```{.r .cell-code}
# Set the random number seed to an integer for reproducibility: 
set.seed(724)
boost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)
boost_tree_fit
#> parsnip model object
#> 
#> Model Details:
#> ==============
#> 
#> H2OBinomialModel: gbm
#> Model ID: 
GBM_model_R_1763571327438_5125 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 25379 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 21 55 35.70000\n#> \n#> \n#> H2OBinomialMetrics: gbm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.007948832\n#> RMSE: 0.08915622\n#> LogLoss: 0.05942305\n#> Mean Per-Class Error: 0\n#> AUC: 1\n#> AUCPR: 1\n#> Gini: 1\n#> R^2: 0.9678452\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 434 0 0.000000 =0/434\n#> Class2 0 351 0.000000 =0/351\n#> Totals 434 351 0.000000 =0/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.598690 1.000000 200\n#> 2 max f2 0.598690 1.000000 200\n#> 3 max f0point5 0.598690 1.000000 200\n#> 4 max accuracy 0.598690 1.000000 200\n#> 5 max precision 0.998192 1.000000 0\n#> 6 max recall 0.598690 1.000000 200\n#> 7 max specificity 0.998192 1.000000 0\n#> 8 max absolute_mcc 0.598690 1.000000 200\n#> 9 max min_per_class_accuracy 0.598690 1.000000 200\n#> 10 max mean_per_class_accuracy 0.598690 1.000000 200\n#> 11 max tns 0.998192 434.000000 0\n#> 12 max fns 0.998192 349.000000 0\n#> 13 max fps 0.000831 434.000000 399\n#> 14 max tps 0.598690 351.000000 200\n#> 15 max tnr 0.998192 1.000000 0\n#> 16 max fnr 0.998192 0.994302 0\n#> 17 max fpr 0.000831 1.000000 399\n#> 18 max tpr 0.598690 1.000000 200\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.0496 0.950 \n#> 2 0.905 0.0953 \n#> 3 0.0738 0.926 \n#> 4 0.997 0.00273\n#> 5 0.979 0.0206 \n#> 6 0.878 0.122\n```\n:::\n\n\n## `lightgbm` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"lightgbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(906)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> LightGBM Model (100 trees)\n#> Objective: binary\n#> Fitted to dataset with 2 columns\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.147 0.853 \n#> 2 0.930 0.0699\n#> 3 0.237 0.763 \n#> 4 0.990 0.0101\n#> 5 0.929 0.0714\n#> 6 
0.956 0.0445\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(285)\nboost_tree_fit <- boost_tree_spec |> fit(Class ~ ., data = tbl_bin$training)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> GBTClassificationModel: uid = gradient_boosted_trees__254e29b6_2f3f_43c5_b7d4_b4473d59cf31, numTrees=20, numClasses=2, numFeatures=2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(boost_tree_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.307 0.693 \n#> 2 0.292 0.708 \n#> 3 0.856 0.144 \n#> 4 0.192 0.808 \n#> 5 0.332 0.668 \n#> 6 0.952 0.0476\n#> 7 0.0865 0.914\n```\n:::\n\n\n:::\n\n## C5 Rules (`C5_rules()`) \n\n:::{.panel-tabset}\n\n## `C5.0` \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and C5.0 is the default engine so there is no need to set that either.\nC5_rules_spec <- C5_rules()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(93)\nC5_rules_fit <- C5_rules_spec |> fit(class ~ ., data = bin_train)\nC5_rules_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = trials, rules = TRUE, control\n#> = C50::C5.0Control(minCases = minCases, seed = sample.int(10^5,\n#> 1), earlyStopping = FALSE))\n#> \n#> Rule-Based Model\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Number of Rules: 4 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(C5_rules_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(C5_rules_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 1 0\n#> 2 1 0\n#> 3 0 1\n#> 4 1 0\n#> 5 1 0\n#> 6 1 0\n```\n:::\n\n\n:::\n\n## Decision Tree (`decision_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> n= 785 \n#> \n#> node), split, n, loss, yval, (yprob)\n#> * denotes terminal node\n#> \n#> 1) root 785 351 Class1 (0.5528662 0.4471338) \n#> 2) B< -0.06526451 399 61 Class1 (0.8471178 0.1528822) *\n#> 3) B>=-0.06526451 386 96 Class2 (0.2487047 0.7512953) \n#> 6) B< 0.7339337 194 72 Class2 (0.3711340 0.6288660) \n#> 12) A>=0.6073948 49 13 Class1 (0.7346939 0.2653061) *\n#> 13) A< 0.6073948 145 36 Class2 (0.2482759 0.7517241) *\n#> 7) B>=0.7339337 192 24 Class2 (0.1250000 0.8750000) *\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.735 0.265\n#> 2 0.847 0.153\n#> 3 0.248 0.752\n#> 4 0.847 0.153\n#> 5 0.847 0.153\n#> 6 0.847 0.153\n```\n:::\n\n\n## `C5.0` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = 1, control = C50::C5.0Control(minCases =\n#> 2, sample = 0))\n#> \n#> Classification Tree\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Tree size: 4 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.732 0.268\n#> 2 0.846 0.154\n#> 3 0.236 0.764\n#> 4 0.846 0.154\n#> 5 0.846 0.154\n#> 6 0.846 0.154\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> class ~ A + B\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] B <= -0.06906\n#> | | [3] B <= -0.50486: Class1 (n = 291, err = 8.2%)\n#> | | [4] B > -0.50486\n#> | | | [5] A <= -0.07243: Class1 (n = 77, err = 45.5%)\n#> | | | [6] A > -0.07243: Class1 (n = 31, err = 6.5%)\n#> | [7] B > -0.06906\n#> | | [8] B <= 
0.72938
#> | | | [9] A <= 0.60196: Class2 (n = 145, err = 24.8%)
#> | | | [10] A > 0.60196
#> | | | | [11] B <= 0.44701: Class1 (n = 23, err = 4.3%)
#> | | | | [12] B > 0.44701: Class1 (n = 26, err = 46.2%)
#> | | [13] B > 0.72938: Class2 (n = 192, err = 12.5%)
#> 
#> Number of inner nodes: 6
#> Number of terminal nodes: 7
```
:::


The holdout data can be predicted:


::: {.cell layout-align="center"}

```{.r .cell-code}
predict(decision_tree_fit, type = "class", new_data = bin_test)
#> # A tibble: 6 × 1
#> .pred_class
#> 
#> 1 Class1 
#> 2 Class1 
#> 3 Class2 
#> 4 Class1 
#> 5 Class1 
#> 6 Class1
predict(decision_tree_fit, type = "prob", new_data = bin_test)
#> # A tibble: 6 × 2
#> .pred_Class1 .pred_Class2
#> 
#> 1 0.538 0.462 
#> 2 0.935 0.0645
#> 3 0.248 0.752 
#> 4 0.918 0.0825
#> 5 0.918 0.0825
#> 6 0.935 0.0645
```
:::


## `spark` 

We create a model specification via:


::: {.cell layout-align="center"}

```{.r .cell-code}
decision_tree_spec <- decision_tree() |>
  set_mode("classification") |>
  set_engine("spark")
```
:::


Now we create the model fit object:


::: {.cell layout-align="center"}

```{.r .cell-code}
decision_tree_fit <- decision_tree_spec |> fit(Class ~ ., data = tbl_bin$training)
decision_tree_fit
#> parsnip model object
#> 
#> n= 784 
#> 
#> node), split, n, loss, yval, (yprob)
#> * denotes terminal node
#> 
#> 1) root 784 350 Class1 (0.5535714 0.4464286) 
#> 2) B< 1.495535 401 62 Class1 (0.8453865 0.1546135) *
#> 3) B>=1.495535 383 95 Class2 (0.2480418 0.7519582) 
#> 6) B< 2.079458 192 71 Class2 (0.3697917 0.6302083) 
#> 12) A>=2.572663 50 14 Class1 (0.7200000 0.2800000) *
#> 13) A< 2.572663 142 35 Class2 (0.2464789 0.7535211) *
#> 7) B>=2.079458 191 24 Class2 (0.1256545 0.8743455) *
```
:::


The holdout data can be predicted:


::: {.cell layout-align="center"}

```{.r .cell-code}
predict(decision_tree_fit, type = "class", new_data = tbl_bin$test)
#> # A tibble: 7 × 1
#> .pred_class
#> 
#> 1 
#> 2 
#> 3 
#> 4 
#> 5 
#> 6 
#> 7 
predict(decision_tree_fit, type = "prob", new_data = tbl_bin$test)
#> # A tibble: 7 × 2
#> .pred_Class1 .pred_Class2
#> 
#> 1 0.246 0.754
#> 2 0.246 0.754
#> 3 0.845 0.155
#> 4 0.246 0.754
#> 5 0.246 0.754
#> 6 0.845 0.155
#> 7 0.126 0.874
```
:::


:::

## Flexible Discriminant Analysis (`discrim_flexible()`) 

:::{.panel-tabset}

## `earth` 

This engine requires the discrim extension package, so let's load this first:


::: {.cell layout-align="center"}

```{.r .cell-code}
library(discrim)
```
:::


We create a model specification via:


::: {.cell layout-align="center"}

```{.r .cell-code}
# This engine works with a single mode so no need to set that
# and earth is the default engine so there is no need to set that either.
discrim_flexible_spec <- discrim_flexible()
```
:::


Now we create the model fit object:


::: {.cell layout-align="center"}

```{.r .cell-code}
discrim_flexible_fit <- discrim_flexible_spec |> fit(class ~ ., data = bin_train)
discrim_flexible_fit
#> parsnip model object
#> 
#> Call:
#> mda::fda(formula = class ~ ., data = data, method = earth::earth)
#> 
#> Dimension: 1 
#> 
#> Percent Between-Group Variance Explained:
#> v1 
#> 100 
#> 
#> Training Misclassification Error: 0.1707 ( N = 785 )
```
:::


The holdout data can be predicted:


::: {.cell layout-align="center"}

```{.r .cell-code}
predict(discrim_flexible_fit, type = 
\"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_flexible_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.339 0.661 \n#> 2 0.848 0.152 \n#> 3 0.342 0.658 \n#> 4 0.964 0.0360\n#> 5 0.964 0.0360\n#> 6 0.875 0.125\n```\n:::\n\n\n:::\n\n## Linear Discriminant Analysis (`discrim_linear()`) \n\n:::{.panel-tabset}\n\n## `MASS` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and MASS is the default engine so there is no need to set that either.\ndiscrim_linear_spec <- discrim_linear()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Call:\n#> lda(class ~ ., data = data)\n#> \n#> Prior probabilities of groups:\n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> Group means:\n#> A B\n#> Class1 -0.2982900 -0.5573140\n#> Class2 0.3688258 0.6891006\n#> \n#> Coefficients of linear discriminants:\n#> LD1\n#> A -0.6068479\n#> B 1.7079953\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.369 0.631 \n#> 2 0.868 0.132 \n#> 3 0.541 0.459 \n#> 4 0.984 0.0158\n#> 5 0.928 0.0718\n#> 6 0.854 0.146\n```\n:::\n\n\n## `mda` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_spec <- discrim_linear() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"mda\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Call:\n#> mda::fda(formula = class ~ ., data = data, method = mda::gen.ridge, \n#> keep.fitted = FALSE)\n#> \n#> Dimension: 1 \n#> \n#> Percent Between-Group Variance Explained:\n#> v1 \n#> 100 \n#> \n#> Degrees of Freedom (per dimension): 1.99423 \n#> \n#> Training Misclassification Error: 0.17707 ( N = 785 )\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.368 0.632 \n#> 2 0.867 0.133 \n#> 3 0.542 0.458 \n#> 4 0.984 
0.0158
#> 5 0.928 0.0718
#> 6 0.853 0.147
```
:::


## `sda` 

This engine requires the discrim extension package, so let's load this first:


::: {.cell layout-align="center"}

```{.r .cell-code}
library(discrim)
```
:::


We create a model specification via:


::: {.cell layout-align="center"}

```{.r .cell-code}
discrim_linear_spec <- discrim_linear() |> 
  # This engine works with a single mode so no need to set that
  set_engine("sda")
```
:::


Now we create the model fit object:


::: {.cell layout-align="center"}

```{.r .cell-code}
discrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)
discrim_linear_fit
#> parsnip model object
#> 
#> $regularization
#> lambda lambda.var lambda.freqs 
#> 0.003136201 0.067551534 0.112819609 
#> 
#> $freqs
#> Class1 Class2 
#> 0.5469019 0.4530981 
#> 
#> $alpha
#> Class1 Class2 
#> -0.8934125 -1.2349286 
#> 
#> $beta
#> A B
#> Class1 0.4565325 -1.298858
#> Class2 -0.5510473 1.567757
#> attr(,"class")
#> [1] "shrinkage"
#> 
#> attr(,"class")
#> [1] "sda"
```
:::


The holdout data can be predicted:


::: {.cell layout-align="center"}

```{.r .cell-code}
predict(discrim_linear_fit, type = "class", new_data = bin_test)
#> # A tibble: 6 × 1
#> .pred_class
#> 
#> 1 Class2 
#> 2 Class1 
#> 3 Class1 
#> 4 Class1 
#> 5 Class1 
#> 6 Class1
predict(discrim_linear_fit, type = "prob", new_data = bin_test)
#> # A tibble: 6 × 2
#> .pred_Class1 .pred_Class2
#> 
#> 1 0.366 0.634 
#> 2 0.860 0.140 
#> 3 0.536 0.464 
#> 4 0.982 0.0176
#> 5 0.923 0.0768
#> 6 0.845 0.155
```
:::


## `sparsediscrim` 

This engine requires the discrim extension package, so let's load this first:


::: {.cell layout-align="center"}

```{.r .cell-code}
library(discrim)
```
:::


We create a model specification via:


::: {.cell layout-align="center"}

```{.r .cell-code}
discrim_linear_spec <- discrim_linear() |> 
  # This engine works with a single mode so no need to set that
  set_engine("sparsediscrim")
```
:::


Now we create the model fit object:


::: {.cell layout-align="center"}

```{.r .cell-code}
discrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)
discrim_linear_fit
#> parsnip model object
#> 
#> Diagonal LDA
#> 
#> Sample Size: 785 
#> Number of Features: 2 
#> 
#> Classes and Prior Probabilities:
#> Class1 (55.29%), Class2 (44.71%)
```
:::


The holdout data can be predicted:


::: {.cell layout-align="center"}

```{.r .cell-code}
predict(discrim_linear_fit, type = "class", new_data = bin_test)
#> # A tibble: 6 × 1
#> .pred_class
#> 
#> 1 Class2 
#> 2 Class1 
#> 3 Class2 
#> 4 Class1 
#> 5 Class1 
#> 6 Class1
predict(discrim_linear_fit, type = "prob", new_data = bin_test)
#> # A tibble: 6 × 2
#> .pred_Class1 .pred_Class2
#> 
#> 1 0.182 0.818 
#> 2 0.755 0.245 
#> 3 0.552 0.448 
#> 4 0.996 0.00372
#> 5 0.973 0.0274 
#> 6 0.629 0.371
```
:::


:::

## Quadratic Discriminant Analysis (`discrim_quad()`) 

:::{.panel-tabset}

## `MASS` 

This engine requires the discrim extension package, so let's load this first:


::: {.cell layout-align="center"}

```{.r .cell-code}
library(discrim)
```
:::


We create a model specification via:


::: {.cell layout-align="center"}

```{.r .cell-code}
discrim_quad_spec <- discrim_quad()
  # This engine works with a single mode so no need to set that
  # and MASS is 
the default engine so there is no need to set that either.\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_fit <- discrim_quad_spec |> fit(class ~ ., data = bin_train)\ndiscrim_quad_fit\n#> parsnip model object\n#> \n#> Call:\n#> qda(class ~ ., data = data)\n#> \n#> Prior probabilities of groups:\n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> Group means:\n#> A B\n#> Class1 -0.2982900 -0.5573140\n#> Class2 0.3688258 0.6891006\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_quad_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_quad_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.340 0.660 \n#> 2 0.884 0.116 \n#> 3 0.500 0.500 \n#> 4 0.965 0.0349\n#> 5 0.895 0.105 \n#> 6 0.895 0.105\n```\n:::\n\n\n## `sparsediscrim` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_spec <- discrim_quad() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"sparsediscrim\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_fit <- discrim_quad_spec |> fit(class ~ ., data = bin_train)\ndiscrim_quad_fit\n#> parsnip model object\n#> \n#> Diagonal QDA\n#> \n#> Sample Size: 785 \n#> Number of Features: 2 \n#> \n#> Classes and Prior Probabilities:\n#> Class1 (55.29%), Class2 (44.71%)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_quad_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_quad_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.180 0.820 \n#> 2 0.750 0.250 \n#> 3 0.556 0.444 \n#> 4 0.994 0.00634\n#> 5 0.967 0.0328 \n#> 6 0.630 0.370\n```\n:::\n\n\n:::\n\n## Regularized Discriminant Analysis (`discrim_regularized()`) \n\n:::{.panel-tabset}\n\n## `klaR` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and klaR is the default engine so there is no need to set that either.\ndiscrim_regularized_spec <- discrim_regularized()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_regularized_fit <- discrim_regularized_spec |> fit(class ~ ., data = bin_train)\ndiscrim_regularized_fit\n#> parsnip model object\n#> \n#> Call: \n#> rda(formula = class ~ ., data = data)\n#> \n#> Regularization parameters: \n#> gamma lambda \n#> 0.0005969518 0.0131575746 \n#> \n#> Prior probabilities of groups: \n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> Misclassification rate: \n#> apparent: 
17.707 %\n#> cross-validated: 17.682 %\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_regularized_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_regularized_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.340 0.660 \n#> 2 0.883 0.117 \n#> 3 0.501 0.499 \n#> 4 0.965 0.0346\n#> 5 0.895 0.105 \n#> 6 0.894 0.106\n```\n:::\n\n\n:::\n\n## Generalized Additive Models (`gen_additive_mod()`) \n\n:::{.panel-tabset}\n\n## `mgcv` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_spec <- gen_additive_mod() |>\n # We need to set the mode since this engine works with multiple modes\n # and mgcv is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_fit <- \n gen_additive_mod_spec |> \n fit(class ~ s(A) + s(B), data = bin_train)\ngen_additive_mod_fit\n#> parsnip model object\n#> \n#> \n#> Family: binomial \n#> Link function: logit \n#> \n#> Formula:\n#> class ~ s(A) + s(B)\n#> \n#> Estimated degrees of freedom:\n#> 2.76 4.22 total = 7.98 \n#> \n#> UBRE score: -0.153537\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(gen_additive_mod_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(gen_additive_mod_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.400 0.600 \n#> 2 0.826 0.174 \n#> 3 0.454 0.546 \n#> 4 0.975 0.0250\n#> 5 0.929 0.0711\n#> 6 0.829 0.171\npredict(gen_additive_mod_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0.304 0.504 0.496 0.696\n#> 2 0.739 0.889 0.111 0.261\n#> 3 0.364 0.546 0.454 0.636\n#> 4 0.846 0.996 0.00358 0.154\n#> 5 0.881 0.958 0.0416 0.119\n#> 6 0.735 0.894 0.106 0.265\n```\n:::\n\n\n:::\n\n## Logistic Regression (`logistic_reg()`) \n\n:::{.panel-tabset}\n\n## `glm` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg()\n # This engine works with a single mode so no need to set that\n # and glm is the default engine so there is no need to set that either.\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = class ~ ., family = stats::binomial, data = data)\n#> \n#> Coefficients:\n#> (Intercept) A B \n#> -0.3563 -1.1250 2.8154 \n#> \n#> Degrees of Freedom: 784 Total (i.e. 
Null); 782 Residual\n#> Null Deviance:\t 1079 \n#> Residual Deviance: 666.9 \tAIC: 672.9\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.400 0.600 \n#> 2 0.862 0.138 \n#> 3 0.541 0.459 \n#> 4 0.977 0.0234\n#> 5 0.909 0.0905\n#> 6 0.853 0.147\npredict(logistic_reg_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0.339 0.465 0.535 0.661 \n#> 2 0.816 0.897 0.103 0.184 \n#> 3 0.493 0.588 0.412 0.507 \n#> 4 0.960 0.986 0.0137 0.0395\n#> 5 0.875 0.935 0.0647 0.125 \n#> 6 0.800 0.894 0.106 0.200\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(466)\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Logistic regression\n#> \n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> batch size: 707 \n#> validation loss after 1 epoch: 0.283\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.412 0.588 \n#> 2 0.854 0.146 \n#> 3 0.537 0.463 \n#> 4 0.971 0.0294\n#> 5 0.896 0.104 \n#> 6 0.848 0.152\n```\n:::\n\n\n## `gee` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- \n logistic_reg_spec |> \n fit(outcome ~ treatment * visit + id_var(patientID), data = cls_group_train)\n#> Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27\n#> running glm to get initial regression estimate\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> \n#> GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA\n#> gee S-function, version 4.13 modified 98/01/27 (1998) \n#> \n#> Model:\n#> Link: Logit \n#> Variance to Mean Relation: Binomial \n#> Correlation Structure: Independent \n#> \n#> Call:\n#> gee::gee(formula = outcome ~ treatment + visit, id = data$patientID, \n#> data = data, family = binomial)\n#> \n#> Number of observations : 1433 \n#> \n#> 
Maximum cluster size : 7 \n#> \n#> \n#> Coefficients:\n#> (Intercept) treatmentterbinafine visit \n#> -0.06853546 -0.25700680 -0.35646522 \n#> \n#> Estimated Scale Parameter: 0.9903994\n#> Number of Iterations: 1\n#> \n#> Working Correlation[1:4,1:4]\n#> [,1] [,2] [,3] [,4]\n#> [1,] 1 0 0 0\n#> [2,] 0 1 0 0\n#> [3,] 0 0 1 0\n#> [4,] 0 0 0 1\n#> \n#> \n#> Returned Error Value:\n#> [1] 0\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = cls_group_test)\n#> # A tibble: 475 × 1\n#> .pred_class \n#> \n#> 1 none or mild\n#> 2 none or mild\n#> 3 none or mild\n#> 4 none or mild\n#> 5 none or mild\n#> 6 none or mild\n#> 7 none or mild\n#> 8 none or mild\n#> 9 none or mild\n#> 10 none or mild\n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"prob\", new_data = cls_group_test)\n#> # A tibble: 475 × 2\n#> `.pred_none or mild` `.pred_moderate or severe`\n#> \n#> 1 0.664 0.336 \n#> 2 0.739 0.261 \n#> 3 0.801 0.199 \n#> 4 0.852 0.148 \n#> 5 0.892 0.108 \n#> 6 0.922 0.0784\n#> 7 0.944 0.0562\n#> 8 0.605 0.395 \n#> 9 0.686 0.314 \n#> 10 0.757 0.243 \n#> # ℹ 465 more rows\n```\n:::\n\n\n## `glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- \n logistic_reg_spec |> \n fit(outcome ~ treatment * visit + (1 | patientID), data = cls_group_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Generalized linear mixed model fit by maximum likelihood (Laplace\n#> Approximation) [glmerMod]\n#> Family: binomial ( logit )\n#> Formula: outcome ~ treatment * visit + (1 | patientID)\n#> Data: data\n#> AIC BIC logLik -2*log(L) df.resid \n#> 863.8271 890.1647 -426.9135 853.8271 1428 \n#> Random effects:\n#> Groups Name Std.Dev.\n#> patientID (Intercept) 8.35 \n#> Number of obs: 1433, groups: patientID, 219\n#> Fixed Effects:\n#> (Intercept) treatmentterbinafine \n#> -4.57420 -0.51193 \n#> visit treatmentterbinafine:visit \n#> -0.98725 -0.00112\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = cls_group_test)\n#> # A tibble: 475 × 1\n#> .pred_class \n#> \n#> 1 none or mild\n#> 2 none or mild\n#> 3 none or mild\n#> 4 none or mild\n#> 5 none or mild\n#> 6 none or mild\n#> 7 none or mild\n#> 8 none or mild\n#> 9 none or mild\n#> 10 none or mild\n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"prob\", new_data = cls_group_test)\n#> # A tibble: 475 × 2\n#> `.pred_none or mild` `.pred_moderate or severe`\n#> \n#> 1 0.998 0.00230 \n#> 2 0.999 0.000856 \n#> 3 1.000 0.000319 \n#> 4 1.000 0.000119 \n#> 5 1.000 0.0000441 \n#> 6 1.000 0.0000164 \n#> 7 1.000 0.00000612\n#> 8 0.996 0.00383 \n#> 9 0.999 0.00143 \n#> 10 0.999 0.000533 \n#> # ℹ 465 more rows\n```\n:::\n\n\n## `glmnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg(penalty = 0.01) |> \n # This engine works with a single mode so 
no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"binomial\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.308300\n#> 2 1 4.75 0.280900\n#> 3 1 8.73 0.256000\n#> 4 1 12.10 0.233200\n#> 5 1 14.99 0.212500\n#> 6 1 17.46 0.193600\n#> 7 1 19.60 0.176400\n#> 8 1 21.45 0.160800\n#> 9 1 23.05 0.146500\n#> 10 1 24.44 0.133500\n#> 11 1 25.65 0.121600\n#> 12 1 26.70 0.110800\n#> 13 1 27.61 0.101000\n#> 14 1 28.40 0.091990\n#> 15 1 29.08 0.083820\n#> 16 1 29.68 0.076370\n#> 17 1 30.19 0.069590\n#> 18 1 30.63 0.063410\n#> 19 1 31.00 0.057770\n#> 20 1 31.33 0.052640\n#> 21 1 31.61 0.047960\n#> 22 1 31.85 0.043700\n#> 23 1 32.05 0.039820\n#> 24 2 32.62 0.036280\n#> 25 2 33.41 0.033060\n#> 26 2 34.10 0.030120\n#> 27 2 34.68 0.027450\n#> 28 2 35.19 0.025010\n#> 29 2 35.63 0.022790\n#> 30 2 36.01 0.020760\n#> 31 2 36.33 0.018920\n#> 32 2 36.62 0.017240\n#> 33 2 36.86 0.015710\n#> 34 2 37.06 0.014310\n#> 35 2 37.24 0.013040\n#> 36 2 37.39 0.011880\n#> 37 2 37.52 0.010830\n#> 38 2 37.63 0.009864\n#> 39 2 37.72 0.008988\n#> 40 2 37.80 0.008189\n#> 41 2 37.86 0.007462\n#> 42 2 37.92 0.006799\n#> 43 2 37.97 0.006195\n#> 44 2 38.01 0.005644\n#> 45 2 38.04 0.005143\n#> 46 2 38.07 0.004686\n#> 47 2 38.10 0.004270\n#> 48 2 38.12 0.003891\n#> 49 2 38.13 0.003545\n#> 50 2 38.15 0.003230\n#> 51 2 38.16 0.002943\n#> 52 2 38.17 0.002682\n#> 53 2 38.18 0.002443\n#> 54 2 38.18 0.002226\n#> 55 2 38.19 0.002029\n#> 56 2 38.19 0.001848\n#> 57 2 38.20 0.001684\n#> 58 2 38.20 0.001534\n#> 59 2 38.20 0.001398\n#> 60 2 38.21 0.001274\n#> 61 2 38.21 0.001161\n#> 62 2 38.21 0.001058\n#> 63 2 38.21 0.000964\n#> 64 2 38.21 0.000878\n#> 65 2 38.21 0.000800\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.383 0.617 \n#> 2 0.816 0.184 \n#> 3 0.537 0.463 \n#> 4 0.969 0.0313\n#> 5 0.894 0.106 \n#> 6 0.797 0.203\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: glm\n#> Model ID: GLM_model_R_1763571327438_5177 \n#> GLM Model: summary\n#> family link regularization\n#> 1 binomial logit Elastic Net (alpha = 0.5, lambda = 6.162E-4 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 4\n#> training_frame\n#> 1 object_zkelygexok\n#> \n#> Coefficients: glm coefficients\n#> 
names coefficients standardized_coefficients\n#> 1 Intercept -0.350788 -0.350788\n#> 2 A -1.084233 -1.084233\n#> 3 B 2.759366 2.759366\n#> \n#> H2OBinomialMetrics: glm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.130451\n#> RMSE: 0.3611799\n#> LogLoss: 0.4248206\n#> Mean Per-Class Error: 0.1722728\n#> AUC: 0.8889644\n#> AUCPR: 0.8520865\n#> Gini: 0.7779288\n#> R^2: 0.4722968\n#> Residual Deviance: 666.9684\n#> AIC: 672.9684\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 350 84 0.193548 =84/434\n#> Class2 53 298 0.150997 =53/351\n#> Totals 403 382 0.174522 =137/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.411045 0.813097 213\n#> 2 max f2 0.229916 0.868991 279\n#> 3 max f0point5 0.565922 0.816135 166\n#> 4 max accuracy 0.503565 0.826752 185\n#> 5 max precision 0.997356 1.000000 0\n#> 6 max recall 0.009705 1.000000 395\n#> 7 max specificity 0.997356 1.000000 0\n#> 8 max absolute_mcc 0.411045 0.652014 213\n#> 9 max min_per_class_accuracy 0.454298 0.822581 201\n#> 10 max mean_per_class_accuracy 0.411045 0.827727 213\n#> 11 max tns 0.997356 434.000000 0\n#> 12 max fns 0.997356 349.000000 0\n#> 13 max fps 0.001723 434.000000 399\n#> 14 max tps 0.009705 351.000000 395\n#> 15 max tnr 0.997356 1.000000 0\n#> 16 max fnr 0.997356 0.994302 0\n#> 17 max fpr 0.001723 1.000000 399\n#> 18 max tpr 0.009705 1.000000 395\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.399 0.601 \n#> 2 0.857 0.143 \n#> 3 0.540 0.460 \n#> 4 0.976 0.0243\n#> 5 0.908 0.0925\n#> 6 0.848 0.152\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(730)\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\n```\n:::\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Model: \"sequential\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense (Dense) (None, 1) 3 \n#> dense_1 (Dense) (None, 2) 4 \n#> ================================================================================\n#> Total params: 7 (28.00 Byte)\n#> Trainable params: 7 (28.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> ________________________________________________________________________________\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> 1/1 - 0s - 
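\n\nIf you need the engine's native model object (here, the underlying keras model), `extract_fit_engine()` returns it so that the engine's own methods can be used directly:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# The raw keras model; summary() prints its architecture\nkeras_obj <- extract_fit_engine(logistic_reg_fit)\nsummary(keras_obj)\n```\n:::\n\n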
92ms/epoch - 92ms/step\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class2\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> 1/1 - 0s - 6ms/epoch - 6ms/step\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.214 0.786 \n#> 2 0.633 0.367 \n#> 3 0.584 0.416 \n#> 4 0.990 0.00975\n#> 5 0.955 0.0449 \n#> 6 0.477 0.523\n```\n:::\n\n\n## `LiblineaR` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"LiblineaR\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized logistic regression primal (L2R_LR)\"\n#> \n#> $Type\n#> [1] 0\n#> \n#> $W\n#> A B Bias\n#> [1,] 1.014233 -2.65166 0.3363362\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $ClassNames\n#> [1] Class1 Class2\n#> Levels: Class1 Class2\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.397 0.603 \n#> 2 0.847 0.153 \n#> 3 0.539 0.461 \n#> 4 0.973 0.0267\n#> 5 0.903 0.0974\n#> 6 0.837 0.163\n```\n:::\n\n\n## `stan` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(96)\nlogistic_reg_fit <- \n logistic_reg_spec |> \n fit(outcome ~ treatment * visit, data = cls_group_train)\nlogistic_reg_fit |> print(digits = 3)\n#> parsnip model object\n#> \n#> stan_glm\n#> family: binomial [logit]\n#> formula: outcome ~ treatment * visit\n#> observations: 1433\n#> predictors: 4\n#> ------\n#> Median MAD_SD\n#> (Intercept) -0.137 0.187\n#> treatmentterbinafine -0.108 0.264\n#> visit -0.335 0.050\n#> treatmentterbinafine:visit -0.048 0.073\n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = cls_group_test)\n#> # A tibble: 475 × 1\n#> .pred_class \n#> \n#> 1 none or mild\n#> 2 none or mild\n#> 3 none or mild\n#> 4 none or mild\n#> 5 none or mild\n#> 6 none or mild\n#> 7 none or mild\n#> 8 none or mild\n#> 9 none or mild\n#> 10 none or mild\n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"prob\", new_data = cls_group_test)\n#> # A tibble: 475 × 2\n#> `.pred_none or mild` `.pred_moderate or severe`\n#> \n#> 1 0.652 0.348 \n#> 2 0.734 0.266 \n#> 3 0.802 0.198 \n#> 4 0.856 0.144 \n#> 5 0.898 0.102 \n#> 6 
0.928 0.0721\n#> 7 0.950 0.0502\n#> 8 0.617 0.383 \n#> 9 0.692 0.308 \n#> 10 0.759 0.241 \n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"conf_int\", new_data = cls_group_test)\n#> # A tibble: 475 × 4\n#> `.pred_lower_none or mild` `.pred_upper_none or mild` .pred_lower_moderate …¹\n#> \n#> 1 0.583 0.715 0.285 \n#> 2 0.689 0.776 0.224 \n#> 3 0.771 0.832 0.168 \n#> 4 0.827 0.883 0.117 \n#> 5 0.868 0.924 0.0761\n#> 6 0.899 0.952 0.0482\n#> 7 0.922 0.970 0.0302\n#> 8 0.547 0.683 0.317 \n#> 9 0.644 0.736 0.264 \n#> 10 0.723 0.791 0.209 \n#> # ℹ 465 more rows\n#> # ℹ abbreviated name: ¹​`.pred_lower_moderate or severe`\n#> # ℹ 1 more variable: `.pred_upper_moderate or severe` \npredict(logistic_reg_fit, type = \"pred_int\", new_data = cls_group_test)\n#> # A tibble: 475 × 4\n#> `.pred_lower_none or mild` `.pred_upper_none or mild` .pred_lower_moderate …¹\n#> \n#> 1 0 1 0\n#> 2 0 1 0\n#> 3 0 1 0\n#> 4 0 1 0\n#> 5 0 1 0\n#> 6 0 1 0\n#> 7 0 1 0\n#> 8 0 1 0\n#> 9 0 1 0\n#> 10 0 1 0\n#> # ℹ 465 more rows\n#> # ℹ abbreviated name: ¹​`.pred_lower_moderate or severe`\n#> # ℹ 1 more variable: `.pred_upper_moderate or severe` \n```\n:::\n\n\n## `stan_glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(484)\nlogistic_reg_fit <- \n logistic_reg_spec |> \n fit(outcome ~ treatment * visit + (1 | patientID), data = cls_group_train)\nlogistic_reg_fit |> print(digits = 3)\n#> parsnip model object\n#> \n#> stan_glmer\n#> family: binomial [logit]\n#> formula: outcome ~ treatment * visit + (1 | patientID)\n#> observations: 1433\n#> ------\n#> Median MAD_SD\n#> (Intercept) -0.628 0.585\n#> treatmentterbinafine -0.686 0.821\n#> visit -0.830 0.105\n#> treatmentterbinafine:visit -0.023 0.143\n#> \n#> Error terms:\n#> Groups Name Std.Dev.\n#> patientID (Intercept) 4.376 \n#> Num. 
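\n\nFor the Bayesian engines, the width of these intervals can be changed from the default of 95% via the `level` argument of `predict()`; a short sketch:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# 90% posterior intervals instead of the default 95%\npredict(logistic_reg_fit, type = \"conf_int\", new_data = cls_group_test, level = 0.90)\n```\n:::\n\n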
levels: patientID 219 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = cls_group_test)\n#> # A tibble: 475 × 1\n#> .pred_class \n#> \n#> 1 none or mild\n#> 2 none or mild\n#> 3 none or mild\n#> 4 none or mild\n#> 5 none or mild\n#> 6 none or mild\n#> 7 none or mild\n#> 8 none or mild\n#> 9 none or mild\n#> 10 none or mild\n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"prob\", new_data = cls_group_test)\n#> # A tibble: 475 × 2\n#> `.pred_none or mild` `.pred_moderate or severe`\n#> \n#> 1 0.671 0.329 \n#> 2 0.730 0.270 \n#> 3 0.796 0.204 \n#> 4 0.847 0.153 \n#> 5 0.882 0.118 \n#> 6 0.909 0.0908\n#> 7 0.934 0.0655\n#> 8 0.613 0.387 \n#> 9 0.681 0.319 \n#> 10 0.744 0.256 \n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"conf_int\", new_data = cls_group_test)\n#> # A tibble: 475 × 4\n#> `.pred_lower_none or mild` `.pred_upper_none or mild` .pred_lower_moderate …¹\n#> \n#> 1 0.00184 1.000 0.0000217 \n#> 2 0.00417 1.000 0.00000942 \n#> 3 0.00971 1.000 0.00000412 \n#> 4 0.0214 1.000 0.00000169 \n#> 5 0.0465 1.000 0.000000706\n#> 6 0.101 1.000 0.000000300\n#> 7 0.203 1.000 0.000000120\n#> 8 0.000923 1.000 0.0000440 \n#> 9 0.00196 1.000 0.0000175 \n#> 10 0.00447 1.000 0.00000724 \n#> # ℹ 465 more rows\n#> # ℹ abbreviated name: ¹​`.pred_lower_moderate or severe`\n#> # ℹ 1 more variable: `.pred_upper_moderate or severe` \npredict(logistic_reg_fit, type = \"pred_int\", new_data = cls_group_test)\n#> # A tibble: 475 × 4\n#> `.pred_lower_none or mild` `.pred_upper_none or mild` .pred_lower_moderate …¹\n#> \n#> 1 0 1 0\n#> 2 0 1 0\n#> 3 0 1 0\n#> 4 0 1 0\n#> 5 0 1 0\n#> 6 0 1 0\n#> 7 0 1 0\n#> 8 0 1 0\n#> 9 0 1 0\n#> 10 0 1 0\n#> # ℹ 465 more rows\n#> # ℹ abbreviated name: ¹​`.pred_lower_moderate or severe`\n#> # ℹ 1 more variable: `.pred_upper_moderate or severe` \n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(Class ~ ., data = tbl_bin$training)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> Coefficients:\n#> (Intercept) A B \n#> -3.731170 -1.214355 3.794186\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(logistic_reg_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? 
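\n\nThe spark engine requires the data to live in Spark rather than in R, and `tbl_bin` was set up outside of this snippet. If you are following along in a fresh session, one plausible construction is sketched below; the connection, table name, split proportions, and seed are all assumptions, and the unprocessed `two_class_dat` is used, which is why the outcome column is `Class` rather than `class`:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nsc <- spark_connect(master = \"local\")\n# Copy the data to Spark, then split it into training and test tables\ntbl_bin <- copy_to(sc, modeldata::two_class_dat, name = \"bin_data\") |>\n  sdf_random_split(training = 0.99, test = 0.01, seed = 207)\n```\n:::\n\n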
x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.130 0.870\n#> 2 0.262 0.738\n#> 3 0.787 0.213\n#> 4 0.279 0.721\n#> 5 0.498 0.502\n#> 6 0.900 0.100\n#> 7 0.161 0.839\n```\n:::\n\n\n:::\n\n## Multivariate Adaptive Regression Splines (`mars()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_spec <- mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_fit <- mars_spec |> fit(class ~ ., data = bin_train)\nmars_fit\n#> parsnip model object\n#> \n#> GLM (family binomial, link logit):\n#> nulldev df dev df devratio AIC iters converged\n#> 1079.45 784 638.975 779 0.408 651 5 1\n#> \n#> Earth selected 6 of 13 terms, and 2 of 2 predictors\n#> Termination condition: Reached nk 21\n#> Importance: B, A\n#> Number of terms at each degree of interaction: 1 5 (additive model)\n#> Earth GCV 0.1342746 RSS 102.4723 GRSq 0.4582121 RSq 0.4719451\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mars_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mars_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.410 0.590 \n#> 2 0.794 0.206 \n#> 3 0.356 0.644 \n#> 4 0.927 0.0729\n#> 5 0.927 0.0729\n#> 6 0.836 0.164\n```\n:::\n\n\n:::\n\n## Neural Networks (`mlp()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(839)\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> a 2-5-1 network with 21 weights\n#> inputs: A B \n#> output(s): class \n#> options were - entropy fitting\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.390 0.610\n#> 2 0.685 0.315\n#> 3 0.433 0.567\n#> 4 0.722 0.278\n#> 5 0.720 0.280\n#> 6 0.684 0.316\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(38)\nmlp_fit <- 
mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> relu activation,\n#> 3 hidden units,\n#> 17 model parameters\n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 707 \n#> learn rate: 0.01 \n#> validation loss after 5 epochs: 0.427\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.387 0.613 \n#> 2 0.854 0.146 \n#> 3 0.540 0.460 \n#> 4 0.941 0.0589\n#> 5 0.882 0.118 \n#> 6 0.842 0.158\n```\n:::\n\n\n## `brulee_two_layer` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"brulee_two_layer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(336)\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> c(relu,relu) activation,\n#> c(3,3) hidden units,\n#> 29 model parameters\n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 707 \n#> learn rate: 0.01 \n#> validation loss after 17 epochs: 0.405\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.392 0.608 \n#> 2 0.835 0.165 \n#> 3 0.440 0.560 \n#> 4 0.938 0.0620\n#> 5 0.938 0.0620\n#> 6 0.848 0.152\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(306)\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: deeplearning\n#> Model ID: DeepLearning_model_R_1763571327438_5179 \n#> Status of Neuron Layers: predicting .outcome, 2-class classification, bernoulli distribution, CrossEntropy loss, 1,002 weights/biases, 16.9 KB, 7,850 training samples, mini-batch size 1\n#> layer units type dropout l1 l2 mean_rate rate_rms momentum\n#> 1 1 2 Input 0.00 % NA NA NA NA NA\n#> 2 2 200 Rectifier 0.00 % 
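\n\nWhichever engine is used, the resulting predictions can be scored with yardstick (attached with tidymodels); with only six test rows, the values here are purely illustrative:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# augment() appends the hard class and probability predictions to the test set\nmlp_preds <- augment(mlp_fit, new_data = bin_test)\naccuracy(mlp_preds, truth = class, estimate = .pred_class)\nroc_auc(mlp_preds, truth = class, .pred_Class1)\n```\n:::\n\n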
0.000000 0.000000 0.006954 0.012998 0.000000\n#> 3 3 2 Softmax NA 0.000000 0.000000 0.003180 0.000140 0.000000\n#> mean_weight weight_rms mean_bias bias_rms\n#> 1 NA NA NA NA\n#> 2 0.001014 0.103167 0.490217 0.023645\n#> 3 -0.003600 0.402544 0.019355 0.013006\n#> \n#> \n#> H2OBinomialMetrics: deeplearning\n#> ** Reported on training data. **\n#> ** Metrics reported on full training frame **\n#> \n#> MSE: 0.1724685\n#> RMSE: 0.4152933\n#> LogLoss: 0.5401076\n#> Mean Per-Class Error: 0.1731524\n#> AUC: 0.8892926\n#> AUCPR: 0.8518107\n#> Gini: 0.7785852\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 348 86 0.198157 =86/434\n#> Class2 52 299 0.148148 =52/351\n#> Totals 400 385 0.175796 =138/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.719329 0.812500 153\n#> 2 max f2 0.540433 0.869565 213\n#> 3 max f0point5 0.836246 0.815873 105\n#> 4 max accuracy 0.793925 0.825478 126\n#> 5 max precision 0.998841 1.000000 0\n#> 6 max recall 0.026905 1.000000 393\n#> 7 max specificity 0.998841 1.000000 0\n#> 8 max absolute_mcc 0.719329 0.650150 153\n#> 9 max min_per_class_accuracy 0.761683 0.820513 139\n#> 10 max mean_per_class_accuracy 0.719329 0.826848 153\n#> 11 max tns 0.998841 434.000000 0\n#> 12 max fns 0.998841 349.000000 0\n#> 13 max fps 0.004356 434.000000 399\n#> 14 max tps 0.026905 351.000000 393\n#> 15 max tnr 0.998841 1.000000 0\n#> 16 max fnr 0.998841 0.994302 0\n#> 17 max fpr 0.004356 1.000000 399\n#> 18 max tpr 0.026905 1.000000 393\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.149 0.851 \n#> 2 0.639 0.361 \n#> 3 0.237 0.763 \n#> 4 0.924 0.0763\n#> 5 0.739 0.261 \n#> 6 0.623 0.377\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(216)\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\n```\n:::\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit\n#> parsnip model object\n#> \n#> Model: \"sequential_1\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense_2 (Dense) (None, 5) 15 \n#> dense_3 (Dense) (None, 2) 12 \n#> ================================================================================\n#> Total params: 27 (108.00 Byte)\n#> Trainable params: 27 (108.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> ________________________________________________________________________________\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> 1/1 - 0s - 42ms/epoch - 42ms/step\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class2\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> 1/1 - 0s - 6ms/epoch - 6ms/step\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.313 0.687\n#> 2 0.578 0.422\n#> 3 0.503 0.497\n#> 4 0.894 0.106\n#> 5 0.869 0.131\n#> 6 0.470 0.530\n```\n:::\n\n\n:::\n\n## Multinom Regression (`multinom_reg()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and nnet is the default engine so there is no need to set that either.\nmultinom_reg_spec <- multinom_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(634)\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> nnet::multinom(formula = class ~ ., data = data, trace = FALSE)\n#> \n#> Coefficients:\n#> (Intercept) A B\n#> two -0.5868435 1.881920 1.379106\n#> three 0.2910810 1.129622 1.292802\n#> \n#> Residual Deviance: 315.8164 \n#> AIC: 327.8164\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.145 0.213 0.641 \n#> 2 0.308 0.178 0.514 \n#> 3 0.350 0.189 0.461 \n#> 4 0.983 0.00123 0.0155\n#> 5 0.956 0.00275 0.0415\n#> 6 0.00318 0.754 0.243 \n#> 7 0.0591 0.414 0.527 \n#> 8 0.522 0.0465 0.431\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(837)\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Multinomial regression\n#> \n#> 192 samples, 2 features, 3 classes \n#> class weights one=1, two=1, three=1 \n#> weight decay: 0.001 \n#> batch size: 173 \n#> validation loss after 1 epoch: 0.953\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 three\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.131 0.190 0.679 \n#> 2 0.303 0.174 0.523 \n#> 3 0.358 0.192 0.449 \n#> 4 0.983 0.00125 0.0154\n#> 5 0.948 0.00275 0.0491\n#> 6 0.00344 0.796 0.200 \n#> 7 0.0611 0.420 0.518 \n#> 8 0.443 0.0390 
0.518\n```\n:::\n\n\n## `glmnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"multinomial\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.219200\n#> 2 1 1.61 0.199700\n#> 3 2 3.90 0.181900\n#> 4 2 6.07 0.165800\n#> 5 2 7.93 0.151100\n#> 6 2 9.52 0.137600\n#> 7 2 10.90 0.125400\n#> 8 2 12.09 0.114300\n#> 9 2 13.13 0.104100\n#> 10 2 14.22 0.094870\n#> 11 2 15.28 0.086440\n#> 12 2 16.20 0.078760\n#> 13 2 16.99 0.071760\n#> 14 2 17.68 0.065390\n#> 15 2 18.28 0.059580\n#> 16 2 18.80 0.054290\n#> 17 2 19.24 0.049460\n#> 18 2 19.63 0.045070\n#> 19 2 19.96 0.041070\n#> 20 2 20.25 0.037420\n#> 21 2 20.49 0.034090\n#> 22 2 20.70 0.031070\n#> 23 2 20.88 0.028310\n#> 24 2 21.04 0.025790\n#> 25 2 21.17 0.023500\n#> 26 2 21.28 0.021410\n#> 27 2 21.38 0.019510\n#> 28 2 21.46 0.017780\n#> 29 2 21.53 0.016200\n#> 30 2 21.58 0.014760\n#> 31 2 21.63 0.013450\n#> 32 2 21.67 0.012250\n#> 33 2 21.71 0.011160\n#> 34 2 21.74 0.010170\n#> 35 2 21.77 0.009269\n#> 36 2 21.79 0.008445\n#> 37 2 21.82 0.007695\n#> 38 2 21.83 0.007011\n#> 39 2 21.85 0.006389\n#> 40 2 21.86 0.005821\n#> 41 2 21.87 0.005304\n#> 42 2 21.88 0.004833\n#> 43 2 21.89 0.004403\n#> 44 2 21.89 0.004012\n#> 45 2 21.90 0.003656\n#> 46 2 21.90 0.003331\n#> 47 2 21.91 0.003035\n#> 48 2 21.91 0.002765\n#> 49 2 21.91 0.002520\n#> 50 2 21.91 0.002296\n#> 51 2 21.92 0.002092\n#> 52 2 21.92 0.001906\n#> 53 2 21.92 0.001737\n#> 54 2 21.92 0.001582\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.163 0.211 0.626 \n#> 2 0.318 0.185 0.496 \n#> 3 0.358 0.198 0.444 \n#> 4 0.976 0.00268 0.0217\n#> 5 0.940 0.00529 0.0544\n#> 6 0.00617 0.699 0.295 \n#> 7 0.0757 0.390 0.534 \n#> 8 0.506 0.0563 0.438\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OMultinomialModel: glm\n#> Model ID: GLM_model_R_1763571327438_5182 \n#> GLM Model: summary\n#> family link regularization\n#> 1 multinomial multinomial Elastic Net (alpha = 0.5, lambda = 4.372E-4 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 
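\n\nFor the glmnet fit above, `tidy()` should return the coefficients evaluated at the penalty stored in the specification (0.01 here); a one-line sketch:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Coefficients at penalty = 0.01, one row per class/term combination\ntidy(multinom_reg_fit)\n```\n:::\n\n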
9 6 4\n#> training_frame\n#> 1 object_jbhwnlsrno\n#> \n#> Coefficients: glm multinomial coefficients\n#> names coefs_class_0 coefs_class_1 coefs_class_2 std_coefs_class_0\n#> 1 Intercept -1.119482 -0.831434 -1.706488 -1.083442\n#> 2 A -1.119327 0.002894 0.750746 -1.029113\n#> 3 B -1.208210 0.078752 0.162842 -1.187423\n#> std_coefs_class_1 std_coefs_class_2\n#> 1 -0.819868 -1.830487\n#> 2 0.002661 0.690238\n#> 3 0.077397 0.160041\n#> \n#> H2OMultinomialMetrics: glm\n#> ** Reported on training data. **\n#> \n#> Training Set Metrics: \n#> =====================\n#> \n#> Extract training frame with `h2o.getFrame(\"object_jbhwnlsrno\")`\n#> MSE: (Extract with `h2o.mse`) 0.2982118\n#> RMSE: (Extract with `h2o.rmse`) 0.5460878\n#> Logloss: (Extract with `h2o.logloss`) 0.822443\n#> Mean Per-Class Error: 0.4583896\n#> AUC: (Extract with `h2o.auc`) NaN\n#> AUCPR: (Extract with `h2o.aucpr`) NaN\n#> Null Deviance: (Extract with `h2o.nulldeviance`) 404.5036\n#> Residual Deviance: (Extract with `h2o.residual_deviance`) 315.8181\n#> R^2: (Extract with `h2o.r2`) 0.4682043\n#> AIC: (Extract with `h2o.aic`) NaN\n#> Confusion Matrix: Extract with `h2o.confusionMatrix(,train = TRUE)`)\n#> =========================================================================\n#> Confusion Matrix: Row labels: Actual class; Column labels: Predicted class\n#> one three two Error Rate\n#> one 59 18 1 0.2436 = 19 / 78\n#> three 19 52 5 0.3158 = 24 / 76\n#> two 7 24 7 0.8158 = 31 / 38\n#> Totals 85 94 13 0.3854 = 74 / 192\n#> \n#> Hit Ratio Table: Extract with `h2o.hit_ratio_table(,train = TRUE)`\n#> =======================================================================\n#> Top-3 Hit Ratios: \n#> k hit_ratio\n#> 1 1 0.614583\n#> 2 2 0.890625\n#> 3 3 1.000000\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_three .pred_two\n#> \n#> 1 0.146 0.641 0.213 \n#> 2 0.308 0.513 0.179 \n#> 3 0.350 0.460 0.190 \n#> 4 0.983 0.0158 0.00128\n#> 5 0.955 0.0422 0.00284\n#> 6 0.00329 0.244 0.752 \n#> 7 0.0599 0.527 0.413 \n#> 8 0.521 0.432 0.0469\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\n```\n:::\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Model: \"sequential_2\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense_4 (Dense) (None, 1) 3 \n#> dense_5 (Dense) (None, 3) 6 \n#> ================================================================================\n#> Total params: 9 (36.00 Byte)\n#> Trainable params: 9 (36.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> 
________________________________________________________________________________\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> 1/1 - 0s - 43ms/epoch - 43ms/step\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 one \n#> 4 one \n#> 5 one \n#> 6 three \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> 1/1 - 0s - 6ms/epoch - 6ms/step\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.262 0.342 0.396 \n#> 2 0.335 0.326 0.338 \n#> 3 0.352 0.322 0.326 \n#> 4 0.749 0.159 0.0919\n#> 5 0.680 0.194 0.126 \n#> 6 0.0924 0.335 0.573 \n#> 7 0.203 0.349 0.448 \n#> 8 0.417 0.303 0.280\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = tbl_mtl$training)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Formula: class ~ .\n#> \n#> Coefficients:\n#> (Intercept) A B\n#> one 0.05447853 -1.0569131 -0.9049194\n#> three 0.41207949 0.1458870 0.3959664\n#> two -0.46655802 0.9110261 0.5089529\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = tbl_mtl$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 one \n#> 2 one \n#> 3 three \n#> 4 three \n#> 5 three \n#> 6 three \n#> 7 three\npredict(multinom_reg_fit, type = \"prob\", new_data = tbl_mtl$test)\n#> # Source: SQL [?? x 3]\n#> # Database: spark_connection\n#> pred_one pred_three pred_two\n#> \n#> 1 0.910 0.0814 0.00904\n#> 2 0.724 0.233 0.0427 \n#> 3 0.124 0.620 0.256 \n#> 4 0.0682 0.610 0.322 \n#> 5 0.130 0.571 0.300 \n#> 6 0.115 0.549 0.336 \n#> 7 0.0517 0.524 0.424\n```\n:::\n\n\n:::\n\n## Naive Bayes (`naive_Bayes()`) \n\n:::{.panel-tabset}\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_spec <- naive_Bayes() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\nnaive_Bayes_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: naivebayes\n#> Model ID: NaiveBayes_model_R_1763571327438_5183 \n#> Model Summary: \n#> number_of_response_levels min_apriori_probability max_apriori_probability\n#> 1 2 0.44713 0.55287\n#> \n#> \n#> H2OBinomialMetrics: naivebayes\n#> ** Reported on training data. 
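\n\nBecause the spark engine returns predictions as a lazy Spark table, pipe them through `dplyr::collect()` when an ordinary in-memory tibble is needed:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Materialize the Spark predictions as a local tibble\npredict(multinom_reg_fit, type = \"prob\", new_data = tbl_mtl$test) |>\n  dplyr::collect()\n```\n:::\n\n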
**\n#> \n#> MSE: 0.1737113\n#> RMSE: 0.4167869\n#> LogLoss: 0.5473431\n#> Mean Per-Class Error: 0.2356138\n#> AUC: 0.8377152\n#> AUCPR: 0.788608\n#> Gini: 0.6754303\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 274 160 0.368664 =160/434\n#> Class2 36 315 0.102564 =36/351\n#> Totals 310 475 0.249682 =196/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.175296 0.762712 286\n#> 2 max f2 0.133412 0.851119 306\n#> 3 max f0point5 0.497657 0.731343 183\n#> 4 max accuracy 0.281344 0.765605 248\n#> 5 max precision 0.999709 1.000000 0\n#> 6 max recall 0.020983 1.000000 390\n#> 7 max specificity 0.999709 1.000000 0\n#> 8 max absolute_mcc 0.280325 0.541898 249\n#> 9 max min_per_class_accuracy 0.398369 0.758065 215\n#> 10 max mean_per_class_accuracy 0.280325 0.771945 249\n#> 11 max tns 0.999709 434.000000 0\n#> 12 max fns 0.999709 347.000000 0\n#> 13 max fps 0.006522 434.000000 399\n#> 14 max tps 0.020983 351.000000 390\n#> 15 max tnr 0.999709 1.000000 0\n#> 16 max fnr 0.999709 0.988604 0\n#> 17 max fpr 0.006522 1.000000 399\n#> 18 max tpr 0.020983 1.000000 390\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class2\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.181 0.819 \n#> 2 0.750 0.250 \n#> 3 0.556 0.444 \n#> 4 0.994 0.00643\n#> 5 0.967 0.0331 \n#> 6 0.630 0.370\n```\n:::\n\n\n## `klaR` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and klaR is the default engine so there is no need to set that either.\nnaive_Bayes_spec <- naive_Bayes()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.250 0.750 \n#> 2 0.593 0.407 \n#> 3 0.333 0.667 \n#> 4 0.993 0.00658\n#> 5 0.978 0.0223 \n#> 6 0.531 0.469\n```\n:::\n\n\n## `naivebayes` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_spec <- naive_Bayes() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"naivebayes\")\n```\n:::\n\n\nNow we create the model fit 
object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\nnaive_Bayes_fit\n#> parsnip model object\n#> \n#> \n#> ================================= Naive Bayes ==================================\n#> \n#> Call:\n#> naive_bayes.default(x = maybe_data_frame(x), y = y, usekernel = TRUE)\n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> Laplace smoothing: 0\n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> A priori probabilities: \n#> \n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> Tables: \n#> \n#> -------------------------------------------------------------------------------- \n#> :: A::Class1 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (434 obs.);\tBandwidth 'bw' = 0.2548\n#> \n#> x y \n#> Min. :-2.5638 Min. :0.0002915 \n#> 1st Qu.:-1.2013 1st Qu.:0.0506201 \n#> Median : 0.1612 Median :0.1619843 \n#> Mean : 0.1612 Mean :0.1831190 \n#> 3rd Qu.: 1.5237 3rd Qu.:0.2581668 \n#> Max. : 2.8862 Max. :0.5370762 \n#> -------------------------------------------------------------------------------- \n#> :: A::Class2 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (351 obs.);\tBandwidth 'bw' = 0.2596\n#> \n#> x y \n#> Min. :-2.5428 Min. :4.977e-05 \n#> 1st Qu.:-1.1840 1st Qu.:2.672e-02 \n#> Median : 0.1748 Median :2.239e-01 \n#> Mean : 0.1748 Mean :1.836e-01 \n#> 3rd Qu.: 1.5336 3rd Qu.:2.926e-01 \n#> Max. : 2.8924 Max. :3.740e-01 \n#> \n#> -------------------------------------------------------------------------------- \n#> :: B::Class1 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (434 obs.);\tBandwidth 'bw' = 0.1793\n#> \n#> x y \n#> Min. :-2.4501 Min. :5.747e-05 \n#> 1st Qu.:-1.0894 1st Qu.:1.424e-02 \n#> Median : 0.2713 Median :8.798e-02 \n#> Mean : 0.2713 Mean :1.834e-01 \n#> 3rd Qu.: 1.6320 3rd Qu.:2.758e-01 \n#> Max. : 2.9927 Max. :6.872e-01 \n#> \n#> -------------------------------------------------------------------------------- \n#> :: B::Class2 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (351 obs.);\tBandwidth 'bw' = 0.2309\n#> \n#> x y \n#> Min. :-2.4621 Min. :5.623e-05 \n#> 1st Qu.:-0.8979 1st Qu.:1.489e-02 \n#> Median : 0.6663 Median :7.738e-02 \n#> Mean : 0.6663 Mean :1.595e-01 \n#> 3rd Qu.: 2.2305 3rd Qu.:3.336e-01 \n#> Max. : 3.7948 Max. 
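\n\nThe print above shows that this engine uses kernel density estimates for the numeric predictors. The amount of smoothing and the Laplace correction can be adjusted through the main arguments of `naive_Bayes()`; the values below are arbitrary:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Wider bandwidths than the default, plus a Laplace correction of 1\nnaive_Bayes_smooth_spec <- naive_Bayes(smoothness = 1.5, Laplace = 1) |>\n  set_engine(\"naivebayes\")\n```\n:::\n\n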
:4.418e-01 \n#> \n#> --------------------------------------------------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.249 0.751 \n#> 2 0.593 0.407 \n#> 3 0.332 0.668 \n#> 4 0.993 0.00674\n#> 5 0.978 0.0224 \n#> 6 0.532 0.468\n```\n:::\n\n\n:::\n\n## K-Nearest Neighbors (`nearest_neighbor()`) \n\n:::{.panel-tabset}\n\n## `kknn` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_spec <- nearest_neighbor() |>\n # We need to set the mode since this engine works with multiple modes\n # and kknn is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_fit <- nearest_neighbor_spec |> fit(class ~ ., data = bin_train)\nnearest_neighbor_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> kknn::train.kknn(formula = class ~ ., data = data, ks = min_rows(5, data, 5))\n#> \n#> Type of response variable: nominal\n#> Minimal misclassification: 0.2101911\n#> Best kernel: optimal\n#> Best k: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(nearest_neighbor_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(nearest_neighbor_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.2 0.8 \n#> 2 0.72 0.28\n#> 3 0.32 0.68\n#> 4 1 0 \n#> 5 1 0 \n#> 6 1 0\n```\n:::\n\n\n:::\n\n## Null Model (`null_model()`) \n\n:::{.panel-tabset}\n\n## `parsnip` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_spec <- null_model() |>\n # We need to set the mode since this engine works with multiple modes\n # and parsnip is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_fit <- null_model_spec |> fit(class ~ ., data = bin_train)\nnull_model_fit\n#> parsnip model object\n#> \n#> Null Regression Model\n#> Predicted Value: Class1\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(null_model_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(null_model_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.553 0.447\n#> 2 0.553 0.447\n#> 3 0.553 0.447\n#> 4 0.553 0.447\n#> 5 0.553 0.447\n#> 6 0.553 0.447\n```\n:::\n\n\n:::\n\n## Partial Least Squares (`pls()`) \n\n:::{.panel-tabset}\n\n## `mixOmics` \n\nThis engine requires the plsmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r 
.cell-code}\nlibrary(plsmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_spec <- pls() |>\n # We need to set the mode since this engine works with multiple modes\n # and mixOmics is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_fit <- pls_spec |> fit(class ~ ., data = bin_train)\npls_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> mixOmics::splsda(X = x, Y = y, ncomp = ncomp, keepX = keepX) \n#> \n#> sPLS-DA (regression mode) with 2 sPLS-DA components. \n#> You entered data X of dimensions: 785 2 \n#> You entered data Y with 2 classes. \n#> \n#> Selection of [2] [2] variables on each of the sPLS-DA components on the X data set. \n#> No Y variables can be selected. \n#> \n#> Main numerical outputs: \n#> -------------------- \n#> loading vectors: see object$loadings \n#> variates: see object$variates \n#> variable names: see object$names \n#> \n#> Functions to visualise samples: \n#> -------------------- \n#> plotIndiv, plotArrow, cim \n#> \n#> Functions to visualise variables: \n#> -------------------- \n#> plotVar, plotLoadings, network, cim \n#> \n#> Other functions: \n#> -------------------- \n#> selectVar, tune, perf, auc\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(pls_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(pls_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.462 0.538\n#> 2 0.631 0.369\n#> 3 0.512 0.488\n#> 4 0.765 0.235\n#> 5 0.675 0.325\n#> 6 0.624 0.376\n```\n:::\n\n\n:::\n\n## Random Forests (`rand_forest()`) \n\n:::{.panel-tabset}\n\n## `ranger` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes.\n # ranger is the default engine, so setting it is not strictly required;\n # we do so here to pass keep.inbag = TRUE, which is needed to produce\n # interval predictions.\n set_engine(\"ranger\", keep.inbag = TRUE) |> 
\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(841)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Ranger result\n#> \n#> Call:\n#> ranger::ranger(x = maybe_data_frame(x), y = y, keep.inbag = ~TRUE, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) \n#> \n#> Type: Probability estimation \n#> Number of trees: 500 \n#> Sample size: 785 \n#> Number of independent variables: 2 \n#> Mtry: 1 \n#> Target node size: 10 \n#> Variable importance mode: none \n#> Splitrule: gini \n#> OOB prediction error (Brier s.): 0.1477679\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.220 0.780 \n#> 2 0.837 0.163 \n#> 3 0.220 0.780 \n#> 4 0.951 0.0485\n#> 5 0.785 0.215 \n#> 6 0.913 0.0868\npredict(rand_forest_fit, type = \"conf_int\", new_data = bin_test)\n#> Warning in rInfJack(x, inbag.counts): Sample size <=20, no calibration\n#> performed.\n#> Warning in rInfJack(x, inbag.counts): Sample size <=20, no calibration\n#> performed.\n#> Warning in sqrt(infjack): NaNs produced\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0 0.477 0.523 1 \n#> 2 0.604 1 0 0.396\n#> 3 0.01000 0.431 0.569 0.990\n#> 4 0.846 1 0 0.154\n#> 5 0.469 1 0 0.531\n#> 6 NaN NaN NaN NaN\n```\n:::\n\n\n## `aorsf` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(923)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random classification forest\n#> \n#> Linear combinations: Accelerated Logistic regression\n#> N observations: 785\n#> N classes: 2\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 24.092\n#> Min observations in leaf: 5\n#> OOB stat value: 0.87\n#> OOB stat type: AUC-ROC\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.189 0.811 \n#> 2 0.870 
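\n\nA note on the `ranger` interval predictions shown earlier: the warnings come from the infinitesimal-jackknife variance estimator, which is unreliable with a test set as small as this six-row one; with realistically sized data they typically go away. As with the other engines, the interval width can be changed via `level` (a sketch, using the `ranger` fit):\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# 90% intervals; requires the keep.inbag = TRUE engine option set above\npredict(rand_forest_fit, type = \"conf_int\", new_data = bin_test, level = 0.90)\n```\n:::\n\n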
0.130 \n#> 3 0.346 0.654 \n#> 4 0.979 0.0206\n#> 5 0.940 0.0599\n#> 6 0.899 0.101\n```\n:::\n\n\n## `grf` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"grf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(546)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\npredict(rand_forest_fit, type = \"conf_int\", new_data = bin_test)\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(493)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: drf\n#> Model ID: DRF_model_R_1763571327438_5185 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 92621 12\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 20 16.60000 126 166 143.08000\n#> \n#> \n#> H2OBinomialMetrics: drf\n#> ** Reported on training data. 
**\n#> ** Metrics reported on Out-Of-Bag training samples **\n#> \n#> MSE: 0.164699\n#> RMSE: 0.4058312\n#> LogLoss: 1.506369\n#> Mean Per-Class Error: 0.200195\n#> AUC: 0.8389854\n#> AUCPR: 0.7931927\n#> Gini: 0.6779708\n#> R^2: 0.3337559\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 327 107 0.246544 =107/434\n#> Class2 54 297 0.153846 =54/351\n#> Totals 381 404 0.205096 =161/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.363636 0.786755 125\n#> 2 max f2 0.238095 0.832435 148\n#> 3 max f0point5 0.421053 0.760108 115\n#> 4 max accuracy 0.363636 0.794904 125\n#> 5 max precision 1.000000 0.890244 0\n#> 6 max recall 0.000000 1.000000 208\n#> 7 max specificity 1.000000 0.979263 0\n#> 8 max absolute_mcc 0.363636 0.596505 125\n#> 9 max min_per_class_accuracy 0.450000 0.785714 110\n#> 10 max mean_per_class_accuracy 0.363636 0.799805 125\n#> 11 max tns 1.000000 425.000000 0\n#> 12 max fns 1.000000 278.000000 0\n#> 13 max fps 0.000000 434.000000 208\n#> 14 max tps 0.000000 351.000000 208\n#> 15 max tnr 1.000000 0.979263 0\n#> 16 max fnr 1.000000 0.792023 0\n#> 17 max fpr 0.000000 1.000000 208\n#> 18 max tpr 0.000000 1.000000 208\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.12 0.88 \n#> 2 0.94 0.0600\n#> 3 0.175 0.825 \n#> 4 1 0 \n#> 5 0.78 0.22 \n#> 6 0.92 0.0800\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(252)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\n```\n:::\n\n\nThe print method has a lot of output: \n\n
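To keep the page readable, the next chunk uses `capture.output()` to show only the first 100 lines; wrapping long model prints like this in an HTML `<details>` tag is another option.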
\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncapture.output(print(rand_forest_fit))[1:100] |> cat(sep = \"\\n\")\n#> parsnip model object\n#> \n#> $nodes\n#> $nodes[[1]]\n#> [1] root\n#> | [2] V3 <= -0.06906\n#> | | [3] V3 <= -0.61707\n#> | | | [4] V3 <= -0.83314\n#> | | | | [5] V3 <= -0.99048\n#> | | | | | [6] V3 <= -1.29863\n#> | | | | | | [7] V2 <= -0.93951 *\n#> | | | | | | [8] V2 > -0.93951 *\n#> | | | | | [9] V3 > -1.29863\n#> | | | | | | [10] V3 <= -1.21418 *\n#> | | | | | | [11] V3 > -1.21418\n#> | | | | | | | [12] V2 <= -1.13676 *\n#> | | | | | | | [13] V2 > -1.13676\n#> | | | | | | | | [14] V3 <= -1.14373 *\n#> | | | | | | | | [15] V3 > -1.14373 *\n#> | | | | [16] V3 > -0.99048\n#> | | | | | [17] V2 <= -1.10136 *\n#> | | | | | [18] V2 > -1.10136 *\n#> | | | [19] V3 > -0.83314\n#> | | | | [20] V3 <= -0.68684\n#> | | | | | [21] V2 <= -0.62666 *\n#> | | | | | [22] V2 > -0.62666 *\n#> | | | | [23] V3 > -0.68684 *\n#> | | [24] V3 > -0.61707\n#> | | | [25] V2 <= -0.10774\n#> | | | | [26] V3 <= -0.35574\n#> | | | | | [27] V3 <= -0.41085\n#> | | | | | | [28] V3 <= -0.52674 *\n#> | | | | | | [29] V3 > -0.52674 *\n#> | | | | | [30] V3 > -0.41085 *\n#> | | | | [31] V3 > -0.35574\n#> | | | | | [32] V3 <= -0.17325 *\n#> | | | | | [33] V3 > -0.17325 *\n#> | | | [34] V2 > -0.10774\n#> | | | | [35] V3 <= -0.38428 *\n#> | | | | [36] V3 > -0.38428 *\n#> | [37] V3 > -0.06906\n#> | | [38] V3 <= 0.54852\n#> | | | [39] V2 <= 0.53027\n#> | | | | [40] V2 <= 0.21749\n#> | | | | | [41] V3 <= 0.09376 *\n#> | | | | | [42] V3 > 0.09376\n#> | | | | | | [43] V3 <= 0.28687\n#> | | | | | | | [44] V3 <= 0.17513 *\n#> | | | | | | | [45] V3 > 0.17513 *\n#> | | | | | | [46] V3 > 0.28687 *\n#> | | | | [47] V2 > 0.21749 *\n#> | | | [48] V2 > 0.53027 *\n#> | | [49] V3 > 0.54852\n#> | | | [50] V2 <= 1.99786\n#> | | | | [51] V3 <= 1.02092\n#> | | | | | [52] V2 <= 0.5469\n#> | | | | | | [53] V3 <= 0.83487\n#> | | | | | | | [54] V2 <= 0.36626 *\n#> | | | | | | | [55] V2 > 0.36626 *\n#> | | | | | | [56] V3 > 0.83487 *\n#> | | | | | [57] V2 > 0.5469\n#> | | | | | | [58] V3 <= 0.62673 *\n#> | | | | | | [59] V3 > 0.62673 *\n#> | | | | [60] V3 > 1.02092\n#> | | | | | [61] V3 <= 1.29539\n#> | | | | | | [62] V3 <= 1.2241 *\n#> | | | | | | [63] V3 > 1.2241 *\n#> | | | | | [64] V3 > 1.29539\n#> | | | | | | [65] V3 <= 2.01809 *\n#> | | | | | | [66] V3 > 2.01809 *\n#> | | | [67] V2 > 1.99786 *\n#> \n#> $nodes[[2]]\n#> [1] root\n#> | [2] V3 <= -0.00054\n#> | | [3] V3 <= -0.58754\n#> | | | [4] V3 <= -0.83314\n#> | | | | [5] V2 <= -1.15852\n#> | | | | | [6] V2 <= -1.76192 *\n#> | | | | | [7] V2 > -1.76192 *\n#> | | | | [8] V2 > -1.15852\n#> | | | | | [9] V3 <= -1.21418\n#> | | | | | | [10] V3 <= -1.32176 *\n#> | | | | | | [11] V3 > -1.32176 *\n#> | | | | | [12] V3 > -1.21418\n#> | | | | | | [13] V2 <= -1.08164 *\n#> | | | | | | [14] V2 > -1.08164\n#> | | | | | | | [15] V3 <= -1.14373 *\n#> | | | | | | | [16] V3 > -1.14373 *\n#> | | | [17] V3 > -0.83314\n#> | | | | [18] V2 <= -0.51524\n#> | | | | | [19] V3 <= -0.66041\n#> | | | | | | [20] V3 <= -0.70885 *\n#> | | | | | | [21] V3 > -0.70885 *\n#> | | | | | [22] V3 > -0.66041 *\n#> | | | | [23] V2 > -0.51524 *\n#> | | [24] V3 > -0.58754\n#> | | | [25] V2 <= -0.07243\n#> | | | | [26] V3 <= -0.31247\n#> | | | | | [27] V2 <= -0.98014 *\n```\n:::\n\n
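\nRather than paging through the full print, you can pull the underlying engine object out of the parsnip fit. As a minimal sketch, here is one way to count the trees in the ensemble (this assumes the trees are stored in the `$nodes` element, as the print above suggests):\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# extract_fit_engine() returns the underlying partykit model object\nlength(extract_fit_engine(rand_forest_fit)$nodes)\n```\n:::\n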
\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.375 0.625 \n#> 2 0.813 0.187 \n#> 3 0.284 0.716 \n#> 4 0.963 0.0365\n#> 5 0.892 0.108 \n#> 6 0.922 0.0785\n```\n:::\n\n\n## `randomForest` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"randomForest\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(726)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> randomForest(x = maybe_data_frame(x), y = y) \n#> Type of random forest: classification\n#> Number of trees: 500\n#> No. of variables tried at each split: 1\n#> \n#> OOB estimate of error rate: 21.53%\n#> Confusion matrix:\n#> Class1 Class2 class.error\n#> Class1 349 85 0.1958525\n#> Class2 84 267 0.2393162\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.162 0.838\n#> 2 0.848 0.152\n#> 3 0.108 0.892\n#> 4 1 0 \n#> 5 0.74 0.26 \n#> 6 0.91 0.09\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n set_mode(\"classification\") |>\n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(693)\nrand_forest_fit <- rand_forest_spec |> fit(Class ~ ., data = tbl_bin$training)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> RandomForestClassificationModel: uid=random_forest__ffe2aceb_0ffa_4c2c_9cac_0d7e0f09c9f5, numTrees=20, numClasses=2, numFeatures=2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(rand_forest_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? 
x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.315 0.685 \n#> 2 0.241 0.759 \n#> 3 0.732 0.268 \n#> 4 0.235 0.765 \n#> 5 0.259 0.741 \n#> 6 0.933 0.0674\n#> 7 0.0968 0.903\n```\n:::\n\n\n:::\n\n## Rule Fit (`rule_fit()`) \n\n:::{.panel-tabset}\n\n## `xrf` \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n # and xrf is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(95)\nrule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> An eXtreme RuleFit model of 358 rules.\n#> \n#> Original Formula:\n#> \n#> class ~ A + B\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rule_fit_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.419 0.581\n#> 2 0.651 0.349\n#> 3 0.506 0.494\n#> 4 0.891 0.109\n#> 5 0.805 0.195\n#> 6 0.616 0.384\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(536)\nrule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: rulefit\n#> Model ID: RuleFit_model_R_1763571327438_5236 \n#> Rulefit Model Summary: \n#> family link regularization number_of_predictors_total\n#> 1 binomial logit Lasso (lambda = 0.03081 ) 2329\n#> number_of_active_predictors number_of_iterations rule_ensemble_size\n#> 1 3 4 2327\n#> number_of_trees number_of_internal_trees min_depth max_depth mean_depth\n#> 1 150 150 0 5 4.00000\n#> min_leaves max_leaves mean_leaves\n#> 1 0 29 15.51333\n#> \n#> \n#> H2OBinomialMetrics: rulefit\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.1411478\n#> RMSE: 0.3756964\n#> LogLoss: 0.4472749\n#> Mean Per-Class Error: 0.1850933\n#> AUC: 0.8779327\n#> AUCPR: 0.8372496\n#> Gini: 0.7558654\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 350 84 0.193548 =84/434\n#> Class2 62 289 0.176638 =62/351\n#> Totals 412 373 0.185987 =146/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.499611 0.798343 199\n#> 2 max f2 0.226927 0.861169 285\n#> 3 max f0point5 0.626200 0.803634 144\n#> 4 max accuracy 0.523044 0.815287 191\n#> 5 max precision 0.980574 1.000000 0\n#> 6 max recall 0.052101 1.000000 394\n#> 7 max specificity 0.980574 1.000000 0\n#> 8 max absolute_mcc 0.523044 0.627478 191\n#> 9 max min_per_class_accuracy 0.512020 0.813364 196\n#> 10 max mean_per_class_accuracy 0.499611 0.814907 199\n#> 11 max tns 0.980574 434.000000 0\n#> 12 max fns 0.980574 350.000000 0\n#> 13 max fps 0.043433 434.000000 399\n#> 14 max tps 0.052101 351.000000 394\n#> 15 max tnr 0.980574 1.000000 0\n#> 16 max fnr 0.980574 0.997151 0\n#> 17 max fpr 0.043433 1.000000 399\n#> 18 max tpr 0.052101 1.000000 394\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rule_fit_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.393 0.607 \n#> 2 0.739 0.261 \n#> 3 0.455 0.545 \n#> 4 0.956 0.0442\n#> 5 0.882 0.118 \n#> 6 0.693 0.307\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Linear Kernel) (`svm_linear()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"kernlab\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(class ~ ., data = bin_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Linear (vanilla) kernel function. 
\n#> \n#> Number of Support Vectors : 357 \n#> \n#> Objective Function Value : -353.0043 \n#> Training error : 0.17707 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.404 0.596 \n#> 2 0.858 0.142 \n#> 3 0.541 0.459 \n#> 4 0.975 0.0254\n#> 5 0.905 0.0950\n#> 6 0.850 0.150\n```\n:::\n\n\n## `LiblineaR` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n # and LiblineaR is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(class ~ ., data = bin_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized L2-loss support vector classification dual (L2R_L2LOSS_SVC_DUAL)\"\n#> \n#> $Type\n#> [1] 1\n#> \n#> $W\n#> A B Bias\n#> [1,] 0.3641766 -0.9648797 0.1182725\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $ClassNames\n#> [1] Class1 Class2\n#> Levels: Class1 Class2\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\n```\n:::\n\n\nNote that the `LiblineaR` engine does not estimate class probabilities, so only hard class predictions are available.\n\n:::
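\n\nSince the predictions are returned as tibbles, they are easy to score with yardstick (loaded with tidymodels). As a minimal sketch, we can use `accuracy()` on the hard class predictions; any other classification metric could be substituted:\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Combine the class predictions with the true classes, then compute a metric:\npredict(svm_linear_fit, type = \"class\", new_data = bin_test) |>\n bind_cols(bin_test |> select(class)) |>\n accuracy(truth = class, estimate = .pred_class)\n```\n:::\n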
\n## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_spec <- svm_poly() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_fit <- svm_poly_spec |> fit(class ~ ., data = bin_train)\n#> Setting default kernel parameters\nsvm_poly_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Polynomial kernel function. \n#> Hyperparameters : degree = 1 scale = 1 offset = 1 \n#> \n#> Number of Support Vectors : 357 \n#> \n#> Objective Function Value : -353.0043 \n#> Training error : 0.17707 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_poly_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_poly_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.399 0.601 \n#> 2 0.861 0.139 \n#> 3 0.538 0.462 \n#> 4 0.976 0.0237\n#> 5 0.908 0.0917\n#> 6 0.853 0.147\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(class ~ ., data = bin_train)\nsvm_rbf_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Gaussian Radial Basis kernel function. \n#> Hyperparameter : sigma = 1.9107071282545 \n#> \n#> Number of Support Vectors : 335 \n#> \n#> Objective Function Value : -296.4885 \n#> Training error : 0.173248 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_rbf_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.547 0.453\n#> 2 0.871 0.129\n#> 3 0.260 0.740\n#> 4 0.861 0.139\n#> 5 0.863 0.137\n#> 6 0.863 0.137\n```\n:::\n\n\n:::\n\n# Regression Models\n\nTo demonstrate regression, we'll subset some data, make a training/test split, and standardize the predictors: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(938)\nreg_split <-\n modeldata::concrete |> \n slice_sample(n = 100) |> \n select(strength = compressive_strength, cement, age) |> \n initial_split(prop = 0.95, strata = strength)\nreg_split\n#> \n#> <92/8/100>\n\nreg_rec <- \n recipe(strength ~ ., data = training(reg_split)) |> \n step_normalize(all_numeric_predictors()) |> \n prep()\n\nreg_train <- bake(reg_rec, new_data = NULL)\nreg_test <- bake(reg_rec, new_data = testing(reg_split))\n```\n:::\n\n\nWe also have models that are specifically designed for integer count outcomes. 
The data for these are:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(207)\ncount_split <-\n attrition |>\n select(num_years = TotalWorkingYears, age = Age, income = MonthlyIncome) |>\n initial_split(prop = 0.994)\ncount_split\n#> \n#> <1461/9/1470>\n\ncount_rec <-\n recipe(num_years ~ ., data = training(count_split)) |>\n step_normalize(all_numeric_predictors()) |>\n prep()\n\ncount_train <- bake(count_rec, new_data = NULL)\ncount_test <- bake(count_rec, new_data = testing(count_split))\n```\n:::\n\n\nFinally, we have some models that handle hierarchical data, where some rows are statistically correlated with other rows. For these examples, we'll use a data set that models body weights as a function of time for several \"subjects\" (rats, actually). We'll split these data in a way where all rows for a specific subject are either in the training or test sets: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(224)\nreg_group_split <- \n nlme::BodyWeight |> \n # Get rid of some extra attributes added by the nlme package\n as_tibble() |> \n # Convert to an _unordered_ factor\n mutate(Rat = factor(as.character(Rat))) |> \n group_initial_split(group = Rat)\nreg_group_train <- training(reg_group_split)\nreg_group_test <- testing(reg_group_split)\n```\n:::\n\n\nThere are 12 subjects in the training set and 4 in the test set. \n\nIf using the **Apache Spark** engine, we will need to identify the data source, and then use it to create the splits. For this article, we will copy the `concrete` data set into the Spark session.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nsc <- spark_connect(\"local\")\n#> Re-using existing Spark connection to local\n\ntbl_concrete <- copy_to(sc, modeldata::concrete)\n\ntbl_reg <- sdf_random_split(tbl_concrete, training = 0.95, test = 0.05, seed = 100)\n```\n:::\n\n\n## Bagged MARS (`bag_mars()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_spec <- bag_mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(147)\nbag_mars_fit <- bag_mars_spec |> fit(strength ~ ., data = reg_train)\nbag_mars_fit\n#> parsnip model object\n#> \n#> Bagged MARS (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 age 93.1 4.61 11\n#> 2 cement 69.4 4.95 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mars_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 22.4\n#> 2 41.9\n#> 3 26.7\n#> 4 56.6\n#> 5 36.4\n#> 6 36.2\n#> 7 37.8\n#> 8 37.7\n```\n:::\n\n\n:::\n\n## Bagged Neural Networks (`bag_mlp()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model 
specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_spec <- bag_mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(324)\nbag_mlp_fit <- bag_mlp_spec |> fit(strength ~ ., data = reg_train)\nbag_mlp_fit\n#> parsnip model object\n#> \n#> Bagged nnet (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 age 55.9 2.96 11\n#> 2 cement 44.1 2.96 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 19.9\n#> 2 39.1\n#> 3 28.3\n#> 4 68.8\n#> 5 44.1\n#> 6 36.3\n#> 7 40.8\n#> 8 37.0\n```\n:::\n\n\n:::\n\n## Bagged Decision Trees (`bag_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(230)\nbag_tree_fit <- bag_tree_spec |> fit(strength ~ ., data = reg_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged CART (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 cement 16621. 1392. 11\n#> 2 age 12264. 710. 
11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.0\n#> 2 33.0\n#> 3 29.6\n#> 4 54.2\n#> 5 36.2\n#> 6 39.4\n#> 7 40.7\n#> 8 46.5\n```\n:::\n\n\n:::\n\n## Bayesian Additive Regression Trees (`bart()`) \n\n:::{.panel-tabset}\n\n## `dbarts` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_spec <- bart() |>\n # We need to set the mode since this engine works with multiple modes\n # and dbarts is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(134)\nbart_fit <- bart_spec |> fit(strength ~ ., data = reg_train)\nbart_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> `NULL`()\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bart_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.2\n#> 2 40.9\n#> 3 26.0\n#> 4 52.0\n#> 5 36.5\n#> 6 36.7\n#> 7 39.0\n#> 8 37.8\npredict(bart_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 17.0 32.4\n#> 2 33.0 48.9\n#> 3 20.1 31.5\n#> 4 42.0 62.5\n#> 5 28.5 44.5\n#> 6 30.3 42.3\n#> 7 33.1 45.3\n#> 8 26.3 48.8\npredict(bart_fit, type = \"pred_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 5.00 41.8\n#> 2 19.9 60.5\n#> 3 7.37 44.3\n#> 4 32.4 72.1\n#> 5 15.7 56.4\n#> 6 18.9 56.8\n#> 7 21.2 57.2\n#> 8 17.2 58.5\n```\n:::\n\n\n:::\n\n## Boosted Decision Trees (`boost_tree()`) \n\n:::{.panel-tabset}\n\n## `xgboost` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and xgboost is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(748)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> ##### xgb.Booster\n#> raw: 35 Kb \n#> call:\n#> xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, \n#> colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, \n#> subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist, \n#> verbose = 0, nthread = 1, objective = \"reg:squarederror\")\n#> params (as set within xgb.train):\n#> eta = \"0.3\", max_depth = \"6\", gamma = \"0\", colsample_bytree = \"1\", colsample_bynode = \"1\", min_child_weight = \"1\", subsample = \"1\", nthread = \"1\", objective = \"reg:squarederror\", validate_parameters = \"TRUE\"\n#> xgb.attributes:\n#> niter\n#> callbacks:\n#> cb.evaluation.log()\n#> # of features: 2 \n#> niter: 15\n#> nfeatures : 2 \n#> evaluation_log:\n#> iter training_rmse\n#> \n#> 1 27.511751\n#> 2 20.726236\n#> --- ---\n#> 14 2.774394\n#> 15 2.632224\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 
1\n#> .pred\n#> \n#> 1 22.3\n#> 2 32.9\n#> 3 26.7\n#> 4 57.6\n#> 5 34.9\n#> 6 33.8\n#> 7 42.6\n#> 8 26.3\n```\n:::\n\n\n## `catboost` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"catboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(557)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> CatBoost model (1000 trees)\n#> Loss function: RMSE\n#> Fit to 2 feature(s)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 26.6\n#> 2 33.9\n#> 3 27.8\n#> 4 60.6\n#> 5 34.7\n#> 6 36.3\n#> 7 43.6\n#> 8 29.3\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(720)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: gbm\n#> Model ID: GBM_model_R_1763571327438_5392 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 20472 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 14 43 27.92000\n#> \n#> \n#> H2ORegressionMetrics: gbm\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.001563879\n#> RMSE: 0.03954591\n#> MAE: 0.02903684\n#> RMSLE: 0.001771464\n#> Mean Residual Deviance : 0.001563879\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.7\n#> 2 32.2\n#> 3 26.9\n#> 4 63.2\n#> 5 34.9\n#> 6 39.0\n#> 7 40.0\n#> 8 32.9\n```\n:::\n\n\n## `h2o_gbm` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o_gbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(90)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: gbm\n#> Model ID: GBM_model_R_1763571327438_5393 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 20473 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 14 43 27.92000\n#> \n#> \n#> H2ORegressionMetrics: gbm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.001563879\n#> RMSE: 0.03954591\n#> MAE: 0.02903684\n#> RMSLE: 0.001771464\n#> Mean Residual Deviance : 0.001563879\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.7\n#> 2 32.2\n#> 3 26.9\n#> 4 63.2\n#> 5 34.9\n#> 6 39.0\n#> 7 40.0\n#> 8 32.9\n```\n:::\n\n\n## `lightgbm` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"lightgbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(570)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> LightGBM Model (100 trees)\n#> Objective: regression\n#> Fitted to dataset with 2 columns\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 20.6\n#> 2 42.5\n#> 3 27.0\n#> 4 49.2\n#> 5 43.7\n#> 6 38.3\n#> 7 41.1\n#> 8 36.9\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n set_mode(\"regression\") |>\n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for 
reproducibility: \nset.seed(620)\nboost_tree_fit <- boost_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> GBTRegressionModel: uid=gradient_boosted_trees__1965cfeb_e7de_44f1_a524_4ebd0e873064, numTrees=20, numFeatures=8\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 20.8 \n#> 2 28.1 \n#> 3 15.5 \n#> 4 22.4 \n#> 5 9.37\n#> 6 40.1 \n#> 7 14.2 \n#> 8 32.1 \n#> 9 37.4 \n#> 10 49.5 \n#> # ℹ more rows\n```\n:::\n\n\n:::\n\n## Cubist Rules (`cubist_rules()`) \n\n:::{.panel-tabset}\n\n## `Cubist` \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and Cubist is the default engine so there is no need to set that either.\ncubist_rules_spec <- cubist_rules()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(188)\ncubist_rules_fit <- cubist_rules_spec |> fit(strength ~ ., data = reg_train)\ncubist_rules_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> cubist.default(x = x, y = y, committees = 1)\n#> \n#> Number of samples: 92 \n#> Number of predictors: 2 \n#> \n#> Number of committees: 1 \n#> Number of rules: 2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(cubist_rules_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.2\n#> 2 46.3\n#> 3 23.6\n#> 4 54.4\n#> 5 32.7\n#> 6 37.8\n#> 7 38.8\n#> 8 38.6\n```\n:::\n\n\n:::\n\n## Decision Tree (`decision_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> n= 92 \n#> \n#> node), split, n, deviance, yval\n#> * denotes terminal node\n#> \n#> 1) root 92 26564.7400 33.57728 \n#> 2) cement< 0.7861846 69 12009.9000 27.81493 \n#> 4) age< -0.5419541 23 964.6417 14.42348 \n#> 8) cement< -0.3695209 12 292.7811 11.14083 *\n#> 9) cement>=-0.3695209 11 401.4871 18.00455 *\n#> 5) age>=-0.5419541 46 4858.3440 34.51065 \n#> 10) age< 0.008934354 32 2208.3040 31.16781 \n#> 20) cement< 0.311975 24 1450.6200 28.75583 *\n#> 21) cement>=0.311975 8 199.1900 38.40375 *\n#> 11) age>=0.008934354 14 1475.1130 42.15143 *\n#> 3) cement>=0.7861846 23 5390.3320 50.86435 \n#> 6) age< -0.5419541 7 390.4204 40.08429 *\n#> 7) age>=-0.5419541 16 3830.5510 55.58062 *\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = 
reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 18.0\n#> 2 42.2\n#> 3 28.8\n#> 4 55.6\n#> 5 40.1\n#> 6 38.4\n#> 7 38.4\n#> 8 40.1\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> strength ~ cement + age\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] cement <= 0.72078\n#> | | [3] age <= -0.60316\n#> | | | [4] cement <= -0.38732: 11.141 (n = 12, err = 292.8)\n#> | | | [5] cement > -0.38732: 18.005 (n = 11, err = 401.5)\n#> | | [6] age > -0.60316\n#> | | | [7] cement <= 0.24945\n#> | | | | [8] age <= -0.2359: 28.756 (n = 24, err = 1450.6)\n#> | | | | [9] age > -0.2359: 39.014 (n = 11, err = 634.8)\n#> | | | [10] cement > 0.24945: 42.564 (n = 11, err = 1041.7)\n#> | [11] cement > 0.72078: 50.864 (n = 23, err = 5390.3)\n#> \n#> Number of inner nodes: 5\n#> Number of terminal nodes: 6\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 18.0\n#> 2 39.0\n#> 3 28.8\n#> 4 50.9\n#> 5 50.9\n#> 6 42.6\n#> 7 42.6\n#> 8 50.9\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n set_mode(\"regression\") |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\ndecision_tree_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = tbl_reg$test)\n```\n:::\n\n\n:::\n\n## Generalized Additive Models (`gen_additive_mod()`) \n\n:::{.panel-tabset}\n\n## `mgcv` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_spec <- gen_additive_mod() |>\n # We need to set the mode since this engine works with multiple modes\n # and mgcv is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_fit <- \n gen_additive_mod_spec |> \n fit(strength ~ s(age) + s(cement), data = reg_train)\ngen_additive_mod_fit\n#> parsnip model object\n#> \n#> \n#> Family: gaussian \n#> Link function: identity \n#> \n#> Formula:\n#> strength ~ s(age) + s(cement)\n#> \n#> Estimated degrees of freedom:\n#> 4.18 3.56 total = 8.74 \n#> \n#> GCV score: 108.4401\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(gen_additive_mod_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.1\n#> 2 41.2\n#> 3 
26.7\n#> 4 55.9\n#> 5 35.2\n#> 6 37.1\n#> 7 38.5\n#> 8 39.6\npredict(gen_additive_mod_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 18.9 27.4\n#> 2 35.7 46.6\n#> 3 22.4 31.0\n#> 4 47.0 64.7\n#> 5 30.1 40.4\n#> 6 32.9 41.2\n#> 7 34.3 42.6\n#> 8 30.3 49.0\n```\n:::\n\n\n:::\n\n## Linear Regression (`linear_reg()`) \n\n:::{.panel-tabset}\n\n## `lm` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and lm is the default engine so there is no need to set that either.\nlinear_reg_spec <- linear_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = strength ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) cement age \n#> 33.577 8.795 5.471\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 28.8 35.4\n#> 2 27.1 33.5\n#> 3 17.3 25.9\n#> 4 44.6 58.1\n#> 5 35.6 45.0\n#> 6 32.3 38.3\n#> 7 33.2 39.4\n#> 8 41.6 56.0\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 5.72 58.5\n#> 2 3.89 56.7\n#> 3 -4.94 48.2\n#> 4 24.3 78.5\n#> 5 13.7 67.0\n#> 6 8.95 61.7\n#> 7 9.89 62.7\n#> 8 21.6 76.0\n```\n:::\n
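\n\nAs with the classification models, these prediction tibbles can be scored with yardstick. A minimal sketch using the root mean squared error (`rmse()` is just one of several regression metrics we could compute):\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Combine the numeric predictions with the true outcome, then compute a metric:\npredict(linear_reg_fit, new_data = reg_test) |>\n bind_cols(reg_test |> select(strength)) |>\n rmse(truth = strength, estimate = .pred)\n```\n:::\n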
\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(1)\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear regression\n#> \n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 \n#> batch size: 83 \n#> scaled validation loss after 1 epoch: 235\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.1\n#> 3 21.6\n#> 4 51.2\n#> 5 40.3\n#> 6 35.2\n#> 7 36.2\n#> 8 48.7\n```\n:::\n\n\n## `gee` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- \n linear_reg_spec |> \n fit(weight ~ Time + Diet + id_var(Rat), data = reg_group_train)\n#> Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27\n#> running glm to get initial regression estimate\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA\n#> gee S-function, version 4.13 modified 98/01/27 (1998) \n#> \n#> Model:\n#> Link: Identity \n#> Variance to Mean Relation: Gaussian \n#> Correlation Structure: Independent \n#> \n#> Call:\n#> gee::gee(formula = weight ~ Time + Diet, id = data$Rat, data = data, \n#> family = gaussian)\n#> \n#> Number of observations : 132 \n#> \n#> Maximum cluster size : 11 \n#> \n#> \n#> Coefficients:\n#> (Intercept) Time Diet2 Diet3 \n#> 245.410439 0.549192 185.621212 259.287879 \n#> \n#> Estimated Scale Parameter: 272.1604\n#> Number of Iterations: 1\n#> \n#> Working Correlation[1:4,1:4]\n#> [,1] [,2] [,3] [,4]\n#> [1,] 1 0 0 0\n#> [2,] 0 1 0 0\n#> [3,] 0 0 1 0\n#> [4,] 0 0 0 1\n#> \n#> \n#> Returned Error Value:\n#> [1] 0\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `glm` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = strength ~ ., family = stats::gaussian, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) cement age \n#> 33.577 8.795 5.471 \n#> \n#> Degrees of Freedom: 91 Total (i.e. 
Null); 89 Residual\n#> Null Deviance:\t 26560 \n#> Residual Deviance: 15480 \tAIC: 740.6\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 28.8 35.4\n#> 2 27.1 33.5\n#> 3 17.3 25.9\n#> 4 44.6 58.1\n#> 5 35.6 45.0\n#> 6 32.3 38.3\n#> 7 33.2 39.4\n#> 8 41.6 56.0\n```\n:::\n\n\n## `glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- \n linear_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\n#> Warning in lme4::glmer(formula = weight ~ Diet + Time + (1 | Rat), data = data,\n#> : calling glmer() with family=gaussian (identity link) as a shortcut to lmer()\n#> is deprecated; please call lmer() directly\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear mixed model fit by REML ['lmerMod']\n#> Formula: weight ~ Diet + Time + (1 | Rat)\n#> Data: data\n#> REML criterion at convergence: 955.6549\n#> Random effects:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 16.331 \n#> Residual 8.117 \n#> Number of obs: 132, groups: Rat, 12\n#> Fixed Effects:\n#> (Intercept) Diet2 Diet3 Time \n#> 245.4104 185.6212 259.2879 0.5492\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `glmnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"gaussian\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 9.5680\n#> 2 1 5.38 8.7180\n#> 3 1 9.85 7.9430\n#> 4 1 13.56 7.2380\n#> 5 1 16.64 6.5950\n#> 6 2 19.99 6.0090\n#> 7 2 23.68 5.4750\n#> 8 2 26.75 4.9890\n#> 9 2 29.29 4.5450\n#> 10 2 31.40 4.1420\n#> 11 2 33.15 3.7740\n#> 12 2 34.61 3.4380\n#> 13 2 35.82 3.1330\n#> 14 2 36.82 2.8550\n#> 15 2 37.65 2.6010\n#> 16 2 38.34 2.3700\n#> 17 2 38.92 2.1590\n#> 18 2 39.39 1.9680\n#> 19 2 39.79 1.7930\n#> 20 2 40.12 1.6340\n#> 21 2 40.39 1.4880\n#> 22 2 40.62 1.3560\n#> 23 2 40.80 1.2360\n#> 24 2 40.96 1.1260\n#> 25 2 41.09 1.0260\n#> 26 2 41.20 0.9348\n#> 27 2 41.29 0.8517\n#> 28 2 41.36 0.7761\n#> 29 2 41.42 0.7071\n#> 30 2 41.47 0.6443\n#> 31 2 41.52 0.5871\n#> 32 2 41.55 
0.5349\n#> 33 2 41.58 0.4874\n#> 34 2 41.60 0.4441\n#> 35 2 41.63 0.4046\n#> 36 2 41.64 0.3687\n#> 37 2 41.66 0.3359\n#> 38 2 41.67 0.3061\n#> 39 2 41.68 0.2789\n#> 40 2 41.68 0.2541\n#> 41 2 41.69 0.2316\n#> 42 2 41.70 0.2110\n#> 43 2 41.70 0.1922\n#> 44 2 41.71 0.1752\n#> 45 2 41.71 0.1596\n#> 46 2 41.71 0.1454\n#> 47 2 41.71 0.1325\n#> 48 2 41.71 0.1207\n#> 49 2 41.72 0.1100\n#> 50 2 41.72 0.1002\n#> 51 2 41.72 0.0913\n#> 52 2 41.72 0.0832\n#> 53 2 41.72 0.0758\n#> 54 2 41.72 0.0691\n#> 55 2 41.72 0.0630\n#> 56 2 41.72 0.0574\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.2\n#> 2 30.3\n#> 3 21.7\n#> 4 51.3\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.7\n```\n:::\n\n\n## `gls` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n # Also, nlme::gls() specifies the random effects outside of the formula so\n # we set that as an engine parameter\n set_engine(\"gls\", correlation = nlme::corCompSymm(form = ~Time|Rat))\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(weight ~ Time + Diet, data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Generalized least squares fit by REML\n#> Model: weight ~ Time + Diet \n#> Data: data \n#> Log-restricted-likelihood: -477.8274\n#> \n#> Coefficients:\n#> (Intercept) Time Diet2 Diet3 \n#> 245.410439 0.549192 185.621212 259.287879 \n#> \n#> Correlation Structure: Compound symmetry\n#> Formula: ~Time | Rat \n#> Parameter estimate(s):\n#> Rho \n#> 0.8019221 \n#> Degrees of freedom: 132 total; 128 residual\n#> Residual standard error: 18.23695\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: glm\n#> Model ID: GLM_model_R_1763571327438_5394 \n#> GLM Model: summary\n#> family link regularization\n#> 1 gaussian identity Elastic Net (alpha = 0.5, lambda = 0.01903 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 1\n#> training_frame\n#> 1 object_ujvnjgioue\n#> \n#> Coefficients: glm coefficients\n#> names coefficients 
standardized_coefficients\n#> 1 Intercept 33.577283 33.577283\n#> 2 cement 8.708461 8.708461\n#> 3 age 5.422201 5.422201\n#> \n#> H2ORegressionMetrics: glm\n#> ** Reported on training data. **\n#> \n#> MSE: 168.2822\n#> RMSE: 12.97236\n#> MAE: 10.62672\n#> RMSLE: 0.4645554\n#> Mean Residual Deviance : 168.2822\n#> R^2 : 0.4171988\n#> Null Deviance :26564.74\n#> Null D.o.F. :91\n#> Residual Deviance :15481.96\n#> Residual D.o.F. :89\n#> AIC :740.6438\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.7\n#> 4 51.2\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.7\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(596)\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Model: \"sequential_3\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense_6 (Dense) (None, 1) 3 \n#> dense_7 (Dense) (None, 1) 2 \n#> ================================================================================\n#> Total params: 5 (20.00 Byte)\n#> Trainable params: 5 (20.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> ________________________________________________________________________________\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> 1/1 - 0s - 41ms/epoch - 41ms/step\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 0.157 \n#> 2 -0.000594\n#> 3 -0.0677 \n#> 4 0.414 \n#> 5 0.290 \n#> 6 0.154 \n#> 7 0.170 \n#> 8 0.443\n```\n:::\n\n\n## `lme` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that. \n # nlme::lme() makes us set the random effects outside of the formula so we\n # add it as an engine parameter. 
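\n # Here, `~ Time | Rat` fits a random intercept and a random slope for Time\n # within each rat; `random = ~ 1 | Rat` would fit random intercepts only.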
\n set_engine(\"lme\", random = ~ Time | Rat)\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(weight ~ Diet + Time, data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear mixed-effects model fit by REML\n#> Data: data \n#> Log-restricted-likelihood: -426.5662\n#> Fixed: weight ~ Diet + Time \n#> (Intercept) Diet2 Diet3 Time \n#> 240.483603 199.723140 264.893298 0.549192 \n#> \n#> Random effects:\n#> Formula: ~Time | Rat\n#> Structure: General positive-definite, Log-Cholesky parametrization\n#> StdDev Corr \n#> (Intercept) 25.2657397 (Intr)\n#> Time 0.3411097 -0.816\n#> Residual 4.5940697 \n#> \n#> Number of Observations: 132\n#> Number of Groups: 12\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 241.\n#> 2 245.\n#> 3 249.\n#> 4 253.\n#> 5 256.\n#> 6 260.\n#> 7 264.\n#> 8 265.\n#> 9 268.\n#> 10 272.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `lmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"lmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- \n linear_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear mixed model fit by REML ['lmerMod']\n#> Formula: weight ~ Diet + Time + (1 | Rat)\n#> Data: data\n#> REML criterion at convergence: 955.6549\n#> Random effects:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 16.331 \n#> Residual 8.117 \n#> Number of obs: 132, groups: Rat, 12\n#> Fixed Effects:\n#> (Intercept) Diet2 Diet3 Time \n#> 245.4104 185.6212 259.2879 0.5492\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `stan` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(357)\nlinear_reg_fit <- linear_reg_spec |> fit(weight ~ Diet + Time, data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> stan_glm\n#> family: gaussian [identity]\n#> formula: weight ~ Diet + Time\n#> observations: 132\n#> predictors: 4\n#> ------\n#> Median MAD_SD\n#> (Intercept) 245.3 3.3 \n#> Diet2 185.6 3.6 \n#> Diet3 259.3 3.4 \n#> Time 0.6 0.1 \n#> \n#> Auxiliary parameter(s):\n#> Median MAD_SD\n#> sigma 16.6 1.0 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see 
?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 240. 252.\n#> 2 244. 255.\n#> 3 249. 258.\n#> 4 253. 262.\n#> 5 257. 265.\n#> 6 261. 269.\n#> 7 265. 273.\n#> 8 265. 274.\n#> 9 268. 278.\n#> 10 271. 282.\n#> # ℹ 34 more rows\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 213. 278.\n#> 2 216. 282.\n#> 3 220. 287.\n#> 4 224. 290.\n#> 5 228. 292.\n#> 6 230. 297.\n#> 7 236. 301.\n#> 8 236. 302.\n#> 9 240. 305.\n#> 10 244. 310.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `stan_glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(895)\nlinear_reg_fit <- \n linear_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> stan_glmer\n#> family: gaussian [identity]\n#> formula: weight ~ Diet + Time + (1 | Rat)\n#> observations: 132\n#> ------\n#> Median MAD_SD\n#> (Intercept) 245.6 6.8 \n#> Diet2 185.7 11.5 \n#> Diet3 259.2 11.5 \n#> Time 0.5 0.0 \n#> \n#> Auxiliary parameter(s):\n#> Median MAD_SD\n#> sigma 8.2 0.5 \n#> \n#> Error terms:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 17.2 \n#> Residual 8.2 \n#> Num. levels: Rat 12 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 258.\n#> 5 262.\n#> 6 266.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 205. 285.\n#> 2 211. 289.\n#> 3 214. 292.\n#> 4 218. 295.\n#> 5 221. 300.\n#> 6 225. 303.\n#> 7 230. 307.\n#> 8 230. 309.\n#> 9 233. 312.\n#> 10 237. 
314.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> Coefficients:\n#> (Intercept) cement blast_furnace_slag fly_ash \n#> -21.80239627 0.12003251 0.10399582 0.08747677 \n#> water superplasticizer coarse_aggregate fine_aggregate \n#> -0.15701342 0.28531613 0.01777782 0.02018358 \n#> age \n#> 0.11678247\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 16.5\n#> 2 19.7\n#> 3 26.1\n#> 4 23.6\n#> 5 24.2\n#> 6 29.1\n#> 7 21.3\n#> 8 24.2\n#> 9 33.9\n#> 10 57.7\n#> # ℹ more rows\n```\n:::\n\n\n:::\n\n## Multivariate Adaptive Regression Splines (`mars()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_spec <- mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_fit <- mars_spec |> fit(strength ~ ., data = reg_train)\nmars_fit\n#> parsnip model object\n#> \n#> Selected 4 of 9 terms, and 2 of 2 predictors\n#> Termination condition: RSq changed by less than 0.001 at 9 terms\n#> Importance: age, cement\n#> Number of terms at each degree of interaction: 1 3 (additive model)\n#> GCV 113.532 RSS 8915.965 GRSq 0.6153128 RSq 0.6643684\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mars_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 22.0\n#> 2 43.1\n#> 3 28.1\n#> 4 58.0\n#> 5 33.8\n#> 6 34.9\n#> 7 36.3\n#> 8 43.5\n```\n:::\n\n\n:::\n\n## Neural Networks (`mlp()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(159)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> a 2-5-1 network with 21 weights\n#> inputs: cement age \n#> output(s): strength \n#> options were - linear output units\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 14.8\n#> 2 38.5\n#> 3 32.0\n#> 4 63.6\n#> 5 43.5\n#> 6 42.7\n#> 7 42.3\n#> 8 33.1\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() 
|>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(407)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> relu activation,\n#> 3 hidden units,\n#> 13 model parameters\n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 83 \n#> learn rate: 0.01 \n#> scaled validation loss after 9 epochs: 0.189\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.1\n#> 2 39.4\n#> 3 26.9\n#> 4 56.4\n#> 5 32.9\n#> 6 37.2\n#> 7 38.4\n#> 8 40.1\n```\n:::\n\n\n## `brulee_two_layer` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"brulee_two_layer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(585)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> c(relu,relu) activation,\n#> c(3,3) hidden units,\n#> 25 model parameters\n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 83 \n#> learn rate: 0.01 \n#> scaled validation loss after 3 epochs: 0.379\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.5\n#> 2 32.6\n#> 3 24.6\n#> 4 50.5\n#> 5 46.7\n#> 6 33.8\n#> 7 37.0\n#> 8 50.5\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(93)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: deeplearning\n#> Model ID: DeepLearning_model_R_1763571327438_5395 \n#> Status of Neuron Layers: predicting .outcome, regression, gaussian distribution, Quadratic loss, 801 weights/biases, 14.5 KB, 920 training samples, mini-batch size 1\n#> layer units type dropout l1 l2 mean_rate rate_rms momentum\n#> 1 1 2 Input 0.00 % NA NA NA NA NA\n#> 2 2 200 Rectifier 0.00 % 0.000000 0.000000 0.012666 0.031575 0.000000\n#> 3 3 1 Linear NA 0.000000 0.000000 0.000613 0.000166 0.000000\n#> mean_weight weight_rms mean_bias bias_rms\n#> 1 NA NA NA NA\n#> 2 -0.003107 0.098394 0.499664 0.001157\n#> 3 
-0.000248 0.098163 0.000245 0.000000\n#> \n#> \n#> H2ORegressionMetrics: deeplearning\n#> ** Reported on training data. **\n#> ** Metrics reported on full training frame **\n#> \n#> MSE: 173.8723\n#> RMSE: 13.18606\n#> MAE: 10.40789\n#> RMSLE: 0.48563\n#> Mean Residual Deviance : 173.8723\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 31.1\n#> 2 31.6\n#> 3 25.1\n#> 4 44.1\n#> 5 36.3\n#> 6 33.9\n#> 7 34.5\n#> 8 41.5\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(879)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> 1/1 - 0s - 42ms/epoch - 42ms/step\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 -0.386\n#> 2 -0.337\n#> 3 -0.299\n#> 4 -0.279\n#> 5 -0.385\n#> 6 -0.374\n#> 7 -0.373\n#> 8 -0.342\n```\n:::\n\n\n:::\n\n## K-Nearest Neighbors (`nearest_neighbor()`) \n\n:::{.panel-tabset}\n\n## `kknn` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_spec <- nearest_neighbor() |>\n # We need to set the mode since this engine works with multiple modes\n # and kknn is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_fit <- nearest_neighbor_spec |> fit(strength ~ ., data = reg_train)\nnearest_neighbor_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> kknn::train.kknn(formula = strength ~ ., data = data, ks = min_rows(5, data, 5))\n#> \n#> Type of response variable: continuous\n#> minimal mean absolute error: 8.257735\n#> Minimal mean squared error: 115.8737\n#> Best kernel: optimal\n#> Best k: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(nearest_neighbor_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 16.3\n#> 2 35.7\n#> 3 27.5\n#> 4 56.7\n#> 5 42.6\n#> 6 41.7\n#> 7 41.2\n#> 8 50.2\n```\n:::\n\n\n:::\n\n## Null Model (`null_model()`) \n\n:::{.panel-tabset}\n\n## `parsnip` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_spec <- null_model() |>\n # We need to set the mode since this engine works with multiple modes\n # and parsnip is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_fit <- null_model_spec |> fit(strength ~ ., data = reg_train)\nnull_model_fit\n#> parsnip model object\n#> \n#> Null Classification Model\n#> Predicted Value: 33.57728\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(null_model_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 33.6\n#> 2 33.6\n#> 3 33.6\n#> 4 33.6\n#> 5 33.6\n#> 6 33.6\n#> 7 33.6\n#> 8 33.6\n```\n:::\n\n\n:::\n\n## Partial Least Squares (`pls()`) \n\n:::{.panel-tabset}\n\n## `mixOmics` \n\nThis engine requires the plsmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(plsmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_spec <- pls() |>\n # We need to set the mode since this engine works with multiple modes\n # and mixOmics is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_fit <- pls_spec |> fit(strength ~ ., data = reg_train)\npls_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> mixOmics::spls(X = x, Y = y, ncomp = ncomp, keepX = keepX) \n#> \n#> sPLS with a 'regression' mode with 2 sPLS components. \n#> You entered data X of dimensions: 92 2 \n#> You entered data Y of dimensions: 92 1 \n#> \n#> Selection of [2] [2] variables on each of the sPLS components on the X data set. \n#> Selection of [1] [1] variables on each of the sPLS components on the Y data set. \n#> \n#> Main numerical outputs: \n#> -------------------- \n#> loading vectors: see object$loadings \n#> variates: see object$variates \n#> variable names: see object$names \n#> \n#> Functions to visualise samples: \n#> -------------------- \n#> plotIndiv, plotArrow \n#> \n#> Functions to visualise variables: \n#> -------------------- \n#> plotVar, plotLoadings, network, cim\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(pls_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\n```\n:::\n\n\n:::\n\n## Poisson Reg (`poisson_reg()`) \n\n:::{.panel-tabset}\n\n## `glm` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and glm is the default engine so there is no need to set that either.\npoisson_reg_spec <- poisson_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = num_years ~ ., family = stats::poisson, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) age income \n#> 2.2861 0.2804 0.2822 \n#> \n#> Degrees of Freedom: 1460 Total (i.e. 
Null); 1458 Residual\n#> Null Deviance:\t 7434 \n#> Residual Deviance: 2597 \tAIC: 8446\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.6 \n#> 2 6.66\n#> 3 11.8 \n#> 4 24.8 \n#> 5 26.6 \n#> 6 8.23\n#> 7 32.1 \n#> 8 4.86\n#> 9 28.3\n```\n:::\n\n\n## `gee` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- \n poisson_reg_spec |> \n fit(weight ~ Diet + Time + id_var(Rat), data = reg_group_train)\n#> Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27\n#> running glm to get initial regression estimate\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA\n#> gee S-function, version 4.13 modified 98/01/27 (1998) \n#> \n#> Model:\n#> Link: Logarithm \n#> Variance to Mean Relation: Poisson \n#> Correlation Structure: Independent \n#> \n#> Call:\n#> gee::gee(formula = weight ~ Diet + Time, id = data$Rat, data = data, \n#> family = stats::poisson)\n#> \n#> Number of observations : 132 \n#> \n#> Maximum cluster size : 11 \n#> \n#> \n#> Coefficients:\n#> (Intercept) Diet2 Diet3 Time \n#> 5.525683187 0.532717136 0.684495610 0.001467487 \n#> \n#> Estimated Scale Parameter: 0.6879328\n#> Number of Iterations: 1\n#> \n#> Working Correlation[1:4,1:4]\n#> [,1] [,2] [,3] [,4]\n#> [1,] 1 0 0 0\n#> [2,] 0 1 0 0\n#> [3,] 0 0 1 0\n#> [4,] 0 0 0 1\n#> \n#> \n#> Returned Error Value:\n#> [1] 0\n```\n:::\n\n\nIn principle, the holdout data can be predicted in the same way. However, we could not reproduce predictions from this engine when this article was built, so the call is shown but not evaluated:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Not evaluated; see the note above:\n# predict(poisson_reg_fit, new_data = reg_group_test)\n```\n:::\n\n\n## `glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(826)\npoisson_reg_fit <- \n poisson_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\n#> Warning in checkConv(attr(opt, \"derivs\"), opt$par, ctrl = control$checkConv, :\n#> Model failed to converge with max|grad| = 0.00394285 (tol = 0.002, component 1)\n#> Warning in checkConv(attr(opt, \"derivs\"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue\n#> - Rescale variables?\npoisson_reg_fit\n#> parsnip model object\n#> \n#> Generalized linear mixed model fit by maximum likelihood (Laplace\n#> Approximation) [glmerMod]\n#> Family: poisson ( log )\n#> Formula: weight ~ Diet + Time + (1 | Rat)\n#> Data: data\n#> AIC BIC logLik -2*log(L) df.resid 
\n#> 1079.1349 1093.5489 -534.5675 1069.1349 127 \n#> Random effects:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 0.03683 \n#> Number of obs: 132, groups: Rat, 12\n#> Fixed Effects:\n#> (Intercept) Diet2 Diet3 Time \n#> 5.524796 0.533446 0.684637 0.001467 \n#> optimizer (Nelder_Mead) convergence code: 0 (OK) ; 0 optimizer warnings; 2 lme4 warnings\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 251.\n#> 2 254.\n#> 3 256.\n#> 4 259.\n#> 5 262.\n#> 6 264.\n#> 7 267.\n#> 8 268.\n#> 9 270.\n#> 10 273.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `glmnet` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"poisson\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 5.9710\n#> 2 1 10.26 5.4400\n#> 3 1 18.31 4.9570\n#> 4 2 24.84 4.5170\n#> 5 2 32.06 4.1150\n#> 6 2 37.94 3.7500\n#> 7 2 42.73 3.4170\n#> 8 2 46.65 3.1130\n#> 9 2 49.87 2.8370\n#> 10 2 52.51 2.5850\n#> 11 2 54.69 2.3550\n#> 12 2 56.48 2.1460\n#> 13 2 57.96 1.9550\n#> 14 2 59.18 1.7810\n#> 15 2 60.19 1.6230\n#> 16 2 61.03 1.4790\n#> 17 2 61.72 1.3480\n#> 18 2 62.29 1.2280\n#> 19 2 62.76 1.1190\n#> 20 2 63.16 1.0190\n#> 21 2 63.48 0.9289\n#> 22 2 63.75 0.8463\n#> 23 2 63.98 0.7712\n#> 24 2 64.16 0.7026\n#> 25 2 64.31 0.6402\n#> 26 2 64.44 0.5833\n#> 27 2 64.55 0.5315\n#> 28 2 64.64 0.4843\n#> 29 2 64.71 0.4413\n#> 30 2 64.77 0.4021\n#> 31 2 64.82 0.3664\n#> 32 2 64.86 0.3338\n#> 33 2 64.90 0.3042\n#> 34 2 64.92 0.2771\n#> 35 2 64.95 0.2525\n#> 36 2 64.97 0.2301\n#> 37 2 64.98 0.2096\n#> 38 2 65.00 0.1910\n#> 39 2 65.01 0.1741\n#> 40 2 65.02 0.1586\n#> 41 2 65.03 0.1445\n#> 42 2 65.03 0.1317\n#> 43 2 65.04 0.1200\n#> 44 2 65.04 0.1093\n#> 45 2 65.05 0.0996\n#> 46 2 65.05 0.0907\n#> 47 2 65.05 0.0827\n#> 48 2 65.05 0.0753\n#> 49 2 65.06 0.0687\n#> 50 2 65.06 0.0625\n#> 51 2 65.06 0.0570\n#> 52 2 65.06 0.0519\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.4 \n#> 2 6.70\n#> 3 11.8 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.27\n#> 7 31.8 \n#> 8 4.91\n#> 9 28.1\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = 
count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: glm\n#> Model ID: GLM_model_R_1763571327438_5396 \n#> GLM Model: summary\n#> family link regularization\n#> 1 poisson log Elastic Net (alpha = 0.5, lambda = 0.01194 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 4\n#> training_frame\n#> 1 object_kyirzmfbti\n#> \n#> Coefficients: glm coefficients\n#> names coefficients standardized_coefficients\n#> 1 Intercept 2.286411 2.286411\n#> 2 age 0.279967 0.279967\n#> 3 income 0.281952 0.281952\n#> \n#> H2ORegressionMetrics: glm\n#> ** Reported on training data. **\n#> \n#> MSE: 18.40519\n#> RMSE: 4.290128\n#> MAE: 3.297048\n#> RMSLE: 0.467537\n#> Mean Residual Deviance : 1.777749\n#> R^2 : 0.6934292\n#> Null Deviance :7434.374\n#> Null D.o.F. :1460\n#> Residual Deviance :2597.291\n#> Residual D.o.F. :1458\n#> AIC :8445.967\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.6 \n#> 2 6.67\n#> 3 11.8 \n#> 4 24.8 \n#> 5 26.5 \n#> 6 8.24\n#> 7 32.0 \n#> 8 4.87\n#> 9 28.2\n```\n:::\n\n\n## `hurdle` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"hurdle\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> pscl::hurdle(formula = num_years ~ ., data = data)\n#> \n#> Count model coefficients (truncated poisson with log link):\n#> (Intercept) age income \n#> 2.2911 0.2749 0.2820 \n#> \n#> Zero hurdle model coefficients (binomial with logit link):\n#> (Intercept) age income \n#> 24.656 5.611 13.092\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.5 \n#> 2 6.74\n#> 3 11.9 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.32\n#> 7 31.9 \n#> 8 4.89\n#> 9 28.2\n```\n:::\n\n\n## `stan` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(213)\npoisson_reg_fit <- \n poisson_reg_spec |> \n fit(weight ~ Diet + Time, data = reg_group_train)\n#> \n#> SAMPLING FOR MODEL 'count' NOW (CHAIN 1).\n#> Chain 1: \n#> Chain 1: Gradient evaluation took 8.9e-05 seconds\n#> Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.89 seconds.\n#> Chain 1: Adjust your expectations accordingly!\n#> Chain 1: \n#> 
Chain 1: \n#> Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)\n#> Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)\n#> Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)\n#> Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)\n#> Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)\n#> Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)\n#> Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)\n#> Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)\n#> Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)\n#> Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)\n#> Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)\n#> Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)\n#> Chain 1: \n#> Chain 1: Elapsed Time: 0.034 seconds (Warm-up)\n#> Chain 1: 0.035 seconds (Sampling)\n#> Chain 1: 0.069 seconds (Total)\n#> Chain 1: \n#> \n#> SAMPLING FOR MODEL 'count' NOW (CHAIN 2).\n#> Chain 2: \n#> Chain 2: Gradient evaluation took 6e-06 seconds\n#> Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.06 seconds.\n#> Chain 2: Adjust your expectations accordingly!\n#> Chain 2: \n#> Chain 2: \n#> Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)\n#> Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)\n#> Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)\n#> Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)\n#> Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)\n#> Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)\n#> Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)\n#> Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)\n#> Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)\n#> Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)\n#> Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)\n#> Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)\n#> Chain 2: \n#> Chain 2: Elapsed Time: 0.035 seconds (Warm-up)\n#> Chain 2: 0.034 seconds (Sampling)\n#> Chain 2: 0.069 seconds (Total)\n#> Chain 2: \n#> \n#> SAMPLING FOR MODEL 'count' NOW (CHAIN 3).\n#> Chain 3: \n#> Chain 3: Gradient evaluation took 5e-06 seconds\n#> Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.05 seconds.\n#> Chain 3: Adjust your expectations accordingly!\n#> Chain 3: \n#> Chain 3: \n#> Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)\n#> Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)\n#> Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)\n#> Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)\n#> Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)\n#> Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)\n#> Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)\n#> Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)\n#> Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)\n#> Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)\n#> Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)\n#> Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)\n#> Chain 3: \n#> Chain 3: Elapsed Time: 0.033 seconds (Warm-up)\n#> Chain 3: 0.035 seconds (Sampling)\n#> Chain 3: 0.068 seconds (Total)\n#> Chain 3: \n#> \n#> SAMPLING FOR MODEL 'count' NOW (CHAIN 4).\n#> Chain 4: \n#> Chain 4: Gradient evaluation took 5e-06 seconds\n#> Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.05 seconds.\n#> Chain 4: Adjust your expectations accordingly!\n#> Chain 4: \n#> Chain 4: \n#> Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)\n#> Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)\n#> Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)\n#> Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)\n#> Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)\n#> Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)\n#> Chain 4: Iteration: 1001 / 
2000 [ 50%] (Sampling)\n#> Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)\n#> Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)\n#> Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)\n#> Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)\n#> Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)\n#> Chain 4: \n#> Chain 4: Elapsed Time: 0.035 seconds (Warm-up)\n#> Chain 4: 0.036 seconds (Sampling)\n#> Chain 4: 0.071 seconds (Total)\n#> Chain 4:\npoisson_reg_fit\n#> parsnip model object\n#> \n#> stan_glm\n#> family: poisson [log]\n#> formula: weight ~ Diet + Time\n#> observations: 132\n#> predictors: 4\n#> ------\n#> Median MAD_SD\n#> (Intercept) 5.5 0.0 \n#> Diet2 0.5 0.0 \n#> Diet3 0.7 0.0 \n#> Time 0.0 0.0 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 5.53\n#> 2 5.54\n#> 3 5.55\n#> 4 5.56\n#> 5 5.57\n#> 6 5.58\n#> 7 5.59\n#> 8 5.59\n#> 9 5.60\n#> 10 5.61\n#> # ℹ 34 more rows\npredict(poisson_reg_fit, type = \"conf_int\", new_data = reg_group_test)\n#> Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 246. 257.\n#> 2 249. 259.\n#> 3 252. 261.\n#> 4 255. 263.\n#> 5 258. 266.\n#> 6 261. 269.\n#> 7 263. 272.\n#> 8 264. 272.\n#> 9 266. 275.\n#> 10 268. 278.\n#> # ℹ 34 more rows\npredict(poisson_reg_fit, type = \"pred_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 220 284\n#> 2 222 286\n#> 3 225 288\n#> 4 228 291\n#> 5 230 296\n#> 6 232 297\n#> 7 235 300\n#> 8 236 300\n#> 9 238 303\n#> 10 241 306\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `stan_glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(690)\npoisson_reg_fit <- \n poisson_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> stan_glmer\n#> family: poisson [log]\n#> formula: weight ~ Diet + Time + (1 | Rat)\n#> observations: 132\n#> ------\n#> Median MAD_SD\n#> (Intercept) 5.5 0.0 \n#> Diet2 0.5 0.0 \n#> Diet3 0.7 0.0 \n#> Time 0.0 0.0 \n#> \n#> Error terms:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 0.054 \n#> Num. 
levels: Rat 12 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 251.\n#> 2 254.\n#> 3 256.\n#> 4 259.\n#> 5 261.\n#> 6 264.\n#> 7 267.\n#> 8 268.\n#> 9 270.\n#> 10 272.\n#> # ℹ 34 more rows\npredict(poisson_reg_fit, type = \"pred_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 210. 294 \n#> 2 213 298 \n#> 3 214 301 \n#> 4 217 304 \n#> 5 220 306 \n#> 6 222 309 \n#> 7 223 313.\n#> 8 225 315 \n#> 9 226 317.\n#> 10 229 320 \n#> # ℹ 34 more rows\n```\n:::\n\n\n## `zeroinfl` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"zeroinfl\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\n#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> pscl::zeroinfl(formula = num_years ~ ., data = data)\n#> \n#> Count model coefficients (poisson with log link):\n#> (Intercept) age income \n#> 2.2912 0.2748 0.2821 \n#> \n#> Zero-inflation model coefficients (binomial with logit link):\n#> (Intercept) age income \n#> -48.26 -18.22 -11.72\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.5 \n#> 2 6.74\n#> 3 11.9 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.31\n#> 7 31.9 \n#> 8 4.93\n#> 9 28.2\n```\n:::\n\n\n:::\n\n## Random Forests (`rand_forest()`) \n\n:::{.panel-tabset}\n\n## `ranger` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n # and ranger is the default engine so there is no need to set that either.\n set_engine(\"ranger\", keep.inbag = TRUE) |> \n # However, we'll set the engine and use the keep.inbag=TRUE option so that we \n # can produce interval predictions. This is not generally required. 
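\n # The standard errors behind those intervals come from a jackknife-style\n # estimator that needs the per-tree inbag counts.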
\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(860)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Ranger result\n#> \n#> Call:\n#> ranger::ranger(x = maybe_data_frame(x), y = y, keep.inbag = ~TRUE, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1)) \n#> \n#> Type: Regression \n#> Number of trees: 500 \n#> Sample size: 92 \n#> Number of independent variables: 2 \n#> Mtry: 1 \n#> Target node size: 5 \n#> Variable importance mode: none \n#> Splitrule: variance \n#> OOB prediction error (MSE): 92.94531 \n#> R squared (OOB): 0.6816071\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.6\n#> 2 36.9\n#> 3 28.4\n#> 4 56.5\n#> 5 38.6\n#> 6 36.5\n#> 7 38.7\n#> 8 34.4\npredict(rand_forest_fit, type = \"conf_int\", new_data = reg_test)\n#> Warning in rInfJack(pred = result$predictions, inbag = inbag.counts, used.trees\n#> = 1:num.trees): Sample size <=20, no calibration performed.\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 18.1 29.1\n#> 2 32.6 41.1\n#> 3 24.0 32.9\n#> 4 45.4 67.7\n#> 5 33.0 44.3\n#> 6 32.0 41.0\n#> 7 35.1 42.3\n#> 8 28.4 40.3\n```\n:::\n\n\n## `aorsf` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(47)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random regression forest\n#> \n#> Linear combinations: Accelerated Linear regression\n#> N observations: 92\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 13.994\n#> Min observations in leaf: 5\n#> OOB stat value: 0.59\n#> OOB stat type: RSQ\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 25.2\n#> 2 36.4\n#> 3 29.7\n#> 4 55.5\n#> 5 42.3\n#> 6 38.5\n#> 7 40.7\n#> 8 52.7\n```\n:::\n\n\n## `grf` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"grf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(130)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n```\n:::\n\n\nThe 
holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\npredict(rand_forest_fit, type = \"conf_int\", new_data = reg_test)\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(211)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: drf\n#> Model ID: DRF_model_R_1763571327438_5397 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 22316 7\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 14 9.04000 14 43 30.86000\n#> \n#> \n#> H2ORegressionMetrics: drf\n#> ** Reported on training data. **\n#> ** Metrics reported on Out-Of-Bag training samples **\n#> \n#> MSE: 89.19785\n#> RMSE: 9.444462\n#> MAE: 7.597463\n#> RMSLE: 0.3303384\n#> Mean Residual Deviance : 89.19785\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.9\n#> 2 36.4\n#> 3 28.1\n#> 4 56.8\n#> 5 39.0\n#> 6 37.8\n#> 7 37.4\n#> 8 31.8\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(981)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\n```\n:::\n\n\nThe print method has a lot of output: \n\n
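<details>\n<summary>Show the model print</summary>\n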
\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncapture.output(print(rand_forest_fit))[1:100] |> cat(sep = \"\\n\")\n#> parsnip model object\n#> \n#> $nodes\n#> $nodes[[1]]\n#> [1] root\n#> | [2] V2 <= 0.31678\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -0.89134 *\n#> | | | [6] V2 > -0.89134 *\n#> | [7] V2 > 0.31678\n#> | | [8] V3 <= -0.60316 *\n#> | | [9] V3 > -0.60316 *\n#> \n#> $nodes[[2]]\n#> [1] root\n#> | [2] V2 <= 0.62459\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -1.16452 *\n#> | | | [6] V2 > -1.16452\n#> | | | | [7] V3 <= -0.2359 *\n#> | | | | [8] V3 > -0.2359 *\n#> | [9] V2 > 0.62459 *\n#> \n#> $nodes[[3]]\n#> [1] root\n#> | [2] V2 <= 0.34564\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -1.19338 *\n#> | | | [6] V2 > -1.19338 *\n#> | [7] V2 > 0.34564\n#> | | [8] V2 <= 1.21134 *\n#> | | [9] V2 > 1.21134 *\n#> \n#> $nodes[[4]]\n#> [1] root\n#> | [2] V2 <= 0.34564\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V3 <= 0.25377 *\n#> | | | [6] V3 > 0.25377 *\n#> | [7] V2 > 0.34564\n#> | | [8] V3 <= -0.60316 *\n#> | | [9] V3 > -0.60316 *\n#> \n#> $nodes[[5]]\n#> [1] root\n#> | [2] V2 <= 0.62459\n#> | | [3] V3 <= -0.48074 *\n#> | | [4] V3 > -0.48074\n#> | | | [5] V2 <= -1.12604 *\n#> | | | [6] V2 > -1.12604\n#> | | | | [7] V3 <= -0.2359 *\n#> | | | | [8] V3 > -0.2359 *\n#> | [9] V2 > 0.62459 *\n#> \n#> $nodes[[6]]\n#> [1] root\n#> | [2] V2 <= 0.72078\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -0.84517 *\n#> | | | [6] V2 > -0.84517 *\n#> | [7] V2 > 0.72078 *\n#> \n#> $nodes[[7]]\n#> [1] root\n#> | [2] V2 <= 0.72078\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V3 <= -0.2359\n#> | | | | [6] V2 <= 0.24945 *\n#> | | | | [7] V2 > 0.24945 *\n#> | | | [8] V3 > -0.2359 *\n#> | [9] V2 > 0.72078 *\n#> \n#> $nodes[[8]]\n#> [1] root\n#> | [2] V2 <= 0.72078\n#> | | [3] V3 <= -0.48074 *\n#> | | [4] V3 > -0.48074\n#> | | | [5] V3 <= -0.2359 *\n#> | | | [6] V3 > -0.2359 *\n#> | [7] V2 > 0.72078 *\n#> \n#> $nodes[[9]]\n#> [1] root\n#> | [2] V2 <= 0.62459\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -0.23149\n#> | | | | [6] V2 <= -1.09526 *\n#> | | | | [7] V2 > -1.09526 *\n#> | | | [8] V2 > -0.23149 *\n#> | [9] V2 > 0.62459 *\n#> \n#> $nodes[[10]]\n#> [1] root\n```\n:::\n\n
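</details>\n\nRather than paging through the printed trees, you can pull the underlying partykit forest out of the parsnip fit and work with it directly. A minimal sketch (output not shown):\n\n```r\n# Extract the engine-level model object from the parsnip wrapper\npk_fit <- extract_fit_engine(rand_forest_fit)\nclass(pk_fit)\n```\n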
\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 16.3\n#> 2 37.7\n#> 3 28.5\n#> 4 50.6\n#> 5 49.2\n#> 6 36.1\n#> 7 38.6\n#> 8 49.7\n```\n:::\n\n\n## `randomForest` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"randomForest\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(793)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> randomForest(x = maybe_data_frame(x), y = y) \n#> Type of random forest: regression\n#> Number of trees: 500\n#> No. of variables tried at each split: 1\n#> \n#> Mean of squared residuals: 90.38475\n#> % Var explained: 68.7\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.5\n#> 2 36.8\n#> 3 28.6\n#> 4 58.0\n#> 5 38.3\n#> 6 35.4\n#> 7 38.1\n#> 8 33.7\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n set_engine(\"spark\") |> \n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(157)\nrand_forest_fit <- rand_forest_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> RandomForestRegressionModel: uid=random_forest__5a153ba4_7b1f_4072_9e7c_6a00b51132e0, numTrees=20, numFeatures=8\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? 
x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 28.2\n#> 2 29.6\n#> 3 23.0\n#> 4 28.2\n#> 5 15.2\n#> 6 35.3\n#> 7 18.6\n#> 8 31.9\n#> 9 36.3\n#> 10 45.4\n#> # ℹ more rows\n```\n:::\n\n\n:::\n\n## Rule Fit (`rule_fit()`) \n\n:::{.panel-tabset}\n\n## `xrf` \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n # and xrf is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(431)\nrule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> An eXtreme RuleFit model of 179 rules.\n#> \n#> Original Formula:\n#> \n#> strength ~ cement + age\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 27.5\n#> 2 32.0\n#> 3 26.5\n#> 4 52.9\n#> 5 35.9\n#> 6 31.8\n#> 7 46.2\n#> 8 30.8\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(236)\nrule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: rulefit\n#> Model ID: RuleFit_model_R_1763571327438_5398 \n#> Rulefit Model Summary: \n#> family link regularization number_of_predictors_total\n#> 1 gaussian identity Lasso (lambda = 0.9516 ) 1917\n#> number_of_active_predictors number_of_iterations rule_ensemble_size\n#> 1 51 1 1915\n#> number_of_trees number_of_internal_trees min_depth max_depth mean_depth\n#> 1 150 150 0 5 4.00000\n#> min_leaves max_leaves mean_leaves\n#> 1 0 28 12.76667\n#> \n#> \n#> H2ORegressionMetrics: rulefit\n#> ** Reported on training data. 
**\n#> \n#> MSE: 90.45501\n#> RMSE: 9.510784\n#> MAE: 7.15224\n#> RMSLE: 0.3531064\n#> Mean Residual Deviance : 90.45501\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 26.9\n#> 2 35.5\n#> 3 26.9\n#> 4 50.1\n#> 5 42.1\n#> 6 34.5\n#> 7 39.3\n#> 8 40.8\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Linear Kernel) (`svm_linear()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"kernlab\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(strength ~ ., data = reg_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Linear (vanilla) kernel function. \n#> \n#> Number of Support Vectors : 85 \n#> \n#> Objective Function Value : -47.4495 \n#> Training error : 0.606701\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.4\n#> 2 30.9\n#> 3 21.7\n#> 4 47.1\n#> 5 36.4\n#> 6 33.4\n#> 7 34.2\n#> 8 43.2\n```\n:::\n\n\n## `LiblineaR` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n # and LiblineaR is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(strength ~ ., data = reg_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized L2-loss support vector regression primal (L2R_L2LOSS_SVR)\"\n#> \n#> $Type\n#> [1] 11\n#> \n#> $W\n#> cement age Bias\n#> [1,] 8.665447 5.486263 33.34299\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 31.9\n#> 2 30.1\n#> 3 21.5\n#> 4 50.9\n#> 5 39.9\n#> 6 35.0\n#> 7 36.0\n#> 8 48.3\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_spec <- svm_poly() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_fit <- svm_poly_spec |> fit(strength ~ ., data = reg_train)\n#> Setting default kernel parameters\nsvm_poly_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr 
(regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Polynomial kernel function. \n#> Hyperparameters : degree = 1 scale = 1 offset = 1 \n#> \n#> Number of Support Vectors : 85 \n#> \n#> Objective Function Value : -47.4495 \n#> Training error : 0.606702\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_poly_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.4\n#> 2 30.9\n#> 3 21.7\n#> 4 47.1\n#> 5 36.4\n#> 6 33.4\n#> 7 34.2\n#> 8 43.2\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(strength ~ ., data = reg_train)\nsvm_rbf_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Gaussian Radial Basis kernel function. \n#> Hyperparameter : sigma = 0.850174270140177 \n#> \n#> Number of Support Vectors : 79 \n#> \n#> Objective Function Value : -33.0277 \n#> Training error : 0.28361\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 20.0\n#> 2 41.3\n#> 3 26.0\n#> 4 53.5\n#> 5 35.2\n#> 6 34.7\n#> 7 36.2\n#> 8 42.3\n```\n:::\n\n\n\n:::\n\n# Censored Regression Models\n\nLet's simulate a data set using the prodlim and survival packages: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(survival)\n#> \n#> Attaching package: 'survival'\n#> The following object is masked from 'package:future':\n#> \n#> cluster\nlibrary(prodlim)\n\nset.seed(1000)\ncns_data <- \n SimSurv(250) |> \n mutate(event_time = Surv(time, event)) |> \n select(event_time, X1, X2)\n\ncns_split <- initial_split(cns_data, prop = 0.98)\ncns_split\n#> \n#> <245/5/250>\ncns_train <- training(cns_split)\ncns_test <- testing(cns_split)\n```\n:::\n\n\nFor some types of predictions, we need the _evaluation time(s)_ for the predictions. 
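These are the times at which quantities such as the survival probability are predicted for each new sample. 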
We'll use these three times to demonstrate: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\neval_times <- c(1, 3, 5)\n```\n:::\n\n\n\n## Bagged Decision Trees (`bag_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"censored regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit <- bag_tree_spec |> fit(event_time ~ ., data = cns_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> \n#> Bagging survival trees with 25 bootstrap replications \n#> \n#> Call: bagging.data.frame(formula = event_time ~ ., data = data, cp = ~0, \n#> minsplit = ~2)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.65\n#> 2 4.12\n#> 3 5.03\n#> 4 5.58\n#> 5 4.88\npredict(bag_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.993\n#> 2 3 0.864\n#> 3 5 0.638\n```\n:::\n\n\n:::\n\n## Boosted Decision Trees (`boost_tree()`) \n\n:::{.panel-tabset}\n\n## `mboost` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"censored regression\") |> \n set_engine(\"mboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(852)\nboost_tree_fit <- boost_tree_spec |> fit(event_time ~ ., data = cns_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> \n#> \t Model-based Boosting\n#> \n#> Call:\n#> mboost::blackboost(formula = formula, data = data, family = family, control = mboost::boost_control(), tree_controls = partykit::ctree_control(teststat = \"quadratic\", testtype = \"Teststatistic\", mincriterion = 0, minsplit = 10, minbucket = 4, maxdepth = 2, saveinfo = FALSE))\n#> \n#> \n#> \t Cox Partial Likelihood \n#> \n#> Loss function: \n#> \n#> Number of boosting iterations: mstop = 100 \n#> Step size: 0.1 \n#> Offset: 0 \n#> Number of baselearners: 1\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 6.51\n#> 2 3.92\n#> 3 4.51\n#> 4 7.17\n#> 5 
4.51\npredict(boost_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(boost_tree_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 0.00839\n#> 2 -1.14 \n#> 3 -0.823 \n#> 4 0.229 \n#> 5 -0.823\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.982\n#> 2 3 0.877\n#> 3 5 0.657\n```\n:::\n\n\n:::\n\n## Decision Tree (`decision_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"censored regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> $rpart\n#> n= 245 \n#> \n#> node), split, n, deviance, yval\n#> * denotes terminal node\n#> \n#> 1) root 245 329.03530 1.0000000 \n#> 2) X2< -0.09937043 110 119.05180 0.5464982 \n#> 4) X2< -0.9419799 41 42.43138 0.3153769 \n#> 8) X1< 0.5 20 12.93725 0.1541742 *\n#> 9) X1>=0.5 21 23.29519 0.5656502 *\n#> 5) X2>=-0.9419799 69 67.76223 0.7336317 *\n#> 3) X2>=-0.09937043 135 157.14990 1.7319010 \n#> 6) X1< 0.5 79 66.30972 1.2572690 *\n#> 7) X1>=0.5 56 69.62652 3.0428230 \n#> 14) X2< 1.222057 44 40.33335 2.5072040 *\n#> 15) X2>=1.222057 12 17.95790 6.3934130 *\n#> \n#> $survfit\n#> \n#> Call: prodlim::prodlim(formula = form, data = data)\n#> Stratified Kaplan-Meier estimator for the conditional event time survival function\n#> Discrete predictor variable: rpartFactor (0.154174164904031, 0.565650228981439, 0.733631734872791, 1.25726850344687, 2.50720371146533, 6.39341334321542)\n#> \n#> Right-censored response of a survival model\n#> \n#> No.Observations: 245 \n#> \n#> Pattern:\n#> Freq\n#> event 161 \n#> right.censored 84 \n#> \n#> $levels\n#> [1] \"0.154174164904031\" \"0.565650228981439\" \"0.733631734872791\"\n#> [4] \"1.25726850344687\" \"2.50720371146533\" \"6.39341334321542\" \n#> \n#> attr(,\"class\")\n#> [1] \"pecRpart\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 1.26\n#> 2 2.51\n#> 3 1.26\n#> 4 1.26\n#> 5 1.26\npredict(decision_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n 
slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.987\n#> 2 3 0.854\n#> 3 5 0.634\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> event_time ~ X1 + X2\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] X2 <= -0.36159\n#> | | [3] X1 <= 0: 13.804 (n = 41)\n#> | | [4] X1 > 0: 8.073 (n = 47)\n#> | [5] X2 > -0.36159\n#> | | [6] X1 <= 0: 6.274 (n = 89)\n#> | | [7] X1 > 0\n#> | | | [8] X2 <= 0.56098: 5.111 (n = 39)\n#> | | | [9] X2 > 0.56098: 2.713 (n = 29)\n#> \n#> Number of inner nodes: 4\n#> Number of terminal nodes: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 6.27\n#> 2 5.11\n#> 3 6.27\n#> 4 6.27\n#> 5 6.27\npredict(decision_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.989\n#> 2 3 0.871\n#> 3 5 0.649\n```\n:::\n\n\n:::\n\n## Proportional Hazards (`proportional_hazards()`) \n\n:::{.panel-tabset}\n\n## `survival` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and survival is the default engine so there is no need to set that either.\nproportional_hazards_spec <- proportional_hazards()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train)\nproportional_hazards_fit\n#> parsnip model object\n#> \n#> Call:\n#> survival::coxph(formula = event_time ~ ., data = data, model = TRUE, \n#> x = TRUE)\n#> \n#> coef exp(coef) se(coef) z p\n#> X1 0.99547 2.70599 0.16799 5.926 3.11e-09\n#> X2 0.91398 2.49422 0.09566 9.555 < 2e-16\n#> \n#> Likelihood ratio test=106.8 on 2 df, p=< 2.2e-16\n#> n= 245, number of events= 161\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(proportional_hazards_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.87\n#> 2 
4.16\n#> 3 4.62\n#> 4 5.19\n#> 5 4.41\npredict(proportional_hazards_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(proportional_hazards_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -0.111\n#> 2 -1.49 \n#> 3 -1.27 \n#> 4 -1.02 \n#> 5 -1.37\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.985\n#> 2 3 0.909\n#> 3 5 0.750\n```\n:::\n\n\n## `glmnet` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_spec <- proportional_hazards(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train)\nproportional_hazards_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = data_obj$x, y = data_obj$y, family = \"cox\", weights = weights, alpha = alpha, lambda = lambda) \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.39700\n#> 2 1 0.82 0.36170\n#> 3 1 1.51 0.32960\n#> 4 1 2.07 0.30030\n#> 5 1 2.54 0.27360\n#> 6 1 2.94 0.24930\n#> 7 2 3.28 0.22720\n#> 8 2 3.95 0.20700\n#> 9 2 4.50 0.18860\n#> 10 2 4.95 0.17180\n#> 11 2 5.33 0.15660\n#> 12 2 5.65 0.14270\n#> 13 2 5.91 0.13000\n#> 14 2 6.13 0.11840\n#> 15 2 6.31 0.10790\n#> 16 2 6.46 0.09833\n#> 17 2 6.58 0.08960\n#> 18 2 6.69 0.08164\n#> 19 2 6.77 0.07439\n#> 20 2 6.85 0.06778\n#> 21 2 6.91 0.06176\n#> 22 2 6.96 0.05627\n#> 23 2 7.00 0.05127\n#> 24 2 7.03 0.04672\n#> 25 2 7.06 0.04257\n#> 26 2 7.08 0.03879\n#> 27 2 7.10 0.03534\n#> 28 2 7.12 0.03220\n#> 29 2 7.13 0.02934\n#> 30 2 7.14 0.02673\n#> 31 2 7.15 0.02436\n#> 32 2 7.16 0.02219\n#> 33 2 7.17 0.02022\n#> 34 2 7.17 0.01843\n#> 35 2 7.18 0.01679\n#> 36 2 7.18 0.01530\n#> 37 2 7.18 0.01394\n#> 38 2 7.19 0.01270\n#> 39 2 7.19 0.01157\n#> 40 2 7.19 0.01054\n#> 41 2 7.19 0.00961\n#> 42 2 7.19 0.00875\n#> The training data has been saved for prediction.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(proportional_hazards_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.80\n#> 2 4.21\n#> 3 4.63\n#> 4 5.18\n#> 5 4.42\npredict(proportional_hazards_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(proportional_hazards_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -0.108\n#> 2 -1.43 \n#> 3 -1.23 \n#> 4 -0.993\n#> 5 -1.33\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit |> \n predict(type = \"survival\", new_data = cns_test, 
eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.984\n#> 2 3 0.906\n#> 3 5 0.743\n```\n:::\n\n\n:::\n\n## Random Forests (`rand_forest()`) \n\n:::{.panel-tabset}\n\n## `aorsf` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(2)\nrand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random survival forest\n#> \n#> Linear combinations: Accelerated Cox regression\n#> N observations: 245\n#> N events: 161\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 12.85\n#> Min observations in leaf: 5\n#> Min events in leaf: 1\n#> OOB stat value: 0.70\n#> OOB stat type: Harrell's C-index\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.93\n#> 2 3.85\n#> 3 4.41\n#> 4 5.43\n#> 5 4.34\npredict(rand_forest_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.999\n#> 2 3 0.873\n#> 3 5 0.627\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(89)\nrand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train)\n```\n:::\n\n\nThe print method has a lot of output: \n\n
\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncapture.output(print(rand_forest_fit))[1:100] |> cat(sep = \"\\n\")\n#> parsnip model object\n#> \n#> $nodes\n#> $nodes[[1]]\n#> [1] root\n#> | [2] V3 <= -0.16072\n#> | | [3] V2 <= 0\n#> | | | [4] V3 <= -1.68226 *\n#> | | | [5] V3 > -1.68226\n#> | | | | [6] V3 <= -0.65952 *\n#> | | | | [7] V3 > -0.65952 *\n#> | | [8] V2 > 0\n#> | | | [9] V3 <= -0.98243 *\n#> | | | [10] V3 > -0.98243\n#> | | | | [11] V3 <= -0.67216 *\n#> | | | | [12] V3 > -0.67216 *\n#> | [13] V3 > -0.16072\n#> | | [14] V2 <= 0\n#> | | | [15] V3 <= 0.95981\n#> | | | | [16] V3 <= 0.3117\n#> | | | | | [17] V3 <= 0.09688 *\n#> | | | | | [18] V3 > 0.09688 *\n#> | | | | [19] V3 > 0.3117\n#> | | | | | [20] V3 <= 0.40845 *\n#> | | | | | [21] V3 > 0.40845 *\n#> | | | [22] V3 > 0.95981 *\n#> | | [23] V2 > 0\n#> | | | [24] V3 <= 0.56098 *\n#> | | | [25] V3 > 0.56098 *\n#> \n#> $nodes[[2]]\n#> [1] root\n#> | [2] V3 <= -0.36618\n#> | | [3] V2 <= 0\n#> | | | [4] V3 <= -1.19881 *\n#> | | | [5] V3 > -1.19881 *\n#> | | [6] V2 > 0\n#> | | | [7] V3 <= -1.18263 *\n#> | | | [8] V3 > -1.18263\n#> | | | | [9] V3 <= -0.55449 *\n#> | | | | [10] V3 > -0.55449 *\n#> | [11] V3 > -0.36618\n#> | | [12] V2 <= 0\n#> | | | [13] V3 <= 0.3117\n#> | | | | [14] V3 <= -0.01851 *\n#> | | | | [15] V3 > -0.01851 *\n#> | | | [16] V3 > 0.3117\n#> | | | | [17] V3 <= 0.85976 *\n#> | | | | [18] V3 > 0.85976 *\n#> | | [19] V2 > 0\n#> | | | [20] V3 <= -0.04369 *\n#> | | | [21] V3 > -0.04369\n#> | | | | [22] V3 <= 0.56098 *\n#> | | | | [23] V3 > 0.56098\n#> | | | | | [24] V3 <= 1.22094 *\n#> | | | | | [25] V3 > 1.22094 *\n#> \n#> $nodes[[3]]\n#> [1] root\n#> | [2] V3 <= -0.46092\n#> | | [3] V2 <= 0\n#> | | | [4] V3 <= -1.65465 *\n#> | | | [5] V3 > -1.65465 *\n#> | | [6] V2 > 0\n#> | | | [7] V3 <= -1.36941 *\n#> | | | [8] V3 > -1.36941\n#> | | | | [9] V3 <= -0.83366 *\n#> | | | | [10] V3 > -0.83366 *\n#> | [11] V3 > -0.46092\n#> | | [12] V2 <= 0\n#> | | | [13] V3 <= -0.01851 *\n#> | | | [14] V3 > -0.01851\n#> | | | | [15] V3 <= 0.22967 *\n#> | | | | [16] V3 > 0.22967\n#> | | | | | [17] V3 <= 0.95368\n#> | | | | | | [18] V3 <= 0.68292 *\n#> | | | | | | [19] V3 > 0.68292 *\n#> | | | | | [20] V3 > 0.95368 *\n#> | | [21] V2 > 0\n#> | | | [22] V3 <= 0.15595 *\n#> | | | [23] V3 > 0.15595\n#> | | | | [24] V3 <= 0.51117 *\n#> | | | | [25] V3 > 0.51117 *\n#> \n#> $nodes[[4]]\n#> [1] root\n#> | [2] V3 <= -0.10421\n#> | | [3] V2 <= 0\n#> | | | [4] V3 <= -0.96818 *\n#> | | | [5] V3 > -0.96818\n#> | | | | [6] V3 <= -0.64682 *\n#> | | | | [7] V3 > -0.64682 *\n#> | | [8] V2 > 0\n#> | | | [9] V3 <= -0.83366 *\n#> | | | [10] V3 > -0.83366 *\n#> | [11] V3 > -0.10421\n#> | | [12] V2 <= 0\n#> | | | [13] V3 <= 0.14347 *\n#> | | | [14] V3 > 0.14347\n#> | | | | [15] V3 <= 1.20345\n```\n:::\n\n
\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.22\n#> 2 4.12\n#> 3 3.87\n#> 4 4.82\n#> 5 3.87\npredict(rand_forest_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 1 \n#> 2 3 0.870\n#> 3 5 0.594\n```\n:::\n\n\n:::\n\n## Parametric Survival Models (`survival_reg()`) \n\n:::{.panel-tabset}\n\n## `survival` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and survival is the default engine so there is no need to set that either.\nsurvival_reg_spec <- survival_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)\nsurvival_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> survival::survreg(formula = event_time ~ ., data = data, model = TRUE)\n#> \n#> Coefficients:\n#> (Intercept) X1 X2 \n#> 2.2351722 -0.4648296 -0.4222887 \n#> \n#> Scale= 0.4728442 \n#> \n#> Loglik(model)= -427.4 Loglik(intercept only)= -481.3\n#> \tChisq= 107.73 on 2 degrees of freedom, p= <2e-16 \n#> n= 245\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(survival_reg_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 8.88\n#> 2 4.67\n#> 3 5.20\n#> 4 5.83\n#> 5 4.97\npredict(survival_reg_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"hazard\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 2.18\n#> 2 1.54\n#> 3 1.65\n#> 4 1.76\n#> 5 1.60\npredict(survival_reg_fit, type = \"quantile\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_quantile\n#> \n#> 1 [7.47]\n#> 2 [3.92]\n#> 3 [4.37]\n#> 4 [4.9]\n#> 5 [4.18]\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.990\n#> 2 3 0.904\n#> 3 5 0.743\n```\n:::\n\n\n## `flexsurv` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model 
specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_spec <- survival_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"flexsurv\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)\nsurvival_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> flexsurv::flexsurvreg(formula = event_time ~ ., data = data, \n#> dist = \"weibull\")\n#> \n#> Estimates: \n#> data mean est L95% U95% se exp(est) L95% \n#> shape NA 2.11486 1.87774 2.38192 0.12832 NA NA\n#> scale NA 9.34809 8.38852 10.41743 0.51658 NA NA\n#> X1 0.46939 -0.46483 -0.61347 -0.31619 0.07584 0.62824 0.54147\n#> X2 -0.00874 -0.42229 -0.50641 -0.33817 0.04292 0.65554 0.60266\n#> U95% \n#> shape NA\n#> scale NA\n#> X1 0.72892\n#> X2 0.71307\n#> \n#> N = 245, Events: 161, Censored: 84\n#> Total time at risk: 1388.951\n#> Log-likelihood = -427.4387, df = 4\n#> AIC = 862.8774\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(survival_reg_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.87\n#> 2 4.13\n#> 3 4.61\n#> 4 5.16\n#> 5 4.40\npredict(survival_reg_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"hazard\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 2.18\n#> 2 1.54\n#> 3 1.65\n#> 4 1.76\n#> 5 1.60\npredict(survival_reg_fit, type = \"quantile\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_quantile\n#> \n#> 1 [7.47]\n#> 2 [3.92]\n#> 3 [4.37]\n#> 4 [4.9]\n#> 5 [4.18]\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.990\n#> 2 3 0.904\n#> 3 5 0.743\n```\n:::\n\n\n## `flexsurvspline` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_spec <- survival_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"flexsurvspline\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)\nsurvival_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> flexsurv::flexsurvspline(formula = event_time ~ ., data = data)\n#> \n#> Estimates: \n#> data mean est L95% U95% se exp(est) L95% \n#> gamma0 NA -4.72712 -5.31772 -4.13651 0.30134 NA NA\n#> gamma1 NA 2.11487 1.86338 2.36637 0.12832 NA NA\n#> X1 0.46939 0.98305 0.65928 1.30683 0.16519 2.67261 1.93340\n#> X2 -0.00874 0.89308 0.70943 1.07673 0.09370 2.44265 2.03283\n#> U95% \n#> gamma0 NA\n#> gamma1 NA\n#> X1 3.69444\n#> X2 
2.93508\n#> \n#> N = 245, Events: 161, Censored: 84\n#> Total time at risk: 1388.951\n#> Log-likelihood = -427.4387, df = 4\n#> AIC = 862.8774\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(survival_reg_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.87\n#> 2 4.13\n#> 3 4.61\n#> 4 5.16\n#> 5 4.40\npredict(survival_reg_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"hazard\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(survival_reg_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -4.62\n#> 2 -3.26\n#> 3 -3.49\n#> 4 -3.73\n#> 5 -3.39\npredict(survival_reg_fit, type = \"quantile\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_quantile\n#> \n#> 1 [7.47]\n#> 2 [3.92]\n#> 3 [4.37]\n#> 4 [4.9]\n#> 5 [4.18]\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsurvival_reg_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.990\n#> 2 3 0.904\n#> 3 5 0.743\n```\n:::\n\n\n:::\n\n# Quantile Regression Models\n\nTo demonstrate quantile regression, let's make a larger version of our regression data: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(938)\nqnt_split <-\n modeldata::concrete |> \n slice_sample(n = 100) |> \n select(strength = compressive_strength, cement, age) |> \n initial_split(prop = 0.95, strata = strength)\nqnt_split\n#> \n#> <92/8/100>\n\nqnt_rec <- \n recipe(strength ~ ., data = training(qnt_split)) |> \n step_normalize(all_numeric_predictors()) |> \n prep()\n\nqnt_train <- bake(qnt_rec, new_data = NULL)\nqnt_test <- bake(qnt_rec, new_data = testing(qnt_split))\n```\n:::\n\n\nWe'll also predict these quantile levels: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nqnt_lvls <- (1:3) / 4\n```\n:::\n\n\n## Linear Regression (`linear_reg()`) \n\n:::{.panel-tabset}\n\n## `quantreg` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n set_engine(\"quantreg\") |> \n set_mode(\"quantile regression\", quantile_levels = qnt_lvls)\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = qnt_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> quantreg::rq(formula = strength ~ ., tau = quantile_levels, data = data)\n#> \n#> Coefficients:\n#> tau= 0.25 tau= 0.50 tau= 0.75\n#> (Intercept) 23.498029 33.265428 42.046031\n#> cement 6.635233 7.955658 8.181235\n#> age 5.566668 9.514832 7.110702\n#> \n#> Degrees of freedom: 92 total; 89 residual\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, type = \"quantile\", new_data = qnt_test)\n#> # A tibble: 8 × 1\n#> .pred_quantile\n#> \n#> 1 [29.2]\n#> 2 [31.5]\n#> 3 [21.4]\n#> 4 [48.3]\n#> 5 [36.6]\n#> 6 [33.8]\n#> 7 [34.6]\n#> 8 [43.8]\n```\n:::\n\n\nEach row of predictions 
has a special vector class containing all of the quantile predictions: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit |> \n predict(type = \"quantile\", new_data = qnt_test)|> \n slice(1) |> \n pluck(\".pred_quantile\") |> \n # Expand the results for each quantile level by converting to a tibble\n as_tibble()\n#> # A tibble: 3 × 3\n#> .pred_quantile .quantile_levels .row\n#> \n#> 1 21.5 0.25 1\n#> 2 29.2 0.5 1\n#> 3 39.5 0.75 1\n```\n:::\n\n\n:::\n\n## Random Forests (`rand_forest()`) \n\n:::{.panel-tabset}\n\n## `grf` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n set_engine(\"grf\") |> \n set_mode(\"quantile regression\", quantile_levels = qnt_lvls)\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(435)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = qnt_train)\nrand_forest_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"quantile\", new_data = qnt_test)\n```\n:::\n\n\nEach row of predictions has a special vector class containing all of the quantile predictions: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit |> \n predict(type = \"quantile\", new_data = qnt_test)|> \n slice(1) |> \n pluck(\".pred_quantile\") |> \n # Expand the results for each quantile level by converting to a tibble\n as_tibble()\n```\n:::\n\n\n:::\n\n\n\n", + "markdown": "---\ntitle: \"Fitting and predicting with parsnip\"\ncategories:\n - model fitting\n - parsnip\n - regression\n - classification\ntype: learn-subsection\nweight: 1\ndescription: | \n Examples that show how to fit and predict with different combinations of model, mode, and engine.\ntoc: true\ntoc-depth: 3\ninclude-after-body: ../../../resources.html\nformat:\n html:\n theme: [\"style.scss\"]\n---\n\n\n\n\n\n\n# Introduction\n\nThis page shows examples of how to *fit* and *predict* with different combinations of model, mode, and engine. As a reminder, in parsnip, \n\n- the **model type** differentiates basic modeling approaches, such as random forests, logistic regression, linear support vector machines, etc.,\n\n- the **mode** denotes in what kind of modeling context it will be used (most commonly, classification or regression), and\n\n- the computational **engine** indicates how the model is fit, such as with a specific R package implementation or even methods outside of R like Keras or Stan.\n\nWe'll break the examples up by their mode. For each model, we'll show different data sets used across the different engines. \n\nTo use code in this article, you will need to install the following packages: agua, baguette, bonsai, censored, discrim, HSAUR3, lme4, multilevelmod, plsmod, poissonreg, prodlim, rules, sparklyr, survival, and tidymodels. There are numerous other \"engine\" packages that are required. If you use a model that is missing one or more installed packages, parsnip will prompt you to install them. There are some packages that require non-standard installation or rely on external dependencies. We'll describe these next. \n\n## External Dependencies\n\nSome models available in parsnip use other computational frameworks for computations. 
There may be some additional downloads for engines using **catboost**, **Spark**, **h2o**, **tensorflow**/**keras**, and **torch**. The sections below give basic installation instructions.\n\n
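Before installing anything, it can help to check which engines are registered for a model type and which packages a particular engine needs. Here is a minimal sketch using parsnip's `show_engines()` and `required_pkgs()` helpers (note that extension packages such as bonsai or agua must be loaded for their engines to be listed):\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(parsnip)\n\n# Engines and modes currently registered for boosted trees\nshow_engines(\"boost_tree\")\n\n# Packages needed to fit a specification with a particular engine\nboost_tree() |> \n set_engine(\"xgboost\") |> \n required_pkgs()\n```\n:::\n\n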
\n\n### catboost\n\ncatboost is a popular boosting framework. Unfortunately, the R package is not available on CRAN. First, go to [https://github.com/catboost/catboost/releases/](https://github.com/catboost/catboost/releases/) and search for \"`[R-package]`\" to find the most recent release. \n\nThe following code can be used to install and test the package (which requires the glue package to be installed): \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(glue)\n\n# Put the current version number in this variable: \nversion <- \"#.##\"\n\n# Note that this URL points to the macOS (darwin) artifact; pick the file name\n# that matches your operating system from the releases page.\ntemplate <- \"https://github.com/catboost/catboost/releases/download/v{version}/catboost-R-darwin-universal2-{version}.tgz\"\n\ntarget_url <- glue::glue(template)\ntarget_dest <- tempfile()\ndownload.file(target_url, target_dest)\n\nif (grepl(\"^mac\", .Platform$pkgType)) {\n options <- \"--no-staged-install\"\n} else {\n options <- character(0)\n}\n\ninst <- glue::glue(\"R CMD INSTALL {options} {target_dest}\")\nsystem(inst)\n```\n:::\n\n\nTo test, fit an example model: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(catboost)\n\ntrain_pool_path <- system.file(\"extdata\", \"adult_train.1000\", package = \"catboost\")\ntest_pool_path <- system.file(\"extdata\", \"adult_test.1000\", package = \"catboost\")\ncd_path <- system.file(\"extdata\", \"adult.cd\", package = \"catboost\")\ntrain_pool <- catboost.load_pool(train_pool_path, column_description = cd_path)\ntest_pool <- catboost.load_pool(test_pool_path, column_description = cd_path)\nfit_params <- list(\n iterations = 100,\n loss_function = 'Logloss',\n ignored_features = c(4, 9),\n border_count = 32,\n depth = 5,\n learning_rate = 0.03,\n l2_leaf_reg = 3.5,\n train_dir = tempdir())\n\n# Actually fit the model to confirm that the installation works:\nmodel <- catboost.train(train_pool, test_pool, params = fit_params)\n```\n:::\n\n\n### Apache Spark\n\nTo use [Apache Spark](https://spark.apache.org/) as an engine, we will first install Spark and then need a connection to a cluster. For this article, we will set up and use a single-node Spark cluster running on a laptop.\n\nTo install, first install sparklyr:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ninstall.packages(\"sparklyr\")\n```\n:::\n\n\nand then install the Spark backend. For example, you might use: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nspark_install(version = \"4.0\")\n```\n:::\n\n\nOnce that is working, you can get ready to fit models using: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nsc <- spark_connect(\"local\")\n#> Warning in sprintf(version$pattern, version$spark, version$hadoop): 2 arguments\n#> not used by format 'spark-4.1.0-preview3-bin-hadoop3'\n```\n:::\n\n\n### h2o \n\nh2o.ai offers a Java-based high-performance computing server for machine learning. This can be run locally or externally. There are general installation instructions at [https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html). There is a package on CRAN, but you can also install directly from [h2o](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html#install-in-r) via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ninstall.packages(\n \"h2o\",\n type = \"source\",\n repos = \"http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R\"\n)\n```\n:::\n\n\nAfter installation is complete, you can start a local server via `h2o::h2o.init()`. \n\nThe tidymodels [agua](https://agua.tidymodels.org/) package contains some helpers and will also need to be installed. 
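It is on CRAN and can be installed in the usual way: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ninstall.packages(\"agua\")\n```\n:::\n\n\n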
You can use its `h2o_start()` function to start a server too:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n#> \n#> Attaching package: 'agua'\n#> The following object is masked from 'package:workflowsets':\n#> \n#> rank_results\nh2o_start()\n#> Warning: JAVA not found, H2O may take minutes trying to connect.\n#> Warning in h2o.clusterInfo(): \n#> Your H2O cluster version is (1 year, 11 months and 5 days) old. There may be a newer version available.\n#> Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html\n```\n:::\n\n\n### Tensorflow and Keras\n\nR's tensorflow and keras3 packages call Python directly. To enable this, you'll first have to install the keras3 R package: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ninstall.packages(\"keras3\")\n```\n:::\n\n\nOnce that is done, use: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nkeras3::install_keras(backend = \"tensorflow\")\n```\n:::\n\n\nThere are other options for installation. See [https://tensorflow.rstudio.com/install/index.html](https://tensorflow.rstudio.com/install/index.html) for more details. If more than one Python environment is available, you can point reticulate at the one that contains tensorflow: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Assumes you are going to use a virtual environment with \"tensorflow\" in its name\npve <- grep(\"tensorflow\", reticulate::virtualenv_list(), value = TRUE)\nreticulate::use_virtualenv(pve)\n```\n:::\n\n\n### Torch\n\nR's torch package is the low-level interface to the framework. Once you have installed it, you will get this message the first time you load the package: \n\n> Additional software needs to be downloaded and installed for torch to work correctly.\n\nChoosing \"Yes\" will do the _one-time_ installation. \n\n
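In a non-interactive session (e.g., on a continuous integration machine), you can trigger the same one-time download yourself. A minimal sketch using torch's installer function: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Downloads and installs the libtorch binaries without the interactive prompt\ntorch::install_torch()\n```\n:::\n\n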
\n\nTo get started, let's load the tidymodels package: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(tidymodels)\ntheme_set(theme_bw() + theme(legend.position = \"top\"))\n```\n:::\n\n\n# Classification Models\n\nTo demonstrate classification, let's make small training and test sets for a binary outcome. We'll center and scale the data since some models require the predictors to be in the same units.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(207)\nbin_split <- \n  modeldata::two_class_dat |> \n  rename(class = Class) |> \n  initial_split(prop = 0.994, strata = class)\nbin_split\n#> \n#> <785/6/791>\n\nbin_rec <- \n recipe(class ~ ., data = training(bin_split)) |> \n step_normalize(all_numeric_predictors()) |> \n prep()\n\nbin_train <- bake(bin_rec, new_data = NULL)\nbin_test <- bake(bin_rec, new_data = testing(bin_split))\n```\n:::\n\n\nFor models that _only_ work for three or more classes, we'll simulate:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(1752)\nmtl_data <-\n sim_multinomial(\n 200,\n ~ -0.5 + 0.6 * abs(A),\n ~ ifelse(A > 0 & B > 0, 1.0 + 0.2 * A / B, - 2),\n ~ A + B - A * B)\n\nmtl_split <- initial_split(mtl_data, prop = 0.967, strata = class)\nmtl_split\n#> \n#> <192/8/200>\n\n# Predictors are in the same units\nmtl_train <- training(mtl_split)\nmtl_test <- testing(mtl_split)\n```\n:::\n\n\nFinally, we have some models that handle hierarchical data, where some rows are statistically correlated with other rows. For these examples, we'll use data from a clinical trial where patients were followed over time. The outcome is binary. The data are in the HSAUR3 package. We'll split these data so that all rows for a given subject land in either the training set or the test set: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(72)\ncls_group_split <- \n HSAUR3::toenail |> \n group_initial_split(group = patientID)\ncls_group_train <- training(cls_group_split)\ncls_group_test <- testing(cls_group_split)\n```\n:::\n\n\nThere are 219 subjects in the training set and 75 in the test set. \n\nIf using the **Apache Spark** engine, we will need to identify the data source and then use it to create the splits. 
For this article, we will copy the `two_class_dat` and the `mtl_data` data sets into the Spark session.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nsc <- spark_connect(\"local\")\n#> Re-using existing Spark connection to local\n\ntbl_two_class <- copy_to(sc, modeldata::two_class_dat)\n\ntbl_bin <- sdf_random_split(tbl_two_class, training = 0.994, test = 1-0.994, seed = 100)\n\ntbl_sim_mtl <- copy_to(sc, mtl_data)\n\ntbl_mtl <- sdf_random_split(tbl_sim_mtl, training = 0.967, test = 1-0.967, seed = 100)\n```\n:::\n\n\n\n## Bagged MARS (`bag_mars()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_spec <- bag_mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(268)\nbag_mars_fit <- bag_mars_spec |> fit(class ~ ., data = bin_train)\n#> \n#> Attaching package: 'plotrix'\n#> The following object is masked from 'package:scales':\n#> \n#> rescale\n#> Registered S3 method overwritten by 'butcher':\n#> method from \n#> as.character.dev_topic generics\nbag_mars_fit\n#> parsnip model object\n#> \n#> Bagged MARS (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 100 0 11\n#> 2 A 40.4 1.60 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mars_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_mars_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.452 0.548 \n#> 2 0.854 0.146 \n#> 3 0.455 0.545 \n#> 4 0.968 0.0316\n#> 5 0.939 0.0610\n#> 6 0.872 0.128\n```\n:::\n\n\n:::\n\n## Bagged Neural Networks (`bag_mlp()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_spec <- bag_mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(318)\nbag_mlp_fit <- bag_mlp_spec |> fit(class ~ ., data = bin_train)\nbag_mlp_fit\n#> parsnip model object\n#> \n#> Bagged nnet (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 A 52.1 2.16 11\n#> 2 B 47.9 2.16 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.439 0.561\n#> 2 0.676 0.324\n#> 3 0.428 0.572\n#> 4 0.727 0.273\n#> 5 0.709 0.291\n#> 6 0.660 0.340\n```\n:::\n\n\n:::\n\n## Bagged Decision Trees (`bag_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(985)\nbag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged CART (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 271. 4.35 11\n#> 2 A 237. 5.58 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0 1 \n#> 2 1 0 \n#> 3 0.0909 0.909 \n#> 4 1 0 \n#> 5 0.727 0.273 \n#> 6 0.909 0.0909\n```\n:::\n\n\n## `C5.0` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(937)\nbag_tree_fit <- bag_tree_spec |> fit(class ~ ., data = bin_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged C5.0 (classification with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 B 100 0 11\n#> 2 A 48.7 7.33 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bag_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.269 0.731\n#> 2 0.863 0.137\n#> 3 0.259 0.741\n#> 4 0.897 0.103\n#> 5 0.897 0.103\n#> 6 0.870 0.130\n```\n:::\n\n\n:::\n\n## Bayesian Additive Regression Trees 
(`bart()`) \n\n:::{.panel-tabset}\n\n## `dbarts` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_spec <- bart() |>\n # We need to set the mode since this engine works with multiple modes\n # and dbarts is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(217)\nbart_fit <- bart_spec |> fit(class ~ ., data = bin_train)\nbart_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> `NULL`()\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bart_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(bart_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.439 0.561\n#> 2 0.734 0.266\n#> 3 0.34 0.66 \n#> 4 0.957 0.043\n#> 5 0.931 0.069\n#> 6 0.782 0.218\npredict(bart_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_lower_Class2 .pred_upper_Class1 .pred_upper_Class2\n#> \n#> 1 0.815 0.00280 0.997 0.185\n#> 2 0.781 0.0223 0.978 0.219\n#> 3 0.558 0.0702 0.930 0.442\n#> 4 0.540 0.105 0.895 0.460\n#> 5 0.239 0.345 0.655 0.761\n#> 6 0.195 0.469 0.531 0.805\npredict(bart_fit, type = \"pred_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_lower_Class2 .pred_upper_Class1 .pred_upper_Class2\n#> \n#> 1 0 0 1 1\n#> 2 0 0 1 1\n#> 3 0 0 1 1\n#> 4 0 0 1 1\n#> 5 0 0 1 1\n#> 6 0 0 1 1\n```\n:::\n\n\n:::\n\n## Boosted Decision Trees (`boost_tree()`) \n\n:::{.panel-tabset}\n\n## `xgboost` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and xgboost is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(738)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> ##### xgb.Booster\n#> raw: 40.4 Kb \n#> call:\n#> xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, \n#> colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, \n#> subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist, \n#> verbose = 0, nthread = 1, objective = \"binary:logistic\")\n#> params (as set within xgb.train):\n#> eta = \"0.3\", max_depth = \"6\", gamma = \"0\", colsample_bytree = \"1\", colsample_bynode = \"1\", min_child_weight = \"1\", subsample = \"1\", nthread = \"1\", objective = \"binary:logistic\", validate_parameters = \"TRUE\"\n#> xgb.attributes:\n#> niter\n#> callbacks:\n#> cb.evaluation.log()\n#> # of features: 2 \n#> niter: 15\n#> nfeatures : 2 \n#> evaluation_log:\n#> iter training_logloss\n#> \n#> 1 0.5546750\n#> 2 0.4719804\n#> --- ---\n#> 14 0.2587640\n#> 15 0.2528938\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", 
new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.244 0.756 \n#> 2 0.770 0.230 \n#> 3 0.307 0.693 \n#> 4 0.944 0.0565\n#> 5 0.821 0.179 \n#> 6 0.938 0.0621\n```\n:::\n\n\n## `C5.0` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(984)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = 15, control = C50::C5.0Control(minCases\n#> = 2, sample = 0))\n#> \n#> Classification Tree\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Number of boosting iterations: 15 requested; 7 used due to early stopping\n#> Average tree size: 3.1 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.307 0.693\n#> 2 0.756 0.244\n#> 3 0.281 0.719\n#> 4 1 0 \n#> 5 1 0 \n#> 6 0.626 0.374\n```\n:::\n\n\n## `catboost` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"catboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(644)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> CatBoost model (1000 trees)\n#> Loss function: Logloss\n#> Fit to 2 feature(s)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.291 0.709 \n#> 2 0.836 0.164 \n#> 3 0.344 0.656 \n#> 4 0.998 0.00245\n#> 5 0.864 0.136 \n#> 6 0.902 0.0983\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We 
need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(186)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: gbm\n#> Model ID: GBM_model_R_1763571327438_5515 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 25377 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 21 55 35.70000\n#> \n#> \n#> H2OBinomialMetrics: gbm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.007948832\n#> RMSE: 0.08915622\n#> LogLoss: 0.05942305\n#> Mean Per-Class Error: 0\n#> AUC: 1\n#> AUCPR: 1\n#> Gini: 1\n#> R^2: 0.9678452\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 434 0 0.000000 =0/434\n#> Class2 0 351 0.000000 =0/351\n#> Totals 434 351 0.000000 =0/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.598690 1.000000 200\n#> 2 max f2 0.598690 1.000000 200\n#> 3 max f0point5 0.598690 1.000000 200\n#> 4 max accuracy 0.598690 1.000000 200\n#> 5 max precision 0.998192 1.000000 0\n#> 6 max recall 0.598690 1.000000 200\n#> 7 max specificity 0.998192 1.000000 0\n#> 8 max absolute_mcc 0.598690 1.000000 200\n#> 9 max min_per_class_accuracy 0.598690 1.000000 200\n#> 10 max mean_per_class_accuracy 0.598690 1.000000 200\n#> 11 max tns 0.998192 434.000000 0\n#> 12 max fns 0.998192 349.000000 0\n#> 13 max fps 0.000831 434.000000 399\n#> 14 max tps 0.598690 351.000000 200\n#> 15 max tnr 0.998192 1.000000 0\n#> 16 max fnr 0.998192 0.994302 0\n#> 17 max fpr 0.000831 1.000000 399\n#> 18 max tpr 0.598690 1.000000 200\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.0496 0.950 \n#> 2 0.905 0.0953 \n#> 3 0.0738 0.926 \n#> 4 0.997 0.00273\n#> 5 0.979 0.0206 \n#> 6 0.878 0.122\n```\n:::\n\n\n## `h2o_gbm` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o_gbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(724)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: gbm\n#> Model ID: 
GBM_model_R_1763571327438_5567 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 25378 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 21 55 35.70000\n#> \n#> \n#> H2OBinomialMetrics: gbm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.007948832\n#> RMSE: 0.08915622\n#> LogLoss: 0.05942305\n#> Mean Per-Class Error: 0\n#> AUC: 1\n#> AUCPR: 1\n#> Gini: 1\n#> R^2: 0.9678452\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 434 0 0.000000 =0/434\n#> Class2 0 351 0.000000 =0/351\n#> Totals 434 351 0.000000 =0/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.598690 1.000000 200\n#> 2 max f2 0.598690 1.000000 200\n#> 3 max f0point5 0.598690 1.000000 200\n#> 4 max accuracy 0.598690 1.000000 200\n#> 5 max precision 0.998192 1.000000 0\n#> 6 max recall 0.598690 1.000000 200\n#> 7 max specificity 0.998192 1.000000 0\n#> 8 max absolute_mcc 0.598690 1.000000 200\n#> 9 max min_per_class_accuracy 0.598690 1.000000 200\n#> 10 max mean_per_class_accuracy 0.598690 1.000000 200\n#> 11 max tns 0.998192 434.000000 0\n#> 12 max fns 0.998192 349.000000 0\n#> 13 max fps 0.000831 434.000000 399\n#> 14 max tps 0.598690 351.000000 200\n#> 15 max tnr 0.998192 1.000000 0\n#> 16 max fnr 0.998192 0.994302 0\n#> 17 max fpr 0.000831 1.000000 399\n#> 18 max tpr 0.598690 1.000000 200\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.0496 0.950 \n#> 2 0.905 0.0953 \n#> 3 0.0738 0.926 \n#> 4 0.997 0.00273\n#> 5 0.979 0.0206 \n#> 6 0.878 0.122\n```\n:::\n\n\n## `lightgbm` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"lightgbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(906)\nboost_tree_fit <- boost_tree_spec |> fit(class ~ ., data = bin_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> LightGBM Model (100 trees)\n#> Objective: binary\n#> Fitted to dataset with 2 columns\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(boost_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.147 0.853 \n#> 2 0.930 0.0699\n#> 3 0.237 0.763 \n#> 4 0.990 0.0101\n#> 5 0.929 0.0714\n#> 6 
\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(285)\nboost_tree_fit <- boost_tree_spec |> fit(Class ~ ., data = tbl_bin$training)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> GBTClassificationModel: uid = gradient_boosted_trees__0d66c197_daaa_47eb_ba06_62029801a638, numTrees=20, numClasses=2, numFeatures=2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(boost_tree_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.307 0.693 \n#> 2 0.292 0.708 \n#> 3 0.856 0.144 \n#> 4 0.192 0.808 \n#> 5 0.332 0.668 \n#> 6 0.952 0.0476\n#> 7 0.0865 0.914\n```\n:::\n\n\n:::\n\n## C5.0 Rules (`C5_rules()`) \n\n:::{.panel-tabset}\n\n## `C5.0` \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and C5.0 is the default engine so there is no need to set that either.\nC5_rules_spec <- C5_rules()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(93)\nC5_rules_fit <- C5_rules_spec |> fit(class ~ ., data = bin_train)\nC5_rules_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = trials, rules = TRUE, control\n#> = C50::C5.0Control(minCases = minCases, seed = sample.int(10^5,\n#> 1), earlyStopping = FALSE))\n#> \n#> Rule-Based Model\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Number of Rules: 4 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(C5_rules_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(C5_rules_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 1 0\n#> 2 1 0\n#> 3 0 1\n#> 4 1 0\n#> 5 1 0\n#> 6 1 0\n```\n:::\n\n\n:::\n\n## Decision Tree (`decision_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> n= 785 \n#> \n#> node), split, n, loss, yval, (yprob)\n#> * denotes terminal node\n#> \n#> 1) root 785 351 Class1 (0.5528662 0.4471338) \n#> 2) B< -0.06526451 399 61 Class1 (0.8471178 0.1528822) *\n#> 3) B>=-0.06526451 386 96 Class2 (0.2487047 0.7512953) \n#> 6) B< 0.7339337 194 72 Class2 (0.3711340 0.6288660) \n#> 12) A>=0.6073948 49 13 Class1 (0.7346939 0.2653061) *\n#> 13) A< 0.6073948 145 36 Class2 (0.2482759 0.7517241) *\n#> 7) B>=0.7339337 192 24 Class2 (0.1250000 0.8750000) *\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.735 0.265\n#> 2 0.847 0.153\n#> 3 0.248 0.752\n#> 4 0.847 0.153\n#> 5 0.847 0.153\n#> 6 0.847 0.153\n```\n:::\n\n\n## `C5.0` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |> \n set_mode(\"classification\") |> \n set_engine(\"C5.0\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> C5.0.default(x = x, y = y, trials = 1, control = C50::C5.0Control(minCases =\n#> 2, sample = 0))\n#> \n#> Classification Tree\n#> Number of samples: 785 \n#> Number of predictors: 2 \n#> \n#> Tree size: 4 \n#> \n#> Non-standard options: attempt to group attributes\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.732 0.268\n#> 2 0.846 0.154\n#> 3 0.236 0.764\n#> 4 0.846 0.154\n#> 5 0.846 0.154\n#> 6 0.846 0.154\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(class ~ ., data = bin_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> class ~ A + B\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] B <= -0.06906\n#> | | [3] B <= -0.50486: Class1 (n = 291, err = 8.2%)\n#> | | [4] B > -0.50486\n#> | | | [5] A <= -0.07243: Class1 (n = 77, err = 45.5%)\n#> | | | [6] A > -0.07243: Class1 (n = 31, err = 6.5%)\n#> | [7] B > -0.06906\n#> | | [8] B <= 
0.72938\n#> | | | [9] A <= 0.60196: Class2 (n = 145, err = 24.8%)\n#> | | | [10] A > 0.60196\n#> | | | | [11] B <= 0.44701: Class1 (n = 23, err = 4.3%)\n#> | | | | [12] B > 0.44701: Class1 (n = 26, err = 46.2%)\n#> | | [13] B > 0.72938: Class2 (n = 192, err = 12.5%)\n#> \n#> Number of inner nodes: 6\n#> Number of terminal nodes: 7\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(decision_tree_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.538 0.462 \n#> 2 0.935 0.0645\n#> 3 0.248 0.752 \n#> 4 0.918 0.0825\n#> 5 0.918 0.0825\n#> 6 0.935 0.0645\n```\n:::\n\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n set_mode(\"classification\") |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(Class ~ ., data = tbl_bin$training)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> DecisionTreeClassificationModel: uid=decision_tree_classifier__1e1401b8_a95f_48a9_8969_2fd48eb813d7, depth=5, numNodes=43, numClasses=2, numFeatures=2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(decision_tree_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? 
x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.260 0.740 \n#> 2 0.260 0.740 \n#> 3 0.860 0.140 \n#> 4 0.260 0.740 \n#> 5 0.260 0.740 \n#> 6 0.923 0.0769\n#> 7 0.0709 0.929\n```\n:::\n\n\n:::\n\n## Flexible Discriminant Analysis (`discrim_flexible()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and earth is the default engine so there is no need to set that either.\ndiscrim_flexible_spec <- discrim_flexible()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_flexible_fit <- discrim_flexible_spec |> fit(class ~ ., data = bin_train)\ndiscrim_flexible_fit\n#> parsnip model object\n#> \n#> Call:\n#> mda::fda(formula = class ~ ., data = data, method = earth::earth)\n#> \n#> Dimension: 1 \n#> \n#> Percent Between-Group Variance Explained:\n#> v1 \n#> 100 \n#> \n#> Training Misclassification Error: 0.1707 ( N = 785 )\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_flexible_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_flexible_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.339 0.661 \n#> 2 0.848 0.152 \n#> 3 0.342 0.658 \n#> 4 0.964 0.0360\n#> 5 0.964 0.0360\n#> 6 0.875 0.125\n```\n:::\n\n\n:::\n\n## Linear Discriminant Analysis (`discrim_linear()`) \n\n:::{.panel-tabset}\n\n## `MASS` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and MASS is the default engine so there is no need to set that either.\ndiscrim_linear_spec <- discrim_linear()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Call:\n#> lda(class ~ ., data = data)\n#> \n#> Prior probabilities of groups:\n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> Group means:\n#> A B\n#> Class1 -0.2982900 -0.5573140\n#> Class2 0.3688258 0.6891006\n#> \n#> Coefficients of linear discriminants:\n#> LD1\n#> A -0.6068479\n#> B 1.7079953\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.369 0.631 \n#> 2 0.868 0.132 \n#> 3 0.541 0.459 \n#> 4 0.984 0.0158\n#> 5 0.928 0.0718\n#> 6 0.854 0.146\n```\n:::\n\n\n## `mda` \n\nThis engine requires the discrim extension package, 
so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_spec <- discrim_linear() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"mda\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Call:\n#> mda::fda(formula = class ~ ., data = data, method = mda::gen.ridge, \n#> keep.fitted = FALSE)\n#> \n#> Dimension: 1 \n#> \n#> Percent Between-Group Variance Explained:\n#> v1 \n#> 100 \n#> \n#> Degrees of Freedom (per dimension): 1.99423 \n#> \n#> Training Misclassification Error: 0.17707 ( N = 785 )\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.368 0.632 \n#> 2 0.867 0.133 \n#> 3 0.542 0.458 \n#> 4 0.984 0.0158\n#> 5 0.928 0.0718\n#> 6 0.853 0.147\n```\n:::\n\n\n## `sda` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_spec <- discrim_linear() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"sda\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> $regularization\n#> lambda lambda.var lambda.freqs \n#> 0.003136201 0.067551534 0.112819609 \n#> \n#> $freqs\n#> Class1 Class2 \n#> 0.5469019 0.4530981 \n#> \n#> $alpha\n#> Class1 Class2 \n#> -0.8934125 -1.2349286 \n#> \n#> $beta\n#> A B\n#> Class1 0.4565325 -1.298858\n#> Class2 -0.5510473 1.567757\n#> attr(,\"class\")\n#> [1] \"shrinkage\"\n#> \n#> attr(,\"class\")\n#> [1] \"sda\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.366 0.634 \n#> 2 0.860 0.140 \n#> 3 0.536 0.464 \n#> 4 0.982 0.0176\n#> 5 0.923 0.0768\n#> 6 0.845 0.155\n```\n:::\n\n\n## `sparsediscrim` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_spec <- discrim_linear() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"sparsediscrim\")\n```\n:::\n\n\nNow we create the 
model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_linear_fit <- discrim_linear_spec |> fit(class ~ ., data = bin_train)\ndiscrim_linear_fit\n#> parsnip model object\n#> \n#> Diagonal LDA\n#> \n#> Sample Size: 785 \n#> Number of Features: 2 \n#> \n#> Classes and Prior Probabilities:\n#> Class1 (55.29%), Class2 (44.71%)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.182 0.818 \n#> 2 0.755 0.245 \n#> 3 0.552 0.448 \n#> 4 0.996 0.00372\n#> 5 0.973 0.0274 \n#> 6 0.629 0.371\n```\n:::\n\n\n:::\n\n## Quadratic Discriminant Analysis (`discrim_quad()`) \n\n:::{.panel-tabset}\n\n## `MASS` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and MASS is the default engine so there is no need to set that either.\ndiscrim_quad_spec <- discrim_quad()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_fit <- discrim_quad_spec |> fit(class ~ ., data = bin_train)\ndiscrim_quad_fit\n#> parsnip model object\n#> \n#> Call:\n#> qda(class ~ ., data = data)\n#> \n#> Prior probabilities of groups:\n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> Group means:\n#> A B\n#> Class1 -0.2982900 -0.5573140\n#> Class2 0.3688258 0.6891006\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_quad_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_quad_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.340 0.660 \n#> 2 0.884 0.116 \n#> 3 0.500 0.500 \n#> 4 0.965 0.0349\n#> 5 0.895 0.105 \n#> 6 0.895 0.105\n```\n:::\n\n\n## `sparsediscrim` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_spec <- discrim_quad() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"sparsediscrim\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_quad_fit <- discrim_quad_spec |> fit(class ~ ., data = bin_train)\ndiscrim_quad_fit\n#> parsnip model object\n#> \n#> Diagonal QDA\n#> \n#> Sample Size: 785 \n#> Number of Features: 2 \n#> \n#> Classes and Prior Probabilities:\n#> Class1 (55.29%), Class2 (44.71%)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_quad_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 
\n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_quad_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.180 0.820 \n#> 2 0.750 0.250 \n#> 3 0.556 0.444 \n#> 4 0.994 0.00634\n#> 5 0.967 0.0328 \n#> 6 0.630 0.370\n```\n:::\n\n\n:::\n\n## Regularized Discriminant Analysis (`discrim_regularized()`) \n\n:::{.panel-tabset}\n\n## `klaR` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and klaR is the default engine so there is no need to set that either.\ndiscrim_regularized_spec <- discrim_regularized()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndiscrim_regularized_fit <- discrim_regularized_spec |> fit(class ~ ., data = bin_train)\ndiscrim_regularized_fit\n#> parsnip model object\n#> \n#> Call: \n#> rda(formula = class ~ ., data = data)\n#> \n#> Regularization parameters: \n#> gamma lambda \n#> 3.348721e-05 3.288193e-04 \n#> \n#> Prior probabilities of groups: \n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> Misclassification rate: \n#> apparent: 17.707 %\n#> cross-validated: 17.566 %\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(discrim_regularized_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(discrim_regularized_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.340 0.660 \n#> 2 0.884 0.116 \n#> 3 0.501 0.499 \n#> 4 0.965 0.0349\n#> 5 0.895 0.105 \n#> 6 0.895 0.105\n```\n:::\n\n\n:::\n\n## Generalized Additive Models (`gen_additive_mod()`) \n\n:::{.panel-tabset}\n\n## `mgcv` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_spec <- gen_additive_mod() |>\n # We need to set the mode since this engine works with multiple modes\n # and mgcv is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_fit <- \n gen_additive_mod_spec |> \n fit(class ~ s(A) + s(B), data = bin_train)\ngen_additive_mod_fit\n#> parsnip model object\n#> \n#> \n#> Family: binomial \n#> Link function: logit \n#> \n#> Formula:\n#> class ~ s(A) + s(B)\n#> \n#> Estimated degrees of freedom:\n#> 2.76 4.22 total = 7.98 \n#> \n#> UBRE score: -0.153537\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(gen_additive_mod_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(gen_additive_mod_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.400 0.600 \n#> 2 0.826 0.174 \n#> 3 0.454 0.546 \n#> 4 0.975 0.0250\n#> 5 0.929 0.0711\n#> 6 0.829 0.171\npredict(gen_additive_mod_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> 
.pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0.304 0.504 0.496 0.696\n#> 2 0.739 0.889 0.111 0.261\n#> 3 0.364 0.546 0.454 0.636\n#> 4 0.846 0.996 0.00358 0.154\n#> 5 0.881 0.958 0.0416 0.119\n#> 6 0.735 0.894 0.106 0.265\n```\n:::\n\n\n:::\n\n## Logistic Regression (`logistic_reg()`) \n\n:::{.panel-tabset}\n\n## `glm` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and glm is the default engine so there is no need to set that either.\nlogistic_reg_spec <- logistic_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = class ~ ., family = stats::binomial, data = data)\n#> \n#> Coefficients:\n#> (Intercept) A B \n#> -0.3563 -1.1250 2.8154 \n#> \n#> Degrees of Freedom: 784 Total (i.e. Null); 782 Residual\n#> Null Deviance:\t 1079 \n#> Residual Deviance: 666.9 \tAIC: 672.9\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.400 0.600 \n#> 2 0.862 0.138 \n#> 3 0.541 0.459 \n#> 4 0.977 0.0234\n#> 5 0.909 0.0905\n#> 6 0.853 0.147\npredict(logistic_reg_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0.339 0.465 0.535 0.661 \n#> 2 0.816 0.897 0.103 0.184 \n#> 3 0.493 0.588 0.412 0.507 \n#> 4 0.960 0.986 0.0137 0.0395\n#> 5 0.875 0.935 0.0647 0.125 \n#> 6 0.800 0.894 0.106 0.200\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(466)\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Logistic regression\n#> \n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> batch size: 707 \n#> validation loss after 1 epoch: 0.283\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.412 0.588 \n#> 2 0.854 0.146 \n#> 3 0.537 0.463 \n#> 4 0.971 0.0294\n#> 5 0.896 0.104 \n#> 6 0.848 0.152\n```\n:::\n\n\n## `gee` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n
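\n\nThe gee, glmer, stan, and stan_glmer engines in this section are fit to grouped longitudinal data, `cls_group_train` and `cls_group_test`, with repeated measurements nested within `patientID`. As a minimal sketch of how such a split could be made, assuming the `toenail` data from the HSAUR3 package and a grouped split so that all of a patient's visits land in the same partition (the seed and proportion here are placeholders):\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# A sketch only: the toenail data have columns outcome, treatment,\n# visit, and patientID, matching the formulas used below.\ndata(\"toenail\", package = \"HSAUR3\")\n\nset.seed(1)\ncls_group_split <- group_initial_split(toenail, group = patientID, prop = 3 / 4)\ncls_group_train <- training(cls_group_split)\ncls_group_test <- testing(cls_group_split)\n```\n:::\n\n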
\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- \n logistic_reg_spec |> \n fit(outcome ~ treatment * visit + id_var(patientID), data = cls_group_train)\n#> Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27\n#> running glm to get initial regression estimate\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> \n#> GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA\n#> gee S-function, version 4.13 modified 98/01/27 (1998) \n#> \n#> Model:\n#> Link: Logit \n#> Variance to Mean Relation: Binomial \n#> Correlation Structure: Independent \n#> \n#> Call:\n#> gee::gee(formula = outcome ~ treatment + visit, id = data$patientID, \n#> data = data, family = binomial)\n#> \n#> Number of observations : 1433 \n#> \n#> Maximum cluster size : 7 \n#> \n#> \n#> Coefficients:\n#> (Intercept) treatmentterbinafine visit \n#> -0.06853546 -0.25700680 -0.35646522 \n#> \n#> Estimated Scale Parameter: 0.9903994\n#> Number of Iterations: 1\n#> \n#> Working Correlation[1:4,1:4]\n#> [,1] [,2] [,3] [,4]\n#> [1,] 1 0 0 0\n#> [2,] 0 1 0 0\n#> [3,] 0 0 1 0\n#> [4,] 0 0 0 1\n#> \n#> \n#> Returned Error Value:\n#> [1] 0\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = cls_group_test)\n#> # A tibble: 475 × 1\n#> .pred_class \n#> \n#> 1 none or mild\n#> 2 none or mild\n#> 3 none or mild\n#> 4 none or mild\n#> 5 none or mild\n#> 6 none or mild\n#> 7 none or mild\n#> 8 none or mild\n#> 9 none or mild\n#> 10 none or mild\n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"prob\", new_data = cls_group_test)\n#> # A tibble: 475 × 2\n#> `.pred_none or mild` `.pred_moderate or severe`\n#> \n#> 1 0.664 0.336 \n#> 2 0.739 0.261 \n#> 3 0.801 0.199 \n#> 4 0.852 0.148 \n#> 5 0.892 0.108 \n#> 6 0.922 0.0784\n#> 7 0.944 0.0562\n#> 8 0.605 0.395 \n#> 9 0.686 0.314 \n#> 10 0.757 0.243 \n#> # ℹ 465 more rows\n```\n:::\n\n\n## `glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- \n logistic_reg_spec |> \n fit(outcome ~ treatment * visit + (1 | patientID), data = cls_group_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Generalized linear mixed model fit by maximum likelihood (Laplace\n#> Approximation) [glmerMod]\n#> Family: binomial ( logit )\n#> Formula: outcome ~ treatment * visit + (1 | patientID)\n#> Data: data\n#> AIC BIC logLik -2*log(L) df.resid \n#> 863.8271 890.1647 -426.9135 853.8271 1428 \n#> Random effects:\n#> Groups Name Std.Dev.\n#> patientID (Intercept) 8.35 \n#> Number of obs: 1433, groups: patientID, 219\n#> Fixed Effects:\n#> (Intercept) treatmentterbinafine \n#> -4.57420 -0.51193 \n#> visit treatmentterbinafine:visit \n#> 
-0.98725 -0.00112\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = cls_group_test)\n#> # A tibble: 475 × 1\n#> .pred_class \n#> \n#> 1 none or mild\n#> 2 none or mild\n#> 3 none or mild\n#> 4 none or mild\n#> 5 none or mild\n#> 6 none or mild\n#> 7 none or mild\n#> 8 none or mild\n#> 9 none or mild\n#> 10 none or mild\n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"prob\", new_data = cls_group_test)\n#> # A tibble: 475 × 2\n#> `.pred_none or mild` `.pred_moderate or severe`\n#> \n#> 1 0.998 0.00230 \n#> 2 0.999 0.000856 \n#> 3 1.000 0.000319 \n#> 4 1.000 0.000119 \n#> 5 1.000 0.0000441 \n#> 6 1.000 0.0000164 \n#> 7 1.000 0.00000612\n#> 8 0.996 0.00383 \n#> 9 0.999 0.00143 \n#> 10 0.999 0.000533 \n#> # ℹ 465 more rows\n```\n:::\n\n\n## `glmnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"binomial\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.308300\n#> 2 1 4.75 0.280900\n#> 3 1 8.73 0.256000\n#> 4 1 12.10 0.233200\n#> 5 1 14.99 0.212500\n#> 6 1 17.46 0.193600\n#> 7 1 19.60 0.176400\n#> 8 1 21.45 0.160800\n#> 9 1 23.05 0.146500\n#> 10 1 24.44 0.133500\n#> 11 1 25.65 0.121600\n#> 12 1 26.70 0.110800\n#> 13 1 27.61 0.101000\n#> 14 1 28.40 0.091990\n#> 15 1 29.08 0.083820\n#> 16 1 29.68 0.076370\n#> 17 1 30.19 0.069590\n#> 18 1 30.63 0.063410\n#> 19 1 31.00 0.057770\n#> 20 1 31.33 0.052640\n#> 21 1 31.61 0.047960\n#> 22 1 31.85 0.043700\n#> 23 1 32.05 0.039820\n#> 24 2 32.62 0.036280\n#> 25 2 33.41 0.033060\n#> 26 2 34.10 0.030120\n#> 27 2 34.68 0.027450\n#> 28 2 35.19 0.025010\n#> 29 2 35.63 0.022790\n#> 30 2 36.01 0.020760\n#> 31 2 36.33 0.018920\n#> 32 2 36.62 0.017240\n#> 33 2 36.86 0.015710\n#> 34 2 37.06 0.014310\n#> 35 2 37.24 0.013040\n#> 36 2 37.39 0.011880\n#> 37 2 37.52 0.010830\n#> 38 2 37.63 0.009864\n#> 39 2 37.72 0.008988\n#> 40 2 37.80 0.008189\n#> 41 2 37.86 0.007462\n#> 42 2 37.92 0.006799\n#> 43 2 37.97 0.006195\n#> 44 2 38.01 0.005644\n#> 45 2 38.04 0.005143\n#> 46 2 38.07 0.004686\n#> 47 2 38.10 0.004270\n#> 48 2 38.12 0.003891\n#> 49 2 38.13 0.003545\n#> 50 2 38.15 0.003230\n#> 51 2 38.16 0.002943\n#> 52 2 38.17 0.002682\n#> 53 2 38.18 0.002443\n#> 54 2 38.18 0.002226\n#> 55 2 38.19 0.002029\n#> 56 2 38.19 0.001848\n#> 57 2 38.20 0.001684\n#> 58 2 38.20 0.001534\n#> 59 2 38.20 0.001398\n#> 60 2 38.21 0.001274\n#> 61 2 38.21 0.001161\n#> 62 2 38.21 0.001058\n#> 63 2 38.21 0.000964\n#> 64 2 38.21 0.000878\n#> 65 2 38.21 0.000800\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.383 0.617 \n#> 2 0.816 0.184 \n#> 3 0.537 0.463 \n#> 4 0.969 0.0313\n#> 5 0.894 0.106 \n#> 6 
0.797 0.203\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: glm\n#> Model ID: GLM_model_R_1763571327438_5619 \n#> GLM Model: summary\n#> family link regularization\n#> 1 binomial logit Elastic Net (alpha = 0.5, lambda = 6.162E-4 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 4\n#> training_frame\n#> 1 object_zkelygexok\n#> \n#> Coefficients: glm coefficients\n#> names coefficients standardized_coefficients\n#> 1 Intercept -0.350788 -0.350788\n#> 2 A -1.084233 -1.084233\n#> 3 B 2.759366 2.759366\n#> \n#> H2OBinomialMetrics: glm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.130451\n#> RMSE: 0.3611799\n#> LogLoss: 0.4248206\n#> Mean Per-Class Error: 0.1722728\n#> AUC: 0.8889644\n#> AUCPR: 0.8520865\n#> Gini: 0.7779288\n#> R^2: 0.4722968\n#> Residual Deviance: 666.9684\n#> AIC: 672.9684\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 350 84 0.193548 =84/434\n#> Class2 53 298 0.150997 =53/351\n#> Totals 403 382 0.174522 =137/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.411045 0.813097 213\n#> 2 max f2 0.229916 0.868991 279\n#> 3 max f0point5 0.565922 0.816135 166\n#> 4 max accuracy 0.503565 0.826752 185\n#> 5 max precision 0.997356 1.000000 0\n#> 6 max recall 0.009705 1.000000 395\n#> 7 max specificity 0.997356 1.000000 0\n#> 8 max absolute_mcc 0.411045 0.652014 213\n#> 9 max min_per_class_accuracy 0.454298 0.822581 201\n#> 10 max mean_per_class_accuracy 0.411045 0.827727 213\n#> 11 max tns 0.997356 434.000000 0\n#> 12 max fns 0.997356 349.000000 0\n#> 13 max fps 0.001723 434.000000 399\n#> 14 max tps 0.009705 351.000000 395\n#> 15 max tnr 0.997356 1.000000 0\n#> 16 max fnr 0.997356 0.994302 0\n#> 17 max fpr 0.001723 1.000000 399\n#> 18 max tpr 0.009705 1.000000 395\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.399 0.601 \n#> 2 0.857 0.143 \n#> 3 0.540 0.460 \n#> 4 0.976 0.0243\n#> 5 0.908 0.0925\n#> 6 0.848 0.152\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(730)\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\n```\n:::\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Model: \"sequential\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense (Dense) (None, 1) 3 \n#> dense_1 (Dense) (None, 2) 4 \n#> ================================================================================\n#> Total params: 7 (28.00 Byte)\n#> Trainable params: 7 (28.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> ________________________________________________________________________________\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> 1/1 - 0s - 91ms/epoch - 91ms/step\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class2\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> 1/1 - 0s - 7ms/epoch - 7ms/step\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.212 0.788 \n#> 2 0.626 0.374 \n#> 3 0.579 0.421 \n#> 4 0.990 0.0103\n#> 5 0.953 0.0467\n#> 6 0.471 0.529\n```\n:::\n\n\n## `LiblineaR` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"LiblineaR\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(class ~ ., data = bin_train)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized logistic regression primal (L2R_LR)\"\n#> \n#> $Type\n#> [1] 0\n#> \n#> $W\n#> A B Bias\n#> [1,] 1.014233 -2.65166 0.3363362\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $ClassNames\n#> [1] Class1 Class2\n#> Levels: Class1 Class2\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(logistic_reg_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.397 0.603 \n#> 2 0.847 0.153 \n#> 3 0.539 0.461 \n#> 4 0.973 0.0267\n#> 5 0.903 0.0974\n#> 6 0.837 0.163\n```\n:::\n\n\n## `stan` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(96)\nlogistic_reg_fit <- \n logistic_reg_spec |> \n fit(outcome ~ treatment * visit, data = cls_group_train)\nlogistic_reg_fit |> print(digits = 3)\n#> parsnip model object\n#> \n#> stan_glm\n#> family: 
binomial [logit]\n#> formula: outcome ~ treatment * visit\n#> observations: 1433\n#> predictors: 4\n#> ------\n#> Median MAD_SD\n#> (Intercept) -0.137 0.187\n#> treatmentterbinafine -0.108 0.264\n#> visit -0.335 0.050\n#> treatmentterbinafine:visit -0.048 0.073\n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = cls_group_test)\n#> # A tibble: 475 × 1\n#> .pred_class \n#> \n#> 1 none or mild\n#> 2 none or mild\n#> 3 none or mild\n#> 4 none or mild\n#> 5 none or mild\n#> 6 none or mild\n#> 7 none or mild\n#> 8 none or mild\n#> 9 none or mild\n#> 10 none or mild\n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"prob\", new_data = cls_group_test)\n#> # A tibble: 475 × 2\n#> `.pred_none or mild` `.pred_moderate or severe`\n#> \n#> 1 0.652 0.348 \n#> 2 0.734 0.266 \n#> 3 0.802 0.198 \n#> 4 0.856 0.144 \n#> 5 0.898 0.102 \n#> 6 0.928 0.0721\n#> 7 0.950 0.0502\n#> 8 0.617 0.383 \n#> 9 0.692 0.308 \n#> 10 0.759 0.241 \n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"conf_int\", new_data = cls_group_test)\n#> # A tibble: 475 × 4\n#> `.pred_lower_none or mild` `.pred_upper_none or mild` .pred_lower_moderate …¹\n#> \n#> 1 0.583 0.715 0.285 \n#> 2 0.689 0.776 0.224 \n#> 3 0.771 0.832 0.168 \n#> 4 0.827 0.883 0.117 \n#> 5 0.868 0.924 0.0761\n#> 6 0.899 0.952 0.0482\n#> 7 0.922 0.970 0.0302\n#> 8 0.547 0.683 0.317 \n#> 9 0.644 0.736 0.264 \n#> 10 0.723 0.791 0.209 \n#> # ℹ 465 more rows\n#> # ℹ abbreviated name: ¹​`.pred_lower_moderate or severe`\n#> # ℹ 1 more variable: `.pred_upper_moderate or severe` \npredict(logistic_reg_fit, type = \"pred_int\", new_data = cls_group_test)\n#> # A tibble: 475 × 4\n#> `.pred_lower_none or mild` `.pred_upper_none or mild` .pred_lower_moderate …¹\n#> \n#> 1 0 1 0\n#> 2 0 1 0\n#> 3 0 1 0\n#> 4 0 1 0\n#> 5 0 1 0\n#> 6 0 1 0\n#> 7 0 1 0\n#> 8 0 1 0\n#> 9 0 1 0\n#> 10 0 1 0\n#> # ℹ 465 more rows\n#> # ℹ abbreviated name: ¹​`.pred_lower_moderate or severe`\n#> # ℹ 1 more variable: `.pred_upper_moderate or severe` \n```\n:::\n\n\n## `stan_glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(484)\nlogistic_reg_fit <- \n logistic_reg_spec |> \n fit(outcome ~ treatment * visit + (1 | patientID), data = cls_group_train)\nlogistic_reg_fit |> print(digits = 3)\n#> parsnip model object\n#> \n#> stan_glmer\n#> family: binomial [logit]\n#> formula: outcome ~ treatment * visit + (1 | patientID)\n#> observations: 1433\n#> ------\n#> Median MAD_SD\n#> (Intercept) -0.628 0.585\n#> treatmentterbinafine -0.686 0.821\n#> visit -0.830 0.105\n#> treatmentterbinafine:visit -0.023 0.143\n#> \n#> Error terms:\n#> Groups Name Std.Dev.\n#> patientID (Intercept) 4.376 \n#> Num. 
levels: patientID 219 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = cls_group_test)\n#> # A tibble: 475 × 1\n#> .pred_class \n#> \n#> 1 none or mild\n#> 2 none or mild\n#> 3 none or mild\n#> 4 none or mild\n#> 5 none or mild\n#> 6 none or mild\n#> 7 none or mild\n#> 8 none or mild\n#> 9 none or mild\n#> 10 none or mild\n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"prob\", new_data = cls_group_test)\n#> # A tibble: 475 × 2\n#> `.pred_none or mild` `.pred_moderate or severe`\n#> \n#> 1 0.671 0.329 \n#> 2 0.730 0.270 \n#> 3 0.796 0.204 \n#> 4 0.847 0.153 \n#> 5 0.882 0.118 \n#> 6 0.909 0.0908\n#> 7 0.934 0.0655\n#> 8 0.613 0.387 \n#> 9 0.681 0.319 \n#> 10 0.744 0.256 \n#> # ℹ 465 more rows\npredict(logistic_reg_fit, type = \"conf_int\", new_data = cls_group_test)\n#> # A tibble: 475 × 4\n#> `.pred_lower_none or mild` `.pred_upper_none or mild` .pred_lower_moderate …¹\n#> \n#> 1 0.00184 1.000 0.0000217 \n#> 2 0.00417 1.000 0.00000942 \n#> 3 0.00971 1.000 0.00000412 \n#> 4 0.0214 1.000 0.00000169 \n#> 5 0.0465 1.000 0.000000706\n#> 6 0.101 1.000 0.000000300\n#> 7 0.203 1.000 0.000000120\n#> 8 0.000923 1.000 0.0000440 \n#> 9 0.00196 1.000 0.0000175 \n#> 10 0.00447 1.000 0.00000724 \n#> # ℹ 465 more rows\n#> # ℹ abbreviated name: ¹​`.pred_lower_moderate or severe`\n#> # ℹ 1 more variable: `.pred_upper_moderate or severe` \npredict(logistic_reg_fit, type = \"pred_int\", new_data = cls_group_test)\n#> # A tibble: 475 × 4\n#> `.pred_lower_none or mild` `.pred_upper_none or mild` .pred_lower_moderate …¹\n#> \n#> 1 0 1 0\n#> 2 0 1 0\n#> 3 0 1 0\n#> 4 0 1 0\n#> 5 0 1 0\n#> 6 0 1 0\n#> 7 0 1 0\n#> 8 0 1 0\n#> 9 0 1 0\n#> 10 0 1 0\n#> # ℹ 465 more rows\n#> # ℹ abbreviated name: ¹​`.pred_lower_moderate or severe`\n#> # ℹ 1 more variable: `.pred_upper_moderate or severe` \n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_spec <- logistic_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlogistic_reg_fit <- logistic_reg_spec |> fit(Class ~ ., data = tbl_bin$training)\nlogistic_reg_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> Coefficients:\n#> (Intercept) A B \n#> -3.731170 -1.214355 3.794186\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(logistic_reg_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(logistic_reg_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? 
x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.130 0.870\n#> 2 0.262 0.738\n#> 3 0.787 0.213\n#> 4 0.279 0.721\n#> 5 0.498 0.502\n#> 6 0.900 0.100\n#> 7 0.161 0.839\n```\n:::\n\n\n:::\n\n## Multivariate Adaptive Regression Splines (`mars()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_spec <- mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_fit <- mars_spec |> fit(class ~ ., data = bin_train)\nmars_fit\n#> parsnip model object\n#> \n#> GLM (family binomial, link logit):\n#> nulldev df dev df devratio AIC iters converged\n#> 1079.45 784 638.975 779 0.408 651 5 1\n#> \n#> Earth selected 6 of 13 terms, and 2 of 2 predictors\n#> Termination condition: Reached nk 21\n#> Importance: B, A\n#> Number of terms at each degree of interaction: 1 5 (additive model)\n#> Earth GCV 0.1342746 RSS 102.4723 GRSq 0.4582121 RSq 0.4719451\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mars_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mars_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.410 0.590 \n#> 2 0.794 0.206 \n#> 3 0.356 0.644 \n#> 4 0.927 0.0729\n#> 5 0.927 0.0729\n#> 6 0.836 0.164\n```\n:::\n\n\n:::\n\n## Neural Networks (`mlp()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(839)\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> a 2-5-1 network with 21 weights\n#> inputs: A B \n#> output(s): class \n#> options were - entropy fitting\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.390 0.610\n#> 2 0.685 0.315\n#> 3 0.433 0.567\n#> 4 0.722 0.278\n#> 5 0.720 0.280\n#> 6 0.684 0.316\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(38)\nmlp_fit <- 
mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> relu activation,\n#> 3 hidden units,\n#> 17 model parameters\n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 707 \n#> learn rate: 0.01 \n#> validation loss after 5 epochs: 0.427\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.387 0.613 \n#> 2 0.854 0.146 \n#> 3 0.540 0.460 \n#> 4 0.941 0.0589\n#> 5 0.882 0.118 \n#> 6 0.842 0.158\n```\n:::\n\n\n## `brulee_two_layer` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"brulee_two_layer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(336)\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> c(relu,relu) activation,\n#> c(3,3) hidden units,\n#> 29 model parameters\n#> 785 samples, 2 features, 2 classes \n#> class weights Class1=1, Class2=1 \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 707 \n#> learn rate: 0.01 \n#> validation loss after 17 epochs: 0.405\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.392 0.608 \n#> 2 0.835 0.165 \n#> 3 0.440 0.560 \n#> 4 0.938 0.0620\n#> 5 0.938 0.0620\n#> 6 0.848 0.152\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(306)\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: deeplearning\n#> Model ID: DeepLearning_model_R_1763571327438_5621 \n#> Status of Neuron Layers: predicting .outcome, 2-class classification, bernoulli distribution, CrossEntropy loss, 1,002 weights/biases, 16.9 KB, 7,850 training samples, mini-batch size 1\n#> layer units type dropout l1 l2 mean_rate rate_rms momentum\n#> 1 1 2 Input 0.00 % NA NA NA NA NA\n#> 2 2 200 Rectifier 0.00 % 
0.000000 0.000000 0.008580 0.016179 0.000000\n#> 3 3 2 Softmax NA 0.000000 0.000000 0.003447 0.000623 0.000000\n#> mean_weight weight_rms mean_bias bias_rms\n#> 1 NA NA NA NA\n#> 2 0.001886 0.102603 0.497570 0.009971\n#> 3 0.003765 0.404187 0.013307 0.017630\n#> \n#> \n#> H2OBinomialMetrics: deeplearning\n#> ** Reported on training data. **\n#> ** Metrics reported on full training frame **\n#> \n#> MSE: 0.1322443\n#> RMSE: 0.3636541\n#> LogLoss: 0.4297999\n#> Mean Per-Class Error: 0.1780102\n#> AUC: 0.8891613\n#> AUCPR: 0.8503254\n#> Gini: 0.7783226\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 324 110 0.253456 =110/434\n#> Class2 36 315 0.102564 =36/351\n#> Totals 360 425 0.185987 =146/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.305430 0.811856 245\n#> 2 max f2 0.235210 0.871535 274\n#> 3 max f0point5 0.456176 0.820152 193\n#> 4 max accuracy 0.456176 0.834395 193\n#> 5 max precision 0.992141 1.000000 0\n#> 6 max recall 0.007261 1.000000 395\n#> 7 max specificity 0.992141 1.000000 0\n#> 8 max absolute_mcc 0.456176 0.664266 193\n#> 9 max min_per_class_accuracy 0.412899 0.823362 210\n#> 10 max mean_per_class_accuracy 0.456176 0.830888 193\n#> 11 max tns 0.992141 434.000000 0\n#> 12 max fns 0.992141 349.000000 0\n#> 13 max fps 0.001274 434.000000 399\n#> 14 max tps 0.007261 351.000000 395\n#> 15 max tnr 0.992141 1.000000 0\n#> 16 max fnr 0.992141 0.994302 0\n#> 17 max fpr 0.001274 1.000000 399\n#> 18 max tpr 0.007261 1.000000 395\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.491 0.509 \n#> 2 0.884 0.116 \n#> 3 0.595 0.405 \n#> 4 0.971 0.0294\n#> 5 0.908 0.0923\n#> 6 0.883 0.117\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(216)\nmlp_fit <- mlp_spec |> fit(class ~ ., data = bin_train)\n```\n:::\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit\n#> parsnip model object\n#> \n#> Model: \"sequential_1\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense_2 (Dense) (None, 5) 15 \n#> dense_3 (Dense) (None, 2) 12 \n#> ================================================================================\n#> Total params: 27 (108.00 Byte)\n#> Trainable params: 27 (108.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> ________________________________________________________________________________\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell 
layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, type = \"class\", new_data = bin_test)\n#> 1/1 - 0s - 42ms/epoch - 42ms/step\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class2\npredict(mlp_fit, type = \"prob\", new_data = bin_test)\n#> 1/1 - 0s - 6ms/epoch - 6ms/step\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.315 0.685\n#> 2 0.579 0.421\n#> 3 0.505 0.495\n#> 4 0.892 0.108\n#> 5 0.867 0.133\n#> 6 0.471 0.529\n```\n:::\n\n\n:::\n\n## Multinom Regression (`multinom_reg()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and nnet is the default engine so there is no need to set that either.\nmultinom_reg_spec <- multinom_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(634)\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Call:\n#> nnet::multinom(formula = class ~ ., data = data, trace = FALSE)\n#> \n#> Coefficients:\n#> (Intercept) A B\n#> two -0.5868435 1.881920 1.379106\n#> three 0.2910810 1.129622 1.292802\n#> \n#> Residual Deviance: 315.8164 \n#> AIC: 327.8164\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.145 0.213 0.641 \n#> 2 0.308 0.178 0.514 \n#> 3 0.350 0.189 0.461 \n#> 4 0.983 0.00123 0.0155\n#> 5 0.956 0.00275 0.0415\n#> 6 0.00318 0.754 0.243 \n#> 7 0.0591 0.414 0.527 \n#> 8 0.522 0.0465 0.431\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(837)\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Multinomial regression\n#> \n#> 192 samples, 2 features, 3 classes \n#> class weights one=1, two=1, three=1 \n#> weight decay: 0.001 \n#> batch size: 173 \n#> validation loss after 1 epoch: 0.953\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 three\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.131 0.190 0.679 \n#> 2 0.303 0.174 0.523 \n#> 3 0.358 0.192 0.449 \n#> 4 0.983 0.00125 0.0154\n#> 5 0.948 0.00275 0.0491\n#> 6 0.00344 0.796 0.200 \n#> 7 0.0611 0.420 0.518 \n#> 8 0.443 0.0390 
0.518\n```\n:::\n\n\n## `glmnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"multinomial\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.219200\n#> 2 1 1.61 0.199700\n#> 3 2 3.90 0.181900\n#> 4 2 6.07 0.165800\n#> 5 2 7.93 0.151100\n#> 6 2 9.52 0.137600\n#> 7 2 10.90 0.125400\n#> 8 2 12.09 0.114300\n#> 9 2 13.13 0.104100\n#> 10 2 14.22 0.094870\n#> 11 2 15.28 0.086440\n#> 12 2 16.20 0.078760\n#> 13 2 16.99 0.071760\n#> 14 2 17.68 0.065390\n#> 15 2 18.28 0.059580\n#> 16 2 18.80 0.054290\n#> 17 2 19.24 0.049460\n#> 18 2 19.63 0.045070\n#> 19 2 19.96 0.041070\n#> 20 2 20.25 0.037420\n#> 21 2 20.49 0.034090\n#> 22 2 20.70 0.031070\n#> 23 2 20.88 0.028310\n#> 24 2 21.04 0.025790\n#> 25 2 21.17 0.023500\n#> 26 2 21.28 0.021410\n#> 27 2 21.38 0.019510\n#> 28 2 21.46 0.017780\n#> 29 2 21.53 0.016200\n#> 30 2 21.58 0.014760\n#> 31 2 21.63 0.013450\n#> 32 2 21.67 0.012250\n#> 33 2 21.71 0.011160\n#> 34 2 21.74 0.010170\n#> 35 2 21.77 0.009269\n#> 36 2 21.79 0.008445\n#> 37 2 21.82 0.007695\n#> 38 2 21.83 0.007011\n#> 39 2 21.85 0.006389\n#> 40 2 21.86 0.005821\n#> 41 2 21.87 0.005304\n#> 42 2 21.88 0.004833\n#> 43 2 21.89 0.004403\n#> 44 2 21.89 0.004012\n#> 45 2 21.90 0.003656\n#> 46 2 21.90 0.003331\n#> 47 2 21.91 0.003035\n#> 48 2 21.91 0.002765\n#> 49 2 21.91 0.002520\n#> 50 2 21.91 0.002296\n#> 51 2 21.92 0.002092\n#> 52 2 21.92 0.001906\n#> 53 2 21.92 0.001737\n#> 54 2 21.92 0.001582\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.163 0.211 0.626 \n#> 2 0.318 0.185 0.496 \n#> 3 0.358 0.198 0.444 \n#> 4 0.976 0.00268 0.0217\n#> 5 0.940 0.00529 0.0544\n#> 6 0.00617 0.699 0.295 \n#> 7 0.0757 0.390 0.534 \n#> 8 0.506 0.0563 0.438\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OMultinomialModel: glm\n#> Model ID: GLM_model_R_1763571327438_5625 \n#> GLM Model: summary\n#> family link regularization\n#> 1 multinomial multinomial Elastic Net (alpha = 0.5, lambda = 4.372E-4 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 
9 6 4\n#> training_frame\n#> 1 object_jbhwnlsrno\n#> \n#> Coefficients: glm multinomial coefficients\n#> names coefs_class_0 coefs_class_1 coefs_class_2 std_coefs_class_0\n#> 1 Intercept -1.119482 -0.831434 -1.706488 -1.083442\n#> 2 A -1.119327 0.002894 0.750746 -1.029113\n#> 3 B -1.208210 0.078752 0.162842 -1.187423\n#> std_coefs_class_1 std_coefs_class_2\n#> 1 -0.819868 -1.830487\n#> 2 0.002661 0.690238\n#> 3 0.077397 0.160041\n#> \n#> H2OMultinomialMetrics: glm\n#> ** Reported on training data. **\n#> \n#> Training Set Metrics: \n#> =====================\n#> \n#> Extract training frame with `h2o.getFrame(\"object_jbhwnlsrno\")`\n#> MSE: (Extract with `h2o.mse`) 0.2982118\n#> RMSE: (Extract with `h2o.rmse`) 0.5460878\n#> Logloss: (Extract with `h2o.logloss`) 0.822443\n#> Mean Per-Class Error: 0.4583896\n#> AUC: (Extract with `h2o.auc`) NaN\n#> AUCPR: (Extract with `h2o.aucpr`) NaN\n#> Null Deviance: (Extract with `h2o.nulldeviance`) 404.5036\n#> Residual Deviance: (Extract with `h2o.residual_deviance`) 315.8181\n#> R^2: (Extract with `h2o.r2`) 0.4682043\n#> AIC: (Extract with `h2o.aic`) NaN\n#> Confusion Matrix: Extract with `h2o.confusionMatrix(,train = TRUE)`)\n#> =========================================================================\n#> Confusion Matrix: Row labels: Actual class; Column labels: Predicted class\n#> one three two Error Rate\n#> one 59 18 1 0.2436 = 19 / 78\n#> three 19 52 5 0.3158 = 24 / 76\n#> two 7 24 7 0.8158 = 31 / 38\n#> Totals 85 94 13 0.3854 = 74 / 192\n#> \n#> Hit Ratio Table: Extract with `h2o.hit_ratio_table(,train = TRUE)`\n#> =======================================================================\n#> Top-3 Hit Ratios: \n#> k hit_ratio\n#> 1 1 0.614583\n#> 2 2 0.890625\n#> 3 3 1.000000\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 three \n#> 3 three \n#> 4 one \n#> 5 one \n#> 6 two \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> # A tibble: 8 × 3\n#> .pred_one .pred_three .pred_two\n#> \n#> 1 0.146 0.641 0.213 \n#> 2 0.308 0.513 0.179 \n#> 3 0.350 0.460 0.190 \n#> 4 0.983 0.0158 0.00128\n#> 5 0.955 0.0422 0.00284\n#> 6 0.00329 0.244 0.752 \n#> 7 0.0599 0.527 0.413 \n#> 8 0.521 0.432 0.0469\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = mtl_train)\n```\n:::\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Model: \"sequential_2\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense_4 (Dense) (None, 1) 3 \n#> dense_5 (Dense) (None, 3) 6 \n#> ================================================================================\n#> Total params: 9 (36.00 Byte)\n#> Trainable params: 9 (36.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> 
________________________________________________________________________________\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = mtl_test)\n#> 1/1 - 0s - 41ms/epoch - 41ms/step\n#> # A tibble: 8 × 1\n#> .pred_class\n#> \n#> 1 three \n#> 2 one \n#> 3 one \n#> 4 one \n#> 5 one \n#> 6 three \n#> 7 three \n#> 8 one\npredict(multinom_reg_fit, type = \"prob\", new_data = mtl_test)\n#> 1/1 - 0s - 6ms/epoch - 6ms/step\n#> # A tibble: 8 × 3\n#> .pred_one .pred_two .pred_three\n#> \n#> 1 0.264 0.342 0.394 \n#> 2 0.338 0.325 0.337 \n#> 3 0.355 0.321 0.325 \n#> 4 0.753 0.155 0.0914\n#> 5 0.684 0.191 0.125 \n#> 6 0.0930 0.338 0.569 \n#> 7 0.205 0.349 0.446 \n#> 8 0.421 0.301 0.279\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_spec <- multinom_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmultinom_reg_fit <- multinom_reg_spec |> fit(class ~ ., data = tbl_mtl$training)\nmultinom_reg_fit\n#> parsnip model object\n#> \n#> Formula: class ~ .\n#> \n#> Coefficients:\n#> (Intercept) A B\n#> one 0.05447853 -1.0569131 -0.9049194\n#> three 0.41207949 0.1458870 0.3959664\n#> two -0.46655802 0.9110261 0.5089529\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(multinom_reg_fit, type = \"class\", new_data = tbl_mtl$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 one \n#> 2 one \n#> 3 three \n#> 4 three \n#> 5 three \n#> 6 three \n#> 7 three\npredict(multinom_reg_fit, type = \"prob\", new_data = tbl_mtl$test)\n#> # Source: SQL [?? x 3]\n#> # Database: spark_connection\n#> pred_one pred_three pred_two\n#> \n#> 1 0.910 0.0814 0.00904\n#> 2 0.724 0.233 0.0427 \n#> 3 0.124 0.620 0.256 \n#> 4 0.0682 0.610 0.322 \n#> 5 0.130 0.571 0.300 \n#> 6 0.115 0.549 0.336 \n#> 7 0.0517 0.524 0.424\n```\n:::\n\n\n:::\n\n## Naive Bayes (`naive_Bayes()`) \n\n:::{.panel-tabset}\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_spec <- naive_Bayes() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\nnaive_Bayes_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: naivebayes\n#> Model ID: NaiveBayes_model_R_1763571327438_5626 \n#> Model Summary: \n#> number_of_response_levels min_apriori_probability max_apriori_probability\n#> 1 2 0.44713 0.55287\n#> \n#> \n#> H2OBinomialMetrics: naivebayes\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.1737113\n#> RMSE: 0.4167869\n#> LogLoss: 0.5473431\n#> Mean Per-Class Error: 0.2356138\n#> AUC: 0.8377152\n#> AUCPR: 0.788608\n#> Gini: 0.6754303\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 274 160 0.368664 =160/434\n#> Class2 36 315 0.102564 =36/351\n#> Totals 310 475 0.249682 =196/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.175296 0.762712 286\n#> 2 max f2 0.133412 0.851119 306\n#> 3 max f0point5 0.497657 0.731343 183\n#> 4 max accuracy 0.281344 0.765605 248\n#> 5 max precision 0.999709 1.000000 0\n#> 6 max recall 0.020983 1.000000 390\n#> 7 max specificity 0.999709 1.000000 0\n#> 8 max absolute_mcc 0.280325 0.541898 249\n#> 9 max min_per_class_accuracy 0.398369 0.758065 215\n#> 10 max mean_per_class_accuracy 0.280325 0.771945 249\n#> 11 max tns 0.999709 434.000000 0\n#> 12 max fns 0.999709 347.000000 0\n#> 13 max fps 0.006522 434.000000 399\n#> 14 max tps 0.020983 351.000000 390\n#> 15 max tnr 0.999709 1.000000 0\n#> 16 max fnr 0.999709 0.988604 0\n#> 17 max fpr 0.006522 1.000000 399\n#> 18 max tpr 0.020983 1.000000 390\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class2\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.181 0.819 \n#> 2 0.750 0.250 \n#> 3 0.556 0.444 \n#> 4 0.994 0.00643\n#> 5 0.967 0.0331 \n#> 6 0.630 0.370\n```\n:::\n\n\n## `klaR` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and klaR is the default engine so there is no need to set that either.\nnaive_Bayes_spec <- naive_Bayes()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.250 0.750 \n#> 2 0.593 0.407 \n#> 3 0.333 0.667 \n#> 4 0.993 0.00658\n#> 5 0.978 0.0223 \n#> 6 0.531 0.469\n```\n:::\n\n\n## `naivebayes` \n\nThis engine requires the discrim extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(discrim)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_spec <- naive_Bayes() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"naivebayes\")\n```\n:::\n\n\nNow we create the model fit 
object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnaive_Bayes_fit <- naive_Bayes_spec |> fit(class ~ ., data = bin_train)\nnaive_Bayes_fit\n#> parsnip model object\n#> \n#> \n#> ================================= Naive Bayes ==================================\n#> \n#> Call:\n#> naive_bayes.default(x = maybe_data_frame(x), y = y, usekernel = TRUE)\n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> Laplace smoothing: 0\n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> A priori probabilities: \n#> \n#> Class1 Class2 \n#> 0.5528662 0.4471338 \n#> \n#> -------------------------------------------------------------------------------- \n#> \n#> Tables: \n#> \n#> -------------------------------------------------------------------------------- \n#> :: A::Class1 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (434 obs.);\tBandwidth 'bw' = 0.2548\n#> \n#> x y \n#> Min. :-2.5638 Min. :0.0002915 \n#> 1st Qu.:-1.2013 1st Qu.:0.0506201 \n#> Median : 0.1612 Median :0.1619843 \n#> Mean : 0.1612 Mean :0.1831190 \n#> 3rd Qu.: 1.5237 3rd Qu.:0.2581668 \n#> Max. : 2.8862 Max. :0.5370762 \n#> -------------------------------------------------------------------------------- \n#> :: A::Class2 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (351 obs.);\tBandwidth 'bw' = 0.2596\n#> \n#> x y \n#> Min. :-2.5428 Min. :4.977e-05 \n#> 1st Qu.:-1.1840 1st Qu.:2.672e-02 \n#> Median : 0.1748 Median :2.239e-01 \n#> Mean : 0.1748 Mean :1.836e-01 \n#> 3rd Qu.: 1.5336 3rd Qu.:2.926e-01 \n#> Max. : 2.8924 Max. :3.740e-01 \n#> \n#> -------------------------------------------------------------------------------- \n#> :: B::Class1 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (434 obs.);\tBandwidth 'bw' = 0.1793\n#> \n#> x y \n#> Min. :-2.4501 Min. :5.747e-05 \n#> 1st Qu.:-1.0894 1st Qu.:1.424e-02 \n#> Median : 0.2713 Median :8.798e-02 \n#> Mean : 0.2713 Mean :1.834e-01 \n#> 3rd Qu.: 1.6320 3rd Qu.:2.758e-01 \n#> Max. : 2.9927 Max. :6.872e-01 \n#> \n#> -------------------------------------------------------------------------------- \n#> :: B::Class2 (KDE)\n#> -------------------------------------------------------------------------------- \n#> \n#> Call:\n#> \tdensity.default(x = x, na.rm = TRUE)\n#> \n#> Data: x (351 obs.);\tBandwidth 'bw' = 0.2309\n#> \n#> x y \n#> Min. :-2.4621 Min. :5.623e-05 \n#> 1st Qu.:-0.8979 1st Qu.:1.489e-02 \n#> Median : 0.6663 Median :7.738e-02 \n#> Mean : 0.6663 Mean :1.595e-01 \n#> 3rd Qu.: 2.2305 3rd Qu.:3.336e-01 \n#> Max. : 3.7948 Max. 
:4.418e-01 \n#> \n#> --------------------------------------------------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(naive_Bayes_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(naive_Bayes_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.249 0.751 \n#> 2 0.593 0.407 \n#> 3 0.332 0.668 \n#> 4 0.993 0.00674\n#> 5 0.978 0.0224 \n#> 6 0.532 0.468\n```\n:::\n\n\n:::\n\n## K-Nearest Neighbors (`nearest_neighbor()`) \n\n:::{.panel-tabset}\n\n## `kknn` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_spec <- nearest_neighbor() |>\n # We need to set the mode since this engine works with multiple modes\n # and kknn is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_fit <- nearest_neighbor_spec |> fit(class ~ ., data = bin_train)\nnearest_neighbor_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> kknn::train.kknn(formula = class ~ ., data = data, ks = min_rows(5, data, 5))\n#> \n#> Type of response variable: nominal\n#> Minimal misclassification: 0.2101911\n#> Best kernel: optimal\n#> Best k: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(nearest_neighbor_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(nearest_neighbor_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.2 0.8 \n#> 2 0.72 0.28\n#> 3 0.32 0.68\n#> 4 1 0 \n#> 5 1 0 \n#> 6 1 0\n```\n:::\n\n\n:::\n\n## Null Model (`null_model()`) \n\n:::{.panel-tabset}\n\n## `parsnip` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_spec <- null_model() |>\n # We need to set the mode since this engine works with multiple modes\n # and parsnip is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_fit <- null_model_spec |> fit(class ~ ., data = bin_train)\nnull_model_fit\n#> parsnip model object\n#> \n#> Null Regression Model\n#> Predicted Value: Class1\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(null_model_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(null_model_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.553 0.447\n#> 2 0.553 0.447\n#> 3 0.553 0.447\n#> 4 0.553 0.447\n#> 5 0.553 0.447\n#> 6 0.553 0.447\n```\n:::\n\n\n:::\n\n## Partial Least Squares (`pls()`) \n\n:::{.panel-tabset}\n\n## `mixOmics` \n\nThis engine requires the plsmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r 
.cell-code}\nlibrary(plsmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_spec <- pls() |>\n # We need to set the mode since this engine works with multiple modes\n # and mixOmics is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_fit <- pls_spec |> fit(class ~ ., data = bin_train)\npls_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> mixOmics::splsda(X = x, Y = y, ncomp = ncomp, keepX = keepX) \n#> \n#> sPLS-DA (regression mode) with 2 sPLS-DA components. \n#> You entered data X of dimensions: 785 2 \n#> You entered data Y with 2 classes. \n#> \n#> Selection of [2] [2] variables on each of the sPLS-DA components on the X data set. \n#> No Y variables can be selected. \n#> \n#> Main numerical outputs: \n#> -------------------- \n#> loading vectors: see object$loadings \n#> variates: see object$variates \n#> variable names: see object$names \n#> \n#> Functions to visualise samples: \n#> -------------------- \n#> plotIndiv, plotArrow, cim \n#> \n#> Functions to visualise variables: \n#> -------------------- \n#> plotVar, plotLoadings, network, cim \n#> \n#> Other functions: \n#> -------------------- \n#> selectVar, tune, perf, auc\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(pls_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(pls_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.462 0.538\n#> 2 0.631 0.369\n#> 3 0.512 0.488\n#> 4 0.765 0.235\n#> 5 0.675 0.325\n#> 6 0.624 0.376\n```\n:::\n\n\n:::\n\n## Random Forests (`rand_forest()`) \n\n:::{.panel-tabset}\n\n## `ranger` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n # and ranger is the default engine so there is no need to set that either.\n set_engine(\"ranger\", keep.inbag = TRUE) |> \n # However, we'll set the engine and use the keep.inbag=TRUE option so that we \n # can produce interval predictions. This is not generally required. 
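\n # Without keep.inbag = TRUE, ranger cannot compute the infinitesimal\n # jackknife standard errors behind the type = \"conf_int\" predictions below.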
\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(841)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Ranger result\n#> \n#> Call:\n#> ranger::ranger(x = maybe_data_frame(x), y = y, keep.inbag = ~TRUE, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) \n#> \n#> Type: Probability estimation \n#> Number of trees: 500 \n#> Sample size: 785 \n#> Number of independent variables: 2 \n#> Mtry: 1 \n#> Target node size: 10 \n#> Variable importance mode: none \n#> Splitrule: gini \n#> OOB prediction error (Brier s.): 0.1477679\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.220 0.780 \n#> 2 0.837 0.163 \n#> 3 0.220 0.780 \n#> 4 0.951 0.0485\n#> 5 0.785 0.215 \n#> 6 0.913 0.0868\npredict(rand_forest_fit, type = \"conf_int\", new_data = bin_test)\n#> Warning in rInfJack(x, inbag.counts): Sample size <=20, no calibration\n#> performed.\n#> Warning in rInfJack(x, inbag.counts): Sample size <=20, no calibration\n#> performed.\n#> Warning in sqrt(infjack): NaNs produced\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_upper_Class1 .pred_lower_Class2 .pred_upper_Class2\n#> \n#> 1 0 0.477 0.523 1 \n#> 2 0.604 1 0 0.396\n#> 3 0.01000 0.431 0.569 0.990\n#> 4 0.846 1 0 0.154\n#> 5 0.469 1 0 0.531\n#> 6 NaN NaN NaN NaN\n```\n:::\n\n\n## `aorsf` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(923)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random classification forest\n#> \n#> Linear combinations: Accelerated Logistic regression\n#> N observations: 785\n#> N classes: 2\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 24.092\n#> Min observations in leaf: 5\n#> OOB stat value: 0.87\n#> OOB stat type: AUC-ROC\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.189 0.811 \n#> 2 0.870 
0.130 \n#> 3 0.346 0.654 \n#> 4 0.979 0.0206\n#> 5 0.940 0.0599\n#> 6 0.899 0.101\n```\n:::\n\n\n## `grf` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"grf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(546)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> GRF forest object of type probability_forest \n#> Number of trees: 2000 \n#> Number of training samples: 785 \n#> Variable importance: \n#> 1 2 \n#> 0.26 0.74\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.381 0.619 \n#> 2 0.779 0.221 \n#> 3 0.367 0.633 \n#> 4 0.981 0.0186\n#> 5 0.883 0.117 \n#> 6 0.797 0.203\npredict(rand_forest_fit, type = \"conf_int\", new_data = bin_test)\n#> # A tibble: 6 × 4\n#> .pred_lower_Class1 .pred_lower_Class2 .pred_upper_Class1 .pred_upper_Class2\n#> \n#> 1 0.567 0.806 0.194 0.433 \n#> 2 0.869 0.311 0.689 0.131 \n#> 3 0.585 0.852 0.148 0.415 \n#> 4 1.02 0.0565 0.944 -0.0193 \n#> 5 0.994 0.228 0.772 0.00601\n#> 6 0.979 0.385 0.615 0.0207\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(493)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: drf\n#> Model ID: DRF_model_R_1763571327438_5628 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 92624 12\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 20 16.60000 126 166 143.08000\n#> \n#> \n#> H2OBinomialMetrics: drf\n#> ** Reported on training data. 
**\n#> ** Metrics reported on Out-Of-Bag training samples **\n#> \n#> MSE: 0.164699\n#> RMSE: 0.4058312\n#> LogLoss: 1.506369\n#> Mean Per-Class Error: 0.200195\n#> AUC: 0.8389854\n#> AUCPR: 0.7931927\n#> Gini: 0.6779708\n#> R^2: 0.3337559\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 327 107 0.246544 =107/434\n#> Class2 54 297 0.153846 =54/351\n#> Totals 381 404 0.205096 =161/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.363636 0.786755 125\n#> 2 max f2 0.238095 0.832435 148\n#> 3 max f0point5 0.421053 0.760108 115\n#> 4 max accuracy 0.363636 0.794904 125\n#> 5 max precision 1.000000 0.890244 0\n#> 6 max recall 0.000000 1.000000 208\n#> 7 max specificity 1.000000 0.979263 0\n#> 8 max absolute_mcc 0.363636 0.596505 125\n#> 9 max min_per_class_accuracy 0.450000 0.785714 110\n#> 10 max mean_per_class_accuracy 0.363636 0.799805 125\n#> 11 max tns 1.000000 425.000000 0\n#> 12 max fns 1.000000 278.000000 0\n#> 13 max fps 0.000000 434.000000 208\n#> 14 max tps 0.000000 351.000000 208\n#> 15 max tnr 1.000000 0.979263 0\n#> 16 max fnr 1.000000 0.792023 0\n#> 17 max fpr 0.000000 1.000000 208\n#> 18 max tpr 0.000000 1.000000 208\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.12 0.88 \n#> 2 0.94 0.0600\n#> 3 0.175 0.825 \n#> 4 1 0 \n#> 5 0.78 0.22 \n#> 6 0.92 0.0800\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(252)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\n```\n:::\n\n\nThe print method has a lot of output: \n\n
\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncapture.output(print(rand_forest_fit))[1:100] |> cat(sep = \"\\n\")\n#> parsnip model object\n#> \n#> $nodes\n#> $nodes[[1]]\n#> [1] root\n#> | [2] V3 <= -0.06906\n#> | | [3] V3 <= -0.61707\n#> | | | [4] V3 <= -0.83314\n#> | | | | [5] V3 <= -0.99048\n#> | | | | | [6] V3 <= -1.29863\n#> | | | | | | [7] V2 <= -0.93951 *\n#> | | | | | | [8] V2 > -0.93951 *\n#> | | | | | [9] V3 > -1.29863\n#> | | | | | | [10] V3 <= -1.21418 *\n#> | | | | | | [11] V3 > -1.21418\n#> | | | | | | | [12] V2 <= -1.13676 *\n#> | | | | | | | [13] V2 > -1.13676\n#> | | | | | | | | [14] V3 <= -1.14373 *\n#> | | | | | | | | [15] V3 > -1.14373 *\n#> | | | | [16] V3 > -0.99048\n#> | | | | | [17] V2 <= -1.10136 *\n#> | | | | | [18] V2 > -1.10136 *\n#> | | | [19] V3 > -0.83314\n#> | | | | [20] V3 <= -0.68684\n#> | | | | | [21] V2 <= -0.62666 *\n#> | | | | | [22] V2 > -0.62666 *\n#> | | | | [23] V3 > -0.68684 *\n#> | | [24] V3 > -0.61707\n#> | | | [25] V2 <= -0.10774\n#> | | | | [26] V3 <= -0.35574\n#> | | | | | [27] V3 <= -0.41085\n#> | | | | | | [28] V3 <= -0.52674 *\n#> | | | | | | [29] V3 > -0.52674 *\n#> | | | | | [30] V3 > -0.41085 *\n#> | | | | [31] V3 > -0.35574\n#> | | | | | [32] V3 <= -0.17325 *\n#> | | | | | [33] V3 > -0.17325 *\n#> | | | [34] V2 > -0.10774\n#> | | | | [35] V3 <= -0.38428 *\n#> | | | | [36] V3 > -0.38428 *\n#> | [37] V3 > -0.06906\n#> | | [38] V3 <= 0.54852\n#> | | | [39] V2 <= 0.53027\n#> | | | | [40] V2 <= 0.21749\n#> | | | | | [41] V3 <= 0.09376 *\n#> | | | | | [42] V3 > 0.09376\n#> | | | | | | [43] V3 <= 0.28687\n#> | | | | | | | [44] V3 <= 0.17513 *\n#> | | | | | | | [45] V3 > 0.17513 *\n#> | | | | | | [46] V3 > 0.28687 *\n#> | | | | [47] V2 > 0.21749 *\n#> | | | [48] V2 > 0.53027 *\n#> | | [49] V3 > 0.54852\n#> | | | [50] V2 <= 1.99786\n#> | | | | [51] V3 <= 1.02092\n#> | | | | | [52] V2 <= 0.5469\n#> | | | | | | [53] V3 <= 0.83487\n#> | | | | | | | [54] V2 <= 0.36626 *\n#> | | | | | | | [55] V2 > 0.36626 *\n#> | | | | | | [56] V3 > 0.83487 *\n#> | | | | | [57] V2 > 0.5469\n#> | | | | | | [58] V3 <= 0.62673 *\n#> | | | | | | [59] V3 > 0.62673 *\n#> | | | | [60] V3 > 1.02092\n#> | | | | | [61] V3 <= 1.29539\n#> | | | | | | [62] V3 <= 1.2241 *\n#> | | | | | | [63] V3 > 1.2241 *\n#> | | | | | [64] V3 > 1.29539\n#> | | | | | | [65] V3 <= 2.01809 *\n#> | | | | | | [66] V3 > 2.01809 *\n#> | | | [67] V2 > 1.99786 *\n#> \n#> $nodes[[2]]\n#> [1] root\n#> | [2] V3 <= -0.00054\n#> | | [3] V3 <= -0.58754\n#> | | | [4] V3 <= -0.83314\n#> | | | | [5] V2 <= -1.15852\n#> | | | | | [6] V2 <= -1.76192 *\n#> | | | | | [7] V2 > -1.76192 *\n#> | | | | [8] V2 > -1.15852\n#> | | | | | [9] V3 <= -1.21418\n#> | | | | | | [10] V3 <= -1.32176 *\n#> | | | | | | [11] V3 > -1.32176 *\n#> | | | | | [12] V3 > -1.21418\n#> | | | | | | [13] V2 <= -1.08164 *\n#> | | | | | | [14] V2 > -1.08164\n#> | | | | | | | [15] V3 <= -1.14373 *\n#> | | | | | | | [16] V3 > -1.14373 *\n#> | | | [17] V3 > -0.83314\n#> | | | | [18] V2 <= -0.51524\n#> | | | | | [19] V3 <= -0.66041\n#> | | | | | | [20] V3 <= -0.70885 *\n#> | | | | | | [21] V3 > -0.70885 *\n#> | | | | | [22] V3 > -0.66041 *\n#> | | | | [23] V2 > -0.51524 *\n#> | | [24] V3 > -0.58754\n#> | | | [25] V2 <= -0.07243\n#> | | | | [26] V3 <= -0.31247\n#> | | | | | [27] V2 <= -0.98014 *\n```\n:::\n\n
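\nRather than paging through every tree, we can also pull the underlying partykit object out of the parsnip fit and inspect it directly. The sketch below assumes the `rand_forest_fit` object from above is still in memory; `extract_fit_engine()` returns the engine's native model object, and each element of its `$nodes` list (visible in the print above) is a single tree:\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Pull the native partykit cforest object out of the parsnip wrapper\ncforest_obj <- extract_fit_engine(rand_forest_fit)\n\n# Each element of $nodes is one tree in the ensemble\nlength(cforest_obj$nodes)\n\n# Print a single tree rather than the whole forest\ncforest_obj$nodes[[1]]\n```\n:::\n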
\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.375 0.625 \n#> 2 0.813 0.187 \n#> 3 0.284 0.716 \n#> 4 0.963 0.0365\n#> 5 0.892 0.108 \n#> 6 0.922 0.0785\n```\n:::\n\n\n## `randomForest` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"randomForest\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(726)\nrand_forest_fit <- rand_forest_spec |> fit(class ~ ., data = bin_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> randomForest(x = maybe_data_frame(x), y = y) \n#> Type of random forest: classification\n#> Number of trees: 500\n#> No. of variables tried at each split: 1\n#> \n#> OOB estimate of error rate: 21.53%\n#> Confusion matrix:\n#> Class1 Class2 class.error\n#> Class1 349 85 0.1958525\n#> Class2 84 267 0.2393162\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rand_forest_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.162 0.838\n#> 2 0.848 0.152\n#> 3 0.108 0.892\n#> 4 1 0 \n#> 5 0.74 0.26 \n#> 6 0.91 0.09\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n set_mode(\"classification\") |>\n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(693)\nrand_forest_fit <- rand_forest_spec |> fit(Class ~ ., data = tbl_bin$training)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Formula: Class ~ .\n#> \n#> RandomForestClassificationModel: uid=random_forest__85283141_ea71_4e8a_8447_1075f9539067, numTrees=20, numClasses=2, numFeatures=2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"class\", new_data = tbl_bin$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred_class\n#> \n#> 1 Class2 \n#> 2 Class2 \n#> 3 Class1 \n#> 4 Class2 \n#> 5 Class2 \n#> 6 Class1 \n#> 7 Class2\npredict(rand_forest_fit, type = \"prob\", new_data = tbl_bin$test)\n#> # Source: SQL [?? 
x 2]\n#> # Database: spark_connection\n#> pred_Class1 pred_Class2\n#> \n#> 1 0.315 0.685 \n#> 2 0.241 0.759 \n#> 3 0.732 0.268 \n#> 4 0.235 0.765 \n#> 5 0.259 0.741 \n#> 6 0.933 0.0674\n#> 7 0.0968 0.903\n```\n:::\n\n\n:::\n\n## Rule Fit (`rule_fit()`) \n\n:::{.panel-tabset}\n\n## `xrf` \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n # and xrf is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(95)\nrule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> An eXtreme RuleFit model of 358 rules.\n#> \n#> Original Formula:\n#> \n#> class ~ A + B\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rule_fit_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.419 0.581\n#> 2 0.651 0.349\n#> 3 0.506 0.494\n#> 4 0.891 0.109\n#> 5 0.805 0.195\n#> 6 0.616 0.384\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(536)\nrule_fit_fit <- rule_fit_spec |> fit(class ~ ., data = bin_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2OBinomialModel: rulefit\n#> Model ID: RuleFit_model_R_1763571327438_5679 \n#> Rulefit Model Summary: \n#> family link regularization number_of_predictors_total\n#> 1 binomial logit Lasso (lambda = 0.03081 ) 2329\n#> number_of_active_predictors number_of_iterations rule_ensemble_size\n#> 1 3 4 2327\n#> number_of_trees number_of_internal_trees min_depth max_depth mean_depth\n#> 1 150 150 0 5 4.00000\n#> min_leaves max_leaves mean_leaves\n#> 1 0 29 15.51333\n#> \n#> \n#> H2OBinomialMetrics: rulefit\n#> ** Reported on training data. 
**\n#> \n#> MSE: 0.1411478\n#> RMSE: 0.3756964\n#> LogLoss: 0.4472749\n#> Mean Per-Class Error: 0.1850933\n#> AUC: 0.8779327\n#> AUCPR: 0.8372496\n#> Gini: 0.7558654\n#> \n#> Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:\n#> Class1 Class2 Error Rate\n#> Class1 350 84 0.193548 =84/434\n#> Class2 62 289 0.176638 =62/351\n#> Totals 412 373 0.185987 =146/785\n#> \n#> Maximum Metrics: Maximum metrics at their respective thresholds\n#> metric threshold value idx\n#> 1 max f1 0.499611 0.798343 199\n#> 2 max f2 0.226927 0.861169 285\n#> 3 max f0point5 0.626200 0.803634 144\n#> 4 max accuracy 0.523044 0.815287 191\n#> 5 max precision 0.980574 1.000000 0\n#> 6 max recall 0.052101 1.000000 394\n#> 7 max specificity 0.980574 1.000000 0\n#> 8 max absolute_mcc 0.523044 0.627478 191\n#> 9 max min_per_class_accuracy 0.512020 0.813364 196\n#> 10 max mean_per_class_accuracy 0.499611 0.814907 199\n#> 11 max tns 0.980574 434.000000 0\n#> 12 max fns 0.980574 350.000000 0\n#> 13 max fps 0.043433 434.000000 399\n#> 14 max tps 0.052101 351.000000 394\n#> 15 max tnr 0.980574 1.000000 0\n#> 16 max fnr 0.980574 0.997151 0\n#> 17 max fpr 0.043433 1.000000 399\n#> 18 max tpr 0.052101 1.000000 394\n#> \n#> Gains/Lift Table: Extract with `h2o.gainsLift(, )` or `h2o.gainsLift(, valid=, xval=)`\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(rule_fit_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.393 0.607 \n#> 2 0.739 0.261 \n#> 3 0.455 0.545 \n#> 4 0.956 0.0442\n#> 5 0.882 0.118 \n#> 6 0.693 0.307\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Linear Kernel) (`svm_linear()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"classification\") |>\n set_engine(\"kernlab\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(class ~ ., data = bin_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Linear (vanilla) kernel function. 
\n#> \n#> Number of Support Vectors : 357 \n#> \n#> Objective Function Value : -353.0043 \n#> Training error : 0.17707 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_linear_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.404 0.596 \n#> 2 0.858 0.142 \n#> 3 0.541 0.459 \n#> 4 0.975 0.0254\n#> 5 0.905 0.0950\n#> 6 0.850 0.150\n```\n:::\n\n\n## `LiblineaR` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n # and LiblineaR is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(class ~ ., data = bin_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized L2-loss support vector classification dual (L2R_L2LOSS_SVC_DUAL)\"\n#> \n#> $Type\n#> [1] 1\n#> \n#> $W\n#> A B Bias\n#> [1,] 0.3641766 -0.9648797 0.1182725\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $ClassNames\n#> [1] Class1 Class2\n#> Levels: Class1 Class2\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_spec <- svm_poly() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_fit <- svm_poly_spec |> fit(class ~ ., data = bin_train)\n#> Setting default kernel parameters\nsvm_poly_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Polynomial kernel function. 
\n#> Hyperparameters : degree = 1 scale = 1 offset = 1 \n#> \n#> Number of Support Vectors : 357 \n#> \n#> Objective Function Value : -353.0043 \n#> Training error : 0.17707 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_poly_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class2 \n#> 2 Class1 \n#> 3 Class1 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_poly_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.399 0.601 \n#> 2 0.861 0.139 \n#> 3 0.538 0.462 \n#> 4 0.976 0.0237\n#> 5 0.908 0.0917\n#> 6 0.853 0.147\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"classification\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(class ~ ., data = bin_train)\nsvm_rbf_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: C-svc (classification) \n#> parameter : cost C = 1 \n#> \n#> Gaussian Radial Basis kernel function. \n#> Hyperparameter : sigma = 1.9107071282545 \n#> \n#> Number of Support Vectors : 335 \n#> \n#> Objective Function Value : -296.4885 \n#> Training error : 0.173248 \n#> Probability model included.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, type = \"class\", new_data = bin_test)\n#> # A tibble: 6 × 1\n#> .pred_class\n#> \n#> 1 Class1 \n#> 2 Class1 \n#> 3 Class2 \n#> 4 Class1 \n#> 5 Class1 \n#> 6 Class1\npredict(svm_rbf_fit, type = \"prob\", new_data = bin_test)\n#> # A tibble: 6 × 2\n#> .pred_Class1 .pred_Class2\n#> \n#> 1 0.547 0.453\n#> 2 0.871 0.129\n#> 3 0.260 0.740\n#> 4 0.861 0.139\n#> 5 0.863 0.137\n#> 6 0.863 0.137\n```\n:::\n\n\n:::\n\n# Regression Models\n\nTo demonstrate regression, we'll subset some data, make a training/test split, and standardize the predictors: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(938)\nreg_split <-\n modeldata::concrete |> \n slice_sample(n = 100) |> \n select(strength = compressive_strength, cement, age) |> \n initial_split(prop = 0.95, strata = strength)\nreg_split\n#> <Training/Testing/Total>\n#> <92/8/100>\n\nreg_rec <- \n recipe(strength ~ ., data = training(reg_split)) |> \n step_normalize(all_numeric_predictors()) |> \n prep()\n\nreg_train <- bake(reg_rec, new_data = NULL)\nreg_test <- bake(reg_rec, new_data = testing(reg_split))\n```\n:::\n\n\nWe also have models that are specifically designed for integer count outcomes. 
The data for these are:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(207)\ncount_split <-\n attrition |>\n select(num_years = TotalWorkingYears, age = Age, income = MonthlyIncome) |>\n initial_split(prop = 0.994)\ncount_split\n#> <Training/Testing/Total>\n#> <1461/9/1470>\n\ncount_rec <-\n recipe(num_years ~ ., data = training(count_split)) |>\n step_normalize(all_numeric_predictors()) |>\n prep()\n\ncount_train <- bake(count_rec, new_data = NULL)\ncount_test <- bake(count_rec, new_data = testing(count_split))\n```\n:::\n\n\nFinally, we have some models that handle hierarchical data, where some rows are statistically correlated with other rows. For these examples, we'll use a data set that models body weights as a function of time for several \"subjects\" (rats, actually). We'll split these data so that all rows for a specific subject are in either the training set or the test set: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nset.seed(224)\nreg_group_split <- \n nlme::BodyWeight |> \n # Get rid of some extra attributes added by the nlme package\n as_tibble() |> \n # Convert to an _unordered_ factor\n mutate(Rat = factor(as.character(Rat))) |> \n group_initial_split(group = Rat)\nreg_group_train <- training(reg_group_split)\nreg_group_test <- testing(reg_group_split)\n```\n:::\n\n\nThere are 12 subjects in the training set and 4 in the test set. \n\nIf using the **Apache Spark** engine, we will need to identify the data source and then use it to create the splits. For this article, we will copy the `concrete` data set into the Spark session.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(sparklyr)\nsc <- spark_connect(\"local\")\n#> Re-using existing Spark connection to local\n\ntbl_concrete <- copy_to(sc, modeldata::concrete)\n\ntbl_reg <- sdf_random_split(tbl_concrete, training = 0.95, test = 0.05, seed = 100)\n```\n:::\n\n\n## Bagged MARS (`bag_mars()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mars_spec <- bag_mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(147)\nbag_mars_fit <- bag_mars_spec |> fit(strength ~ ., data = reg_train)\nbag_mars_fit\n#> parsnip model object\n#> \n#> Bagged MARS (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 age 93.1 4.61 11\n#> 2 cement 69.4 4.95 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mars_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 22.4\n#> 2 41.9\n#> 3 26.7\n#> 4 56.6\n#> 5 36.4\n#> 6 36.2\n#> 7 37.8\n#> 8 37.7\n```\n:::\n\n\n:::\n\n## Bagged Neural Networks (`bag_mlp()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model 
specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_mlp_spec <- bag_mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(324)\nbag_mlp_fit <- bag_mlp_spec |> fit(strength ~ ., data = reg_train)\nbag_mlp_fit\n#> parsnip model object\n#> \n#> Bagged nnet (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 age 55.9 2.96 11\n#> 2 cement 44.1 2.96 11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 19.9\n#> 2 39.1\n#> 3 28.3\n#> 4 68.8\n#> 5 44.1\n#> 6 36.3\n#> 7 40.8\n#> 8 37.0\n```\n:::\n\n\n:::\n\n## Bagged Decision Trees (`bag_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nThis engine requires the baguette extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(baguette)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(230)\nbag_tree_fit <- bag_tree_spec |> fit(strength ~ ., data = reg_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> Bagged CART (regression with 11 members)\n#> \n#> Variable importance scores include:\n#> \n#> # A tibble: 2 × 4\n#> term value std.error used\n#> \n#> 1 cement 16621. 1392. 11\n#> 2 age 12264. 710. 
11\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.0\n#> 2 33.0\n#> 3 29.6\n#> 4 54.2\n#> 5 36.2\n#> 6 39.4\n#> 7 40.7\n#> 8 46.5\n```\n:::\n\n\n:::\n\n## Bayesian Additive Regression Trees (`bart()`) \n\n:::{.panel-tabset}\n\n## `dbarts` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbart_spec <- bart() |>\n # We need to set the mode since this engine works with multiple modes\n # and dbarts is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(134)\nbart_fit <- bart_spec |> fit(strength ~ ., data = reg_train)\nbart_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> `NULL`()\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bart_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.2\n#> 2 40.9\n#> 3 26.0\n#> 4 52.0\n#> 5 36.5\n#> 6 36.7\n#> 7 39.0\n#> 8 37.8\npredict(bart_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 17.0 32.4\n#> 2 33.0 48.9\n#> 3 20.1 31.5\n#> 4 42.0 62.5\n#> 5 28.5 44.5\n#> 6 30.3 42.3\n#> 7 33.1 45.3\n#> 8 26.3 48.8\npredict(bart_fit, type = \"pred_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 5.00 41.8\n#> 2 19.9 60.5\n#> 3 7.37 44.3\n#> 4 32.4 72.1\n#> 5 15.7 56.4\n#> 6 18.9 56.8\n#> 7 21.2 57.2\n#> 8 17.2 58.5\n```\n:::\n\n\n:::\n\n## Boosted Decision Trees (`boost_tree()`) \n\n:::{.panel-tabset}\n\n## `xgboost` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and xgboost is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(748)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> ##### xgb.Booster\n#> raw: 35 Kb \n#> call:\n#> xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, \n#> colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, \n#> subsample = 1), data = x$data, nrounds = 15, watchlist = x$watchlist, \n#> verbose = 0, nthread = 1, objective = \"reg:squarederror\")\n#> params (as set within xgb.train):\n#> eta = \"0.3\", max_depth = \"6\", gamma = \"0\", colsample_bytree = \"1\", colsample_bynode = \"1\", min_child_weight = \"1\", subsample = \"1\", nthread = \"1\", objective = \"reg:squarederror\", validate_parameters = \"TRUE\"\n#> xgb.attributes:\n#> niter\n#> callbacks:\n#> cb.evaluation.log()\n#> # of features: 2 \n#> niter: 15\n#> nfeatures : 2 \n#> evaluation_log:\n#> iter training_rmse\n#> \n#> 1 27.511751\n#> 2 20.726236\n#> --- ---\n#> 14 2.774394\n#> 15 2.632224\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 
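1\n#> .pred\n#> \n#> 1 22.3\n#> 2 32.9\n#> 3 26.7\n#> 4 57.6\n#> 5 34.9\n#> 6 33.8\n#> 7 42.6\n#> 8 26.3\n```\n:::\n\n\nIf the underlying engine object is needed, for example to get xgboost's own variable importance measures, it can be pulled out of the parsnip fit. A minimal sketch:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Extract the fitted xgb.Booster from the parsnip object and query it\nbooster <- extract_fit_engine(boost_tree_fit)\nxgboost::xgb.importance(model = booster)\n```\n:::\n\n\n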
## `catboost` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"catboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(557)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> CatBoost model (1000 trees)\n#> Loss function: RMSE\n#> Fit to 2 feature(s)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 26.6\n#> 2 33.9\n#> 3 27.8\n#> 4 60.6\n#> 5 34.7\n#> 6 36.3\n#> 7 43.6\n#> 8 29.3\n```\n:::\n\n\n
## `h2o_gbm` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o_gbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(90)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: gbm\n#> Model ID: GBM_model_R_1763571327438_5836 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 20472 6\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 6 6.00000 14 43 27.92000\n#> \n#> \n#> H2ORegressionMetrics: gbm\n#> ** Reported on training data. **\n#> \n#> MSE: 0.001563879\n#> RMSE: 0.03954591\n#> MAE: 0.02903684\n#> RMSLE: 0.001771464\n#> Mean Residual Deviance : 0.001563879\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.7\n#> 2 32.2\n#> 3 26.9\n#> 4 63.2\n#> 5 34.9\n#> 6 39.0\n#> 7 40.0\n#> 8 32.9\n```\n:::\n\n\n## `lightgbm` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"lightgbm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(570)\nboost_tree_fit <- boost_tree_spec |> fit(strength ~ ., data = reg_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> LightGBM Model (100 trees)\n#> Objective: regression\n#> Fitted to dataset with 2 columns\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 20.6\n#> 2 42.5\n#> 3 27.0\n#> 4 49.2\n#> 5 43.7\n#> 6 38.3\n#> 7 41.1\n#> 8 36.9\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |>\n set_mode(\"regression\") |>\n set_engine(\"spark\")\n```\n:::
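\n\n\nBecause this engine fits on Spark table references rather than in-memory data frames, it can be worth sanity-checking the split sizes first. As a minimal sketch with sparklyr (the counts depend on the sampling seed used above):\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Row counts for the Spark training and test tables\nsparklyr::sdf_nrow(tbl_reg$training)\nsparklyr::sdf_nrow(tbl_reg$test)\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for 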
reproducibility: \nset.seed(620)\nboost_tree_fit <- boost_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nboost_tree_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> GBTRegressionModel: uid=gradient_boosted_trees__96624f1b_2bdd_4b67_a54f_99afd086dcfa, numTrees=20, numFeatures=8\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 20.8 \n#> 2 28.1 \n#> 3 15.5 \n#> 4 22.4 \n#> 5 9.37\n#> 6 40.1 \n#> 7 14.2 \n#> 8 32.1 \n#> 9 37.4 \n#> 10 49.5 \n#> # ℹ more rows\n```\n:::\n\n\n:::\n\n## Cubist Rules (`cubist_rules()`) \n\n:::{.panel-tabset}\n\n## `Cubist` \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and Cubist is the default engine so there is no need to set that either.\ncubist_rules_spec <- cubist_rules()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(188)\ncubist_rules_fit <- cubist_rules_spec |> fit(strength ~ ., data = reg_train)\ncubist_rules_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> cubist.default(x = x, y = y, committees = 1)\n#> \n#> Number of samples: 92 \n#> Number of predictors: 2 \n#> \n#> Number of committees: 1 \n#> Number of rules: 2\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(cubist_rules_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.2\n#> 2 46.3\n#> 3 23.6\n#> 4 54.4\n#> 5 32.7\n#> 6 37.8\n#> 7 38.8\n#> 8 38.6\n```\n:::\n\n\n:::\n\n## Decision Tree (`decision_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> n= 92 \n#> \n#> node), split, n, deviance, yval\n#> * denotes terminal node\n#> \n#> 1) root 92 26564.7400 33.57728 \n#> 2) cement< 0.7861846 69 12009.9000 27.81493 \n#> 4) age< -0.5419541 23 964.6417 14.42348 \n#> 8) cement< -0.3695209 12 292.7811 11.14083 *\n#> 9) cement>=-0.3695209 11 401.4871 18.00455 *\n#> 5) age>=-0.5419541 46 4858.3440 34.51065 \n#> 10) age< 0.008934354 32 2208.3040 31.16781 \n#> 20) cement< 0.311975 24 1450.6200 28.75583 *\n#> 21) cement>=0.311975 8 199.1900 38.40375 *\n#> 11) age>=0.008934354 14 1475.1130 42.15143 *\n#> 3) cement>=0.7861846 23 5390.3320 50.86435 \n#> 6) age< -0.5419541 7 390.4204 40.08429 *\n#> 7) age>=-0.5419541 16 3830.5510 55.58062 *\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = 
reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 18.0\n#> 2 42.2\n#> 3 28.8\n#> 4 55.6\n#> 5 40.1\n#> 6 38.4\n#> 7 38.4\n#> 8 40.1\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(strength ~ ., data = reg_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> strength ~ cement + age\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] cement <= 0.72078\n#> | | [3] age <= -0.60316\n#> | | | [4] cement <= -0.38732: 11.141 (n = 12, err = 292.8)\n#> | | | [5] cement > -0.38732: 18.005 (n = 11, err = 401.5)\n#> | | [6] age > -0.60316\n#> | | | [7] cement <= 0.24945\n#> | | | | [8] age <= -0.2359: 28.756 (n = 24, err = 1450.6)\n#> | | | | [9] age > -0.2359: 39.014 (n = 11, err = 634.8)\n#> | | | [10] cement > 0.24945: 42.564 (n = 11, err = 1041.7)\n#> | [11] cement > 0.72078: 50.864 (n = 23, err = 5390.3)\n#> \n#> Number of inner nodes: 5\n#> Number of terminal nodes: 6\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 18.0\n#> 2 39.0\n#> 3 28.8\n#> 4 50.9\n#> 5 50.9\n#> 6 42.6\n#> 7 42.6\n#> 8 50.9\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n set_mode(\"regression\") |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> DecisionTreeRegressionModel: uid=decision_tree_regressor__0b7894ca_4b7d_4bd2_a584_afe151dd5002, depth=5, numNodes=63, numFeatures=8\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? 
x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 26.7\n#> 2 26.7\n#> 3 14.9\n#> 4 26.7\n#> 5 10.5\n#> 6 40.2\n#> 7 15.0\n#> 8 40.2\n#> 9 40.2\n#> 10 41.4\n#> # ℹ more rows\n```\n:::\n\n\n:::\n\n## Generalized Additive Models (`gen_additive_mod()`) \n\n:::{.panel-tabset}\n\n## `mgcv` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ngen_additive_mod_spec <- gen_additive_mod() |>\n # We need to set the mode since this engine works with multiple modes\n # and mgcv is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Smooth terms, such as mgcv's s(), are specified directly in the formula:\ngen_additive_mod_fit <- \n gen_additive_mod_spec |> \n fit(strength ~ s(age) + s(cement), data = reg_train)\ngen_additive_mod_fit\n#> parsnip model object\n#> \n#> \n#> Family: gaussian \n#> Link function: identity \n#> \n#> Formula:\n#> strength ~ s(age) + s(cement)\n#> \n#> Estimated degrees of freedom:\n#> 4.18 3.56 total = 8.74 \n#> \n#> GCV score: 108.4401\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(gen_additive_mod_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.1\n#> 2 41.2\n#> 3 26.7\n#> 4 55.9\n#> 5 35.2\n#> 6 37.1\n#> 7 38.5\n#> 8 39.6\npredict(gen_additive_mod_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 18.9 27.4\n#> 2 35.7 46.6\n#> 3 22.4 31.0\n#> 4 47.0 64.7\n#> 5 30.1 40.4\n#> 6 32.9 41.2\n#> 7 34.3 42.6\n#> 8 30.3 49.0\n```\n:::\n\n\n:::\n\n## Linear Regression (`linear_reg()`) \n\n:::{.panel-tabset}\n\n## `lm` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and lm is the default engine so there is no need to set that either.\nlinear_reg_spec <- linear_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> stats::lm(formula = strength ~ ., data = data)\n#> \n#> Coefficients:\n#> (Intercept) cement age \n#> 33.577 8.795 5.471\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 28.8 35.4\n#> 2 27.1 33.5\n#> 3 17.3 25.9\n#> 4 44.6 58.1\n#> 5 35.6 45.0\n#> 6 32.3 38.3\n#> 7 33.2 39.4\n#> 8 41.6 56.0\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 5.72 58.5\n#> 2 3.89 56.7\n#> 3 -4.94 48.2\n#> 4 24.3 78.5\n#> 5 13.7 67.0\n#> 6 8.95 61.7\n#> 7 9.89 62.7\n#> 8 21.6 76.0\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random 
number seed to an integer for reproducibility: \nset.seed(1)\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear regression\n#> \n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 \n#> batch size: 83 \n#> scaled validation loss after 1 epoch: 235\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.1\n#> 3 21.6\n#> 4 51.2\n#> 5 40.3\n#> 6 35.2\n#> 7 36.2\n#> 8 48.7\n```\n:::\n\n\n## `gee` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- \n linear_reg_spec |> \n fit(weight ~ Time + Diet + id_var(Rat), data = reg_group_train)\n#> Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27\n#> running glm to get initial regression estimate\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA\n#> gee S-function, version 4.13 modified 98/01/27 (1998) \n#> \n#> Model:\n#> Link: Identity \n#> Variance to Mean Relation: Gaussian \n#> Correlation Structure: Independent \n#> \n#> Call:\n#> gee::gee(formula = weight ~ Time + Diet, id = data$Rat, data = data, \n#> family = gaussian)\n#> \n#> Number of observations : 132 \n#> \n#> Maximum cluster size : 11 \n#> \n#> \n#> Coefficients:\n#> (Intercept) Time Diet2 Diet3 \n#> 245.410439 0.549192 185.621212 259.287879 \n#> \n#> Estimated Scale Parameter: 272.1604\n#> Number of Iterations: 1\n#> \n#> Working Correlation[1:4,1:4]\n#> [,1] [,2] [,3] [,4]\n#> [1,] 1 0 0 0\n#> [2,] 0 1 0 0\n#> [3,] 0 0 1 0\n#> [4,] 0 0 0 1\n#> \n#> \n#> Returned Error Value:\n#> [1] 0\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `glm` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glm\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = strength ~ ., family = stats::gaussian, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) cement age \n#> 33.577 8.795 5.471 \n#> \n#> Degrees of Freedom: 91 Total (i.e. 
Null); 89 Residual\n#> Null Deviance:\t 26560 \n#> Residual Deviance: 15480 \tAIC: 740.6\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 28.8 35.4\n#> 2 27.1 33.5\n#> 3 17.3 25.9\n#> 4 44.6 58.1\n#> 5 35.6 45.0\n#> 6 32.3 38.3\n#> 7 33.2 39.4\n#> 8 41.6 56.0\n```\n:::\n\n\n## `glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- \n linear_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\n#> Warning in lme4::glmer(formula = weight ~ Diet + Time + (1 | Rat), data = data,\n#> : calling glmer() with family=gaussian (identity link) as a shortcut to lmer()\n#> is deprecated; please call lmer() directly\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear mixed model fit by REML ['lmerMod']\n#> Formula: weight ~ Diet + Time + (1 | Rat)\n#> Data: data\n#> REML criterion at convergence: 955.6549\n#> Random effects:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 16.331 \n#> Residual 8.117 \n#> Number of obs: 132, groups: Rat, 12\n#> Fixed Effects:\n#> (Intercept) Diet2 Diet3 Time \n#> 245.4104 185.6212 259.2879 0.5492\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `glmnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"gaussian\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 9.5680\n#> 2 1 5.38 8.7180\n#> 3 1 9.85 7.9430\n#> 4 1 13.56 7.2380\n#> 5 1 16.64 6.5950\n#> 6 2 19.99 6.0090\n#> 7 2 23.68 5.4750\n#> 8 2 26.75 4.9890\n#> 9 2 29.29 4.5450\n#> 10 2 31.40 4.1420\n#> 11 2 33.15 3.7740\n#> 12 2 34.61 3.4380\n#> 13 2 35.82 3.1330\n#> 14 2 36.82 2.8550\n#> 15 2 37.65 2.6010\n#> 16 2 38.34 2.3700\n#> 17 2 38.92 2.1590\n#> 18 2 39.39 1.9680\n#> 19 2 39.79 1.7930\n#> 20 2 40.12 1.6340\n#> 21 2 40.39 1.4880\n#> 22 2 40.62 1.3560\n#> 23 2 40.80 1.2360\n#> 24 2 40.96 1.1260\n#> 25 2 41.09 1.0260\n#> 26 2 41.20 0.9348\n#> 27 2 41.29 0.8517\n#> 28 2 41.36 0.7761\n#> 29 2 41.42 0.7071\n#> 30 2 41.47 0.6443\n#> 31 2 41.52 0.5871\n#> 32 2 41.55 
0.5349\n#> 33 2 41.58 0.4874\n#> 34 2 41.60 0.4441\n#> 35 2 41.63 0.4046\n#> 36 2 41.64 0.3687\n#> 37 2 41.66 0.3359\n#> 38 2 41.67 0.3061\n#> 39 2 41.68 0.2789\n#> 40 2 41.68 0.2541\n#> 41 2 41.69 0.2316\n#> 42 2 41.70 0.2110\n#> 43 2 41.70 0.1922\n#> 44 2 41.71 0.1752\n#> 45 2 41.71 0.1596\n#> 46 2 41.71 0.1454\n#> 47 2 41.71 0.1325\n#> 48 2 41.71 0.1207\n#> 49 2 41.72 0.1100\n#> 50 2 41.72 0.1002\n#> 51 2 41.72 0.0913\n#> 52 2 41.72 0.0832\n#> 53 2 41.72 0.0758\n#> 54 2 41.72 0.0691\n#> 55 2 41.72 0.0630\n#> 56 2 41.72 0.0574\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.2\n#> 2 30.3\n#> 3 21.7\n#> 4 51.3\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.7\n```\n:::\n\n\n## `gls` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n # Also, nlme::gls() specifies the random effects outside of the formula so\n # we set that as an engine parameter\n set_engine(\"gls\", correlation = nlme::corCompSymm(form = ~Time|Rat))\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(weight ~ Time + Diet, data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Generalized least squares fit by REML\n#> Model: weight ~ Time + Diet \n#> Data: data \n#> Log-restricted-likelihood: -477.8274\n#> \n#> Coefficients:\n#> (Intercept) Time Diet2 Diet3 \n#> 245.410439 0.549192 185.621212 259.287879 \n#> \n#> Correlation Structure: Compound symmetry\n#> Formula: ~Time | Rat \n#> Parameter estimate(s):\n#> Rho \n#> 0.8019221 \n#> Degrees of freedom: 132 total; 128 residual\n#> Residual standard error: 18.23695\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: glm\n#> Model ID: GLM_model_R_1763571327438_5837 \n#> GLM Model: summary\n#> family link regularization\n#> 1 gaussian identity Elastic Net (alpha = 0.5, lambda = 0.01903 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 1\n#> training_frame\n#> 1 object_ujvnjgioue\n#> \n#> Coefficients: glm coefficients\n#> names coefficients 
standardized_coefficients\n#> 1 Intercept 33.577283 33.577283\n#> 2 cement 8.708461 8.708461\n#> 3 age 5.422201 5.422201\n#> \n#> H2ORegressionMetrics: glm\n#> ** Reported on training data. **\n#> \n#> MSE: 168.2822\n#> RMSE: 12.97236\n#> MAE: 10.62672\n#> RMSLE: 0.4645554\n#> Mean Residual Deviance : 168.2822\n#> R^2 : 0.4171988\n#> Null Deviance :26564.74\n#> Null D.o.F. :91\n#> Residual Deviance :15481.96\n#> Residual D.o.F. :89\n#> AIC :740.6438\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.7\n#> 4 51.2\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.7\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(596)\nlinear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = reg_train)\nlinear_reg_fit\n```\n:::\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Model: \"sequential_3\"\n#> ________________________________________________________________________________\n#> Layer (type) Output Shape Param # \n#> ================================================================================\n#> dense_6 (Dense) (None, 1) 3 \n#> dense_7 (Dense) (None, 1) 2 \n#> ================================================================================\n#> Total params: 5 (20.00 Byte)\n#> Trainable params: 5 (20.00 Byte)\n#> Non-trainable params: 0 (0.00 Byte)\n#> ________________________________________________________________________________\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_test)\n#> 1/1 - 0s - 43ms/epoch - 43ms/step\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 0.157 \n#> 2 -0.000953\n#> 3 -0.0695 \n#> 4 0.417 \n#> 5 0.291 \n#> 6 0.154 \n#> 7 0.170 \n#> 8 0.445\n```\n:::\n\n\n## `lme` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that. \n # nlme::lme() makes us set the random effects outside of the formula so we\n # add it as an engine parameter. 
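\n # (Any other arguments given to set_engine() are passed along to nlme::lme().)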
\n set_engine(\"lme\", random = ~ Time | Rat)\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(weight ~ Diet + Time, data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear mixed-effects model fit by REML\n#> Data: data \n#> Log-restricted-likelihood: -426.5662\n#> Fixed: weight ~ Diet + Time \n#> (Intercept) Diet2 Diet3 Time \n#> 240.483603 199.723140 264.893298 0.549192 \n#> \n#> Random effects:\n#> Formula: ~Time | Rat\n#> Structure: General positive-definite, Log-Cholesky parametrization\n#> StdDev Corr \n#> (Intercept) 25.2657397 (Intr)\n#> Time 0.3411097 -0.816\n#> Residual 4.5940697 \n#> \n#> Number of Observations: 132\n#> Number of Groups: 12\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 241.\n#> 2 245.\n#> 3 249.\n#> 4 253.\n#> 5 256.\n#> 6 260.\n#> 7 264.\n#> 8 265.\n#> 9 268.\n#> 10 272.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `lmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"lmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- \n linear_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Linear mixed model fit by REML ['lmerMod']\n#> Formula: weight ~ Diet + Time + (1 | Rat)\n#> Data: data\n#> REML criterion at convergence: 955.6549\n#> Random effects:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 16.331 \n#> Residual 8.117 \n#> Number of obs: 132, groups: Rat, 12\n#> Fixed Effects:\n#> (Intercept) Diet2 Diet3 Time \n#> 245.4104 185.6212 259.2879 0.5492\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `stan` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(357)\nlinear_reg_fit <- linear_reg_spec |> fit(weight ~ Diet + Time, data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> stan_glm\n#> family: gaussian [identity]\n#> formula: weight ~ Diet + Time\n#> observations: 132\n#> predictors: 4\n#> ------\n#> Median MAD_SD\n#> (Intercept) 245.3 3.3 \n#> Diet2 185.6 3.6 \n#> Diet3 259.3 3.4 \n#> Time 0.6 0.1 \n#> \n#> Auxiliary parameter(s):\n#> Median MAD_SD\n#> sigma 16.6 1.0 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see 
?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 257.\n#> 5 261.\n#> 6 265.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\npredict(linear_reg_fit, type = \"conf_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 240. 252.\n#> 2 244. 255.\n#> 3 249. 258.\n#> 4 253. 262.\n#> 5 257. 265.\n#> 6 261. 269.\n#> 7 265. 273.\n#> 8 265. 274.\n#> 9 268. 278.\n#> 10 271. 282.\n#> # ℹ 34 more rows\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 213. 278.\n#> 2 216. 282.\n#> 3 220. 287.\n#> 4 224. 290.\n#> 5 228. 292.\n#> 6 230. 297.\n#> 7 236. 301.\n#> 8 236. 302.\n#> 9 240. 305.\n#> 10 244. 310.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `stan_glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(895)\nlinear_reg_fit <- \n linear_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> stan_glmer\n#> family: gaussian [identity]\n#> formula: weight ~ Diet + Time + (1 | Rat)\n#> observations: 132\n#> ------\n#> Median MAD_SD\n#> (Intercept) 245.6 6.8 \n#> Diet2 185.7 11.5 \n#> Diet3 259.2 11.5 \n#> Time 0.5 0.0 \n#> \n#> Auxiliary parameter(s):\n#> Median MAD_SD\n#> sigma 8.2 0.5 \n#> \n#> Error terms:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 17.2 \n#> Residual 8.2 \n#> Num. levels: Rat 12 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 246.\n#> 2 250.\n#> 3 254.\n#> 4 258.\n#> 5 262.\n#> 6 266.\n#> 7 269.\n#> 8 270.\n#> 9 273.\n#> 10 277.\n#> # ℹ 34 more rows\npredict(linear_reg_fit, type = \"pred_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 205. 285.\n#> 2 211. 289.\n#> 3 214. 292.\n#> 4 218. 295.\n#> 5 221. 300.\n#> 6 225. 303.\n#> 7 230. 307.\n#> 8 230. 309.\n#> 9 233. 312.\n#> 10 237. 
314.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_spec <- linear_reg() |> \n set_engine(\"spark\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlinear_reg_fit <- linear_reg_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nlinear_reg_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> Coefficients:\n#> (Intercept) cement blast_furnace_slag fly_ash \n#> -21.80239627 0.12003251 0.10399582 0.08747677 \n#> water superplasticizer coarse_aggregate fine_aggregate \n#> -0.15701342 0.28531613 0.01777782 0.02018358 \n#> age \n#> 0.11678247\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(linear_reg_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 16.5\n#> 2 19.7\n#> 3 26.1\n#> 4 23.6\n#> 5 24.2\n#> 6 29.1\n#> 7 21.3\n#> 8 24.2\n#> 9 33.9\n#> 10 57.7\n#> # ℹ more rows\n```\n:::\n\n\n:::\n\n## Multivariate Adaptive Regression Splines (`mars()`) \n\n:::{.panel-tabset}\n\n## `earth` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_spec <- mars() |>\n # We need to set the mode since this engine works with multiple modes\n # and earth is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmars_fit <- mars_spec |> fit(strength ~ ., data = reg_train)\nmars_fit\n#> parsnip model object\n#> \n#> Selected 4 of 9 terms, and 2 of 2 predictors\n#> Termination condition: RSq changed by less than 0.001 at 9 terms\n#> Importance: age, cement\n#> Number of terms at each degree of interaction: 1 3 (additive model)\n#> GCV 113.532 RSS 8915.965 GRSq 0.6153128 RSq 0.6643684\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mars_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 22.0\n#> 2 43.1\n#> 3 28.1\n#> 4 58.0\n#> 5 33.8\n#> 6 34.9\n#> 7 36.3\n#> 8 43.5\n```\n:::\n\n\n:::\n\n## Neural Networks (`mlp()`) \n\n:::{.panel-tabset}\n\n## `nnet` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n # and nnet is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(159)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> a 2-5-1 network with 21 weights\n#> inputs: cement age \n#> output(s): strength \n#> options were - linear output units\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 14.8\n#> 2 38.5\n#> 3 32.0\n#> 4 63.6\n#> 5 43.5\n#> 6 42.7\n#> 7 42.3\n#> 8 33.1\n```\n:::\n\n\n## `brulee` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() 
|>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"brulee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(407)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> relu activation,\n#> 3 hidden units,\n#> 13 model parameters\n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 83 \n#> learn rate: 0.01 \n#> scaled validation loss after 9 epochs: 0.189\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.1\n#> 2 39.4\n#> 3 26.9\n#> 4 56.4\n#> 5 32.9\n#> 6 37.2\n#> 7 38.4\n#> 8 40.1\n```\n:::\n\n\n## `brulee_two_layer` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"brulee_two_layer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(585)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Multilayer perceptron\n#> \n#> c(relu,relu) activation,\n#> c(3,3) hidden units,\n#> 25 model parameters\n#> 92 samples, 2 features, numeric outcome \n#> weight decay: 0.001 \n#> dropout proportion: 0 \n#> batch size: 83 \n#> learn rate: 0.01 \n#> scaled validation loss after 3 epochs: 0.379\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.5\n#> 2 32.6\n#> 3 24.6\n#> 4 50.5\n#> 5 46.7\n#> 6 33.8\n#> 7 37.0\n#> 8 50.5\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(93)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\nmlp_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: deeplearning\n#> Model ID: DeepLearning_model_R_1763571327438_5838 \n#> Status of Neuron Layers: predicting .outcome, regression, gaussian distribution, Quadratic loss, 801 weights/biases, 14.5 KB, 920 training samples, mini-batch size 1\n#> layer units type dropout l1 l2 mean_rate rate_rms momentum\n#> 1 1 2 Input 0.00 % NA NA NA NA NA\n#> 2 2 200 Rectifier 0.00 % 0.000000 0.000000 0.012065 0.026861 0.000000\n#> 3 3 1 Linear NA 0.000000 0.000000 0.000591 0.000159 0.000000\n#> mean_weight weight_rms mean_bias bias_rms\n#> 1 NA NA NA NA\n#> 2 -0.002587 0.098240 0.499863 0.001026\n#> 3 
-0.001060 0.098144 0.001830 0.000000\n#> \n#> \n#> H2ORegressionMetrics: deeplearning\n#> ** Reported on training data. **\n#> ** Metrics reported on full training frame **\n#> \n#> MSE: 181.1378\n#> RMSE: 13.45874\n#> MAE: 10.46307\n#> RMSLE: 0.4752285\n#> Mean Residual Deviance : 181.1378\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.2\n#> 2 30.3\n#> 3 24.1\n#> 4 41.3\n#> 5 34.0\n#> 6 32.0\n#> 7 32.6\n#> 8 38.6\n```\n:::\n\n\n## `keras` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_spec <- mlp() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"keras\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(879)\nmlp_fit <- mlp_spec |> fit(strength ~ ., data = reg_train)\n```\n:::\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nmlp_fit\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(mlp_fit, new_data = reg_test)\n#> 1/1 - 0s - 43ms/epoch - 43ms/step\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 -0.386\n#> 2 -0.337\n#> 3 -0.299\n#> 4 -0.278\n#> 5 -0.384\n#> 6 -0.374\n#> 7 -0.373\n#> 8 -0.341\n```\n:::\n\n\n:::\n\n## K-Nearest Neighbors (`nearest_neighbor()`) \n\n:::{.panel-tabset}\n\n## `kknn` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_spec <- nearest_neighbor() |>\n # We need to set the mode since this engine works with multiple modes\n # and kknn is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnearest_neighbor_fit <- nearest_neighbor_spec |> fit(strength ~ ., data = reg_train)\nnearest_neighbor_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> kknn::train.kknn(formula = strength ~ ., data = data, ks = min_rows(5, data, 5))\n#> \n#> Type of response variable: continuous\n#> minimal mean absolute error: 8.257735\n#> Minimal mean squared error: 115.8737\n#> Best kernel: optimal\n#> Best k: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(nearest_neighbor_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 16.3\n#> 2 35.7\n#> 3 27.5\n#> 4 56.7\n#> 5 42.6\n#> 6 41.7\n#> 7 41.2\n#> 8 50.2\n```\n:::\n\n\n:::\n\n## Null Model (`null_model()`) \n\n:::{.panel-tabset}\n\n## `parsnip` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_spec <- null_model() |>\n # We need to set the mode since this engine works with multiple modes\n # and parsnip is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::
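\n\n\nFor regression, this model simply predicts the mean of the outcome in the training set, no matter the predictor values. As a minimal sketch, that constant can be computed directly (compare it to the fitted model's print below):\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# The null model's single prediction is the mean training set outcome\nmean(reg_train$strength)\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell 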
layout-align=\"center\"}\n\n```{.r .cell-code}\nnull_model_fit <- null_model_spec |> fit(strength ~ ., data = reg_train)\nnull_model_fit\n#> parsnip model object\n#> \n#> Null Classification Model\n#> Predicted Value: 33.57728\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(null_model_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 33.6\n#> 2 33.6\n#> 3 33.6\n#> 4 33.6\n#> 5 33.6\n#> 6 33.6\n#> 7 33.6\n#> 8 33.6\n```\n:::\n\n\n:::\n\n## Partial Least Squares (`pls()`) \n\n:::{.panel-tabset}\n\n## `mixOmics` \n\nThis engine requires the plsmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(plsmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_spec <- pls() |>\n # We need to set the mode since this engine works with multiple modes\n # and mixOmics is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npls_fit <- pls_spec |> fit(strength ~ ., data = reg_train)\npls_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> mixOmics::spls(X = x, Y = y, ncomp = ncomp, keepX = keepX) \n#> \n#> sPLS with a 'regression' mode with 2 sPLS components. \n#> You entered data X of dimensions: 92 2 \n#> You entered data Y of dimensions: 92 1 \n#> \n#> Selection of [2] [2] variables on each of the sPLS components on the X data set. \n#> Selection of [1] [1] variables on each of the sPLS components on the Y data set. \n#> \n#> Main numerical outputs: \n#> -------------------- \n#> loading vectors: see object$loadings \n#> variates: see object$variates \n#> variable names: see object$names \n#> \n#> Functions to visualise samples: \n#> -------------------- \n#> plotIndiv, plotArrow \n#> \n#> Functions to visualise variables: \n#> -------------------- \n#> plotVar, plotLoadings, network, cim\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(pls_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 32.1\n#> 2 30.3\n#> 3 21.6\n#> 4 51.4\n#> 5 40.3\n#> 6 35.3\n#> 7 36.3\n#> 8 48.8\n```\n:::\n\n\n:::\n\n## Poisson Regression (`poisson_reg()`) \n\n:::{.panel-tabset}\n\n## `glm` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and glm is the default engine so there is no need to set that either.\npoisson_reg_spec <- poisson_reg()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: stats::glm(formula = num_years ~ ., family = stats::poisson, \n#> data = data)\n#> \n#> Coefficients:\n#> (Intercept) age income \n#> 2.2861 0.2804 0.2822 \n#> \n#> Degrees of Freedom: 1460 Total (i.e. 
Null); 1458 Residual\n#> Null Deviance:\t 7434 \n#> Residual Deviance: 2597 \tAIC: 8446\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.6 \n#> 2 6.66\n#> 3 11.8 \n#> 4 24.8 \n#> 5 26.6 \n#> 6 8.23\n#> 7 32.1 \n#> 8 4.86\n#> 9 28.3\n```\n:::\n\n\n## `gee` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"gee\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- \n poisson_reg_spec |> \n fit(weight ~ Diet + Time + id_var(Rat), data = reg_group_train)\n#> Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27\n#> running glm to get initial regression estimate\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA\n#> gee S-function, version 4.13 modified 98/01/27 (1998) \n#> \n#> Model:\n#> Link: Logarithm \n#> Variance to Mean Relation: Poisson \n#> Correlation Structure: Independent \n#> \n#> Call:\n#> gee::gee(formula = weight ~ Diet + Time, id = data$Rat, data = data, \n#> family = stats::poisson)\n#> \n#> Number of observations : 132 \n#> \n#> Maximum cluster size : 11 \n#> \n#> \n#> Coefficients:\n#> (Intercept) Diet2 Diet3 Time \n#> 5.525683187 0.532717136 0.684495610 0.001467487 \n#> \n#> Estimated Scale Parameter: 0.6879328\n#> Number of Iterations: 1\n#> \n#> Working Correlation[1:4,1:4]\n#> [,1] [,2] [,3] [,4]\n#> [1,] 1 0 0 0\n#> [2,] 0 1 0 0\n#> [3,] 0 0 1 0\n#> [4,] 0 0 0 1\n#> \n#> \n#> Returned Error Value:\n#> [1] 0\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Can't reproduce this:\n# predict(poisson_reg_fit, new_data = reg_group_test)\n```\n:::\n\n\n## `glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(826)\npoisson_reg_fit <- \n poisson_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\n#> Warning in checkConv(attr(opt, \"derivs\"), opt$par, ctrl = control$checkConv, :\n#> Model failed to converge with max|grad| = 0.00394285 (tol = 0.002, component 1)\n#> Warning in checkConv(attr(opt, \"derivs\"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue\n#> - Rescale variables?\npoisson_reg_fit\n#> parsnip model object\n#> \n#> Generalized linear mixed model fit by maximum likelihood (Laplace\n#> Approximation) [glmerMod]\n#> Family: poisson ( log )\n#> Formula: weight ~ Diet + Time + (1 | Rat)\n#> Data: data\n#> AIC BIC logLik -2*log(L) df.resid 
\n#> 1079.1349 1093.5489 -534.5675 1069.1349 127 \n#> Random effects:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 0.03683 \n#> Number of obs: 132, groups: Rat, 12\n#> Fixed Effects:\n#> (Intercept) Diet2 Diet3 Time \n#> 5.524796 0.533446 0.684637 0.001467 \n#> optimizer (Nelder_Mead) convergence code: 0 (OK) ; 0 optimizer warnings; 2 lme4 warnings\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 251.\n#> 2 254.\n#> 3 256.\n#> 4 259.\n#> 5 262.\n#> 6 264.\n#> 7 267.\n#> 8 268.\n#> 9 270.\n#> 10 273.\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `glmnet` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = \"poisson\") \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 5.9710\n#> 2 1 10.26 5.4400\n#> 3 1 18.31 4.9570\n#> 4 2 24.84 4.5170\n#> 5 2 32.06 4.1150\n#> 6 2 37.94 3.7500\n#> 7 2 42.73 3.4170\n#> 8 2 46.65 3.1130\n#> 9 2 49.87 2.8370\n#> 10 2 52.51 2.5850\n#> 11 2 54.69 2.3550\n#> 12 2 56.48 2.1460\n#> 13 2 57.96 1.9550\n#> 14 2 59.18 1.7810\n#> 15 2 60.19 1.6230\n#> 16 2 61.03 1.4790\n#> 17 2 61.72 1.3480\n#> 18 2 62.29 1.2280\n#> 19 2 62.76 1.1190\n#> 20 2 63.16 1.0190\n#> 21 2 63.48 0.9289\n#> 22 2 63.75 0.8463\n#> 23 2 63.98 0.7712\n#> 24 2 64.16 0.7026\n#> 25 2 64.31 0.6402\n#> 26 2 64.44 0.5833\n#> 27 2 64.55 0.5315\n#> 28 2 64.64 0.4843\n#> 29 2 64.71 0.4413\n#> 30 2 64.77 0.4021\n#> 31 2 64.82 0.3664\n#> 32 2 64.86 0.3338\n#> 33 2 64.90 0.3042\n#> 34 2 64.92 0.2771\n#> 35 2 64.95 0.2525\n#> 36 2 64.97 0.2301\n#> 37 2 64.98 0.2096\n#> 38 2 65.00 0.1910\n#> 39 2 65.01 0.1741\n#> 40 2 65.02 0.1586\n#> 41 2 65.03 0.1445\n#> 42 2 65.03 0.1317\n#> 43 2 65.04 0.1200\n#> 44 2 65.04 0.1093\n#> 45 2 65.05 0.0996\n#> 46 2 65.05 0.0907\n#> 47 2 65.05 0.0827\n#> 48 2 65.05 0.0753\n#> 49 2 65.06 0.0687\n#> 50 2 65.06 0.0625\n#> 51 2 65.06 0.0570\n#> 52 2 65.06 0.0519\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.4 \n#> 2 6.70\n#> 3 11.8 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.27\n#> 7 31.8 \n#> 8 4.91\n#> 9 28.1\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = 
count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: glm\n#> Model ID: GLM_model_R_1763571327438_5839 \n#> GLM Model: summary\n#> family link regularization\n#> 1 poisson log Elastic Net (alpha = 0.5, lambda = 0.01194 )\n#> number_of_predictors_total number_of_active_predictors number_of_iterations\n#> 1 2 2 4\n#> training_frame\n#> 1 object_kyirzmfbti\n#> \n#> Coefficients: glm coefficients\n#> names coefficients standardized_coefficients\n#> 1 Intercept 2.286411 2.286411\n#> 2 age 0.279967 0.279967\n#> 3 income 0.281952 0.281952\n#> \n#> H2ORegressionMetrics: glm\n#> ** Reported on training data. **\n#> \n#> MSE: 18.40519\n#> RMSE: 4.290128\n#> MAE: 3.297048\n#> RMSLE: 0.467537\n#> Mean Residual Deviance : 1.777749\n#> R^2 : 0.6934292\n#> Null Deviance :7434.374\n#> Null D.o.F. :1460\n#> Residual Deviance :2597.291\n#> Residual D.o.F. :1458\n#> AIC :8445.967\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.6 \n#> 2 6.67\n#> 3 11.8 \n#> 4 24.8 \n#> 5 26.5 \n#> 6 8.24\n#> 7 32.0 \n#> 8 4.87\n#> 9 28.2\n```\n:::\n\n\n## `hurdle` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"hurdle\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> pscl::hurdle(formula = num_years ~ ., data = data)\n#> \n#> Count model coefficients (truncated poisson with log link):\n#> (Intercept) age income \n#> 2.2911 0.2749 0.2820 \n#> \n#> Zero hurdle model coefficients (binomial with logit link):\n#> (Intercept) age income \n#> 24.656 5.611 13.092\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> \n#> 1 31.5 \n#> 2 6.74\n#> 3 11.9 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.32\n#> 7 31.9 \n#> 8 4.89\n#> 9 28.2\n```\n:::\n\n\n## `stan` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(213)\npoisson_reg_fit <- \n poisson_reg_spec |> \n fit(weight ~ Diet + Time, data = reg_group_train)\n#> \n#> SAMPLING FOR MODEL 'count' NOW (CHAIN 1).\n#> Chain 1: \n#> Chain 1: Gradient evaluation took 8.9e-05 seconds\n#> Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.89 seconds.\n#> Chain 1: Adjust your expectations accordingly!\n#> Chain 1: \n#> 
Chain 1: \n#> Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)\n#> Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)\n#> Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)\n#> Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)\n#> Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)\n#> Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)\n#> Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)\n#> Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)\n#> Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)\n#> Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)\n#> Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)\n#> Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)\n#> Chain 1: \n#> Chain 1: Elapsed Time: 0.034 seconds (Warm-up)\n#> Chain 1: 0.034 seconds (Sampling)\n#> Chain 1: 0.068 seconds (Total)\n#> Chain 1: \n#> \n#> SAMPLING FOR MODEL 'count' NOW (CHAIN 2).\n#> Chain 2: \n#> Chain 2: Gradient evaluation took 6e-06 seconds\n#> Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.06 seconds.\n#> Chain 2: Adjust your expectations accordingly!\n#> Chain 2: \n#> Chain 2: \n#> Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)\n#> Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)\n#> Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)\n#> Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)\n#> Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)\n#> Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)\n#> Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)\n#> Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)\n#> Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)\n#> Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)\n#> Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)\n#> Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)\n#> Chain 2: \n#> Chain 2: Elapsed Time: 0.035 seconds (Warm-up)\n#> Chain 2: 0.034 seconds (Sampling)\n#> Chain 2: 0.069 seconds (Total)\n#> Chain 2: \n#> \n#> SAMPLING FOR MODEL 'count' NOW (CHAIN 3).\n#> Chain 3: \n#> Chain 3: Gradient evaluation took 6e-06 seconds\n#> Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.06 seconds.\n#> Chain 3: Adjust your expectations accordingly!\n#> Chain 3: \n#> Chain 3: \n#> Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)\n#> Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)\n#> Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)\n#> Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)\n#> Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)\n#> Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)\n#> Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)\n#> Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)\n#> Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)\n#> Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)\n#> Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)\n#> Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)\n#> Chain 3: \n#> Chain 3: Elapsed Time: 0.035 seconds (Warm-up)\n#> Chain 3: 0.034 seconds (Sampling)\n#> Chain 3: 0.069 seconds (Total)\n#> Chain 3: \n#> \n#> SAMPLING FOR MODEL 'count' NOW (CHAIN 4).\n#> Chain 4: \n#> Chain 4: Gradient evaluation took 5e-06 seconds\n#> Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.05 seconds.\n#> Chain 4: Adjust your expectations accordingly!\n#> Chain 4: \n#> Chain 4: \n#> Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)\n#> Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)\n#> Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)\n#> Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)\n#> Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)\n#> Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)\n#> Chain 4: Iteration: 1001 / 
2000 [ 50%] (Sampling)\n#> Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)\n#> Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)\n#> Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)\n#> Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)\n#> Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)\n#> Chain 4: \n#> Chain 4: Elapsed Time: 0.035 seconds (Warm-up)\n#> Chain 4: 0.037 seconds (Sampling)\n#> Chain 4: 0.072 seconds (Total)\n#> Chain 4:\npoisson_reg_fit\n#> parsnip model object\n#> \n#> stan_glm\n#> family: poisson [log]\n#> formula: weight ~ Diet + Time\n#> observations: 132\n#> predictors: 4\n#> ------\n#> Median MAD_SD\n#> (Intercept) 5.5 0.0 \n#> Diet2 0.5 0.0 \n#> Diet3 0.7 0.0 \n#> Time 0.0 0.0 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> \n#> 1 5.53\n#> 2 5.54\n#> 3 5.55\n#> 4 5.56\n#> 5 5.57\n#> 6 5.58\n#> 7 5.59\n#> 8 5.59\n#> 9 5.60\n#> 10 5.61\n#> # ℹ 34 more rows\npredict(poisson_reg_fit, type = \"conf_int\", new_data = reg_group_test)\n#> Instead of posterior_linpred(..., transform=TRUE) please call posterior_epred(), which provides equivalent functionality.\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 246. 257.\n#> 2 249. 259.\n#> 3 252. 261.\n#> 4 255. 263.\n#> 5 258. 266.\n#> 6 261. 269.\n#> 7 263. 272.\n#> 8 264. 272.\n#> 9 266. 275.\n#> 10 268. 278.\n#> # ℹ 34 more rows\npredict(poisson_reg_fit, type = \"pred_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 220 284\n#> 2 222 286\n#> 3 225 288\n#> 4 228 291\n#> 5 230 296\n#> 6 232 297\n#> 7 235 300\n#> 8 236 300\n#> 9 238 303\n#> 10 241 306\n#> # ℹ 34 more rows\n```\n:::\n\n\n## `stan_glmer` \n\nThis engine requires the multilevelmod extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(multilevelmod)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"stan_glmer\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(690)\npoisson_reg_fit <- \n poisson_reg_spec |> \n fit(weight ~ Diet + Time + (1|Rat), data = reg_group_train)\npoisson_reg_fit\n#> parsnip model object\n#> \n#> stan_glmer\n#> family: poisson [log]\n#> formula: weight ~ Diet + Time + (1 | Rat)\n#> observations: 132\n#> ------\n#> Median MAD_SD\n#> (Intercept) 5.5 0.0 \n#> Diet2 0.5 0.0 \n#> Diet3 0.7 0.0 \n#> Time 0.0 0.0 \n#> \n#> Error terms:\n#> Groups Name Std.Dev.\n#> Rat (Intercept) 0.054 \n#> Num. 
levels: Rat 12 \n#> \n#> ------\n#> * For help interpreting the printed output see ?print.stanreg\n#> * For info on the priors used see ?prior_summary.stanreg\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = reg_group_test)\n#> # A tibble: 44 × 1\n#> .pred\n#> <dbl>\n#> 1 251.\n#> 2 254.\n#> 3 256.\n#> 4 259.\n#> 5 261.\n#> 6 264.\n#> 7 267.\n#> 8 268.\n#> 9 270.\n#> 10 272.\n#> # ℹ 34 more rows\npredict(poisson_reg_fit, type = \"pred_int\", new_data = reg_group_test)\n#> # A tibble: 44 × 2\n#> .pred_lower .pred_upper\n#> <dbl> <dbl>\n#> 1 210. 294 \n#> 2 213 298 \n#> 3 214 301 \n#> 4 217 304 \n#> 5 220 306 \n#> 6 222 309 \n#> 7 223 313.\n#> 8 225 315 \n#> 9 226 317.\n#> 10 229 320 \n#> # ℹ 34 more rows\n```\n:::\n\n\n## `zeroinfl` \n\nThis engine requires the poissonreg extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(poissonreg)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_spec <- poisson_reg() |> \n # This engine works with a single mode so no need to set that\n set_engine(\"zeroinfl\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npoisson_reg_fit <- poisson_reg_spec |> fit(num_years ~ ., data = count_train)\n#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred\npoisson_reg_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> pscl::zeroinfl(formula = num_years ~ ., data = data)\n#> \n#> Count model coefficients (poisson with log link):\n#> (Intercept) age income \n#> 2.2912 0.2748 0.2821 \n#> \n#> Zero-inflation model coefficients (binomial with logit link):\n#> (Intercept) age income \n#> -48.26 -18.22 -11.72\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(poisson_reg_fit, new_data = count_test)\n#> # A tibble: 9 × 1\n#> .pred\n#> <dbl>\n#> 1 31.5 \n#> 2 6.74\n#> 3 11.9 \n#> 4 24.6 \n#> 5 26.4 \n#> 6 8.31\n#> 7 31.9 \n#> 8 4.93\n#> 9 28.2\n```\n:::\n\n\n:::\n\n## Random Forests (`rand_forest()`) \n\n:::{.panel-tabset}\n\n## `ranger` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes.\n # ranger is the default engine, so we would not normally need to set it;\n # however, we do so here to pass the keep.inbag = TRUE option, which lets us\n # produce interval predictions below. This is not generally required.\n set_engine(\"ranger\", keep.inbag = TRUE) |> 
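\n # (keep.inbag stores each tree's bootstrap membership; ranger's\n # infinitesimal-jackknife standard errors behind type = \"conf_int\" rely on it.)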
\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(860)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Ranger result\n#> \n#> Call:\n#> ranger::ranger(x = maybe_data_frame(x), y = y, keep.inbag = ~TRUE, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1)) \n#> \n#> Type: Regression \n#> Number of trees: 500 \n#> Sample size: 92 \n#> Number of independent variables: 2 \n#> Mtry: 1 \n#> Target node size: 5 \n#> Variable importance mode: none \n#> Splitrule: variance \n#> OOB prediction error (MSE): 92.94531 \n#> R squared (OOB): 0.6816071\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.6\n#> 2 36.9\n#> 3 28.4\n#> 4 56.5\n#> 5 38.6\n#> 6 36.5\n#> 7 38.7\n#> 8 34.4\npredict(rand_forest_fit, type = \"conf_int\", new_data = reg_test)\n#> Warning in rInfJack(pred = result$predictions, inbag = inbag.counts, used.trees\n#> = 1:num.trees): Sample size <=20, no calibration performed.\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 18.1 29.1\n#> 2 32.6 41.1\n#> 3 24.0 32.9\n#> 4 45.4 67.7\n#> 5 33.0 44.3\n#> 6 32.0 41.0\n#> 7 35.1 42.3\n#> 8 28.4 40.3\n```\n:::\n\n\n## `aorsf` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(47)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random regression forest\n#> \n#> Linear combinations: Accelerated Linear regression\n#> N observations: 92\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 13.994\n#> Min observations in leaf: 5\n#> OOB stat value: 0.59\n#> OOB stat type: RSQ\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 25.2\n#> 2 36.4\n#> 3 29.7\n#> 4 55.5\n#> 5 42.3\n#> 6 38.5\n#> 7 40.7\n#> 8 52.7\n```\n:::\n\n\n## `grf` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"grf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(130)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model 
object\n#> \n#> GRF forest object of type regression_forest \n#> Number of trees: 2000 \n#> Number of training samples: 92 \n#> Variable importance: \n#> 1 2 \n#> 0.51 0.49\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 26.8\n#> 2 38.9\n#> 3 28.3\n#> 4 47.0\n#> 5 41.1\n#> 6 36.4\n#> 7 38.3\n#> 8 33.8\npredict(rand_forest_fit, type = \"conf_int\", new_data = reg_test)\n#> # A tibble: 8 × 2\n#> .pred_lower .pred_upper\n#> \n#> 1 34.8 18.8\n#> 2 47.1 30.8\n#> 3 31.9 24.7\n#> 4 58.3 35.8\n#> 5 48.0 34.1\n#> 6 40.0 32.9\n#> 7 43.7 32.9\n#> 8 43.9 23.7\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(211)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: drf\n#> Model ID: DRF_model_R_1763571327438_5840 \n#> Model Summary: \n#> number_of_trees number_of_internal_trees model_size_in_bytes min_depth\n#> 1 50 50 22318 7\n#> max_depth mean_depth min_leaves max_leaves mean_leaves\n#> 1 14 9.04000 14 43 30.86000\n#> \n#> \n#> H2ORegressionMetrics: drf\n#> ** Reported on training data. **\n#> ** Metrics reported on Out-Of-Bag training samples **\n#> \n#> MSE: 89.19785\n#> RMSE: 9.444462\n#> MAE: 7.597463\n#> RMSLE: 0.3303384\n#> Mean Residual Deviance : 89.19785\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 24.9\n#> 2 36.4\n#> 3 28.1\n#> 4 56.8\n#> 5 39.0\n#> 6 37.8\n#> 7 37.4\n#> 8 31.8\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the bonsai extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(bonsai)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(981)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\n```\n:::\n\n\nThe print method has a lot of output: \n\n
\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncapture.output(print(rand_forest_fit))[1:100] |> cat(sep = \"\\n\")\n#> parsnip model object\n#> \n#> $nodes\n#> $nodes[[1]]\n#> [1] root\n#> | [2] V2 <= 0.31678\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -0.89134 *\n#> | | | [6] V2 > -0.89134 *\n#> | [7] V2 > 0.31678\n#> | | [8] V3 <= -0.60316 *\n#> | | [9] V3 > -0.60316 *\n#> \n#> $nodes[[2]]\n#> [1] root\n#> | [2] V2 <= 0.62459\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -1.16452 *\n#> | | | [6] V2 > -1.16452\n#> | | | | [7] V3 <= -0.2359 *\n#> | | | | [8] V3 > -0.2359 *\n#> | [9] V2 > 0.62459 *\n#> \n#> $nodes[[3]]\n#> [1] root\n#> | [2] V2 <= 0.34564\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -1.19338 *\n#> | | | [6] V2 > -1.19338 *\n#> | [7] V2 > 0.34564\n#> | | [8] V2 <= 1.21134 *\n#> | | [9] V2 > 1.21134 *\n#> \n#> $nodes[[4]]\n#> [1] root\n#> | [2] V2 <= 0.34564\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V3 <= 0.25377 *\n#> | | | [6] V3 > 0.25377 *\n#> | [7] V2 > 0.34564\n#> | | [8] V3 <= -0.60316 *\n#> | | [9] V3 > -0.60316 *\n#> \n#> $nodes[[5]]\n#> [1] root\n#> | [2] V2 <= 0.62459\n#> | | [3] V3 <= -0.48074 *\n#> | | [4] V3 > -0.48074\n#> | | | [5] V2 <= -1.12604 *\n#> | | | [6] V2 > -1.12604\n#> | | | | [7] V3 <= -0.2359 *\n#> | | | | [8] V3 > -0.2359 *\n#> | [9] V2 > 0.62459 *\n#> \n#> $nodes[[6]]\n#> [1] root\n#> | [2] V2 <= 0.72078\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -0.84517 *\n#> | | | [6] V2 > -0.84517 *\n#> | [7] V2 > 0.72078 *\n#> \n#> $nodes[[7]]\n#> [1] root\n#> | [2] V2 <= 0.72078\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V3 <= -0.2359\n#> | | | | [6] V2 <= 0.24945 *\n#> | | | | [7] V2 > 0.24945 *\n#> | | | [8] V3 > -0.2359 *\n#> | [9] V2 > 0.72078 *\n#> \n#> $nodes[[8]]\n#> [1] root\n#> | [2] V2 <= 0.72078\n#> | | [3] V3 <= -0.48074 *\n#> | | [4] V3 > -0.48074\n#> | | | [5] V3 <= -0.2359 *\n#> | | | [6] V3 > -0.2359 *\n#> | [7] V2 > 0.72078 *\n#> \n#> $nodes[[9]]\n#> [1] root\n#> | [2] V2 <= 0.62459\n#> | | [3] V3 <= -0.60316 *\n#> | | [4] V3 > -0.60316\n#> | | | [5] V2 <= -0.23149\n#> | | | | [6] V2 <= -1.09526 *\n#> | | | | [7] V2 > -1.09526 *\n#> | | | [8] V2 > -0.23149 *\n#> | [9] V2 > 0.62459 *\n#> \n#> $nodes[[10]]\n#> [1] root\n```\n:::\n\n
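\nIf you need to compute on the fitted ensemble itself rather than its print method, you can pull the engine object out of the parsnip wrapper with `extract_fit_engine()`. A minimal sketch, assuming the `rand_forest_fit` object from above; the `$nodes` element of the underlying `partykit::cforest()` result (visible in the print) has one entry per tree:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Pull the underlying partykit ensemble out of the parsnip fit\ncforest_engine <- extract_fit_engine(rand_forest_fit)\n\n# One element per tree in the forest\nlength(cforest_engine$nodes)\n```\n:::\n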
\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 16.3\n#> 2 37.7\n#> 3 28.5\n#> 4 50.6\n#> 5 49.2\n#> 6 36.1\n#> 7 38.6\n#> 8 49.7\n```\n:::\n\n\n## `randomForest` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"randomForest\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(793)\nrand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = reg_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> \n#> Call:\n#> randomForest(x = maybe_data_frame(x), y = y) \n#> Type of random forest: regression\n#> Number of trees: 500\n#> No. of variables tried at each split: 1\n#> \n#> Mean of squared residuals: 90.38475\n#> % Var explained: 68.7\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 23.5\n#> 2 36.8\n#> 3 28.6\n#> 4 58.0\n#> 5 38.3\n#> 6 35.4\n#> 7 38.1\n#> 8 33.7\n```\n:::\n\n\n## `spark` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n set_engine(\"spark\") |> \n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(157)\nrand_forest_fit <- rand_forest_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)\nrand_forest_fit\n#> parsnip model object\n#> \n#> Formula: compressive_strength ~ .\n#> \n#> RandomForestRegressionModel: uid=random_forest__b4dbadd6_b45e_4531_8bab_c06a4e88a0c0, numTrees=20, numFeatures=8\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, new_data = tbl_reg$test)\n#> # Source: SQL [?? 
x 1]\n#> # Database: spark_connection\n#> pred\n#> \n#> 1 28.2\n#> 2 29.6\n#> 3 23.0\n#> 4 28.2\n#> 5 15.2\n#> 6 35.3\n#> 7 18.6\n#> 8 31.9\n#> 9 36.3\n#> 10 45.4\n#> # ℹ more rows\n```\n:::\n\n\n:::\n\n## Rule Fit (`rule_fit()`) \n\n:::{.panel-tabset}\n\n## `xrf` \n\nThis engine requires the rules extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(rules)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n # and xrf is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(431)\nrule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> An eXtreme RuleFit model of 179 rules.\n#> \n#> Original Formula:\n#> \n#> strength ~ cement + age\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 27.5\n#> 2 32.0\n#> 3 26.5\n#> 4 52.9\n#> 5 35.9\n#> 6 31.8\n#> 7 46.2\n#> 8 30.8\n```\n:::\n\n\n## `h2o` \n\nThis engine requires the agua extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(agua)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrule_fit_spec <- rule_fit() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"h2o\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(236)\nrule_fit_fit <- rule_fit_spec |> fit(strength ~ ., data = reg_train)\nrule_fit_fit\n#> parsnip model object\n#> \n#> Model Details:\n#> ==============\n#> \n#> H2ORegressionModel: rulefit\n#> Model ID: RuleFit_model_R_1763571327438_5841 \n#> Rulefit Model Summary: \n#> family link regularization number_of_predictors_total\n#> 1 gaussian identity Lasso (lambda = 0.9516 ) 1917\n#> number_of_active_predictors number_of_iterations rule_ensemble_size\n#> 1 51 1 1915\n#> number_of_trees number_of_internal_trees min_depth max_depth mean_depth\n#> 1 150 150 0 5 4.00000\n#> min_leaves max_leaves mean_leaves\n#> 1 0 28 12.76667\n#> \n#> \n#> H2ORegressionMetrics: rulefit\n#> ** Reported on training data. 
**\n#> \n#> MSE: 90.45501\n#> RMSE: 9.510784\n#> MAE: 7.15224\n#> RMSLE: 0.3531064\n#> Mean Residual Deviance : 90.45501\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rule_fit_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 26.9\n#> 2 35.5\n#> 3 26.9\n#> 4 50.1\n#> 5 42.1\n#> 6 34.5\n#> 7 39.3\n#> 8 40.8\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Linear Kernel) (`svm_linear()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"regression\") |>\n set_engine(\"kernlab\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(strength ~ ., data = reg_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Linear (vanilla) kernel function. \n#> \n#> Number of Support Vectors : 85 \n#> \n#> Objective Function Value : -47.4495 \n#> Training error : 0.606701\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 29.4\n#> 2 30.9\n#> 3 21.7\n#> 4 47.1\n#> 5 36.4\n#> 6 33.4\n#> 7 34.2\n#> 8 43.2\n```\n:::\n\n\n## `LiblineaR` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_spec <- svm_linear() |>\n # We need to set the mode since this engine works with multiple modes\n # and LiblineaR is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_linear_fit <- svm_linear_spec |> fit(strength ~ ., data = reg_train)\nsvm_linear_fit\n#> parsnip model object\n#> \n#> $TypeDetail\n#> [1] \"L2-regularized L2-loss support vector regression primal (L2R_L2LOSS_SVR)\"\n#> \n#> $Type\n#> [1] 11\n#> \n#> $W\n#> cement age Bias\n#> [1,] 8.665447 5.486263 33.34299\n#> \n#> $Bias\n#> [1] 1\n#> \n#> $NbClass\n#> [1] 2\n#> \n#> attr(,\"class\")\n#> [1] \"LiblineaR\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_linear_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> \n#> 1 31.9\n#> 2 30.1\n#> 3 21.5\n#> 4 50.9\n#> 5 39.9\n#> 6 35.0\n#> 7 36.0\n#> 8 48.3\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Polynomial Kernel) (`svm_poly()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_spec <- svm_poly() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_poly_fit <- svm_poly_spec |> fit(strength ~ ., data = reg_train)\n#> Setting default kernel parameters\nsvm_poly_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr 
(regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Polynomial kernel function. \n#> Hyperparameters : degree = 1 scale = 1 offset = 1 \n#> \n#> Number of Support Vectors : 85 \n#> \n#> Objective Function Value : -47.4495 \n#> Training error : 0.606702\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_poly_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> <dbl>\n#> 1 29.4\n#> 2 30.9\n#> 3 21.7\n#> 4 47.1\n#> 5 36.4\n#> 6 33.4\n#> 7 34.2\n#> 8 43.2\n```\n:::\n\n\n:::\n\n## Support Vector Machine (Radial Basis Function Kernel) (`svm_rbf()`) \n\n:::{.panel-tabset}\n\n## `kernlab` \n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_spec <- svm_rbf() |>\n # We need to set the mode since this engine works with multiple modes\n # and kernlab is the default engine so there is no need to set that either.\n set_mode(\"regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nsvm_rbf_fit <- svm_rbf_spec |> fit(strength ~ ., data = reg_train)\nsvm_rbf_fit\n#> parsnip model object\n#> \n#> Support Vector Machine object of class \"ksvm\" \n#> \n#> SV type: eps-svr (regression) \n#> parameter : epsilon = 0.1 cost C = 1 \n#> \n#> Gaussian Radial Basis kernel function. \n#> Hyperparameter : sigma = 0.850174270140177 \n#> \n#> Number of Support Vectors : 79 \n#> \n#> Objective Function Value : -33.0277 \n#> Training error : 0.28361\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(svm_rbf_fit, new_data = reg_test)\n#> # A tibble: 8 × 1\n#> .pred\n#> <dbl>\n#> 1 20.0\n#> 2 41.3\n#> 3 26.0\n#> 4 53.5\n#> 5 35.2\n#> 6 34.7\n#> 7 36.2\n#> 8 42.3\n```\n:::\n\n\n\n:::\n\n# Censored Regression Models\n\nLet's simulate a data set using the prodlim and survival packages: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(survival)\n#> \n#> Attaching package: 'survival'\n#> The following object is masked from 'package:future':\n#> \n#> cluster\nlibrary(prodlim)\n\nset.seed(1000)\ncns_data <- \n SimSurv(250) |> \n mutate(event_time = Surv(time, event)) |> \n select(event_time, X1, X2)\n\ncns_split <- initial_split(cns_data, prop = 0.98)\ncns_split\n#> <Training/Testing/Total>\n#> <245/5/250>\ncns_train <- training(cns_split)\ncns_test <- testing(cns_split)\n```\n:::\n\n\nFor some types of predictions, we need to specify the _evaluation time(s)_ at which predictions are made. 
We'll use these three times to demonstrate: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\neval_times <- c(1, 3, 5)\n```\n:::\n\n\n\n## Bagged Decision Trees (`bag_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_spec <- bag_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"censored regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit <- bag_tree_spec |> fit(event_time ~ ., data = cns_train)\nbag_tree_fit\n#> parsnip model object\n#> \n#> \n#> Bagging survival trees with 25 bootstrap replications \n#> \n#> Call: bagging.data.frame(formula = event_time ~ ., data = data, cp = ~0, \n#> minsplit = ~2)\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(bag_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.65\n#> 2 4.12\n#> 3 5.03\n#> 4 5.58\n#> 5 4.88\npredict(bag_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nbag_tree_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.993\n#> 2 3 0.864\n#> 3 5 0.638\n```\n:::\n\n\n:::\n\n## Boosted Decision Trees (`boost_tree()`) \n\n:::{.panel-tabset}\n\n## `mboost` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_spec <- boost_tree() |> \n set_mode(\"censored regression\") |> \n set_engine(\"mboost\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(852)\nboost_tree_fit <- boost_tree_spec |> fit(event_time ~ ., data = cns_train)\nboost_tree_fit\n#> parsnip model object\n#> \n#> \n#> \t Model-based Boosting\n#> \n#> Call:\n#> mboost::blackboost(formula = formula, data = data, family = family, control = mboost::boost_control(), tree_controls = partykit::ctree_control(teststat = \"quadratic\", testtype = \"Teststatistic\", mincriterion = 0, minsplit = 10, minbucket = 4, maxdepth = 2, saveinfo = FALSE))\n#> \n#> \n#> \t Cox Partial Likelihood \n#> \n#> Loss function: \n#> \n#> Number of boosting iterations: mstop = 100 \n#> Step size: 0.1 \n#> Offset: 0 \n#> Number of baselearners: 1\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(boost_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 6.51\n#> 2 3.92\n#> 3 4.51\n#> 4 7.17\n#> 5 
4.51\npredict(boost_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(boost_tree_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 0.00839\n#> 2 -1.14 \n#> 3 -0.823 \n#> 4 0.229 \n#> 5 -0.823\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nboost_tree_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.982\n#> 2 3 0.877\n#> 3 5 0.657\n```\n:::\n\n\n:::\n\n## Decision Tree (`decision_tree()`) \n\n:::{.panel-tabset}\n\n## `rpart` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n # and rpart is the default engine so there is no need to set that either.\n set_mode(\"censored regression\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> $rpart\n#> n= 245 \n#> \n#> node), split, n, deviance, yval\n#> * denotes terminal node\n#> \n#> 1) root 245 329.03530 1.0000000 \n#> 2) X2< -0.09937043 110 119.05180 0.5464982 \n#> 4) X2< -0.9419799 41 42.43138 0.3153769 \n#> 8) X1< 0.5 20 12.93725 0.1541742 *\n#> 9) X1>=0.5 21 23.29519 0.5656502 *\n#> 5) X2>=-0.9419799 69 67.76223 0.7336317 *\n#> 3) X2>=-0.09937043 135 157.14990 1.7319010 \n#> 6) X1< 0.5 79 66.30972 1.2572690 *\n#> 7) X1>=0.5 56 69.62652 3.0428230 \n#> 14) X2< 1.222057 44 40.33335 2.5072040 *\n#> 15) X2>=1.222057 12 17.95790 6.3934130 *\n#> \n#> $survfit\n#> \n#> Call: prodlim::prodlim(formula = form, data = data)\n#> Stratified Kaplan-Meier estimator for the conditional event time survival function\n#> Discrete predictor variable: rpartFactor (0.154174164904031, 0.565650228981439, 0.733631734872791, 1.25726850344687, 2.50720371146533, 6.39341334321542)\n#> \n#> Right-censored response of a survival model\n#> \n#> No.Observations: 245 \n#> \n#> Pattern:\n#> Freq\n#> event 161 \n#> right.censored 84 \n#> \n#> $levels\n#> [1] \"0.154174164904031\" \"0.565650228981439\" \"0.733631734872791\"\n#> [4] \"1.25726850344687\" \"2.50720371146533\" \"6.39341334321542\" \n#> \n#> attr(,\"class\")\n#> [1] \"pecRpart\"\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 1.26\n#> 2 2.51\n#> 3 1.26\n#> 4 1.26\n#> 5 1.26\npredict(decision_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n 
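# keep just the first test-set row, then pull out its nested prediction tibble\n 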
slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.987\n#> 2 3 0.854\n#> 3 5 0.634\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_spec <- decision_tree() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit <- decision_tree_spec |> fit(event_time ~ ., data = cns_train)\ndecision_tree_fit\n#> parsnip model object\n#> \n#> \n#> Model formula:\n#> event_time ~ X1 + X2\n#> \n#> Fitted party:\n#> [1] root\n#> | [2] X2 <= -0.36159\n#> | | [3] X1 <= 0: 13.804 (n = 41)\n#> | | [4] X1 > 0: 8.073 (n = 47)\n#> | [5] X2 > -0.36159\n#> | | [6] X1 <= 0: 6.274 (n = 89)\n#> | | [7] X1 > 0\n#> | | | [8] X2 <= 0.56098: 5.111 (n = 39)\n#> | | | [9] X2 > 0.56098: 2.713 (n = 29)\n#> \n#> Number of inner nodes: 4\n#> Number of terminal nodes: 5\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(decision_tree_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 6.27\n#> 2 5.11\n#> 3 6.27\n#> 4 6.27\n#> 5 6.27\npredict(decision_tree_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ndecision_tree_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.989\n#> 2 3 0.871\n#> 3 5 0.649\n```\n:::\n\n\n:::\n\n## Proportional Hazards (`proportional_hazards()`) \n\n:::{.panel-tabset}\n\n## `survival` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# This engine works with a single mode so no need to set that\n# and survival is the default engine so there is no need to set that either.\nproportional_hazards_spec <- proportional_hazards()\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train)\nproportional_hazards_fit\n#> parsnip model object\n#> \n#> Call:\n#> survival::coxph(formula = event_time ~ ., data = data, model = TRUE, \n#> x = TRUE)\n#> \n#> coef exp(coef) se(coef) z p\n#> X1 0.99547 2.70599 0.16799 5.926 3.11e-09\n#> X2 0.91398 2.49422 0.09566 9.555 < 2e-16\n#> \n#> Likelihood ratio test=106.8 on 2 df, p=< 2.2e-16\n#> n= 245, number of events= 161\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(proportional_hazards_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.87\n#> 2 
4.16\n#> 3 4.62\n#> 4 5.19\n#> 5 4.41\npredict(proportional_hazards_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(proportional_hazards_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -0.111\n#> 2 -1.49 \n#> 3 -1.27 \n#> 4 -1.02 \n#> 5 -1.37\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.985\n#> 2 3 0.909\n#> 3 5 0.750\n```\n:::\n\n\n## `glmnet` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_spec <- proportional_hazards(penalty = 0.01) |> \n # This engine works with a single mode so no need to set that\n set_engine(\"glmnet\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit <- proportional_hazards_spec |> fit(event_time ~ ., data = cns_train)\nproportional_hazards_fit\n#> parsnip model object\n#> \n#> \n#> Call: glmnet::glmnet(x = data_obj$x, y = data_obj$y, family = \"cox\", weights = weights, alpha = alpha, lambda = lambda) \n#> \n#> Df %Dev Lambda\n#> 1 0 0.00 0.39700\n#> 2 1 0.82 0.36170\n#> 3 1 1.51 0.32960\n#> 4 1 2.07 0.30030\n#> 5 1 2.54 0.27360\n#> 6 1 2.94 0.24930\n#> 7 2 3.28 0.22720\n#> 8 2 3.95 0.20700\n#> 9 2 4.50 0.18860\n#> 10 2 4.95 0.17180\n#> 11 2 5.33 0.15660\n#> 12 2 5.65 0.14270\n#> 13 2 5.91 0.13000\n#> 14 2 6.13 0.11840\n#> 15 2 6.31 0.10790\n#> 16 2 6.46 0.09833\n#> 17 2 6.58 0.08960\n#> 18 2 6.69 0.08164\n#> 19 2 6.77 0.07439\n#> 20 2 6.85 0.06778\n#> 21 2 6.91 0.06176\n#> 22 2 6.96 0.05627\n#> 23 2 7.00 0.05127\n#> 24 2 7.03 0.04672\n#> 25 2 7.06 0.04257\n#> 26 2 7.08 0.03879\n#> 27 2 7.10 0.03534\n#> 28 2 7.12 0.03220\n#> 29 2 7.13 0.02934\n#> 30 2 7.14 0.02673\n#> 31 2 7.15 0.02436\n#> 32 2 7.16 0.02219\n#> 33 2 7.17 0.02022\n#> 34 2 7.17 0.01843\n#> 35 2 7.18 0.01679\n#> 36 2 7.18 0.01530\n#> 37 2 7.18 0.01394\n#> 38 2 7.19 0.01270\n#> 39 2 7.19 0.01157\n#> 40 2 7.19 0.01054\n#> 41 2 7.19 0.00961\n#> 42 2 7.19 0.00875\n#> The training data has been saved for prediction.\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(proportional_hazards_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 7.80\n#> 2 4.21\n#> 3 4.63\n#> 4 5.18\n#> 5 4.42\npredict(proportional_hazards_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \npredict(proportional_hazards_fit, type = \"linear_pred\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_linear_pred\n#> \n#> 1 -0.108\n#> 2 -1.43 \n#> 3 -1.23 \n#> 4 -0.993\n#> 5 -1.33\n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nproportional_hazards_fit |> \n predict(type = \"survival\", new_data = cns_test, 
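\n # eval_time sets the time points at which survival probabilities are computed\n 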
eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.984\n#> 2 3 0.906\n#> 3 5 0.743\n```\n:::\n\n\n:::\n\n## Random Forests (`rand_forest()`) \n\n:::{.panel-tabset}\n\n## `aorsf` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"aorsf\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(2)\nrand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train)\nrand_forest_fit\n#> parsnip model object\n#> \n#> ---------- Oblique random survival forest\n#> \n#> Linear combinations: Accelerated Cox regression\n#> N observations: 245\n#> N events: 161\n#> N trees: 500\n#> N predictors total: 2\n#> N predictors per node: 2\n#> Average leaves per tree: 12.85\n#> Min observations in leaf: 5\n#> Min events in leaf: 1\n#> OOB stat value: 0.70\n#> OOB stat type: Harrell's C-index\n#> Variable importance: anova\n#> \n#> -----------------------------------------\n```\n:::\n\n\nThe holdout data can be predicted:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npredict(rand_forest_fit, type = \"time\", new_data = cns_test)\n#> # A tibble: 5 × 1\n#> .pred_time\n#> \n#> 1 5.93\n#> 2 3.85\n#> 3 4.41\n#> 4 5.43\n#> 5 4.34\npredict(rand_forest_fit, type = \"survival\", new_data = cns_test, eval_time = eval_times)\n#> # A tibble: 5 × 1\n#> .pred \n#> \n#> 1 \n#> 2 \n#> 3 \n#> 4 \n#> 5 \n```\n:::\n\n\nEach row of the survival predictions has results for each evaluation time: \n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_fit |> \n predict(type = \"survival\", new_data = cns_test, eval_time = eval_times) |> \n slice(1) |> \n pluck(\".pred\")\n#> [[1]]\n#> # A tibble: 3 × 2\n#> .eval_time .pred_survival\n#> \n#> 1 1 0.999\n#> 2 3 0.873\n#> 3 5 0.627\n```\n:::\n\n\n## `partykit` \n\nThis engine requires the censored extension package, so let's load this first:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nlibrary(censored)\n```\n:::\n\n\nWe create a model specification via:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nrand_forest_spec <- rand_forest() |>\n # We need to set the mode since this engine works with multiple modes\n set_mode(\"censored regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n\nNow we create the model fit object:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# Set the random number seed to an integer for reproducibility: \nset.seed(89)\nrand_forest_fit <- rand_forest_spec |> fit(event_time ~ ., data = cns_train)\n```\n:::\n\n\nThe print method has a lot of output: \n\n
\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\ncapture.output(print(rand_forest_fit))[1:100] |> cat(sep = \"\\n\")\n#> parsnip model object\n#> \n#> $nodes\n#> $nodes[[1]]\n#> [1] root\n#> | [2] V3 <= -0.16072\n#> | | [3] V2 <= 0\n#> | | | [4] V3 <= -1.68226 *\n#> | | | [5] V3 > -1.68226\n#> | | | | [6] V3 <= -0.65952 *\n#> | | | | [7] V3 > -0.65952 *\n#> | | [8] V2 > 0\n#> | | | [9] V3 <= -0.98243 *\n#> | | | [10] V3 > -0.98243\n#> | | | | [11] V3 <= -0.67216 *\n#> | | | | [12] V3 > -0.67216 *\n#> | [13] V3 > -0.16072\n#> | | [14] V2 <= 0\n#> | | | [15] V3 <= 0.95981\n#> | | | | [16] V3 <= 0.3117\n#> | | | | | [17] V3 <= 0.09688 *\n#> | | | | | [18] V3 > 0.09688 *\n#> | | | | [19] V3 > 0.3117\n#> | | | | | [20] V3 <= 0.40845 *\n#> | | | | | [21] V3 > 0.40845 *\n#> | | | [22] V3 > 0.95981 *\n#> | | [23] V2 > 0\n#> | | | [24] V3 <= 0.56098 *\n#> | | | [25] V3 > 0.56098 *\n#> \n#> $nodes[[2]]\n#> [1] root\n#> | [2] V3 <= -0.36618\n#> | | [3] V2 <= 0\n#> | | | [4] V3 <= -1.19881 *\n#> | | | [5] V3 > -1.19881 *\n#> | | [6] V2 > 0\n#> | | | [7] V3 <= -1.18263 *\n#> | | | [8] V3 > -1.18263\n#> | | | | [9] V3 <= -0.55449 *\n#> | | | | [10] V3 > -0.55449 *\n#> | [11] V3 > -0.36618\n#> | | [12] V2 <= 0\n#> | | | [13] V3 <= 0.3117\n#> | | | | [14] V3 <= -0.01851 *\n#> | | | | [15] V3 > -0.01851 *\n#> | | | [16] V3 > 0.3117\n#> | | | | [17] V3 <= 0.85976 *\n#> | | | | [18] V3 > 0.85976 *\n#> | | [19] V2 > 0\n#> | | | [20] V3 <= -0.04369 *\n#> | | | [21] V3 > -0.04369\n#> | | | | [22] V3 <= 0.56098 *\n#> | | | | [23] V3 > 0.56098\n#> | | | | | [24] V3 <= 1.22094 *\n#> | | | | | [25] V3 > 1.22094 *\n#> \n#> $nodes[[3]]\n#> [1] root\n#> | [2] V3 <= -0.46092\n#> | | [3] V2 <= 0\n#> | | | [4] V3 <= -1.65465 *\n#> | | | [5] V3 > -1.65465 *\n#> | | [6] V2 > 0\n#> | | | [7] V3 <= -1.36941 *\n#> | | | [8] V3 > -1.36941\n#> | | | | [9] V3 <= -0.83366 *\n#> | | | | [10] V3 > -0.83366 *\n#> | [11] V3 > -0.46092\n#> | | [12] V2 <= 0\n#> | | | [13] V3 <= -0.01851 *\n#> | | | [14] V3 > -0.01851\n#> | | | | [15] V3 <= 0.22967 *\n#> | | | | [16] V3 > 0.22967\n#> | | | | | [17] V3 <= 0.95368\n#> | | | | | | [18] V3 <= 0.68292 *\n#> | | | | | | [19] V3 > 0.68292 *\n#> | | | | | [20] V3 > 0.95368 *\n#> | | [21] V2 > 0\n#> | | | [22] V3 <= 0.15595 *\n#> | | | [23] V3 > 0.15595\n#> | | | | [24] V3 <= 0.51117 *\n#> | | | | [25] V3 > 0.51117 *\n#> \n#> $nodes[[4]]\n#> [1] root\n#> | [2] V3 <= -0.10421\n#> | | [3] V2 <= 0\n#> | | | [4] V3 <= -0.96818 *\n#> | | | [5] V3 > -0.96818\n#> | | | | [6] V3 <= -0.64682 *\n#> | | | | [7] V3 > -0.64682 *\n#> | | [8] V2 > 0\n#> | | | [9] V3 <= -0.83366 *\n#> | | | [10] V3 > -0.83366 *\n#> | [11] V3 > -0.10421\n#> | | [12] V2 <= 0\n#> | | | [13] V3 <= 0.14347 *\n#> | | | [14] V3 > 0.14347\n#> | | | | [15] V3 <= 1.20345\n```\n:::\n\n
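\nMost of that output is the trees themselves, so the print grows with the size of the ensemble. If you just want a compact print to inspect, one option is to fit a deliberately small forest via the standard parsnip `trees` argument. A sketch for inspection only (we keep using the full `rand_forest_fit` below):\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# A tiny ensemble that prints compactly; not meant for serious predictions\nsmall_forest_spec <- rand_forest(trees = 25) |>\n set_mode(\"censored regression\") |>\n set_engine(\"partykit\")\n```\n:::\n\n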
The holdout data can be predicted:

::: {.cell layout-align="center"}

```{.r .cell-code}
predict(rand_forest_fit, type = "time", new_data = cns_test)
#> # A tibble: 5 × 1
#>   .pred_time
#>        <dbl>
#> 1       5.22
#> 2       4.12
#> 3       3.87
#> 4       4.82
#> 5       3.87
predict(rand_forest_fit, type = "survival", new_data = cns_test, eval_time = eval_times)
#> # A tibble: 5 × 1
#>   .pred           
#>   <list>          
#> 1 <tibble [3 × 2]>
#> 2 <tibble [3 × 2]>
#> 3 <tibble [3 × 2]>
#> 4 <tibble [3 × 2]>
#> 5 <tibble [3 × 2]>
```
:::

Each row of the survival predictions has results for each evaluation time: 

::: {.cell layout-align="center"}

```{.r .cell-code}
rand_forest_fit |> 
  predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> 
  slice(1) |> 
  pluck(".pred")
#> [[1]]
#> # A tibble: 3 × 2
#>   .eval_time .pred_survival
#>        <dbl>          <dbl>
#> 1          1          1    
#> 2          3          0.870
#> 3          5          0.594
```
:::

:::

## Parametric Survival Models (`survival_reg()`)

:::{.panel-tabset}

## `survival`

This engine requires the censored extension package, so let's load this first:

::: {.cell layout-align="center"}

```{.r .cell-code}
library(censored)
```
:::

We create a model specification via:

::: {.cell layout-align="center"}

```{.r .cell-code}
# This engine works with a single mode so no need to set that
# and survival is the default engine so there is no need to set that either.
survival_reg_spec <- survival_reg()
```
:::

Now we create the model fit object:

::: {.cell layout-align="center"}

```{.r .cell-code}
survival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)
survival_reg_fit
#> parsnip model object
#> 
#> Call:
#> survival::survreg(formula = event_time ~ ., data = data, model = TRUE)
#> 
#> Coefficients:
#> (Intercept)          X1          X2 
#>   2.2351722  -0.4648296  -0.4222887 
#> 
#> Scale= 0.4728442 
#> 
#> Loglik(model)= -427.4   Loglik(intercept only)= -481.3
#> 	Chisq= 107.73 on 2 degrees of freedom, p= <2e-16 
#> n= 245
```
:::

The holdout data can be predicted:

::: {.cell layout-align="center"}

```{.r .cell-code}
predict(survival_reg_fit, type = "time", new_data = cns_test)
#> # A tibble: 5 × 1
#>   .pred_time
#>        <dbl>
#> 1       8.88
#> 2       4.67
#> 3       5.20
#> 4       5.83
#> 5       4.97
predict(survival_reg_fit, type = "survival", new_data = cns_test, eval_time = eval_times)
#> # A tibble: 5 × 1
#>   .pred           
#>   <list>          
#> 1 <tibble [3 × 2]>
#> 2 <tibble [3 × 2]>
#> 3 <tibble [3 × 2]>
#> 4 <tibble [3 × 2]>
#> 5 <tibble [3 × 2]>
predict(survival_reg_fit, type = "hazard", new_data = cns_test, eval_time = eval_times)
#> # A tibble: 5 × 1
#>   .pred           
#>   <list>          
#> 1 <tibble [3 × 2]>
#> 2 <tibble [3 × 2]>
#> 3 <tibble [3 × 2]>
#> 4 <tibble [3 × 2]>
#> 5 <tibble [3 × 2]>
predict(survival_reg_fit, type = "linear_pred", new_data = cns_test)
#> # A tibble: 5 × 1
#>   .pred_linear_pred
#>               <dbl>
#> 1              2.18
#> 2              1.54
#> 3              1.65
#> 4              1.76
#> 5              1.60
predict(survival_reg_fit, type = "quantile", new_data = cns_test)
#> # A tibble: 5 × 1
#>   .pred_quantile
#>        <qtls(9)>
#> 1         [7.47]
#> 2         [3.92]
#> 3         [4.37]
#> 4         [4.9] 
#> 5         [4.18]
```
:::

Each row of the survival predictions has results for each evaluation time: 

::: {.cell layout-align="center"}

```{.r .cell-code}
survival_reg_fit |> 
  predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> 
  slice(1) |> 
  pluck(".pred")
#> [[1]]
#> # A tibble: 3 × 2
#>   .eval_time .pred_survival
#>        <dbl>          <dbl>
#> 1          1          0.990
#> 2          3          0.904
#> 3          5          0.743
```
:::
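The quantile predictions shown above use a special vector class; the value printed in brackets is the prediction at the median level. Converting that column with `as_tibble()` exposes one row per quantile level, the same pattern shown for the quantile regression models later in this article (a small sketch):

::: {.cell layout-align="center"}

```{.r .cell-code}
survival_reg_fit |> 
  predict(type = "quantile", new_data = cns_test) |> 
  slice(1) |> 
  pluck(".pred_quantile") |> 
  # Expand the results for each quantile level by converting to a tibble
  as_tibble()
```
:::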
## `flexsurv`

This engine requires the censored extension package, so let's load this first:

::: {.cell layout-align="center"}

```{.r .cell-code}
library(censored)
```
:::

We create a model specification via:

::: {.cell layout-align="center"}

```{.r .cell-code}
survival_reg_spec <- survival_reg() |> 
  # This engine works with a single mode so no need to set that
  set_engine("flexsurv")
```
:::

Now we create the model fit object:

::: {.cell layout-align="center"}

```{.r .cell-code}
survival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)
survival_reg_fit
#> parsnip model object
#> 
#> Call:
#> flexsurv::flexsurvreg(formula = event_time ~ ., data = data, 
#>     dist = "weibull")
#> 
#> Estimates: 
#>        data mean  est       L95%      U95%      se        exp(est)  L95%    
#> shape        NA    2.11486   1.87774   2.38192   0.12832        NA        NA
#> scale        NA    9.34809   8.38852  10.41743   0.51658        NA        NA
#> X1      0.46939   -0.46483  -0.61347  -0.31619   0.07584   0.62824   0.54147
#> X2     -0.00874   -0.42229  -0.50641  -0.33817   0.04292   0.65554   0.60266
#>        U95%    
#> shape        NA
#> scale        NA
#> X1      0.72892
#> X2      0.71307
#> 
#> N = 245, Events: 161, Censored: 84
#> Total time at risk: 1388.951
#> Log-likelihood = -427.4387, df = 4
#> AIC = 862.8774
```
:::

The holdout data can be predicted:

::: {.cell layout-align="center"}

```{.r .cell-code}
predict(survival_reg_fit, type = "time", new_data = cns_test)
#> # A tibble: 5 × 1
#>   .pred_time
#>        <dbl>
#> 1       7.87
#> 2       4.13
#> 3       4.61
#> 4       5.16
#> 5       4.40
predict(survival_reg_fit, type = "survival", new_data = cns_test, eval_time = eval_times)
#> # A tibble: 5 × 1
#>   .pred           
#>   <list>          
#> 1 <tibble [3 × 2]>
#> 2 <tibble [3 × 2]>
#> 3 <tibble [3 × 2]>
#> 4 <tibble [3 × 2]>
#> 5 <tibble [3 × 2]>
predict(survival_reg_fit, type = "hazard", new_data = cns_test, eval_time = eval_times)
#> # A tibble: 5 × 1
#>   .pred           
#>   <list>          
#> 1 <tibble [3 × 2]>
#> 2 <tibble [3 × 2]>
#> 3 <tibble [3 × 2]>
#> 4 <tibble [3 × 2]>
#> 5 <tibble [3 × 2]>
predict(survival_reg_fit, type = "linear_pred", new_data = cns_test)
#> # A tibble: 5 × 1
#>   .pred_linear_pred
#>               <dbl>
#> 1              2.18
#> 2              1.54
#> 3              1.65
#> 4              1.76
#> 5              1.60
predict(survival_reg_fit, type = "quantile", new_data = cns_test)
#> # A tibble: 5 × 1
#>   .pred_quantile
#>        <qtls(9)>
#> 1         [7.47]
#> 2         [3.92]
#> 3         [4.37]
#> 4         [4.9] 
#> 5         [4.18]
```
:::

Each row of the survival predictions has results for each evaluation time: 

::: {.cell layout-align="center"}

```{.r .cell-code}
survival_reg_fit |> 
  predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> 
  slice(1) |> 
  pluck(".pred")
#> [[1]]
#> # A tibble: 3 × 2
#>   .eval_time .pred_survival
#>        <dbl>          <dbl>
#> 1          1          0.990
#> 2          3          0.904
#> 3          5          0.743
```
:::
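The hazard predictions are nested in the same way as the survival probabilities, with one row per evaluation time, so the same row-wise pattern applies (a sketch):

::: {.cell layout-align="center"}

```{.r .cell-code}
survival_reg_fit |> 
  predict(type = "hazard", new_data = cns_test, eval_time = eval_times) |> 
  slice(1) |> 
  pluck(".pred")
```
:::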
## `flexsurvspline`

This engine requires the censored extension package, so let's load this first:

::: {.cell layout-align="center"}

```{.r .cell-code}
library(censored)
```
:::

We create a model specification via:

::: {.cell layout-align="center"}

```{.r .cell-code}
survival_reg_spec <- survival_reg() |> 
  # This engine works with a single mode so no need to set that
  set_engine("flexsurvspline")
```
:::

Now we create the model fit object:

::: {.cell layout-align="center"}

```{.r .cell-code}
survival_reg_fit <- survival_reg_spec |> fit(event_time ~ ., data = cns_train)
survival_reg_fit
#> parsnip model object
#> 
#> Call:
#> flexsurv::flexsurvspline(formula = event_time ~ ., data = data)
#> 
#> Estimates: 
#>         data mean  est       L95%      U95%      se        exp(est)  L95%    
#> gamma0        NA   -4.72712  -5.31772  -4.13651   0.30134        NA        NA
#> gamma1        NA    2.11487   1.86338   2.36637   0.12832        NA        NA
#> X1       0.46939    0.98305   0.65928   1.30683   0.16519   2.67261   1.93340
#> X2      -0.00874    0.89308   0.70943   1.07673   0.09370   2.44265   2.03283
#>         U95%    
#> gamma0        NA
#> gamma1        NA
#> X1       3.69444
#> X2       2.93508
#> 
#> N = 245, Events: 161, Censored: 84
#> Total time at risk: 1388.951
#> Log-likelihood = -427.4387, df = 4
#> AIC = 862.8774
```
:::

The holdout data can be predicted:

::: {.cell layout-align="center"}

```{.r .cell-code}
predict(survival_reg_fit, type = "time", new_data = cns_test)
#> # A tibble: 5 × 1
#>   .pred_time
#>        <dbl>
#> 1       7.87
#> 2       4.13
#> 3       4.61
#> 4       5.16
#> 5       4.40
predict(survival_reg_fit, type = "survival", new_data = cns_test, eval_time = eval_times)
#> # A tibble: 5 × 1
#>   .pred           
#>   <list>          
#> 1 <tibble [3 × 2]>
#> 2 <tibble [3 × 2]>
#> 3 <tibble [3 × 2]>
#> 4 <tibble [3 × 2]>
#> 5 <tibble [3 × 2]>
predict(survival_reg_fit, type = "hazard", new_data = cns_test, eval_time = eval_times)
#> # A tibble: 5 × 1
#>   .pred           
#>   <list>          
#> 1 <tibble [3 × 2]>
#> 2 <tibble [3 × 2]>
#> 3 <tibble [3 × 2]>
#> 4 <tibble [3 × 2]>
#> 5 <tibble [3 × 2]>
predict(survival_reg_fit, type = "linear_pred", new_data = cns_test)
#> # A tibble: 5 × 1
#>   .pred_linear_pred
#>               <dbl>
#> 1             -4.62
#> 2             -3.26
#> 3             -3.49
#> 4             -3.73
#> 5             -3.39
predict(survival_reg_fit, type = "quantile", new_data = cns_test)
#> # A tibble: 5 × 1
#>   .pred_quantile
#>        <qtls(9)>
#> 1         [7.47]
#> 2         [3.92]
#> 3         [4.37]
#> 4         [4.9] 
#> 5         [4.18]
```
:::

Each row of the survival predictions has results for each evaluation time: 

::: {.cell layout-align="center"}

```{.r .cell-code}
survival_reg_fit |> 
  predict(type = "survival", new_data = cns_test, eval_time = eval_times) |> 
  slice(1) |> 
  pluck(".pred")
#> [[1]]
#> # A tibble: 3 × 2
#>   .eval_time .pred_survival
#>        <dbl>          <dbl>
#> 1          1          0.990
#> 2          3          0.904
#> 3          5          0.743
```
:::

:::

# Quantile Regression Models

To demonstrate quantile regression, let's make a larger version of our regression data: 

::: {.cell layout-align="center"}

```{.r .cell-code}
set.seed(938)
qnt_split <-
  modeldata::concrete |> 
  slice_sample(n = 100) |> 
  select(strength = compressive_strength, cement, age) |> 
  initial_split(prop = 0.95, strata = strength)
qnt_split
#> <Training/Testing/Total>
#> <92/8/100>

qnt_rec <- 
  recipe(strength ~ ., data = training(qnt_split)) |> 
  step_normalize(all_numeric_predictors()) |> 
  prep()

qnt_train <- bake(qnt_rec, new_data = NULL)
qnt_test <- bake(qnt_rec, new_data = testing(qnt_split))
```
:::

We'll also predict these quantile levels: 

::: {.cell layout-align="center"}

```{.r .cell-code}
qnt_lvls <- (1:3) / 4
```
:::
## Linear Regression (`linear_reg()`)

:::{.panel-tabset}

## `quantreg`

We create a model specification via:

::: {.cell layout-align="center"}

```{.r .cell-code}
linear_reg_spec <- linear_reg() |> 
  set_engine("quantreg") |> 
  set_mode("quantile regression", quantile_levels = qnt_lvls)
```
:::

Now we create the model fit object:

::: {.cell layout-align="center"}

```{.r .cell-code}
linear_reg_fit <- linear_reg_spec |> fit(strength ~ ., data = qnt_train)
linear_reg_fit
#> parsnip model object
#> 
#> Call:
#> quantreg::rq(formula = strength ~ ., tau = quantile_levels, data = data)
#> 
#> Coefficients:
#>             tau= 0.25 tau= 0.50 tau= 0.75
#> (Intercept) 23.498029 33.265428 42.046031
#> cement       6.635233  7.955658  8.181235
#> age          5.566668  9.514832  7.110702
#> 
#> Degrees of freedom: 92 total; 89 residual
```
:::

The holdout data can be predicted:

::: {.cell layout-align="center"}

```{.r .cell-code}
predict(linear_reg_fit, type = "quantile", new_data = qnt_test)
#> # A tibble: 8 × 1
#>   .pred_quantile
#>        <qtls(3)>
#> 1         [29.2]
#> 2         [31.5]
#> 3         [21.4]
#> 4         [48.3]
#> 5         [36.6]
#> 6         [33.8]
#> 7         [34.6]
#> 8         [43.8]
```
:::

Each row of predictions has a special vector class containing all of the quantile predictions: 

::: {.cell layout-align="center"}

```{.r .cell-code}
linear_reg_fit |> 
  predict(type = "quantile", new_data = qnt_test) |> 
  slice(1) |> 
  pluck(".pred_quantile") |> 
  # Expand the results for each quantile level by converting to a tibble
  as_tibble()
#> # A tibble: 3 × 3
#>   .pred_quantile .quantile_levels  .row
#>            <dbl>            <dbl> <int>
#> 1           21.5             0.25     1
#> 2           29.2             0.5      1
#> 3           39.5             0.75     1
```
:::

:::

## Random Forests (`rand_forest()`)

:::{.panel-tabset}

## `grf`

We create a model specification via:

::: {.cell layout-align="center"}

```{.r .cell-code}
rand_forest_spec <- rand_forest() |>
  set_engine("grf") |> 
  set_mode("quantile regression", quantile_levels = qnt_lvls)
```
:::

Now we create the model fit object:

::: {.cell layout-align="center"}

```{.r .cell-code}
# Set the random number seed to an integer for reproducibility: 
set.seed(435)
rand_forest_fit <- rand_forest_spec |> fit(strength ~ ., data = qnt_train)
rand_forest_fit
#> parsnip model object
#> 
#> GRF forest object of type quantile_forest 
#> Number of trees: 2000 
#> Number of training samples: 92 
#> Variable importance: 
#>     1     2 
#> 0.454 0.546
```
:::

The holdout data can be predicted:

::: {.cell layout-align="center"}

```{.r .cell-code}
predict(rand_forest_fit, type = "quantile", new_data = qnt_test)
#> # A tibble: 8 × 1
#>   .pred_quantile
#>        <qtls(3)>
#> 1         [26.4]
#> 2         [36.2]
#> 3         [26.9]
#> 4         [43.7]
#> 5         [39]  
#> 6         [35.9]
#> 7         [38.5]
#> 8         [31.8]
```
:::

Each row of predictions has a special vector class containing all of the quantile predictions: 

::: {.cell layout-align="center"}

```{.r .cell-code}
rand_forest_fit |> 
  predict(type = "quantile", new_data = qnt_test) |> 
  slice(1) |> 
  pluck(".pred_quantile") |> 
  # Expand the results for each quantile level by converting to a tibble
  as_tibble()
#> # A tibble: 3 × 3
#>   .pred_quantile .quantile_levels  .row
#>            <dbl>            <dbl> <int>
#> 1           17.2             0.25     1
#> 2           26.4             0.5      1
#> 3           39               0.75     1
```
:::

:::

diff --git a/learn/models/parsnip-predictions/index.qmd b/learn/models/parsnip-predictions/index.qmd
index 4722d177..8411ea82 100644
--- a/learn/models/parsnip-predictions/index.qmd
+++ b/learn/models/parsnip-predictions/index.qmd
@@ -25,6 +25,9 @@ format:
 #| eval: true
 source(here::here("common.R"))
 
+# Remove after parsnip merges this change
+pak::pak("tidymodels/parsnip@fix-1309")
+
 # Indicates to enable or not running Spark code
 run_spark <- TRUE
 run_h2o <- TRUE
@@ -910,7 +913,8 @@ We create a model specification via:
 #| label: spec-spark-decision-tree-classification
 #| eval: !expr 'run_spark'
 decision_tree_spec <- decision_tree() |>
-  set_mode("classification")
+  set_mode("classification") |>
+  set_engine("spark")
 ```
 
 Now we create the model fit object:
@@ -3329,7 +3333,6 @@ Now we create the model fit object:
 
 ```{r}
 #| label: fit-spark-decision-tree-regression
-#| eval: false
 decision_tree_fit <- decision_tree_spec |> fit(compressive_strength ~ ., data = tbl_reg$training)
 decision_tree_fit
 ```
@@ -3338,7 +3341,6 @@ The holdout data can be predicted:
 
 ```{r}
 #| label: predict-spark-decision-tree-regression
-#| eval: false
 predict(decision_tree_fit, new_data = tbl_reg$test)
 ```