Downloading packages and data
library(nberwp) #to clean and arrange data
library(tidyverse)
library(ggthemes) #for cool graphs
library(tidyr)
library(skimr)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(stats)
library(ggthemes)
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(RColorBrewer)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tidymodels)
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.4 ──
## ✓ broom 0.7.12 ✓ rsample 0.1.1
## ✓ dials 0.0.10 ✓ tune 0.1.6
## ✓ infer 1.0.0 ✓ workflows 0.2.4
## ✓ modeldata 0.1.1 ✓ workflowsets 0.1.0
## ✓ parsnip 0.1.7 ✓ yardstick 0.0.8
## ✓ recipes 0.1.17
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(rpart)
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
##
## prune
library(rpart.plot)
library(parallel)
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-3
# Data from the Tidy Tuesday Github
<- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-06-02/marbles.csv') marbles
## Rows: 256 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): date, race, site, source, marble_name, team_name, pole, host, notes
## dbl (5): time_s, points, track_length_m, number_laps, avg_time_lap
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Now for a little bit of data cleaning
::skim(marbles) skimr
Name | marbles |
Number of rows | 256 |
Number of columns | 14 |
_______________________ | |
Column type frequency: | |
character | 9 |
numeric | 5 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
date | 0 | 1.00 | 8 | 9 | 0 | 16 | 0 |
race | 0 | 1.00 | 4 | 4 | 0 | 16 | 0 |
site | 0 | 1.00 | 7 | 15 | 0 | 8 | 0 |
source | 0 | 1.00 | 34 | 34 | 0 | 16 | 0 |
marble_name | 0 | 1.00 | 4 | 9 | 0 | 32 | 0 |
team_name | 0 | 1.00 | 6 | 16 | 0 | 16 | 0 |
pole | 128 | 0.50 | 2 | 3 | 0 | 16 | 0 |
host | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
notes | 249 | 0.03 | 37 | 100 | 0 | 7 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
time_s | 3 | 0.99 | 190.84 | 169.13 | 17.76 | 28.40 | 36.28 | 338.16 | 492.01 | ▇▁▁▇▁ |
points | 128 | 0.50 | 6.45 | 7.74 | 0.00 | 0.00 | 3.00 | 11.25 | 26.00 | ▇▂▂▁▁ |
track_length_m | 0 | 1.00 | 13.22 | 0.95 | 11.90 | 12.62 | 13.02 | 14.13 | 14.55 | ▅▅▂▁▇ |
number_laps | 0 | 1.00 | 6.25 | 5.53 | 1.00 | 1.00 | 5.00 | 10.25 | 16.00 | ▇▁▃▂▂ |
avg_time_lap | 3 | 0.99 | 29.70 | 5.55 | 17.76 | 25.94 | 30.05 | 33.65 | 41.62 | ▃▆▇▇▂ |
str(marbles)
## spec_tbl_df [256 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ date : chr [1:256] "15-Feb-20" "15-Feb-20" "15-Feb-20" "15-Feb-20" ...
## $ race : chr [1:256] "S1Q1" "S1Q1" "S1Q1" "S1Q1" ...
## $ site : chr [1:256] "Savage Speedway" "Savage Speedway" "Savage Speedway" "Savage Speedway" ...
## $ source : chr [1:256] "https://youtu.be/JtsQ_UydjEI?t=356" "https://youtu.be/JtsQ_UydjEI?t=356" "https://youtu.be/JtsQ_UydjEI?t=356" "https://youtu.be/JtsQ_UydjEI?t=356" ...
## $ marble_name : chr [1:256] "Clementin" "Starry" "Momo" "Yellow" ...
## $ team_name : chr [1:256] "O'rangers" "Team Galactic" "Team Momo" "Mellow Yellow" ...
## $ time_s : num [1:256] 28.1 28.4 28.4 28.7 28.7 ...
## $ pole : chr [1:256] "P1" "P2" "P3" "P4" ...
## $ points : num [1:256] NA NA NA NA NA NA NA NA NA NA ...
## $ track_length_m: num [1:256] 12.8 12.8 12.8 12.8 12.8 ...
## $ number_laps : num [1:256] 1 1 1 1 1 1 1 1 1 1 ...
## $ avg_time_lap : num [1:256] 28.1 28.4 28.4 28.7 28.7 ...
## $ host : chr [1:256] "No" "No" "No" "No" ...
## $ notes : chr [1:256] NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. date = col_character(),
## .. race = col_character(),
## .. site = col_character(),
## .. source = col_character(),
## .. marble_name = col_character(),
## .. team_name = col_character(),
## .. time_s = col_double(),
## .. pole = col_character(),
## .. points = col_double(),
## .. track_length_m = col_double(),
## .. number_laps = col_double(),
## .. avg_time_lap = col_double(),
## .. host = col_character(),
## .. notes = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# now to fix this funky date format
<- marbles %>% separate(date, sep="-", into = c("day", "month", "year"))
marbles $month <-match(marbles$month, month.abb)
marbles$date <-with(marbles,paste(month, day, year, sep="/"),"%m/%d/%y")
marbles$date <-as.Date(marbles$date, format="%m/%d/%y")
marbles
# Getting rid of unwanted variables
<- subset(marbles, select = -c(source, host, notes, day, month, year))
marbles<- marbles[-c(64, 128, 192), ] # deleting row 192 because that is the only time a marble did not compete for a reason
marbles # I want to compare speed across races, so I will create a second per meter variable
$mps <- (marbles$track_length_m * marbles$number_laps)/ marbles$time_s
marbles
# ranking the place by time
= marbles %>% group_by(race) %>% mutate(rank = rank(time_s))
marbles
# creating a season summary table
<- marbles %>%
season mutate_if(is.numeric, funs(ifelse(is.na(.), 0, .))) %>%
group_by(team_name, marble_name) %>%
summarise(PR = min(mps), PW = max(mps), SPM = mean(mps), Rank = mean(rank), Points = sum(points))
## `mutate_if()` ignored the following grouping variables:
## Column `race`
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## `summarise()` has grouped output by 'team_name'. You can override using the `.groups` argument.
# model dataset
<- subset(marbles, select = -c(1:4, 6, 7, 11))
model str(model) # rank should be a factor
## tibble [253 × 6] (S3: tbl_df/tbl/data.frame)
## $ time_s : num [1:253] 28.1 28.4 28.4 28.7 28.7 ...
## $ track_length_m: num [1:253] 12.8 12.8 12.8 12.8 12.8 ...
## $ number_laps : num [1:253] 1 1 1 1 1 1 1 1 1 1 ...
## $ avg_time_lap : num [1:253] 28.1 28.4 28.4 28.7 28.7 ...
## $ mps : num [1:253] 0.456 0.452 0.451 0.446 0.446 ...
## $ rank : num [1:253] 1 2 3 4 5 6 7 8 9 10 ...
$rank <- as.factor(model$rank)
modelhist(model$mps,col='red') # ok distribution looks good
First lets look at the overall performance by score over time
# Marble stats by team
<- table1::table1(~mps + rank + points + avg_time_lap | team_name, data = marbles) Table1
## Warning in .table1.internal(x = x, labels = labels, groupspan = groupspan, :
## Table has 17 columns. Are you sure this is what you want?
Table1
Balls of Chaos (N=16) |
Green Ducks (N=16) |
Hazers (N=16) |
Hornets (N=16) |
Limers (N=15) |
Mellow Yellow (N=16) |
Midnight Wisps (N=15) |
O'rangers (N=16) |
Raspberry Racers (N=16) |
Rojo Rollers (N=16) |
Savage Speeders (N=16) |
Snowballs (N=16) |
Team Galactic (N=16) |
Team Momo (N=16) |
Team Primary (N=15) |
Thunderbolts (N=16) |
Overall (N=253) |
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
mps | |||||||||||||||||
Mean (SD) | 0.451 (0.0691) | 0.458 (0.0657) | 0.465 (0.0722) | 0.451 (0.0651) | 0.452 (0.0704) | 0.463 (0.0739) | 0.463 (0.0644) | 0.461 (0.0671) | 0.454 (0.0600) | 0.452 (0.0715) | 0.463 (0.0781) | 0.457 (0.0673) | 0.455 (0.0654) | 0.457 (0.0718) | 0.455 (0.0845) | 0.457 (0.0685) | 0.457 (0.0679) |
Median [Min, Max] | 0.432 [0.356, 0.616] | 0.439 [0.357, 0.614] | 0.448 [0.357, 0.650] | 0.439 [0.353, 0.631] | 0.424 [0.359, 0.617] | 0.442 [0.361, 0.664] | 0.449 [0.358, 0.616] | 0.444 [0.364, 0.613] | 0.447 [0.357, 0.590] | 0.428 [0.359, 0.646] | 0.438 [0.351, 0.667] | 0.438 [0.354, 0.614] | 0.444 [0.357, 0.607] | 0.436 [0.358, 0.657] | 0.442 [0.350, 0.670] | 0.429 [0.362, 0.621] | 0.438 [0.350, 0.670] |
rank | |||||||||||||||||
Mean (SD) | 9.84 (5.10) | 7.63 (3.65) | 5.41 (3.84) | 11.3 (3.07) | 10.0 (4.76) | 7.13 (3.93) | 7.90 (4.02) | 7.38 (5.08) | 9.25 (3.71) | 9.63 (3.20) | 5.88 (4.66) | 8.63 (4.84) | 9.19 (4.89) | 7.75 (4.92) | 9.00 (6.34) | 8.81 (4.20) | 8.41 (4.57) |
Median [Min, Max] | 10.8 [1.00, 15.0] | 8.00 [1.00, 13.0] | 4.50 [1.00, 15.0] | 11.5 [7.00, 16.0] | 10.0 [3.50, 16.0] | 7.50 [1.00, 15.0] | 7.00 [1.00, 14.0] | 7.00 [1.00, 15.0] | 8.50 [3.00, 16.0] | 9.50 [5.00, 15.0] | 5.50 [1.00, 16.0] | 9.50 [1.00, 15.0] | 10.0 [1.00, 15.0] | 6.50 [1.00, 16.0] | 7.00 [1.00, 16.0] | 9.50 [2.00, 15.0] | 8.00 [1.00, 16.0] |
points | |||||||||||||||||
Mean (SD) | 5.75 (8.96) | 8.00 (9.68) | 11.8 (8.41) | 1.00 (2.14) | 3.57 (5.77) | 5.50 (5.66) | 4.86 (6.34) | 8.63 (9.23) | 3.38 (3.07) | 4.00 (4.11) | 12.6 (7.96) | 8.25 (9.48) | 8.00 (10.7) | 6.13 (8.39) | 7.71 (7.52) | 6.13 (8.36) | 6.61 (7.77) |
Median [Min, Max] | 0.500 [0, 25.0] | 3.00 [0, 26.0] | 14.0 [0, 25.0] | 0 [0, 6.00] | 0 [0, 12.0] | 4.00 [0, 15.0] | 1.00 [0, 15.0] | 6.00 [0, 25.0] | 3.00 [0, 8.00] | 2.50 [0, 10.0] | 13.0 [0, 25.0] | 5.00 [0, 25.0] | 4.00 [0, 25.0] | 4.00 [0, 25.0] | 12.0 [0, 18.0] | 1.00 [0, 19.0] | 4.00 [0, 26.0] |
Missing | 8 (50.0%) | 8 (50.0%) | 8 (50.0%) | 8 (50.0%) | 8 (53.3%) | 8 (50.0%) | 8 (53.3%) | 8 (50.0%) | 8 (50.0%) | 8 (50.0%) | 8 (50.0%) | 8 (50.0%) | 8 (50.0%) | 8 (50.0%) | 8 (53.3%) | 8 (50.0%) | 128 (50.6%) |
avg_time_lap | |||||||||||||||||
Mean (SD) | 30.1 (5.77) | 29.6 (5.55) | 29.2 (5.63) | 30.0 (5.54) | 30.2 (5.61) | 29.4 (5.66) | 29.1 (5.33) | 29.4 (5.58) | 29.7 (5.45) | 30.0 (5.73) | 29.4 (5.98) | 29.7 (5.74) | 29.8 (5.69) | 29.7 (5.62) | 30.2 (6.81) | 29.7 (5.58) | 29.7 (5.55) |
Median [Min, Max] | 31.0 [19.3, 40.9] | 29.6 [19.4, 40.8] | 29.7 [18.3, 40.8] | 30.0 [18.9, 41.2] | 30.8 [19.3, 40.5] | 29.8 [17.9, 40.3] | 30.0 [19.3, 40.6] | 29.6 [19.4, 40.0] | 30.3 [20.2, 40.8] | 30.1 [18.4, 40.6] | 30.0 [17.9, 41.4] | 29.8 [19.4, 41.1] | 30.3 [19.6, 40.7] | 30.0 [18.1, 40.6] | 29.8 [17.8, 41.6] | 30.2 [19.2, 40.2] | 30.1 [17.8, 41.6] |
$marble_name = with(marbles, reorder(marble_name, rank))
marbles<- marbles %>%
P1 ggplot(aes(x = marble_name, y = rank)) +
geom_bar(stat="identity") +
facet_wrap(~date, scales = 'free') +
ggtitle("Marble Rank by Race") +
coord_flip() +
scale_color_ptol("") +
theme_minimal()
print(P1)
$marble_name = with(season, reorder(marble_name, Points))
season<- season %>%
P2 ggplot(aes(x = marble_name, y = Points, fill=marble_name)) +
geom_bar(stat="identity", alpha=.6, width=.4) +
coord_flip() +
xlab("") +
ggtitle("Season total Marble Points") +
scale_color_ptol("") +
theme_minimal()
P2
$marble_name = with(marbles, reorder(marble_name, mps))
marbles<- marbles %>%
P3 ggplot( aes(x=marble_name, y=mps, fill=marble_name)) +
geom_violin() +
xlab("") +
ggtitle("Season MPS distribution") +
theme(legend.position="none") +
scale_color_ptol("") +
theme_minimal()
P3
<- season %>% ggplot(aes(y = marble_name, x = PR, xend =PW )) +
P4geom_dumbbell(size = 1.0,
size_x = 3,
size_xend = 3,
colour = "grey",
colour_x = "blue",
colour_xend = "red") +
theme_minimal() +
scale_color_ptol("") +
labs(title = "Personal Best and Worst MPS Times",
x = "Meters Per Second (MPS)", y = "Team Name")
P4
Regrettably we must inform you that this years 3rd place winner, Smoggy has been accused of lead weighting - a tactic that was outlawed in the 2018 Marbula One tournament. After chemical analysis, Smoggy and the Hazers team will be disqualified from this years leader board. Lets take a look at the Hazer Team Stats.
<- subset(marbles, team_name == "Hazers")
hazers <- ggplot() +
P5 geom_line(data = hazers, aes(x = date, y = avg_time_lap, color="Smoggy")) +
geom_line(data = marbles, aes(x = date, y = avg_time_lap, color="Group Average"), linetype=3) +
theme_minimal() +
scale_color_ptol("") +
labs(title = "Plot 1: Avg Time- Court Evidence for Smoggy v State",
x = " ", y = " ")
P5
<- ggplot() +
P6 geom_line(data = hazers, aes(x = date, y = mps, color="Smoggy")) +
geom_line(data = marbles, aes(x = date, y = mps, color="Group Average"), linetype=3) +
theme_minimal() +
scale_color_ptol("") +
labs(title = "Plot 2: MPS- Court Evidence for Smoggy v State",
x = " ", y = " ")
P6
We are going to try out 2 different models 1) Generalized linear model 2) LASSO model 3) Ridge Regression
First lets create a testing and training set ### Data splitting and CV folds Here we are going to split the data randomly into training and testing subsets - Training data will be used to fit the model. - Testing set will be used to evaluate the model.
# Setting a seed for random number generation so if this analysis is reproduced, the same random set will be generated
set.seed(123)
# Subsetting 70% of data into training and 20% of data into testing
# We using Body Temp to stratify
<- initial_split(model, prop = .7, strata = "mps") # dont want all the winners in one category
data_split # Creating training data
<- training(data_split)
train_data # Creating testing data
<- testing(data_split)
test_data
# 5-fold cross-validation, 5x repeated
# Creating a resample object for our traiinng data
set.seed(123)
<- vfold_cv(train_data, v = 5, repeats = 5, strata = "mps") # mps is my continuous outcome folds
Setting up lr.mod that will be used for the rest of the models
<- linear_reg() %>%
lr.mod set_engine("lm") %>%
set_mode("regression")
#Setting up the linear model
<- recipe(mps ~ ., data = train_data) %>%
D.rec step_dummy(all_nominal()) %>%
step_scale(mps) #scaling mps
# Create workflow
<- workflow() %>%
D.wflow add_model(lr.mod) %>%
add_recipe(D.rec)
# Fit model to training data
<-
D.fit %>%
D.wflow fit(data = train_data)
# evaluate
%>%
D.fit extract_fit_parsnip() %>%
tidy()
## # A tibble: 22 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 8.60 0.353 24.4 1.04e-54
## 2 time_s 0.00667 0.000651 10.3 4.02e-19
## 3 track_length_m 0.457 0.0364 12.5 2.72e-25
## 4 number_laps -0.200 0.0187 -10.7 2.20e-20
## 5 avg_time_lap -0.268 0.00780 -34.4 4.26e-74
## 6 rank_X2 0.00420 0.0869 0.0484 9.61e- 1
## 7 rank_X3 0.0162 0.0832 0.195 8.46e- 1
## 8 rank_X3.5 -0.0201 0.153 -0.131 8.96e- 1
## 9 rank_X4 -0.0229 0.0875 -0.262 7.94e- 1
## 10 rank_X5 0.00788 0.0872 0.0903 9.28e- 1
## # … with 12 more rows
# Create null formula
<- recipe(mps ~ 1., data = train_data)
N.rec
# set workflow
<-
N.train.wflow workflow() %>%
add_model(lr.mod) %>%
add_recipe(N.rec)
# fitting
<-
N.train.fit %>%
N.train.wflow fit(data = train_data)
# usual
%>%
N.train.fit extract_fit_parsnip() %>%
tidy()
## # A tibble: 1 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.457 0.00514 88.9 1.57e-147
# RMSE
predict(N.train.fit, train_data)
## # A tibble: 176 × 1
## .pred
## <dbl>
## 1 0.457
## 2 0.457
## 3 0.457
## 4 0.457
## 5 0.457
## 6 0.457
## 7 0.457
## 8 0.457
## 9 0.457
## 10 0.457
## # … with 166 more rows
<- augment(N.train.fit, train_data)
N.train.aug %>% select(mps, .pred) N.train.aug
## # A tibble: 176 × 2
## mps .pred
## <dbl> <dbl>
## 1 0.411 0.457
## 2 0.395 0.457
## 3 0.394 0.457
## 4 0.393 0.457
## 5 0.391 0.457
## 6 0.390 0.457
## 7 0.387 0.457
## 8 0.387 0.457
## 9 0.383 0.457
## 10 0.382 0.457
## # … with 166 more rows
%>% rmse(truth = mps, .pred) N.train.aug
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.0680
# RMSE = 0.06801068
# fitting
predict(N.train.fit, test_data)
## # A tibble: 77 × 1
## .pred
## <dbl>
## 1 0.457
## 2 0.457
## 3 0.457
## 4 0.457
## 5 0.457
## 6 0.457
## 7 0.457
## 8 0.457
## 9 0.457
## 10 0.457
## # … with 67 more rows
<- augment(N.train.fit, test_data)
N.test.aug %>% select(mps, .pred) N.test.aug
## # A tibble: 77 × 2
## mps .pred
## <dbl> <dbl>
## 1 0.456 0.457
## 2 0.452 0.457
## 3 0.437 0.457
## 4 0.434 0.457
## 5 0.412 0.457
## 6 0.395 0.457
## 7 0.383 0.457
## 8 0.380 0.457
## 9 0.377 0.457
## 10 0.468 0.457
## # … with 67 more rows
%>% #taking the root-mean square error of the model
N.test.aug rmse(truth = mps, .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.0672
# RMSE = 0.06721659
## Tuning hyperparameters
<-
tune_spec decision_tree(cost_complexity = tune(),
tree_depth = tune()) %>%
set_engine("rpart") %>%
set_mode("regression")
# We will come back to these parameters tune_spec
## Decision Tree Model Specification (regression)
##
## Main Arguments:
## cost_complexity = tune()
## tree_depth = tune()
##
## Computational engine: rpart
# setting workflow
<- workflow() %>%
tree.wflow add_model(tune_spec) %>%
add_recipe(D.rec)
# Create a grid
<- parallel::detectCores()
cores cores
## [1] 4
= 4
ncores <- makePSOCKcluster(ncores)
cl registerDoParallel(cl)
<- grid_regular(cost_complexity(),
tree_grid tree_depth(),
levels = 5)
# tuning
<- tree.wflow %>%
tree_res tune_grid(resamples = folds, grid = tree_grid)
%>% collect_metrics() tree_res
## # A tibble: 50 × 8
## cost_complexity tree_depth .metric .estimator mean n std_err .config
## <dbl> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 1 rmse standard 0.614 25 0.0136 Preprocess…
## 2 0.0000000001 1 rsq standard 0.632 25 0.0106 Preprocess…
## 3 0.0000000178 1 rmse standard 0.614 25 0.0136 Preprocess…
## 4 0.0000000178 1 rsq standard 0.632 25 0.0106 Preprocess…
## 5 0.00000316 1 rmse standard 0.614 25 0.0136 Preprocess…
## 6 0.00000316 1 rsq standard 0.632 25 0.0106 Preprocess…
## 7 0.000562 1 rmse standard 0.614 25 0.0136 Preprocess…
## 8 0.000562 1 rsq standard 0.632 25 0.0106 Preprocess…
## 9 0.1 1 rmse standard 0.614 25 0.0136 Preprocess…
## 10 0.1 1 rsq standard 0.632 25 0.0106 Preprocess…
## # … with 40 more rows
# turn off parallel cluster
stopCluster(cl)
%>%
tree_res collect_metrics() %>%
mutate(tree_depth = factor(tree_depth)) %>%
ggplot(aes(cost_complexity, mean, color = tree_depth)) +
geom_line(size = 1.5, alpha = 0.6) +
geom_point(size = 2) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10(labels = scales::label_number()) +
scale_color_viridis_d(option = "plasma", begin = .9, end = 0)
# Lets check out the top 5
%>% show_best("rmse") tree_res
## # A tibble: 5 × 8
## cost_complexity tree_depth .metric .estimator mean n std_err .config
## <dbl> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 8 rmse standard 0.259 25 0.00917 Preprocesso…
## 2 0.0000000178 8 rmse standard 0.259 25 0.00917 Preprocesso…
## 3 0.00000316 8 rmse standard 0.259 25 0.00917 Preprocesso…
## 4 0.0000000001 11 rmse standard 0.259 25 0.00917 Preprocesso…
## 5 0.0000000178 11 rmse standard 0.259 25 0.00917 Preprocesso…
# Now to pull out the best set of hyperparameter values for our decision tree model
<- tree_res %>% select_best("rmse")
best_tree # finalize workflow
<- tree.wflow %>% finalize_workflow(best_tree)
final_wf # final fit
<- final_wf %>% fit(data = train_data)
final_fit final_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_dummy()
## • step_scale()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## n= 176
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 176 175.00000000 6.697913
## 2) avg_time_lap>=26.11 130 36.05376000 6.228446
## 4) avg_time_lap>=32.42 58 7.47878700 5.793831
## 8) avg_time_lap>=37.035 11 0.10836970 5.250606 *
## 9) avg_time_lap< 37.035 47 3.36467900 5.920968
## 18) track_length_m< 14.215 26 0.44794260 5.715494
## 36) time_s>=325.15 18 0.06109299 5.655044 *
## 37) time_s< 325.15 8 0.17307930 5.851506 *
## 19) track_length_m>=14.215 21 0.45994480 6.175366
## 38) avg_time_lap>=34.415 7 0.05028384 5.986830 *
## 39) avg_time_lap< 34.415 14 0.03643271 6.269633 *
## 5) avg_time_lap< 32.42 72 8.79396900 6.578552
## 10) time_s>=29.925 51 2.45283700 6.422064
## 20) track_length_m>=12.43 39 1.62091200 6.353963
## 40) track_length_m< 13.625 30 0.23420380 6.264969
## 80) avg_time_lap>=30.68 9 0.05438351 6.189007 *
## 81) avg_time_lap< 30.68 21 0.10563120 6.297524
## 162) track_length_m< 13.02 14 0.03521699 6.259880 *
## 163) track_length_m>=13.02 7 0.01089504 6.372814 *
## 41) track_length_m>=13.625 9 0.35711570 6.650610 *
## 21) track_length_m< 12.43 12 0.06322134 6.643392 *
## 11) time_s< 29.925 21 2.05913500 6.958595
## 22) track_length_m< 13.02 7 0.37816590 6.622303 *
## 23) track_length_m>=13.02 14 0.49349810 7.126741 *
## 3) avg_time_lap< 26.11 46 29.32136000 8.024669
## 6) time_s>=21.5 29 1.54729000 7.486443
## 12) time_s>=24.36 22 0.17383850 7.379369
## 24) time_s>=25.865 14 0.06531379 7.343423 *
## 25) time_s< 25.865 8 0.05877888 7.442274 *
## 13) time_s< 24.36 7 0.32851020 7.822961 *
## 7) time_s< 21.5 17 5.04212400 8.942820 *
%>% show_best("rmse", n = 1) tree_res
## # A tibble: 1 × 8
## cost_complexity tree_depth .metric .estimator mean n std_err .config
## <dbl> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 8 rmse standard 0.259 25 0.00917 Preprocesso…
RMSE = 0.2588305
rpart.plot(extract_fit_parsnip(final_fit)$fit)
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call rpart.plot with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.
# set workflow
<- linear_reg(mode = "regression", penalty = tune(), mixture = 1) %>%
lasso.mod set_engine("glmnet")
<- workflow() %>%
lasso.wflow add_model(lasso.mod) %>%
add_recipe(D.rec)
<- parallel::detectCores()
cores cores
## [1] 4
= 4
ncores <- makePSOCKcluster(ncores)
cl registerDoParallel(cl)
# creating grid and tuning
<- tibble(penalty = 10^seq(-3, 0, length.out = 30))
lr_reg_grid # tuning on training data
<- lasso.wflow %>%
lasso.res tune_grid(resamples = folds,
grid = lr_reg_grid,
control = control_grid(save_pred = TRUE),
metrics = metric_set(rmse))
%>% autoplot() lasso.res
# turn off parallel cluster
stopCluster(cl)
<- lasso.res %>%
lasso.top.models select_best("rmse")
%>% show_best(n=1) lasso.res
## # A tibble: 1 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.001 rmse standard 0.203 25 0.00600 Preprocessor1_Model01
# finalize workflow with the best model
<- lasso.wflow %>%
best.lasso.wflow finalize_workflow(lasso.top.models)
# fitting best performing model
<- best.lasso.wflow %>%
best.lasso.fit fit(data = train_data)
RMSE = 0.207073
This code is borrowed from Dr. Handel
<- best.lasso.fit$fit$fit$fit
x plot(x, "lambda")
Let us figure out what type of model we should watch. Here are the RMSE of all: - Null: RMSE = 0.06721659 # Maybe because I did not normalize the outcome like I did in other models? - Tree Model: RMSE = 0.2588305 - LASSO: RMSE = 0.207073
Looks like tree model
<- makePSOCKcluster(ncores)
cl registerDoParallel(cl)
# use test data
<- best.lasso.wflow %>% last_fit(data_split) # using test data
final.mod %>% collect_metrics() final.mod
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.213 Preprocessor1_Model1
## 2 rsq standard 0.956 Preprocessor1_Model1
# turn off parallel cluster
stopCluster(cl)
RMSE = 0.213288, different from null, but similar to LASSO ## Visualization
# residuals
<- final.mod %>%
final.res augment() %>%
select(.pred, mps) %>%
mutate(.resid = mps - .pred)
# training vs truth
ggplot(final.res, aes(x = mps, y = .pred)) +
geom_point()
# residuals vs trth
ggplot(final.res, aes(y = .resid, x = .pred)) +
geom_point()
#compare to null model