class: center, middle, inverse, title-slide # Linear Regression (part 2) ## with tidymodels ### Statistical Learning ### Alfonso Iodice D'Enza --- class: animated fadeIn ### Interactions ```r library(scatterplot3d) adv_fit_update = adv_wflow %>% update_formula(sales~.-newspaper) %>% fit(data=adv_data) data3d=adv_fit_update %>% extract_fit_engine() %>% augment(data=adv_data) %>% mutate(und_ov_est = ifelse(.std.resid<0,"indianred","blue")) %>% select(TV, radio, sales, und_ov_est) plot_3d = scatterplot3d(x=data3d$TV,y=data3d$radio,z=data3d$sales,xlab="TV",ylab="radio",zlab="sales",color=data3d$und_ov_est,pch=16,box=FALSE,grid=TRUE) plot_3d$plane3d(adv_fit_update %>% extract_fit_engine(),draw_polygon=TRUE,polygon_args = list(col = rgb(.1,.6,.4,.25))) ``` <img src="Linear-Regression-part_2_files/figure-html/unnamed-chunk-2-1.png" width="45%" style="display: block; margin: auto;" /> --- class: animated fadeIn center middle .center[<h2 style="color:orange"> intermission: more stuff you know of </h2>] --- class: animated fadeIn center middle inverse ### Goodness-of-fit ### Global test statistic F ### testing blocks of predictors ### autocorrelation ### heteroscedasticity ### outliers and high leverage ### multicollinearity --- class: animated fadeIn center middle .center[<h2 style="color:orange"> end of intermission </h2>] --- class: animated fadeIn center middle inverse ## estimate the model performance (on the test set) ### resampling methods --- class: animated fadeIn ### the credit data set **want**: predict balance as a response ```r library(janitor) data(Credit,package = "ISLR2") credit=Credit %>% clean_names() credit %>% slice_sample(n=12) %>% kbl() %>% kable_styling(font_size = 10) ``` <table class="table" style="font-size: 10px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;"> income </th> <th style="text-align:right;"> limit </th> <th style="text-align:right;"> rating </th> <th style="text-align:right;"> cards </th> <th 
style="text-align:right;"> age </th> <th style="text-align:right;"> education </th> <th style="text-align:left;"> own </th> <th style="text-align:left;"> student </th> <th style="text-align:left;"> married </th> <th style="text-align:left;"> region </th> <th style="text-align:right;"> balance </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 21.786 </td> <td style="text-align:right;"> 4632 </td> <td style="text-align:right;"> 355 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 50 </td> <td style="text-align:right;"> 17 </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> Yes </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 580 </td> </tr> <tr> <td style="text-align:right;"> 57.337 </td> <td style="text-align:right;"> 5310 </td> <td style="text-align:right;"> 392 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:right;"> 45 </td> <td style="text-align:right;"> 7 </td> <td style="text-align:left;"> Yes </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 456 </td> </tr> <tr> <td style="text-align:right;"> 20.918 </td> <td style="text-align:right;"> 1233 </td> <td style="text-align:right;"> 128 </td> <td style="text-align:right;"> 3 </td> <td style="text-align:right;"> 47 </td> <td style="text-align:right;"> 18 </td> <td style="text-align:left;"> Yes </td> <td style="text-align:left;"> Yes </td> <td style="text-align:left;"> Yes </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 16 </td> </tr> <tr> <td style="text-align:right;"> 62.413 </td> <td style="text-align:right;"> 6457 </td> <td style="text-align:right;"> 455 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:right;"> 71 </td> <td style="text-align:right;"> 11 </td> <td style="text-align:left;"> Yes </td> <td 
style="text-align:left;"> No </td> <td style="text-align:left;"> Yes </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 762 </td> </tr> <tr> <td style="text-align:right;"> 42.529 </td> <td style="text-align:right;"> 4986 </td> <td style="text-align:right;"> 369 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:right;"> 37 </td> <td style="text-align:right;"> 11 </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> Yes </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 489 </td> </tr> <tr> <td style="text-align:right;"> 76.782 </td> <td style="text-align:right;"> 5977 </td> <td style="text-align:right;"> 429 </td> <td style="text-align:right;"> 4 </td> <td style="text-align:right;"> 44 </td> <td style="text-align:right;"> 12 </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> Yes </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 548 </td> </tr> <tr> <td style="text-align:right;"> 29.725 </td> <td style="text-align:right;"> 3536 </td> <td style="text-align:right;"> 270 </td> <td style="text-align:right;"> 2 </td> <td style="text-align:right;"> 52 </td> <td style="text-align:right;"> 15 </td> <td style="text-align:left;"> Yes </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 133 </td> </tr> <tr> <td style="text-align:right;"> 148.924 </td> <td style="text-align:right;"> 9504 </td> <td style="text-align:right;"> 681 </td> <td style="text-align:right;"> 3 </td> <td style="text-align:right;"> 36 </td> <td style="text-align:right;"> 11 </td> <td style="text-align:left;"> Yes </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 964 </td> </tr> <tr> 
<td style="text-align:right;"> 58.165 </td> <td style="text-align:right;"> 6617 </td> <td style="text-align:right;"> 460 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 56 </td> <td style="text-align:right;"> 12 </td> <td style="text-align:left;"> Yes </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> Yes </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 856 </td> </tr> <tr> <td style="text-align:right;"> 15.629 </td> <td style="text-align:right;"> 2493 </td> <td style="text-align:right;"> 186 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 60 </td> <td style="text-align:right;"> 14 </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> Yes </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 0 </td> </tr> <tr> <td style="text-align:right;"> 62.328 </td> <td style="text-align:right;"> 5228 </td> <td style="text-align:right;"> 377 </td> <td style="text-align:right;"> 3 </td> <td style="text-align:right;"> 83 </td> <td style="text-align:right;"> 15 </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 380 </td> </tr> <tr> <td style="text-align:right;"> 41.365 </td> <td style="text-align:right;"> 5303 </td> <td style="text-align:right;"> 377 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> 45 </td> <td style="text-align:right;"> 14 </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> No </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 606 </td> </tr> </tbody> </table> --- class: animated fadeIn ### a (random) smaller version of the credit data set ```r set.seed(1234) credit_small = credit %>% 
select(balance,sample(names(credit)[-11],3)) credit_small %>% slice_sample(n=12) %>% kbl() %>% kable_styling(font_size = 10) ``` <table class="table" style="font-size: 10px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;"> balance </th> <th style="text-align:left;"> region </th> <th style="text-align:right;"> education </th> <th style="text-align:right;"> age </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 47 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 17 </td> <td style="text-align:right;"> 57 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 10 </td> <td style="text-align:right;"> 24 </td> </tr> <tr> <td style="text-align:right;"> 912 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 13 </td> <td style="text-align:right;"> 49 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 9 </td> <td style="text-align:right;"> 62 </td> </tr> <tr> <td style="text-align:right;"> 155 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 17 </td> <td style="text-align:right;"> 74 </td> </tr> <tr> <td style="text-align:right;"> 1587 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 16 </td> <td style="text-align:right;"> 56 </td> </tr> <tr> <td style="text-align:right;"> 637 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 19 </td> <td style="text-align:right;"> 44 </td> </tr> <tr> <td style="text-align:right;"> 1176 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 9 </td> <td style="text-align:right;"> 52 </td> </tr> <tr> <td style="text-align:right;"> 732 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 12 </td> <td style="text-align:right;"> 66 </td> </tr> <tr> <td 
style="text-align:right;"> 391 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 11 </td> <td style="text-align:right;"> 45 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 15 </td> <td style="text-align:right;"> 75 </td> </tr> <tr> <td style="text-align:right;"> 772 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 18 </td> <td style="text-align:right;"> 48 </td> </tr> </tbody> </table> --- class: animated fadeIn ### validation approaches .my-pull-left[ **train/test** ] .my-pull-right[ >Randomly select a proportion (e.g. **.75**) of observations for **training**, the rest is for **testing** ] -- .my-pull-left[ **validation set** ] .my-pull-right[ >Select a proportion (e.g. **.8**) of observations to **train** the model, the rest is for **validation** ] -- .my-pull-left[ **cross-validation** ] .my-pull-right[ > - **Leave-one-out** cross-validation: all observations but one are for training, the one left is for validation. The procedure is iterated on the observations, until each observation is used for validation. > > - **K-fold** cross-validation: K-1 folds are for training, the K$^{th}$ fold is for validation. The procedure is iterated until each fold is used for validation. 
> - the CV estimate of the evaluation metric is the average over the ** `\(n\)` ** (or, ** K **) iterations ] --- class: animated fadeIn ### Validation-set to assess model performance .my-pull-left[ **train/test split** ] .my-pull-right[ ```r first_split = initial_split(data = credit_small,prop=.75,strata = balance) cred_tr = training(first_split) cred_test = testing(first_split) ``` ] -- .my-pull-left[ **analysis/ validate split** ] .my-pull-right[ ```r val_split = validation_split(data = cred_tr, prop=.8,strata = balance) ``` ] -- ** split ** and ** resample ** object: print just minimal info .pull-left[ ```r first_split ``` ``` ## <Analysis/Assess/Total> ## <299/101/400> ``` ] .pull-right[ ```r val_split ``` ``` ## # Validation Set Split (0.8/0.2) using stratification ## # A tibble: 1 × 2 ## splits id ## <list> <chr> ## 1 <split [239/60]> validation ``` ] --- class: animated fadeIn ### Validation-set to assess model performance ** split objects **: they are lists .pull-left[ ```r glimpse(first_split) ``` ``` ## List of 4 ## $ data :'data.frame': 400 obs. of 4 variables: ## ..$ balance : num [1:400] 333 903 580 964 331 ... ## ..$ region : Factor w/ 3 levels "East","South",..: 2 3 3 3 2 2 1 3 2 1 ... ## ..$ education: num [1:400] 11 15 11 11 16 10 12 9 13 19 ... ## ..$ age : num [1:400] 34 82 71 36 68 77 37 87 66 41 ... ## $ in_id : int [1:299] 12 16 17 23 25 32 34 35 41 49 ... ## $ out_id: logi NA ## $ id : tibble [1 × 1] (S3: tbl_df/tbl/data.frame) ## ..$ id: chr "Resample1" ## - attr(*, "class")= chr [1:2] "mc_split" "rsplit" ``` ] .pull-right[ ```r str(val_split) ``` ``` ## validation_split [1 × 2] (S3: validation_split/rset/tbl_df/tbl/data.frame) ## $ splits:List of 1 ## ..$ :List of 4 ## .. ..$ data :'data.frame': 299 obs. of 4 variables: ## .. .. ..$ balance : num [1:299] 0 0 0 0 0 0 0 0 50 0 ... ## .. .. ..$ region : Factor w/ 3 levels "East","South",..: 2 1 1 1 2 3 2 3 1 3 ... ## .. .. ..$ education: num [1:299] 16 15 17 10 15 16 10 14 14 15 ... ## .. .. 
..$ age : num [1:299] 64 57 73 61 57 43 30 25 54 72 ... ## .. ..$ in_id : int [1:239] 3 4 5 8 10 11 12 13 14 15 ... ## .. ..$ out_id: logi NA ## .. ..$ id : tibble [1 × 1] (S3: tbl_df/tbl/data.frame) ## .. .. ..$ id: chr "validation" ## .. ..- attr(*, "class")= chr [1:2] "val_split" "rsplit" ## $ id : chr "validation" ## - attr(*, "prop")= num 0.8 ## - attr(*, "strata")= logi TRUE ## - attr(*, "fingerprint")= chr "95463bd733510d7274291db8f6792543" ``` ] --- class: animated fadeIn ### Validation-set to assess model performance Note: ** `\(\texttt{training()}/\texttt{testing()}\)` ** and ** `\(\texttt{analysis()}/\texttt{assessment()}\)` ** are equivalent helper functions to extract the data from the split object... .pull-left[ ```r training(first_split) %>% slice(1:6) %>% kbl() %>% kable_styling(font_size = 8) ``` <table class="table" style="font-size: 8px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;"> balance </th> <th style="text-align:left;"> region </th> <th style="text-align:right;"> education </th> <th style="text-align:right;"> age </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 16 </td> <td style="text-align:right;"> 64 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 15 </td> <td style="text-align:right;"> 57 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 17 </td> <td style="text-align:right;"> 73 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 10 </td> <td style="text-align:right;"> 61 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 15 </td> <td style="text-align:right;"> 57 </td> </tr> <tr> <td 
style="text-align:right;"> 0 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 16 </td> <td style="text-align:right;"> 43 </td> </tr> </tbody> </table> ] .pull-right[ ```r first_split$data[first_split$in_id,] %>% slice(1:6) %>% kbl() %>% kable_styling(font_size = 8) ``` <table class="table" style="font-size: 8px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;"> balance </th> <th style="text-align:left;"> region </th> <th style="text-align:right;"> education </th> <th style="text-align:right;"> age </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 16 </td> <td style="text-align:right;"> 64 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 15 </td> <td style="text-align:right;"> 57 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 17 </td> <td style="text-align:right;"> 73 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 10 </td> <td style="text-align:right;"> 61 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 15 </td> <td style="text-align:right;"> 57 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 16 </td> <td style="text-align:right;"> 43 </td> </tr> </tbody> </table> ] .pull-left[ ```r testing(first_split) %>% slice(1:6) %>% kbl() %>% kable_styling(font_size = 8) ``` <table class="table" style="font-size: 8px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;"> balance </th> <th style="text-align:left;"> region </th> <th style="text-align:right;"> education </th> <th 
style="text-align:right;"> age </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 333 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 11 </td> <td style="text-align:right;"> 34 </td> </tr> <tr> <td style="text-align:right;"> 903 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 15 </td> <td style="text-align:right;"> 82 </td> </tr> <tr> <td style="text-align:right;"> 1151 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 10 </td> <td style="text-align:right;"> 77 </td> </tr> <tr> <td style="text-align:right;"> 204 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 7 </td> <td style="text-align:right;"> 57 </td> </tr> <tr> <td style="text-align:right;"> 1081 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 9 </td> <td style="text-align:right;"> 49 </td> </tr> <tr> <td style="text-align:right;"> 891 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 9 </td> <td style="text-align:right;"> 28 </td> </tr> </tbody> </table> ] .pull-right[ ```r first_split$data[-first_split$in_id,] %>% slice(1:6) %>% kbl() %>% kable_styling(font_size = 8) ``` <table class="table" style="font-size: 8px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;"> balance </th> <th style="text-align:left;"> region </th> <th style="text-align:right;"> education </th> <th style="text-align:right;"> age </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 333 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 11 </td> <td style="text-align:right;"> 34 </td> </tr> <tr> <td style="text-align:right;"> 903 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 15 </td> <td style="text-align:right;"> 82 </td> </tr> <tr> <td style="text-align:right;"> 1151 </td> <td style="text-align:left;"> South </td> <td 
style="text-align:right;"> 10 </td> <td style="text-align:right;"> 77 </td> </tr> <tr> <td style="text-align:right;"> 204 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 7 </td> <td style="text-align:right;"> 57 </td> </tr> <tr> <td style="text-align:right;"> 1081 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 9 </td> <td style="text-align:right;"> 49 </td> </tr> <tr> <td style="text-align:right;"> 891 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 9 </td> <td style="text-align:right;"> 28 </td> </tr> </tbody> </table> ] --- class: animated fadeIn ### Validation-set to assess model performance Note: ** `\(\texttt{training()}/\texttt{testing()}\)` ** and ** `\(\texttt{analysis()}/\texttt{assessment()}\)` ** are equivalent helper functions to extract the data from the split object... .pull-left[ ```r analysis(first_split) %>% slice(1:6) %>% kbl() %>% kable_styling(font_size = 8) ``` <table class="table" style="font-size: 8px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;"> balance </th> <th style="text-align:left;"> region </th> <th style="text-align:right;"> education </th> <th style="text-align:right;"> age </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 16 </td> <td style="text-align:right;"> 64 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 15 </td> <td style="text-align:right;"> 57 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 17 </td> <td style="text-align:right;"> 73 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 10 </td> <td style="text-align:right;"> 61 </td> </tr> <tr> <td 
style="text-align:right;"> 0 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 15 </td> <td style="text-align:right;"> 57 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 16 </td> <td style="text-align:right;"> 43 </td> </tr> </tbody> </table> ] .pull-right[ ```r first_split$data[first_split$in_id,] %>% slice(1:6) %>% kbl() %>% kable_styling(font_size = 8) ``` <table class="table" style="font-size: 8px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;"> balance </th> <th style="text-align:left;"> region </th> <th style="text-align:right;"> education </th> <th style="text-align:right;"> age </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 16 </td> <td style="text-align:right;"> 64 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 15 </td> <td style="text-align:right;"> 57 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 17 </td> <td style="text-align:right;"> 73 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> East </td> <td style="text-align:right;"> 10 </td> <td style="text-align:right;"> 61 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 15 </td> <td style="text-align:right;"> 57 </td> </tr> <tr> <td style="text-align:right;"> 0 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 16 </td> <td style="text-align:right;"> 43 </td> </tr> </tbody> </table> ] .pull-left[ ```r assessment(first_split) %>% slice(1:6) %>% kbl() %>% kable_styling(font_size = 8) ``` <table class="table" style="font-size: 8px; margin-left: auto; margin-right: 
auto;"> <thead> <tr> <th style="text-align:right;"> balance </th> <th style="text-align:left;"> region </th> <th style="text-align:right;"> education </th> <th style="text-align:right;"> age </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 333 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 11 </td> <td style="text-align:right;"> 34 </td> </tr> <tr> <td style="text-align:right;"> 903 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 15 </td> <td style="text-align:right;"> 82 </td> </tr> <tr> <td style="text-align:right;"> 1151 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 10 </td> <td style="text-align:right;"> 77 </td> </tr> <tr> <td style="text-align:right;"> 204 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 7 </td> <td style="text-align:right;"> 57 </td> </tr> <tr> <td style="text-align:right;"> 1081 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 9 </td> <td style="text-align:right;"> 49 </td> </tr> <tr> <td style="text-align:right;"> 891 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 9 </td> <td style="text-align:right;"> 28 </td> </tr> </tbody> </table> ] .pull-right[ ```r first_split$data[-first_split$in_id,] %>% slice(1:6) %>% kbl() %>% kable_styling(font_size = 8) ``` <table class="table" style="font-size: 8px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;"> balance </th> <th style="text-align:left;"> region </th> <th style="text-align:right;"> education </th> <th style="text-align:right;"> age </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 333 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 11 </td> <td style="text-align:right;"> 34 </td> </tr> <tr> <td style="text-align:right;"> 903 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 15 </td> <td 
style="text-align:right;"> 82 </td> </tr> <tr> <td style="text-align:right;"> 1151 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 10 </td> <td style="text-align:right;"> 77 </td> </tr> <tr> <td style="text-align:right;"> 204 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 7 </td> <td style="text-align:right;"> 57 </td> </tr> <tr> <td style="text-align:right;"> 1081 </td> <td style="text-align:left;"> South </td> <td style="text-align:right;"> 9 </td> <td style="text-align:right;"> 49 </td> </tr> <tr> <td style="text-align:right;"> 891 </td> <td style="text-align:left;"> West </td> <td style="text-align:right;"> 9 </td> <td style="text-align:right;"> 28 </td> </tr> </tbody> </table> ] --- class: animated fadeIn ### Validation-set to assess model performance ** pre-processing ** ```r cred_an = analysis(val_split %>% pull(splits) %>% .[[1]]) cred_prep = recipe(balance~., data = cred_an) ``` -- ** model-spec ** ```r cred_mod = linear_reg(mode="regression", engine="lm") ``` -- ** workflow setup ** ```r cred_wflow = workflow() %>% add_recipe(cred_prep) %>% add_model(cred_mod) ``` --- class: animated fadeIn ### Validation-set to assess model performance ** model fit ** ```r cred_fit = cred_wflow %>% fit(cred_an) ``` -- ** assessment set predictions ** ```r cred_as = assessment(val_split %>% pull(splits) %>% .[[1]]) cred_preds = cred_fit %>% augment(cred_as) %>% select(balance,.pred) cred_preds %>% slice_sample(n=3) %>% kbl() %>% kable_styling(font_size=9) ``` <table class="table" style="font-size: 9px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;"> balance </th> <th style="text-align:right;"> .pred </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 429 </td> <td style="text-align:right;"> 547.6457 </td> </tr> <tr> <td style="text-align:right;"> 843 </td> <td style="text-align:right;"> 512.9616 </td> </tr> <tr> <td style="text-align:right;"> 1120 </td> <td 
style="text-align:right;"> 501.6879 </td> </tr> </tbody> </table> -- ** assessment metric: RMSE ** ```r cred_preds %>% rmse(truth=balance, estimate=.pred) %>% kbl() %>% kable_styling(font_size=9) ``` <table class="table" style="font-size: 9px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:left;"> .metric </th> <th style="text-align:left;"> .estimator </th> <th style="text-align:right;"> .estimate </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> rmse </td> <td style="text-align:left;"> standard </td> <td style="text-align:right;"> 455.6267 </td> </tr> </tbody> </table> --- class: animated fadeIn ### Validation-set to assess model performance: a shortcut ** RMSE computation ** ```r *cred_wflow %>% * fit_resamples(val_split) %>% collect_metrics() ``` ``` ## # Resampling results ## # Validation Set Split (0.8/0.2) using stratification ## # A tibble: 1 × 4 ## splits id .metrics .notes ## <list> <chr> <list> <list> ## 1 <split [239/60]> validation <tibble [2 × 4]> <tibble [0 × 1]> ``` --- class: animated fadeIn ### Validation-set to assess model performance: shortcut ** RMSE computation ** ```r *cred_wflow %>% * fit_resamples(val_split) %>% * collect_metrics() ``` <table class="table" style="font-size: 9px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:left;"> .metric </th> <th style="text-align:left;"> .estimator </th> <th style="text-align:right;"> mean </th> <th style="text-align:right;"> n </th> <th style="text-align:right;"> std_err </th> <th style="text-align:left;"> .config </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> rmse </td> <td style="text-align:left;"> standard </td> <td style="text-align:right;"> 455.6267 </td> <td style="text-align:right;"> 1 </td> <td style="text-align:right;"> NA </td> <td style="text-align:left;"> Preprocessor1_Model1 </td> </tr> </tbody> </table> --- class: animated fadeIn ### v-fold cross validation to assess model performance ** data split: 
folds ** ```r cred_folds = vfold_cv(cred_tr,v=5,strata=balance) ``` -- ** pre-processing ** ```r cred_prep = recipe(balance~., data = cred_tr) ``` -- ** model-spec ** ```r cred_mod = linear_reg(mode="regression", engine="lm") ``` -- ** workflow setup ** ```r cred_wflow = workflow() %>% add_recipe(cred_prep) %>% add_model(cred_mod) ``` --- class: animated fadeIn ### v-fold cross validation to assess model performance ** model fit ** ```r cred_fit_vfolds = cred_wflow %>% fit_resamples(cred_folds) ``` ``` ## # Resampling results ## # 5-fold cross-validation using stratification ## # A tibble: 5 × 4 ## splits id .metrics .notes ## <list> <chr> <list> <list> ## 1 <split [239/60]> Fold1 <tibble [2 × 4]> <tibble [0 × 1]> ## 2 <split [239/60]> Fold2 <tibble [2 × 4]> <tibble [0 × 1]> ## 3 <split [239/60]> Fold3 <tibble [2 × 4]> <tibble [0 × 1]> ## 4 <split [239/60]> Fold4 <tibble [2 × 4]> <tibble [0 × 1]> ## 5 <split [240/59]> Fold5 <tibble [2 × 4]> <tibble [0 × 1]> ``` -- ** cross-validated RMSE ** ```r cred_fit_vfolds %>% collect_metrics() %>% filter(.metric == "rmse") %>% kbl() %>% kable_styling(font_size = 9) ``` <table class="table" style="font-size: 9px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:left;"> .metric </th> <th style="text-align:left;"> .estimator </th> <th style="text-align:right;"> mean </th> <th style="text-align:right;"> n </th> <th style="text-align:right;"> std_err </th> <th style="text-align:left;"> .config </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> rmse </td> <td style="text-align:left;"> standard </td> <td style="text-align:right;"> 456.6134 </td> <td style="text-align:right;"> 5 </td> <td style="text-align:right;"> 11.20367 </td> <td style="text-align:left;"> Preprocessor1_Model1 </td> </tr> </tbody> </table> --- class: animated fadeIn ### Leave One Out cross validation to assess model performance Note: Leave one out cross-validation is somewhat deprecated, and while there is a specific 
function ** `\(\texttt{loo_cv}\)` ** in ** `\(\texttt{rsample}\)` **, its output cannot be used directly for model fitting. ** LOO-CV splits ** ```r cred_loo = loo_cv(cred_tr) ``` -- The function is not supported by ** `\(\texttt{fit_resamples}\)` **: a workaround is to still use ** `\(\texttt{vfold_cv}\)` ** and set as many folds as there are rows in the training set. ```r cred_loo = vfold_cv(cred_tr,v=nrow(cred_tr)) ``` --- class: animated fadeIn ### Leave One Out cross validation to assess model performance ** model fit ** ```r cred_fit_loo = cred_wflow %>% fit_resamples(cred_loo) ``` -- ** cross-validated RMSE ** ```r cred_fit_loo %>% collect_metrics() %>% filter(.metric == "rmse") %>% kbl() %>% kable_styling(font_size = 9) ``` <table class="table" style="font-size: 9px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:left;"> .metric </th> <th style="text-align:left;"> .estimator </th> <th style="text-align:right;"> mean </th> <th style="text-align:right;"> n </th> <th style="text-align:right;"> std_err </th> <th style="text-align:left;"> .config </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> rmse </td> <td style="text-align:left;"> standard </td> <td style="text-align:right;"> 389.9147 </td> <td style="text-align:right;"> 299 </td> <td style="text-align:right;"> 14.094 </td> <td style="text-align:left;"> Preprocessor1_Model1 </td> </tr> </tbody> </table> --- class: animated fadeIn center middle inverse ### Hyperparameter tuning --- class: animated fadeIn ### tiny example on polynomial regression: balance vs rating Consider a tiny example of a tuning process: ```r credit_tiny = credit %>% select(rating, balance) credit_split = initial_split(credit_tiny,prop=3/4) credit_tiny %>% mutate(train_test = replace(rep("test", n()),credit_split$in_id,"train") ) %>% ggplot(aes(x = rating,y = balance,color=train_test)) + geom_point() + theme_minimal() ``` <img src="Linear-Regression-part_2_files/figure-html/unnamed-chunk-39-1.png" width="40%" style="display: block; margin: auto;" 
/> --- class: animated fadeIn ### tiny example on polynomial regression: balance vs rating - define the CV-folds on the training set ```r tiny_cred_folds = vfold_cv(training(credit_split),5) ``` -- - the degree of the polynomial is set at the recipe level. Since it is a hyperparameter, we set it to ** `\(\texttt{tune()}\)` **, a placeholder ```r tiny_rec = recipe(formula = balance~rating, data = training(credit_split)) %>% step_poly(rating, degree=tune()) ``` -- The model specification does not change, so ** `\(\texttt{cred_mod}\)` ** can still be used. The workflow is then ```r tiny_wflow = workflow() %>% add_recipe(tiny_rec) %>% add_model(cred_mod) ``` -- What remains is to specify the grid of hyperparameter values and call the function ** `\(\texttt{tune_grid()}\)` **, which does all the work ```r hparm_grid = tibble(degree=1:10) tiny_cred_tuning = tiny_wflow %>% tune_grid(resamples = tiny_cred_folds, grid = hparm_grid, control = control_grid(save_pred = TRUE) ) ``` --- class: animated fadeIn ### tiny example on polynomial regression: balance vs rating Check the results ```r tiny_cred_tuning %>% collect_metrics() %>% filter(.metric=="rmse") %>% slice_min(mean) %>% kbl() %>% kable_styling(font_size = 10) ``` <table class="table" style="font-size: 10px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:right;"> degree </th> <th style="text-align:left;"> .metric </th> <th style="text-align:left;"> .estimator </th> <th style="text-align:right;"> mean </th> <th style="text-align:right;"> n </th> <th style="text-align:right;"> std_err </th> <th style="text-align:left;"> .config </th> </tr> </thead> <tbody> <tr> <td style="text-align:right;"> 7 </td> <td style="text-align:left;"> rmse </td> <td style="text-align:left;"> standard </td> <td style="text-align:right;"> 215.0154 </td> <td style="text-align:right;"> 5 </td> <td style="text-align:right;"> 8.148388 </td> <td style="text-align:left;"> Preprocessor07_Model1 </td> </tr> </tbody> </table> ```r 
autoplot(tiny_cred_tuning,metric = "rmse")+theme_minimal() ``` <img src="Linear-Regression-part_2_files/figure-html/unnamed-chunk-44-1.png" width="25%" style="display: block; margin: auto;" /> --- class: animated fadeIn ### tiny example on polynomial regression: balance vs rating Select the best-performing model ```r tiny_cred_final_mod = tiny_cred_tuning %>% select_best("rmse") ``` Finalize the workflow (meaning: tell the workflow to pick the best model) ```r final_wflow=tiny_wflow %>% finalize_workflow(tiny_cred_final_mod) ``` Final fit and evaluation of the model: fit on the training set, evaluate on the test set. Use ** `\(\texttt{credit_split}\)` **, which is the result of the first split (** `\(\texttt{initial_split()}\)` **) ```r tiny_final_fit = final_wflow %>% last_fit(credit_split) tiny_final_fit %>% extract_fit_parsnip() ``` ``` ## parsnip model object ## ## Fit time: 2ms ## ## Call: ## stats::lm(formula = ..y ~ ., data = data) ## ## Coefficients: ## (Intercept) rating_poly_1 rating_poly_2 rating_poly_3 rating_poly_4 ## 537.0 6923.0 -794.7 -115.0 1181.9 ## rating_poly_5 rating_poly_6 rating_poly_7 ## -724.6 -415.9 401.8 ``` ```r tiny_final_fit %>% collect_metrics("rmse") ``` ``` ## # A tibble: 2 × 4 ## .metric .estimator .estimate .config ## <chr> <chr> <dbl> <chr> ## 1 rmse standard 229. 
Preprocessor1_Model1 ## 2 rsq standard 0.732 Preprocessor1_Model1 ``` --- class: animated fadeIn ### tiny example on polynomial regression: balance vs rating ```r tiny_final_fit %>% extract_fit_parsnip() ``` ``` ## parsnip model object ## ## Fit time: 2ms ## ## Call: ## stats::lm(formula = ..y ~ ., data = data) ## ## Coefficients: ## (Intercept) rating_poly_1 rating_poly_2 rating_poly_3 rating_poly_4 ## 537.0 6923.0 -794.7 -115.0 1181.9 ## rating_poly_5 rating_poly_6 rating_poly_7 ## -724.6 -415.9 401.8 ``` ```r tiny_final_fit %>% collect_metrics("rmse") %>% kbl() %>% kable_styling(font_size = 8) ``` <table class="table" style="font-size: 8px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:left;"> .metric </th> <th style="text-align:left;"> .estimator </th> <th style="text-align:right;"> .estimate </th> <th style="text-align:left;"> .config </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> rmse </td> <td style="text-align:left;"> standard </td> <td style="text-align:right;"> 229.2952944 </td> <td style="text-align:left;"> Preprocessor1_Model1 </td> </tr> <tr> <td style="text-align:left;"> rsq </td> <td style="text-align:left;"> standard </td> <td style="text-align:right;"> 0.7324706 </td> <td style="text-align:left;"> Preprocessor1_Model1 </td> </tr> </tbody> </table> --- class: animated fadeIn ### tiny example on polynomial regression: balance vs rating ```r tiny_final_fit %>% extract_fit_parsnip() ``` ``` ## parsnip model object ## ## Fit time: 2ms ## ## Call: ## stats::lm(formula = ..y ~ ., data = data) ## ## Coefficients: ## (Intercept) rating_poly_1 rating_poly_2 rating_poly_3 rating_poly_4 ## 537.0 6923.0 -794.7 -115.0 1181.9 ## rating_poly_5 rating_poly_6 rating_poly_7 ## -724.6 -415.9 401.8 ``` ```r tiny_final_fit %>% collect_metrics("rmse") %>% kbl() %>% kable_styling(font_size = 8) ``` <table class="table" style="font-size: 8px; margin-left: auto; margin-right: auto;"> <thead> <tr> <th style="text-align:left;"> 
.metric </th> <th style="text-align:left;"> .estimator </th> <th style="text-align:right;"> .estimate </th> <th style="text-align:left;"> .config </th> </tr> </thead> <tbody> <tr> <td style="text-align:left;"> rmse </td> <td style="text-align:left;"> standard </td> <td style="text-align:right;"> 229.2952944 </td> <td style="text-align:left;"> Preprocessor1_Model1 </td> </tr> <tr> <td style="text-align:left;"> rsq </td> <td style="text-align:left;"> standard </td> <td style="text-align:right;"> 0.7324706 </td> <td style="text-align:left;"> Preprocessor1_Model1 </td> </tr> </tbody> </table>