Generate synthetic data

The idea is to use synthetic data to investigate some aspects of model flexibility and how it affects bias and variance.

library(tidyverse)
library(tidymodels)
library(gganimate)
library(transformr)
library(kableExtra)

We generate a sample with a given \(\texttt{sample\_size}\), the true \(f(X)=2X + 6X^{2} + 2X^{3}\) and \(\epsilon\sim N(0,\texttt{true\_sigma}^{2})\).

true_f = function(x){return(2*x+6*x^2+2*x^3)}
true_sigma=5
xmin=-3
xmax=2

sample_size = 200
synth_data = tibble(x = runif(sample_size,min=xmin,max=xmax),
                    fx= true_f(x),
                    noise=rnorm(sample_size,mean = 0,sd=true_sigma),
                    y=fx+noise) 

synth_plot = synth_data %>% 
  ggplot(aes(x=x,y=y))+ theme_minimal() + geom_point(alpha=.1)

synth_plot
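
To see how the observations scatter around the true regression function, \(f(x)\) can be overlaid on the scatter plot; a small optional sketch (not one of the original figures):

synth_plot +
  geom_function(fun = true_f, colour = "indianred", size = 1) +
  labs(title = "synthetic data and the true f(x)")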

Even for a given data set, a flexible method may lead to different estimates \(\hat{f}(X)\) depending on the training/testing split.

n_samples = 99
train_prop = .25
  
synth_samples = tibble(data=rerun(.n = n_samples, synth_data  %>% slice_sample(prop = train_prop)),
                       sample_id = c(paste0("sample: 0",1:9),paste0("sample: ",10:n_samples)))
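
As a quick check (a sketch, output not shown), each resampled training set should contain \(0.25 \times 200 = 50\) observations:

# number of training observations in the first resampled training set
synth_samples %>% slice(1) %>% unnest(data) %>% nrow()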

To visualize this, consider 99 random splits, with a small proportion of training observations.

sample_movie = synth_plot + 
geom_point(data=synth_samples %>% unnest(data),aes(x,y),colour="dodgerblue",alpha=.5) +
  labs(title = '{closest_state}')+
  transition_states(sample_id,state_length = 1)
  
anim_save(filename = "pop_sample_movie.gif",animation = sample_movie,path = "figures/",nframes = 200)

(Animation: figures/pop_sample_movie.gif)

Fitting a linear regression and a polynomial regression of degree 8 on a small training set results in quite variable fitted curves. To visualize that…

linear_reg_movie = synth_plot + 
  geom_point(data=synth_samples %>% unnest(data),aes(x,y),colour="dodgerblue",alpha=.5) +
  stat_smooth(data=synth_samples %>% unnest(data), aes(x,y), method = "lm", se = FALSE, color="indianred", geom='line',size=2) +
  labs(title = 'linear reg. {closest_state}',subtitle = paste0("proportion of training obs: ", train_prop))+
  transition_states(sample_id, wrap = TRUE,state_length = 1) + 
  shadow_trail(alpha = 0.35, exclude_layer = 2,color="forestgreen",size=.5)

anim_save(filename = "linear_regression_movie.gif",animation = linear_reg_movie,path = "figures/",nframes = 200)
poly8_movie = synth_plot + 
  geom_point(data=synth_samples %>% unnest(data),aes(x,y),colour="dodgerblue",alpha=.5) +
  stat_smooth(data=synth_samples %>% unnest(data), aes(x,y), method="lm",formula = y~poly(x,8) , se = FALSE, color="indianred", geom='line',size=2) +
  labs(title = 'polynomial reg. degree 8 {closest_state}',subtitle = paste0("proportion of training obs: ", train_prop))+
  transition_states(sample_id, wrap = TRUE,state_length = 1) + 
  shadow_trail(alpha = 0.35, exclude_layer = 2,color="forestgreen",size=.5)

anim_save(filename = "poly8_movie.gif",animation = poly8_movie,path = "figures/",nframes = 200)

(Animations: figures/linear_regression_movie.gif and figures/poly8_movie.gif)

By increasing the training set size, the variability of the fitted curves will decrease.
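
As a sketch of how this can be checked with the code above, the training samples can be redrawn with a larger training proportion (the value 0.75 is an illustrative choice) and the two animations rebuilt unchanged:

# redraw the 99 training samples with a larger training proportion (illustrative value)
train_prop = .75
synth_samples = tibble(data = rerun(.n = n_samples, synth_data %>% slice_sample(prop = train_prop)),
                       sample_id = c(paste0("sample: 0",1:9), paste0("sample: ",10:n_samples)))
# then re-run the linear_reg_movie and poly8_movie code above on the new synth_samples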

(Animations: the same linear and degree-8 polynomial fits, with a larger proportion of training observations.)

The bias-variance trade-off

Generated data can be used to compute the point-wise decomposition of the expected test MSE into squared bias, variance, and irreducible error.

In particular, the expected test MSE at a specific test observation \(x_{0}\) is the sum of three components:

\[E \left[\left( y_{0} - \hat{f}(x_{0})\right)^{2}\right] = var\left(\hat{f}(x_{0})\right) + Bias\left[\hat{f}(x_{0})\right]^{2}+var(\epsilon)\]
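
For completeness, the decomposition follows by adding and subtracting \(E[\hat{f}(x_{0})]\) inside the square and using the independence of \(\epsilon\) from \(\hat{f}(x_{0})\); a sketch of the derivation:

\[E \left[\left( y_{0} - \hat{f}(x_{0})\right)^{2}\right] = E \left[\left( f(x_{0}) + \epsilon - \hat{f}(x_{0})\right)^{2}\right] = \underbrace{E\left[\left(\hat{f}(x_{0})-E[\hat{f}(x_{0})]\right)^{2}\right]}_{var(\hat{f}(x_{0}))} + \underbrace{\left(E[\hat{f}(x_{0})]-f(x_{0})\right)^{2}}_{Bias^{2}} + \underbrace{var(\epsilon)}_{\text{irreducible}}\]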

Non-flexible methods have low variance, but they can have a high bias: increasing the flexibility reduces the bias, at the cost of increased variance. Now, since \(X\) is non-stochastic, fix its values \(X={\bf x}\) and generate 1000 samples of size 100 from \(Y|X={\bf x}\): since \(f({\bf x})=2{\bf x}+6{\bf x}^{2}+2{\bf x}^{3}\) is known, generating a new sample only requires adding the Gaussian noise \(\epsilon\) to \(f({\bf x})\), where \(\epsilon\sim N(0,\sigma^{2}=\texttt{true\_sigma}^{2})\).

Set the size of the samples and the number of generated samples, then create a nested tibble of samples.

set.seed(123)
sample_size=100
n_samples=1000

synth_ys = tibble(x = rep(runif(sample_size,min=0,max=2),n_samples)) %>%
  mutate(sample_id = rep(c(paste0("sample: 0",1:9),paste0("sample: ",10:n_samples)),each=sample_size),
         fx= true_f(x),
         noise=rnorm(n=n(),mean = 0,sd=true_sigma),
         y=fx+noise) %>% 
  dplyr::select(sample_id,everything()) %>%
  group_by(sample_id) %>% nest() %>% ungroup()

synth_ys %>% slice(1:4) 
## # A tibble: 4 × 2
##   sample_id  data              
##   <chr>      <list>            
## 1 sample: 01 <tibble [100 × 4]>
## 2 sample: 02 <tibble [100 × 4]>
## 3 sample: 03 <tibble [100 × 4]>
## 4 sample: 04 <tibble [100 × 4]>

To have a look at the nested tibbles, unnest \(\texttt{data}\): these are the training samples.

synth_ys %>% slice(1) %>% unnest(data) %>%
  slice(1:6) %>% dplyr::select(-sample_id) %>% 
  kbl() %>% kable_styling(bootstrap_options = "hover",full_width = FALSE)
| x | fx | noise | y |
|----------:|-----------:|-----------:|----------:|
| 0.5751550 | 3.5156564 | 1.2665926 | 4.782249 |
| 1.5766103 | 25.9053804 | -0.1427338 | 25.762647 |
| 0.8179538 | 6.7447002 | -0.2143523 | 6.530348 |
| 1.7660348 | 33.2614408 | 6.8430114 | 40.104452 |
| 1.8809346 | 38.2985309 | -1.1288549 | 37.169676 |
| 0.0911130 | 0.2335482 | 7.5823530 | 7.815901 |

Now generate a small set of test observations \(x_{0}\), then add random noise to obtain the corresponding \(y_{0}\)'s.

n_test=10
test_xs = runif(n_test,min = xmin, max = xmax)

test_obs =   tibble(sample_id = c(paste0("sample: 0",1:9),paste0("sample: ",10:n_samples))) %>% 
  mutate(test_data=rerun(.n = n_samples,tibble(x=test_xs) %>% 
                           mutate(fx=true_f(x),
                                  noise = rnorm(n=n_test,sd=true_sigma),
                                  y = noise+fx
                           )
  )
  )  %>% unnest(cols = test_data) %>% group_by(sample_id) %>%
  mutate(test_ob_id=1:n()) %>% nest() %>%
  rename(test_data=data)

The next step is to merge the test observations into the training sample data structure.

synth_ys = synth_ys %>% inner_join(test_obs,by="sample_id")

Then a linear regression is fit on each training sample, and for each fitted function \(\hat{f}\), predictions are made for the test observations.

sample_results = synth_ys %>%  mutate(
  model = map(.x = data,.f=~lm(data=.x, formula = y~x)),
  model_preds= map2(.x = model,.y=test_data,.f=~augment(.x, newdata = .y))
)
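
To inspect a single fitted model and its test-set predictions (a small sketch, output not shown):

# coefficients of the linear regression fitted on the first training sample
sample_results %>% pluck("model", 1) %>% tidy()
# corresponding predictions (.fitted) at the test observations
sample_results %>% pluck("model_preds", 1)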

It is now possible to compute all the quantities: the squared bias, the variance, the irreducible error, and the expected test MSE, computed both as a whole and as the sum of the three components.

sample_selected_quantities = sample_results %>% unnest(cols = model_preds) %>%
  dplyr::select(sample_id,x:.fitted) 

test_elements = sample_selected_quantities %>% arrange(test_ob_id) %>%  group_by(test_ob_id) %>% 
  summarise(exp_test_mse = mean((.fitted-y)^2),
            E_f_hat = mean(.fitted),
            var_f_hat=var(.fitted)*((n_samples-1)/n_samples),
            fx=mean(fx),
            var_eps=var(y),
            squared_bias = (E_f_hat-fx)^2,
            decomp_test_mse = squared_bias+var_f_hat+ var_eps,
            diff_mse=round(exp_test_mse - decomp_test_mse,digits=3)
            ) %>% dplyr::select(squared_bias,var_f_hat,var_eps, exp_test_mse, decomp_test_mse, diff_mse) 
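
The resulting point-wise quantities can be displayed with kableExtra, as done above for the training data (a sketch, output not shown):

test_elements %>% 
  kbl(digits = 3) %>% 
  kable_styling(bootstrap_options = "hover", full_width = FALSE)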

Let’s now increase the model flexibility by fitting polynomial regressions of increasing degree.

tune_synth_ys = synth_ys %>% mutate(degree=map(.x=sample_id,~1:10)) %>% unnest(cols = degree) 

tuned_results = tune_synth_ys %>%  mutate(
  model = map2(.x = data,.y=degree,.f=~lm(data=.x, formula = y~poly(x,.y))),
  model_preds= map2(.x = model,.y=test_data,.f=~augment(.x, newdata = .y))
)

test_mse_by_degree = tuned_results %>% unnest(cols = model_preds) %>% filter(test_ob_id==1)  %>%  
  group_by(degree)  %>% 
summarise(var_f_hat=var(.fitted)*((n_samples-1)/n_samples),
          E_f_hat = mean(.fitted),
          fx=mean(fx),
          squared_bias = ((E_f_hat-fx)^2),
          exp_test_mse = (squared_bias+var_f_hat+true_sigma^2)
          ) %>% dplyr::select(degree,exp_test_mse,var_f_hat,squared_bias) %>% 
  rename(variance=var_f_hat,`test MSE`=exp_test_mse,`squared bias`=squared_bias)


test_mse_by_degree %>% 
  kbl() %>% kable_styling(bootstrap_options = "hover")
| degree | test MSE | variance | squared bias |
|-------:|---------:|---------:|-------------:|
| 1 | 29.22676 | 0.3931515 | 3.8336074 |
| 2 | 25.57643 | 0.4405779 | 0.1358484 |
| 3 | 25.79809 | 0.7970971 | 0.0009881 |
| 4 | 25.87612 | 0.8758269 | 0.0002892 |
| 5 | 26.01096 | 1.0109401 | 0.0000196 |
| 6 | 26.31530 | 1.3153046 | 0.0000003 |
| 7 | 26.34686 | 1.3468448 | 0.0000185 |
| 8 | 26.54657 | 1.5465444 | 0.0000279 |
| 9 | 26.93490 | 1.9346380 | 0.0002576 |
| 10 | 26.93110 | 1.9308807 | 0.0002162 |

test_mse_by_degree %>% 
  pivot_longer(names_to="source",values_to="error", cols = 2:4) %>% 
  ggplot(aes(x=degree,y=error,group=source,color=source))+
  geom_smooth(se=FALSE)+
  # geom_line()+
  geom_hline(yintercept=(true_sigma^2)) + # horizontal line at the irreducible error var(eps)
  labs(title="test MSE decomposition")