Distances experiment

knitr::opts_chunk$set(echo = TRUE,message = FALSE,warning = FALSE)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.1     ✔ rsample      1.1.0
## ✔ dials        1.0.0     ✔ tune         1.0.1
## ✔ infer        1.0.3     ✔ workflows    1.1.0
## ✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
## ✔ parsnip      1.0.2     ✔ yardstick    1.1.0
## ✔ recipes      1.0.1     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/

library(fastDummies)
library(aricode)
library(corrplot)

## corrplot 0.92 loaded

library(ggh4x)
library(magick)

## Linking to ImageMagick 6.9.12.3
## Enabled features: cairo, fontconfig, freetype, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fftw, ghostscript, x11

library(pdftools)

## Using poppler version 22.02.0

library(clustrd)

## Loading required package: grid
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(cluster)
library(Matrix)

## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

library(purrr)
library(philentropy)
library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose

library(tidytext)
source("R/cat_custom_delta.R")
source("R/z_preproc.R")
source("R/select_data.R")
source("R/catdist.R")
source("R/cat_delta.R")
source("R/predict_pam.R")
source("R/prepare_data.R")
source("R/cat_nearest_neighbors.R")
set.seed(123)

Benchmark datasets and distances

We consider different datasets and distances

selected_distances = c("tot_var_dist", "gifi_chi2",
         "supervised","supervised_full", "matching","eskin",
         "goodall_3","goodall_4","iof","of","lin",
         "var_entropy","var_mutability","kullback-leibler")

dataset_names = c("vote","australian","wbcd", "tictac","balance","tae",
                        "lympho","soybeanlarge","cars")

benchmark_data = tibble(datasets=dataset_names) %>%
  mutate(prepped_data = map(.x=datasets,~select_data(.x)),
         prepped_data = map(.x=prepped_data, ~cbind(.x$df,response=.x$y)),
                  distance_method = rerun(selected_distances,.n=n())
  )

Resamples setup

For each dataset and distance considered, we do a KNN, tuning the parameter on the grid 1, 3, 5, 9, 15, 21 via 5-fold cross-validation. We repeat the process 10 times.

benchmark_data_resamples = benchmark_data %>% 
  mutate(
    cv_iterations = rerun(1:10, .n = n())
  ) %>% unnest(cv_iterations) %>% 
  mutate(
    cross_validation = map(.x=prepped_data,~vfold_cv(.x, v= 5, strata=response)),
    fold_id = map(.x=cross_validation,~.x$id),
    fold_splits = map(.x=cross_validation,~.x$splits),
    k_par = rerun(c(1,3,5,9,15,21),.n=n()),
  )%>% unnest(cols = distance_method)%>% unnest(cols = k_par) %>% unnest(cols = c(fold_id,fold_splits)) %>% 
  select(-cross_validation)

Supervised experiment: KNN with tuning

Apply the KNN on all fold/distance/dataset combinations

benchmark_results_training = benchmark_data_resamples %>% 
  mutate(knn_results = pmap(.l = list(fold_splits, k_par,distance_method),
                            .f = ~cat_nearest_neighbors(train_df = analysis(..1),
                                                        assess_df = assessment(..1),
                                                        k=..2, method=..3)
  )
  )

Collection of the cross-validation estimate of accuracy

accuracy_results = benchmark_results_training %>%
  mutate(
    same_levs = map_lgl(.x = knn_results,~(length(levels(.x$.pred))==length(levels(.x$truth)))),
    classes  = map_dbl(.x=prepped_data,~length(levels(.x$response))),
    accuracy= map_dbl(.x = knn_results,
                      .f = function(x=.x){
                        x$.pred = factor(x$.pred, levels = levels(x$truth))
                        return(accuracy(x,truth=x$truth,estimate=x$.pred) %>% pull(.estimate)
                        )
                      }
    )
  ) %>% group_by(datasets,distance_method,k_par,cv_iterations) %>%
  summarise(cv_accuracy=mean(accuracy)) %>% ungroup() %>%
  group_by(datasets,distance_method,k_par) %>% 
  summarise(sd_cv_accuracy=sd(cv_accuracy),
            cv_accuracy=mean(cv_accuracy)
            ) %>% ungroup() %>%
  group_by(datasets,distance_method)

Select the best K for each distance method and data set

accuracy_results_tuned = accuracy_results %>%
  slice_max(cv_accuracy,with_ties = FALSE)%>%
  mutate(
    distance_method = str_to_title(distance_method),
    datasets=as.factor(datasets)
  )

Select the same value for the parameter: K=9

accuracy_results_fix = accuracy_results %>%
  filter(k_par==9)%>%
  mutate(
    distance_method = str_to_title(distance_method),
    datasets=as.factor(datasets)
  )

Creating data set names that contain the data size and the number of clusters; update the names in the results for the cross-validated K and for the fixed K=9

long_levels = benchmark_data %>% 
  mutate(
    cv_iterations = rerun(1:10, .n = n())
  ) %>% unnest(cv_iterations) %>% 
  mutate(
  class = c(rep(2,10),rep(2,10),rep(2,10),rep(2,10),rep(3,10),rep(3,10),rep(4,10),rep(19,10),rep(4,10)),
  data_n = map_dbl(prepped_data,nrow),
  data_p = map_dbl(prepped_data,ncol)-1,
  long_name = pmap_chr(.l=list(datasets,data_n,data_p,class),.f= ~paste0(..1," (n: ",..2,
                                                                         ", p: ",..3,", cl: ",..4,")"))
)%>% pull(long_name) %>% as.factor() %>% levels

nds_tuned=accuracy_results_tuned %>% pull(datasets)
levels(nds_tuned) = long_levels
accuracy_results_tuned$nds = nds_tuned
accuracy_results_tuned=accuracy_results_tuned %>%
  mutate(datasets = nds)

nds_fix=accuracy_results_fix %>% pull(datasets)
levels(nds_fix) = long_levels
accuracy_results_fix$nds = nds_fix
accuracy_results_fix = accuracy_results_fix %>%
  mutate(datasets = nds)

Create a structure with all the results (fixed and cross-validated)

all_results_tuned_vs_fix = rbind(accuracy_results_fix %>% mutate(hyper_par="b: fixed"),
                                 accuracy_results_tuned %>% mutate(hyper_par="a: tuned")
)

KNN results: cross-validated value for K

labelize <- Vectorize(function(x){library("stringr")
  word(x, start = 1, sep = "\\:")
}
)
a_tuned = accuracy_results_tuned %>% ungroup() %>% 
  mutate(ordered_dist = reorder_within(x =distance_method,by = cv_accuracy,
                                       within = datasets, sep = ":")
         ) %>% 
  rename(`distance measure` = `distance_method`) %>% 
  ggplot(aes(x = cv_accuracy, y = ordered_dist))+theme_minimal() +
  geom_point(aes(colour = `distance measure`, size = k_par))+
  geom_linerange(aes(xmin = cv_accuracy-sd_cv_accuracy,xmax = cv_accuracy+sd_cv_accuracy, y = ordered_dist), inherit_aes=FALSE)+
  facet_wrap(~datasets, scales = "free_y", nrow=3, ncol=3)+
  theme(axis.text.y =  element_text(size=5),
        axis.text.x =  element_text(angle=90, size=5),
        legend.position = "bottom"
  )  + 
  scale_y_discrete(label=labelize)+
  scale_size(range=c(.5,2.5))+guides(size="none")+
  labs(title="KNN", subtitle = "5-fold CV") +
  xlab("accuracy") + ylab("distances")+scale_size(range=c(.5,2.5))

a_tuned

ggsave(file="figures/knn_experiment_tuned.pdf",a_tuned,height=6,width=10)

KNN results: fixed K=9

a_fix = accuracy_results_fix %>% ungroup() %>% 
  mutate(ordered_dist = reorder_within(x =distance_method,by = cv_accuracy,
                                       within = datasets, sep = ":")
         ) %>% 
  rename(`distance measure` = `distance_method`) %>% 
  ggplot(aes(x = cv_accuracy, y = ordered_dist))+theme_minimal() +
  geom_point(aes(colour = `distance measure`, size = k_par))+
  geom_linerange(aes(xmin = cv_accuracy-sd_cv_accuracy,xmax = cv_accuracy+sd_cv_accuracy, y = ordered_dist), inherit_aes=FALSE)+
  facet_wrap(~datasets, scales = "free_y", nrow=3, ncol=3)+
  theme(axis.text.y =  element_text(size=5),
        axis.text.x =  element_text(angle=90, size=5),
        legend.position = "bottom"
  )  + 
  scale_y_discrete(label=labelize)+
  scale_size(range=c(.5,2.5))+guides(size="none")+
  labs(title="KNN", subtitle = "5-fold CV") +
  xlab("accuracy") + ylab("distances")+scale_size(range=c(.5,2.5))
a_fix

ggsave(file="figures/knn_experiment_k_fix.pdf",a_fix,height=6,width=10)

KNN results: cross-validated vs fixed K

Creating a plots grid to ease comparison

a_all = all_results_tuned_vs_fix %>%
   ungroup() %>%  
  mutate(
         ordered_dist = reorder_within(x =distance_method,by = cv_accuracy,
                                       within = datasets, sep = ":")
         ) %>% 
  rename(`distance measure` = `distance_method`) %>% 
  ggplot(aes(x = cv_accuracy, y = ordered_dist))+theme_bw() +
  geom_point(aes(colour = `distance measure`, size = k_par))+
  geom_linerange(aes(xmin = cv_accuracy-sd_cv_accuracy,xmax = cv_accuracy+sd_cv_accuracy, y = ordered_dist), inherit_aes=FALSE)+
  facet_nested_wrap( vars(hyper_par,datasets), scales = "free_y",strip.position = "top", nrow=2,dir = "h")+
  theme(axis.text.y =  element_text(size=5),
        axis.text.x =  element_text(angle=90, size=5),
        legend.position = "bottom"
  )  +
  scale_y_discrete(label=labelize)+ 
  scale_size(range=c(.5,2.5))+guides(size="none")+
  labs(title="KNN", subtitle = "5-fold CV") +
  xlab("accuracy") + ylab("distances")+scale_size(range=c(.5,2.5))


ggsave(file="figures/knn_experiment_k_all.pdf",a_all,height=6,width=18)

magick::image_read_pdf("./figures/knn_experiment_k_all.pdf",
                       pages = 1)

Unsupervised experiment: partitioning around medoids (PAM)

Apply the PAM on all fold/distance/dataset combinations, repeating the cross-validation process 10 times

pam_results_list=list()
for(cv_iter in 1:10){

benchmark_data_resamples_pam = benchmark_data %>% 
  mutate(
         cross_validation = map(.x=prepped_data,~vfold_cv(.x, v= 5, strata=response)),
         fold_id = map(.x=cross_validation,~.x$id),
         fold_splits = map(.x=cross_validation,~.x$splits),
         n_clusters  = map_dbl(.x=prepped_data,~length(levels(.x$response)))
  )%>% unnest(cols = distance_method) %>% unnest(cols = c(fold_id,fold_splits)) %>% #%>% unnest(cols = k_par)
  select(-cross_validation,-prepped_data) %>%
  mutate(train_fold=map(.x=fold_splits,.f=~analysis(.x)),
         test_fold=map(.x=fold_splits,.f=~assessment(.x))
  ) %>% 
  select(-fold_splits) %>%
    mutate(
         distance_res = pmap(.l=list(..1 = train_fold,..2 = distance_method,..3 = 1),
                        .f=~catdist(x = ..1 %>% select(-response),
                                  y = ..1 %>% pull(response),
                                  method = ..2,out_delta=..3)),
         distance_mat = map(.x=distance_res,.f=function(x=.x){x$distance_mat[is.na(x$distance_mat)]=0;
         return(x$distance_mat)}),
         delta = map(.x=distance_res,~.x$delta)
         )


benchmark_results_pam = benchmark_data_resamples_pam %>% 
  mutate(
      pam_solution = map2(.x=distance_mat,.y=n_clusters,.f=~pam(x = .x,k = .y)),
      pam_clust = map(.x=pam_solution, .f= ~return(.x$clustering %>% factor())),
      pam_medoid_ids = map(.x=pam_solution, ~return(.x$id.med)),
      true_clust_test = map(.x = test_fold, ~return(.x %>% pull(response))),
      test_data = map(.x = test_fold, ~return(.x %>% select(-response))),
      train_data = map(.x = train_fold, ~return(.x %>% select(-response))),
      medoids = map2(.x = train_data,.y=pam_medoid_ids, ~return(.x[.y,])),
      clust_test = pmap(.l=list(..1 = medoids,..2 = test_data,..3=delta),
                        ~predict_pam(medoids = ..1, newdata=..2, delta=..3)),
      measure =  map2_dbl(.x = true_clust_test,.y = clust_test,
                     .f=function(x=.x,y=.y) clustComp(c1 = x, c2 = y)$ARI
    )
  )
   
pam_results_list[[cv_iter]] = benchmark_results_pam %>% select(datasets,distance_method,measure,fold_id) %>% group_by(datasets,distance_method) %>% 
  summarise(ARI=mean(measure)) %>% ungroup() %>% mutate(cv_iteration=cv_iter)
}

save(file="pam_results_list.RData",pam_results_list)

Preparing the data structures and the corresponding labels for the plots

labelize <- Vectorize(function(x){library("stringr")
  word(x, start = 1, sep = "\\:")
}
)

long_levels = benchmark_data %>% mutate(
  class = c(2,2,2,2,3,3,4,19,4),
  data_n = map_dbl(prepped_data,nrow),
  data_p = map_dbl(prepped_data,ncol)-1,
  long_name = pmap_chr(.l=list(datasets,data_n,data_p,class),.f= ~paste0(..1," (n: ",..2,
                                                                         ", p: ",..3,", cl: ",..4,")"))
)%>% pull(long_name) %>% as.factor() %>% levels

pam_results=bind_rows(pam_results_list) %>% group_by(datasets,distance_method) %>% summarise(sd_ARI=sd(ARI),
                                                                                             ARI=mean(ARI))

pam_results = pam_results  %>%
  mutate(distance_method = str_to_title(distance_method),
         datasets=as.factor(datasets)
  )

nds = pam_results %>% pull(datasets) %>% as.factor
levels(nds) = long_levels

pam_results$nds = nds

pam_results=pam_results %>% 
  mutate(datasets = nds)

Create the plots for the results, with and without fixed ARI axis scale

pam_a = pam_results %>% 
  mutate(ordered_distances = reorder_within(distance_method,ARI,datasets,sep = ":")) %>% 
  rename(`distance measure` = `distance_method`) %>% 
  ggplot(aes(x = ARI, y = ordered_distances))+theme_minimal() +
  geom_point(aes(colour=`distance measure`)) +
  geom_linerange(aes(xmin = ARI-sd_ARI,xmax = ARI+sd_ARI, y = ordered_distances),inherit.aes=FALSE)+
  theme(axis.text.y =  element_text(size=5),
        axis.text.x =  element_text(angle=90,size=5),
        legend.position = "bottom"
  ) +
  facet_wrap(~datasets,scales = "free",nrow=3,ncol=3) + 
  scale_y_discrete(label=labelize)+
  labs(title="PAM clustering",subtitle = "k-medoids")+xlab("test ARI")+ ylab("distances")

 pam_a

pam_b = pam_results %>% 
  mutate(ordered_distances = reorder_within(distance_method,ARI,datasets,sep = ":")) %>% 
  rename(`distance measure` = `distance_method`) %>% 
  ggplot(aes(x = ARI, y = ordered_distances))+theme_minimal() +
  geom_point(aes(colour=`distance measure`)) +
  geom_linerange(aes(xmin = ARI-sd_ARI,xmax = ARI+sd_ARI, y = ordered_distances),inherit.aes=FALSE)+
  theme(axis.text.y =  element_text(size=5),
        axis.text.x =  element_text(angle=90,size=5),
        legend.position = "bottom"
  ) +
  facet_wrap(~datasets,scales = "free_y",nrow=3,ncol=3) + 
  scale_y_discrete(label=labelize)+
  labs(title="PAM clustering",subtitle = "k-medoids")+xlab("test ARI")+ ylab("distances")

 pam_b

ggsave(file="figures/test_pam_experiment.pdf",pam_a,height=6,width=10)
ggsave(file="figures/test_pam_experiment_fix_scale.pdf",pam_b,height=6,width=10)