knitr::opts_chunk$set(echo = TRUE,message = FALSE,warning = FALSE)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.1 ✔ rsample 1.1.0
## ✔ dials 1.0.0 ✔ tune 1.0.1
## ✔ infer 1.0.3 ✔ workflows 1.1.0
## ✔ modeldata 1.0.1 ✔ workflowsets 1.0.0
## ✔ parsnip 1.0.2 ✔ yardstick 1.1.0
## ✔ recipes 1.0.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(fastDummies)
library(aricode)
library(corrplot)
## corrplot 0.92 loaded
library(ggh4x)
library(magick)
## Linking to ImageMagick 6.9.12.3
## Enabled features: cairo, fontconfig, freetype, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fftw, ghostscript, x11
library(pdftools)
## Using poppler version 22.02.0
library(clustrd)
## Loading required package: grid
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(cluster)
library(Matrix)
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(purrr)
library(philentropy)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(tidytext)
source("R/cat_custom_delta.R")
source("R/z_preproc.R")
source("R/select_data.R")
source("R/catdist.R")
source("R/cat_delta.R")
source("R/predict_pam.R")
source("R/prepare_data.R")
source("R/cat_nearest_neighbors.R")
set.seed(123)
We consider different datasets and distances
selected_distances = c("tot_var_dist", "gifi_chi2",
"supervised","supervised_full", "matching","eskin",
"goodall_3","goodall_4","iof","of","lin",
"var_entropy","var_mutability","kullback-leibler")
dataset_names = c("vote","australian","wbcd", "tictac","balance","tae",
"lympho","soybeanlarge","cars")
benchmark_data = tibble(datasets=dataset_names) %>%
mutate(prepped_data = map(.x=datasets,~select_data(.x)),
prepped_data = map(.x=prepped_data, ~cbind(.x$df,response=.x$y)),
distance_method = rerun(selected_distances,.n=n())
)
For each dataset and distance considered, we do a KNN, tuning the parameter on the grid 1, 3, 5, 9, 15, 21 via 5-fold cross-validation. We repeat the process 10 times.
benchmark_data_resamples = benchmark_data %>%
mutate(
cv_iterations = rerun(1:10, .n = n())
) %>% unnest(cv_iterations) %>%
mutate(
cross_validation = map(.x=prepped_data,~vfold_cv(.x, v= 5, strata=response)),
fold_id = map(.x=cross_validation,~.x$id),
fold_splits = map(.x=cross_validation,~.x$splits),
k_par = rerun(c(1,3,5,9,15,21),.n=n()),
)%>% unnest(cols = distance_method)%>% unnest(cols = k_par) %>% unnest(cols = c(fold_id,fold_splits)) %>%
select(-cross_validation)
Apply the KNN on all fold/distance/dataset combinations
benchmark_results_training = benchmark_data_resamples %>%
mutate(knn_results = pmap(.l = list(fold_splits, k_par,distance_method),
.f = ~cat_nearest_neighbors(train_df = analysis(..1),
assess_df = assessment(..1),
k=..2, method=..3)
)
)
Collection of the cross-validation estimate of accuracy
accuracy_results = benchmark_results_training %>%
mutate(
same_levs = map_lgl(.x = knn_results,~(length(levels(.x$.pred))==length(levels(.x$truth)))),
classes = map_dbl(.x=prepped_data,~length(levels(.x$response))),
accuracy= map_dbl(.x = knn_results,
.f = function(x=.x){
x$.pred = factor(x$.pred, levels = levels(x$truth))
return(accuracy(x,truth=x$truth,estimate=x$.pred) %>% pull(.estimate)
)
}
)
) %>% group_by(datasets,distance_method,k_par,cv_iterations) %>%
summarise(cv_accuracy=mean(accuracy)) %>% ungroup() %>%
group_by(datasets,distance_method,k_par) %>%
summarise(sd_cv_accuracy=sd(cv_accuracy),
cv_accuracy=mean(cv_accuracy)
) %>% ungroup() %>%
group_by(datasets,distance_method)
Select the best K for each distance method and data set
accuracy_results_tuned = accuracy_results %>%
slice_max(cv_accuracy,with_ties = FALSE)%>%
mutate(
distance_method = str_to_title(distance_method),
datasets=as.factor(datasets)
)
Select the same value for the parameter: K=9
accuracy_results_fix = accuracy_results %>%
filter(k_par==9)%>%
mutate(
distance_method = str_to_title(distance_method),
datasets=as.factor(datasets)
)
Creating data set names that contain the data size and the number of clusters; update the names in the results for the cross-validated K and for the fixed K=9
long_levels = benchmark_data %>%
mutate(
cv_iterations = rerun(1:10, .n = n())
) %>% unnest(cv_iterations) %>%
mutate(
class = c(rep(2,10),rep(2,10),rep(2,10),rep(2,10),rep(3,10),rep(3,10),rep(4,10),rep(19,10),rep(4,10)),
data_n = map_dbl(prepped_data,nrow),
data_p = map_dbl(prepped_data,ncol)-1,
long_name = pmap_chr(.l=list(datasets,data_n,data_p,class),.f= ~paste0(..1," (n: ",..2,
", p: ",..3,", cl: ",..4,")"))
)%>% pull(long_name) %>% as.factor() %>% levels
nds_tuned=accuracy_results_tuned %>% pull(datasets)
levels(nds_tuned) = long_levels
accuracy_results_tuned$nds = nds_tuned
accuracy_results_tuned=accuracy_results_tuned %>%
mutate(datasets = nds)
nds_fix=accuracy_results_fix %>% pull(datasets)
levels(nds_fix) = long_levels
accuracy_results_fix$nds = nds_fix
accuracy_results_fix = accuracy_results_fix %>%
mutate(datasets = nds)
Create a structure with all the results (fixed and cross-validated)
all_results_tuned_vs_fix = rbind(accuracy_results_fix %>% mutate(hyper_par="b: fixed"),
accuracy_results_tuned %>% mutate(hyper_par="a: tuned")
)
labelize <- Vectorize(function(x){library("stringr")
word(x, start = 1, sep = "\\:")
}
)
a_tuned = accuracy_results_tuned %>% ungroup() %>%
mutate(ordered_dist = reorder_within(x =distance_method,by = cv_accuracy,
within = datasets, sep = ":")
) %>%
rename(`distance measure` = `distance_method`) %>%
ggplot(aes(x = cv_accuracy, y = ordered_dist))+theme_minimal() +
geom_point(aes(colour = `distance measure`, size = k_par))+
geom_linerange(aes(xmin = cv_accuracy-sd_cv_accuracy,xmax = cv_accuracy+sd_cv_accuracy, y = ordered_dist), inherit_aes=FALSE)+
facet_wrap(~datasets, scales = "free_y", nrow=3, ncol=3)+
theme(axis.text.y = element_text(size=5),
axis.text.x = element_text(angle=90, size=5),
legend.position = "bottom"
) +
scale_y_discrete(label=labelize)+
scale_size(range=c(.5,2.5))+guides(size="none")+
labs(title="KNN", subtitle = "5-fold CV") +
xlab("accuracy") + ylab("distances")+scale_size(range=c(.5,2.5))
a_tuned
ggsave(file="figures/knn_experiment_tuned.pdf",a_tuned,height=6,width=10)
a_fix = accuracy_results_fix %>% ungroup() %>%
mutate(ordered_dist = reorder_within(x =distance_method,by = cv_accuracy,
within = datasets, sep = ":")
) %>%
rename(`distance measure` = `distance_method`) %>%
ggplot(aes(x = cv_accuracy, y = ordered_dist))+theme_minimal() +
geom_point(aes(colour = `distance measure`, size = k_par))+
geom_linerange(aes(xmin = cv_accuracy-sd_cv_accuracy,xmax = cv_accuracy+sd_cv_accuracy, y = ordered_dist), inherit_aes=FALSE)+
facet_wrap(~datasets, scales = "free_y", nrow=3, ncol=3)+
theme(axis.text.y = element_text(size=5),
axis.text.x = element_text(angle=90, size=5),
legend.position = "bottom"
) +
scale_y_discrete(label=labelize)+
scale_size(range=c(.5,2.5))+guides(size="none")+
labs(title="KNN", subtitle = "5-fold CV") +
xlab("accuracy") + ylab("distances")+scale_size(range=c(.5,2.5))
a_fix
ggsave(file="figures/knn_experiment_k_fix.pdf",a_fix,height=6,width=10)
a_all = all_results_tuned_vs_fix %>%
ungroup() %>%
mutate(
ordered_dist = reorder_within(x =distance_method,by = cv_accuracy,
within = datasets, sep = ":")
) %>%
rename(`distance measure` = `distance_method`) %>%
ggplot(aes(x = cv_accuracy, y = ordered_dist))+theme_bw() +
geom_point(aes(colour = `distance measure`, size = k_par))+
geom_linerange(aes(xmin = cv_accuracy-sd_cv_accuracy,xmax = cv_accuracy+sd_cv_accuracy, y = ordered_dist), inherit_aes=FALSE)+
facet_nested_wrap( vars(hyper_par,datasets), scales = "free_y",strip.position = "top", nrow=2,dir = "h")+
theme(axis.text.y = element_text(size=5),
axis.text.x = element_text(angle=90, size=5),
legend.position = "bottom"
) +
scale_y_discrete(label=labelize)+
scale_size(range=c(.5,2.5))+guides(size="none")+
labs(title="KNN", subtitle = "5-fold CV") +
xlab("accuracy") + ylab("distances")+scale_size(range=c(.5,2.5))
ggsave(file="figures/knn_experiment_k_all.pdf",a_all,height=6,width=18)
magick::image_read_pdf("./figures/knn_experiment_k_all.pdf",
pages = 1)
Apply the PAM on all fold/distance/dataset combinations, repeating the cross-validation process 10 times
pam_results_list=list()
for(cv_iter in 1:10){
benchmark_data_resamples_pam = benchmark_data %>%
mutate(
cross_validation = map(.x=prepped_data,~vfold_cv(.x, v= 5, strata=response)),
fold_id = map(.x=cross_validation,~.x$id),
fold_splits = map(.x=cross_validation,~.x$splits),
n_clusters = map_dbl(.x=prepped_data,~length(levels(.x$response)))
)%>% unnest(cols = distance_method) %>% unnest(cols = c(fold_id,fold_splits)) %>% #%>% unnest(cols = k_par)
select(-cross_validation,-prepped_data) %>%
mutate(train_fold=map(.x=fold_splits,.f=~analysis(.x)),
test_fold=map(.x=fold_splits,.f=~assessment(.x))
) %>%
select(-fold_splits) %>%
mutate(
distance_res = pmap(.l=list(..1 = train_fold,..2 = distance_method,..3 = 1),
.f=~catdist(x = ..1 %>% select(-response),
y = ..1 %>% pull(response),
method = ..2,out_delta=..3)),
distance_mat = map(.x=distance_res,.f=function(x=.x){x$distance_mat[is.na(x$distance_mat)]=0;
return(x$distance_mat)}),
delta = map(.x=distance_res,~.x$delta)
)
benchmark_results_pam = benchmark_data_resamples_pam %>%
mutate(
pam_solution = map2(.x=distance_mat,.y=n_clusters,.f=~pam(x = .x,k = .y)),
pam_clust = map(.x=pam_solution, .f= ~return(.x$clustering %>% factor())),
pam_medoid_ids = map(.x=pam_solution, ~return(.x$id.med)),
true_clust_test = map(.x = test_fold, ~return(.x %>% pull(response))),
test_data = map(.x = test_fold, ~return(.x %>% select(-response))),
train_data = map(.x = train_fold, ~return(.x %>% select(-response))),
medoids = map2(.x = train_data,.y=pam_medoid_ids, ~return(.x[.y,])),
clust_test = pmap(.l=list(..1 = medoids,..2 = test_data,..3=delta),
~predict_pam(medoids = ..1, newdata=..2, delta=..3)),
measure = map2_dbl(.x = true_clust_test,.y = clust_test,
.f=function(x=.x,y=.y) clustComp(c1 = x, c2 = y)$ARI
)
)
pam_results_list[[cv_iter]] = benchmark_results_pam %>% select(datasets,distance_method,measure,fold_id) %>% group_by(datasets,distance_method) %>%
summarise(ARI=mean(measure)) %>% ungroup() %>% mutate(cv_iteration=cv_iter)
}
save(file="pam_results_list.RData",pam_results_list)
Preparing the data structures and the corresponding labels for the plots
labelize <- Vectorize(function(x){library("stringr")
word(x, start = 1, sep = "\\:")
}
)
long_levels = benchmark_data %>% mutate(
class = c(2,2,2,2,3,3,4,19,4),
data_n = map_dbl(prepped_data,nrow),
data_p = map_dbl(prepped_data,ncol)-1,
long_name = pmap_chr(.l=list(datasets,data_n,data_p,class),.f= ~paste0(..1," (n: ",..2,
", p: ",..3,", cl: ",..4,")"))
)%>% pull(long_name) %>% as.factor() %>% levels
pam_results=bind_rows(pam_results_list) %>% group_by(datasets,distance_method) %>% summarise(sd_ARI=sd(ARI),
ARI=mean(ARI))
pam_results = pam_results %>%
mutate(distance_method = str_to_title(distance_method),
datasets=as.factor(datasets)
)
nds = pam_results %>% pull(datasets) %>% as.factor
levels(nds) = long_levels
pam_results$nds = nds
pam_results=pam_results %>%
mutate(datasets = nds)
Create the plots for the results, with and without fixed ARI axis scale
pam_a = pam_results %>%
mutate(ordered_distances = reorder_within(distance_method,ARI,datasets,sep = ":")) %>%
rename(`distance measure` = `distance_method`) %>%
ggplot(aes(x = ARI, y = ordered_distances))+theme_minimal() +
geom_point(aes(colour=`distance measure`)) +
geom_linerange(aes(xmin = ARI-sd_ARI,xmax = ARI+sd_ARI, y = ordered_distances),inherit.aes=FALSE)+
theme(axis.text.y = element_text(size=5),
axis.text.x = element_text(angle=90,size=5),
legend.position = "bottom"
) +
facet_wrap(~datasets,scales = "free",nrow=3,ncol=3) +
scale_y_discrete(label=labelize)+
labs(title="PAM clustering",subtitle = "k-medoids")+xlab("test ARI")+ ylab("distances")
pam_a
pam_b = pam_results %>%
mutate(ordered_distances = reorder_within(distance_method,ARI,datasets,sep = ":")) %>%
rename(`distance measure` = `distance_method`) %>%
ggplot(aes(x = ARI, y = ordered_distances))+theme_minimal() +
geom_point(aes(colour=`distance measure`)) +
geom_linerange(aes(xmin = ARI-sd_ARI,xmax = ARI+sd_ARI, y = ordered_distances),inherit.aes=FALSE)+
theme(axis.text.y = element_text(size=5),
axis.text.x = element_text(angle=90,size=5),
legend.position = "bottom"
) +
facet_wrap(~datasets,scales = "free_y",nrow=3,ncol=3) +
scale_y_discrete(label=labelize)+
labs(title="PAM clustering",subtitle = "k-medoids")+xlab("test ARI")+ ylab("distances")
pam_b
ggsave(file="figures/test_pam_experiment.pdf",pam_a,height=6,width=10)
ggsave(file="figures/test_pam_experiment_fix_scale.pdf",pam_b,height=6,width=10)