library("tidyverse")
library("kableExtra")
library("purrr")
library("clusterGeneration")
library("RSpectra")
library("fpc")
library("catdist")
source("R/core_spectral.r")
source("R/catdist.R")
n=1000
note: the t2/t4 code for balanced/unbalanced scenario is inverted for continuous and categorical
4 active vars, low strength
# categorical
load(file = "data/m_pop_t2_s2_act_4_noise_low.RData")
sc_unbal_low_strength_act_4_noise_low=rep_l
# continuous
load(file = "data/continuous_NC_t2_s2_4_noise_low.rdata")
cont_sc_unbal_low_strength_act_4_noise_low=dataSim
4 active vars, high strength
# categorical
load(file = "data/m_pop_t2_s3_act_4_noise_low.RData")
sc_unbal_high_strength_act_4_noise_low=rep_l
# continuous
load(file = "data/continuous_NC_t2_s3_4_noise_low.rdata")
cont_sc_unbal_high_strength_act_4_noise_low=dataSim
8 active vars, low strength
# categorical
load(file = "data/m_pop_t2_s2_act_8_noise_low.RData")
sc_unbal_low_strength_act_8_noise_low=rep_l
# continuous
load(file = "data/continuous_t2_s2_8_noise_low.rdata")
cont_sc_unbal_low_strength_act_8_noise_low=dataSim
8 active vars, high strength
# categorical
load(file = "data/m_pop_t2_s3_act_8_noise_low.RData")
sc_unbal_high_strength_act_8_noise_low=rep_l
# continuous
load(file = "data/continuous_t2_s3_8_noise_low.rdata")
cont_sc_unbal_high_strength_act_8_noise_low=dataSim
4 active vars, low strength
# categorical
load(file = "data/m_pop_t4_s2_act_4_noise_low.RData")
sc_bal_low_strength_act_4_noise_low=rep_l
# continuous
load(file = "data/continuous_NC_t4_s2_4_noise_low.rdata")
cont_sc_bal_low_strength_act_4_noise_low=dataSim
4 active vars, high strength
# categorical
load(file = "data/m_pop_t4_s3_act_4_noise_low.RData")
sc_bal_high_strength_act_4_noise_low=rep_l
# continuous
load(file = "data/continuous_NC_t4_s3_4_noise_low.rdata")
cont_sc_bal_high_strength_act_4_noise_low=dataSim
8 active vars, low strength
# categorical
load(file = "data/m_pop_t4_s2_act_8_noise_low.RData")
sc_bal_low_strength_act_8_noise_low=rep_l
# continuous
load(file = "data/continuous_t4_s2_8_noise_low.rdata")
cont_sc_bal_low_strength_act_8_noise_low=dataSim
8 active vars, high strength
# categorical
load(file = "data/m_pop_t4_s3_act_8_noise_low.RData")
sc_bal_high_strength_act_8_noise_low=rep_l
# continuous
load(file = "data/continuous_t4_s3_8_noise_low.rdata")
cont_sc_bal_high_strength_act_8_noise_low=dataSim
balanced = ordered(c("no", "yes"))
actives=c(4,8)
strength = ordered(c("mild", "high"))
scenario_data = crossing(balanced,actives, strength) %>%
arrange(balanced,actives,desc(strength)) %>%
mutate(
data = list(
sc_unbal_low_strength_act_4_noise_low[[1]]$scenario_data %>%
.[as.numeric(sc_unbal_low_strength_act_4_noise_low[[1]]$trueCluID) %>% order(),],
sc_unbal_high_strength_act_4_noise_low[[1]]$scenario_data %>%
.[as.numeric(sc_unbal_high_strength_act_4_noise_low[[1]]$trueCluID) %>% order(),],
sc_unbal_low_strength_act_8_noise_low[[1]]$scenario_data %>%
.[as.numeric(sc_unbal_low_strength_act_8_noise_low[[1]]$trueCluID) %>% order(),],
sc_unbal_high_strength_act_8_noise_low[[1]]$scenario_data %>%
.[as.numeric(sc_unbal_high_strength_act_8_noise_low[[1]]$trueCluID) %>% order(),],
sc_bal_low_strength_act_4_noise_low[[1]]$scenario_data %>%
.[as.numeric(sc_bal_low_strength_act_4_noise_low[[1]]$trueCluID) %>% order(),],
sc_bal_high_strength_act_4_noise_low[[1]]$scenario_data %>%
.[as.numeric(sc_bal_high_strength_act_4_noise_low[[1]]$trueCluID) %>% order(),],
sc_bal_low_strength_act_8_noise_low[[1]]$scenario_data %>%
.[as.numeric(sc_bal_low_strength_act_8_noise_low[[1]]$trueCluID) %>% order(),],
sc_bal_high_strength_act_8_noise_low[[1]]$scenario_data %>%
.[as.numeric(sc_bal_high_strength_act_8_noise_low[[1]]$trueCluID) %>% order(),]
),
cont_data = list(
cont_sc_unbal_low_strength_act_4_noise_low[[1]]$scenario_data,
cont_sc_unbal_high_strength_act_4_noise_low[[1]]$scenario_data,
cont_sc_unbal_low_strength_act_8_noise_low[[1]]$scenario_data,
cont_sc_unbal_high_strength_act_8_noise_low[[1]]$scenario_data,
cont_sc_bal_low_strength_act_4_noise_low[[1]]$scenario_data,
cont_sc_bal_high_strength_act_4_noise_low[[1]]$scenario_data,
cont_sc_bal_low_strength_act_8_noise_low[[1]]$scenario_data,
cont_sc_bal_high_strength_act_8_noise_low[[1]]$scenario_data
),
true_clusters=list(
sc_unbal_low_strength_act_4_noise_low[[1]]$trueCluID %>%
.[as.numeric(sc_unbal_low_strength_act_4_noise_low[[1]]$trueCluID) %>% order()],
sc_unbal_high_strength_act_4_noise_low[[1]]$trueCluID %>%
.[as.numeric(sc_unbal_high_strength_act_4_noise_low[[1]]$trueCluID) %>% order()],
sc_unbal_low_strength_act_8_noise_low[[1]]$trueCluID %>%
.[as.numeric(sc_unbal_low_strength_act_8_noise_low[[1]]$trueCluID) %>% order()],
sc_unbal_high_strength_act_8_noise_low[[1]]$trueCluID %>%
.[as.numeric(sc_unbal_high_strength_act_8_noise_low[[1]]$trueCluID) %>% order()],
sc_bal_low_strength_act_4_noise_low[[1]]$trueCluID %>%
.[as.numeric(sc_bal_low_strength_act_4_noise_low[[1]]$trueCluID) %>% order()],
sc_bal_high_strength_act_4_noise_low[[1]]$trueCluID %>%
.[as.numeric(sc_bal_high_strength_act_4_noise_low[[1]]$trueCluID) %>% order()],
sc_bal_low_strength_act_8_noise_low[[1]]$trueCluID %>%
.[as.numeric(sc_bal_low_strength_act_8_noise_low[[1]]$trueCluID) %>% order()],
sc_bal_high_strength_act_8_noise_low[[1]]$trueCluID %>%
.[as.numeric(sc_bal_high_strength_act_8_noise_low[[1]]$trueCluID) %>% order()]
),
cont_true_clusters=list(
as.factor(cont_sc_unbal_low_strength_act_4_noise_low[[1]]$trueCluID),
as.factor(cont_sc_unbal_high_strength_act_4_noise_low[[1]]$trueCluID),
as.factor(cont_sc_unbal_low_strength_act_8_noise_low[[1]]$trueCluID),
as.factor(cont_sc_unbal_high_strength_act_8_noise_low[[1]]$trueCluID),
as.factor(cont_sc_bal_low_strength_act_4_noise_low[[1]]$trueCluID),
as.factor(cont_sc_bal_high_strength_act_4_noise_low[[1]]$trueCluID),
as.factor(cont_sc_bal_low_strength_act_8_noise_low[[1]]$trueCluID),
as.factor(cont_sc_bal_high_strength_act_8_noise_low[[1]]$trueCluID)
)
)
selected_distances = c("matching","tot_var_dist")
# selected_distances = c("matching","tot_var_dist", "gifi_chi2","kullback-leibler")
scenario_data = crossing(scenario_data,selected_distances)
The spectral clustering is applied on
scenario_analysis_full = scenario_data %>%
mutate(
cat_distance_matrix = map2(.x = data,.y = selected_distances,.f=~catdist(x=.x, method = .y)),
cont_distance_matrix = map(.x = cont_data,.f=~dist(x=.x, method = "euclidean") %>% as.matrix()),
full_distance_matrix = map2(.x = cat_distance_matrix,.y=cont_distance_matrix, ~.x$distance_mat+.y),
n_clusters = map_dbl(.x = cont_true_clusters,.f=~nlevels(x=.x)),
spectral_results_cont = map2(.x = cont_distance_matrix, .y = n_clusters, .f=~core_spectral(Dist = .x, K = .y)),
spectral_results_cat = map2(.x = cat_distance_matrix, .y = n_clusters, .f=~core_spectral(Dist = .x$distance_mat, K = .y)),
spectral_results_mix = map2(.x = full_distance_matrix, .y = n_clusters, .f=~core_spectral(Dist = .x, K = .y)))
scenario_analysis_full %>%
rename(spectral_results = spectral_results_cont) %>%
mutate(
between_within_ratio = map_dbl(.x=spectral_results, ~.x[["bt_wt"]]),
spectral_clusters = map(.x=spectral_results, ~.x[["labels"]]),
true_clusters = map(.x=true_clusters,~as.integer(.x)),
spectral_ari = map2(.x = true_clusters, .y = spectral_clusters,
.f=~cluster.stats(
clustering = .x,alt.clustering = .y,compareonly = TRUE)$corrected.rand
)
) %>%
dplyr::select(1:3,selected_distances,spectral_ari) %>%
filter(selected_distances=="matching") %>%
ggplot(aes(x=balanced,y=spectral_ari,fill=balanced)) +
geom_col()+facet_wrap(actives~strength)
scenario_analysis_cat = scenario_analysis_full %>%
rename(spectral_results = spectral_results_cat) %>%
mutate(
between_within_ratio = map_dbl(.x=spectral_results, ~.x[["bt_wt"]]),
spectral_clusters = map(.x=spectral_results, ~.x[["labels"]]),
true_clusters = map(.x=true_clusters,~as.integer(.x)),
spectral_ari = map2(.x = true_clusters, .y = spectral_clusters,
.f=~cluster.stats(
clustering = .x,alt.clustering = .y,compareonly = TRUE)$corrected.rand
)
)
scenario_analysis_cat %>%
dplyr::select(1:3,selected_distances,spectral_ari) %>%
filter(balanced=="yes") %>%
ggplot(aes(x=selected_distances,y=spectral_ari,fill=selected_distances)) +
geom_col()+facet_wrap(actives~strength)
scenario_analysis_cat %>%
dplyr::select(1:3,selected_distances,spectral_ari) %>%
filter(balanced=="no") %>%
ggplot(aes(x=selected_distances,y=spectral_ari,fill=selected_distances)) +
geom_col()+facet_wrap(actives~strength)
scenario_analysis_mix = scenario_analysis_full %>%
rename(spectral_results = spectral_results_mix) %>%
mutate(
between_within_ratio = map_dbl(.x=spectral_results, ~.x[["bt_wt"]]),
spectral_clusters = map(.x=spectral_results, ~.x[["labels"]]),
true_clusters = map(.x=true_clusters,~as.integer(.x)),
spectral_ari = map2(.x = true_clusters, .y = spectral_clusters,
.f=~cluster.stats(
clustering = .x,alt.clustering = .y,compareonly = TRUE)$corrected.rand
)
)
scenario_analysis_mix %>%
dplyr::select(1:3,selected_distances,spectral_ari) %>%
filter(balanced=="yes") %>%
ggplot(aes(x=selected_distances,y=spectral_ari,fill=selected_distances)) +
geom_col()+facet_wrap(actives~strength)
scenario_analysis_mix %>%
dplyr::select(1:3,selected_distances,spectral_ari) %>%
filter(balanced=="no") %>%
ggplot(aes(x=selected_distances,y=spectral_ari,fill=selected_distances)) +
geom_col()+facet_wrap(actives~strength)