library("tidyverse")
library("kableExtra")
library("purrr")
library("clusterGeneration")
library("RSpectra")
library("fpc")
library("catdist")
source("R/core_spectral.r")
source("R/catdist.R")
n=1000

Spectral clustering for mixed data

Loading/generating data

note: the t2/t4 code for balanced/unbalanced scenario is inverted for continuous and categorical

  • I renamed the RData files so that the categorical and the continuous files match.

Loading the unbalanced scenarios

4 active vars, low strength

# categorical
load(file = "data/m_pop_t2_s2_act_4_noise_low.RData")
sc_unbal_low_strength_act_4_noise_low=rep_l

# continuous
load(file = "data/continuous_NC_t2_s2_4_noise_low.rdata")
cont_sc_unbal_low_strength_act_4_noise_low=dataSim

4 active vars, high strength

# categorical
load(file = "data/m_pop_t2_s3_act_4_noise_low.RData")
sc_unbal_high_strength_act_4_noise_low=rep_l
# continuous
load(file = "data/continuous_NC_t2_s3_4_noise_low.rdata")
cont_sc_unbal_high_strength_act_4_noise_low=dataSim

8 active vars, low strength

# categorical
load(file = "data/m_pop_t2_s2_act_8_noise_low.RData")
sc_unbal_low_strength_act_8_noise_low=rep_l
# continuous
load(file = "data/continuous_t2_s2_8_noise_low.rdata")
cont_sc_unbal_low_strength_act_8_noise_low=dataSim

8 active vars, high strength

# categorical
load(file = "data/m_pop_t2_s3_act_8_noise_low.RData")
sc_unbal_high_strength_act_8_noise_low=rep_l
# continuous
load(file = "data/continuous_t2_s3_8_noise_low.rdata")
cont_sc_unbal_high_strength_act_8_noise_low=dataSim

Loading the balanced scenarios

4 active vars, low strength

# categorical
load(file = "data/m_pop_t4_s2_act_4_noise_low.RData")
sc_bal_low_strength_act_4_noise_low=rep_l
# continuous
load(file = "data/continuous_NC_t4_s2_4_noise_low.rdata")
cont_sc_bal_low_strength_act_4_noise_low=dataSim

4 active vars, high strength

# categorical
load(file = "data/m_pop_t4_s3_act_4_noise_low.RData")
sc_bal_high_strength_act_4_noise_low=rep_l
# continuous
load(file = "data/continuous_NC_t4_s3_4_noise_low.rdata")
cont_sc_bal_high_strength_act_4_noise_low=dataSim

8 active vars, low strength

# categorical
load(file = "data/m_pop_t4_s2_act_8_noise_low.RData")
sc_bal_low_strength_act_8_noise_low=rep_l
# continuous
load(file = "data/continuous_t4_s2_8_noise_low.rdata")
cont_sc_bal_low_strength_act_8_noise_low=dataSim

8 active vars, high strength

# categorical
load(file = "data/m_pop_t4_s3_act_8_noise_low.RData")
sc_bal_high_strength_act_8_noise_low=rep_l
# continuous
load(file = "data/continuous_t4_s3_8_noise_low.rdata")
cont_sc_bal_high_strength_act_8_noise_low=dataSim

Creating the list structure with the data

balanced = ordered(c("no", "yes"))
actives=c(4,8)
strength = ordered(c("mild", "high"))

scenario_data = crossing(balanced,actives, strength) %>% 
  arrange(balanced,actives,desc(strength)) %>% 
  mutate(
    data = list(
      sc_unbal_low_strength_act_4_noise_low[[1]]$scenario_data %>% 
        .[as.numeric(sc_unbal_low_strength_act_4_noise_low[[1]]$trueCluID) %>% order(),],
      sc_unbal_high_strength_act_4_noise_low[[1]]$scenario_data %>% 
        .[as.numeric(sc_unbal_high_strength_act_4_noise_low[[1]]$trueCluID) %>% order(),],
      sc_unbal_low_strength_act_8_noise_low[[1]]$scenario_data %>% 
        .[as.numeric(sc_unbal_low_strength_act_8_noise_low[[1]]$trueCluID) %>% order(),],
      sc_unbal_high_strength_act_8_noise_low[[1]]$scenario_data %>% 
        .[as.numeric(sc_unbal_high_strength_act_8_noise_low[[1]]$trueCluID) %>% order(),],
      sc_bal_low_strength_act_4_noise_low[[1]]$scenario_data %>% 
        .[as.numeric(sc_bal_low_strength_act_4_noise_low[[1]]$trueCluID) %>% order(),],
      sc_bal_high_strength_act_4_noise_low[[1]]$scenario_data %>% 
        .[as.numeric(sc_bal_high_strength_act_4_noise_low[[1]]$trueCluID) %>% order(),],
      sc_bal_low_strength_act_8_noise_low[[1]]$scenario_data %>% 
        .[as.numeric(sc_bal_low_strength_act_8_noise_low[[1]]$trueCluID) %>% order(),],
      sc_bal_high_strength_act_8_noise_low[[1]]$scenario_data %>% 
        .[as.numeric(sc_bal_high_strength_act_8_noise_low[[1]]$trueCluID) %>% order(),]
    ),
    cont_data = list(
      cont_sc_unbal_low_strength_act_4_noise_low[[1]]$scenario_data,
      cont_sc_unbal_high_strength_act_4_noise_low[[1]]$scenario_data,
      cont_sc_unbal_low_strength_act_8_noise_low[[1]]$scenario_data,
      cont_sc_unbal_high_strength_act_8_noise_low[[1]]$scenario_data,
      cont_sc_bal_low_strength_act_4_noise_low[[1]]$scenario_data,
      cont_sc_bal_high_strength_act_4_noise_low[[1]]$scenario_data,
      cont_sc_bal_low_strength_act_8_noise_low[[1]]$scenario_data,
      cont_sc_bal_high_strength_act_8_noise_low[[1]]$scenario_data
    ),
    true_clusters=list(
      sc_unbal_low_strength_act_4_noise_low[[1]]$trueCluID %>% 
        .[as.numeric(sc_unbal_low_strength_act_4_noise_low[[1]]$trueCluID) %>% order()],
      sc_unbal_high_strength_act_4_noise_low[[1]]$trueCluID %>% 
        .[as.numeric(sc_unbal_high_strength_act_4_noise_low[[1]]$trueCluID) %>% order()],
      sc_unbal_low_strength_act_8_noise_low[[1]]$trueCluID %>% 
        .[as.numeric(sc_unbal_low_strength_act_8_noise_low[[1]]$trueCluID) %>% order()],
      sc_unbal_high_strength_act_8_noise_low[[1]]$trueCluID %>% 
        .[as.numeric(sc_unbal_high_strength_act_8_noise_low[[1]]$trueCluID) %>% order()],
      sc_bal_low_strength_act_4_noise_low[[1]]$trueCluID %>% 
        .[as.numeric(sc_bal_low_strength_act_4_noise_low[[1]]$trueCluID) %>% order()],
      sc_bal_high_strength_act_4_noise_low[[1]]$trueCluID %>% 
        .[as.numeric(sc_bal_high_strength_act_4_noise_low[[1]]$trueCluID) %>% order()],
      sc_bal_low_strength_act_8_noise_low[[1]]$trueCluID %>% 
        .[as.numeric(sc_bal_low_strength_act_8_noise_low[[1]]$trueCluID) %>% order()],      
      sc_bal_high_strength_act_8_noise_low[[1]]$trueCluID %>% 
        .[as.numeric(sc_bal_high_strength_act_8_noise_low[[1]]$trueCluID) %>% order()]
    ),
    cont_true_clusters=list(
      as.factor(cont_sc_unbal_low_strength_act_4_noise_low[[1]]$trueCluID),
      as.factor(cont_sc_unbal_high_strength_act_4_noise_low[[1]]$trueCluID),
      as.factor(cont_sc_unbal_low_strength_act_8_noise_low[[1]]$trueCluID),
      as.factor(cont_sc_unbal_high_strength_act_8_noise_low[[1]]$trueCluID),
      as.factor(cont_sc_bal_low_strength_act_4_noise_low[[1]]$trueCluID),
      as.factor(cont_sc_bal_high_strength_act_4_noise_low[[1]]$trueCluID),
      as.factor(cont_sc_bal_low_strength_act_8_noise_low[[1]]$trueCluID),
      as.factor(cont_sc_bal_high_strength_act_8_noise_low[[1]]$trueCluID)
    )
  )

Distance computations

selected_distances = c("matching","tot_var_dist")
# selected_distances = c("matching","tot_var_dist", "gifi_chi2","kullback-leibler")
scenario_data = crossing(scenario_data,selected_distances)

Spectral clustering

The spectral clustering is applied on

scenario_analysis_full = scenario_data %>% 
  mutate(
    cat_distance_matrix = map2(.x = data,.y = selected_distances,.f=~catdist(x=.x, method = .y)),
    cont_distance_matrix = map(.x = cont_data,.f=~dist(x=.x, method = "euclidean") %>% as.matrix()),
    full_distance_matrix = map2(.x = cat_distance_matrix,.y=cont_distance_matrix, ~.x$distance_mat+.y),
    n_clusters = map_dbl(.x = cont_true_clusters,.f=~nlevels(x=.x)),
    spectral_results_cont = map2(.x = cont_distance_matrix, .y = n_clusters, .f=~core_spectral(Dist = .x, K = .y)),
spectral_results_cat = map2(.x = cat_distance_matrix, .y = n_clusters, .f=~core_spectral(Dist = .x$distance_mat, K = .y)),
spectral_results_mix = map2(.x = full_distance_matrix, .y = n_clusters, .f=~core_spectral(Dist = .x, K = .y)))

Continuous case

scenario_analysis_full %>% 
  rename(spectral_results = spectral_results_cont) %>% 
  mutate(
    between_within_ratio = map_dbl(.x=spectral_results, ~.x[["bt_wt"]]),
    spectral_clusters = map(.x=spectral_results, ~.x[["labels"]]),
    true_clusters = map(.x=true_clusters,~as.integer(.x)),
    spectral_ari = map2(.x = true_clusters, .y = spectral_clusters,
                        .f=~cluster.stats(
                          clustering = .x,alt.clustering = .y,compareonly = TRUE)$corrected.rand
                        )
  ) %>% 
  dplyr::select(1:3,selected_distances,spectral_ari) %>%
  filter(selected_distances=="matching") %>% 
  ggplot(aes(x=balanced,y=spectral_ari,fill=balanced)) + 
  geom_col()+facet_wrap(actives~strength)

Categorical case

scenario_analysis_cat = scenario_analysis_full %>% 
  rename(spectral_results = spectral_results_cat) %>% 
  mutate(
    between_within_ratio = map_dbl(.x=spectral_results, ~.x[["bt_wt"]]),
    spectral_clusters = map(.x=spectral_results, ~.x[["labels"]]),
    true_clusters = map(.x=true_clusters,~as.integer(.x)),
    spectral_ari = map2(.x = true_clusters, .y = spectral_clusters,
                        .f=~cluster.stats(
                          clustering = .x,alt.clustering = .y,compareonly = TRUE)$corrected.rand
                        )
  )

scenario_analysis_cat %>% 
  dplyr::select(1:3,selected_distances,spectral_ari) %>%
  filter(balanced=="yes") %>% 
  ggplot(aes(x=selected_distances,y=spectral_ari,fill=selected_distances)) + 
  geom_col()+facet_wrap(actives~strength)

scenario_analysis_cat %>% 
  dplyr::select(1:3,selected_distances,spectral_ari) %>%
  filter(balanced=="no") %>% 
  ggplot(aes(x=selected_distances,y=spectral_ari,fill=selected_distances)) + 
  geom_col()+facet_wrap(actives~strength)

Mixed case

scenario_analysis_mix = scenario_analysis_full %>% 
  rename(spectral_results = spectral_results_mix) %>% 
  mutate(
    between_within_ratio = map_dbl(.x=spectral_results, ~.x[["bt_wt"]]),
    spectral_clusters = map(.x=spectral_results, ~.x[["labels"]]),
    true_clusters = map(.x=true_clusters,~as.integer(.x)),
    spectral_ari = map2(.x = true_clusters, .y = spectral_clusters,
                        .f=~cluster.stats(
                          clustering = .x,alt.clustering = .y,compareonly = TRUE)$corrected.rand
                        )
  )

scenario_analysis_mix %>% 
  dplyr::select(1:3,selected_distances,spectral_ari) %>%
  filter(balanced=="yes") %>% 
  ggplot(aes(x=selected_distances,y=spectral_ari,fill=selected_distances)) + 
  geom_col()+facet_wrap(actives~strength)

scenario_analysis_mix %>% 
  dplyr::select(1:3,selected_distances,spectral_ari) %>%
  filter(balanced=="no") %>% 
  ggplot(aes(x=selected_distances,y=spectral_ari,fill=selected_distances)) + 
  geom_col()+facet_wrap(actives~strength)