Unbiased mixed-variable distance

supplementary material

Code
packages <- c(
  "aricode", "cluster", "fpc", "manydist", "patchwork", "tidyverse",
  "tidymodels", "varhandle", "vegan", "kdml", "ggplot2", "viridis",
  "dplyr", "entropy", "kernlab", "mclust", "Matrix", "philentropy",
  "conflicted", "ggnewscale", "glue"
)

invisible(lapply(packages, require, character.only = TRUE))

tidymodels_prefer()
conflict_prefer("select", "dplyr")
conflict_prefer("filter", "dplyr")
conflict_prefer("distance", "philentropy")

# prefer base set operations (if needed later)
conflict_prefer("intersect", "base")
conflict_prefer("setdiff", "base")

canon_method <- function(m) dplyr::recode(m, dkss = "dkps")

# helper: rinomina colonne solo se esistono (non rompe nulla se non ci sono)
canon_cols <- function(df) {
  df |> dplyr::rename_with(canon_method, .cols = dplyr::any_of("dkss"))
}

Here we provide the code to reproduce the results of the supplementary material of the paper “Unbiased mixed variables distances”.

Data generation

These are the simulation parameers

Code
reps<-100  # 100 can take 30min
n<-500
k<-2 # Dimension solution
sigma<-0.03 # Noise

porig<-6 # Number of variables corresponding to true config
pnnoise<-0 # Number of numerical noise vars
pcnoise<-0 # Number of categorical noise vars
pn<-2 # Number of numerical variables underying config

pnum<-pn+pnnoise+pcnoise  # Total number of numerical vars+noise vars (Needed to know which vars to discretize)
p<-porig+pnnoise+pcnoise # Total number of variables
qoptions<-c(2,3,5,9) # Distribution of the categorical variables corresponding to true config
pcat<-length(qoptions) # Number of cat variables underlying config


methods = c("baseline","naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")
method_levels <- c("baseline", "naive", "hl", "hl_add", "gower","gudmm", "dkps", "mod_gower", "u_ind", "u_dep", "u_mix")


my_colors <- c(
  # Set B
  "baseline"    = "#8C564B",  # brown
  "naive"       = "#E377C2",  # pink
  "hl"          = "#9C9E00",  # darker yellow-green (better contrast)
  "gudmm"       = "#FF7F0E",  # orange
  "dkps"        = "#7F7F7F",  # darker grey (more visible)

  # Set A
  "hl_add"      = "#9EC5E5",  # light blue
  "u_ind"       = "#2CA02C",  # green
  "u_dep"       = "#D62728",  # red
  "u_mix" = "#1E90FF",
  "gower"       = "#FFBF00",  # gold
  "mod_gower"   = "#9467BD"   # purple
)

my_shapes <- c(
  "baseline"    = 16,  # filled circle
  "naive"       = 17,  # filled triangle
  "hl"          = 15,  # filled square
  "hl_add"      = 3,   # plus
  "gower"       = 4,   # cross
  "gudmm"       = 18,  # filled diamond
  "dkps"        = 8,   # star
  "mod_gower"   = 0,   # empty square
  "u_ind"       = 1,   # empty circle
  "u_dep"       = 2,    # empty triangle
  "u_mix" = 5 # empty diamond
)

The considered methods are

additive distances

  • baseline: Manhattan distance on all the standardized numerical variables (before discretization)

  • gower: classic Gower distance

  • mod_gower: modified version of the Gower distance, as proposed in Liu et al. (2024)

  • hl_add: Manhattan distance with scaling of indicators as proposed by Hennig and Liao (2013)

  • u_ind: commensurable with simple matching

  • u_dep: commensurable PCA-scaled numerical and total-variance categorical

  • u_mix: commensurable Manhattan numerical and total-variance categorical

non-additive distances

  • naive: Euclidean distance on all variables, with one-hot encoded categorical variables.

  • hl: Euclidean distance with scaling of indicators as proposed by Hennig and Liao (2013)

  • gudmm: Generalized Unified Distance Metric for Mixed-type data as proposed by Mousavi and Sehhati (2023)

  • dkps: Distance using kernel product similarity as proposed by Ghashti and Thompson (2024)

Compute distances: leave one variable out

Code
library(progress)


pb <- progress_bar$new(
  format = "  Simulating [:bar] :percent eta: :eta",
  total = reps * length(methods),
  clear = FALSE,
  width = 60
)

# methods = c("baseline","naive","gower","hl","u_dep","u_ind","gudmm","dkss","mod_gower")


generated_datasets = tibble(replicate=1:reps) |> 
  mutate(dataset = map(.x=replicate, ~generate_dataset(n = n, pn=pn, porig=porig, pnnoise =pnnoise , pcnoise = pcnoise, sigma = sigma, qoptions = qoptions, seed = 1234+.x)),
         method=map(.x=replicate,~methods)
         ) |> unnest(method)|>
  left_join(mdist_method_lookup, by = "method")
Code
## a little helper for mdist

run_mdist_within <- function(df, mdist_type, mdist_preset, param_set, outcome = "response") {
  x <- df

  if (is.character(outcome) && length(outcome) == 1 && outcome %in% names(df)) {
    x <- dplyr::select(df, -dplyr::all_of(outcome))
  }

  if (mdist_type == "preset") {
    out <- manydist::mdist(x = x, preset = mdist_preset)

  } else if (mdist_type == "custom") {
    if (is.null(param_set)) stop("param_set is NULL for mdist_type = 'custom'")
    args <- param_set
    args$x <- x
    out <- do.call(manydist::mdist, args = args)

  } else {
    stop("Unknown mdist_type: ", mdist_type)
  }
if (identical(mdist_type, "preset") && identical(mdist_preset, "gower")) {
  
  out$distance*ncol(x)
  }else{
  out$distance
  }
}

# little helper for the leave one variable out version of mdist
run_lovo <- function(df, mdist_type, mdist_preset, param_set,
                     outcome = NULL, dims = 2, keep_dist = FALSE,
                     legacy_gower_sum = FALSE) {

  # drop outcome if present
  x <- df
  if (is.character(outcome) && length(outcome) == 1 && outcome %in% names(df)) {
    x <- dplyr::select(df, -dplyr::all_of(outcome))
  }

  # ---- special case: legacy Gower SUM scaling for LOVO ----
  if (legacy_gower_sum && identical(mdist_type, "preset") && identical(mdist_preset, "gower")) {

    vars <- names(x)
    p_full <- length(vars)

    # full
    D_full <- manydist::mdist(x = x, preset = "gower")$distance |> as.matrix()
    D_full <- D_full * p_full

    base_mds <- cmdscale(D_full, eig = TRUE, k = dims)$points[, 1:dims, drop = FALSE]
    loo_list <- vector("list", length(vars)); names(loo_list) <- vars

    for (i in seq_along(vars)) {
      var <- vars[i]
      x_sub <- dplyr::select(x, -dplyr::any_of(var))
      p_loo <- ncol(x_sub)

      D_loo <- manydist::mdist(x = x_sub, preset = "gower")$distance |> as.matrix()
      D_loo <- D_loo * p_loo

      loo_list[[i]] <- D_loo
    }

    mad <- vapply(loo_list, function(m) mean(abs(D_full - m)), numeric(1))
    cc  <- vapply(loo_list, function(m) {
      pts <- cmdscale(m, eig = TRUE, k = dims)$points[, 1:dims, drop = FALSE]
      congruence_coeff(base_mds, pts)
    }, numeric(1))
    ac <- sqrt(1 - cc^2)

    res <- tibble::tibble(
      variable       = vars,
      mad_importance = mad,
      cc_importance  = cc,
      ac_importance  = ac,
      mad_normalized = mad / sum(mad)
    )

    out <- list(results = res, base_mds = base_mds)
    if (keep_dist) {
      out$full_dist <- D_full
      out$loo_dist  <- loo_list
    }
    return(out)
  }

  # ---- default path: your package's LOVO ----
  if (mdist_type == "preset") {
    obj <- manydist::lovo_mdist(x = x, preset = mdist_preset, dims = dims, keep_dist = keep_dist)

  } else if (mdist_type == "custom") {
    if (is.null(param_set)) stop("param_set is NULL for mdist_type = 'custom'")
    args <- param_set
    args$x <- x
    args$dims <- dims
    args$keep_dist <- keep_dist
    obj <- do.call(manydist::lovo_mdist, args = args)

  } else {
    stop("Unknown mdist_type: ", mdist_type)
  }

  obj
}
Code
simulation_structure <- generated_datasets |>
  dplyr::mutate(
    loo_results = purrr::pmap(
      dplyr::pick(dataset, method, mdist_type, mdist_preset, param_set),
      \(dataset, method, mdist_type, mdist_preset, param_set) {
        pb$tick()
        X <- if (method == "baseline") dataset$Xorig else dataset$X
        run_lovo(X, mdist_type, mdist_preset, param_set,legacy_gower_sum = (method == "gower"))
      }
    )
  )


 save(file="simulation_structure.RData", simulation_structure)
Code
reference_method = "baseline"

baseline_refs = simulation_structure |>
  filter(method == "baseline") |>
  select(replicate, baseline_loo = loo_results)


rel_simulation_structure <- simulation_structure |>
  dplyr::left_join(baseline_refs, by = "replicate") |>
  dplyr::mutate(
    loo_results = purrr::map2(.x = loo_results,.y = baseline_loo,
        .f = \(res_obj, ref_obj) {
        res <- res_obj$results
        ref <- ref_obj$results

        # join by variable to be safe (order might differ)
        dplyr::left_join(
          res,
          dplyr::select(ref, variable,
                        ref_mad_normalized = mad_normalized,
                        ref_ac_importance  = ac_importance),
          by = "variable"
        ) |>
          dplyr::mutate(
            mad_relative_importance = mad_normalized / ref_mad_normalized,
            ac_relative_importance  = ac_importance  / ref_ac_importance
          )
      }
    )
  ) |>
  select(-baseline_loo)

results_long <- rel_simulation_structure |>
  tidyr::unnest(loo_results) |>
  # enforce one value per replicate (if already unique, no change)
  dplyr::group_by(replicate, variable, method) |>
  dplyr::summarise(
    dplyr::across(
      c(mad_importance, cc_importance, ac_importance,
        mad_normalized, mad_relative_importance, ac_relative_importance),
      ~ mean(.x, na.rm = TRUE)
    ),
    .groups = "drop"
  )

results_summary <- results_long |>
  dplyr::group_by(variable, method) |>
  dplyr::summarise(
    dplyr::across(
      c(mad_importance, cc_importance, ac_importance,
        mad_normalized, mad_relative_importance, ac_relative_importance),
      list(
        mean = \(x) mean(x, na.rm = TRUE),
        sd   = \(x) sd(x,   na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    ),
    .groups = "drop"
  )

results_wide <- results_summary |>
  tidyr::pivot_wider(
    names_from = method,
    values_from = -c(variable, method)
  )

measure_names <- c(
  "mad_importance",
  "cc_importance",
  "ac_importance",
  "mad_normalized",
  "mad_relative_importance",
  "ac_relative_importance"
)

split_by_measure_mean <- purrr::map(
  measure_names,
  \(measure) {
    results_wide |>
      dplyr::select(variable, dplyr::starts_with(paste0(measure, "_mean_"))) |>
      dplyr::rename_with(
        ~ stringr::str_remove(., paste0(measure, "_mean_")),
        -variable
      )
  }
) |> rlang::set_names(measure_names)

split_by_measure_sd <- purrr::map(
  measure_names,
  \(measure) {
    results_wide |>
      dplyr::select(variable, dplyr::starts_with(paste0(measure, "_sd_"))) |>
      dplyr::rename_with(
        ~ stringr::str_remove(., paste0(measure, "_sd_")),
        -variable
      )
  }
) |> rlang::set_names(measure_names)  

Code for Figure 1

Code
meas_to_plot_mad_mean <- split_by_measure_mean |> pluck("mad_importance") |> 
  mutate(measure="mad_importance") |> slice(3:6,1,2) |> mutate(var_id=1:6)

meas_to_plot_mad_sd <- split_by_measure_sd |> pluck("mad_importance") |> 
  mutate(measure="mad_importance") |> slice(3:6,1,2) |> mutate(var_id=1:6)

mad_max <- max(meas_to_plot_mad_mean |> select(where(is.numeric), -var_id), na.rm = TRUE)

rect_data <- tibble(
  xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max,
  xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max
)  

meas_to_plot_mad_rel_mean <- split_by_measure_mean |> pluck("mad_normalized") |> 
  mutate(measure="mad_normalized") |> slice(3:6,1,2) |> mutate(var_id=1:6)

meas_to_plot_mad_rel_sd <- split_by_measure_sd |> pluck("mad_normalized") |> 
  mutate(measure="mad_normalized") |> slice(3:6,1,2) |> mutate(var_id=1:6)

mad_max_rel <- max(meas_to_plot_mad_rel_mean |> select(where(is.numeric), -var_id), na.rm = TRUE)

rect_data_rel <- tibble(
  xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max_rel,
  xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max_rel
)
  
loo_mad_plot <- meas_to_plot_mad_mean |> 
   canon_cols() |> 
  dplyr::select(variable, var_id, any_of(method_levels)) |> 
  pivot_longer(cols = naive:u_mix, names_to="method", values_to="value") |> 
  mutate(
    method = factor(method, levels = method_levels),
    additive = ifelse(method %in% c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix"),
                      "additive distances", "non-additive distances")
  ) |>
  dplyr::left_join(
    meas_to_plot_mad_sd |>
       canon_cols() |>
      dplyr::select(variable, var_id, any_of(method_levels)) |>
      pivot_longer(cols = naive:u_mix, names_to="method", values_to="sd") |>
      mutate(method = factor(method, levels = method_levels)),
    by = c("variable","var_id","method")
  ) |>
  ggplot(aes(x = var_id, y = value, color = method, shape = method)) +
  geom_rect(data=rect_data,
            aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
            alpha=.2, fill="indianred", color="lightgrey", inherit.aes=FALSE) +
  geom_rect(data=rect_data,
            aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
            alpha=.2, fill="dodgerblue", color="lightgrey", inherit.aes=FALSE) +
  geom_errorbar(aes(ymin = value - sd, ymax = value + sd), width = 0.12, linewidth = 0.35) +
  geom_point() + geom_line() +
  ylab("absolute variable contribution to distance") +
  xlab("") + theme_bw() + theme(legend.position = "none") +
  scale_x_continuous(breaks=seq(1,6,1), labels = rep("",6)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  scale_shape_manual(values = my_shapes) +
  scale_color_manual(values = my_colors) +
  facet_wrap(.~additive)

add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")

nonadd_methods <- c("naive","hl","gudmm","dkps")

# add_methods <- c("gower", "mod_gower", "hl_add", "u_ind", "u_dep", "u_mix")

legend_breaks <- c(
  c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix"),  # additive first
  c("naive","hl","gudmm","dkps")                           # non-additive after
)

legend_labels <- setNames(
  ifelse(legend_breaks %in% c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix"),
         paste0(legend_breaks, " (additive)"),
         paste0(legend_breaks, " (non-additive)")),
  legend_breaks
)

library(ggnewscale)

loo_mad_rel_df <- meas_to_plot_mad_rel_mean |>
   canon_cols() |>
  dplyr::select(variable, var_id, any_of(method_levels)) |>
  tidyr::pivot_longer(cols = naive:u_mix, names_to="method", values_to="value") |>
  mutate(
    method = factor(method, levels = c("naive","hl","hl_add","gower","gudmm","dkps","mod_gower","u_ind","u_dep","u_mix")),
    additive = ifelse(as.character(method) %in% add_methods, "additive", "non-additive")
  ) |>
  dplyr::left_join(
    meas_to_plot_mad_rel_sd |>
       canon_cols() |>
      dplyr::select(variable, var_id, any_of(method_levels)) |>
      tidyr::pivot_longer(cols = naive:u_mix, names_to="method", values_to="sd") |>
      mutate(method = factor(method, levels = c("naive","hl","hl_add","gower","gudmm","dkps","mod_gower","u_ind","u_dep","u_mix"))),
    by = c("variable","var_id","method")
  )

# loo_mad_rel_plot <-
#   ggplot() +
#   geom_rect(data=rect_data_rel,
#             aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
#             alpha=.2, fill="indianred", color="lightgrey", inherit.aes=FALSE) +
#   geom_rect(data=rect_data_rel,
#             aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
#             alpha=.2, fill="dodgerblue", color="lightgrey", inherit.aes=FALSE) +
# 
#   # --- Additive layer + legend 1 ---
#   # geom_errorbar(
#   #   data = dplyr::filter(loo_mad_rel_df, additive == "additive"),
#   #   aes(x = var_id, ymin = value - sd, ymax = value + sd, color = method),
#   #   width = 0.12, linewidth = 0.35
#   # ) +
#   geom_line(
#     data = dplyr::filter(loo_mad_rel_df, additive == "additive"),
#     aes(x = var_id, y = value, color = method)
#   ) +
#   geom_point(
#     data = dplyr::filter(loo_mad_rel_df, additive == "additive"),
#     aes(x = var_id, y = value, color = method, shape = method)
#   ) +
#   scale_color_manual(
#     name   = "additive",
#     values = my_colors,
#     breaks = add_methods
#   ) +
#   scale_shape_manual(
#     name   = "additive",
#     values = my_shapes,
#     breaks = add_methods
#   ) +
#   guides(
#   color = guide_legend(
#     order = 1,
#     override.aes = list(shape = my_shapes[add_methods])
#   ),
#   shape = "none"
# ) +
# 
#   ggnewscale::new_scale_color() +
#   ggnewscale::new_scale("shape") +
# 
#   # --- Non-additive layer + legend 2 ---
#   # geom_errorbar(
#   #   data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"),
#   #   aes(x = var_id, ymin = value - sd, ymax = value + sd, color = method),
#   #   width = 0.12, linewidth = 0.35
#   # ) +
#   geom_line(
#     data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"),
#     aes(x = var_id, y = value, color = method)
#   ) +
#   geom_point(
#     data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"),
#     aes(x = var_id, y = value, color = method, shape = method)
#   ) +
#   scale_color_manual(
#     name   = "non-additive",
#     values = my_colors,
#     breaks = nonadd_methods
#   ) +
#   scale_shape_manual(
#     name   = "non-additive",
#     values = my_shapes,
#     breaks = nonadd_methods
#   ) +
#   guides(
#   color = guide_legend(
#     order = 2,
#     override.aes = list(shape = my_shapes[nonadd_methods])
#   ),
#   shape = "none"
# ) +
# 
#   # cosmetics (same as yours)
#   ylab("relative variable contribution to distance") +
#   xlab("left-out variable") + theme_bw() +
#   scale_x_continuous(
#     breaks=seq(1,6,1),
#     labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2")
#   ) +
#   theme(
#     axis.text.x = element_text(angle = 60, hjust = 1),
#     strip.text = element_blank(),
#     legend.position = "bottom",
#     legend.title.position = "top",
#     legend.text = element_text(size = 13)
#     )+facet_wrap(.~additive)
add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkps")

loo_mad_rel_plot <-
  ggplot() +
  geom_rect(
    data = rect_data_rel,
    aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont),
    alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE
  ) +
  geom_rect(
    data = rect_data_rel,
    aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat),
    alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE
  ) +

  # --- Additive layer ---
  geom_line(
    data = dplyr::filter(loo_mad_rel_df, additive == "additive"),
    aes(x = var_id, y = value, color = method, linetype = method),
    linewidth = 0.5
  ) +
  geom_point(
    data = dplyr::filter(loo_mad_rel_df, additive == "additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "additive",
    values = my_colors,
    breaks = add_methods
  ) +
  scale_shape_manual(
    name   = "additive",
    values = my_shapes,
    breaks = add_methods
  ) +
  scale_linetype_manual(
    name   = "additive",
    values = setNames(rep("solid", length(add_methods)), add_methods),
    breaks = add_methods
  ) +
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 1),
    linetype = guide_legend(order = 1)
  ) +

  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  ggnewscale::new_scale("linetype") +

  # --- Non-additive layer ---
  geom_line(
    data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"),
    aes(x = var_id, y = value, color = method, linetype = method),
    linewidth = 0.5
  ) +
  geom_point(
    data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "non-additive",
    values = my_colors,
    breaks = nonadd_methods
  ) +
  scale_shape_manual(
    name   = "non-additive",
    values = my_shapes,
    breaks = nonadd_methods
  ) +
  scale_linetype_manual(
    name   = "non-additive",
    values = setNames(rep("solid", length(nonadd_methods)), nonadd_methods),
    breaks = nonadd_methods
  ) +
  guides(
    color = guide_legend(order = 2),
    shape = guide_legend(order = 2),
    linetype = guide_legend(order = 2)
  ) +

  ylab("relative variable contribution to distance") +
  xlab("left-out variable") +
  theme_bw() +
  scale_x_continuous(
    breaks = seq(1, 6, 1),
    labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2")
  ) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    strip.text = element_blank(),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  ) +
  facet_wrap(. ~ additive)
fig_1_plot = loo_mad_plot / loo_mad_rel_plot

fig_1_plot

Code for Figure 2

Code
library(ggnewscale)

add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkps")

meas_to_plot_ac_mean <- split_by_measure_mean |> 
  pluck("ac_relative_importance") |> 
  mutate(measure = "ac_relative_importance") |> 
  slice(3:6, 1, 2) |> 
  mutate(var_id = 1:6)

meas_to_plot_ac_sd <- split_by_measure_sd |> 
  pluck("ac_relative_importance") |> 
  mutate(measure = "ac_relative_importance") |> 
  slice(3:6, 1, 2) |> 
  mutate(var_id = 1:6)

ac_max <- max(meas_to_plot_ac_mean |> select(where(is.numeric), -var_id), na.rm = TRUE)

rect_data_ac <- tibble(
  xmin_cont = 4.5, xmax_cont = 6, ymin_cont = 0, ymax_cont = ac_max,
  xmin_cat  = .5,  xmax_cat  = 4.5, ymin_cat  = 0, ymax_cat  = ac_max
)

ac_df <- meas_to_plot_ac_mean |>
  canon_cols() |>
  dplyr::select(variable, var_id, any_of(method_levels)) |>
  tidyr::pivot_longer(cols = naive:u_mix, names_to = "method", values_to = "value") |>
  mutate(
    method = factor(
      method,
      levels = c("naive","hl","hl_add","gower","gudmm","dkps","mod_gower","u_ind","u_dep","u_mix")
    ),
    additive = ifelse(as.character(method) %in% add_methods, "additive", "non-additive")
  ) |>
  dplyr::left_join(
    meas_to_plot_ac_sd |>
      canon_cols() |>
      dplyr::select(variable, var_id, any_of(method_levels)) |>
      tidyr::pivot_longer(cols = naive:u_mix, names_to = "method", values_to = "sd") |>
      mutate(
        method = factor(
          method,
          levels = c("naive","hl","hl_add","gower","gudmm","dkps","mod_gower","u_ind","u_dep","u_mix")
        )
      ),
    by = c("variable", "var_id", "method")
  )

ac_plot <-
  ggplot() +
  geom_rect(
    data = rect_data_ac,
    aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont),
    alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE
  ) +
  geom_rect(
    data = rect_data_ac,
    aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat),
    alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE
  ) +

  # --- Additive block ---
  geom_line(
    data = dplyr::filter(ac_df, additive == "additive"),
    aes(x = var_id, y = value, color = method, linetype = method),
    linewidth = 0.5
  ) +
  geom_point(
    data = dplyr::filter(ac_df, additive == "additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "additive",
    values = my_colors,
    breaks = add_methods
  ) +
  scale_shape_manual(
    name   = "additive",
    values = my_shapes,
    breaks = add_methods
  ) +
  scale_linetype_manual(
    name   = "additive",
    values = setNames(rep("solid", length(add_methods)), add_methods),
    breaks = add_methods
  ) +
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 1),
    linetype = guide_legend(order = 1)
  ) +

  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  ggnewscale::new_scale("linetype") +

  # --- Non-additive block ---
  geom_line(
    data = dplyr::filter(ac_df, additive == "non-additive"),
    aes(x = var_id, y = value, color = method, linetype = method),
    linewidth = 0.5
  ) +
  geom_point(
    data = dplyr::filter(ac_df, additive == "non-additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "non-additive",
    values = my_colors,
    breaks = nonadd_methods
  ) +
  scale_shape_manual(
    name   = "non-additive",
    values = my_shapes,
    breaks = nonadd_methods
  ) +
  scale_linetype_manual(
    name   = "non-additive",
    values = setNames(rep("solid", length(nonadd_methods)), nonadd_methods),
    breaks = nonadd_methods
  ) +
  guides(
    color = guide_legend(order = 2),
    shape = guide_legend(order = 2),
    linetype = guide_legend(order = 2)
  ) +

  ylab("relative alienation coefficient") +
  xlab("left-out variable") +
  scale_x_continuous(
    breaks = seq(1, 6, 1),
    labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2")
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  ) +
  facet_wrap(. ~ additive)

ac_plot

Retrieval of the true configuration

Data generation

Code
recovery_structure <- tidyr::crossing(
  tibble::tibble(replicate = 1:reps),
  method = methods,
  categories = qoptions
) |>
  dplyr::left_join(manydist:::mdist_method_lookup, by = "method") |>
  dplyr::mutate(
    dataset = purrr::map2(
      .x = replicate, .y = categories,
      ~ generate_dataset(
        n = n, porig = porig, pn = pn,
        pnnoise = pnnoise, pcnoise = pcnoise,
        sigma = sigma, qoptions = .y, mode = "shared",
        seed = 1234 + .x
      )
    ),
    # compute distance matrix using method specs + baseline uses Xorig
    distance_matrix = purrr::pmap(
      dplyr::pick(dataset, method, mdist_type, mdist_preset, param_set),
      \(dataset, method, mdist_type, mdist_preset, param_set) {
        X <- if (method == "baseline") dataset$Xorig else dataset$X

        run_mdist_within(
          df = X,
          mdist_type = mdist_type,
          mdist_preset = mdist_preset,
          param_set = param_set,
          outcome = NULL   # because X / Xorig have no response column
        ) |>
          base::as.matrix()
      }
    ),

    mds_results = purrr::map(
      distance_matrix,
      ~ cmdscale(.x, eig = TRUE, k = 2)$points |>
        as.data.frame() |>
        setNames(c("x1", "x2"))
    ),

    congruence_coeff = purrr::map2(
      .x = mds_results, .y = dataset,
      \(mds, ds) congruence_coeff(ds$truth, mds)
    )
  )

# save(file="recovery_structure.RData", recovery_structure)
recovery_structure_lite = recovery_structure |>
  select(-dataset, -distance_matrix, -mds_results) |>
  unnest(congruence_coeff) |>
  select(replicate, method, categories, congruence_coeff) |> 
  mutate(categories=factor(paste0(categories," categories")),
         alienation_coeff = sqrt(1-congruence_coeff^2))

save(file="recovery_structure_lite.RData", recovery_structure_lite)

Code for figure Figure 3

Code
library(dplyr)
library(ggplot2)
library(ggnewscale)

# n_add   <- length(add_methods)

add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkps")
method_order <- c(add_methods, nonadd_methods, "baseline")

n_add    <- length(add_methods)
n_nonadd <- length(nonadd_methods)

n_total <- length(method_order)

rect_bg <- tibble::tibble(
  xmin = c(0.5, n_add + 0.5),
  xmax = c(n_add + 0.5, n_add + n_nonadd + 0.5),
  ymin = c(-Inf, -Inf),
  ymax = c( Inf,  Inf),
  grp  = c("additive", "non-additive")
)
# --- method groups ---


# x-axis order: additive first, then non-additive, baseline last

# --- prepare data ---
box_df <- recovery_structure_lite |>
  mutate(
    method = ifelse(is.na(method), "baseline", method),
    method = canon_method(method),  # paper naming
    method = factor(method, levels = method_order),
    additive = ifelse(as.character(method) %in% add_methods, "additive", "non-additive")
  )

# --- plot ---
boxplot_figure_simu <-
  ggplot(box_df, aes(x = method, y = alienation_coeff)) +
  geom_rect(
  data = dplyr::filter(rect_bg, grp == "additive"),
  aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
  inherit.aes = FALSE,
  alpha = 0.20, fill = "dodgerblue", color = "lightgrey"
) +
geom_rect(
  data = dplyr::filter(rect_bg, grp == "non-additive"),
  aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
  inherit.aes = FALSE,
  alpha = 0.20, fill = "indianred", color = "lightgrey"
) +
  facet_wrap(~ categories) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  ) +
  ylab("alienation coefficient") +
  xlab("methods") +

  # --- additive boxes + legend 1 (left) ---
  geom_boxplot(
    data = dplyr::filter(box_df, additive == "additive"),
    aes(fill = method),
    outlier.alpha = 0.25,
    linewidth = 0.35
  ) +
  scale_fill_manual(
    name   = "additive",
    values = my_colors,
    breaks = add_methods
  ) +
  guides(fill = guide_legend(order = 1)) +

  ggnewscale::new_scale_fill() +

  # --- non-additive boxes (incl baseline) + legend 2 (right) ---
  geom_boxplot(
    data = dplyr::filter(box_df, additive == "non-additive"),
    aes(fill = method),
    outlier.alpha = 0.25,
    linewidth = 0.35
  ) +
  scale_fill_manual(
    name   = "non-additive",
    values = my_colors,
    breaks = c(nonadd_methods, "baseline")  # baseline included here; appears last in x-axis
  ) +
  guides(fill = guide_legend(order = 2))

boxplot_figure_simu

Clustering experiment: partitioning around medoids

numsep =.1 and catsep=0.35 generation

Code
nrep <- 100
k_true <- 4
q <- 9
numsep <- 0.1
catsep <- 0.35
clustSizeEq <- 50

param_generator_grid <- tibble::tibble(
  numsignal = c(0,0,4,4,4,8,8,8),
  numnoise  = c(8,8,4,4,4,0,0,0),
  catsignal = c(4,8,0,4,8,0,4,8),
  catnoise  = c(4,0,8,4,0,8,4,0)
) |>
  tidyr::crossing(replicate = 1:nrep)

mixed_datasets_grid <- param_generator_grid |>
  dplyr::mutate(
    data = purrr::pmap(
      dplyr::pick(numsignal, catsignal, numnoise, catnoise, replicate),
      \(numsignal, catsignal, numnoise, catnoise, replicate) {
        gen_mixed(
          k_true      = k_true,
          clustSizeEq = clustSizeEq,
          numsignal   = numsignal,
          numnoise    = numnoise,
          catsignal   = catsignal,
          catnoise    = catnoise,
          q           = q,
          q_err       = q,
          numsep      = numsep,
          catsep      = catsep,
          seed        = replicate
        )
      }
    )
  )

methods <- c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")

sim_methods_grid <- mixed_datasets_grid |>
  tidyr::crossing(method = methods) |>
  dplyr::left_join(manydist:::mdist_method_lookup, by = "method")

stopifnot(!any(is.na(sim_methods_grid$mdist_type)))

ari_pam_results_01_035 <- sim_methods_grid |>
  dplyr::mutate(
    ari = purrr::pmap_dbl(
      dplyr::pick(data, mdist_type, mdist_preset, param_set),
      \(data, mdist_type, mdist_preset, param_set) {
        df <- data$df
        D <- run_mdist_within(df, mdist_type, mdist_preset, param_set, outcome = "y")
        pam_fit <- cluster::pam(D, k = k_true, diss = TRUE)
        mclust::adjustedRandIndex(df$y, pam_fit$clustering)
      }
    )
  )
Code
# Method groups (as you provided)
add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkps")
method_levels  <- c(add_methods, nonadd_methods)

# Facet order (row-wise)
scenario_order <- c("0844","0880","4408","8008","4444","4480","8044","8080")

ari_plot_df_01_035 <- ari_pam_results_01_035 |>
  mutate(
    # Used only to match facet order; preserves leading zeros
    scenario_id = sprintf("%01d%01d%01d%01d", numsignal, numnoise, catsignal, catnoise),

    scenario = str_glue(
      "num: {numsignal} s. + {numnoise} n. | cat: {catsignal} s. + {catnoise} n."
    ),
method = canon_method(method),
    # Row-wise facet order as provided
    scenario = factor(scenario, levels = scenario[match(scenario_order, scenario_id)]),

    # Method order
    method = factor(method, levels = method_levels),

    # Additive / Non-additive label
    method_type = if_else(as.character(method) %in% add_methods, "Additive", "Non-additive")
  )

# Background bands (no fill scale consumed)
k_add <- length(add_methods)
k_tot <- length(method_levels)

bg_add <- tibble(
  xmin = 0.5, xmax = k_add + 0.5,
  ymin = -Inf, ymax = Inf
)
bg_nonadd <- tibble(
  xmin = k_add + 0.5, xmax = k_tot + 0.5,
  ymin = -Inf, ymax = Inf
)

ari_plot_01_035 <- ggplot() +
  # Shaded regions
  geom_rect(
    data = bg_add,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    inherit.aes = FALSE,
    fill = "indianred",
    alpha = 0.12,
    show.legend = FALSE
  ) +
  geom_rect(
    data = bg_nonadd,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    inherit.aes = FALSE,
    fill = "dodgerblue",
    alpha = 0.12,
    show.legend = FALSE
  ) +

  # Additive boxplots + legend block
  geom_boxplot(
    data = ari_plot_df_01_035 |> filter(method_type == "Additive"),
    aes(x = method, y = ari, fill = method),
    outlier.alpha = 0.25,
    linewidth = 0.35
  ) +
  # Additive legend (comes first)
scale_fill_manual(
  name   = "Additive",
  values = my_colors[add_methods],
  breaks = add_methods,
  drop   = FALSE,
  guide  = guide_legend(order = 1)
) +

  ggnewscale::new_scale_fill() +

  # Non-additive boxplots + legend block
  geom_boxplot(
    data = ari_plot_df_01_035 |> filter(method_type == "Non-additive"),
    aes(x = method, y = ari, fill = method),
    outlier.alpha = 0.25,
    linewidth = 0.35
  ) +
 # Non-additive legend (comes second)
scale_fill_manual(
  name   = "Non-additive",
  values = my_colors[nonadd_methods],
  breaks = nonadd_methods,
  drop   = FALSE,
  guide  = guide_legend(order = 2)
) +
  facet_wrap(~ scenario, ncol = 2) +
  coord_cartesian(ylim = c(0, 1)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    strip.text  = element_text(size = 9),
    legend.text = element_text(size = 13)
  ) +
  labs(x = "", y = "ARI (PAM on dissimilarities)")

print(ari_plot_01_035)

Code
# ---- method groups ----
add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkps")

# ---- scenario order (shared with boxplot figure) ----
scenario_order <- c("0844","0880","4408","8008","4444","4480","8044","8080")

ari_parallel_df <- ari_pam_results_01_035 |>
  mutate(
    # unify method naming
    method = canon_method(method), 

    # key used for ordering
    scenario_id = sprintf(
      "%01d%01d%01d%01d",
      numsignal, numnoise, catsignal, catnoise
    ),

    # label shown on the x-axis
    scenario = stringr::str_glue(
      "n:({numsignal},{numnoise}); c:({catsignal},{catnoise})"
    )
  ) |>
  group_by(scenario_id, scenario, method) |>
  summarise(
    mean_ari = mean(ari, na.rm = TRUE),
    sd_ari   = sd(ari,   na.rm = TRUE),
    .groups  = "drop"
  ) |>
  mutate(
    # enforce scenario order explicitly
    scenario = factor(
      scenario,
      levels = scenario[match(scenario_order, scenario_id)]
    ),

    type = ifelse(method %in% add_methods, "additive", "non-additive"),
    method = factor(method, levels = c(add_methods, nonadd_methods))
  )

Code for Figure 4

Code
ari_parallel_plot <-
  ggplot() +

  # ================= ADDITIVE =================
  geom_errorbar(
    data = dplyr::filter(ari_parallel_df, type == "additive"),
    aes(
      x = scenario,
      ymin = mean_ari - sd_ari,
      ymax = mean_ari + sd_ari,
      color = method
    ),
    width = 0.12,
    linewidth = 0.35
  ) +
  geom_line(
    data = dplyr::filter(ari_parallel_df, type == "additive"),
    aes(
      x = scenario,
      y = mean_ari,
      color = method,
      shape = method,
      linetype = method,
      group = method
    ),
    linewidth = 0.6,
    alpha = .75
  ) +
  geom_point(
    data = dplyr::filter(ari_parallel_df, type == "additive"),
    aes(x = scenario, y = mean_ari, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name = "additive",
    values = my_colors,
    breaks = add_methods
  ) +
  scale_shape_manual(
    name = "additive",
    values = my_shapes,
    breaks = add_methods
  ) +
  scale_linetype_manual(
    name = "additive",
    values = setNames(rep("solid", length(add_methods)), add_methods),
    breaks = add_methods
  ) +
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 1),
    linetype = guide_legend(order = 1)
  ) +

  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  ggnewscale::new_scale("linetype") +

  # ================= NON-ADDITIVE =================
  geom_errorbar(
    data = dplyr::filter(ari_parallel_df, type == "non-additive"),
    aes(
      x = scenario,
      ymin = mean_ari - sd_ari,
      ymax = mean_ari + sd_ari,
      color = method
    ),
    width = 0.12,
    linewidth = 0.35
  ) +
  geom_line(
    data = dplyr::filter(ari_parallel_df, type == "non-additive"),
    aes(
      x = scenario,
      y = mean_ari,
      color = method,
      shape = method,
      linetype = method,
      group = method
    ),
    linewidth = 0.6,
    alpha = .75
  ) +
  geom_point(
    data = dplyr::filter(ari_parallel_df, type == "non-additive"),
    aes(x = scenario, y = mean_ari, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name = "non-additive",
    values = my_colors,
    breaks = nonadd_methods
  ) +
  scale_shape_manual(
    name = "non-additive",
    values = my_shapes,
    breaks = nonadd_methods
  ) +
  scale_linetype_manual(
    name = "non-additive",
    values = setNames(rep("solid", length(nonadd_methods)), nonadd_methods),
    breaks = nonadd_methods
  ) +
  guides(
    color = guide_legend(order = 2),
    shape = guide_legend(order = 2),
    linetype = guide_legend(order = 2)
  ) +

  labs(x = "scenario", y = "mean ARI (± 1 SD)") +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 30, hjust = 1),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  )

ari_parallel_plot

Code
ggsave(
  filename = "figure_parallel_ari.pdf",
  plot     = ari_parallel_plot,
  width    = 18,
  height   = 11,
  units    = "cm"
)
Code
load(file = "ari_pam_experiment_numsep_01_catsep_035.RData")

add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkps")

scenario_order <- c("0844","0880","4408","8008","4444","4480","8044","8080")

ari_rank_df <- ari_pam_results_01_035 |>
  mutate(
    method = canon_method(method),

    scenario_id = sprintf("%01d%01d%01d%01d", numsignal, numnoise, catsignal, catnoise),
    scenario = stringr::str_glue("n:({numsignal},{numnoise}); c:({catsignal},{catnoise})")
  ) |>
  group_by(scenario_id, scenario, method) |>
  summarise(
    mean_ari = mean(ari, na.rm = TRUE),
    .groups  = "drop"
  ) |>
  group_by(scenario_id, scenario) |>
  mutate(
    rank_desc = dplyr::dense_rank(dplyr::desc(mean_ari))  # 1 = best
  ) |>
  ungroup() |>
  mutate(
    # enforce scenario order (same as boxplot panels)
    scenario = factor(scenario, levels = scenario[match(scenario_order, scenario_id)]),

    type = ifelse(method %in% add_methods, "additive", "non-additive"),
    method = factor(method, levels = c(add_methods, nonadd_methods))
  )

Illustration: FIFA player data

Data preparation and analysis

Code
# Variables are organized: Categorical first, then numerical

data("fifa_nl", package = "manydist")
#Xorig<-X<-ddata[,-c(1,2)]
X<-fifa_nl[,-c(1,2,19:24)] # vars 1 and 2 are a bit useless (too many categories)
# vars 19:24 interesting but same scale and little variation
# leaving them out makes the plots a bit smaller.
# leaving them in doesn't seem to change a lot
n<-nrow(X)
nd<-2 # Dimensionality of MDS solution
X=X[,c(8:1,9:16)]  # Order
X=X[,-3]  # Remove int rep: 380 have something, 21+7 in other cats

X=X[,c(1:12,14,13,15)]
p<-ncol(X)
X[8:15] <- lapply(X[8:15], as.numeric)
fifa_clean <- as_tibble(X)  |> 
  select(where(is.integer), where(is.numeric),where(is.factor) ,-release_clause_eur) |> 
  rename(pref_foot=`preferred_foot`,
         club=`club_name`,
         position=`team_position`)
Code
fifa_structure <- tibble::tibble(
  method = c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")
) |>
  dplyr::left_join(manydist:::mdist_method_lookup, by = "method") |>
  dplyr::mutate(
    loo_results = purrr::pmap(
      dplyr::pick(method, mdist_type, mdist_preset, param_set),
      \(method, mdist_type, mdist_preset, param_set) {
        run_lovo(
          df = fifa_clean,
          mdist_type = mdist_type,
          mdist_preset = mdist_preset,
          param_set = param_set,
          outcome = NULL,
          dims = nd,
          keep_dist = FALSE,
          legacy_gower_sum = (method == "gower")  # triggers only for preset+gower anyway
        )
      }
    )
  )
Code
results_fifa_wide <- fifa_structure |>
  dplyr::mutate(loo_tbl = purrr::map(loo_results, "results")) |>
  dplyr::select(method, loo_tbl) |>
  tidyr::unnest(loo_tbl) |>
  tidyr::pivot_wider(
    id_cols    = variable,
    names_from = method,
    values_from = c(mad_importance, cc_importance, ac_importance, mad_normalized)
  )


measure_names <- c(
  "mad_importance",
  "cc_importance",
  "ac_importance",
  "mad_normalized"
)

fifa_split_by_measure <- purrr::map(
  measure_names,
  \(measure) {
    results_fifa_wide |>
      dplyr::select(variable, dplyr::starts_with(paste0(measure, "_"))) |>
      dplyr::rename_with(~ stringr::str_remove(.x, paste0(measure, "_")))
  }
) |>
  rlang::set_names(measure_names)

Code for Figure 5

Code
meas_to_plot_mad = fifa_split_by_measure |> pluck("mad_importance") |>
  mutate(measure = "mad_importances") |> slice(c(8:14, 1:7)) |> mutate(var_id = 1:n())

mad_max = max(meas_to_plot_mad |> select(where(is.numeric), -var_id))

rect_data = tibble(
  xmin_cont = 8.5, xmax_cont = 14, ymin_cont = 0, ymax_cont = mad_max,
  xmin_cat  = .5,  xmax_cat  = 8.5, ymin_cat  = 0, ymax_cat  = mad_max
)

meas_to_plot_mad_rel = fifa_split_by_measure |> pluck("mad_normalized") |>
  mutate(measure = "mad_normalized") |> slice(c(8:14, 1:7)) |> mutate(var_id = 1:n())

mad_max_rel = max(meas_to_plot_mad_rel |> select(where(is.numeric), -var_id))

rect_data_rel = tibble(
  xmin_cont = 8.5, xmax_cont = 14, ymin_cont = 0, ymax_cont = mad_max_rel,
  xmin_cat  = .5,  xmax_cat  = 8.5, ymin_cat  = 0, ymax_cat  = mad_max_rel
)

method_levels <- c("naive","hl","hl_add","gower","gudmm","dkps","mod_gower","u_ind","u_dep","u_mix")
additive_set  <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
conflict_prefer("intersect", "base")
conflict_prefer("setdiff", "base")
add_present    <- intersect(additive_set, method_levels)
nonadd_present <- setdiff(method_levels, additive_set)

# ---------- TOP: absolute contributions ----------
loo_mad_long <- meas_to_plot_mad |>
  canon_cols() |>
  pivot_longer(
    cols = any_of(method_levels),
    names_to = "method",
    values_to = "value"
  ) |>
  mutate(
    method   = factor(method, levels = method_levels),
    additive = ifelse(method %in% additive_set, "additive distances", "non-additive distances"),
    type     = ifelse(method %in% additive_set, "Additive", "Non-additive")
  )

loo_mad_plot <-
  ggplot() +
  geom_rect(
    data = rect_data,
    aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont),
    alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE
  ) +
  geom_rect(
    data = rect_data,
    aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat),
    alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE
  ) +
  # Additive
  geom_line(
    data = filter(loo_mad_long, type == "Additive"),
    aes(x = var_id, y = value, color = method, linetype = method, group = method),
    alpha = 0.85
  ) +
  geom_point(
    data = filter(loo_mad_long, type == "Additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "Additive",
    values = my_colors,
    breaks = add_present,
    drop   = FALSE
  ) +
  scale_shape_manual(
    name   = "Additive",
    values = my_shapes,
    breaks = add_present,
    drop   = FALSE
  ) +
  scale_linetype_manual(
    name   = "Additive",
    values = setNames(rep("solid", length(add_present)), add_present),
    breaks = add_present,
    drop   = FALSE
  ) +
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 1),
    linetype = guide_legend(order = 1)
  ) +
  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  ggnewscale::new_scale("linetype") +
  # Non-additive
  geom_line(
    data = filter(loo_mad_long, type == "Non-additive"),
    aes(x = var_id, y = value, color = method, linetype = method, group = method),
    alpha = 0.85
  ) +
  geom_point(
    data = filter(loo_mad_long, type == "Non-additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "Non-additive",
    values = my_colors,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  scale_shape_manual(
    name   = "Non-additive",
    values = my_shapes,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  scale_linetype_manual(
    name   = "Non-additive",
    values = setNames(rep("solid", length(nonadd_present)), nonadd_present),
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  guides(
    color = guide_legend(order = 2),
    shape = guide_legend(order = 2),
    linetype = guide_legend(order = 2)
  ) +
  ylab("absolute variable contribution to distance") +
  theme_bw() +
  scale_x_continuous(breaks = seq(1, 14, 1), labels = rep("", 14)) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "none"
  ) +
  xlab("") +
  facet_wrap(. ~ additive)

# ---------- BOTTOM: relative contributions ----------
loo_mad_rel_long <- meas_to_plot_mad_rel |>
  canon_cols() |>
  pivot_longer(
    cols = any_of(method_levels),
    names_to = "method",
    values_to = "value"
  ) |>
  mutate(
    method   = factor(method, levels = method_levels),
    additive = ifelse(method %in% additive_set, "additive distances", "non-additive distances"),
    type     = ifelse(method %in% additive_set, "Additive", "Non-additive")
  )

loo_mad_rel_plot <-
  ggplot() +
  geom_rect(
    data = rect_data_rel,
    aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont),
    alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE
  ) +
  geom_rect(
    data = rect_data_rel,
    aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat),
    alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE
  ) +
  # Additive
  geom_line(
    data = filter(loo_mad_rel_long, type == "Additive"),
    aes(x = var_id, y = value, color = method, linetype = method, group = method),
    alpha = 0.85
  ) +
  geom_point(
    data = filter(loo_mad_rel_long, type == "Additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "Additive",
    values = my_colors,
    breaks = add_present,
    drop   = FALSE
  ) +
  scale_shape_manual(
    name   = "Additive",
    values = my_shapes,
    breaks = add_present,
    drop   = FALSE
  ) +
  scale_linetype_manual(
    name   = "Additive",
    values = setNames(rep("solid", length(add_present)), add_present),
    breaks = add_present,
    drop   = FALSE
  ) +
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 1),
    linetype = guide_legend(order = 1)
  ) +
  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  ggnewscale::new_scale("linetype") +
  # Non-additive
  geom_line(
    data = filter(loo_mad_rel_long, type == "Non-additive"),
    aes(x = var_id, y = value, color = method, linetype = method, group = method),
    alpha = 0.85
  ) +
  geom_point(
    data = filter(loo_mad_rel_long, type == "Non-additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "Non-additive",
    values = my_colors,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  scale_shape_manual(
    name   = "Non-additive",
    values = my_shapes,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  scale_linetype_manual(
    name   = "Non-additive",
    values = setNames(rep("solid", length(nonadd_present)), nonadd_present),
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  guides(
    color = guide_legend(order = 2),
    shape = guide_legend(order = 2),
    linetype = guide_legend(order = 2)
  ) +
  ylab("relative variable contribution to distance") +
  theme_bw() +
  scale_x_continuous(breaks = seq(1, 14, 1), labels = meas_to_plot_mad_rel$variable) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    strip.text  = element_blank(),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  ) +
  xlab("") +
  facet_wrap(. ~ additive)

# ---------- COMBINE ----------
fig_4_plot <- loo_mad_plot / loo_mad_rel_plot
fig_4_plot

Code for Figure 6

Code
method_levels <- c("naive","hl","hl_add","gower","gudmm","dkps","mod_gower","u_ind","u_dep","u_mix")
additive_set  <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")

meas_to_plot_ac <- fifa_split_by_measure |>
  purrr::pluck("ac_importance") |>
  dplyr::mutate(measure = "ac_importance") |>
  dplyr::slice(c(8:14, 1:7)) |>
  dplyr::mutate(var_id = dplyr::row_number())

max_ac <- meas_to_plot_ac |>
  dplyr::select(where(is.numeric), -var_id) |>
  unlist(use.names = FALSE) |>
  max(na.rm = TRUE)

rect_data_ac <- tibble::tibble(
  xmin_cont = 7.5, xmax_cont = 14,  ymin_cont = 0, ymax_cont = max_ac,
  xmin_cat  = 0.5, xmax_cat  = 7.5, ymin_cat  = 0, ymax_cat  = max_ac
)

ac_long <- meas_to_plot_ac |>
  canon_cols() |>
  tidyr::pivot_longer(
    cols = dplyr::any_of(method_levels),
    names_to = "method",
    values_to = "value"
  ) |>
  dplyr::mutate(
    method   = factor(method, levels = method_levels),
    additive = ifelse(method %in% additive_set, "additive distances", "non-additive distances"),
    type     = ifelse(method %in% additive_set, "Additive", "Non-additive")
  )

add_present    <- intersect(additive_set, unique(as.character(ac_long$method)))
nonadd_present <- intersect(setdiff(method_levels, additive_set), unique(as.character(ac_long$method)))

ac_plot <-
  ggplot2::ggplot() +
  ggplot2::geom_rect(
    data = rect_data_ac,
    ggplot2::aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont),
    alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE
  ) +
  ggplot2::geom_rect(
    data = rect_data_ac,
    ggplot2::aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat),
    alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE
  ) +

  # ================= ADDITIVE =================
  ggplot2::geom_line(
    data = dplyr::filter(ac_long, type == "Additive"),
    ggplot2::aes(x = var_id, y = value, color = method, linetype = method, group = method),
    alpha = 0.85
  ) +
  ggplot2::geom_point(
    data = dplyr::filter(ac_long, type == "Additive"),
    ggplot2::aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  ggplot2::scale_color_manual(
    name   = "Additive",
    values = my_colors,
    breaks = add_present,
    drop   = FALSE
  ) +
  ggplot2::scale_shape_manual(
    name   = "Additive",
    values = my_shapes,
    breaks = add_present,
    drop   = FALSE
  ) +
  ggplot2::scale_linetype_manual(
    name   = "Additive",
    values = stats::setNames(rep("solid", length(add_present)), add_present),
    breaks = add_present,
    drop   = FALSE
  ) +
  ggplot2::guides(
    color = ggplot2::guide_legend(order = 1),
    shape = ggplot2::guide_legend(order = 1),
    linetype = ggplot2::guide_legend(order = 1)
  ) +

  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  ggnewscale::new_scale("linetype") +

  # ================= NON-ADDITIVE =================
  ggplot2::geom_line(
    data = dplyr::filter(ac_long, type == "Non-additive"),
    ggplot2::aes(x = var_id, y = value, color = method, linetype = method, group = method),
    alpha = 0.85
  ) +
  ggplot2::geom_point(
    data = dplyr::filter(ac_long, type == "Non-additive"),
    ggplot2::aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  ggplot2::scale_color_manual(
    name   = "Non-additive",
    values = my_colors,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  ggplot2::scale_shape_manual(
    name   = "Non-additive",
    values = my_shapes,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  ggplot2::scale_linetype_manual(
    name   = "Non-additive",
    values = stats::setNames(rep("solid", length(nonadd_present)), nonadd_present),
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  ggplot2::guides(
    color = ggplot2::guide_legend(order = 2),
    shape = ggplot2::guide_legend(order = 2),
    linetype = ggplot2::guide_legend(order = 2)
  ) +

  ggplot2::ylab("alienation coefficient") +
  ggplot2::theme_bw() +
  ggplot2::scale_x_continuous(
    breaks = seq(1, 14, 1),
    labels = meas_to_plot_ac$variable
  ) +
  ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 60, hjust = 1),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = ggplot2::element_text(size = 13)
  ) +
  ggplot2::xlab("") +
  ggplot2::facet_wrap(. ~ additive)

ac_plot

Appendix D Categorical mean distances: skewed distributions

Code
LeHo<-function(X){  #  Returns the LeHo Distance (symmetric Kullback-Leibler)
                    #  X is matrix with conditional probabilities
#if(any(rowSums(X)!=1)){
#  print("Input must be conditional probs (row sums should be 1)")
#} else {
  
require(philentropy)

n<-nrow(X)
D<-matrix(0,n,n)
for (i in 1:(n-1)){
  for (j in (i+1):n){
    D[j,i]<-D[i,j]<-distance(X[c(i,j),],"kullback-leibler",unit="log2")+
      distance(X[c(j,i),],"kullback-leibler",unit="log2")}

}

return(D)
}

Data generation

Code
# Let's consider some specific unbalanced scenarios
# One probility is d. The others are (1-d)/(q-1)
# We consider skewed options:
dis<-c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)

#qi=5
qs<-c(2,3,5,10)#,18) # Some options for qi

nint<-length(dis)    # Number of options for d (nint-1)
rsm<-rsma<-rhl<-sdsm<-rse<-rsof<-rsiof<-sdse<-rsa<-sdsa<-sdsof<-sdsiof<-sddsma<-sdssm<-
  sddse<-sddsa<-sddsm<-sdhl<-sddsmca<-sddsof<-sddsiof<-sddshl<- sdshl<-
  rsmca<-sdsmca<-esm<-em<-ee<-ea<-ehl<-emca<-eof<-eiof<-matrix(NA,nrow=nint,ncol=length(qs))

#par(mfrow=c(2,2))
for (qis in 1:length(qs)){
  #ds<-NULL
  qi<-qs[qis]
  for (i in (1:nint)){ # variant 2

    Deltam<-matrix(1,qi,qi)    # Matching
    diag(Deltam)<-rep(0,qi)    # Matching Delta

    #d=i/(2*qi)  # i=2 corresponds to equal probs if variant 1 is chosen. (Only works for one qi. Not for qs loop)
    d=dis[i]  # variant 2

    #ds<-rbind(ds,d)
    p<-c(d,rep((1-d)/(qi-1),(qi-1)))
    sum(p)

    Dp<-diag(p)
    pp<-p %*% t(p)
    spi<-(diag(Dp-pp))^-1
    spi5<-(diag(Dp-pp))^-.5

    DEsk<-(2/(qi^2))*Deltam # Delta Eskin

    DOF<-log(p)%*%t(log(p)) * Deltam  # OF
    n=160 # For inv OF we need n. Soon gets very large (increases in n) 160 is sufficient
    #n<-ceiling(1/min(p))
    DIOF<-log(n*p)%*%t(log(n*p)) * Deltam  # inverse OF. If some np<1 we can get negatives

    n=160  # For HL we need to find a way to get somewhat consistent values. Let's choose n large
    marginals=p*n

    HLemp<-distancefactor(cat=qi,n=n,catsizes=marginals) #
    DHL<-Deltam*HLemp*2#*sqrt(2)  # NOTE 13/09: Corrected: Was sqrt(2). But, should be, see paper, 2x
    #DE<-sqrt((1/qi)*(spi %*% t(rep(1,qi)) + t(spi %*% t(rep(1,qi)))))
    #diag(DE)<-rep(0,qi)

    DM<-sqrt(1/qi)*(spi5 %*% t(rep(1,qi)) + t(spi5 %*% t(rep(1,qi))))
    diag(DM)<-rep(0,qi)

    DA<-(1/qi)*diag(spi5)%*%Deltam%*%diag(spi5)
    DMCA<-(1/qi) * diag(p^-.5)%*%Deltam%*%diag(p^-.5)

    #we<-ee[i,(qis)]<- t(p)%*%DE%*%p

    wsm<-esm[i,(qis)]<- t(p)%*%Deltam%*%p   # Simple Matching
    we<-ee[i,(qis)]<- t(p)%*%DEsk%*%p       # Eskin
    wof<-eof[i,(qis)]<-t(p)%*%DOF%*%p       # OF
    wiof<-eiof[i,(qis)]<-t(p)%*%DIOF%*%p    # IOF
    whl<-ehl[i,(qis)]<-t(p)%*%DHL%*%p       # HL
    wm<-em[i,(qis)]<- t(p)%*%DM%*%p         # SD scaling
    wa<-ea[i,(qis)]<- t(p)%*%DA%*%p         # Cat dis scaling
    wmca<-emca[i,(qis)]<- t(p)%*%DMCA%*%p   # MCA scaling

    # We can also estimate the variance among the distances using Deltas: E(X^2)-E(X)^2
    # We can do this dividing by expected values so that we look at what happens
    # when delta is commensurable. Or not
    # if not:

    #we<-wm<-wa<-wmca<-wof<-wiof<-whl<-wsm<-1

    sddse[i,(qis)]<- sqrt(t(p)%*%DEsk^2%*%p - ee[i,(qis)]^2)/we  # Eskin
    sddsma[i,(qis)]<- sqrt(t(p)%*%Deltam^2%*%p - esm[i,(qis)]^2)/ wsm  # Simple Matching
    sddshl[i,(qis)]<- sqrt(t(p)%*%DHL^2%*%p - ehl[i,(qis)]^2)/ whl  # HL
    sddsm[i,(qis)]<- sqrt(t(p)%*%DM^2%*%p - em[i,(qis)]^2)/ wm  # SD
    sddsa[i,(qis)]<- sqrt(t(p)%*%DA^2%*%p -  ea[i,(qis)]^2)/wa # Cat dissim
    sddsmca[i,(qis)]<- sqrt(t(p)%*%DMCA^2%*%p - emca[i,(qis)]^2)/wmca  # MCA
    sddsof[i,(qis)]<-sqrt(t(p)%*%DOF^2%*%p - eof[i,(qis)]^2)/wof  # OF
    sddsiof[i,(qis)]<-sqrt(t(p)%*%(DIOF/as.numeric(wiof))^2%*%p - 1 )  #IOF

    # Let's consider spread among cat dissimilarities. But: This only makes sense
    # when on similar scales. So, use commensurable deltas: (Or not: see above)
    sdse[i,qis]<-sd((1/we)*DEsk[upper.tri(DEsk)])  # Eskin
    sdsm[i,qis]<-sd((1/wm)*DM[upper.tri(DM)]) # SD
    sdshl[i,qis]<-sd((1/whl)*DHL[upper.tri(DHL)]) # HL
    sdssm[i,qis]<-sd((1/wsm)*Deltam[upper.tri(Deltam)]) # Simple Matching
    sdsa[i,qis]<-sd((1/wa)*DA[upper.tri(DA)]) #  Cat dissim,
    sdsmca[i,qis]<-sd((1/wmca)*DMCA[upper.tri(DMCA)]) # MCA
    sdsof[i,qis]<-sd((1/wof)*DOF[upper.tri(DOF)])  # OF
    sdsiof[i,qis]<-sd((1/wiof)*DIOF[upper.tri(DIOF)]) # IOF



  }

}

cats<-as.character(qs)
skew<-as.character(dis)#as.character(1:nint)

Code for Figure 7

Code
sk_distance_to_plot = function(exp_mean_dist,distance_name){
  exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |>
    rename(`2 cats` = V1,
           `3 cats` = V2,
           `5 cats` = V3,
           `10 cats` = V4) |>
    mutate(method=distance_name) |>
    pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`),
                 names_to = "Number of categories", values_to = "Mean distance")

  return(exp_mean_dist)
}

all_exp_mean_dist=rbind(sk_distance_to_plot(esm,"Simple Matching"),
                        sk_distance_to_plot(ee,"Eskin"),
                        sk_distance_to_plot(eof,"OF"),
                        sk_distance_to_plot(eiof,"IOF")
)

paper_plot_skew1=all_exp_mean_dist |>
  mutate(`Number of categories`=fct_inorder(`Number of categories`),
         method=fct_inorder(method),) |>
  ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
  geom_line() + geom_point() + facet_wrap(~method,scales = "free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
  xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")


print(paper_plot_skew1,width=10,height=6)

Code for Figure 8

Code
all_exp_mean_dist2=rbind(sk_distance_to_plot(ehl,"Hennig-Liao"),
                         sk_distance_to_plot(em,"Std. dev."),
                         sk_distance_to_plot(ea,"Cat. dissim.")
)

paper_plot_skew2=all_exp_mean_dist2 |>
  mutate(`Number of categories`=fct_inorder(`Number of categories`),
         method=fct_inorder(method),) |>
  ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
  geom_line() + geom_point() + facet_wrap(~method) + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
  xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")


print(paper_plot_skew2)

Code for Figure 9

Code
qs<-c(2,3,5,10)#,18) # Some options for qs

nint<-length(dis)    # Number of options for d (nint-1)

CVchd<-CVtvd<-CVmsd<-CVlhd<-Echi<-Echd<-Etvi<-Etvd<-Emsi<-Elhi<-Elhd<-Emsd<-array(NA,dim=c(length(qs),length(qs),nint,nint))

for (q1 in 1:length(qs)){  # cats row
  qi<-qs[q1]
  for (q2 in q1:length(qs)){ # cats cols
    qj<-qs[q2]
    for (pi in 1:nint){  # row distributions
      d1<-dis[pi]
      p1<-c(d1,rep((1-d1)/(qj-1),(qj-1)))  # nint different marginals: Distribution over colums
      for (pj in 1:nint){
        d2<-dis[pj]
        p2<-c(d2,rep((1-d2)/(qi-1),(qi-1)))  # corresponding col marginals: Distribution over rows
        Pi<-p2 %*% t(p1)   # joint table independent
        Ri<-Pi/rowSums(Pi) # conditional table independent
        #  Ci<-Ri/sqrt(p2)

        if (qj>qi) {
          Pd<-diag(p1[1:qi])
          Addzeros<-matrix(0,(qi-1),(qj-qi))
          Addps<-p1[qi+1:(qj-qi)]
          Add<-rbind(Addzeros,Addps)
          Pd<-cbind(Pd,Add)
        } else { #qi=qj
          pd<-p2
          Pd<-diag(pd)       # joint table dependent case (diagonal smallest q)
        }
        Rd<-Pd/rowSums(Pd)
        Cd<-Rd/sqrt(p2)    # Chi-square table: cond. prob/sqrt(p)

        # category dissimilarities:
        Dlhd<-LeHo(Rd)                   #  Le Ho
        # Dmsd<-MousaviSehhati(Pd)         # Moussavi
        Dtvd<-dist(Rd,"manhattan")/2     # TVD
        Dchd<-dist(Cd,"euclidean")^2     # Chi2 dist
        #    Dchd<-Dchd/(2*(qi-1))     # Chi2 dist corrected for number of categories...

        Elhd[q1,q2,pi,pj]<-t(pd) %*%Dlhd %*%pd
        # Emsd[q1,q2,pi,pj]<-t(pd) %*%Dmsd %*%pd
        Etvd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dtvd) %*%pd
        Echd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dchd) %*%pd

        # Coefficient of variation

        CVlhd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dlhd^2 %*%pd - (t(pd) %*%Dlhd %*%pd)^2)/(t(pd) %*%Dlhd %*%pd)
        # CVmsd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dmsd^2 %*%pd - (t(pd) %*%Dmsd %*%pd)^2)/(t(pd) %*%Dmsd %*%pd)
        CVtvd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dtvd))^2 %*%pd - (t(pd) %*%as.matrix(Dtvd) %*%pd)^2)/(t(pd) %*%as.matrix(Dtvd) %*%pd)
        CVchd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dchd))^2 %*%pd - (t(pd) %*%as.matrix(Dchd) %*%pd)^2)/(t(pd) %*%as.matrix(Dchd) %*%pd)

        Dlhi<-LeHo(Ri)
        # Dmsi<-MousaviSehhati(Pi)
        Dtvi<-dist(Ri,"manhattan")/2
        #  Dchi<-dist(Ci,"euclidean")^2

        Elhi[q1,q2,pi,pj]<-t(p2) %*%Dlhi %*%p2
        # Emsi[q1,q2,pi,pj]<-t(p2) %*%Dmsi %*%p2
        Etvi[q1,q2,pi,pj]<-t(p2) %*%as.matrix(Dtvi) %*%p2
        #  Echi[q1,q2,pi,pj]<-t(p1) %*%as.matrix(Dchi) %*%p1
      }
    }
  }
}



Elhs<-as.data.frame(rbind(Elhd[1,1,1,],Elhd[2,2,1,],Elhd[3,3,1,],Elhd[4,4,1,]))
Etvs<-as.data.frame(rbind(Etvd[1,1,1,],Etvd[2,2,1,],Etvd[3,3,1,],Etvd[4,4,1,]))
Emss<-as.data.frame(rbind(Emsd[1,1,1,],Emsd[2,2,1,],Emsd[3,3,1,],Emsd[4,4,1,]))
Echs<-as.data.frame(rbind(Echd[1,1,1,],Echd[2,2,1,],Echd[3,3,1,],Echd[4,4,1,]))

CVlhs<-as.data.frame(rbind(CVlhd[1,1,1,],CVlhd[2,2,1,],CVlhd[3,3,1,],CVlhd[4,4,1,]))
CVtvs<-as.data.frame(rbind(CVtvd[1,1,1,],CVtvd[2,2,1,],CVtvd[3,3,1,],CVtvd[4,4,1,]))
CVmss<-as.data.frame(rbind(CVmsd[1,1,1,],CVmsd[2,2,1,],CVmsd[3,3,1,],CVmsd[4,4,1,]))
CVchs<-as.data.frame(rbind(CVchd[1,1,1,],CVchd[2,2,1,],CVchd[3,3,1,],CVchd[4,4,1,]))

cats<-as.character(qs)
skew<-as.character(dis)#as.character(1:nint)


sk_distance_to_plot = function(exp_mean_dist,distance_name){
  exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |>
    rename(`2 cats` = V1,
           `3 cats` = V2,
           `5 cats` = V3,
           `10 cats` = V4) |>
    mutate(method=distance_name) |>
    pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`),
                 names_to = "Number of categories", values_to = "Mean distance")

  return(exp_mean_dist)
}


all_exp_mean_dist3=rbind(sk_distance_to_plot(t(as.matrix(Elhs)),"Le-Ho"),
                         sk_distance_to_plot(t(as.matrix(Etvs)),"Total Variance Distance")
)

paper_plot_skew3=all_exp_mean_dist3 |>
  mutate(`Number of categories`=fct_inorder(`Number of categories`),
         method=fct_inorder(method),) |>
  ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
  geom_line() + geom_point() + facet_wrap(~method,scales="free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
  xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")


print(paper_plot_skew3,width=8,height=4)

Appendix E FIFA variable distributions

Code for Figure 10

Code
# load("data/Fifa21NL.RData") # Load the data
data(fifa_nl,package="manydist")
#Xorig<-X<-ddata[,-c(1,2)]
Xorig<-X<-fifa_nl[,-c(1,2,19:24)] # vars 1 and 2 are a bit useless (too many categories)
# vars 19:24 interesting but same scale and little variation
# leaving them out makes the plots a bit smaller.
# leaving them in doesn't seem to change a lot


n<-nrow(X)
# p<-ncol(X)
K<-10  # Number of clusters in cluster analysis
nd<-2 # Dimensionality of MDS solution
X=X[,c(8:1,9:16)]  # Order
Xorig=Xorig[,c(8:1,9:16)]  # Order
X=X[,-3]  # Remove int rep: 380 have something, 21+7 in other cats
Xorig=Xorig[,-3]
X=X[,c(1:12,14,13,15)]
Xorig=Xorig[,c(1:12,14,13,15)]
p<-ncol(X)

# First convert integers to numeric manually
X[8:15] <- lapply(X[8:15], as.numeric)

# reordering / numeric first
X <- X %>%
  select(where(is.integer), where(is.numeric), where(is.factor))


X_tib_num = X |> as_tibble() |> select(where(is.numeric))

X_tib_cat = X |> as_tibble() |> select(where(is.factor))

levels(X_tib_cat$work_rate)=c("H/H", "H/L", "H/M", "L/H", "L/L", "L/M", "M/H", "M/L", "M/M")
levels(X_tib_cat$club_name) = c("ADO","Ajax","AZ","Emmen","Groningen","Twente","Utrecht",
                                "Feyenoord","Fortuna","Heracles","PEC","PSV","RKC","Heerenveen",
                                "Sparta","Vitesse","VVV","Willem II")

X_tib_cat=X_tib_cat |> rename(`club`=club_name,
                              `position`=team_position)

X_tib_num_nest =  tibble(values=X_tib_num |> map((~.x)),features=names(X_tib_num))|>
  mutate(
    hplot=map2(.x=values,.y=features,
               function(x=.x,y=.y){
                 my_tib=tibble(a=x)
                 names(my_tib) = y

                 my_plot= my_tib |> ggplot(aes(x)) + geom_histogram(bins=15,alpha=.75)+theme_bw()+ggtitle(y)+
                   theme(plot.title = element_text(hjust = 0.5))+xlab("")+ylab("")
               }
    )
  )


X_tib_cat_nest =  tibble(values=X_tib_cat |> map((~.x)),features=names(X_tib_cat))|>
  mutate(
    bplot=map2(.x=values,.y=features,
               function(x=.x,y=.y){
                 my_tib=tibble(a=x)
                 names(my_tib) = y

                 my_plot= my_tib |> ggplot(aes(x)) + geom_bar(alpha=.75)+theme_bw()+ggtitle(y)+
                   theme(plot.title = element_text(hjust = 0.5),
                         axis.text.x =element_text(angle=90 ))+xlab("")+ylab("")
               }
    )
  )


fifa_descriptive_plots=(X_tib_cat_nest$bplot[[1]]/
                          X_tib_cat_nest$bplot[[2]]/
                          X_tib_cat_nest$bplot[[3]]/
                          X_tib_cat_nest$bplot[[4]]/
                          X_tib_cat_nest$bplot[[5]]/
                          X_tib_cat_nest$bplot[[6]]/
                          X_tib_cat_nest$bplot[[7]])|
  (X_tib_num_nest$hplot[[1]]/
     X_tib_num_nest$hplot[[2]]/
     X_tib_num_nest$hplot[[3]]/
     X_tib_num_nest$hplot[[4]]/
     X_tib_num_nest$hplot[[5]]/
     X_tib_num_nest$hplot[[6]]/
     X_tib_num_nest$hplot[[7]])#+ plot_layout(guides = 'collect')


print(fifa_descriptive_plots)

References

Ghashti, J. S., and Thompson, J. R. J. (2024), “Mixed-type distance shrinkage and selection for clustering via kernel metric learning,” Journal of Classification. https://doi.org/https://doi.org/10.1007/s00357-024-09493-z.
Hennig, C., and Liao, T. F. (2013), How to Find an Appropriate Clustering for Mixed-Type Variables with Application to Socio-Economic Stratification,” Journal of the Royal Statistical Society Series C: Applied Statistics, 62, 309–369.
Liu, P., Yuan, H., Ning, Y., Chakraborty, B., Liu, N., and Peres, M. A. (2024), “A modified and weighted gower distance-based clustering analysis for mixed type data: A simulation and empirical analyses,” BMC Medical Research Methodology. https://doi.org/https://doi.org/10.1186/s12874-024-02427-8.
Mousavi, E., and Sehhati, M. (2023), “A generalized multi-aspect distance metric for mixed-type data clustering,” Pattern Recognition, 138, 109353.