Unbiased mixed-variable distance

supplementary material

Code

packages <- c(
  "aricode", "cluster", "fpc", "manydist", "patchwork", "tidyverse",
  "tidymodels", "varhandle", "vegan", "kdml", "ggplot2", "viridis",
  "dplyr", "entropy", "kernlab", "mclust", "Matrix", "philentropy",
  "conflicted", "ggnewscale", "glue"
)

invisible(lapply(packages, require, character.only = TRUE))

tidymodels_prefer()
conflict_prefer("select", "dplyr")
conflict_prefer("filter", "dplyr")
conflict_prefer("distance", "philentropy")

# prefer base set operations (if needed later)
conflict_prefer("intersect", "base")
conflict_prefer("setdiff", "base")

Here we provide the code to reproduce the results of the supplementary material of the paper “Unbiased mixed variables distances”.

Data generation

These are the simulation parameers

Code

reps<-100  # 100 can take 30min
n<-500
k<-2 # Dimension solution
sigma<-0.03 # Noise

porig<-6 # Number of variables corresponding to true config
pnnoise<-0 # Number of numerical noise vars
pcnoise<-0 # Number of categorical noise vars
pn<-2 # Number of numerical variables underying config

pnum<-pn+pnnoise+pcnoise  # Total number of numerical vars+noise vars (Needed to know which vars to discretize)
p<-porig+pnnoise+pcnoise # Total number of variables
qoptions<-c(2,3,5,9) # Distribution of the categorical variables corresponding to true config
pcat<-length(qoptions) # Number of cat variables underlying config


methods = c("baseline","naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")
method_levels <- c("baseline", "naive", "hl", "hl_add", "gower","gudmm", "dkss", "mod_gower", "u_ind", "u_dep", "u_mix")


my_colors <- c(
  # Set B
  "baseline"    = "#8C564B",  # brown
  "naive"       = "#E377C2",  # pink
  "hl"          = "#9C9E00",  # darker yellow-green (better contrast)
  "gudmm"       = "#FF7F0E",  # orange
  "dkss"        = "#7F7F7F",  # darker grey (more visible)

  # Set A
  "hl_add"      = "#9EC5E5",  # light blue
  "u_ind"       = "#2CA02C",  # green
  "u_dep"       = "#D62728",  # red
  "u_mix" = "#1E90FF",
  "gower"       = "#FFBF00",  # gold
  "mod_gower"   = "#9467BD"   # purple
)

my_shapes <- c(
  "baseline"    = 16,  # filled circle
  "naive"       = 17,  # filled triangle
  "hl"          = 15,  # filled square
  "hl_add"      = 3,   # plus
  "gower"       = 4,   # cross
  "gudmm"       = 18,  # filled diamond
  "dkss"        = 8,   # star
  "mod_gower"   = 0,   # empty square
  "u_ind"       = 1,   # empty circle
  "u_dep"       = 2,    # empty triangle
  "u_mix" = 5 # empty diamond
)

The considered methods are

additive distances

baseline: Manhattan distance on all the standardized numerical variables (before discretization)
gower: classic Gower distance
mod_gower: modified version of the Gower distance, as proposed in Liu et al. (2024)
hl_add: Manhattan distance with scaling of indicators as proposed by Hennig and Liao (2013)
u_ind: commensurable with simple matching
u_dep: commensurable PCA-scaled numerical and total-variance categorical
u_mix: commensurable Manhattan numerical and total-variance categorical

non-additive distances

naive: Euclidean distance on all variables, with one-hot encoded categorical variables.
hl: Euclidean distance with scaling of indicators as proposed by Hennig and Liao (2013)
gudmm: Generalized Unified Distance Metric for Mixed-type data as proposed by Mousavi and Sehhati (2023)
dkss: Distance using kernel product similarity as proposed by Ghashti and Thompson (2024)

Compute distances: leave one variable out

Code

library(progress)


pb <- progress_bar$new(
  format = "  Simulating [:bar] :percent eta: :eta",
  total = reps * length(methods),
  clear = FALSE,
  width = 60
)

# methods = c("baseline","naive","gower","hl","u_dep","u_ind","gudmm","dkss","mod_gower")


generated_datasets = tibble(replicate=1:reps) |> 
  mutate(dataset = map(.x=replicate, ~generate_dataset(n = n, pn=pn, porig=porig, pnnoise =pnnoise , pcnoise = pcnoise, sigma = sigma, qoptions = qoptions, seed = 1234+.x)),
         method=map(.x=replicate,~methods)
         ) |> unnest(method)|>
  left_join(mdist_method_lookup, by = "method")

Code

## a little helper for mdist

run_mdist_within <- function(df, mdist_type, mdist_preset, param_set, outcome = "response") {
  x <- df

  if (is.character(outcome) && length(outcome) == 1 && outcome %in% names(df)) {
    x <- dplyr::select(df, -dplyr::all_of(outcome))
  }

  if (mdist_type == "preset") {
    out <- manydist::mdist(x = x, preset = mdist_preset)

  } else if (mdist_type == "custom") {
    if (is.null(param_set)) stop("param_set is NULL for mdist_type = 'custom'")
    args <- param_set
    args$x <- x
    out <- do.call(manydist::mdist, args = args)

  } else {
    stop("Unknown mdist_type: ", mdist_type)
  }
if (identical(mdist_type, "preset") && identical(mdist_preset, "gower")) {
  
  out$distance*ncol(x)
  }else{
  out$distance
  }
}

# little helper for the leave one variable out version of mdist
run_lovo <- function(df, mdist_type, mdist_preset, param_set,
                     outcome = NULL, dims = 2, keep_dist = FALSE,
                     legacy_gower_sum = FALSE) {

  # drop outcome if present
  x <- df
  if (is.character(outcome) && length(outcome) == 1 && outcome %in% names(df)) {
    x <- dplyr::select(df, -dplyr::all_of(outcome))
  }

  # ---- special case: legacy Gower SUM scaling for LOVO ----
  if (legacy_gower_sum && identical(mdist_type, "preset") && identical(mdist_preset, "gower")) {

    vars <- names(x)
    p_full <- length(vars)

    # full
    D_full <- manydist::mdist(x = x, preset = "gower")$distance |> as.matrix()
    D_full <- D_full * p_full

    base_mds <- cmdscale(D_full, eig = TRUE, k = dims)$points[, 1:dims, drop = FALSE]
    loo_list <- vector("list", length(vars)); names(loo_list) <- vars

    for (i in seq_along(vars)) {
      var <- vars[i]
      x_sub <- dplyr::select(x, -dplyr::any_of(var))
      p_loo <- ncol(x_sub)

      D_loo <- manydist::mdist(x = x_sub, preset = "gower")$distance |> as.matrix()
      D_loo <- D_loo * p_loo

      loo_list[[i]] <- D_loo
    }

    mad <- vapply(loo_list, function(m) mean(abs(D_full - m)), numeric(1))
    cc  <- vapply(loo_list, function(m) {
      pts <- cmdscale(m, eig = TRUE, k = dims)$points[, 1:dims, drop = FALSE]
      congruence_coeff(base_mds, pts)
    }, numeric(1))
    ac <- sqrt(1 - cc^2)

    res <- tibble::tibble(
      variable       = vars,
      mad_importance = mad,
      cc_importance  = cc,
      ac_importance  = ac,
      mad_normalized = mad / sum(mad)
    )

    out <- list(results = res, base_mds = base_mds)
    if (keep_dist) {
      out$full_dist <- D_full
      out$loo_dist  <- loo_list
    }
    return(out)
  }

  # ---- default path: your package's LOVO ----
  if (mdist_type == "preset") {
    obj <- manydist::lovo_mdist(x = x, preset = mdist_preset, dims = dims, keep_dist = keep_dist)

  } else if (mdist_type == "custom") {
    if (is.null(param_set)) stop("param_set is NULL for mdist_type = 'custom'")
    args <- param_set
    args$x <- x
    args$dims <- dims
    args$keep_dist <- keep_dist
    obj <- do.call(manydist::lovo_mdist, args = args)

  } else {
    stop("Unknown mdist_type: ", mdist_type)
  }

  obj
}

Code

simulation_structure <- generated_datasets |>
  dplyr::mutate(
    loo_results = purrr::pmap(
      dplyr::pick(dataset, method, mdist_type, mdist_preset, param_set),
      \(dataset, method, mdist_type, mdist_preset, param_set) {
        pb$tick()
        X <- if (method == "baseline") dataset$Xorig else dataset$X
        run_lovo(X, mdist_type, mdist_preset, param_set,legacy_gower_sum = (method == "gower"))
      }
    )
  )


 save(file="simulation_structure.RData", simulation_structure)

Code

reference_method = "baseline"

baseline_refs = simulation_structure |>
  filter(method == "baseline") |>
  select(replicate, baseline_loo = loo_results)


rel_simulation_structure <- simulation_structure |>
  dplyr::left_join(baseline_refs, by = "replicate") |>
  dplyr::mutate(
    loo_results = purrr::map2(.x = loo_results,.y = baseline_loo,
        .f = \(res_obj, ref_obj) {
        res <- res_obj$results
        ref <- ref_obj$results

        # join by variable to be safe (order might differ)
        dplyr::left_join(
          res,
          dplyr::select(ref, variable,
                        ref_mad_normalized = mad_normalized,
                        ref_ac_importance  = ac_importance),
          by = "variable"
        ) |>
          dplyr::mutate(
            mad_relative_importance = mad_normalized / ref_mad_normalized,
            ac_relative_importance  = ac_importance  / ref_ac_importance
          )
      }
    )
  ) |>
  select(-baseline_loo)

results_long <- rel_simulation_structure |>
  tidyr::unnest(loo_results) |>
  # enforce one value per replicate (if already unique, no change)
  dplyr::group_by(replicate, variable, method) |>
  dplyr::summarise(
    dplyr::across(
      c(mad_importance, cc_importance, ac_importance,
        mad_normalized, mad_relative_importance, ac_relative_importance),
      ~ mean(.x, na.rm = TRUE)
    ),
    .groups = "drop"
  )

results_summary <- results_long |>
  dplyr::group_by(variable, method) |>
  dplyr::summarise(
    dplyr::across(
      c(mad_importance, cc_importance, ac_importance,
        mad_normalized, mad_relative_importance, ac_relative_importance),
      list(
        mean = \(x) mean(x, na.rm = TRUE),
        sd   = \(x) sd(x,   na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    ),
    .groups = "drop"
  )

results_wide <- results_summary |>
  tidyr::pivot_wider(
    names_from = method,
    values_from = -c(variable, method)
  )

measure_names <- c(
  "mad_importance",
  "cc_importance",
  "ac_importance",
  "mad_normalized",
  "mad_relative_importance",
  "ac_relative_importance"
)

split_by_measure_mean <- purrr::map(
  measure_names,
  \(measure) {
    results_wide |>
      dplyr::select(variable, dplyr::starts_with(paste0(measure, "_mean_"))) |>
      dplyr::rename_with(
        ~ stringr::str_remove(., paste0(measure, "_mean_")),
        -variable
      )
  }
) |> rlang::set_names(measure_names)

split_by_measure_sd <- purrr::map(
  measure_names,
  \(measure) {
    results_wide |>
      dplyr::select(variable, dplyr::starts_with(paste0(measure, "_sd_"))) |>
      dplyr::rename_with(
        ~ stringr::str_remove(., paste0(measure, "_sd_")),
        -variable
      )
  }
) |> rlang::set_names(measure_names)

Code for `Figure 1`

Code

meas_to_plot_mad_mean <- split_by_measure_mean |> pluck("mad_importance") |> 
  mutate(measure="mad_importance") |> slice(3:6,1,2) |> mutate(var_id=1:6)

meas_to_plot_mad_sd <- split_by_measure_sd |> pluck("mad_importance") |> 
  mutate(measure="mad_importance") |> slice(3:6,1,2) |> mutate(var_id=1:6)

mad_max <- max(meas_to_plot_mad_mean |> select(where(is.numeric), -var_id), na.rm = TRUE)

rect_data <- tibble(
  xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max,
  xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max
)  

meas_to_plot_mad_rel_mean <- split_by_measure_mean |> pluck("mad_normalized") |> 
  mutate(measure="mad_normalized") |> slice(3:6,1,2) |> mutate(var_id=1:6)

meas_to_plot_mad_rel_sd <- split_by_measure_sd |> pluck("mad_normalized") |> 
  mutate(measure="mad_normalized") |> slice(3:6,1,2) |> mutate(var_id=1:6)

mad_max_rel <- max(meas_to_plot_mad_rel_mean |> select(where(is.numeric), -var_id), na.rm = TRUE)

rect_data_rel <- tibble(
  xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max_rel,
  xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max_rel
)
  
loo_mad_plot <- meas_to_plot_mad_mean |> 
  dplyr::select(variable, var_id, any_of(method_levels)) |> 
  pivot_longer(cols = naive:u_mix, names_to="method", values_to="value") |> 
  mutate(
    method = factor(method, levels = method_levels),
    additive = ifelse(method %in% c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix"),
                      "additive distances", "non-additive distances")
  ) |>
  dplyr::left_join(
    meas_to_plot_mad_sd |>
      dplyr::select(variable, var_id, any_of(method_levels)) |>
      pivot_longer(cols = naive:u_mix, names_to="method", values_to="sd") |>
      mutate(method = factor(method, levels = method_levels)),
    by = c("variable","var_id","method")
  ) |>
  ggplot(aes(x = var_id, y = value, color = method, shape = method)) +
  geom_rect(data=rect_data,
            aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
            alpha=.2, fill="indianred", color="lightgrey", inherit.aes=FALSE) +
  geom_rect(data=rect_data,
            aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
            alpha=.2, fill="dodgerblue", color="lightgrey", inherit.aes=FALSE) +
  geom_errorbar(aes(ymin = value - sd, ymax = value + sd), width = 0.12, linewidth = 0.35) +
  geom_point() + geom_line() +
  ylab("absolute variable contribution to distance") +
  xlab("") + theme_bw() + theme(legend.position = "none") +
  scale_x_continuous(breaks=seq(1,6,1), labels = rep("",6)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  scale_shape_manual(values = my_shapes) +
  scale_color_manual(values = my_colors) +
  facet_wrap(.~additive)

add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")

nonadd_methods <- c("naive","hl","gudmm","dkss")

# add_methods <- c("gower", "mod_gower", "hl_add", "u_ind", "u_dep", "u_mix")

legend_breaks <- c(
  c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix"),  # additive first
  c("naive","hl","gudmm","dkss")                           # non-additive after
)

legend_labels <- setNames(
  ifelse(legend_breaks %in% c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix"),
         paste0(legend_breaks, " (additive)"),
         paste0(legend_breaks, " (non-additive)")),
  legend_breaks
)

library(ggnewscale)

loo_mad_rel_df <- meas_to_plot_mad_rel_mean |>
  dplyr::select(variable, var_id, any_of(method_levels)) |>
  tidyr::pivot_longer(cols = naive:u_mix, names_to="method", values_to="value") |>
  mutate(
    method = factor(method, levels = c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")),
    additive = ifelse(as.character(method) %in% add_methods, "additive", "non-additive")
  ) |>
  dplyr::left_join(
    meas_to_plot_mad_rel_sd |>
      dplyr::select(variable, var_id, any_of(method_levels)) |>
      tidyr::pivot_longer(cols = naive:u_mix, names_to="method", values_to="sd") |>
      mutate(method = factor(method, levels = c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix"))),
    by = c("variable","var_id","method")
  )

# loo_mad_rel_plot <-
#   ggplot() +
#   geom_rect(data=rect_data_rel,
#             aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
#             alpha=.2, fill="indianred", color="lightgrey", inherit.aes=FALSE) +
#   geom_rect(data=rect_data_rel,
#             aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
#             alpha=.2, fill="dodgerblue", color="lightgrey", inherit.aes=FALSE) +
# 
#   # --- Additive layer + legend 1 ---
#   # geom_errorbar(
#   #   data = dplyr::filter(loo_mad_rel_df, additive == "additive"),
#   #   aes(x = var_id, ymin = value - sd, ymax = value + sd, color = method),
#   #   width = 0.12, linewidth = 0.35
#   # ) +
#   geom_line(
#     data = dplyr::filter(loo_mad_rel_df, additive == "additive"),
#     aes(x = var_id, y = value, color = method)
#   ) +
#   geom_point(
#     data = dplyr::filter(loo_mad_rel_df, additive == "additive"),
#     aes(x = var_id, y = value, color = method, shape = method)
#   ) +
#   scale_color_manual(
#     name   = "additive",
#     values = my_colors,
#     breaks = add_methods
#   ) +
#   scale_shape_manual(
#     name   = "additive",
#     values = my_shapes,
#     breaks = add_methods
#   ) +
#   guides(
#   color = guide_legend(
#     order = 1,
#     override.aes = list(shape = my_shapes[add_methods])
#   ),
#   shape = "none"
# ) +
# 
#   ggnewscale::new_scale_color() +
#   ggnewscale::new_scale("shape") +
# 
#   # --- Non-additive layer + legend 2 ---
#   # geom_errorbar(
#   #   data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"),
#   #   aes(x = var_id, ymin = value - sd, ymax = value + sd, color = method),
#   #   width = 0.12, linewidth = 0.35
#   # ) +
#   geom_line(
#     data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"),
#     aes(x = var_id, y = value, color = method)
#   ) +
#   geom_point(
#     data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"),
#     aes(x = var_id, y = value, color = method, shape = method)
#   ) +
#   scale_color_manual(
#     name   = "non-additive",
#     values = my_colors,
#     breaks = nonadd_methods
#   ) +
#   scale_shape_manual(
#     name   = "non-additive",
#     values = my_shapes,
#     breaks = nonadd_methods
#   ) +
#   guides(
#   color = guide_legend(
#     order = 2,
#     override.aes = list(shape = my_shapes[nonadd_methods])
#   ),
#   shape = "none"
# ) +
# 
#   # cosmetics (same as yours)
#   ylab("relative variable contribution to distance") +
#   xlab("left-out variable") + theme_bw() +
#   scale_x_continuous(
#     breaks=seq(1,6,1),
#     labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2")
#   ) +
#   theme(
#     axis.text.x = element_text(angle = 60, hjust = 1),
#     strip.text = element_blank(),
#     legend.position = "bottom",
#     legend.title.position = "top",
#     legend.text = element_text(size = 13)
#     )+facet_wrap(.~additive)
add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkss")

loo_mad_rel_plot <-
  ggplot() +
  geom_rect(
    data = rect_data_rel,
    aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont),
    alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE
  ) +
  geom_rect(
    data = rect_data_rel,
    aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat),
    alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE
  ) +

  # --- Additive layer ---
  geom_line(
    data = dplyr::filter(loo_mad_rel_df, additive == "additive"),
    aes(x = var_id, y = value, color = method, linetype = method),
    linewidth = 0.5
  ) +
  geom_point(
    data = dplyr::filter(loo_mad_rel_df, additive == "additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "additive",
    values = my_colors,
    breaks = add_methods
  ) +
  scale_shape_manual(
    name   = "additive",
    values = my_shapes,
    breaks = add_methods
  ) +
  scale_linetype_manual(
    name   = "additive",
    values = setNames(rep("solid", length(add_methods)), add_methods),
    breaks = add_methods
  ) +
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 1),
    linetype = guide_legend(order = 1)
  ) +

  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  ggnewscale::new_scale("linetype") +

  # --- Non-additive layer ---
  geom_line(
    data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"),
    aes(x = var_id, y = value, color = method, linetype = method),
    linewidth = 0.5
  ) +
  geom_point(
    data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "non-additive",
    values = my_colors,
    breaks = nonadd_methods
  ) +
  scale_shape_manual(
    name   = "non-additive",
    values = my_shapes,
    breaks = nonadd_methods
  ) +
  scale_linetype_manual(
    name   = "non-additive",
    values = setNames(rep("solid", length(nonadd_methods)), nonadd_methods),
    breaks = nonadd_methods
  ) +
  guides(
    color = guide_legend(order = 2),
    shape = guide_legend(order = 2),
    linetype = guide_legend(order = 2)
  ) +

  ylab("relative variable contribution to distance") +
  xlab("left-out variable") +
  theme_bw() +
  scale_x_continuous(
    breaks = seq(1, 6, 1),
    labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2")
  ) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    strip.text = element_blank(),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  ) +
  facet_wrap(. ~ additive)
fig_1_plot = loo_mad_plot / loo_mad_rel_plot

fig_1_plot

Code for `Figure 2`

Code

library(ggnewscale)

add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkss")

meas_to_plot_ac_mean <- split_by_measure_mean |> 
  pluck("ac_relative_importance") |> 
  mutate(measure = "ac_relative_importance") |> 
  slice(3:6, 1, 2) |> 
  mutate(var_id = 1:6)

meas_to_plot_ac_sd <- split_by_measure_sd |> 
  pluck("ac_relative_importance") |> 
  mutate(measure = "ac_relative_importance") |> 
  slice(3:6, 1, 2) |> 
  mutate(var_id = 1:6)

ac_max <- max(meas_to_plot_ac_mean |> select(where(is.numeric), -var_id), na.rm = TRUE)

rect_data_ac <- tibble(
  xmin_cont = 4.5, xmax_cont = 6, ymin_cont = 0, ymax_cont = ac_max,
  xmin_cat  = .5,  xmax_cat  = 4.5, ymin_cat  = 0, ymax_cat  = ac_max
)

ac_df <- meas_to_plot_ac_mean |>
  dplyr::select(variable, var_id, any_of(method_levels)) |>
  tidyr::pivot_longer(cols = naive:u_mix, names_to = "method", values_to = "value") |>
  mutate(
    method = factor(
      method,
      levels = c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")
    ),
    additive = ifelse(as.character(method) %in% add_methods, "additive", "non-additive")
  ) |>
  dplyr::left_join(
    meas_to_plot_ac_sd |>
      dplyr::select(variable, var_id, any_of(method_levels)) |>
      tidyr::pivot_longer(cols = naive:u_mix, names_to = "method", values_to = "sd") |>
      mutate(
        method = factor(
          method,
          levels = c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")
        )
      ),
    by = c("variable", "var_id", "method")
  )

ac_plot <-
  ggplot() +
  geom_rect(
    data = rect_data_ac,
    aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont),
    alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE
  ) +
  geom_rect(
    data = rect_data_ac,
    aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat),
    alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE
  ) +

  # --- Additive block ---
  geom_line(
    data = dplyr::filter(ac_df, additive == "additive"),
    aes(x = var_id, y = value, color = method, linetype = method),
    linewidth = 0.5
  ) +
  geom_point(
    data = dplyr::filter(ac_df, additive == "additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "additive",
    values = my_colors,
    breaks = add_methods
  ) +
  scale_shape_manual(
    name   = "additive",
    values = my_shapes,
    breaks = add_methods
  ) +
  scale_linetype_manual(
    name   = "additive",
    values = setNames(rep("solid", length(add_methods)), add_methods),
    breaks = add_methods
  ) +
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 1),
    linetype = guide_legend(order = 1)
  ) +

  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  ggnewscale::new_scale("linetype") +

  # --- Non-additive block ---
  geom_line(
    data = dplyr::filter(ac_df, additive == "non-additive"),
    aes(x = var_id, y = value, color = method, linetype = method),
    linewidth = 0.5
  ) +
  geom_point(
    data = dplyr::filter(ac_df, additive == "non-additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "non-additive",
    values = my_colors,
    breaks = nonadd_methods
  ) +
  scale_shape_manual(
    name   = "non-additive",
    values = my_shapes,
    breaks = nonadd_methods
  ) +
  scale_linetype_manual(
    name   = "non-additive",
    values = setNames(rep("solid", length(nonadd_methods)), nonadd_methods),
    breaks = nonadd_methods
  ) +
  guides(
    color = guide_legend(order = 2),
    shape = guide_legend(order = 2),
    linetype = guide_legend(order = 2)
  ) +

  ylab("relative alienation coefficient") +
  xlab("left-out variable") +
  scale_x_continuous(
    breaks = seq(1, 6, 1),
    labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2")
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  ) +
  facet_wrap(. ~ additive)

ac_plot

Retrieval of the true configuration

Data generation

Code

recovery_structure <- tidyr::crossing(
  tibble::tibble(replicate = 1:reps),
  method = methods,
  categories = qoptions
) |>
  dplyr::left_join(manydist:::mdist_method_lookup, by = "method") |>
  dplyr::mutate(
    dataset = purrr::map2(
      .x = replicate, .y = categories,
      ~ generate_dataset(
        n = n, porig = porig, pn = pn,
        pnnoise = pnnoise, pcnoise = pcnoise,
        sigma = sigma, qoptions = .y, mode = "shared",
        seed = 1234 + .x
      )
    ),
    # compute distance matrix using method specs + baseline uses Xorig
    distance_matrix = purrr::pmap(
      dplyr::pick(dataset, method, mdist_type, mdist_preset, param_set),
      \(dataset, method, mdist_type, mdist_preset, param_set) {
        X <- if (method == "baseline") dataset$Xorig else dataset$X

        run_mdist_within(
          df = X,
          mdist_type = mdist_type,
          mdist_preset = mdist_preset,
          param_set = param_set,
          outcome = NULL   # because X / Xorig have no response column
        ) |>
          base::as.matrix()
      }
    ),

    mds_results = purrr::map(
      distance_matrix,
      ~ cmdscale(.x, eig = TRUE, k = 2)$points |>
        as.data.frame() |>
        setNames(c("x1", "x2"))
    ),

    congruence_coeff = purrr::map2(
      .x = mds_results, .y = dataset,
      \(mds, ds) congruence_coeff(ds$truth, mds)
    )
  )

# save(file="recovery_structure.RData", recovery_structure)
recovery_structure_lite = recovery_structure |>
  select(-dataset, -distance_matrix, -mds_results) |>
  unnest(congruence_coeff) |>
  select(replicate, method, categories, congruence_coeff) |> 
  mutate(categories=factor(paste0(categories," categories")),
         alienation_coeff = sqrt(1-congruence_coeff^2))

save(file="recovery_structure_lite.RData", recovery_structure_lite)

Code for figure `Figure 3`

Code

library(dplyr)
library(ggplot2)
library(ggnewscale)

# n_add   <- length(add_methods)

add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkss")
method_order <- c(add_methods, nonadd_methods, "baseline")

n_add    <- length(add_methods)
n_nonadd <- length(nonadd_methods)

n_total <- length(method_order)

rect_bg <- tibble::tibble(
  xmin = c(0.5, n_add + 0.5),
  xmax = c(n_add + 0.5, n_add + n_nonadd + 0.5),
  ymin = c(-Inf, -Inf),
  ymax = c( Inf,  Inf),
  grp  = c("additive", "non-additive")
)
# --- method groups ---


# x-axis order: additive first, then non-additive, baseline last

# --- prepare data ---
box_df <- recovery_structure_lite |>
  mutate(
    method = ifelse(is.na(method), "baseline", method),
    method = factor(method, levels = method_order),
    additive = ifelse(as.character(method) %in% add_methods, "additive", "non-additive")
  )

# --- plot ---
boxplot_figure_simu <-
  ggplot(box_df, aes(x = method, y = alienation_coeff)) +
  geom_rect(
  data = dplyr::filter(rect_bg, grp == "additive"),
  aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
  inherit.aes = FALSE,
  alpha = 0.20, fill = "dodgerblue", color = "lightgrey"
) +
geom_rect(
  data = dplyr::filter(rect_bg, grp == "non-additive"),
  aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
  inherit.aes = FALSE,
  alpha = 0.20, fill = "indianred", color = "lightgrey"
) +
  facet_wrap(~ categories) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  ) +
  ylab("alienation coefficient") +
  xlab("methods") +

  # --- additive boxes + legend 1 (left) ---
  geom_boxplot(
    data = dplyr::filter(box_df, additive == "additive"),
    aes(fill = method),
    outlier.alpha = 0.25,
    linewidth = 0.35
  ) +
  scale_fill_manual(
    name   = "additive",
    values = my_colors,
    breaks = add_methods
  ) +
  guides(fill = guide_legend(order = 1)) +

  ggnewscale::new_scale_fill() +

  # --- non-additive boxes (incl baseline) + legend 2 (right) ---
  geom_boxplot(
    data = dplyr::filter(box_df, additive == "non-additive"),
    aes(fill = method),
    outlier.alpha = 0.25,
    linewidth = 0.35
  ) +
  scale_fill_manual(
    name   = "non-additive",
    values = my_colors,
    breaks = c(nonadd_methods, "baseline")  # baseline included here; appears last in x-axis
  ) +
  guides(fill = guide_legend(order = 2))

boxplot_figure_simu

Clustering experiment: partitioning around medoids

numsep =.1 and catsep=0.35 generation

Code

nrep <- 100
k_true <- 4
q <- 9
numsep <- 0.1
catsep <- 0.35
clustSizeEq <- 50

param_generator_grid <- tibble::tibble(
  numsignal = c(0,0,4,4,4,8,8,8),
  numnoise  = c(8,8,4,4,4,0,0,0),
  catsignal = c(4,8,0,4,8,0,4,8),
  catnoise  = c(4,0,8,4,0,8,4,0)
) |>
  tidyr::crossing(replicate = 1:nrep)

mixed_datasets_grid <- param_generator_grid |>
  dplyr::mutate(
    data = purrr::pmap(
      dplyr::pick(numsignal, catsignal, numnoise, catnoise, replicate),
      \(numsignal, catsignal, numnoise, catnoise, replicate) {
        gen_mixed(
          k_true      = k_true,
          clustSizeEq = clustSizeEq,
          numsignal   = numsignal,
          numnoise    = numnoise,
          catsignal   = catsignal,
          catnoise    = catnoise,
          q           = q,
          q_err       = q,
          numsep      = numsep,
          catsep      = catsep,
          seed        = replicate
        )
      }
    )
  )

methods <- c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")

sim_methods_grid <- mixed_datasets_grid |>
  tidyr::crossing(method = methods) |>
  dplyr::left_join(manydist:::mdist_method_lookup, by = "method")

stopifnot(!any(is.na(sim_methods_grid$mdist_type)))

ari_pam_results_01_035 <- sim_methods_grid |>
  dplyr::mutate(
    ari = purrr::pmap_dbl(
      dplyr::pick(data, mdist_type, mdist_preset, param_set),
      \(data, mdist_type, mdist_preset, param_set) {
        df <- data$df
        D <- run_mdist_within(df, mdist_type, mdist_preset, param_set, outcome = "y")
        pam_fit <- cluster::pam(D, k = k_true, diss = TRUE)
        mclust::adjustedRandIndex(df$y, pam_fit$clustering)
      }
    )
  )

Code

# Method groups (as you provided)
add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkss")
method_levels  <- c(add_methods, nonadd_methods)

# Facet order (row-wise)
scenario_order <- c("0844","0880","4408","8008","4444","4480","8044","8080")

ari_plot_df_01_035 <- ari_pam_results_01_035 |>
  mutate(
    # Used only to match facet order; preserves leading zeros
    scenario_id = sprintf("%01d%01d%01d%01d", numsignal, numnoise, catsignal, catnoise),

    scenario = str_glue(
      "num: {numsignal} s. + {numnoise} n. | cat: {catsignal} s. + {catnoise} n."
    ),
    # Row-wise facet order as provided
    scenario = factor(scenario, levels = scenario[match(scenario_order, scenario_id)]),

    # Method order
    method = factor(method, levels = method_levels),

    # Additive / Non-additive label
    method_type = if_else(as.character(method) %in% add_methods, "Additive", "Non-additive")
  )

# Background bands (no fill scale consumed)
k_add <- length(add_methods)
k_tot <- length(method_levels)

bg_add <- tibble(
  xmin = 0.5, xmax = k_add + 0.5,
  ymin = -Inf, ymax = Inf
)
bg_nonadd <- tibble(
  xmin = k_add + 0.5, xmax = k_tot + 0.5,
  ymin = -Inf, ymax = Inf
)

ari_plot_01_035 <- ggplot() +
  # Shaded regions
  geom_rect(
    data = bg_add,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    inherit.aes = FALSE,
    fill = "indianred",
    alpha = 0.12,
    show.legend = FALSE
  ) +
  geom_rect(
    data = bg_nonadd,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    inherit.aes = FALSE,
    fill = "dodgerblue",
    alpha = 0.12,
    show.legend = FALSE
  ) +

  # Additive boxplots + legend block
  geom_boxplot(
    data = ari_plot_df_01_035 |> filter(method_type == "Additive"),
    aes(x = method, y = ari, fill = method),
    outlier.alpha = 0.25,
    linewidth = 0.35
  ) +
  # Additive legend (comes first)
scale_fill_manual(
  name   = "Additive",
  values = my_colors[add_methods],
  breaks = add_methods,
  drop   = FALSE,
  guide  = guide_legend(order = 1)
) +

  ggnewscale::new_scale_fill() +

  # Non-additive boxplots + legend block
  geom_boxplot(
    data = ari_plot_df_01_035 |> filter(method_type == "Non-additive"),
    aes(x = method, y = ari, fill = method),
    outlier.alpha = 0.25,
    linewidth = 0.35
  ) +
 # Non-additive legend (comes second)
scale_fill_manual(
  name   = "Non-additive",
  values = my_colors[nonadd_methods],
  breaks = nonadd_methods,
  drop   = FALSE,
  guide  = guide_legend(order = 2)
) +
  facet_wrap(~ scenario, ncol = 2) +
  coord_cartesian(ylim = c(0, 1)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    strip.text  = element_text(size = 9),
    legend.text = element_text(size = 13)
  ) +
  labs(x = "", y = "ARI (PAM on dissimilarities)")

print(ari_plot_01_035)

Code

# ---- method groups ----
add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkss")

# ---- scenario order (shared with boxplot figure) ----
scenario_order <- c("0844","0880","4408","8008","4444","4480","8044","8080")

ari_parallel_df <- ari_pam_results_01_035 |>
  mutate(
    # key used for ordering
    scenario_id = sprintf(
      "%01d%01d%01d%01d",
      numsignal, numnoise, catsignal, catnoise
    ),

    # label shown on the x-axis
    scenario = stringr::str_glue(
      "n:({numsignal},{numnoise}); c:({catsignal},{catnoise})"
    )
  ) |>
  group_by(scenario_id, scenario, method) |>
  summarise(
    mean_ari = mean(ari, na.rm = TRUE),
    sd_ari   = sd(ari,   na.rm = TRUE),
    .groups  = "drop"
  ) |>
  mutate(
    # enforce scenario order explicitly
    scenario = factor(
      scenario,
      levels = scenario[match(scenario_order, scenario_id)]
    ),

    type = ifelse(method %in% add_methods, "additive", "non-additive"),
    method = factor(method, levels = c(add_methods, nonadd_methods))
  )

Code for `Figure 4`

Code

ari_parallel_plot <-
  ggplot() +

  # ================= ADDITIVE =================
  geom_errorbar(
    data = dplyr::filter(ari_parallel_df, type == "additive"),
    aes(
      x = scenario,
      ymin = mean_ari - sd_ari,
      ymax = mean_ari + sd_ari,
      color = method
    ),
    width = 0.12,
    linewidth = 0.35
  ) +
  geom_line(
    data = dplyr::filter(ari_parallel_df, type == "additive"),
    aes(
      x = scenario,
      y = mean_ari,
      color = method,
      shape = method,
      linetype = method,
      group = method
    ),
    linewidth = 0.6,
    alpha = .75
  ) +
  geom_point(
    data = dplyr::filter(ari_parallel_df, type == "additive"),
    aes(x = scenario, y = mean_ari, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name = "additive",
    values = my_colors,
    breaks = add_methods
  ) +
  scale_shape_manual(
    name = "additive",
    values = my_shapes,
    breaks = add_methods
  ) +
  scale_linetype_manual(
    name = "additive",
    values = setNames(rep("solid", length(add_methods)), add_methods),
    breaks = add_methods
  ) +
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 1),
    linetype = guide_legend(order = 1)
  ) +

  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  ggnewscale::new_scale("linetype") +

  # ================= NON-ADDITIVE =================
  geom_errorbar(
    data = dplyr::filter(ari_parallel_df, type == "non-additive"),
    aes(
      x = scenario,
      ymin = mean_ari - sd_ari,
      ymax = mean_ari + sd_ari,
      color = method
    ),
    width = 0.12,
    linewidth = 0.35
  ) +
  geom_line(
    data = dplyr::filter(ari_parallel_df, type == "non-additive"),
    aes(
      x = scenario,
      y = mean_ari,
      color = method,
      shape = method,
      linetype = method,
      group = method
    ),
    linewidth = 0.6,
    alpha = .75
  ) +
  geom_point(
    data = dplyr::filter(ari_parallel_df, type == "non-additive"),
    aes(x = scenario, y = mean_ari, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name = "non-additive",
    values = my_colors,
    breaks = nonadd_methods
  ) +
  scale_shape_manual(
    name = "non-additive",
    values = my_shapes,
    breaks = nonadd_methods
  ) +
  scale_linetype_manual(
    name = "non-additive",
    values = setNames(rep("solid", length(nonadd_methods)), nonadd_methods),
    breaks = nonadd_methods
  ) +
  guides(
    color = guide_legend(order = 2),
    shape = guide_legend(order = 2),
    linetype = guide_legend(order = 2)
  ) +

  labs(x = "scenario", y = "mean ARI (± 1 SD)") +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 30, hjust = 1),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  )

ari_parallel_plot

Code

ggsave(
  filename = "figure_parallel_ari.pdf",
  plot     = ari_parallel_plot,
  width    = 18,
  height   = 11,
  units    = "cm"
)

Code

load(file = "ari_pam_experiment_numsep_01_catsep_035.RData")

add_methods    <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
nonadd_methods <- c("naive","hl","gudmm","dkss")

scenario_order <- c("0844","0880","4408","8008","4444","4480","8044","8080")

ari_rank_df <- ari_pam_results_01_035 |>
  mutate(
    scenario_id = sprintf("%01d%01d%01d%01d", numsignal, numnoise, catsignal, catnoise),
    scenario = stringr::str_glue("n:({numsignal},{numnoise}); c:({catsignal},{catnoise})")
  ) |>
  group_by(scenario_id, scenario, method) |>
  summarise(
    mean_ari = mean(ari, na.rm = TRUE),
    .groups  = "drop"
  ) |>
  group_by(scenario_id, scenario) |>
  mutate(
    rank_desc = dplyr::dense_rank(dplyr::desc(mean_ari))  # 1 = best
  ) |>
  ungroup() |>
  mutate(
    # enforce scenario order (same as boxplot panels)
    scenario = factor(scenario, levels = scenario[match(scenario_order, scenario_id)]),

    type = ifelse(method %in% add_methods, "additive", "non-additive"),
    method = factor(method, levels = c(add_methods, nonadd_methods))
  )

Illustration: FIFA player data

Data preparation and analysis

Code

# Variables are organized: Categorical first, then numerical

data("fifa_nl", package = "manydist")
#Xorig<-X<-ddata[,-c(1,2)]
X<-fifa_nl[,-c(1,2,19:24)] # vars 1 and 2 are a bit useless (too many categories)
# vars 19:24 interesting but same scale and little variation
# leaving them out makes the plots a bit smaller.
# leaving them in doesn't seem to change a lot
n<-nrow(X)
nd<-2 # Dimensionality of MDS solution
X=X[,c(8:1,9:16)]  # Order
X=X[,-3]  # Remove int rep: 380 have something, 21+7 in other cats

X=X[,c(1:12,14,13,15)]
p<-ncol(X)
X[8:15] <- lapply(X[8:15], as.numeric)
fifa_clean <- as_tibble(X)  |> 
  select(where(is.integer), where(is.numeric),where(is.factor) ,-release_clause_eur) |> 
  rename(pref_foot=`preferred_foot`,
         club=`club_name`,
         position=`team_position`)

Code

fifa_structure <- tibble::tibble(
  method = c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")
) |>
  dplyr::left_join(manydist:::mdist_method_lookup, by = "method") |>
  dplyr::mutate(
    loo_results = purrr::pmap(
      dplyr::pick(method, mdist_type, mdist_preset, param_set),
      \(method, mdist_type, mdist_preset, param_set) {
        run_lovo(
          df = fifa_clean,
          mdist_type = mdist_type,
          mdist_preset = mdist_preset,
          param_set = param_set,
          outcome = NULL,
          dims = nd,
          keep_dist = FALSE,
          legacy_gower_sum = (method == "gower")  # triggers only for preset+gower anyway
        )
      }
    )
  )

Code

results_fifa_wide <- fifa_structure |>
  dplyr::mutate(loo_tbl = purrr::map(loo_results, "results")) |>
  dplyr::select(method, loo_tbl) |>
  tidyr::unnest(loo_tbl) |>
  tidyr::pivot_wider(
    id_cols    = variable,
    names_from = method,
    values_from = c(mad_importance, cc_importance, ac_importance, mad_normalized)
  )


measure_names <- c(
  "mad_importance",
  "cc_importance",
  "ac_importance",
  "mad_normalized"
)

fifa_split_by_measure <- purrr::map(
  measure_names,
  \(measure) {
    results_fifa_wide |>
      dplyr::select(variable, dplyr::starts_with(paste0(measure, "_"))) |>
      dplyr::rename_with(~ stringr::str_remove(.x, paste0(measure, "_")))
  }
) |>
  rlang::set_names(measure_names)

Code for `Figure 5`

Code

meas_to_plot_mad = fifa_split_by_measure |> pluck("mad_importance") |>
  mutate(measure = "mad_importances") |> slice(c(8:14, 1:7)) |> mutate(var_id = 1:n())

mad_max = max(meas_to_plot_mad |> select(where(is.numeric), -var_id))

rect_data = tibble(
  xmin_cont = 8.5, xmax_cont = 14, ymin_cont = 0, ymax_cont = mad_max,
  xmin_cat  = .5,  xmax_cat  = 8.5, ymin_cat  = 0, ymax_cat  = mad_max
)

meas_to_plot_mad_rel = fifa_split_by_measure |> pluck("mad_normalized") |>
  mutate(measure = "mad_normalized") |> slice(c(8:14, 1:7)) |> mutate(var_id = 1:n())

mad_max_rel = max(meas_to_plot_mad_rel |> select(where(is.numeric), -var_id))

rect_data_rel = tibble(
  xmin_cont = 8.5, xmax_cont = 14, ymin_cont = 0, ymax_cont = mad_max_rel,
  xmin_cat  = .5,  xmax_cat  = 8.5, ymin_cat  = 0, ymax_cat  = mad_max_rel
)

method_levels <- c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")
additive_set  <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")
conflict_prefer("intersect", "base")
conflict_prefer("setdiff", "base")
add_present    <- intersect(additive_set, method_levels)
nonadd_present <- setdiff(method_levels, additive_set)

# ---------- TOP: absolute contributions ----------
loo_mad_long <- meas_to_plot_mad |>
  pivot_longer(
    cols = any_of(method_levels),
    names_to = "method",
    values_to = "value"
  ) |>
  mutate(
    method   = factor(method, levels = method_levels),
    additive = ifelse(method %in% additive_set, "additive distances", "non-additive distances"),
    type     = ifelse(method %in% additive_set, "Additive", "Non-additive")
  )

loo_mad_plot <-
  ggplot() +
  geom_rect(
    data = rect_data,
    aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont),
    alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE
  ) +
  geom_rect(
    data = rect_data,
    aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat),
    alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE
  ) +
  # Additive
  geom_line(
    data = filter(loo_mad_long, type == "Additive"),
    aes(x = var_id, y = value, color = method, linetype = method, group = method),
    alpha = 0.85
  ) +
  geom_point(
    data = filter(loo_mad_long, type == "Additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "Additive",
    values = my_colors,
    breaks = add_present,
    drop   = FALSE
  ) +
  scale_shape_manual(
    name   = "Additive",
    values = my_shapes,
    breaks = add_present,
    drop   = FALSE
  ) +
  scale_linetype_manual(
    name   = "Additive",
    values = setNames(rep("solid", length(add_present)), add_present),
    breaks = add_present,
    drop   = FALSE
  ) +
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 1),
    linetype = guide_legend(order = 1)
  ) +
  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  ggnewscale::new_scale("linetype") +
  # Non-additive
  geom_line(
    data = filter(loo_mad_long, type == "Non-additive"),
    aes(x = var_id, y = value, color = method, linetype = method, group = method),
    alpha = 0.85
  ) +
  geom_point(
    data = filter(loo_mad_long, type == "Non-additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "Non-additive",
    values = my_colors,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  scale_shape_manual(
    name   = "Non-additive",
    values = my_shapes,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  scale_linetype_manual(
    name   = "Non-additive",
    values = setNames(rep("solid", length(nonadd_present)), nonadd_present),
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  guides(
    color = guide_legend(order = 2),
    shape = guide_legend(order = 2),
    linetype = guide_legend(order = 2)
  ) +
  ylab("absolute variable contribution to distance") +
  theme_bw() +
  scale_x_continuous(breaks = seq(1, 14, 1), labels = rep("", 14)) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "none"
  ) +
  xlab("") +
  facet_wrap(. ~ additive)

# ---------- BOTTOM: relative contributions ----------
loo_mad_rel_long <- meas_to_plot_mad_rel |>
  pivot_longer(
    cols = any_of(method_levels),
    names_to = "method",
    values_to = "value"
  ) |>
  mutate(
    method   = factor(method, levels = method_levels),
    additive = ifelse(method %in% additive_set, "additive distances", "non-additive distances"),
    type     = ifelse(method %in% additive_set, "Additive", "Non-additive")
  )

loo_mad_rel_plot <-
  ggplot() +
  geom_rect(
    data = rect_data_rel,
    aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont),
    alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE
  ) +
  geom_rect(
    data = rect_data_rel,
    aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat),
    alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE
  ) +
  # Additive
  geom_line(
    data = filter(loo_mad_rel_long, type == "Additive"),
    aes(x = var_id, y = value, color = method, linetype = method, group = method),
    alpha = 0.85
  ) +
  geom_point(
    data = filter(loo_mad_rel_long, type == "Additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "Additive",
    values = my_colors,
    breaks = add_present,
    drop   = FALSE
  ) +
  scale_shape_manual(
    name   = "Additive",
    values = my_shapes,
    breaks = add_present,
    drop   = FALSE
  ) +
  scale_linetype_manual(
    name   = "Additive",
    values = setNames(rep("solid", length(add_present)), add_present),
    breaks = add_present,
    drop   = FALSE
  ) +
  guides(
    color = guide_legend(order = 1),
    shape = guide_legend(order = 1),
    linetype = guide_legend(order = 1)
  ) +
  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  ggnewscale::new_scale("linetype") +
  # Non-additive
  geom_line(
    data = filter(loo_mad_rel_long, type == "Non-additive"),
    aes(x = var_id, y = value, color = method, linetype = method, group = method),
    alpha = 0.85
  ) +
  geom_point(
    data = filter(loo_mad_rel_long, type == "Non-additive"),
    aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  scale_color_manual(
    name   = "Non-additive",
    values = my_colors,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  scale_shape_manual(
    name   = "Non-additive",
    values = my_shapes,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  scale_linetype_manual(
    name   = "Non-additive",
    values = setNames(rep("solid", length(nonadd_present)), nonadd_present),
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  guides(
    color = guide_legend(order = 2),
    shape = guide_legend(order = 2),
    linetype = guide_legend(order = 2)
  ) +
  ylab("relative variable contribution to distance") +
  theme_bw() +
  scale_x_continuous(breaks = seq(1, 14, 1), labels = meas_to_plot_mad_rel$variable) +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    strip.text  = element_blank(),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = element_text(size = 13)
  ) +
  xlab("") +
  facet_wrap(. ~ additive)

# ---------- COMBINE ----------
fig_4_plot <- loo_mad_plot / loo_mad_rel_plot
fig_4_plot

Code for `Figure 6`

Code

method_levels <- c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")
additive_set  <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix")

meas_to_plot_ac <- fifa_split_by_measure |>
  purrr::pluck("ac_importance") |>
  dplyr::mutate(measure = "ac_importance") |>
  dplyr::slice(c(8:14, 1:7)) |>
  dplyr::mutate(var_id = dplyr::row_number())

max_ac <- meas_to_plot_ac |>
  dplyr::select(where(is.numeric), -var_id) |>
  unlist(use.names = FALSE) |>
  max(na.rm = TRUE)

rect_data_ac <- tibble::tibble(
  xmin_cont = 7.5, xmax_cont = 14,  ymin_cont = 0, ymax_cont = max_ac,
  xmin_cat  = 0.5, xmax_cat  = 7.5, ymin_cat  = 0, ymax_cat  = max_ac
)

ac_long <- meas_to_plot_ac |>
  tidyr::pivot_longer(
    cols = dplyr::any_of(method_levels),
    names_to = "method",
    values_to = "value"
  ) |>
  dplyr::mutate(
    method   = factor(method, levels = method_levels),
    additive = ifelse(method %in% additive_set, "additive distances", "non-additive distances"),
    type     = ifelse(method %in% additive_set, "Additive", "Non-additive")
  )

add_present    <- intersect(additive_set, unique(as.character(ac_long$method)))
nonadd_present <- intersect(setdiff(method_levels, additive_set), unique(as.character(ac_long$method)))

ac_plot <-
  ggplot2::ggplot() +
  ggplot2::geom_rect(
    data = rect_data_ac,
    ggplot2::aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont),
    alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE
  ) +
  ggplot2::geom_rect(
    data = rect_data_ac,
    ggplot2::aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat),
    alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE
  ) +

  # ================= ADDITIVE =================
  ggplot2::geom_line(
    data = dplyr::filter(ac_long, type == "Additive"),
    ggplot2::aes(x = var_id, y = value, color = method, linetype = method, group = method),
    alpha = 0.85
  ) +
  ggplot2::geom_point(
    data = dplyr::filter(ac_long, type == "Additive"),
    ggplot2::aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  ggplot2::scale_color_manual(
    name   = "Additive",
    values = my_colors,
    breaks = add_present,
    drop   = FALSE
  ) +
  ggplot2::scale_shape_manual(
    name   = "Additive",
    values = my_shapes,
    breaks = add_present,
    drop   = FALSE
  ) +
  ggplot2::scale_linetype_manual(
    name   = "Additive",
    values = stats::setNames(rep("solid", length(add_present)), add_present),
    breaks = add_present,
    drop   = FALSE
  ) +
  ggplot2::guides(
    color = ggplot2::guide_legend(order = 1),
    shape = ggplot2::guide_legend(order = 1),
    linetype = ggplot2::guide_legend(order = 1)
  ) +

  ggnewscale::new_scale_color() +
  ggnewscale::new_scale("shape") +
  ggnewscale::new_scale("linetype") +

  # ================= NON-ADDITIVE =================
  ggplot2::geom_line(
    data = dplyr::filter(ac_long, type == "Non-additive"),
    ggplot2::aes(x = var_id, y = value, color = method, linetype = method, group = method),
    alpha = 0.85
  ) +
  ggplot2::geom_point(
    data = dplyr::filter(ac_long, type == "Non-additive"),
    ggplot2::aes(x = var_id, y = value, color = method, shape = method),
    size = 2
  ) +
  ggplot2::scale_color_manual(
    name   = "Non-additive",
    values = my_colors,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  ggplot2::scale_shape_manual(
    name   = "Non-additive",
    values = my_shapes,
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  ggplot2::scale_linetype_manual(
    name   = "Non-additive",
    values = stats::setNames(rep("solid", length(nonadd_present)), nonadd_present),
    breaks = nonadd_present,
    drop   = FALSE
  ) +
  ggplot2::guides(
    color = ggplot2::guide_legend(order = 2),
    shape = ggplot2::guide_legend(order = 2),
    linetype = ggplot2::guide_legend(order = 2)
  ) +

  ggplot2::ylab("alienation coefficient") +
  ggplot2::theme_bw() +
  ggplot2::scale_x_continuous(
    breaks = seq(1, 14, 1),
    labels = meas_to_plot_ac$variable
  ) +
  ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 60, hjust = 1),
    legend.position = "bottom",
    legend.title.position = "top",
    legend.text = ggplot2::element_text(size = 13)
  ) +
  ggplot2::xlab("") +
  ggplot2::facet_wrap(. ~ additive)

ac_plot

Appendix D Categorical mean distances: skewed distributions

Code

LeHo<-function(X){  #  Returns the LeHo Distance (symmetric Kullback-Leibler)
                    #  X is matrix with conditional probabilities
#if(any(rowSums(X)!=1)){
#  print("Input must be conditional probs (row sums should be 1)")
#} else {
  
require(philentropy)

n<-nrow(X)
D<-matrix(0,n,n)
for (i in 1:(n-1)){
  for (j in (i+1):n){
    D[j,i]<-D[i,j]<-distance(X[c(i,j),],"kullback-leibler",unit="log2")+
      distance(X[c(j,i),],"kullback-leibler",unit="log2")}

}

return(D)
}

Data generation

Code

# Let's consider some specific unbalanced scenarios
# One probility is d. The others are (1-d)/(q-1)
# We consider skewed options:
dis<-c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)

#qi=5
qs<-c(2,3,5,10)#,18) # Some options for qi

nint<-length(dis)    # Number of options for d (nint-1)
rsm<-rsma<-rhl<-sdsm<-rse<-rsof<-rsiof<-sdse<-rsa<-sdsa<-sdsof<-sdsiof<-sddsma<-sdssm<-
  sddse<-sddsa<-sddsm<-sdhl<-sddsmca<-sddsof<-sddsiof<-sddshl<- sdshl<-
  rsmca<-sdsmca<-esm<-em<-ee<-ea<-ehl<-emca<-eof<-eiof<-matrix(NA,nrow=nint,ncol=length(qs))

#par(mfrow=c(2,2))
for (qis in 1:length(qs)){
  #ds<-NULL
  qi<-qs[qis]
  for (i in (1:nint)){ # variant 2

    Deltam<-matrix(1,qi,qi)    # Matching
    diag(Deltam)<-rep(0,qi)    # Matching Delta

    #d=i/(2*qi)  # i=2 corresponds to equal probs if variant 1 is chosen. (Only works for one qi. Not for qs loop)
    d=dis[i]  # variant 2

    #ds<-rbind(ds,d)
    p<-c(d,rep((1-d)/(qi-1),(qi-1)))
    sum(p)

    Dp<-diag(p)
    pp<-p %*% t(p)
    spi<-(diag(Dp-pp))^-1
    spi5<-(diag(Dp-pp))^-.5

    DEsk<-(2/(qi^2))*Deltam # Delta Eskin

    DOF<-log(p)%*%t(log(p)) * Deltam  # OF
    n=160 # For inv OF we need n. Soon gets very large (increases in n) 160 is sufficient
    #n<-ceiling(1/min(p))
    DIOF<-log(n*p)%*%t(log(n*p)) * Deltam  # inverse OF. If some np<1 we can get negatives

    n=160  # For HL we need to find a way to get somewhat consistent values. Let's choose n large
    marginals=p*n

    HLemp<-distancefactor(cat=qi,n=n,catsizes=marginals) #
    DHL<-Deltam*HLemp*2#*sqrt(2)  # NOTE 13/09: Corrected: Was sqrt(2). But, should be, see paper, 2x
    #DE<-sqrt((1/qi)*(spi %*% t(rep(1,qi)) + t(spi %*% t(rep(1,qi)))))
    #diag(DE)<-rep(0,qi)

    DM<-sqrt(1/qi)*(spi5 %*% t(rep(1,qi)) + t(spi5 %*% t(rep(1,qi))))
    diag(DM)<-rep(0,qi)

    DA<-(1/qi)*diag(spi5)%*%Deltam%*%diag(spi5)
    DMCA<-(1/qi) * diag(p^-.5)%*%Deltam%*%diag(p^-.5)

    #we<-ee[i,(qis)]<- t(p)%*%DE%*%p

    wsm<-esm[i,(qis)]<- t(p)%*%Deltam%*%p   # Simple Matching
    we<-ee[i,(qis)]<- t(p)%*%DEsk%*%p       # Eskin
    wof<-eof[i,(qis)]<-t(p)%*%DOF%*%p       # OF
    wiof<-eiof[i,(qis)]<-t(p)%*%DIOF%*%p    # IOF
    whl<-ehl[i,(qis)]<-t(p)%*%DHL%*%p       # HL
    wm<-em[i,(qis)]<- t(p)%*%DM%*%p         # SD scaling
    wa<-ea[i,(qis)]<- t(p)%*%DA%*%p         # Cat dis scaling
    wmca<-emca[i,(qis)]<- t(p)%*%DMCA%*%p   # MCA scaling

    # We can also estimate the variance among the distances using Deltas: E(X^2)-E(X)^2
    # We can do this dividing by expected values so that we look at what happens
    # when delta is commensurable. Or not
    # if not:

    #we<-wm<-wa<-wmca<-wof<-wiof<-whl<-wsm<-1

    sddse[i,(qis)]<- sqrt(t(p)%*%DEsk^2%*%p - ee[i,(qis)]^2)/we  # Eskin
    sddsma[i,(qis)]<- sqrt(t(p)%*%Deltam^2%*%p - esm[i,(qis)]^2)/ wsm  # Simple Matching
    sddshl[i,(qis)]<- sqrt(t(p)%*%DHL^2%*%p - ehl[i,(qis)]^2)/ whl  # HL
    sddsm[i,(qis)]<- sqrt(t(p)%*%DM^2%*%p - em[i,(qis)]^2)/ wm  # SD
    sddsa[i,(qis)]<- sqrt(t(p)%*%DA^2%*%p -  ea[i,(qis)]^2)/wa # Cat dissim
    sddsmca[i,(qis)]<- sqrt(t(p)%*%DMCA^2%*%p - emca[i,(qis)]^2)/wmca  # MCA
    sddsof[i,(qis)]<-sqrt(t(p)%*%DOF^2%*%p - eof[i,(qis)]^2)/wof  # OF
    sddsiof[i,(qis)]<-sqrt(t(p)%*%(DIOF/as.numeric(wiof))^2%*%p - 1 )  #IOF

    # Let's consider spread among cat dissimilarities. But: This only makes sense
    # when on similar scales. So, use commensurable deltas: (Or not: see above)
    sdse[i,qis]<-sd((1/we)*DEsk[upper.tri(DEsk)])  # Eskin
    sdsm[i,qis]<-sd((1/wm)*DM[upper.tri(DM)]) # SD
    sdshl[i,qis]<-sd((1/whl)*DHL[upper.tri(DHL)]) # HL
    sdssm[i,qis]<-sd((1/wsm)*Deltam[upper.tri(Deltam)]) # Simple Matching
    sdsa[i,qis]<-sd((1/wa)*DA[upper.tri(DA)]) #  Cat dissim,
    sdsmca[i,qis]<-sd((1/wmca)*DMCA[upper.tri(DMCA)]) # MCA
    sdsof[i,qis]<-sd((1/wof)*DOF[upper.tri(DOF)])  # OF
    sdsiof[i,qis]<-sd((1/wiof)*DIOF[upper.tri(DIOF)]) # IOF



  }

}

cats<-as.character(qs)
skew<-as.character(dis)#as.character(1:nint)

Code for `Figure 7`

Code

sk_distance_to_plot = function(exp_mean_dist,distance_name){
  exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |>
    rename(`2 cats` = V1,
           `3 cats` = V2,
           `5 cats` = V3,
           `10 cats` = V4) |>
    mutate(method=distance_name) |>
    pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`),
                 names_to = "Number of categories", values_to = "Mean distance")

  return(exp_mean_dist)
}

all_exp_mean_dist=rbind(sk_distance_to_plot(esm,"Simple Matching"),
                        sk_distance_to_plot(ee,"Eskin"),
                        sk_distance_to_plot(eof,"OF"),
                        sk_distance_to_plot(eiof,"IOF")
)

paper_plot_skew1=all_exp_mean_dist |>
  mutate(`Number of categories`=fct_inorder(`Number of categories`),
         method=fct_inorder(method),) |>
  ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
  geom_line() + geom_point() + facet_wrap(~method,scales = "free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
  xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")


print(paper_plot_skew1,width=10,height=6)

Code for `Figure 8`

Code

all_exp_mean_dist2=rbind(sk_distance_to_plot(ehl,"Hennig-Liao"),
                         sk_distance_to_plot(em,"Std. dev."),
                         sk_distance_to_plot(ea,"Cat. dissim.")
)

paper_plot_skew2=all_exp_mean_dist2 |>
  mutate(`Number of categories`=fct_inorder(`Number of categories`),
         method=fct_inorder(method),) |>
  ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
  geom_line() + geom_point() + facet_wrap(~method) + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
  xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")


print(paper_plot_skew2)

Code for `Figure 9`

Code

qs<-c(2,3,5,10)#,18) # Some options for qs

nint<-length(dis)    # Number of options for d (nint-1)

CVchd<-CVtvd<-CVmsd<-CVlhd<-Echi<-Echd<-Etvi<-Etvd<-Emsi<-Elhi<-Elhd<-Emsd<-array(NA,dim=c(length(qs),length(qs),nint,nint))

for (q1 in 1:length(qs)){  # cats row
  qi<-qs[q1]
  for (q2 in q1:length(qs)){ # cats cols
    qj<-qs[q2]
    for (pi in 1:nint){  # row distributions
      d1<-dis[pi]
      p1<-c(d1,rep((1-d1)/(qj-1),(qj-1)))  # nint different marginals: Distribution over colums
      for (pj in 1:nint){
        d2<-dis[pj]
        p2<-c(d2,rep((1-d2)/(qi-1),(qi-1)))  # corresponding col marginals: Distribution over rows
        Pi<-p2 %*% t(p1)   # joint table independent
        Ri<-Pi/rowSums(Pi) # conditional table independent
        #  Ci<-Ri/sqrt(p2)

        if (qj>qi) {
          Pd<-diag(p1[1:qi])
          Addzeros<-matrix(0,(qi-1),(qj-qi))
          Addps<-p1[qi+1:(qj-qi)]
          Add<-rbind(Addzeros,Addps)
          Pd<-cbind(Pd,Add)
        } else { #qi=qj
          pd<-p2
          Pd<-diag(pd)       # joint table dependent case (diagonal smallest q)
        }
        Rd<-Pd/rowSums(Pd)
        Cd<-Rd/sqrt(p2)    # Chi-square table: cond. prob/sqrt(p)

        # category dissimilarities:
        Dlhd<-LeHo(Rd)                   #  Le Ho
        # Dmsd<-MousaviSehhati(Pd)         # Moussavi
        Dtvd<-dist(Rd,"manhattan")/2     # TVD
        Dchd<-dist(Cd,"euclidean")^2     # Chi2 dist
        #    Dchd<-Dchd/(2*(qi-1))     # Chi2 dist corrected for number of categories...

        Elhd[q1,q2,pi,pj]<-t(pd) %*%Dlhd %*%pd
        # Emsd[q1,q2,pi,pj]<-t(pd) %*%Dmsd %*%pd
        Etvd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dtvd) %*%pd
        Echd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dchd) %*%pd

        # Coefficient of variation

        CVlhd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dlhd^2 %*%pd - (t(pd) %*%Dlhd %*%pd)^2)/(t(pd) %*%Dlhd %*%pd)
        # CVmsd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dmsd^2 %*%pd - (t(pd) %*%Dmsd %*%pd)^2)/(t(pd) %*%Dmsd %*%pd)
        CVtvd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dtvd))^2 %*%pd - (t(pd) %*%as.matrix(Dtvd) %*%pd)^2)/(t(pd) %*%as.matrix(Dtvd) %*%pd)
        CVchd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dchd))^2 %*%pd - (t(pd) %*%as.matrix(Dchd) %*%pd)^2)/(t(pd) %*%as.matrix(Dchd) %*%pd)

        Dlhi<-LeHo(Ri)
        # Dmsi<-MousaviSehhati(Pi)
        Dtvi<-dist(Ri,"manhattan")/2
        #  Dchi<-dist(Ci,"euclidean")^2

        Elhi[q1,q2,pi,pj]<-t(p2) %*%Dlhi %*%p2
        # Emsi[q1,q2,pi,pj]<-t(p2) %*%Dmsi %*%p2
        Etvi[q1,q2,pi,pj]<-t(p2) %*%as.matrix(Dtvi) %*%p2
        #  Echi[q1,q2,pi,pj]<-t(p1) %*%as.matrix(Dchi) %*%p1
      }
    }
  }
}



Elhs<-as.data.frame(rbind(Elhd[1,1,1,],Elhd[2,2,1,],Elhd[3,3,1,],Elhd[4,4,1,]))
Etvs<-as.data.frame(rbind(Etvd[1,1,1,],Etvd[2,2,1,],Etvd[3,3,1,],Etvd[4,4,1,]))
Emss<-as.data.frame(rbind(Emsd[1,1,1,],Emsd[2,2,1,],Emsd[3,3,1,],Emsd[4,4,1,]))
Echs<-as.data.frame(rbind(Echd[1,1,1,],Echd[2,2,1,],Echd[3,3,1,],Echd[4,4,1,]))

CVlhs<-as.data.frame(rbind(CVlhd[1,1,1,],CVlhd[2,2,1,],CVlhd[3,3,1,],CVlhd[4,4,1,]))
CVtvs<-as.data.frame(rbind(CVtvd[1,1,1,],CVtvd[2,2,1,],CVtvd[3,3,1,],CVtvd[4,4,1,]))
CVmss<-as.data.frame(rbind(CVmsd[1,1,1,],CVmsd[2,2,1,],CVmsd[3,3,1,],CVmsd[4,4,1,]))
CVchs<-as.data.frame(rbind(CVchd[1,1,1,],CVchd[2,2,1,],CVchd[3,3,1,],CVchd[4,4,1,]))

cats<-as.character(qs)
skew<-as.character(dis)#as.character(1:nint)


sk_distance_to_plot = function(exp_mean_dist,distance_name){
  exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |>
    rename(`2 cats` = V1,
           `3 cats` = V2,
           `5 cats` = V3,
           `10 cats` = V4) |>
    mutate(method=distance_name) |>
    pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`),
                 names_to = "Number of categories", values_to = "Mean distance")

  return(exp_mean_dist)
}


all_exp_mean_dist3=rbind(sk_distance_to_plot(t(as.matrix(Elhs)),"Le-Ho"),
                         sk_distance_to_plot(t(as.matrix(Etvs)),"Total Variance Distance")
)

paper_plot_skew3=all_exp_mean_dist3 |>
  mutate(`Number of categories`=fct_inorder(`Number of categories`),
         method=fct_inorder(method),) |>
  ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
  geom_line() + geom_point() + facet_wrap(~method,scales="free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
  xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")


print(paper_plot_skew3,width=8,height=4)

Appendix E FIFA variable distributions

Code for `Figure 10`

Code

# load("data/Fifa21NL.RData") # Load the data
data(fifa_nl,package="manydist")
#Xorig<-X<-ddata[,-c(1,2)]
Xorig<-X<-fifa_nl[,-c(1,2,19:24)] # vars 1 and 2 are a bit useless (too many categories)
# vars 19:24 interesting but same scale and little variation
# leaving them out makes the plots a bit smaller.
# leaving them in doesn't seem to change a lot


n<-nrow(X)
# p<-ncol(X)
K<-10  # Number of clusters in cluster analysis
nd<-2 # Dimensionality of MDS solution
X=X[,c(8:1,9:16)]  # Order
Xorig=Xorig[,c(8:1,9:16)]  # Order
X=X[,-3]  # Remove int rep: 380 have something, 21+7 in other cats
Xorig=Xorig[,-3]
X=X[,c(1:12,14,13,15)]
Xorig=Xorig[,c(1:12,14,13,15)]
p<-ncol(X)

# First convert integers to numeric manually
X[8:15] <- lapply(X[8:15], as.numeric)

# reordering / numeric first
X <- X %>%
  select(where(is.integer), where(is.numeric), where(is.factor))


X_tib_num = X |> as_tibble() |> select(where(is.numeric))

X_tib_cat = X |> as_tibble() |> select(where(is.factor))

levels(X_tib_cat$work_rate)=c("H/H", "H/L", "H/M", "L/H", "L/L", "L/M", "M/H", "M/L", "M/M")
levels(X_tib_cat$club_name) = c("ADO","Ajax","AZ","Emmen","Groningen","Twente","Utrecht",
                                "Feyenoord","Fortuna","Heracles","PEC","PSV","RKC","Heerenveen",
                                "Sparta","Vitesse","VVV","Willem II")

X_tib_cat=X_tib_cat |> rename(`club`=club_name,
                              `position`=team_position)

X_tib_num_nest =  tibble(values=X_tib_num |> map((~.x)),features=names(X_tib_num))|>
  mutate(
    hplot=map2(.x=values,.y=features,
               function(x=.x,y=.y){
                 my_tib=tibble(a=x)
                 names(my_tib) = y

                 my_plot= my_tib |> ggplot(aes(x)) + geom_histogram(bins=15,alpha=.75)+theme_bw()+ggtitle(y)+
                   theme(plot.title = element_text(hjust = 0.5))+xlab("")+ylab("")
               }
    )
  )


X_tib_cat_nest =  tibble(values=X_tib_cat |> map((~.x)),features=names(X_tib_cat))|>
  mutate(
    bplot=map2(.x=values,.y=features,
               function(x=.x,y=.y){
                 my_tib=tibble(a=x)
                 names(my_tib) = y

                 my_plot= my_tib |> ggplot(aes(x)) + geom_bar(alpha=.75)+theme_bw()+ggtitle(y)+
                   theme(plot.title = element_text(hjust = 0.5),
                         axis.text.x =element_text(angle=90 ))+xlab("")+ylab("")
               }
    )
  )


fifa_descriptive_plots=(X_tib_cat_nest$bplot[[1]]/
                          X_tib_cat_nest$bplot[[2]]/
                          X_tib_cat_nest$bplot[[3]]/
                          X_tib_cat_nest$bplot[[4]]/
                          X_tib_cat_nest$bplot[[5]]/
                          X_tib_cat_nest$bplot[[6]]/
                          X_tib_cat_nest$bplot[[7]])|
  (X_tib_num_nest$hplot[[1]]/
     X_tib_num_nest$hplot[[2]]/
     X_tib_num_nest$hplot[[3]]/
     X_tib_num_nest$hplot[[4]]/
     X_tib_num_nest$hplot[[5]]/
     X_tib_num_nest$hplot[[6]]/
     X_tib_num_nest$hplot[[7]])#+ plot_layout(guides = 'collect')


print(fifa_descriptive_plots)

References

Ghashti, J. S., and Thompson, J. R. J. (2024), “Mixed-type distance shrinkage and selection for clustering via kernel metric learning,” Journal of Classification. https://doi.org/https://doi.org/10.1007/s00357-024-09493-z.

Hennig, C., and Liao, T. F. (2013), “How to Find an Appropriate Clustering for Mixed-Type Variables with Application to Socio-Economic Stratification,” Journal of the Royal Statistical Society Series C: Applied Statistics, 62, 309–369.

Liu, P., Yuan, H., Ning, Y., Chakraborty, B., Liu, N., and Peres, M. A. (2024), “A modified and weighted gower distance-based clustering analysis for mixed type data: A simulation and empirical analyses,” BMC Medical Research Methodology. https://doi.org/https://doi.org/10.1186/s12874-024-02427-8.

Mousavi, E., and Sehhati, M. (2023), “A generalized multi-aspect distance metric for mixed-type data clustering,” Pattern Recognition, 138, 109353.

--- title: "Unbiased mixed-variable distance" subtitle: "supplementary material" format: html: theme: minty toc: true toc-location: left toc-expand: 2 page-layout: full code-fold: true code-tools: true embed-resources: true callout-icon: false bibliography: unbiased_dist_refs.bib csl: asa.csl execute: collapse: true message: false warning: false --- ```{r setup} #| include: true packages <- c( "aricode", "cluster", "fpc", "manydist", "patchwork", "tidyverse", "tidymodels", "varhandle", "vegan", "kdml", "ggplot2", "viridis", "dplyr", "entropy", "kernlab", "mclust", "Matrix", "philentropy", "conflicted", "ggnewscale", "glue" ) invisible(lapply(packages, require, character.only = TRUE)) tidymodels_prefer() conflict_prefer("select", "dplyr") conflict_prefer("filter", "dplyr") conflict_prefer("distance", "philentropy") # prefer base set operations (if needed later) conflict_prefer("intersect", "base") conflict_prefer("setdiff", "base") ``` Here we provide the code to reproduce the results of the supplementary material of the paper "Unbiased mixed variables distances". ### Data generation These are the simulation parameers ```{r,eval=TRUE} reps<-100 # 100 can take 30min n<-500 k<-2 # Dimension solution sigma<-0.03 # Noise porig<-6 # Number of variables corresponding to true config pnnoise<-0 # Number of numerical noise vars pcnoise<-0 # Number of categorical noise vars pn<-2 # Number of numerical variables underying config pnum<-pn+pnnoise+pcnoise # Total number of numerical vars+noise vars (Needed to know which vars to discretize) p<-porig+pnnoise+pcnoise # Total number of variables qoptions<-c(2,3,5,9) # Distribution of the categorical variables corresponding to true config pcat<-length(qoptions) # Number of cat variables underlying config methods = c("baseline","naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix") method_levels <- c("baseline", "naive", "hl", "hl_add", "gower","gudmm", "dkss", "mod_gower", "u_ind", "u_dep", "u_mix") my_colors <- c( # Set B "baseline" = "#8C564B", # brown "naive" = "#E377C2", # pink "hl" = "#9C9E00", # darker yellow-green (better contrast) "gudmm" = "#FF7F0E", # orange "dkss" = "#7F7F7F", # darker grey (more visible) # Set A "hl_add" = "#9EC5E5", # light blue "u_ind" = "#2CA02C", # green "u_dep" = "#D62728", # red "u_mix" = "#1E90FF", "gower" = "#FFBF00", # gold "mod_gower" = "#9467BD" # purple ) my_shapes <- c( "baseline" = 16, # filled circle "naive" = 17, # filled triangle "hl" = 15, # filled square "hl_add" = 3, # plus "gower" = 4, # cross "gudmm" = 18, # filled diamond "dkss" = 8, # star "mod_gower" = 0, # empty square "u_ind" = 1, # empty circle "u_dep" = 2, # empty triangle "u_mix" = 5 # empty diamond ) ``` The considered methods are #### additive distances - `baseline`: Manhattan distance on all the standardized numerical variables (before discretization) - `gower`: classic Gower distance - `mod_gower`: modified version of the Gower distance, as proposed in @LYNCLP:2024 - `hl_add`: Manhattan distance with scaling of indicators as proposed by @HennigLiao2013 - `u_ind`: commensurable with simple matching - `u_dep`: commensurable PCA-scaled numerical and total-variance categorical - `u_mix`: commensurable Manhattan numerical and total-variance categorical #### non-additive distances - `naive`: Euclidean distance on all variables, with one-hot encoded categorical variables. - `hl`: Euclidean distance with scaling of indicators as proposed by @HennigLiao2013 - `gudmm`: Generalized Unified Distance Metric for Mixed-type data as proposed by @MS:2023 - `dkss`: Distance using kernel product similarity as proposed by @GT:2024  ### Compute distances: leave one variable out ```{r, eval=FALSE} library(progress) pb <- progress_bar$new( format = " Simulating [:bar] :percent eta: :eta", total = reps * length(methods), clear = FALSE, width = 60 ) # methods = c("baseline","naive","gower","hl","u_dep","u_ind","gudmm","dkss","mod_gower") generated_datasets = tibble(replicate=1:reps) |> mutate(dataset = map(.x=replicate, ~generate_dataset(n = n, pn=pn, porig=porig, pnnoise =pnnoise , pcnoise = pcnoise, sigma = sigma, qoptions = qoptions, seed = 1234+.x)), method=map(.x=replicate,~methods) ) |> unnest(method)|> left_join(mdist_method_lookup, by = "method") ``` ```{r, eval=FALSE} ## a little helper for mdist run_mdist_within <- function(df, mdist_type, mdist_preset, param_set, outcome = "response") { x <- df if (is.character(outcome) && length(outcome) == 1 && outcome %in% names(df)) { x <- dplyr::select(df, -dplyr::all_of(outcome)) } if (mdist_type == "preset") { out <- manydist::mdist(x = x, preset = mdist_preset) } else if (mdist_type == "custom") { if (is.null(param_set)) stop("param_set is NULL for mdist_type = 'custom'") args <- param_set args$x <- x out <- do.call(manydist::mdist, args = args) } else { stop("Unknown mdist_type: ", mdist_type) } if (identical(mdist_type, "preset") && identical(mdist_preset, "gower")) { out$distance*ncol(x) }else{ out$distance } } # little helper for the leave one variable out version of mdist run_lovo <- function(df, mdist_type, mdist_preset, param_set, outcome = NULL, dims = 2, keep_dist = FALSE, legacy_gower_sum = FALSE) { # drop outcome if present x <- df if (is.character(outcome) && length(outcome) == 1 && outcome %in% names(df)) { x <- dplyr::select(df, -dplyr::all_of(outcome)) } # ---- special case: legacy Gower SUM scaling for LOVO ---- if (legacy_gower_sum && identical(mdist_type, "preset") && identical(mdist_preset, "gower")) { vars <- names(x) p_full <- length(vars) # full D_full <- manydist::mdist(x = x, preset = "gower")$distance |> as.matrix() D_full <- D_full * p_full base_mds <- cmdscale(D_full, eig = TRUE, k = dims)$points[, 1:dims, drop = FALSE] loo_list <- vector("list", length(vars)); names(loo_list) <- vars for (i in seq_along(vars)) { var <- vars[i] x_sub <- dplyr::select(x, -dplyr::any_of(var)) p_loo <- ncol(x_sub) D_loo <- manydist::mdist(x = x_sub, preset = "gower")$distance |> as.matrix() D_loo <- D_loo * p_loo loo_list[[i]] <- D_loo } mad <- vapply(loo_list, function(m) mean(abs(D_full - m)), numeric(1)) cc <- vapply(loo_list, function(m) { pts <- cmdscale(m, eig = TRUE, k = dims)$points[, 1:dims, drop = FALSE] congruence_coeff(base_mds, pts) }, numeric(1)) ac <- sqrt(1 - cc^2) res <- tibble::tibble( variable = vars, mad_importance = mad, cc_importance = cc, ac_importance = ac, mad_normalized = mad / sum(mad) ) out <- list(results = res, base_mds = base_mds) if (keep_dist) { out$full_dist <- D_full out$loo_dist <- loo_list } return(out) } # ---- default path: your package's LOVO ---- if (mdist_type == "preset") { obj <- manydist::lovo_mdist(x = x, preset = mdist_preset, dims = dims, keep_dist = keep_dist) } else if (mdist_type == "custom") { if (is.null(param_set)) stop("param_set is NULL for mdist_type = 'custom'") args <- param_set args$x <- x args$dims <- dims args$keep_dist <- keep_dist obj <- do.call(manydist::lovo_mdist, args = args) } else { stop("Unknown mdist_type: ", mdist_type) } obj } ``` ```{r, eval=FALSE} simulation_structure <- generated_datasets |> dplyr::mutate( loo_results = purrr::pmap( dplyr::pick(dataset, method, mdist_type, mdist_preset, param_set), \(dataset, method, mdist_type, mdist_preset, param_set) { pb$tick() X <- if (method == "baseline") dataset$Xorig else dataset$X run_lovo(X, mdist_type, mdist_preset, param_set,legacy_gower_sum = (method == "gower")) } ) ) save(file="simulation_structure.RData", simulation_structure) ``` ```{r, echo=FALSE, eval=TRUE} load("simulation_structure.RData") ``` ```{r} reference_method = "baseline" baseline_refs = simulation_structure |> filter(method == "baseline") |> select(replicate, baseline_loo = loo_results) rel_simulation_structure <- simulation_structure |> dplyr::left_join(baseline_refs, by = "replicate") |> dplyr::mutate( loo_results = purrr::map2(.x = loo_results,.y = baseline_loo, .f = \(res_obj, ref_obj) { res <- res_obj$results ref <- ref_obj$results # join by variable to be safe (order might differ) dplyr::left_join( res, dplyr::select(ref, variable, ref_mad_normalized = mad_normalized, ref_ac_importance = ac_importance), by = "variable" ) |> dplyr::mutate( mad_relative_importance = mad_normalized / ref_mad_normalized, ac_relative_importance = ac_importance / ref_ac_importance ) } ) ) |> select(-baseline_loo) results_long <- rel_simulation_structure |> tidyr::unnest(loo_results) |> # enforce one value per replicate (if already unique, no change) dplyr::group_by(replicate, variable, method) |> dplyr::summarise( dplyr::across( c(mad_importance, cc_importance, ac_importance, mad_normalized, mad_relative_importance, ac_relative_importance), ~ mean(.x, na.rm = TRUE) ), .groups = "drop" ) results_summary <- results_long |> dplyr::group_by(variable, method) |> dplyr::summarise( dplyr::across( c(mad_importance, cc_importance, ac_importance, mad_normalized, mad_relative_importance, ac_relative_importance), list( mean = \(x) mean(x, na.rm = TRUE), sd = \(x) sd(x, na.rm = TRUE) ), .names = "{.col}_{.fn}" ), .groups = "drop" ) results_wide <- results_summary |> tidyr::pivot_wider( names_from = method, values_from = -c(variable, method) ) measure_names <- c( "mad_importance", "cc_importance", "ac_importance", "mad_normalized", "mad_relative_importance", "ac_relative_importance" ) split_by_measure_mean <- purrr::map( measure_names, \(measure) { results_wide |> dplyr::select(variable, dplyr::starts_with(paste0(measure, "_mean_"))) |> dplyr::rename_with( ~ stringr::str_remove(., paste0(measure, "_mean_")), -variable ) } ) |> rlang::set_names(measure_names) split_by_measure_sd <- purrr::map( measure_names, \(measure) { results_wide |> dplyr::select(variable, dplyr::starts_with(paste0(measure, "_sd_"))) |> dplyr::rename_with( ~ stringr::str_remove(., paste0(measure, "_sd_")), -variable ) } ) |> rlang::set_names(measure_names) ``` ### Code for `Figure 1` ```{r,fig.height=5,fig.width=8} meas_to_plot_mad_mean <- split_by_measure_mean |> pluck("mad_importance") |> mutate(measure="mad_importance") |> slice(3:6,1,2) |> mutate(var_id=1:6) meas_to_plot_mad_sd <- split_by_measure_sd |> pluck("mad_importance") |> mutate(measure="mad_importance") |> slice(3:6,1,2) |> mutate(var_id=1:6) mad_max <- max(meas_to_plot_mad_mean |> select(where(is.numeric), -var_id), na.rm = TRUE) rect_data <- tibble( xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max, xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max ) meas_to_plot_mad_rel_mean <- split_by_measure_mean |> pluck("mad_normalized") |> mutate(measure="mad_normalized") |> slice(3:6,1,2) |> mutate(var_id=1:6) meas_to_plot_mad_rel_sd <- split_by_measure_sd |> pluck("mad_normalized") |> mutate(measure="mad_normalized") |> slice(3:6,1,2) |> mutate(var_id=1:6) mad_max_rel <- max(meas_to_plot_mad_rel_mean |> select(where(is.numeric), -var_id), na.rm = TRUE) rect_data_rel <- tibble( xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max_rel, xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max_rel ) loo_mad_plot <- meas_to_plot_mad_mean |> dplyr::select(variable, var_id, any_of(method_levels)) |> pivot_longer(cols = naive:u_mix, names_to="method", values_to="value") |> mutate( method = factor(method, levels = method_levels), additive = ifelse(method %in% c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix"), "additive distances", "non-additive distances") ) |> dplyr::left_join( meas_to_plot_mad_sd |> dplyr::select(variable, var_id, any_of(method_levels)) |> pivot_longer(cols = naive:u_mix, names_to="method", values_to="sd") |> mutate(method = factor(method, levels = method_levels)), by = c("variable","var_id","method") ) |> ggplot(aes(x = var_id, y = value, color = method, shape = method)) + geom_rect(data=rect_data, aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont), alpha=.2, fill="indianred", color="lightgrey", inherit.aes=FALSE) + geom_rect(data=rect_data, aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat), alpha=.2, fill="dodgerblue", color="lightgrey", inherit.aes=FALSE) + geom_errorbar(aes(ymin = value - sd, ymax = value + sd), width = 0.12, linewidth = 0.35) + geom_point() + geom_line() + ylab("absolute variable contribution to distance") + xlab("") + theme_bw() + theme(legend.position = "none") + scale_x_continuous(breaks=seq(1,6,1), labels = rep("",6)) + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + scale_shape_manual(values = my_shapes) + scale_color_manual(values = my_colors) + facet_wrap(.~additive) add_methods <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix") nonadd_methods <- c("naive","hl","gudmm","dkss") # add_methods <- c("gower", "mod_gower", "hl_add", "u_ind", "u_dep", "u_mix") legend_breaks <- c( c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix"), # additive first c("naive","hl","gudmm","dkss") # non-additive after ) legend_labels <- setNames( ifelse(legend_breaks %in% c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix"), paste0(legend_breaks, " (additive)"), paste0(legend_breaks, " (non-additive)")), legend_breaks ) library(ggnewscale) loo_mad_rel_df <- meas_to_plot_mad_rel_mean |> dplyr::select(variable, var_id, any_of(method_levels)) |> tidyr::pivot_longer(cols = naive:u_mix, names_to="method", values_to="value") |> mutate( method = factor(method, levels = c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix")), additive = ifelse(as.character(method) %in% add_methods, "additive", "non-additive") ) |> dplyr::left_join( meas_to_plot_mad_rel_sd |> dplyr::select(variable, var_id, any_of(method_levels)) |> tidyr::pivot_longer(cols = naive:u_mix, names_to="method", values_to="sd") |> mutate(method = factor(method, levels = c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix"))), by = c("variable","var_id","method") ) # loo_mad_rel_plot <- # ggplot() + # geom_rect(data=rect_data_rel, # aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont), # alpha=.2, fill="indianred", color="lightgrey", inherit.aes=FALSE) + # geom_rect(data=rect_data_rel, # aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat), # alpha=.2, fill="dodgerblue", color="lightgrey", inherit.aes=FALSE) + # # # --- Additive layer + legend 1 --- # # geom_errorbar( # # data = dplyr::filter(loo_mad_rel_df, additive == "additive"), # # aes(x = var_id, ymin = value - sd, ymax = value + sd, color = method), # # width = 0.12, linewidth = 0.35 # # ) + # geom_line( # data = dplyr::filter(loo_mad_rel_df, additive == "additive"), # aes(x = var_id, y = value, color = method) # ) + # geom_point( # data = dplyr::filter(loo_mad_rel_df, additive == "additive"), # aes(x = var_id, y = value, color = method, shape = method) # ) + # scale_color_manual( # name = "additive", # values = my_colors, # breaks = add_methods # ) + # scale_shape_manual( # name = "additive", # values = my_shapes, # breaks = add_methods # ) + # guides( # color = guide_legend( # order = 1, # override.aes = list(shape = my_shapes[add_methods]) # ), # shape = "none" # ) + # # ggnewscale::new_scale_color() + # ggnewscale::new_scale("shape") + # # # --- Non-additive layer + legend 2 --- # # geom_errorbar( # # data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"), # # aes(x = var_id, ymin = value - sd, ymax = value + sd, color = method), # # width = 0.12, linewidth = 0.35 # # ) + # geom_line( # data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"), # aes(x = var_id, y = value, color = method) # ) + # geom_point( # data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"), # aes(x = var_id, y = value, color = method, shape = method) # ) + # scale_color_manual( # name = "non-additive", # values = my_colors, # breaks = nonadd_methods # ) + # scale_shape_manual( # name = "non-additive", # values = my_shapes, # breaks = nonadd_methods # ) + # guides( # color = guide_legend( # order = 2, # override.aes = list(shape = my_shapes[nonadd_methods]) # ), # shape = "none" # ) + # # # cosmetics (same as yours) # ylab("relative variable contribution to distance") + # xlab("left-out variable") + theme_bw() + # scale_x_continuous( # breaks=seq(1,6,1), # labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2") # ) + # theme( # axis.text.x = element_text(angle = 60, hjust = 1), # strip.text = element_blank(), # legend.position = "bottom", # legend.title.position = "top", # legend.text = element_text(size = 13) # )+facet_wrap(.~additive) add_methods <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix") nonadd_methods <- c("naive","hl","gudmm","dkss") loo_mad_rel_plot <- ggplot() + geom_rect( data = rect_data_rel, aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont), alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE ) + geom_rect( data = rect_data_rel, aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat), alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE ) + # --- Additive layer --- geom_line( data = dplyr::filter(loo_mad_rel_df, additive == "additive"), aes(x = var_id, y = value, color = method, linetype = method), linewidth = 0.5 ) + geom_point( data = dplyr::filter(loo_mad_rel_df, additive == "additive"), aes(x = var_id, y = value, color = method, shape = method), size = 2 ) + scale_color_manual( name = "additive", values = my_colors, breaks = add_methods ) + scale_shape_manual( name = "additive", values = my_shapes, breaks = add_methods ) + scale_linetype_manual( name = "additive", values = setNames(rep("solid", length(add_methods)), add_methods), breaks = add_methods ) + guides( color = guide_legend(order = 1), shape = guide_legend(order = 1), linetype = guide_legend(order = 1) ) + ggnewscale::new_scale_color() + ggnewscale::new_scale("shape") + ggnewscale::new_scale("linetype") + # --- Non-additive layer --- geom_line( data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"), aes(x = var_id, y = value, color = method, linetype = method), linewidth = 0.5 ) + geom_point( data = dplyr::filter(loo_mad_rel_df, additive == "non-additive"), aes(x = var_id, y = value, color = method, shape = method), size = 2 ) + scale_color_manual( name = "non-additive", values = my_colors, breaks = nonadd_methods ) + scale_shape_manual( name = "non-additive", values = my_shapes, breaks = nonadd_methods ) + scale_linetype_manual( name = "non-additive", values = setNames(rep("solid", length(nonadd_methods)), nonadd_methods), breaks = nonadd_methods ) + guides( color = guide_legend(order = 2), shape = guide_legend(order = 2), linetype = guide_legend(order = 2) ) + ylab("relative variable contribution to distance") + xlab("left-out variable") + theme_bw() + scale_x_continuous( breaks = seq(1, 6, 1), labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2") ) + theme( axis.text.x = element_text(angle = 60, hjust = 1), strip.text = element_blank(), legend.position = "bottom", legend.title.position = "top", legend.text = element_text(size = 13) ) + facet_wrap(. ~ additive) fig_1_plot = loo_mad_plot / loo_mad_rel_plot fig_1_plot ``` ```{r, eval = TRUE, echo = FALSE} ggsave(fig_1_plot, file = "figure_1_mad_importances.pdf", width = 8, height = 8) ``` ### Code for `Figure 2` ```{r,fig.height=4,fig.width=8} library(ggnewscale) add_methods <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix") nonadd_methods <- c("naive","hl","gudmm","dkss") meas_to_plot_ac_mean <- split_by_measure_mean |> pluck("ac_relative_importance") |> mutate(measure = "ac_relative_importance") |> slice(3:6, 1, 2) |> mutate(var_id = 1:6) meas_to_plot_ac_sd <- split_by_measure_sd |> pluck("ac_relative_importance") |> mutate(measure = "ac_relative_importance") |> slice(3:6, 1, 2) |> mutate(var_id = 1:6) ac_max <- max(meas_to_plot_ac_mean |> select(where(is.numeric), -var_id), na.rm = TRUE) rect_data_ac <- tibble( xmin_cont = 4.5, xmax_cont = 6, ymin_cont = 0, ymax_cont = ac_max, xmin_cat = .5, xmax_cat = 4.5, ymin_cat = 0, ymax_cat = ac_max ) ac_df <- meas_to_plot_ac_mean |> dplyr::select(variable, var_id, any_of(method_levels)) |> tidyr::pivot_longer(cols = naive:u_mix, names_to = "method", values_to = "value") |> mutate( method = factor( method, levels = c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix") ), additive = ifelse(as.character(method) %in% add_methods, "additive", "non-additive") ) |> dplyr::left_join( meas_to_plot_ac_sd |> dplyr::select(variable, var_id, any_of(method_levels)) |> tidyr::pivot_longer(cols = naive:u_mix, names_to = "method", values_to = "sd") |> mutate( method = factor( method, levels = c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix") ) ), by = c("variable", "var_id", "method") ) ac_plot <- ggplot() + geom_rect( data = rect_data_ac, aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont), alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE ) + geom_rect( data = rect_data_ac, aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat), alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE ) + # --- Additive block --- geom_line( data = dplyr::filter(ac_df, additive == "additive"), aes(x = var_id, y = value, color = method, linetype = method), linewidth = 0.5 ) + geom_point( data = dplyr::filter(ac_df, additive == "additive"), aes(x = var_id, y = value, color = method, shape = method), size = 2 ) + scale_color_manual( name = "additive", values = my_colors, breaks = add_methods ) + scale_shape_manual( name = "additive", values = my_shapes, breaks = add_methods ) + scale_linetype_manual( name = "additive", values = setNames(rep("solid", length(add_methods)), add_methods), breaks = add_methods ) + guides( color = guide_legend(order = 1), shape = guide_legend(order = 1), linetype = guide_legend(order = 1) ) + ggnewscale::new_scale_color() + ggnewscale::new_scale("shape") + ggnewscale::new_scale("linetype") + # --- Non-additive block --- geom_line( data = dplyr::filter(ac_df, additive == "non-additive"), aes(x = var_id, y = value, color = method, linetype = method), linewidth = 0.5 ) + geom_point( data = dplyr::filter(ac_df, additive == "non-additive"), aes(x = var_id, y = value, color = method, shape = method), size = 2 ) + scale_color_manual( name = "non-additive", values = my_colors, breaks = nonadd_methods ) + scale_shape_manual( name = "non-additive", values = my_shapes, breaks = nonadd_methods ) + scale_linetype_manual( name = "non-additive", values = setNames(rep("solid", length(nonadd_methods)), nonadd_methods), breaks = nonadd_methods ) + guides( color = guide_legend(order = 2), shape = guide_legend(order = 2), linetype = guide_legend(order = 2) ) + ylab("relative alienation coefficient") + xlab("left-out variable") + scale_x_continuous( breaks = seq(1, 6, 1), labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2") ) + theme_bw() + theme( axis.text.x = element_text(angle = 60, hjust = 1), legend.position = "bottom", legend.title.position = "top", legend.text = element_text(size = 13) ) + facet_wrap(. ~ additive) ac_plot ``` ```{r, echo=FALSE,eval=TRUE} ggsave("figure_2_ac_importances.pdf", ac_plot, width = 8, height = 4) ``` ## Retrieval of the true configuration ### Data generation ```{r, eval=FALSE} recovery_structure <- tidyr::crossing( tibble::tibble(replicate = 1:reps), method = methods, categories = qoptions ) |> dplyr::left_join(manydist:::mdist_method_lookup, by = "method") |> dplyr::mutate( dataset = purrr::map2( .x = replicate, .y = categories, ~ generate_dataset( n = n, porig = porig, pn = pn, pnnoise = pnnoise, pcnoise = pcnoise, sigma = sigma, qoptions = .y, mode = "shared", seed = 1234 + .x ) ), # compute distance matrix using method specs + baseline uses Xorig distance_matrix = purrr::pmap( dplyr::pick(dataset, method, mdist_type, mdist_preset, param_set), \(dataset, method, mdist_type, mdist_preset, param_set) { X <- if (method == "baseline") dataset$Xorig else dataset$X run_mdist_within( df = X, mdist_type = mdist_type, mdist_preset = mdist_preset, param_set = param_set, outcome = NULL # because X / Xorig have no response column ) |> base::as.matrix() } ), mds_results = purrr::map( distance_matrix, ~ cmdscale(.x, eig = TRUE, k = 2)$points |> as.data.frame() |> setNames(c("x1", "x2")) ), congruence_coeff = purrr::map2( .x = mds_results, .y = dataset, \(mds, ds) congruence_coeff(ds$truth, mds) ) ) # save(file="recovery_structure.RData", recovery_structure) recovery_structure_lite = recovery_structure |> select(-dataset, -distance_matrix, -mds_results) |> unnest(congruence_coeff) |> select(replicate, method, categories, congruence_coeff) |> mutate(categories=factor(paste0(categories," categories")), alienation_coeff = sqrt(1-congruence_coeff^2)) save(file="recovery_structure_lite.RData", recovery_structure_lite) ``` ### Code for figure `Figure 3` ```{r,echo=FALSE} load("recovery_structure_lite.RData") ``` ```{r,fig.height=8,fig.width=8} library(dplyr) library(ggplot2) library(ggnewscale) # n_add <- length(add_methods) add_methods <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix") nonadd_methods <- c("naive","hl","gudmm","dkss") method_order <- c(add_methods, nonadd_methods, "baseline") n_add <- length(add_methods) n_nonadd <- length(nonadd_methods) n_total <- length(method_order) rect_bg <- tibble::tibble( xmin = c(0.5, n_add + 0.5), xmax = c(n_add + 0.5, n_add + n_nonadd + 0.5), ymin = c(-Inf, -Inf), ymax = c( Inf, Inf), grp = c("additive", "non-additive") ) # --- method groups --- # x-axis order: additive first, then non-additive, baseline last # --- prepare data --- box_df <- recovery_structure_lite |> mutate( method = ifelse(is.na(method), "baseline", method), method = factor(method, levels = method_order), additive = ifelse(as.character(method) %in% add_methods, "additive", "non-additive") ) # --- plot --- boxplot_figure_simu <- ggplot(box_df, aes(x = method, y = alienation_coeff)) + geom_rect( data = dplyr::filter(rect_bg, grp == "additive"), aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), inherit.aes = FALSE, alpha = 0.20, fill = "dodgerblue", color = "lightgrey" ) + geom_rect( data = dplyr::filter(rect_bg, grp == "non-additive"), aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), inherit.aes = FALSE, alpha = 0.20, fill = "indianred", color = "lightgrey" ) + facet_wrap(~ categories) + theme_bw() + theme( axis.text.x = element_text(angle = 60, hjust = 1), legend.position = "bottom", legend.title.position = "top", legend.text = element_text(size = 13) ) + ylab("alienation coefficient") + xlab("methods") + # --- additive boxes + legend 1 (left) --- geom_boxplot( data = dplyr::filter(box_df, additive == "additive"), aes(fill = method), outlier.alpha = 0.25, linewidth = 0.35 ) + scale_fill_manual( name = "additive", values = my_colors, breaks = add_methods ) + guides(fill = guide_legend(order = 1)) + ggnewscale::new_scale_fill() + # --- non-additive boxes (incl baseline) + legend 2 (right) --- geom_boxplot( data = dplyr::filter(box_df, additive == "non-additive"), aes(fill = method), outlier.alpha = 0.25, linewidth = 0.35 ) + scale_fill_manual( name = "non-additive", values = my_colors, breaks = c(nonadd_methods, "baseline") # baseline included here; appears last in x-axis ) + guides(fill = guide_legend(order = 2)) boxplot_figure_simu ``` ```{r,echo=FALSE, eval=TRUE} ggsave(file="AC_boxplots.pdf",boxplot_figure_simu,width=8.5,height=6) ``` ## Clustering experiment: partitioning around medoids ### numsep =.1 and catsep=0.35 generation ```{r, echo=TRUE,eval=FALSE} nrep <- 100 k_true <- 4 q <- 9 numsep <- 0.1 catsep <- 0.35 clustSizeEq <- 50 param_generator_grid <- tibble::tibble( numsignal = c(0,0,4,4,4,8,8,8), numnoise = c(8,8,4,4,4,0,0,0), catsignal = c(4,8,0,4,8,0,4,8), catnoise = c(4,0,8,4,0,8,4,0) ) |> tidyr::crossing(replicate = 1:nrep) mixed_datasets_grid <- param_generator_grid |> dplyr::mutate( data = purrr::pmap( dplyr::pick(numsignal, catsignal, numnoise, catnoise, replicate), \(numsignal, catsignal, numnoise, catnoise, replicate) { gen_mixed( k_true = k_true, clustSizeEq = clustSizeEq, numsignal = numsignal, numnoise = numnoise, catsignal = catsignal, catnoise = catnoise, q = q, q_err = q, numsep = numsep, catsep = catsep, seed = replicate ) } ) ) methods <- c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix") sim_methods_grid <- mixed_datasets_grid |> tidyr::crossing(method = methods) |> dplyr::left_join(manydist:::mdist_method_lookup, by = "method") stopifnot(!any(is.na(sim_methods_grid$mdist_type))) ari_pam_results_01_035 <- sim_methods_grid |> dplyr::mutate( ari = purrr::pmap_dbl( dplyr::pick(data, mdist_type, mdist_preset, param_set), \(data, mdist_type, mdist_preset, param_set) { df <- data$df D <- run_mdist_within(df, mdist_type, mdist_preset, param_set, outcome = "y") pam_fit <- cluster::pam(D, k = k_true, diss = TRUE) mclust::adjustedRandIndex(df$y, pam_fit$clustering) } ) ) ``` ```{r, echo=FALSE,eval=FALSE} save(ari_pam_results_01_035, file = "ari_pam_experiment_numsep_01_catsep_035.RData") ``` ```{r, echo=FALSE,eval=TRUE} load(file = "ari_pam_experiment_numsep_01_catsep_035.RData") ``` ```{r} # Method groups (as you provided) add_methods <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix") nonadd_methods <- c("naive","hl","gudmm","dkss") method_levels <- c(add_methods, nonadd_methods) # Facet order (row-wise) scenario_order <- c("0844","0880","4408","8008","4444","4480","8044","8080") ari_plot_df_01_035 <- ari_pam_results_01_035 |> mutate( # Used only to match facet order; preserves leading zeros scenario_id = sprintf("%01d%01d%01d%01d", numsignal, numnoise, catsignal, catnoise), scenario = str_glue( "num: {numsignal} s. + {numnoise} n. | cat: {catsignal} s. + {catnoise} n." ), # Row-wise facet order as provided scenario = factor(scenario, levels = scenario[match(scenario_order, scenario_id)]), # Method order method = factor(method, levels = method_levels), # Additive / Non-additive label method_type = if_else(as.character(method) %in% add_methods, "Additive", "Non-additive") ) # Background bands (no fill scale consumed) k_add <- length(add_methods) k_tot <- length(method_levels) bg_add <- tibble( xmin = 0.5, xmax = k_add + 0.5, ymin = -Inf, ymax = Inf ) bg_nonadd <- tibble( xmin = k_add + 0.5, xmax = k_tot + 0.5, ymin = -Inf, ymax = Inf ) ari_plot_01_035 <- ggplot() + # Shaded regions geom_rect( data = bg_add, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), inherit.aes = FALSE, fill = "indianred", alpha = 0.12, show.legend = FALSE ) + geom_rect( data = bg_nonadd, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), inherit.aes = FALSE, fill = "dodgerblue", alpha = 0.12, show.legend = FALSE ) + # Additive boxplots + legend block geom_boxplot( data = ari_plot_df_01_035 |> filter(method_type == "Additive"), aes(x = method, y = ari, fill = method), outlier.alpha = 0.25, linewidth = 0.35 ) + # Additive legend (comes first) scale_fill_manual( name = "Additive", values = my_colors[add_methods], breaks = add_methods, drop = FALSE, guide = guide_legend(order = 1) ) + ggnewscale::new_scale_fill() + # Non-additive boxplots + legend block geom_boxplot( data = ari_plot_df_01_035 |> filter(method_type == "Non-additive"), aes(x = method, y = ari, fill = method), outlier.alpha = 0.25, linewidth = 0.35 ) + # Non-additive legend (comes second) scale_fill_manual( name = "Non-additive", values = my_colors[nonadd_methods], breaks = nonadd_methods, drop = FALSE, guide = guide_legend(order = 2) ) + facet_wrap(~ scenario, ncol = 2) + coord_cartesian(ylim = c(0, 1)) + theme_bw() + theme( axis.text.x = element_text(angle = 60, hjust = 1), strip.text = element_text(size = 9), legend.text = element_text(size = 13) ) + labs(x = "", y = "ARI (PAM on dissimilarities)") print(ari_plot_01_035) ``` ```{r,echo=FALSE,eval=TRUE} ggsave(file = "ari_plot_01_035.pdf", ari_plot_01_035, width = 6, height = 8) ``` ```{r,echo=FALSE} load(file = "ari_pam_experiment_numsep_01_catsep_035.RData") ``` ```{r} # ---- method groups ---- add_methods <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix") nonadd_methods <- c("naive","hl","gudmm","dkss") # ---- scenario order (shared with boxplot figure) ---- scenario_order <- c("0844","0880","4408","8008","4444","4480","8044","8080") ari_parallel_df <- ari_pam_results_01_035 |> mutate( # key used for ordering scenario_id = sprintf( "%01d%01d%01d%01d", numsignal, numnoise, catsignal, catnoise ), # label shown on the x-axis scenario = stringr::str_glue( "n:({numsignal},{numnoise}); c:({catsignal},{catnoise})" ) ) |> group_by(scenario_id, scenario, method) |> summarise( mean_ari = mean(ari, na.rm = TRUE), sd_ari = sd(ari, na.rm = TRUE), .groups = "drop" ) |> mutate( # enforce scenario order explicitly scenario = factor( scenario, levels = scenario[match(scenario_order, scenario_id)] ), type = ifelse(method %in% add_methods, "additive", "non-additive"), method = factor(method, levels = c(add_methods, nonadd_methods)) ) ``` ### Code for `Figure 4` ```{r} ari_parallel_plot <- ggplot() + # ================= ADDITIVE ================= geom_errorbar( data = dplyr::filter(ari_parallel_df, type == "additive"), aes( x = scenario, ymin = mean_ari - sd_ari, ymax = mean_ari + sd_ari, color = method ), width = 0.12, linewidth = 0.35 ) + geom_line( data = dplyr::filter(ari_parallel_df, type == "additive"), aes( x = scenario, y = mean_ari, color = method, shape = method, linetype = method, group = method ), linewidth = 0.6, alpha = .75 ) + geom_point( data = dplyr::filter(ari_parallel_df, type == "additive"), aes(x = scenario, y = mean_ari, color = method, shape = method), size = 2 ) + scale_color_manual( name = "additive", values = my_colors, breaks = add_methods ) + scale_shape_manual( name = "additive", values = my_shapes, breaks = add_methods ) + scale_linetype_manual( name = "additive", values = setNames(rep("solid", length(add_methods)), add_methods), breaks = add_methods ) + guides( color = guide_legend(order = 1), shape = guide_legend(order = 1), linetype = guide_legend(order = 1) ) + ggnewscale::new_scale_color() + ggnewscale::new_scale("shape") + ggnewscale::new_scale("linetype") + # ================= NON-ADDITIVE ================= geom_errorbar( data = dplyr::filter(ari_parallel_df, type == "non-additive"), aes( x = scenario, ymin = mean_ari - sd_ari, ymax = mean_ari + sd_ari, color = method ), width = 0.12, linewidth = 0.35 ) + geom_line( data = dplyr::filter(ari_parallel_df, type == "non-additive"), aes( x = scenario, y = mean_ari, color = method, shape = method, linetype = method, group = method ), linewidth = 0.6, alpha = .75 ) + geom_point( data = dplyr::filter(ari_parallel_df, type == "non-additive"), aes(x = scenario, y = mean_ari, color = method, shape = method), size = 2 ) + scale_color_manual( name = "non-additive", values = my_colors, breaks = nonadd_methods ) + scale_shape_manual( name = "non-additive", values = my_shapes, breaks = nonadd_methods ) + scale_linetype_manual( name = "non-additive", values = setNames(rep("solid", length(nonadd_methods)), nonadd_methods), breaks = nonadd_methods ) + guides( color = guide_legend(order = 2), shape = guide_legend(order = 2), linetype = guide_legend(order = 2) ) + labs(x = "scenario", y = "mean ARI (± 1 SD)") + theme_bw() + theme( axis.text.x = element_text(angle = 30, hjust = 1), legend.position = "bottom", legend.title.position = "top", legend.text = element_text(size = 13) ) ari_parallel_plot ggsave( filename = "figure_parallel_ari.pdf", plot = ari_parallel_plot, width = 18, height = 11, units = "cm" ) ```  ```{r} load(file = "ari_pam_experiment_numsep_01_catsep_035.RData") add_methods <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix") nonadd_methods <- c("naive","hl","gudmm","dkss") scenario_order <- c("0844","0880","4408","8008","4444","4480","8044","8080") ari_rank_df <- ari_pam_results_01_035 |> mutate( scenario_id = sprintf("%01d%01d%01d%01d", numsignal, numnoise, catsignal, catnoise), scenario = stringr::str_glue("n:({numsignal},{numnoise}); c:({catsignal},{catnoise})") ) |> group_by(scenario_id, scenario, method) |> summarise( mean_ari = mean(ari, na.rm = TRUE), .groups = "drop" ) |> group_by(scenario_id, scenario) |> mutate( rank_desc = dplyr::dense_rank(dplyr::desc(mean_ari)) # 1 = best ) |> ungroup() |> mutate( # enforce scenario order (same as boxplot panels) scenario = factor(scenario, levels = scenario[match(scenario_order, scenario_id)]), type = ifelse(method %in% add_methods, "additive", "non-additive"), method = factor(method, levels = c(add_methods, nonadd_methods)) ) ```                                                                       ## Illustration: FIFA player data ### Data preparation and analysis ```{r, eval=FALSE} # Variables are organized: Categorical first, then numerical data("fifa_nl", package = "manydist") #Xorig<-X<-ddata[,-c(1,2)] X<-fifa_nl[,-c(1,2,19:24)] # vars 1 and 2 are a bit useless (too many categories) # vars 19:24 interesting but same scale and little variation # leaving them out makes the plots a bit smaller. # leaving them in doesn't seem to change a lot n<-nrow(X) nd<-2 # Dimensionality of MDS solution X=X[,c(8:1,9:16)] # Order X=X[,-3] # Remove int rep: 380 have something, 21+7 in other cats X=X[,c(1:12,14,13,15)] p<-ncol(X) X[8:15] <- lapply(X[8:15], as.numeric) fifa_clean <- as_tibble(X) |> select(where(is.integer), where(is.numeric),where(is.factor) ,-release_clause_eur) |> rename(pref_foot=`preferred_foot`, club=`club_name`, position=`team_position`) ``` ```{r, eval=FALSE} fifa_structure <- tibble::tibble( method = c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix") ) |> dplyr::left_join(manydist:::mdist_method_lookup, by = "method") |> dplyr::mutate( loo_results = purrr::pmap( dplyr::pick(method, mdist_type, mdist_preset, param_set), \(method, mdist_type, mdist_preset, param_set) { run_lovo( df = fifa_clean, mdist_type = mdist_type, mdist_preset = mdist_preset, param_set = param_set, outcome = NULL, dims = nd, keep_dist = FALSE, legacy_gower_sum = (method == "gower") # triggers only for preset+gower anyway ) } ) ) ``` ```{r, eval=FALSE,echo=FALSE} save(file="fifa_structure_R3.RData", fifa_structure) ``` ```{r, echo = FALSE} load("fifa_structure_R3.RData") ``` ```{r} results_fifa_wide <- fifa_structure |> dplyr::mutate(loo_tbl = purrr::map(loo_results, "results")) |> dplyr::select(method, loo_tbl) |> tidyr::unnest(loo_tbl) |> tidyr::pivot_wider( id_cols = variable, names_from = method, values_from = c(mad_importance, cc_importance, ac_importance, mad_normalized) ) measure_names <- c( "mad_importance", "cc_importance", "ac_importance", "mad_normalized" ) fifa_split_by_measure <- purrr::map( measure_names, \(measure) { results_fifa_wide |> dplyr::select(variable, dplyr::starts_with(paste0(measure, "_"))) |> dplyr::rename_with(~ stringr::str_remove(.x, paste0(measure, "_"))) } ) |> rlang::set_names(measure_names) ``` ### Code for `Figure 5` ```{r,fig.height=5,fig.width=8} meas_to_plot_mad = fifa_split_by_measure |> pluck("mad_importance") |> mutate(measure = "mad_importances") |> slice(c(8:14, 1:7)) |> mutate(var_id = 1:n()) mad_max = max(meas_to_plot_mad |> select(where(is.numeric), -var_id)) rect_data = tibble( xmin_cont = 8.5, xmax_cont = 14, ymin_cont = 0, ymax_cont = mad_max, xmin_cat = .5, xmax_cat = 8.5, ymin_cat = 0, ymax_cat = mad_max ) meas_to_plot_mad_rel = fifa_split_by_measure |> pluck("mad_normalized") |> mutate(measure = "mad_normalized") |> slice(c(8:14, 1:7)) |> mutate(var_id = 1:n()) mad_max_rel = max(meas_to_plot_mad_rel |> select(where(is.numeric), -var_id)) rect_data_rel = tibble( xmin_cont = 8.5, xmax_cont = 14, ymin_cont = 0, ymax_cont = mad_max_rel, xmin_cat = .5, xmax_cat = 8.5, ymin_cat = 0, ymax_cat = mad_max_rel ) method_levels <- c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix") additive_set <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix") conflict_prefer("intersect", "base") conflict_prefer("setdiff", "base") add_present <- intersect(additive_set, method_levels) nonadd_present <- setdiff(method_levels, additive_set) # ---------- TOP: absolute contributions ---------- loo_mad_long <- meas_to_plot_mad |> pivot_longer( cols = any_of(method_levels), names_to = "method", values_to = "value" ) |> mutate( method = factor(method, levels = method_levels), additive = ifelse(method %in% additive_set, "additive distances", "non-additive distances"), type = ifelse(method %in% additive_set, "Additive", "Non-additive") ) loo_mad_plot <- ggplot() + geom_rect( data = rect_data, aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont), alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE ) + geom_rect( data = rect_data, aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat), alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE ) + # Additive geom_line( data = filter(loo_mad_long, type == "Additive"), aes(x = var_id, y = value, color = method, linetype = method, group = method), alpha = 0.85 ) + geom_point( data = filter(loo_mad_long, type == "Additive"), aes(x = var_id, y = value, color = method, shape = method), size = 2 ) + scale_color_manual( name = "Additive", values = my_colors, breaks = add_present, drop = FALSE ) + scale_shape_manual( name = "Additive", values = my_shapes, breaks = add_present, drop = FALSE ) + scale_linetype_manual( name = "Additive", values = setNames(rep("solid", length(add_present)), add_present), breaks = add_present, drop = FALSE ) + guides( color = guide_legend(order = 1), shape = guide_legend(order = 1), linetype = guide_legend(order = 1) ) + ggnewscale::new_scale_color() + ggnewscale::new_scale("shape") + ggnewscale::new_scale("linetype") + # Non-additive geom_line( data = filter(loo_mad_long, type == "Non-additive"), aes(x = var_id, y = value, color = method, linetype = method, group = method), alpha = 0.85 ) + geom_point( data = filter(loo_mad_long, type == "Non-additive"), aes(x = var_id, y = value, color = method, shape = method), size = 2 ) + scale_color_manual( name = "Non-additive", values = my_colors, breaks = nonadd_present, drop = FALSE ) + scale_shape_manual( name = "Non-additive", values = my_shapes, breaks = nonadd_present, drop = FALSE ) + scale_linetype_manual( name = "Non-additive", values = setNames(rep("solid", length(nonadd_present)), nonadd_present), breaks = nonadd_present, drop = FALSE ) + guides( color = guide_legend(order = 2), shape = guide_legend(order = 2), linetype = guide_legend(order = 2) ) + ylab("absolute variable contribution to distance") + theme_bw() + scale_x_continuous(breaks = seq(1, 14, 1), labels = rep("", 14)) + theme( axis.text.x = element_text(angle = 60, hjust = 1), legend.position = "none" ) + xlab("") + facet_wrap(. ~ additive) # ---------- BOTTOM: relative contributions ---------- loo_mad_rel_long <- meas_to_plot_mad_rel |> pivot_longer( cols = any_of(method_levels), names_to = "method", values_to = "value" ) |> mutate( method = factor(method, levels = method_levels), additive = ifelse(method %in% additive_set, "additive distances", "non-additive distances"), type = ifelse(method %in% additive_set, "Additive", "Non-additive") ) loo_mad_rel_plot <- ggplot() + geom_rect( data = rect_data_rel, aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont), alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE ) + geom_rect( data = rect_data_rel, aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat), alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE ) + # Additive geom_line( data = filter(loo_mad_rel_long, type == "Additive"), aes(x = var_id, y = value, color = method, linetype = method, group = method), alpha = 0.85 ) + geom_point( data = filter(loo_mad_rel_long, type == "Additive"), aes(x = var_id, y = value, color = method, shape = method), size = 2 ) + scale_color_manual( name = "Additive", values = my_colors, breaks = add_present, drop = FALSE ) + scale_shape_manual( name = "Additive", values = my_shapes, breaks = add_present, drop = FALSE ) + scale_linetype_manual( name = "Additive", values = setNames(rep("solid", length(add_present)), add_present), breaks = add_present, drop = FALSE ) + guides( color = guide_legend(order = 1), shape = guide_legend(order = 1), linetype = guide_legend(order = 1) ) + ggnewscale::new_scale_color() + ggnewscale::new_scale("shape") + ggnewscale::new_scale("linetype") + # Non-additive geom_line( data = filter(loo_mad_rel_long, type == "Non-additive"), aes(x = var_id, y = value, color = method, linetype = method, group = method), alpha = 0.85 ) + geom_point( data = filter(loo_mad_rel_long, type == "Non-additive"), aes(x = var_id, y = value, color = method, shape = method), size = 2 ) + scale_color_manual( name = "Non-additive", values = my_colors, breaks = nonadd_present, drop = FALSE ) + scale_shape_manual( name = "Non-additive", values = my_shapes, breaks = nonadd_present, drop = FALSE ) + scale_linetype_manual( name = "Non-additive", values = setNames(rep("solid", length(nonadd_present)), nonadd_present), breaks = nonadd_present, drop = FALSE ) + guides( color = guide_legend(order = 2), shape = guide_legend(order = 2), linetype = guide_legend(order = 2) ) + ylab("relative variable contribution to distance") + theme_bw() + scale_x_continuous(breaks = seq(1, 14, 1), labels = meas_to_plot_mad_rel$variable) + theme( axis.text.x = element_text(angle = 60, hjust = 1), strip.text = element_blank(), legend.position = "bottom", legend.title.position = "top", legend.text = element_text(size = 13) ) + xlab("") + facet_wrap(. ~ additive) # ---------- COMBINE ---------- fig_4_plot <- loo_mad_plot / loo_mad_rel_plot fig_4_plot ``` ```{r,echo=FALSE,eval=TRUE} ggsave(fig_4_plot, file = "figure_4_mad_importances.pdf", width = 7, height = 8) ``` ### Code for `Figure 6` ```{r,fig.height=5,fig.width=8} method_levels <- c("naive","hl","hl_add","gower","gudmm","dkss","mod_gower","u_ind","u_dep","u_mix") additive_set <- c("gower","mod_gower","hl_add","u_ind","u_dep","u_mix") meas_to_plot_ac <- fifa_split_by_measure |> purrr::pluck("ac_importance") |> dplyr::mutate(measure = "ac_importance") |> dplyr::slice(c(8:14, 1:7)) |> dplyr::mutate(var_id = dplyr::row_number()) max_ac <- meas_to_plot_ac |> dplyr::select(where(is.numeric), -var_id) |> unlist(use.names = FALSE) |> max(na.rm = TRUE) rect_data_ac <- tibble::tibble( xmin_cont = 7.5, xmax_cont = 14, ymin_cont = 0, ymax_cont = max_ac, xmin_cat = 0.5, xmax_cat = 7.5, ymin_cat = 0, ymax_cat = max_ac ) ac_long <- meas_to_plot_ac |> tidyr::pivot_longer( cols = dplyr::any_of(method_levels), names_to = "method", values_to = "value" ) |> dplyr::mutate( method = factor(method, levels = method_levels), additive = ifelse(method %in% additive_set, "additive distances", "non-additive distances"), type = ifelse(method %in% additive_set, "Additive", "Non-additive") ) add_present <- intersect(additive_set, unique(as.character(ac_long$method))) nonadd_present <- intersect(setdiff(method_levels, additive_set), unique(as.character(ac_long$method))) ac_plot <- ggplot2::ggplot() + ggplot2::geom_rect( data = rect_data_ac, ggplot2::aes(xmin = xmin_cont, xmax = xmax_cont, ymin = ymin_cont, ymax = ymax_cont), alpha = .2, fill = "indianred", color = "lightgrey", inherit.aes = FALSE ) + ggplot2::geom_rect( data = rect_data_ac, ggplot2::aes(xmin = xmin_cat, xmax = xmax_cat, ymin = ymin_cat, ymax = ymax_cat), alpha = .2, fill = "dodgerblue", color = "lightgrey", inherit.aes = FALSE ) + # ================= ADDITIVE ================= ggplot2::geom_line( data = dplyr::filter(ac_long, type == "Additive"), ggplot2::aes(x = var_id, y = value, color = method, linetype = method, group = method), alpha = 0.85 ) + ggplot2::geom_point( data = dplyr::filter(ac_long, type == "Additive"), ggplot2::aes(x = var_id, y = value, color = method, shape = method), size = 2 ) + ggplot2::scale_color_manual( name = "Additive", values = my_colors, breaks = add_present, drop = FALSE ) + ggplot2::scale_shape_manual( name = "Additive", values = my_shapes, breaks = add_present, drop = FALSE ) + ggplot2::scale_linetype_manual( name = "Additive", values = stats::setNames(rep("solid", length(add_present)), add_present), breaks = add_present, drop = FALSE ) + ggplot2::guides( color = ggplot2::guide_legend(order = 1), shape = ggplot2::guide_legend(order = 1), linetype = ggplot2::guide_legend(order = 1) ) + ggnewscale::new_scale_color() + ggnewscale::new_scale("shape") + ggnewscale::new_scale("linetype") + # ================= NON-ADDITIVE ================= ggplot2::geom_line( data = dplyr::filter(ac_long, type == "Non-additive"), ggplot2::aes(x = var_id, y = value, color = method, linetype = method, group = method), alpha = 0.85 ) + ggplot2::geom_point( data = dplyr::filter(ac_long, type == "Non-additive"), ggplot2::aes(x = var_id, y = value, color = method, shape = method), size = 2 ) + ggplot2::scale_color_manual( name = "Non-additive", values = my_colors, breaks = nonadd_present, drop = FALSE ) + ggplot2::scale_shape_manual( name = "Non-additive", values = my_shapes, breaks = nonadd_present, drop = FALSE ) + ggplot2::scale_linetype_manual( name = "Non-additive", values = stats::setNames(rep("solid", length(nonadd_present)), nonadd_present), breaks = nonadd_present, drop = FALSE ) + ggplot2::guides( color = ggplot2::guide_legend(order = 2), shape = ggplot2::guide_legend(order = 2), linetype = ggplot2::guide_legend(order = 2) ) + ggplot2::ylab("alienation coefficient") + ggplot2::theme_bw() + ggplot2::scale_x_continuous( breaks = seq(1, 14, 1), labels = meas_to_plot_ac$variable ) + ggplot2::theme( axis.text.x = ggplot2::element_text(angle = 60, hjust = 1), legend.position = "bottom", legend.title.position = "top", legend.text = ggplot2::element_text(size = 13) ) + ggplot2::xlab("") + ggplot2::facet_wrap(. ~ additive) ac_plot ``` ```{r,echo=FALSE,eval=TRUE} ggsave("figure_5_ac_rel_importances.pdf", ac_plot, width = 8, height = 4) ```                                                                                                                                           ## Appendix D Categorical mean distances: skewed distributions ```{r} LeHo<-function(X){ # Returns the LeHo Distance (symmetric Kullback-Leibler) # X is matrix with conditional probabilities #if(any(rowSums(X)!=1)){ # print("Input must be conditional probs (row sums should be 1)") #} else { require(philentropy) n<-nrow(X) D<-matrix(0,n,n) for (i in 1:(n-1)){ for (j in (i+1):n){ D[j,i]<-D[i,j]<-distance(X[c(i,j),],"kullback-leibler",unit="log2")+ distance(X[c(j,i),],"kullback-leibler",unit="log2")} } return(D) } ``` ### Data generation ```{r,messge=FALSE, warning=FALSE} # Let's consider some specific unbalanced scenarios # One probility is d. The others are (1-d)/(q-1) # We consider skewed options: dis<-c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95) #qi=5 qs<-c(2,3,5,10)#,18) # Some options for qi nint<-length(dis) # Number of options for d (nint-1) rsm<-rsma<-rhl<-sdsm<-rse<-rsof<-rsiof<-sdse<-rsa<-sdsa<-sdsof<-sdsiof<-sddsma<-sdssm<- sddse<-sddsa<-sddsm<-sdhl<-sddsmca<-sddsof<-sddsiof<-sddshl<- sdshl<- rsmca<-sdsmca<-esm<-em<-ee<-ea<-ehl<-emca<-eof<-eiof<-matrix(NA,nrow=nint,ncol=length(qs)) #par(mfrow=c(2,2)) for (qis in 1:length(qs)){ #ds<-NULL qi<-qs[qis] for (i in (1:nint)){ # variant 2 Deltam<-matrix(1,qi,qi) # Matching diag(Deltam)<-rep(0,qi) # Matching Delta #d=i/(2*qi) # i=2 corresponds to equal probs if variant 1 is chosen. (Only works for one qi. Not for qs loop) d=dis[i] # variant 2 #ds<-rbind(ds,d) p<-c(d,rep((1-d)/(qi-1),(qi-1))) sum(p) Dp<-diag(p) pp<-p %*% t(p) spi<-(diag(Dp-pp))^-1 spi5<-(diag(Dp-pp))^-.5 DEsk<-(2/(qi^2))*Deltam # Delta Eskin DOF<-log(p)%*%t(log(p)) * Deltam # OF n=160 # For inv OF we need n. Soon gets very large (increases in n) 160 is sufficient #n<-ceiling(1/min(p)) DIOF<-log(n*p)%*%t(log(n*p)) * Deltam # inverse OF. If some np<1 we can get negatives n=160 # For HL we need to find a way to get somewhat consistent values. Let's choose n large marginals=p*n HLemp<-distancefactor(cat=qi,n=n,catsizes=marginals) # DHL<-Deltam*HLemp*2#*sqrt(2) # NOTE 13/09: Corrected: Was sqrt(2). But, should be, see paper, 2x #DE<-sqrt((1/qi)*(spi %*% t(rep(1,qi)) + t(spi %*% t(rep(1,qi))))) #diag(DE)<-rep(0,qi) DM<-sqrt(1/qi)*(spi5 %*% t(rep(1,qi)) + t(spi5 %*% t(rep(1,qi)))) diag(DM)<-rep(0,qi) DA<-(1/qi)*diag(spi5)%*%Deltam%*%diag(spi5) DMCA<-(1/qi) * diag(p^-.5)%*%Deltam%*%diag(p^-.5) #we<-ee[i,(qis)]<- t(p)%*%DE%*%p wsm<-esm[i,(qis)]<- t(p)%*%Deltam%*%p # Simple Matching we<-ee[i,(qis)]<- t(p)%*%DEsk%*%p # Eskin wof<-eof[i,(qis)]<-t(p)%*%DOF%*%p # OF wiof<-eiof[i,(qis)]<-t(p)%*%DIOF%*%p # IOF whl<-ehl[i,(qis)]<-t(p)%*%DHL%*%p # HL wm<-em[i,(qis)]<- t(p)%*%DM%*%p # SD scaling wa<-ea[i,(qis)]<- t(p)%*%DA%*%p # Cat dis scaling wmca<-emca[i,(qis)]<- t(p)%*%DMCA%*%p # MCA scaling # We can also estimate the variance among the distances using Deltas: E(X^2)-E(X)^2 # We can do this dividing by expected values so that we look at what happens # when delta is commensurable. Or not # if not: #we<-wm<-wa<-wmca<-wof<-wiof<-whl<-wsm<-1 sddse[i,(qis)]<- sqrt(t(p)%*%DEsk^2%*%p - ee[i,(qis)]^2)/we # Eskin sddsma[i,(qis)]<- sqrt(t(p)%*%Deltam^2%*%p - esm[i,(qis)]^2)/ wsm # Simple Matching sddshl[i,(qis)]<- sqrt(t(p)%*%DHL^2%*%p - ehl[i,(qis)]^2)/ whl # HL sddsm[i,(qis)]<- sqrt(t(p)%*%DM^2%*%p - em[i,(qis)]^2)/ wm # SD sddsa[i,(qis)]<- sqrt(t(p)%*%DA^2%*%p - ea[i,(qis)]^2)/wa # Cat dissim sddsmca[i,(qis)]<- sqrt(t(p)%*%DMCA^2%*%p - emca[i,(qis)]^2)/wmca # MCA sddsof[i,(qis)]<-sqrt(t(p)%*%DOF^2%*%p - eof[i,(qis)]^2)/wof # OF sddsiof[i,(qis)]<-sqrt(t(p)%*%(DIOF/as.numeric(wiof))^2%*%p - 1 ) #IOF # Let's consider spread among cat dissimilarities. But: This only makes sense # when on similar scales. So, use commensurable deltas: (Or not: see above) sdse[i,qis]<-sd((1/we)*DEsk[upper.tri(DEsk)]) # Eskin sdsm[i,qis]<-sd((1/wm)*DM[upper.tri(DM)]) # SD sdshl[i,qis]<-sd((1/whl)*DHL[upper.tri(DHL)]) # HL sdssm[i,qis]<-sd((1/wsm)*Deltam[upper.tri(Deltam)]) # Simple Matching sdsa[i,qis]<-sd((1/wa)*DA[upper.tri(DA)]) # Cat dissim, sdsmca[i,qis]<-sd((1/wmca)*DMCA[upper.tri(DMCA)]) # MCA sdsof[i,qis]<-sd((1/wof)*DOF[upper.tri(DOF)]) # OF sdsiof[i,qis]<-sd((1/wiof)*DIOF[upper.tri(DIOF)]) # IOF } } cats<-as.character(qs) skew<-as.character(dis)#as.character(1:nint) ``` ### Code for `Figure 7` ```{r,fig.height=7,fig.width=7} sk_distance_to_plot = function(exp_mean_dist,distance_name){ exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |> rename(`2 cats` = V1, `3 cats` = V2, `5 cats` = V3, `10 cats` = V4) |> mutate(method=distance_name) |> pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`), names_to = "Number of categories", values_to = "Mean distance") return(exp_mean_dist) } all_exp_mean_dist=rbind(sk_distance_to_plot(esm,"Simple Matching"), sk_distance_to_plot(ee,"Eskin"), sk_distance_to_plot(eof,"OF"), sk_distance_to_plot(eiof,"IOF") ) paper_plot_skew1=all_exp_mean_dist |> mutate(`Number of categories`=fct_inorder(`Number of categories`), method=fct_inorder(method),) |> ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) + geom_line() + geom_point() + facet_wrap(~method,scales = "free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") + xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom") print(paper_plot_skew1,width=10,height=6) ``` ### Code for `Figure 8` ```{r,fig.height=4,fig.width=8} all_exp_mean_dist2=rbind(sk_distance_to_plot(ehl,"Hennig-Liao"), sk_distance_to_plot(em,"Std. dev."), sk_distance_to_plot(ea,"Cat. dissim.") ) paper_plot_skew2=all_exp_mean_dist2 |> mutate(`Number of categories`=fct_inorder(`Number of categories`), method=fct_inorder(method),) |> ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) + geom_line() + geom_point() + facet_wrap(~method) + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") + xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom") print(paper_plot_skew2) ``` ### Code for `Figure 9` ```{r,fig.height=4,fig.width=8} qs<-c(2,3,5,10)#,18) # Some options for qs nint<-length(dis) # Number of options for d (nint-1) CVchd<-CVtvd<-CVmsd<-CVlhd<-Echi<-Echd<-Etvi<-Etvd<-Emsi<-Elhi<-Elhd<-Emsd<-array(NA,dim=c(length(qs),length(qs),nint,nint)) for (q1 in 1:length(qs)){ # cats row qi<-qs[q1] for (q2 in q1:length(qs)){ # cats cols qj<-qs[q2] for (pi in 1:nint){ # row distributions d1<-dis[pi] p1<-c(d1,rep((1-d1)/(qj-1),(qj-1))) # nint different marginals: Distribution over colums for (pj in 1:nint){ d2<-dis[pj] p2<-c(d2,rep((1-d2)/(qi-1),(qi-1))) # corresponding col marginals: Distribution over rows Pi<-p2 %*% t(p1) # joint table independent Ri<-Pi/rowSums(Pi) # conditional table independent # Ci<-Ri/sqrt(p2) if (qj>qi) { Pd<-diag(p1[1:qi]) Addzeros<-matrix(0,(qi-1),(qj-qi)) Addps<-p1[qi+1:(qj-qi)] Add<-rbind(Addzeros,Addps) Pd<-cbind(Pd,Add) } else { #qi=qj pd<-p2 Pd<-diag(pd) # joint table dependent case (diagonal smallest q) } Rd<-Pd/rowSums(Pd) Cd<-Rd/sqrt(p2) # Chi-square table: cond. prob/sqrt(p) # category dissimilarities: Dlhd<-LeHo(Rd) # Le Ho # Dmsd<-MousaviSehhati(Pd) # Moussavi Dtvd<-dist(Rd,"manhattan")/2 # TVD Dchd<-dist(Cd,"euclidean")^2 # Chi2 dist # Dchd<-Dchd/(2*(qi-1)) # Chi2 dist corrected for number of categories... Elhd[q1,q2,pi,pj]<-t(pd) %*%Dlhd %*%pd # Emsd[q1,q2,pi,pj]<-t(pd) %*%Dmsd %*%pd Etvd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dtvd) %*%pd Echd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dchd) %*%pd # Coefficient of variation CVlhd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dlhd^2 %*%pd - (t(pd) %*%Dlhd %*%pd)^2)/(t(pd) %*%Dlhd %*%pd) # CVmsd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dmsd^2 %*%pd - (t(pd) %*%Dmsd %*%pd)^2)/(t(pd) %*%Dmsd %*%pd) CVtvd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dtvd))^2 %*%pd - (t(pd) %*%as.matrix(Dtvd) %*%pd)^2)/(t(pd) %*%as.matrix(Dtvd) %*%pd) CVchd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dchd))^2 %*%pd - (t(pd) %*%as.matrix(Dchd) %*%pd)^2)/(t(pd) %*%as.matrix(Dchd) %*%pd) Dlhi<-LeHo(Ri) # Dmsi<-MousaviSehhati(Pi) Dtvi<-dist(Ri,"manhattan")/2 # Dchi<-dist(Ci,"euclidean")^2 Elhi[q1,q2,pi,pj]<-t(p2) %*%Dlhi %*%p2 # Emsi[q1,q2,pi,pj]<-t(p2) %*%Dmsi %*%p2 Etvi[q1,q2,pi,pj]<-t(p2) %*%as.matrix(Dtvi) %*%p2 # Echi[q1,q2,pi,pj]<-t(p1) %*%as.matrix(Dchi) %*%p1 } } } } Elhs<-as.data.frame(rbind(Elhd[1,1,1,],Elhd[2,2,1,],Elhd[3,3,1,],Elhd[4,4,1,])) Etvs<-as.data.frame(rbind(Etvd[1,1,1,],Etvd[2,2,1,],Etvd[3,3,1,],Etvd[4,4,1,])) Emss<-as.data.frame(rbind(Emsd[1,1,1,],Emsd[2,2,1,],Emsd[3,3,1,],Emsd[4,4,1,])) Echs<-as.data.frame(rbind(Echd[1,1,1,],Echd[2,2,1,],Echd[3,3,1,],Echd[4,4,1,])) CVlhs<-as.data.frame(rbind(CVlhd[1,1,1,],CVlhd[2,2,1,],CVlhd[3,3,1,],CVlhd[4,4,1,])) CVtvs<-as.data.frame(rbind(CVtvd[1,1,1,],CVtvd[2,2,1,],CVtvd[3,3,1,],CVtvd[4,4,1,])) CVmss<-as.data.frame(rbind(CVmsd[1,1,1,],CVmsd[2,2,1,],CVmsd[3,3,1,],CVmsd[4,4,1,])) CVchs<-as.data.frame(rbind(CVchd[1,1,1,],CVchd[2,2,1,],CVchd[3,3,1,],CVchd[4,4,1,])) cats<-as.character(qs) skew<-as.character(dis)#as.character(1:nint) sk_distance_to_plot = function(exp_mean_dist,distance_name){ exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |> rename(`2 cats` = V1, `3 cats` = V2, `5 cats` = V3, `10 cats` = V4) |> mutate(method=distance_name) |> pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`), names_to = "Number of categories", values_to = "Mean distance") return(exp_mean_dist) } all_exp_mean_dist3=rbind(sk_distance_to_plot(t(as.matrix(Elhs)),"Le-Ho"), sk_distance_to_plot(t(as.matrix(Etvs)),"Total Variance Distance") ) paper_plot_skew3=all_exp_mean_dist3 |> mutate(`Number of categories`=fct_inorder(`Number of categories`), method=fct_inorder(method),) |> ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) + geom_line() + geom_point() + facet_wrap(~method,scales="free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") + xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom") print(paper_plot_skew3,width=8,height=4) ``` ## Appendix E FIFA variable distributions ### Code for `Figure 10` ```{r, fig.height=12,fig.width=8} # load("data/Fifa21NL.RData") # Load the data data(fifa_nl,package="manydist") #Xorig<-X<-ddata[,-c(1,2)] Xorig<-X<-fifa_nl[,-c(1,2,19:24)] # vars 1 and 2 are a bit useless (too many categories) # vars 19:24 interesting but same scale and little variation # leaving them out makes the plots a bit smaller. # leaving them in doesn't seem to change a lot n<-nrow(X) # p<-ncol(X) K<-10 # Number of clusters in cluster analysis nd<-2 # Dimensionality of MDS solution X=X[,c(8:1,9:16)] # Order Xorig=Xorig[,c(8:1,9:16)] # Order X=X[,-3] # Remove int rep: 380 have something, 21+7 in other cats Xorig=Xorig[,-3] X=X[,c(1:12,14,13,15)] Xorig=Xorig[,c(1:12,14,13,15)] p<-ncol(X) # First convert integers to numeric manually X[8:15] <- lapply(X[8:15], as.numeric) # reordering / numeric first X <- X %>% select(where(is.integer), where(is.numeric), where(is.factor)) X_tib_num = X |> as_tibble() |> select(where(is.numeric)) X_tib_cat = X |> as_tibble() |> select(where(is.factor)) levels(X_tib_cat$work_rate)=c("H/H", "H/L", "H/M", "L/H", "L/L", "L/M", "M/H", "M/L", "M/M") levels(X_tib_cat$club_name) = c("ADO","Ajax","AZ","Emmen","Groningen","Twente","Utrecht", "Feyenoord","Fortuna","Heracles","PEC","PSV","RKC","Heerenveen", "Sparta","Vitesse","VVV","Willem II") X_tib_cat=X_tib_cat |> rename(`club`=club_name, `position`=team_position) X_tib_num_nest = tibble(values=X_tib_num |> map((~.x)),features=names(X_tib_num))|> mutate( hplot=map2(.x=values,.y=features, function(x=.x,y=.y){ my_tib=tibble(a=x) names(my_tib) = y my_plot= my_tib |> ggplot(aes(x)) + geom_histogram(bins=15,alpha=.75)+theme_bw()+ggtitle(y)+ theme(plot.title = element_text(hjust = 0.5))+xlab("")+ylab("") } ) ) X_tib_cat_nest = tibble(values=X_tib_cat |> map((~.x)),features=names(X_tib_cat))|> mutate( bplot=map2(.x=values,.y=features, function(x=.x,y=.y){ my_tib=tibble(a=x) names(my_tib) = y my_plot= my_tib |> ggplot(aes(x)) + geom_bar(alpha=.75)+theme_bw()+ggtitle(y)+ theme(plot.title = element_text(hjust = 0.5), axis.text.x =element_text(angle=90 ))+xlab("")+ylab("") } ) ) fifa_descriptive_plots=(X_tib_cat_nest$bplot[[1]]/ X_tib_cat_nest$bplot[[2]]/ X_tib_cat_nest$bplot[[3]]/ X_tib_cat_nest$bplot[[4]]/ X_tib_cat_nest$bplot[[5]]/ X_tib_cat_nest$bplot[[6]]/ X_tib_cat_nest$bplot[[7]])| (X_tib_num_nest$hplot[[1]]/ X_tib_num_nest$hplot[[2]]/ X_tib_num_nest$hplot[[3]]/ X_tib_num_nest$hplot[[4]]/ X_tib_num_nest$hplot[[5]]/ X_tib_num_nest$hplot[[6]]/ X_tib_num_nest$hplot[[7]])#+ plot_layout(guides = 'collect') print(fifa_descriptive_plots) ```

Data generation

additive distances

non-additive distances

Compute distances: leave one variable out

Code for Figure 1

Code for Figure 2

Retrieval of the true configuration

Data generation

Code for figure Figure 3

Clustering experiment: partitioning around medoids

numsep =.1 and catsep=0.35 generation

Code for Figure 4

Illustration: FIFA player data

Data preparation and analysis

Code for Figure 5

Code for Figure 6

Appendix D Categorical mean distances: skewed distributions

Data generation

Code for Figure 7

Code for Figure 8

Code for Figure 9

Appendix E FIFA variable distributions

Code for Figure 10

References

Code for `Figure 1`

Code for `Figure 2`

Code for figure `Figure 3`

Code for `Figure 4`

Code for `Figure 5`

Code for `Figure 6`

Code for `Figure 7`

Code for `Figure 8`

Code for `Figure 9`

Code for `Figure 10`