Unbiased mixed variables distances

supplementary material

Note

The package manydist is available on GitHub and it can be installed using the following code:

devtools::install_github("alfonsoIodiceDE/manydist_package/manydist")

library(aricode)
library(cluster)
library(fpc)
library(manydist)
library(patchwork)
library(tidyverse)
library(varhandle)
library(vegan)

Here we provide the code to reproduce the results of the supplementary material of the paper “Unbiased mixed variables distances”.

Code

CongruenceCoeff<-function(L1,L2){
  L1<-dist(L1)
  L2<-dist(L2)
  CC<-sum(diag(t(L1) %*% L2)) / sqrt(sum(diag(t(L1) %*% L1))*sum(diag(t(L2) %*% L2)))
  return(CC)
}

LeHo<-function(X){  #  Returns the LeHo Distance (symmetric Kullback-Leibler)
                    #  X is matrix with conditional probabilities
#if(any(rowSums(X)!=1)){
#  print("Input must be conditional probs (row sums should be 1)")
#} else {
  
require(philentropy)

n<-nrow(X)
D<-matrix(0,n,n)
for (i in 1:(n-1)){
  for (j in (i+1):n){
    D[j,i]<-D[i,j]<-distance(X[c(i,j),],"kullback-leibler",unit="log2")+
      distance(X[c(j,i),],"kullback-leibler",unit="log2")}

}

return(D)
}
#  }

Variable specific effects

Data generation

Code

reps<-100  # 100 can take 30min
#reps<-5  # 100 can take 30min
n<-500
k<-2 # Dimension solution
sigma<-0.03 # Noise

porig<-6 # Number of variables corresponding to true config
pnnoise<-0 # Number of numerical noise vars
pcnoise<-0 # Number of categorical noise vars
pn<-2 # Number of numerical variables underying config

pnum<-pn+pnnoise+pcnoise  # Total number of numerical vars+noise vars (Needed to know which vars to discretize)
p<-porig+pnnoise+pcnoise # Total number of variables

qoptions<-c(2,3,5,9) # Distribution of the categorical variables corresponding to true config
pcat<-length(qoptions) # Number of cat variables underlying config

# We consider some variants:
presets<-c("custom","HLa","HLe","catdis","gower","unbiased_dependent","euclidean_onehot","catdissim")


mad_importances<-cc_importances<-array(NA,dim=(c(reps,length(presets),p)))



t0<-Sys.time()
for (rep in 1:reps){
  set.seed(1234+rep)
  # Generate 2d data:
  T<-cbind(runif(n,-2,2),runif(n,-2,2)) # Random Uniform
#  print(T[1:5,])
  SVDT <- svd(T)
  Y<-SVDT$u  # TU is orthonormal. This is the underlying 2d config
  
  R<-NULL
  for (i in 1:porig){
    R<-cbind(R,runif(2,-2,2)) # or uniform. Seems to have little impact
  }
  
  X<-Y%*%R # Inflate/expand the data
  
  Xorig<-as.data.frame(X)
  X<-as.data.frame(Xorig)
  
  
  # Add noise (we do this before making categorical; could also be after)
  
  for (i in 1:porig){
    X[,i]<-X[,i]+rnorm(n,0,sigma)
  }
  
  ### Or: Add noise columns. Completely random numerical variables
  
  N<-NULL
  if (pnnoise>0){
    for (i in 1:pnnoise){
      N<-cbind(N,rnorm(n))
    }
  }
  if (pcnoise>0){
    qr[rep]<-round(runif(1,3,9))
    for (i in 1:pcnoise){
      Cj<-as.factor(round(runif(n,1.5,qr[rep]+0.5)))
      if (is.null(N)){
        N<-cbind(N,Cj)
        N<-as.factor(N)
      } else {
        N<-cbind.data.frame(N,Cj)}
    }
    
    colnames(N)<-(1:(pnnoise+pcnoise))
  }
  
  if ((pnnoise+pcnoise)>0) {
    Xorig<-cbind.data.frame(N,Xorig)  # The random variables are the first pnoise
    X<-cbind.data.frame(N,X)
  }
  
  # END add noise columns
  
  
  
  for (i in (pnum+1):p){ # Create cat variables by simply evenly cutting based on range
    X[,i]<-cut(X[,i],qoptions[i-pnum])
  }

  for (pr in 1:length(presets)){
    # For each preset, first create full distance matrix:
    print(pr)
    if (pr==1){  # This is the real, numerical data: We use manhattan on scaled values:
      Dall<-mdist(Xorig,preset=presets[pr],distance_cont = "manhattan",commensurable=FALSE, scaling="std")
    } else if (pr==2){  # This is an HL variant on the manipulated (5 original, 5 discretized) data
      # Below we specify that we use manhattan, numerical is scaled, categorical uses HL scaling
      Dall<-mdist(X,preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
    } else if (pr==3){ # HL Euclidean
      Dall<-mdist(X,
                  distance_cont = "euclidean",
                  distance_cat = "HLeucl",
                  commensurable = FALSE,
                  scaling = "std")  
    } else if (pr==4){  # catdis
      Dall<-mdist(X,preset="custom",distance_cat = "cat_dis",commensurable = TRUE)
    } else if (pr==5){  # Gower
      Dall<-mdist(X,preset=presets[pr])*ncol(X)
      
    } else { # The other presets are the 3 options
      Dall<-mdist(X,preset=presets[pr])}
    
    MDS_all <- cmdscale(Dall,eig=TRUE, k=2)
    
    
    # Leave-one-out:    
    for (j in 1:p){
      if (pr==1){ 
        Dj<-mdist(Xorig[,-j],preset=presets[pr],distance_cont = "manhattan",commensurable=FALSE, scaling="std")
      } else if (pr==2){
        Dj<-mdist(X[,-j],preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
      } else if (pr==3){ # HL 2
        Dj<-mdist(X[,-j],distance_cont = "euclidean",
                  commensurable = FALSE,
                  scaling = "std",
                  distance_cat = "HLeucl")  
      } else if (pr==4){ # Unbiased catdissim
        Dj<-mdist(X[,-j],preset="custom",
                  distance_cat = "cat_dis",commensurable = TRUE)
      } else if (pr==5){  # Gower
        Dj<-mdist(X[,-j],preset=presets[pr])*(ncol(X)-1)
      } else { # the remaining presets
        Dj<-mdist(X[,-j],preset=presets[pr])}
      
      mad_importances[rep,pr,j]<-mean(abs(Dall-Dj))
      MDS_j <- cmdscale(Dj,eig=TRUE, k=2)
      cc_j<-CongruenceCoeff(MDS_all$points[,1:2], MDS_j$points[,1:2])
      cc_importances[rep,pr,j]<-cc_j
      
    }

  }
  
}  # End replications
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8

(elapsed<-Sys.time()-t0)
## Time difference of 21.07505 mins

ac_importances<-sqrt((1-cc_importances^2)) # We calculated congruences

variants<-c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind")
varreorder<-c(1,7,3,2,5,8,4,6)
mad_importances<-mad_importances[,varreorder,]
ac_importances<-ac_importances[,varreorder,]
mad<-ac<-madranks<-ccranks<-maddis<-madrel<-acrel<-NULL
mad_dists<-madrels<-acrels<-array(NA,dim=c(length(presets),reps,p))

for (pr in 1:length(presets)){
  mad<-rbind(mad,colMeans(mad_importances[,pr,]))
  ac<- rbind(ac,colMeans(ac_importances[,pr,]))
  maddis<-rbind(maddis,colMeans(mad_importances[,pr,]/rowSums(mad_importances[,pr,])))
  
  
  mad_dists[pr,,]<-mad_importances[,pr,]/rowSums(mad_importances[,pr,]) # We need all distributions
  madrels[pr,,]<-mad_dists[pr,,]/mad_dists[1,,]  # If larger: bigger diff than true
  madrel<-rbind(madrel,colMeans(madrels[pr,,]))
  

  acrels[pr,,]<-ac_importances[,pr,]/ac_importances[,1,]  # 
  acrel<-rbind(acrel,colMeans(acrels[pr,,]))
  
}

Measures<-list()
Measures[[1]]<-data.frame(t(mad))
Measures[[2]]<-data.frame(t(maddis))
Measures[[3]]<-data.frame(t(ac))
Measures[[4]]<-data.frame(t(madrel))
Measures[[5]]<-data.frame(t(acrel))



Measures_tib= tibble(meas=Measures,
                     meas_name=c("MAD","MAD distribution","AC","MAD rel","AC rel")) 

save(Measures_tib,file="Measures_tib.RData")

Code for `Figure 1`

Code


variants<-c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind")
varreorder<-c(1,7,3,2,5,8,4,6)
meas_to_plot_mad = Measures_tib |> 
  mutate(meas=map(.x=meas,function(x=.x){colnames(x)=variants[varreorder];return(x)}),
         meas=map(.x=meas,~.x |> slice(c(3:6,1,2)))
  ) |> 
  unnest(meas) |>mutate(loo_v=paste0("var",rep(1:6,times=5))) |>  
  filter(meas_name %in% c("MAD"))

meas_to_plot_mad_rel = Measures_tib |> 
  mutate(meas=map(.x=meas,function(x=.x){colnames(x)=variants[varreorder];return(x)}),
         meas=map(.x=meas,~.x |> slice(c(3:6,1,2)))
  ) |> 
  unnest(meas) |>mutate(loo_v=paste0("var",rep(1:6,times=5))) |>  
  filter(meas_name %in% c("MAD distribution"))

###################
mad_max = max(meas_to_plot_mad |> select(where(is.numeric)))
rect_data = tibble(
  xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max,
  xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max)

mad_max_rel = max(meas_to_plot_mad_rel |> select(where(is.numeric)))
rect_data_rel = tibble(
  xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max_rel,
  xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max_rel)


loo_mad_plot = meas_to_plot_mad |> pivot_longer(cols=Num:Udep, names_to="variant_name",values_to="value") |> 
  ggplot(aes(x = parse_number(loo_v), y = value,
             color = variant_name)) +
  geom_rect(data=rect_data,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
            alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
  geom_rect(data=rect_data,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
            alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
  geom_point()+geom_line(aes(lty=variant_name)) + 
  ylab("absolute variable contribution to distance")+
  xlab("left-out variable") +  theme_bw() + theme(legend.position = "none")+
  scale_x_continuous(breaks=seq(1,6,1),labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2"))+
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

loo_mad_rel_plot = meas_to_plot_mad_rel |> pivot_longer(cols=Num:Udep, names_to="variant_name",values_to="value") |> 
  ggplot(aes(x = parse_number(loo_v), y = value,
             color = variant_name)) + 
  geom_rect(data=rect_data_rel,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
            alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
  geom_rect(data=rect_data_rel,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
            alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
  geom_point()+geom_line(aes(lty=variant_name)) + 
  ylab("relative variable contribution to distance") +
  xlab("left-out variable") +  theme_bw() + 
  scale_x_continuous(breaks=seq(1,6,1),labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2"))+
  theme(axis.text.x = element_text(angle = 60, hjust = 1))


fig_1_plot = loo_mad_plot | loo_mad_rel_plot

print(fig_1_plot, width = 8, height = 4)

Code for `Figure 2`

Code

meas_to_plot_ac = Measures_tib |> 
  mutate(meas=map(.x=meas,function(x=.x){colnames(x)=variants[varreorder];return(x)}),
         meas=map(.x=meas,~.x |> slice(c(3:6,1,2)))
  ) |> 
  unnest(meas) |>mutate(loo_v=paste0("var",rep(1:6,times=5))) |>  
  filter(meas_name %in% c("AC rel"))

ac_max = max(meas_to_plot_ac |> select(where(is.numeric)))
rect_data_ac = tibble(
  xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=ac_max,
  xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=ac_max)

ac_plot = meas_to_plot_ac |> pivot_longer(cols=Num:Udep, names_to="variant_name",values_to="value") |> 
  ggplot(aes(x = parse_number(loo_v), y = value,
             color = variant_name)) + 
  geom_rect(data=rect_data_ac,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
            alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
  geom_rect(data=rect_data_ac,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
            alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
  geom_point()+geom_line(aes(lty=variant_name)) + 
  ylab("alienation coefficient")+
  xlab("left-out variable") +  theme_bw() + 
  scale_x_continuous(breaks=seq(1,6,1),labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2"))+
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

print(ac_plot, width = 8, height = 4)

Retrieval of the true configuration

Data generation

Code

reps <- 100
n<-500 # large is needed for the many (9) categories scenario
k<-2
sigma<-0.03  # standard deviation of Xorig variables is 
# is around 0.067

porig<-6 # Number of variables corresponding to true config
pnnoise<-0 # Number of numerical noise vars
pcnoise<-0 # Number of categorical noise vars
pn<-3 # Number of numerical variables underying config

pnum<-pn+pnnoise+pcnoise  # Total number of numerical vars+noise vars (Needed to know which vars to discretize)
p<-porig+pnnoise+pcnoise # Total number of variables

qoptions<-c(2,3,5,9) # Distribution of the categorical variables corresponding to true config
pcat<-length(qoptions) # Number of cat variables underlying config

# We consider some variants:
presets<-c("custom","HL","HLEuclid","catdis", "gower","unbiased_dependent", "euclidean_onehot","catdissim")#,"gower2","cat dissim")
CC<-array(NA,dim=c(reps,length(presets),length(qoptions)))
qr<-rep(NA,reps)

#set.seed(123)
t0<-Sys.time()
for (rep in 1:reps){
  set.seed(1234+rep)
  # Generate 2d data:
  T<-cbind(runif(n,-2,2),runif(n,-2,2)) # Random Uniform
  
  SVDT <- svd(T)
  Y<-SVDT$u  # TU is orthonormal. This is the underlying 2d config
  
  R<-NULL
  for (i in 1:porig){
    R<-cbind(R,runif(2,-2,2)) # or uniform. Seems to have little impact
  }
  
  X<-Y%*%R # Inflate/expand the data
  
  Xorig<-as.data.frame(X)
  X<-as.data.frame(Xorig)
  # Add noise (we do this before making categorical; could also be after)
  
  for (i in 1:porig){
    X[,i]<-X[,i]+rnorm(n,0,sigma)
  }
  
  ### Or: Add noise columns. Completely random numerical variables
  
  N<-NULL
  if (pnnoise>0){
    for (i in 1:pnnoise){
      N<-cbind(N,rnorm(n))
    }
  }
  if (pcnoise>0){
    qr[rep]<-round(runif(1,3,9))
    for (i in 1:pcnoise){
      Cj<-as.factor(round(runif(n,1.5,qr[rep]+0.5)))
      if (is.null(N)){
        N<-cbind(N,Cj)
        N<-as.factor(N)
      } else {
        N<-cbind.data.frame(N,Cj)}
    }
    
    colnames(N)<-(1:(pnnoise+pcnoise))
  }
  
  if ((pnnoise+pcnoise)>0) {
    Xorig<-cbind.data.frame(N,Xorig)  # The random variables are the first pnoise
    X<-cbind.data.frame(N,X)
  }
  
  # END add noise columns
  
  for (q in 1:length(qoptions)){
    qs<-rep(qoptions[q],p-pnum) # Choice of categories.
    # If number to large, we could get
    # empty categories. All cat variables
    # have same number of categories
    pcat<-length(qs)
    
    for (i in (pnum+1):p){ # Create cat variables by simply evenly cutting based on range
      X[,i]<-cut(Xorig[,i],qs[i-pnum])
    }
    
    for (pr in 1:length(presets)){
      # For each preset, first create full distance matrix:
      if (pr==1){  # This is the real, numerical data: We use manhattan on scaled values:
        Dall<-mdist(Xorig,preset=presets[pr],distance_cont = "manhattan",commensurable=FALSE, scaling="std")
      } else if (pr==2){  # Additive HL, numerical is scaled, categorical uses HL scaling
        Dall<-mdist(X,preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
        
      } else if (pr==3){  # HL Euclid 
        Dall<-mdist(X,
                    distance_cont = "euclidean",
                    commensurable = FALSE,
                    scaling = "std",
                    distance_cat = "HLeucl")
      } else if (pr==4){  # Cat dissimilarity
        Dall<-mdist(X,preset="custom",distance_cat = "cat_dis",commensurable = TRUE)
      } else if (pr==5){  # Gower
        Dall<-mdist(X,preset=presets[pr])*ncol(X)
      } else { # The other presets are the remaining 3 options
        if (pr==7){ # Catch error if no contvars
          if ((pn+pnnoise)==0) {
            print("Naive distances error: No numvars")
            X1<-as.numeric(X[,1])
            X2<-cbind.data.frame(X1,X[,-1])
            Dall<-mdist(X2,preset=presets[pr])
          } else { #pn+pnnoise !=0, pr==7
            Dall<-mdist(X,preset=presets[pr])
          }
        } else {# pr is not 7
          #print(pr)
          Dall<-mdist(X,preset=presets[pr])}
      }
      
      MDS_all <- cmdscale(Dall,eig=TRUE, k=2)
      CC[rep,pr,q]<-CongruenceCoeff(Y,MDS_all$points[,1:2])
    }  # Mix dist variants
    
    
  }  # Number of categories
  
  
}  # End replications

save(CC,file="boxplot_CC_data.RData")

Code for figure `Figure 3`

Code

reps <- 100
qoptions<-c(2,3,5,9)

AC<-sqrt((1-CC^2)) # We calculated congruences
MeasNames<-c("MAD","MAD Distr","Alienation Coeff", "MAD ranks", "CC ranks", "Relative MADs", "Relative ACs")
variants<-c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind")
varreorder<-c(1,7,3,2,5,8,4,6)
str(AC)
##  num [1:100, 1:8, 1:4] 0.2299 0.0585 0.2598 0.2691 0.1671 ...
diff_AC<-array(NA,c(reps,7,4))

for(q in 1:length(qoptions)){
  numeric_AC= matrix(rep(AC[,1,q],7),nrow=reps)
  diff_AC[,,q]<-AC[,-1,q]-numeric_AC   
}


AC_tib = rbind(as_tibble(data.frame(AC[,,1])) |> mutate(categories="two cats"),
               as_tibble(data.frame(AC[,,2])) |> mutate(categories="three cats"),
               as_tibble(data.frame(AC[,,3])) |> mutate(categories="five cats"),
               as_tibble(data.frame(AC[,,4])) |> mutate(categories="nine cats")) |> 
  mutate(categories=fct_inorder(categories))

names(AC_tib) = c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind","categories")

variant_ordering = c(1,7,3,2,5,8,4,6)

AC_tib = AC_tib |> select(variant_ordering,categories)



AC_tib_long = AC_tib |> pivot_longer(
  cols=c("Num","Naive","HL", "HLa","G","Uind","Ustd","Udep"),
  names_to="variants", values_to="diff_AC")  |> mutate(variants=fct_inorder(variants))

gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

my_colors= c("darkgrey",gg_color_hue(7)[c(4,2,3,1,5,6,7)])

boxplot_figure_simu = AC_tib_long |> ggplot(aes(x=(variants), y=diff_AC))+
  geom_boxplot(aes(fill=variants)) +
  facet_wrap(~categories)+ theme_bw()+
  scale_fill_manual(values=my_colors) + theme(legend.position = "none")+
  ylab("Alienation coefficient") + xlab("Variants") 

print(boxplot_figure_simu,width=8.5,height=6)

Illustration: FIFA player data

Data preparation and analysis

Code


# Variables are organized: Categorical first, then numerical
#load("Fifa21NL.RData") # Load the data
data(fifa_nl)
#Xorig<-X<-ddata[,-c(1,2)]
Xorig<-X<-fifa_nl[,-c(1,2,19:24)] # vars 1 and 2 are a bit useless (too many categories)
# vars 19:24 interesting but same scale and little variation
# leaving them out makes the plots a bit smaller. 
# leaving them in doesn't seem to change a lot


n<-nrow(X)
p<-ncol(X)
K<-10  # Number of clusters in cluster analysis
nd<-2 # Dimensionality of MDS solution
X=X[,c(8:1,9:16)]  # Order
X=X[,-3]  # Remove int rep: 380 have something, 21+7 in other cats
X=X[,c(1:12,14,13,15)]
p<-ncol(X)

presets<-c("HL","HLEuclid","catdis", "gower","unbiased_dependent", "euclidean_onehot","catdissim")#,"gower2","cat dissim")

mad_importances<-cc_importances<-cc_ranks<-mad_ranks<-matrix(NA,length(presets),p)
# ASWs<-array(NA,dim=c(length(presets),p+1,K-1))
#FullClusters<-matrix(NA,n,K-1)
# ARIs<-array(NA,dim=c(length(presets),p,K-1))

t0<-Sys.time()
for (pr in 1:length(presets)){
  # For each preset, first create full distance matrix:
  if (pr==1){  # This is an HL variant on the manipulated (5 original, 5 discretized) data
    # Below we specify that we use manhattan, numerical is scaled, categorical uses HL scaling
    Dall<-mdist(X,preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
  } else if (pr==2){ # HL Euclidean
    Dall<-mdist(X,
                distance_cont = "euclidean",
                commensurable = FALSE,
                scaling = "std",
                distance_cat = "HLeucl")  
  } else if (pr==3){  # Catdissim
    Dall<-mdist(X,preset="custom",distance_cat = "cat_dis",commensurable = TRUE)
  } else if (pr==4){  # Gower
    Dall<-mdist(X,preset=presets[pr])*ncol(X)
    #   Dall<-mdist(X,distance_cat = "matching",
    #              distance_cont = "manhattan",
    #             cat_scaling = "none",
    #            cont_scaling = "range",
    #           commensurable = FALSE,
    #           preset = "custom")
  } else { # The other presets are the 3 options
    Dall<-mdist(X,preset=presets[pr])}
  
  MDS_all <- cmdscale(Dall,eig=TRUE, k=nd)  # Full MDS solution
  # FullClusters<-matrix(NA,n,K-1)            # Full cluster solution
  # for (k in 2:K){                           # we do this for k in 2:K
  #   pam.out<-pam(Dall,k)
  #   ASWs[pr,1,k-1]<-pam.out$silinfo$avg.width  # if we want to compare asw
  #   FullClusters[,k-1]<-pam.out$clustering     # For each K we get a cluster solution. We compare solutions leaving one-variable out later
  #   }
  # 
  # Leave-one-out:   (still same pr)  
  for (j in 1:p){
    if (pr==1){
      Dj<-mdist(X[,-j],preset="custom",distance_cont = "manhattan",distance_cat = "HL",  commensurable=FALSE, scaling="std")
    } else if (pr==2){ # HL 2
      Dj<-mdist(X[,-j],distance_cont = "euclidean",
                commensurable = FALSE,
                scaling = "std",
                distance_cat = "HLeucl")  
    } else if (pr==3){ # Unbiased catdissim
      Dj<-mdist(X[,-j],preset="custom",
                distance_cat = "cat_dis",commensurable = TRUE)
    } else if (pr==4){  # Gower
      Dj<-mdist(X[,-j],preset=presets[pr])*(ncol(X)-1)
      #  Dall<-mdist(X[,-j],distance_cat = "matching",
      #             distance_cont = "manhattan",
      #             cat_scaling = "none",
      #            cont_scaling = "range",
      #           commensurable = FALSE,
      #           preset = "custom")
    } else { # the remaining presets
      Dj<-mdist(X[,-j],preset=presets[pr])}
    
    mad_importances[pr,j]<-mean(abs(Dall-Dj))  # Compare MAD
    MDS_j <- cmdscale(Dj,eig=TRUE, k=nd)       # Needed to compare MDS
    cc_j<-CongruenceCoeff(MDS_all$points[,1:nd], MDS_j$points[,1:nd])
    cc_importances[pr,j]<-cc_j                 # The congruence coefficients
    
    # for (k in 2:K){        # Now compare for each k, the obtained clusters
    #   pam.out<-pam(Dj,k)
    #   ASWs[pr,j+1,k-1]<-pam.out$silinfo$avg.width
    #   ARIs[pr,j,k-1]<-ARI(FullClusters[,k-1],pam.out$clustering)  # Compare, 
    #                                                             # for each k, all variable clusters with
    #                                                             # leave-one-out clusters
    #   
    # }   # This gives per pr and k, the effect (ARI) of leaving individual variables out on the clustering
  }
  
  # rankings: (Not sure needed)
  cc_ranks[pr,]<-rank(cc_importances[pr,])
  # mad_ranks[pr,]<-rank(mad_importances[pr,])
  
}

# (elapsed<-Sys.time()-t0)

variants<-c("HLa","HL", "Ustd", "G","Udep","Naive","Uind")
varreorder<-c(6,2,1,4,7,3,5)  # To get better sequence
mad_importances<-t(mad_importances)

cc_importances<-sqrt((1-cc_importances^2)) # Change to Alienation
cc_importances<-t(cc_importances)
varnames<-rownames(mad_importances) <-rownames(cc_importances)<-names(X)

# 
varnames[7]<-"position"
#varnames[3]<-"int_repu"
varnames[1]<-"pref_foot"
varnames[15]<-"clause_eur"
varnames[6]<-"club"
# 
colnames(cc_importances)<-colnames(mad_importances)<-variants
# Reorder appropriate fields:
mad_importances<-mad_importances[,varreorder] 
cc_importances<-cc_importances[,varreorder]
# ARIs<-ARIs[varreorder,,]
# ASWs<-ASWs[varreorder,,]
# Create relative importance
madrel<-sweep(mad_importances,2,colSums(mad_importances),`/`)
rownames(mad_importances)<-rownames(cc_importances)<-varnames

Code for `Figure 4`

Code

max_y= max(mad_importances)
min_y= 0
max_y_rel= max(madrel)
min_y_rel= 0
min_x=0
max_x=p
max_y_cc=max(cc_importances)
rect_data = tibble(
  xmin_cont=7.5,xmax_cont=15,ymin_cont=0,ymax_cont=max_y,
  xmin_cat=.5,xmax_cat=7.5,ymin_cat=0,ymax_cat=max_y)

rect_data_rel = tibble(
  xmin_cont=7.5,xmax_cont=15,ymin_cont=0,ymax_cont=max_y_rel,
  xmin_cat=.5,xmax_cat=7.5,ymin_cat=0,ymax_cat=max_y_rel)


mad_imp_plot = mad_importances |> as_tibble() |> 
  mutate(variable_names=rownames(mad_importances),
         var_number=1:nrow(mad_importances)) |>
  pivot_longer(cols=Naive:Udep,names_to="variant",values_to="value") |> 
  ggplot(aes(x=var_number,y=value,color=variant))+
  geom_rect(data=rect_data,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
            alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
  geom_rect(data=rect_data,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
            alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
  geom_point()+geom_line()+
  ylab("absolute variable contribution to distance") +
  xlab("left-out variable") +  theme_bw() + scale_x_continuous(breaks=seq(1,p,1),labels = varnames)+
  theme(axis.text.x = element_text(angle = 60, hjust = 1),
        legend.position = "none")

madrel_plot = madrel |> as_tibble() |> 
  mutate(variable_names=rownames(mad_importances),
         var_number=1:nrow(mad_importances)) |>
  pivot_longer(cols=Naive:Udep,names_to="variant",values_to="value") |> 
  ggplot(aes(x=var_number,y=value,color=variant))+
  geom_rect(data=rect_data_rel,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
            alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
  geom_rect(data=rect_data_rel,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
            alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
  geom_point()+geom_line()+
  ylab("relative variable contribution to distance") +
  xlab("left-out variable") +  theme_bw() + scale_x_continuous(breaks=seq(1,p,1),labels = varnames)+
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

fifa_mad = mad_imp_plot | madrel_plot

print(fifa_mad, width = 10, height = 5)

Code for `Figure 5`

Code

max_y_cc=max(cc_importances)

rect_data_cc = tibble(
  xmin_cont=7.5,xmax_cont=15,ymin_cont=0,ymax_cont=max_y_cc,
  xmin_cat=.5,xmax_cat=7.5,ymin_cat=0,ymax_cat=max_y_cc)


cc_imp_plot = cc_importances |> as_tibble() |> 
  mutate(variable_names=rownames(cc_importances),
         var_number=1:nrow(cc_importances)) |>
  pivot_longer(cols=Naive:Udep,names_to="variant",values_to="value") |> 
  ggplot(aes(x=var_number,y=value,color=variant))+
  geom_rect(data=rect_data_cc,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
            alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
  geom_rect(data=rect_data_cc,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
            alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
  geom_point()+geom_line()+
  ylab("alienation coefficient") +
  xlab("left-out variable") +  theme_bw() + scale_x_continuous(breaks=seq(1,p,1),labels = varnames)+
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

print(cc_imp_plot,width = 8, height = 4)

Appendix D Categorical mean distances: skewed distributions

Data generation

Code

# Let's consider some specific unbalanced scenarios
# One probility is d. The others are (1-d)/(q-1)
# We consider skewed options:
dis<-c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)

#qi=5
qs<-c(2,3,5,10)#,18) # Some options for qi

nint<-length(dis)    # Number of options for d (nint-1)
rsm<-rsma<-rhl<-sdsm<-rse<-rsof<-rsiof<-sdse<-rsa<-sdsa<-sdsof<-sdsiof<-sddsma<-sdssm<-
  sddse<-sddsa<-sddsm<-sdhl<-sddsmca<-sddsof<-sddsiof<-sddshl<- sdshl<-
  rsmca<-sdsmca<-esm<-em<-ee<-ea<-ehl<-emca<-eof<-eiof<-matrix(NA,nrow=nint,ncol=length(qs))

#par(mfrow=c(2,2))
for (qis in 1:length(qs)){
  #ds<-NULL
  qi<-qs[qis]
  for (i in (1:nint)){ # variant 2
    
    Deltam<-matrix(1,qi,qi)    # Matching
    diag(Deltam)<-rep(0,qi)    # Matching Delta
    
    #d=i/(2*qi)  # i=2 corresponds to equal probs if variant 1 is chosen. (Only works for one qi. Not for qs loop)
    d=dis[i]  # variant 2
    
    #ds<-rbind(ds,d)
    p<-c(d,rep((1-d)/(qi-1),(qi-1)))
    sum(p)
    
    Dp<-diag(p)
    pp<-p %*% t(p)
    spi<-(diag(Dp-pp))^-1
    spi5<-(diag(Dp-pp))^-.5
    
    DEsk<-(2/(qi^2))*Deltam # Delta Eskin
    
    DOF<-log(p)%*%t(log(p)) * Deltam  # OF
    n=160 # For inv OF we need n. Soon gets very large (increases in n) 160 is sufficient
    #n<-ceiling(1/min(p))
    DIOF<-log(n*p)%*%t(log(n*p)) * Deltam  # inverse OF. If some np<1 we can get negatives
    
    n=160  # For HL we need to find a way to get somewhat consistent values. Let's choose n large
    marginals=p*n
    
    HLemp<-distancefactor(cat=qi,n=n,catsizes=marginals) #
    DHL<-Deltam*HLemp*2#*sqrt(2)  # NOTE 13/09: Corrected: Was sqrt(2). But, should be, see paper, 2x
    #DE<-sqrt((1/qi)*(spi %*% t(rep(1,qi)) + t(spi %*% t(rep(1,qi)))))
    #diag(DE)<-rep(0,qi)
    
    DM<-sqrt(1/qi)*(spi5 %*% t(rep(1,qi)) + t(spi5 %*% t(rep(1,qi))))
    diag(DM)<-rep(0,qi)
    
    DA<-(1/qi)*diag(spi5)%*%Deltam%*%diag(spi5)
    DMCA<-(1/qi) * diag(p^-.5)%*%Deltam%*%diag(p^-.5)
    
    #we<-ee[i,(qis)]<- t(p)%*%DE%*%p
    
    wsm<-esm[i,(qis)]<- t(p)%*%Deltam%*%p   # Simple Matching
    we<-ee[i,(qis)]<- t(p)%*%DEsk%*%p       # Eskin
    wof<-eof[i,(qis)]<-t(p)%*%DOF%*%p       # OF
    wiof<-eiof[i,(qis)]<-t(p)%*%DIOF%*%p    # IOF
    whl<-ehl[i,(qis)]<-t(p)%*%DHL%*%p       # HL
    wm<-em[i,(qis)]<- t(p)%*%DM%*%p         # SD scaling
    wa<-ea[i,(qis)]<- t(p)%*%DA%*%p         # Cat dis scaling
    wmca<-emca[i,(qis)]<- t(p)%*%DMCA%*%p   # MCA scaling
    
    # We can also estimate the variance among the distances using Deltas: E(X^2)-E(X)^2
    # We can do this dividing by expected values so that we look at what happens
    # when delta is commensurable. Or not
    # if not: 
    
    #we<-wm<-wa<-wmca<-wof<-wiof<-whl<-wsm<-1
    
    sddse[i,(qis)]<- sqrt(t(p)%*%DEsk^2%*%p - ee[i,(qis)]^2)/we  # Eskin
    sddsma[i,(qis)]<- sqrt(t(p)%*%Deltam^2%*%p - esm[i,(qis)]^2)/ wsm  # Simple Matching
    sddshl[i,(qis)]<- sqrt(t(p)%*%DHL^2%*%p - ehl[i,(qis)]^2)/ whl  # HL
    sddsm[i,(qis)]<- sqrt(t(p)%*%DM^2%*%p - em[i,(qis)]^2)/ wm  # SD 
    sddsa[i,(qis)]<- sqrt(t(p)%*%DA^2%*%p -  ea[i,(qis)]^2)/wa # Cat dissim
    sddsmca[i,(qis)]<- sqrt(t(p)%*%DMCA^2%*%p - emca[i,(qis)]^2)/wmca  # MCA
    sddsof[i,(qis)]<-sqrt(t(p)%*%DOF^2%*%p - eof[i,(qis)]^2)/wof  # OF
    sddsiof[i,(qis)]<-sqrt(t(p)%*%(DIOF/as.numeric(wiof))^2%*%p - 1 )  #IOF
    
    # Let's consider spread among cat dissimilarities. But: This only makes sense
    # when on similar scales. So, use commensurable deltas: (Or not: see above)
    sdse[i,qis]<-sd((1/we)*DEsk[upper.tri(DEsk)])  # Eskin
    sdsm[i,qis]<-sd((1/wm)*DM[upper.tri(DM)]) # SD
    sdshl[i,qis]<-sd((1/whl)*DHL[upper.tri(DHL)]) # HL
    sdssm[i,qis]<-sd((1/wsm)*Deltam[upper.tri(Deltam)]) # Simple Matching
    sdsa[i,qis]<-sd((1/wa)*DA[upper.tri(DA)]) #  Cat dissim,
    sdsmca[i,qis]<-sd((1/wmca)*DMCA[upper.tri(DMCA)]) # MCA
    sdsof[i,qis]<-sd((1/wof)*DOF[upper.tri(DOF)])  # OF
    sdsiof[i,qis]<-sd((1/wiof)*DIOF[upper.tri(DIOF)]) # IOF
    
    
    
  }
  
}

cats<-as.character(qs)
skew<-as.character(dis)#as.character(1:nint)

Code for `Figure 6`

Code

sk_distance_to_plot = function(exp_mean_dist,distance_name){
  exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |> 
    rename(`2 cats` = V1,
           `3 cats` = V2,
           `5 cats` = V3,
           `10 cats` = V4) |> 
    mutate(method=distance_name) |> 
    pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`), 
                 names_to = "Number of categories", values_to = "Mean distance")
  
  return(exp_mean_dist)
}

all_exp_mean_dist=rbind(sk_distance_to_plot(esm,"Simple Matching"),
                        sk_distance_to_plot(ee,"Eskin"),
                        sk_distance_to_plot(eof,"OF"),
                        sk_distance_to_plot(eiof,"IOF")
)

paper_plot_skew1=all_exp_mean_dist |> 
  mutate(`Number of categories`=fct_inorder(`Number of categories`),
         method=fct_inorder(method),) |> 
  ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) + 
  geom_line() + geom_point() + facet_wrap(~method,scales = "free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") + 
  xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom") 


print(paper_plot_skew1,width=10,height=6)

Code for `Figure 7`

Code

all_exp_mean_dist2=rbind(sk_distance_to_plot(ehl,"Hennig-Liao"),
                         sk_distance_to_plot(em,"Std. dev."),
                         sk_distance_to_plot(ea,"Cat. dissim.")
)

paper_plot_skew2=all_exp_mean_dist2 |> 
  mutate(`Number of categories`=fct_inorder(`Number of categories`),
         method=fct_inorder(method),) |> 
  ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) + 
  geom_line() + geom_point() + facet_wrap(~method) + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") + 
  xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom") 


print(paper_plot_skew2)

Code for `Figure 8`

Code


qs<-c(2,3,5,10)#,18) # Some options for qs

nint<-length(dis)    # Number of options for d (nint-1)

CVchd<-CVtvd<-CVmsd<-CVlhd<-Echi<-Echd<-Etvi<-Etvd<-Emsi<-Elhi<-Elhd<-Emsd<-array(NA,dim=c(length(qs),length(qs),nint,nint))

for (q1 in 1:length(qs)){  # cats row
  qi<-qs[q1]
  for (q2 in q1:length(qs)){ # cats cols
    qj<-qs[q2]
    for (pi in 1:nint){  # row distributions
      d1<-dis[pi]
      p1<-c(d1,rep((1-d1)/(qj-1),(qj-1)))  # nint different marginals: Distribution over colums
      for (pj in 1:nint){
        d2<-dis[pj]
        p2<-c(d2,rep((1-d2)/(qi-1),(qi-1)))  # corresponding col marginals: Distribution over rows
        Pi<-p2 %*% t(p1)   # joint table independent
        Ri<-Pi/rowSums(Pi) # conditional table independent
        #  Ci<-Ri/sqrt(p2)
        
        if (qj>qi) {
          Pd<-diag(p1[1:qi])
          Addzeros<-matrix(0,(qi-1),(qj-qi))
          Addps<-p1[qi+1:(qj-qi)]
          Add<-rbind(Addzeros,Addps)
          Pd<-cbind(Pd,Add)
        } else { #qi=qj
          pd<-p2
          Pd<-diag(pd)       # joint table dependent case (diagonal smallest q)
        }
        Rd<-Pd/rowSums(Pd) 
        Cd<-Rd/sqrt(p2)    # Chi-square table: cond. prob/sqrt(p)
        
        
        # category dissimilarities:
        Dlhd<-LeHo(Rd)                   #  Le Ho
        # Dmsd<-MousaviSehhati(Pd)         # Moussavi
        Dtvd<-dist(Rd,"manhattan")/2     # TVD
        Dchd<-dist(Cd,"euclidean")^2     # Chi2 dist
        #    Dchd<-Dchd/(2*(qi-1))     # Chi2 dist corrected for number of categories...
        
        Elhd[q1,q2,pi,pj]<-t(pd) %*%Dlhd %*%pd
        # Emsd[q1,q2,pi,pj]<-t(pd) %*%Dmsd %*%pd
        Etvd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dtvd) %*%pd
        Echd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dchd) %*%pd
        
        # Coefficient of variation
        
        CVlhd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dlhd^2 %*%pd - (t(pd) %*%Dlhd %*%pd)^2)/(t(pd) %*%Dlhd %*%pd)
        # CVmsd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dmsd^2 %*%pd - (t(pd) %*%Dmsd %*%pd)^2)/(t(pd) %*%Dmsd %*%pd)
        CVtvd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dtvd))^2 %*%pd - (t(pd) %*%as.matrix(Dtvd) %*%pd)^2)/(t(pd) %*%as.matrix(Dtvd) %*%pd)
        CVchd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dchd))^2 %*%pd - (t(pd) %*%as.matrix(Dchd) %*%pd)^2)/(t(pd) %*%as.matrix(Dchd) %*%pd)
        
        
        Dlhi<-LeHo(Ri)
        # Dmsi<-MousaviSehhati(Pi)
        Dtvi<-dist(Ri,"manhattan")/2
        #  Dchi<-dist(Ci,"euclidean")^2 
        
        Elhi[q1,q2,pi,pj]<-t(p2) %*%Dlhi %*%p2
        # Emsi[q1,q2,pi,pj]<-t(p2) %*%Dmsi %*%p2
        Etvi[q1,q2,pi,pj]<-t(p2) %*%as.matrix(Dtvi) %*%p2
        #  Echi[q1,q2,pi,pj]<-t(p1) %*%as.matrix(Dchi) %*%p1
      }
    }
  }
}



Elhs<-as.data.frame(rbind(Elhd[1,1,1,],Elhd[2,2,1,],Elhd[3,3,1,],Elhd[4,4,1,]))
Etvs<-as.data.frame(rbind(Etvd[1,1,1,],Etvd[2,2,1,],Etvd[3,3,1,],Etvd[4,4,1,]))
Emss<-as.data.frame(rbind(Emsd[1,1,1,],Emsd[2,2,1,],Emsd[3,3,1,],Emsd[4,4,1,]))
Echs<-as.data.frame(rbind(Echd[1,1,1,],Echd[2,2,1,],Echd[3,3,1,],Echd[4,4,1,]))

CVlhs<-as.data.frame(rbind(CVlhd[1,1,1,],CVlhd[2,2,1,],CVlhd[3,3,1,],CVlhd[4,4,1,]))
CVtvs<-as.data.frame(rbind(CVtvd[1,1,1,],CVtvd[2,2,1,],CVtvd[3,3,1,],CVtvd[4,4,1,]))
CVmss<-as.data.frame(rbind(CVmsd[1,1,1,],CVmsd[2,2,1,],CVmsd[3,3,1,],CVmsd[4,4,1,]))
CVchs<-as.data.frame(rbind(CVchd[1,1,1,],CVchd[2,2,1,],CVchd[3,3,1,],CVchd[4,4,1,]))

cats<-as.character(qs)
skew<-as.character(dis)#as.character(1:nint)


sk_distance_to_plot = function(exp_mean_dist,distance_name){
  exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |> 
    rename(`2 cats` = V1,
           `3 cats` = V2,
           `5 cats` = V3,
           `10 cats` = V4) |> 
    mutate(method=distance_name) |> 
    pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`), 
                 names_to = "Number of categories", values_to = "Mean distance")
  
  return(exp_mean_dist)
}


all_exp_mean_dist3=rbind(sk_distance_to_plot(t(as.matrix(Elhs)),"Le-Ho"),
                         sk_distance_to_plot(t(as.matrix(Etvs)),"Total Variance Distance")
)

paper_plot_skew3=all_exp_mean_dist3 |> 
  mutate(`Number of categories`=fct_inorder(`Number of categories`),
         method=fct_inorder(method),) |> 
  ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) + 
  geom_line() + geom_point() + facet_wrap(~method,scales="free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") + 
  xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom") 


print(paper_plot_skew3,width=8,height=4)

Appendix E FIFA variable distributions

Code for `Figure 9`

Code

X_tib_num = X |> as_tibble() |> select(where(is.numeric))

X_tib_cat = X |> as_tibble() |> select(where(is.factor))

levels(X_tib_cat$work_rate)=c("H/H", "H/L", "H/M", "L/H", "L/L", "L/M", "M/H", "M/L", "M/M")
levels(X_tib_cat$club_name) = c("ADO","Ajax","AZ","Emmen","Groningen","Twente","Utrecht",
                                "Feyenoord","Fortuna","Heracles","PEC","PSV","RKC","Heerenveen",
                                "Sparta","Vitesse","VVV","Willem II") 

X_tib_cat=X_tib_cat |> rename(`club`=club_name,
                              `position`=team_position)

X_tib_num_nest =  tibble(values=X_tib_num |> map((~.x)),features=names(X_tib_num))|> 
  mutate(
    hplot=map2(.x=values,.y=features,
               function(x=.x,y=.y){
                 my_tib=tibble(a=x)
                 names(my_tib) = y
                 
                 my_plot= my_tib |> ggplot(aes(x)) + geom_histogram(bins=15,alpha=.75)+theme_bw()+ggtitle(y)+
                   theme(plot.title = element_text(hjust = 0.5))+xlab("")+ylab("")
               }
    )
  ) 


X_tib_cat_nest =  tibble(values=X_tib_cat |> map((~.x)),features=names(X_tib_cat))|> 
  mutate(
    bplot=map2(.x=values,.y=features,
               function(x=.x,y=.y){
                 my_tib=tibble(a=x)
                 names(my_tib) = y
                 
                 my_plot= my_tib |> ggplot(aes(x)) + geom_bar(alpha=.75)+theme_bw()+ggtitle(y)+
                   theme(plot.title = element_text(hjust = 0.5),
                         axis.text.x =element_text(angle=90 ))+xlab("")+ylab("")
               }
    )
  ) 


fifa_descriptive_plots=(X_tib_cat_nest$bplot[[1]]/
                          X_tib_cat_nest$bplot[[2]]/
                          X_tib_cat_nest$bplot[[3]]/
                          X_tib_cat_nest$bplot[[4]]/
                          X_tib_cat_nest$bplot[[5]]/
                          X_tib_cat_nest$bplot[[6]]/
                          X_tib_cat_nest$bplot[[7]])|
  (X_tib_num_nest$hplot[[1]]/
     X_tib_num_nest$hplot[[2]]/
     X_tib_num_nest$hplot[[3]]/
     X_tib_num_nest$hplot[[4]]/
     X_tib_num_nest$hplot[[5]]/
     X_tib_num_nest$hplot[[6]]/
     X_tib_num_nest$hplot[[7]])#+ plot_layout(guides = 'collect')


print(fifa_descriptive_plots)

Variable specific effects

Data generation

Code for Figure 1

Code for Figure 2

Retrieval of the true configuration

Data generation

Code for figure Figure 3

Illustration: FIFA player data

Data preparation and analysis

Code for Figure 4

Code for Figure 5

Appendix D Categorical mean distances: skewed distributions

Data generation

Code for Figure 6

Code for Figure 7

Code for Figure 8

Appendix E FIFA variable distributions

Code for Figure 9

Code for `Figure 1`

Code for `Figure 2`

Code for figure `Figure 3`

Code for `Figure 4`

Code for `Figure 5`

Code for `Figure 6`

Code for `Figure 7`

Code for `Figure 8`

Code for `Figure 9`