devtools::install_github("alfonsoIodiceDE/manydist_package/manydist")Unbiased mixed variables distances
supplementary material
library(aricode)
library(cluster)
library(fpc)
library(manydist)
library(patchwork)
library(tidyverse)
library(varhandle)
library(vegan)Here we provide the code to reproduce the results of the supplementary material of the paper “Unbiased mixed variables distances”.
Code
CongruenceCoeff<-function(L1,L2){
L1<-dist(L1)
L2<-dist(L2)
CC<-sum(diag(t(L1) %*% L2)) / sqrt(sum(diag(t(L1) %*% L1))*sum(diag(t(L2) %*% L2)))
return(CC)
}
LeHo<-function(X){ # Returns the LeHo Distance (symmetric Kullback-Leibler)
# X is matrix with conditional probabilities
#if(any(rowSums(X)!=1)){
# print("Input must be conditional probs (row sums should be 1)")
#} else {
require(philentropy)
n<-nrow(X)
D<-matrix(0,n,n)
for (i in 1:(n-1)){
for (j in (i+1):n){
D[j,i]<-D[i,j]<-distance(X[c(i,j),],"kullback-leibler",unit="log2")+
distance(X[c(j,i),],"kullback-leibler",unit="log2")}
}
return(D)
}
# }Variable specific effects
Data generation
Code
reps<-100 # 100 can take 30min
#reps<-5 # 100 can take 30min
n<-500
k<-2 # Dimension solution
sigma<-0.03 # Noise
porig<-6 # Number of variables corresponding to true config
pnnoise<-0 # Number of numerical noise vars
pcnoise<-0 # Number of categorical noise vars
pn<-2 # Number of numerical variables underying config
pnum<-pn+pnnoise+pcnoise # Total number of numerical vars+noise vars (Needed to know which vars to discretize)
p<-porig+pnnoise+pcnoise # Total number of variables
qoptions<-c(2,3,5,9) # Distribution of the categorical variables corresponding to true config
pcat<-length(qoptions) # Number of cat variables underlying config
# We consider some variants:
presets<-c("custom","HLa","HLe","catdis","gower","unbiased_dependent","euclidean_onehot","catdissim")
mad_importances<-cc_importances<-array(NA,dim=(c(reps,length(presets),p)))
t0<-Sys.time()
for (rep in 1:reps){
set.seed(1234+rep)
# Generate 2d data:
T<-cbind(runif(n,-2,2),runif(n,-2,2)) # Random Uniform
# print(T[1:5,])
SVDT <- svd(T)
Y<-SVDT$u # TU is orthonormal. This is the underlying 2d config
R<-NULL
for (i in 1:porig){
R<-cbind(R,runif(2,-2,2)) # or uniform. Seems to have little impact
}
X<-Y%*%R # Inflate/expand the data
Xorig<-as.data.frame(X)
X<-as.data.frame(Xorig)
# Add noise (we do this before making categorical; could also be after)
for (i in 1:porig){
X[,i]<-X[,i]+rnorm(n,0,sigma)
}
### Or: Add noise columns. Completely random numerical variables
N<-NULL
if (pnnoise>0){
for (i in 1:pnnoise){
N<-cbind(N,rnorm(n))
}
}
if (pcnoise>0){
qr[rep]<-round(runif(1,3,9))
for (i in 1:pcnoise){
Cj<-as.factor(round(runif(n,1.5,qr[rep]+0.5)))
if (is.null(N)){
N<-cbind(N,Cj)
N<-as.factor(N)
} else {
N<-cbind.data.frame(N,Cj)}
}
colnames(N)<-(1:(pnnoise+pcnoise))
}
if ((pnnoise+pcnoise)>0) {
Xorig<-cbind.data.frame(N,Xorig) # The random variables are the first pnoise
X<-cbind.data.frame(N,X)
}
# END add noise columns
for (i in (pnum+1):p){ # Create cat variables by simply evenly cutting based on range
X[,i]<-cut(X[,i],qoptions[i-pnum])
}
for (pr in 1:length(presets)){
# For each preset, first create full distance matrix:
print(pr)
if (pr==1){ # This is the real, numerical data: We use manhattan on scaled values:
Dall<-mdist(Xorig,preset=presets[pr],distance_cont = "manhattan",commensurable=FALSE, scaling="std")
} else if (pr==2){ # This is an HL variant on the manipulated (5 original, 5 discretized) data
# Below we specify that we use manhattan, numerical is scaled, categorical uses HL scaling
Dall<-mdist(X,preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
} else if (pr==3){ # HL Euclidean
Dall<-mdist(X,
distance_cont = "euclidean",
distance_cat = "HLeucl",
commensurable = FALSE,
scaling = "std")
} else if (pr==4){ # catdis
Dall<-mdist(X,preset="custom",distance_cat = "cat_dis",commensurable = TRUE)
} else if (pr==5){ # Gower
Dall<-mdist(X,preset=presets[pr])*ncol(X)
} else { # The other presets are the 3 options
Dall<-mdist(X,preset=presets[pr])}
MDS_all <- cmdscale(Dall,eig=TRUE, k=2)
# Leave-one-out:
for (j in 1:p){
if (pr==1){
Dj<-mdist(Xorig[,-j],preset=presets[pr],distance_cont = "manhattan",commensurable=FALSE, scaling="std")
} else if (pr==2){
Dj<-mdist(X[,-j],preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
} else if (pr==3){ # HL 2
Dj<-mdist(X[,-j],distance_cont = "euclidean",
commensurable = FALSE,
scaling = "std",
distance_cat = "HLeucl")
} else if (pr==4){ # Unbiased catdissim
Dj<-mdist(X[,-j],preset="custom",
distance_cat = "cat_dis",commensurable = TRUE)
} else if (pr==5){ # Gower
Dj<-mdist(X[,-j],preset=presets[pr])*(ncol(X)-1)
} else { # the remaining presets
Dj<-mdist(X[,-j],preset=presets[pr])}
mad_importances[rep,pr,j]<-mean(abs(Dall-Dj))
MDS_j <- cmdscale(Dj,eig=TRUE, k=2)
cc_j<-CongruenceCoeff(MDS_all$points[,1:2], MDS_j$points[,1:2])
cc_importances[rep,pr,j]<-cc_j
}
}
} # End replications
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
(elapsed<-Sys.time()-t0)
## Time difference of 21.07505 mins
ac_importances<-sqrt((1-cc_importances^2)) # We calculated congruences
variants<-c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind")
varreorder<-c(1,7,3,2,5,8,4,6)
mad_importances<-mad_importances[,varreorder,]
ac_importances<-ac_importances[,varreorder,]
mad<-ac<-madranks<-ccranks<-maddis<-madrel<-acrel<-NULL
mad_dists<-madrels<-acrels<-array(NA,dim=c(length(presets),reps,p))
for (pr in 1:length(presets)){
mad<-rbind(mad,colMeans(mad_importances[,pr,]))
ac<- rbind(ac,colMeans(ac_importances[,pr,]))
maddis<-rbind(maddis,colMeans(mad_importances[,pr,]/rowSums(mad_importances[,pr,])))
mad_dists[pr,,]<-mad_importances[,pr,]/rowSums(mad_importances[,pr,]) # We need all distributions
madrels[pr,,]<-mad_dists[pr,,]/mad_dists[1,,] # If larger: bigger diff than true
madrel<-rbind(madrel,colMeans(madrels[pr,,]))
acrels[pr,,]<-ac_importances[,pr,]/ac_importances[,1,] #
acrel<-rbind(acrel,colMeans(acrels[pr,,]))
}
Measures<-list()
Measures[[1]]<-data.frame(t(mad))
Measures[[2]]<-data.frame(t(maddis))
Measures[[3]]<-data.frame(t(ac))
Measures[[4]]<-data.frame(t(madrel))
Measures[[5]]<-data.frame(t(acrel))
Measures_tib= tibble(meas=Measures,
meas_name=c("MAD","MAD distribution","AC","MAD rel","AC rel"))
save(Measures_tib,file="Measures_tib.RData")Code for Figure 1
Code
variants<-c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind")
varreorder<-c(1,7,3,2,5,8,4,6)
meas_to_plot_mad = Measures_tib |>
mutate(meas=map(.x=meas,function(x=.x){colnames(x)=variants[varreorder];return(x)}),
meas=map(.x=meas,~.x |> slice(c(3:6,1,2)))
) |>
unnest(meas) |>mutate(loo_v=paste0("var",rep(1:6,times=5))) |>
filter(meas_name %in% c("MAD"))
meas_to_plot_mad_rel = Measures_tib |>
mutate(meas=map(.x=meas,function(x=.x){colnames(x)=variants[varreorder];return(x)}),
meas=map(.x=meas,~.x |> slice(c(3:6,1,2)))
) |>
unnest(meas) |>mutate(loo_v=paste0("var",rep(1:6,times=5))) |>
filter(meas_name %in% c("MAD distribution"))
###################
mad_max = max(meas_to_plot_mad |> select(where(is.numeric)))
rect_data = tibble(
xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max,
xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max)
mad_max_rel = max(meas_to_plot_mad_rel |> select(where(is.numeric)))
rect_data_rel = tibble(
xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max_rel,
xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max_rel)
loo_mad_plot = meas_to_plot_mad |> pivot_longer(cols=Num:Udep, names_to="variant_name",values_to="value") |>
ggplot(aes(x = parse_number(loo_v), y = value,
color = variant_name)) +
geom_rect(data=rect_data,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line(aes(lty=variant_name)) +
ylab("absolute variable contribution to distance")+
xlab("left-out variable") + theme_bw() + theme(legend.position = "none")+
scale_x_continuous(breaks=seq(1,6,1),labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2"))+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
loo_mad_rel_plot = meas_to_plot_mad_rel |> pivot_longer(cols=Num:Udep, names_to="variant_name",values_to="value") |>
ggplot(aes(x = parse_number(loo_v), y = value,
color = variant_name)) +
geom_rect(data=rect_data_rel,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data_rel,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line(aes(lty=variant_name)) +
ylab("relative variable contribution to distance") +
xlab("left-out variable") + theme_bw() +
scale_x_continuous(breaks=seq(1,6,1),labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2"))+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
fig_1_plot = loo_mad_plot | loo_mad_rel_plot
print(fig_1_plot, width = 8, height = 4)Code for Figure 2
Code
meas_to_plot_ac = Measures_tib |>
mutate(meas=map(.x=meas,function(x=.x){colnames(x)=variants[varreorder];return(x)}),
meas=map(.x=meas,~.x |> slice(c(3:6,1,2)))
) |>
unnest(meas) |>mutate(loo_v=paste0("var",rep(1:6,times=5))) |>
filter(meas_name %in% c("AC rel"))
ac_max = max(meas_to_plot_ac |> select(where(is.numeric)))
rect_data_ac = tibble(
xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=ac_max,
xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=ac_max)
ac_plot = meas_to_plot_ac |> pivot_longer(cols=Num:Udep, names_to="variant_name",values_to="value") |>
ggplot(aes(x = parse_number(loo_v), y = value,
color = variant_name)) +
geom_rect(data=rect_data_ac,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data_ac,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line(aes(lty=variant_name)) +
ylab("alienation coefficient")+
xlab("left-out variable") + theme_bw() +
scale_x_continuous(breaks=seq(1,6,1),labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2"))+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
print(ac_plot, width = 8, height = 4)Retrieval of the true configuration
Data generation
Code
reps <- 100
n<-500 # large is needed for the many (9) categories scenario
k<-2
sigma<-0.03 # standard deviation of Xorig variables is
# is around 0.067
porig<-6 # Number of variables corresponding to true config
pnnoise<-0 # Number of numerical noise vars
pcnoise<-0 # Number of categorical noise vars
pn<-3 # Number of numerical variables underying config
pnum<-pn+pnnoise+pcnoise # Total number of numerical vars+noise vars (Needed to know which vars to discretize)
p<-porig+pnnoise+pcnoise # Total number of variables
qoptions<-c(2,3,5,9) # Distribution of the categorical variables corresponding to true config
pcat<-length(qoptions) # Number of cat variables underlying config
# We consider some variants:
presets<-c("custom","HL","HLEuclid","catdis", "gower","unbiased_dependent", "euclidean_onehot","catdissim")#,"gower2","cat dissim")
CC<-array(NA,dim=c(reps,length(presets),length(qoptions)))
qr<-rep(NA,reps)
#set.seed(123)
t0<-Sys.time()
for (rep in 1:reps){
set.seed(1234+rep)
# Generate 2d data:
T<-cbind(runif(n,-2,2),runif(n,-2,2)) # Random Uniform
SVDT <- svd(T)
Y<-SVDT$u # TU is orthonormal. This is the underlying 2d config
R<-NULL
for (i in 1:porig){
R<-cbind(R,runif(2,-2,2)) # or uniform. Seems to have little impact
}
X<-Y%*%R # Inflate/expand the data
Xorig<-as.data.frame(X)
X<-as.data.frame(Xorig)
# Add noise (we do this before making categorical; could also be after)
for (i in 1:porig){
X[,i]<-X[,i]+rnorm(n,0,sigma)
}
### Or: Add noise columns. Completely random numerical variables
N<-NULL
if (pnnoise>0){
for (i in 1:pnnoise){
N<-cbind(N,rnorm(n))
}
}
if (pcnoise>0){
qr[rep]<-round(runif(1,3,9))
for (i in 1:pcnoise){
Cj<-as.factor(round(runif(n,1.5,qr[rep]+0.5)))
if (is.null(N)){
N<-cbind(N,Cj)
N<-as.factor(N)
} else {
N<-cbind.data.frame(N,Cj)}
}
colnames(N)<-(1:(pnnoise+pcnoise))
}
if ((pnnoise+pcnoise)>0) {
Xorig<-cbind.data.frame(N,Xorig) # The random variables are the first pnoise
X<-cbind.data.frame(N,X)
}
# END add noise columns
for (q in 1:length(qoptions)){
qs<-rep(qoptions[q],p-pnum) # Choice of categories.
# If number to large, we could get
# empty categories. All cat variables
# have same number of categories
pcat<-length(qs)
for (i in (pnum+1):p){ # Create cat variables by simply evenly cutting based on range
X[,i]<-cut(Xorig[,i],qs[i-pnum])
}
for (pr in 1:length(presets)){
# For each preset, first create full distance matrix:
if (pr==1){ # This is the real, numerical data: We use manhattan on scaled values:
Dall<-mdist(Xorig,preset=presets[pr],distance_cont = "manhattan",commensurable=FALSE, scaling="std")
} else if (pr==2){ # Additive HL, numerical is scaled, categorical uses HL scaling
Dall<-mdist(X,preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
} else if (pr==3){ # HL Euclid
Dall<-mdist(X,
distance_cont = "euclidean",
commensurable = FALSE,
scaling = "std",
distance_cat = "HLeucl")
} else if (pr==4){ # Cat dissimilarity
Dall<-mdist(X,preset="custom",distance_cat = "cat_dis",commensurable = TRUE)
} else if (pr==5){ # Gower
Dall<-mdist(X,preset=presets[pr])*ncol(X)
} else { # The other presets are the remaining 3 options
if (pr==7){ # Catch error if no contvars
if ((pn+pnnoise)==0) {
print("Naive distances error: No numvars")
X1<-as.numeric(X[,1])
X2<-cbind.data.frame(X1,X[,-1])
Dall<-mdist(X2,preset=presets[pr])
} else { #pn+pnnoise !=0, pr==7
Dall<-mdist(X,preset=presets[pr])
}
} else {# pr is not 7
#print(pr)
Dall<-mdist(X,preset=presets[pr])}
}
MDS_all <- cmdscale(Dall,eig=TRUE, k=2)
CC[rep,pr,q]<-CongruenceCoeff(Y,MDS_all$points[,1:2])
} # Mix dist variants
} # Number of categories
} # End replications
save(CC,file="boxplot_CC_data.RData")Code for figure Figure 3
Code
reps <- 100
qoptions<-c(2,3,5,9)
AC<-sqrt((1-CC^2)) # We calculated congruences
MeasNames<-c("MAD","MAD Distr","Alienation Coeff", "MAD ranks", "CC ranks", "Relative MADs", "Relative ACs")
variants<-c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind")
varreorder<-c(1,7,3,2,5,8,4,6)
str(AC)
## num [1:100, 1:8, 1:4] 0.2299 0.0585 0.2598 0.2691 0.1671 ...
diff_AC<-array(NA,c(reps,7,4))
for(q in 1:length(qoptions)){
numeric_AC= matrix(rep(AC[,1,q],7),nrow=reps)
diff_AC[,,q]<-AC[,-1,q]-numeric_AC
}
AC_tib = rbind(as_tibble(data.frame(AC[,,1])) |> mutate(categories="two cats"),
as_tibble(data.frame(AC[,,2])) |> mutate(categories="three cats"),
as_tibble(data.frame(AC[,,3])) |> mutate(categories="five cats"),
as_tibble(data.frame(AC[,,4])) |> mutate(categories="nine cats")) |>
mutate(categories=fct_inorder(categories))
names(AC_tib) = c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind","categories")
variant_ordering = c(1,7,3,2,5,8,4,6)
AC_tib = AC_tib |> select(variant_ordering,categories)
AC_tib_long = AC_tib |> pivot_longer(
cols=c("Num","Naive","HL", "HLa","G","Uind","Ustd","Udep"),
names_to="variants", values_to="diff_AC") |> mutate(variants=fct_inorder(variants))
gg_color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
my_colors= c("darkgrey",gg_color_hue(7)[c(4,2,3,1,5,6,7)])
boxplot_figure_simu = AC_tib_long |> ggplot(aes(x=(variants), y=diff_AC))+
geom_boxplot(aes(fill=variants)) +
facet_wrap(~categories)+ theme_bw()+
scale_fill_manual(values=my_colors) + theme(legend.position = "none")+
ylab("Alienation coefficient") + xlab("Variants")
print(boxplot_figure_simu,width=8.5,height=6)Illustration: FIFA player data
Data preparation and analysis
Code
# Variables are organized: Categorical first, then numerical
#load("Fifa21NL.RData") # Load the data
data(fifa_nl)
#Xorig<-X<-ddata[,-c(1,2)]
Xorig<-X<-fifa_nl[,-c(1,2,19:24)] # vars 1 and 2 are a bit useless (too many categories)
# vars 19:24 interesting but same scale and little variation
# leaving them out makes the plots a bit smaller.
# leaving them in doesn't seem to change a lot
n<-nrow(X)
p<-ncol(X)
K<-10 # Number of clusters in cluster analysis
nd<-2 # Dimensionality of MDS solution
X=X[,c(8:1,9:16)] # Order
X=X[,-3] # Remove int rep: 380 have something, 21+7 in other cats
X=X[,c(1:12,14,13,15)]
p<-ncol(X)
presets<-c("HL","HLEuclid","catdis", "gower","unbiased_dependent", "euclidean_onehot","catdissim")#,"gower2","cat dissim")
mad_importances<-cc_importances<-cc_ranks<-mad_ranks<-matrix(NA,length(presets),p)
# ASWs<-array(NA,dim=c(length(presets),p+1,K-1))
#FullClusters<-matrix(NA,n,K-1)
# ARIs<-array(NA,dim=c(length(presets),p,K-1))
t0<-Sys.time()
for (pr in 1:length(presets)){
# For each preset, first create full distance matrix:
if (pr==1){ # This is an HL variant on the manipulated (5 original, 5 discretized) data
# Below we specify that we use manhattan, numerical is scaled, categorical uses HL scaling
Dall<-mdist(X,preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
} else if (pr==2){ # HL Euclidean
Dall<-mdist(X,
distance_cont = "euclidean",
commensurable = FALSE,
scaling = "std",
distance_cat = "HLeucl")
} else if (pr==3){ # Catdissim
Dall<-mdist(X,preset="custom",distance_cat = "cat_dis",commensurable = TRUE)
} else if (pr==4){ # Gower
Dall<-mdist(X,preset=presets[pr])*ncol(X)
# Dall<-mdist(X,distance_cat = "matching",
# distance_cont = "manhattan",
# cat_scaling = "none",
# cont_scaling = "range",
# commensurable = FALSE,
# preset = "custom")
} else { # The other presets are the 3 options
Dall<-mdist(X,preset=presets[pr])}
MDS_all <- cmdscale(Dall,eig=TRUE, k=nd) # Full MDS solution
# FullClusters<-matrix(NA,n,K-1) # Full cluster solution
# for (k in 2:K){ # we do this for k in 2:K
# pam.out<-pam(Dall,k)
# ASWs[pr,1,k-1]<-pam.out$silinfo$avg.width # if we want to compare asw
# FullClusters[,k-1]<-pam.out$clustering # For each K we get a cluster solution. We compare solutions leaving one-variable out later
# }
#
# Leave-one-out: (still same pr)
for (j in 1:p){
if (pr==1){
Dj<-mdist(X[,-j],preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
} else if (pr==2){ # HL 2
Dj<-mdist(X[,-j],distance_cont = "euclidean",
commensurable = FALSE,
scaling = "std",
distance_cat = "HLeucl")
} else if (pr==3){ # Unbiased catdissim
Dj<-mdist(X[,-j],preset="custom",
distance_cat = "cat_dis",commensurable = TRUE)
} else if (pr==4){ # Gower
Dj<-mdist(X[,-j],preset=presets[pr])*(ncol(X)-1)
# Dall<-mdist(X[,-j],distance_cat = "matching",
# distance_cont = "manhattan",
# cat_scaling = "none",
# cont_scaling = "range",
# commensurable = FALSE,
# preset = "custom")
} else { # the remaining presets
Dj<-mdist(X[,-j],preset=presets[pr])}
mad_importances[pr,j]<-mean(abs(Dall-Dj)) # Compare MAD
MDS_j <- cmdscale(Dj,eig=TRUE, k=nd) # Needed to compare MDS
cc_j<-CongruenceCoeff(MDS_all$points[,1:nd], MDS_j$points[,1:nd])
cc_importances[pr,j]<-cc_j # The congruence coefficients
# for (k in 2:K){ # Now compare for each k, the obtained clusters
# pam.out<-pam(Dj,k)
# ASWs[pr,j+1,k-1]<-pam.out$silinfo$avg.width
# ARIs[pr,j,k-1]<-ARI(FullClusters[,k-1],pam.out$clustering) # Compare,
# # for each k, all variable clusters with
# # leave-one-out clusters
#
# } # This gives per pr and k, the effect (ARI) of leaving individual variables out on the clustering
}
# rankings: (Not sure needed)
cc_ranks[pr,]<-rank(cc_importances[pr,])
# mad_ranks[pr,]<-rank(mad_importances[pr,])
}
# (elapsed<-Sys.time()-t0)
variants<-c("HLa","HL", "Ustd", "G","Udep","Naive","Uind")
varreorder<-c(6,2,1,4,7,3,5) # To get better sequence
mad_importances<-t(mad_importances)
cc_importances<-sqrt((1-cc_importances^2)) # Change to Alienation
cc_importances<-t(cc_importances)
varnames<-rownames(mad_importances) <-rownames(cc_importances)<-names(X)
#
varnames[7]<-"position"
#varnames[3]<-"int_repu"
varnames[1]<-"pref_foot"
varnames[15]<-"clause_eur"
varnames[6]<-"club"
#
colnames(cc_importances)<-colnames(mad_importances)<-variants
# Reorder appropriate fields:
mad_importances<-mad_importances[,varreorder]
cc_importances<-cc_importances[,varreorder]
# ARIs<-ARIs[varreorder,,]
# ASWs<-ASWs[varreorder,,]
# Create relative importance
madrel<-sweep(mad_importances,2,colSums(mad_importances),`/`)
rownames(mad_importances)<-rownames(cc_importances)<-varnamesCode for Figure 4
Code
max_y= max(mad_importances)
min_y= 0
max_y_rel= max(madrel)
min_y_rel= 0
min_x=0
max_x=p
max_y_cc=max(cc_importances)
rect_data = tibble(
xmin_cont=7.5,xmax_cont=15,ymin_cont=0,ymax_cont=max_y,
xmin_cat=.5,xmax_cat=7.5,ymin_cat=0,ymax_cat=max_y)
rect_data_rel = tibble(
xmin_cont=7.5,xmax_cont=15,ymin_cont=0,ymax_cont=max_y_rel,
xmin_cat=.5,xmax_cat=7.5,ymin_cat=0,ymax_cat=max_y_rel)
mad_imp_plot = mad_importances |> as_tibble() |>
mutate(variable_names=rownames(mad_importances),
var_number=1:nrow(mad_importances)) |>
pivot_longer(cols=Naive:Udep,names_to="variant",values_to="value") |>
ggplot(aes(x=var_number,y=value,color=variant))+
geom_rect(data=rect_data,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line()+
ylab("absolute variable contribution to distance") +
xlab("left-out variable") + theme_bw() + scale_x_continuous(breaks=seq(1,p,1),labels = varnames)+
theme(axis.text.x = element_text(angle = 60, hjust = 1),
legend.position = "none")
madrel_plot = madrel |> as_tibble() |>
mutate(variable_names=rownames(mad_importances),
var_number=1:nrow(mad_importances)) |>
pivot_longer(cols=Naive:Udep,names_to="variant",values_to="value") |>
ggplot(aes(x=var_number,y=value,color=variant))+
geom_rect(data=rect_data_rel,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data_rel,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line()+
ylab("relative variable contribution to distance") +
xlab("left-out variable") + theme_bw() + scale_x_continuous(breaks=seq(1,p,1),labels = varnames)+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
fifa_mad = mad_imp_plot | madrel_plot
print(fifa_mad, width = 10, height = 5)Code for Figure 5
Code
max_y_cc=max(cc_importances)
rect_data_cc = tibble(
xmin_cont=7.5,xmax_cont=15,ymin_cont=0,ymax_cont=max_y_cc,
xmin_cat=.5,xmax_cat=7.5,ymin_cat=0,ymax_cat=max_y_cc)
cc_imp_plot = cc_importances |> as_tibble() |>
mutate(variable_names=rownames(cc_importances),
var_number=1:nrow(cc_importances)) |>
pivot_longer(cols=Naive:Udep,names_to="variant",values_to="value") |>
ggplot(aes(x=var_number,y=value,color=variant))+
geom_rect(data=rect_data_cc,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data_cc,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line()+
ylab("alienation coefficient") +
xlab("left-out variable") + theme_bw() + scale_x_continuous(breaks=seq(1,p,1),labels = varnames)+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
print(cc_imp_plot,width = 8, height = 4)Appendix D Categorical mean distances: skewed distributions
Data generation
Code
# Let's consider some specific unbalanced scenarios
# One probility is d. The others are (1-d)/(q-1)
# We consider skewed options:
dis<-c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)
#qi=5
qs<-c(2,3,5,10)#,18) # Some options for qi
nint<-length(dis) # Number of options for d (nint-1)
rsm<-rsma<-rhl<-sdsm<-rse<-rsof<-rsiof<-sdse<-rsa<-sdsa<-sdsof<-sdsiof<-sddsma<-sdssm<-
sddse<-sddsa<-sddsm<-sdhl<-sddsmca<-sddsof<-sddsiof<-sddshl<- sdshl<-
rsmca<-sdsmca<-esm<-em<-ee<-ea<-ehl<-emca<-eof<-eiof<-matrix(NA,nrow=nint,ncol=length(qs))
#par(mfrow=c(2,2))
for (qis in 1:length(qs)){
#ds<-NULL
qi<-qs[qis]
for (i in (1:nint)){ # variant 2
Deltam<-matrix(1,qi,qi) # Matching
diag(Deltam)<-rep(0,qi) # Matching Delta
#d=i/(2*qi) # i=2 corresponds to equal probs if variant 1 is chosen. (Only works for one qi. Not for qs loop)
d=dis[i] # variant 2
#ds<-rbind(ds,d)
p<-c(d,rep((1-d)/(qi-1),(qi-1)))
sum(p)
Dp<-diag(p)
pp<-p %*% t(p)
spi<-(diag(Dp-pp))^-1
spi5<-(diag(Dp-pp))^-.5
DEsk<-(2/(qi^2))*Deltam # Delta Eskin
DOF<-log(p)%*%t(log(p)) * Deltam # OF
n=160 # For inv OF we need n. Soon gets very large (increases in n) 160 is sufficient
#n<-ceiling(1/min(p))
DIOF<-log(n*p)%*%t(log(n*p)) * Deltam # inverse OF. If some np<1 we can get negatives
n=160 # For HL we need to find a way to get somewhat consistent values. Let's choose n large
marginals=p*n
HLemp<-distancefactor(cat=qi,n=n,catsizes=marginals) #
DHL<-Deltam*HLemp*2#*sqrt(2) # NOTE 13/09: Corrected: Was sqrt(2). But, should be, see paper, 2x
#DE<-sqrt((1/qi)*(spi %*% t(rep(1,qi)) + t(spi %*% t(rep(1,qi)))))
#diag(DE)<-rep(0,qi)
DM<-sqrt(1/qi)*(spi5 %*% t(rep(1,qi)) + t(spi5 %*% t(rep(1,qi))))
diag(DM)<-rep(0,qi)
DA<-(1/qi)*diag(spi5)%*%Deltam%*%diag(spi5)
DMCA<-(1/qi) * diag(p^-.5)%*%Deltam%*%diag(p^-.5)
#we<-ee[i,(qis)]<- t(p)%*%DE%*%p
wsm<-esm[i,(qis)]<- t(p)%*%Deltam%*%p # Simple Matching
we<-ee[i,(qis)]<- t(p)%*%DEsk%*%p # Eskin
wof<-eof[i,(qis)]<-t(p)%*%DOF%*%p # OF
wiof<-eiof[i,(qis)]<-t(p)%*%DIOF%*%p # IOF
whl<-ehl[i,(qis)]<-t(p)%*%DHL%*%p # HL
wm<-em[i,(qis)]<- t(p)%*%DM%*%p # SD scaling
wa<-ea[i,(qis)]<- t(p)%*%DA%*%p # Cat dis scaling
wmca<-emca[i,(qis)]<- t(p)%*%DMCA%*%p # MCA scaling
# We can also estimate the variance among the distances using Deltas: E(X^2)-E(X)^2
# We can do this dividing by expected values so that we look at what happens
# when delta is commensurable. Or not
# if not:
#we<-wm<-wa<-wmca<-wof<-wiof<-whl<-wsm<-1
sddse[i,(qis)]<- sqrt(t(p)%*%DEsk^2%*%p - ee[i,(qis)]^2)/we # Eskin
sddsma[i,(qis)]<- sqrt(t(p)%*%Deltam^2%*%p - esm[i,(qis)]^2)/ wsm # Simple Matching
sddshl[i,(qis)]<- sqrt(t(p)%*%DHL^2%*%p - ehl[i,(qis)]^2)/ whl # HL
sddsm[i,(qis)]<- sqrt(t(p)%*%DM^2%*%p - em[i,(qis)]^2)/ wm # SD
sddsa[i,(qis)]<- sqrt(t(p)%*%DA^2%*%p - ea[i,(qis)]^2)/wa # Cat dissim
sddsmca[i,(qis)]<- sqrt(t(p)%*%DMCA^2%*%p - emca[i,(qis)]^2)/wmca # MCA
sddsof[i,(qis)]<-sqrt(t(p)%*%DOF^2%*%p - eof[i,(qis)]^2)/wof # OF
sddsiof[i,(qis)]<-sqrt(t(p)%*%(DIOF/as.numeric(wiof))^2%*%p - 1 ) #IOF
# Let's consider spread among cat dissimilarities. But: This only makes sense
# when on similar scales. So, use commensurable deltas: (Or not: see above)
sdse[i,qis]<-sd((1/we)*DEsk[upper.tri(DEsk)]) # Eskin
sdsm[i,qis]<-sd((1/wm)*DM[upper.tri(DM)]) # SD
sdshl[i,qis]<-sd((1/whl)*DHL[upper.tri(DHL)]) # HL
sdssm[i,qis]<-sd((1/wsm)*Deltam[upper.tri(Deltam)]) # Simple Matching
sdsa[i,qis]<-sd((1/wa)*DA[upper.tri(DA)]) # Cat dissim,
sdsmca[i,qis]<-sd((1/wmca)*DMCA[upper.tri(DMCA)]) # MCA
sdsof[i,qis]<-sd((1/wof)*DOF[upper.tri(DOF)]) # OF
sdsiof[i,qis]<-sd((1/wiof)*DIOF[upper.tri(DIOF)]) # IOF
}
}
cats<-as.character(qs)
skew<-as.character(dis)#as.character(1:nint)Code for Figure 6
Code
sk_distance_to_plot = function(exp_mean_dist,distance_name){
exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |>
rename(`2 cats` = V1,
`3 cats` = V2,
`5 cats` = V3,
`10 cats` = V4) |>
mutate(method=distance_name) |>
pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`),
names_to = "Number of categories", values_to = "Mean distance")
return(exp_mean_dist)
}
all_exp_mean_dist=rbind(sk_distance_to_plot(esm,"Simple Matching"),
sk_distance_to_plot(ee,"Eskin"),
sk_distance_to_plot(eof,"OF"),
sk_distance_to_plot(eiof,"IOF")
)
paper_plot_skew1=all_exp_mean_dist |>
mutate(`Number of categories`=fct_inorder(`Number of categories`),
method=fct_inorder(method),) |>
ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
geom_line() + geom_point() + facet_wrap(~method,scales = "free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")
print(paper_plot_skew1,width=10,height=6)Code for Figure 7
Code
all_exp_mean_dist2=rbind(sk_distance_to_plot(ehl,"Hennig-Liao"),
sk_distance_to_plot(em,"Std. dev."),
sk_distance_to_plot(ea,"Cat. dissim.")
)
paper_plot_skew2=all_exp_mean_dist2 |>
mutate(`Number of categories`=fct_inorder(`Number of categories`),
method=fct_inorder(method),) |>
ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
geom_line() + geom_point() + facet_wrap(~method) + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")
print(paper_plot_skew2)Code for Figure 8
Code
qs<-c(2,3,5,10)#,18) # Some options for qs
nint<-length(dis) # Number of options for d (nint-1)
CVchd<-CVtvd<-CVmsd<-CVlhd<-Echi<-Echd<-Etvi<-Etvd<-Emsi<-Elhi<-Elhd<-Emsd<-array(NA,dim=c(length(qs),length(qs),nint,nint))
for (q1 in 1:length(qs)){ # cats row
qi<-qs[q1]
for (q2 in q1:length(qs)){ # cats cols
qj<-qs[q2]
for (pi in 1:nint){ # row distributions
d1<-dis[pi]
p1<-c(d1,rep((1-d1)/(qj-1),(qj-1))) # nint different marginals: Distribution over colums
for (pj in 1:nint){
d2<-dis[pj]
p2<-c(d2,rep((1-d2)/(qi-1),(qi-1))) # corresponding col marginals: Distribution over rows
Pi<-p2 %*% t(p1) # joint table independent
Ri<-Pi/rowSums(Pi) # conditional table independent
# Ci<-Ri/sqrt(p2)
if (qj>qi) {
Pd<-diag(p1[1:qi])
Addzeros<-matrix(0,(qi-1),(qj-qi))
Addps<-p1[qi+1:(qj-qi)]
Add<-rbind(Addzeros,Addps)
Pd<-cbind(Pd,Add)
} else { #qi=qj
pd<-p2
Pd<-diag(pd) # joint table dependent case (diagonal smallest q)
}
Rd<-Pd/rowSums(Pd)
Cd<-Rd/sqrt(p2) # Chi-square table: cond. prob/sqrt(p)
# category dissimilarities:
Dlhd<-LeHo(Rd) # Le Ho
# Dmsd<-MousaviSehhati(Pd) # Moussavi
Dtvd<-dist(Rd,"manhattan")/2 # TVD
Dchd<-dist(Cd,"euclidean")^2 # Chi2 dist
# Dchd<-Dchd/(2*(qi-1)) # Chi2 dist corrected for number of categories...
Elhd[q1,q2,pi,pj]<-t(pd) %*%Dlhd %*%pd
# Emsd[q1,q2,pi,pj]<-t(pd) %*%Dmsd %*%pd
Etvd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dtvd) %*%pd
Echd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dchd) %*%pd
# Coefficient of variation
CVlhd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dlhd^2 %*%pd - (t(pd) %*%Dlhd %*%pd)^2)/(t(pd) %*%Dlhd %*%pd)
# CVmsd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dmsd^2 %*%pd - (t(pd) %*%Dmsd %*%pd)^2)/(t(pd) %*%Dmsd %*%pd)
CVtvd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dtvd))^2 %*%pd - (t(pd) %*%as.matrix(Dtvd) %*%pd)^2)/(t(pd) %*%as.matrix(Dtvd) %*%pd)
CVchd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dchd))^2 %*%pd - (t(pd) %*%as.matrix(Dchd) %*%pd)^2)/(t(pd) %*%as.matrix(Dchd) %*%pd)
Dlhi<-LeHo(Ri)
# Dmsi<-MousaviSehhati(Pi)
Dtvi<-dist(Ri,"manhattan")/2
# Dchi<-dist(Ci,"euclidean")^2
Elhi[q1,q2,pi,pj]<-t(p2) %*%Dlhi %*%p2
# Emsi[q1,q2,pi,pj]<-t(p2) %*%Dmsi %*%p2
Etvi[q1,q2,pi,pj]<-t(p2) %*%as.matrix(Dtvi) %*%p2
# Echi[q1,q2,pi,pj]<-t(p1) %*%as.matrix(Dchi) %*%p1
}
}
}
}
Elhs<-as.data.frame(rbind(Elhd[1,1,1,],Elhd[2,2,1,],Elhd[3,3,1,],Elhd[4,4,1,]))
Etvs<-as.data.frame(rbind(Etvd[1,1,1,],Etvd[2,2,1,],Etvd[3,3,1,],Etvd[4,4,1,]))
Emss<-as.data.frame(rbind(Emsd[1,1,1,],Emsd[2,2,1,],Emsd[3,3,1,],Emsd[4,4,1,]))
Echs<-as.data.frame(rbind(Echd[1,1,1,],Echd[2,2,1,],Echd[3,3,1,],Echd[4,4,1,]))
CVlhs<-as.data.frame(rbind(CVlhd[1,1,1,],CVlhd[2,2,1,],CVlhd[3,3,1,],CVlhd[4,4,1,]))
CVtvs<-as.data.frame(rbind(CVtvd[1,1,1,],CVtvd[2,2,1,],CVtvd[3,3,1,],CVtvd[4,4,1,]))
CVmss<-as.data.frame(rbind(CVmsd[1,1,1,],CVmsd[2,2,1,],CVmsd[3,3,1,],CVmsd[4,4,1,]))
CVchs<-as.data.frame(rbind(CVchd[1,1,1,],CVchd[2,2,1,],CVchd[3,3,1,],CVchd[4,4,1,]))
cats<-as.character(qs)
skew<-as.character(dis)#as.character(1:nint)
sk_distance_to_plot = function(exp_mean_dist,distance_name){
exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |>
rename(`2 cats` = V1,
`3 cats` = V2,
`5 cats` = V3,
`10 cats` = V4) |>
mutate(method=distance_name) |>
pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`),
names_to = "Number of categories", values_to = "Mean distance")
return(exp_mean_dist)
}
all_exp_mean_dist3=rbind(sk_distance_to_plot(t(as.matrix(Elhs)),"Le-Ho"),
sk_distance_to_plot(t(as.matrix(Etvs)),"Total Variance Distance")
)
paper_plot_skew3=all_exp_mean_dist3 |>
mutate(`Number of categories`=fct_inorder(`Number of categories`),
method=fct_inorder(method),) |>
ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
geom_line() + geom_point() + facet_wrap(~method,scales="free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")
print(paper_plot_skew3,width=8,height=4)Appendix E FIFA variable distributions
Code for Figure 9
Code
X_tib_num = X |> as_tibble() |> select(where(is.numeric))
X_tib_cat = X |> as_tibble() |> select(where(is.factor))
levels(X_tib_cat$work_rate)=c("H/H", "H/L", "H/M", "L/H", "L/L", "L/M", "M/H", "M/L", "M/M")
levels(X_tib_cat$club_name) = c("ADO","Ajax","AZ","Emmen","Groningen","Twente","Utrecht",
"Feyenoord","Fortuna","Heracles","PEC","PSV","RKC","Heerenveen",
"Sparta","Vitesse","VVV","Willem II")
X_tib_cat=X_tib_cat |> rename(`club`=club_name,
`position`=team_position)
X_tib_num_nest = tibble(values=X_tib_num |> map((~.x)),features=names(X_tib_num))|>
mutate(
hplot=map2(.x=values,.y=features,
function(x=.x,y=.y){
my_tib=tibble(a=x)
names(my_tib) = y
my_plot= my_tib |> ggplot(aes(x)) + geom_histogram(bins=15,alpha=.75)+theme_bw()+ggtitle(y)+
theme(plot.title = element_text(hjust = 0.5))+xlab("")+ylab("")
}
)
)
X_tib_cat_nest = tibble(values=X_tib_cat |> map((~.x)),features=names(X_tib_cat))|>
mutate(
bplot=map2(.x=values,.y=features,
function(x=.x,y=.y){
my_tib=tibble(a=x)
names(my_tib) = y
my_plot= my_tib |> ggplot(aes(x)) + geom_bar(alpha=.75)+theme_bw()+ggtitle(y)+
theme(plot.title = element_text(hjust = 0.5),
axis.text.x =element_text(angle=90 ))+xlab("")+ylab("")
}
)
)
fifa_descriptive_plots=(X_tib_cat_nest$bplot[[1]]/
X_tib_cat_nest$bplot[[2]]/
X_tib_cat_nest$bplot[[3]]/
X_tib_cat_nest$bplot[[4]]/
X_tib_cat_nest$bplot[[5]]/
X_tib_cat_nest$bplot[[6]]/
X_tib_cat_nest$bplot[[7]])|
(X_tib_num_nest$hplot[[1]]/
X_tib_num_nest$hplot[[2]]/
X_tib_num_nest$hplot[[3]]/
X_tib_num_nest$hplot[[4]]/
X_tib_num_nest$hplot[[5]]/
X_tib_num_nest$hplot[[6]]/
X_tib_num_nest$hplot[[7]])#+ plot_layout(guides = 'collect')
print(fifa_descriptive_plots)