::install_github("alfonsoIodiceDE/manydist_package/manydist") devtools
Unbiased mixed variables distances
supplementary material
library(aricode)
library(cluster)
library(fpc)
library(manydist)
library(patchwork)
library(tidyverse)
library(varhandle)
library(vegan)
Here we provide the code to reproduce the results of the supplementary material of the paper “Unbiased mixed variables distances”.
Code
<-function(L1,L2){
CongruenceCoeff<-dist(L1)
L1<-dist(L2)
L2<-sum(diag(t(L1) %*% L2)) / sqrt(sum(diag(t(L1) %*% L1))*sum(diag(t(L2) %*% L2)))
CCreturn(CC)
}
<-function(X){ # Returns the LeHo Distance (symmetric Kullback-Leibler)
LeHo# X is matrix with conditional probabilities
#if(any(rowSums(X)!=1)){
# print("Input must be conditional probs (row sums should be 1)")
#} else {
require(philentropy)
<-nrow(X)
n<-matrix(0,n,n)
Dfor (i in 1:(n-1)){
for (j in (i+1):n){
<-D[i,j]<-distance(X[c(i,j),],"kullback-leibler",unit="log2")+
D[j,i]distance(X[c(j,i),],"kullback-leibler",unit="log2")}
}
return(D)
}# }
Variable specific effects
Data generation
Code
<-100 # 100 can take 30min
reps#reps<-5 # 100 can take 30min
<-500
n<-2 # Dimension solution
k<-0.03 # Noise
sigma
<-6 # Number of variables corresponding to true config
porig<-0 # Number of numerical noise vars
pnnoise<-0 # Number of categorical noise vars
pcnoise<-2 # Number of numerical variables underying config
pn
<-pn+pnnoise+pcnoise # Total number of numerical vars+noise vars (Needed to know which vars to discretize)
pnum<-porig+pnnoise+pcnoise # Total number of variables
p
<-c(2,3,5,9) # Distribution of the categorical variables corresponding to true config
qoptions<-length(qoptions) # Number of cat variables underlying config
pcat
# We consider some variants:
<-c("custom","HLa","HLe","catdis","gower","unbiased_dependent","euclidean_onehot","catdissim")
presets
<-cc_importances<-array(NA,dim=(c(reps,length(presets),p)))
mad_importances
<-Sys.time()
t0for (rep in 1:reps){
set.seed(1234+rep)
# Generate 2d data:
<-cbind(runif(n,-2,2),runif(n,-2,2)) # Random Uniform
T# print(T[1:5,])
<- svd(T)
SVDT <-SVDT$u # TU is orthonormal. This is the underlying 2d config
Y
<-NULL
Rfor (i in 1:porig){
<-cbind(R,runif(2,-2,2)) # or uniform. Seems to have little impact
R
}
<-Y%*%R # Inflate/expand the data
X
<-as.data.frame(X)
Xorig<-as.data.frame(Xorig)
X
# Add noise (we do this before making categorical; could also be after)
for (i in 1:porig){
<-X[,i]+rnorm(n,0,sigma)
X[,i]
}
### Or: Add noise columns. Completely random numerical variables
<-NULL
Nif (pnnoise>0){
for (i in 1:pnnoise){
<-cbind(N,rnorm(n))
N
}
}if (pcnoise>0){
<-round(runif(1,3,9))
qr[rep]for (i in 1:pcnoise){
<-as.factor(round(runif(n,1.5,qr[rep]+0.5)))
Cjif (is.null(N)){
<-cbind(N,Cj)
N<-as.factor(N)
Nelse {
} <-cbind.data.frame(N,Cj)}
N
}
colnames(N)<-(1:(pnnoise+pcnoise))
}
if ((pnnoise+pcnoise)>0) {
<-cbind.data.frame(N,Xorig) # The random variables are the first pnoise
Xorig<-cbind.data.frame(N,X)
X
}
# END add noise columns
for (i in (pnum+1):p){ # Create cat variables by simply evenly cutting based on range
<-cut(X[,i],qoptions[i-pnum])
X[,i]
}
for (pr in 1:length(presets)){
# For each preset, first create full distance matrix:
print(pr)
if (pr==1){ # This is the real, numerical data: We use manhattan on scaled values:
<-mdist(Xorig,preset=presets[pr],distance_cont = "manhattan",commensurable=FALSE, scaling="std")
Dallelse if (pr==2){ # This is an HL variant on the manipulated (5 original, 5 discretized) data
} # Below we specify that we use manhattan, numerical is scaled, categorical uses HL scaling
<-mdist(X,preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
Dallelse if (pr==3){ # HL Euclidean
} <-mdist(X,
Dalldistance_cont = "euclidean",
distance_cat = "HLeucl",
commensurable = FALSE,
scaling = "std")
else if (pr==4){ # catdis
} <-mdist(X,preset="custom",distance_cat = "cat_dis",commensurable = TRUE)
Dallelse if (pr==5){ # Gower
} <-mdist(X,preset=presets[pr])*ncol(X)
Dall
else { # The other presets are the 3 options
} <-mdist(X,preset=presets[pr])}
Dall
<- cmdscale(Dall,eig=TRUE, k=2)
MDS_all
# Leave-one-out:
for (j in 1:p){
if (pr==1){
<-mdist(Xorig[,-j],preset=presets[pr],distance_cont = "manhattan",commensurable=FALSE, scaling="std")
Djelse if (pr==2){
} <-mdist(X[,-j],preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
Djelse if (pr==3){ # HL 2
} <-mdist(X[,-j],distance_cont = "euclidean",
Djcommensurable = FALSE,
scaling = "std",
distance_cat = "HLeucl")
else if (pr==4){ # Unbiased catdissim
} <-mdist(X[,-j],preset="custom",
Djdistance_cat = "cat_dis",commensurable = TRUE)
else if (pr==5){ # Gower
} <-mdist(X[,-j],preset=presets[pr])*(ncol(X)-1)
Djelse { # the remaining presets
} <-mdist(X[,-j],preset=presets[pr])}
Dj
<-mean(abs(Dall-Dj))
mad_importances[rep,pr,j]<- cmdscale(Dj,eig=TRUE, k=2)
MDS_j <-CongruenceCoeff(MDS_all$points[,1:2], MDS_j$points[,1:2])
cc_j<-cc_j
cc_importances[rep,pr,j]
}
}
# End replications
} ## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
<-Sys.time()-t0)
(elapsed## Time difference of 21.07505 mins
<-sqrt((1-cc_importances^2)) # We calculated congruences
ac_importances
<-c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind")
variants<-c(1,7,3,2,5,8,4,6)
varreorder<-mad_importances[,varreorder,]
mad_importances<-ac_importances[,varreorder,]
ac_importances<-ac<-madranks<-ccranks<-maddis<-madrel<-acrel<-NULL
mad<-madrels<-acrels<-array(NA,dim=c(length(presets),reps,p))
mad_dists
for (pr in 1:length(presets)){
<-rbind(mad,colMeans(mad_importances[,pr,]))
mad<- rbind(ac,colMeans(ac_importances[,pr,]))
ac<-rbind(maddis,colMeans(mad_importances[,pr,]/rowSums(mad_importances[,pr,])))
maddis
<-mad_importances[,pr,]/rowSums(mad_importances[,pr,]) # We need all distributions
mad_dists[pr,,]<-mad_dists[pr,,]/mad_dists[1,,] # If larger: bigger diff than true
madrels[pr,,]<-rbind(madrel,colMeans(madrels[pr,,]))
madrel
<-ac_importances[,pr,]/ac_importances[,1,] #
acrels[pr,,]<-rbind(acrel,colMeans(acrels[pr,,]))
acrel
}
<-list()
Measures1]]<-data.frame(t(mad))
Measures[[2]]<-data.frame(t(maddis))
Measures[[3]]<-data.frame(t(ac))
Measures[[4]]<-data.frame(t(madrel))
Measures[[5]]<-data.frame(t(acrel))
Measures[[
= tibble(meas=Measures,
Measures_tibmeas_name=c("MAD","MAD distribution","AC","MAD rel","AC rel"))
save(Measures_tib,file="Measures_tib.RData")
Code for Figure 1
Code
<-c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind")
variants<-c(1,7,3,2,5,8,4,6)
varreorder= Measures_tib |>
meas_to_plot_mad mutate(meas=map(.x=meas,function(x=.x){colnames(x)=variants[varreorder];return(x)}),
meas=map(.x=meas,~.x |> slice(c(3:6,1,2)))
|>
) unnest(meas) |>mutate(loo_v=paste0("var",rep(1:6,times=5))) |>
filter(meas_name %in% c("MAD"))
= Measures_tib |>
meas_to_plot_mad_rel mutate(meas=map(.x=meas,function(x=.x){colnames(x)=variants[varreorder];return(x)}),
meas=map(.x=meas,~.x |> slice(c(3:6,1,2)))
|>
) unnest(meas) |>mutate(loo_v=paste0("var",rep(1:6,times=5))) |>
filter(meas_name %in% c("MAD distribution"))
###################
= max(meas_to_plot_mad |> select(where(is.numeric)))
mad_max = tibble(
rect_data xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max,
xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max)
= max(meas_to_plot_mad_rel |> select(where(is.numeric)))
mad_max_rel = tibble(
rect_data_rel xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max_rel,
xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max_rel)
= meas_to_plot_mad |> pivot_longer(cols=Num:Udep, names_to="variant_name",values_to="value") |>
loo_mad_plot ggplot(aes(x = parse_number(loo_v), y = value,
color = variant_name)) +
geom_rect(data=rect_data,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line(aes(lty=variant_name)) +
ylab("absolute variable contribution to distance")+
xlab("left-out variable") + theme_bw() + theme(legend.position = "none")+
scale_x_continuous(breaks=seq(1,6,1),labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2"))+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
= meas_to_plot_mad_rel |> pivot_longer(cols=Num:Udep, names_to="variant_name",values_to="value") |>
loo_mad_rel_plot ggplot(aes(x = parse_number(loo_v), y = value,
color = variant_name)) +
geom_rect(data=rect_data_rel,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data_rel,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line(aes(lty=variant_name)) +
ylab("relative variable contribution to distance") +
xlab("left-out variable") + theme_bw() +
scale_x_continuous(breaks=seq(1,6,1),labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2"))+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
= loo_mad_plot | loo_mad_rel_plot
fig_1_plot
print(fig_1_plot, width = 8, height = 4)
Code for Figure 2
Code
= Measures_tib |>
meas_to_plot_ac mutate(meas=map(.x=meas,function(x=.x){colnames(x)=variants[varreorder];return(x)}),
meas=map(.x=meas,~.x |> slice(c(3:6,1,2)))
|>
) unnest(meas) |>mutate(loo_v=paste0("var",rep(1:6,times=5))) |>
filter(meas_name %in% c("AC rel"))
= max(meas_to_plot_ac |> select(where(is.numeric)))
ac_max = tibble(
rect_data_ac xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=ac_max,
xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=ac_max)
= meas_to_plot_ac |> pivot_longer(cols=Num:Udep, names_to="variant_name",values_to="value") |>
ac_plot ggplot(aes(x = parse_number(loo_v), y = value,
color = variant_name)) +
geom_rect(data=rect_data_ac,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data_ac,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line(aes(lty=variant_name)) +
ylab("alienation coefficient")+
xlab("left-out variable") + theme_bw() +
scale_x_continuous(breaks=seq(1,6,1),labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2"))+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
print(ac_plot, width = 8, height = 4)
Retrieval of the true configuration
Data generation
Code
<- 100
reps <-500 # large is needed for the many (9) categories scenario
n<-2
k<-0.03 # standard deviation of Xorig variables is
sigma# is around 0.067
<-6 # Number of variables corresponding to true config
porig<-0 # Number of numerical noise vars
pnnoise<-0 # Number of categorical noise vars
pcnoise<-3 # Number of numerical variables underying config
pn
<-pn+pnnoise+pcnoise # Total number of numerical vars+noise vars (Needed to know which vars to discretize)
pnum<-porig+pnnoise+pcnoise # Total number of variables
p
<-c(2,3,5,9) # Distribution of the categorical variables corresponding to true config
qoptions<-length(qoptions) # Number of cat variables underlying config
pcat
# We consider some variants:
<-c("custom","HL","HLEuclid","catdis", "gower","unbiased_dependent", "euclidean_onehot","catdissim")#,"gower2","cat dissim")
presets<-array(NA,dim=c(reps,length(presets),length(qoptions)))
CC<-rep(NA,reps)
qr
#set.seed(123)
<-Sys.time()
t0for (rep in 1:reps){
set.seed(1234+rep)
# Generate 2d data:
<-cbind(runif(n,-2,2),runif(n,-2,2)) # Random Uniform
T
<- svd(T)
SVDT <-SVDT$u # TU is orthonormal. This is the underlying 2d config
Y
<-NULL
Rfor (i in 1:porig){
<-cbind(R,runif(2,-2,2)) # or uniform. Seems to have little impact
R
}
<-Y%*%R # Inflate/expand the data
X
<-as.data.frame(X)
Xorig<-as.data.frame(Xorig)
X# Add noise (we do this before making categorical; could also be after)
for (i in 1:porig){
<-X[,i]+rnorm(n,0,sigma)
X[,i]
}
### Or: Add noise columns. Completely random numerical variables
<-NULL
Nif (pnnoise>0){
for (i in 1:pnnoise){
<-cbind(N,rnorm(n))
N
}
}if (pcnoise>0){
<-round(runif(1,3,9))
qr[rep]for (i in 1:pcnoise){
<-as.factor(round(runif(n,1.5,qr[rep]+0.5)))
Cjif (is.null(N)){
<-cbind(N,Cj)
N<-as.factor(N)
Nelse {
} <-cbind.data.frame(N,Cj)}
N
}
colnames(N)<-(1:(pnnoise+pcnoise))
}
if ((pnnoise+pcnoise)>0) {
<-cbind.data.frame(N,Xorig) # The random variables are the first pnoise
Xorig<-cbind.data.frame(N,X)
X
}
# END add noise columns
for (q in 1:length(qoptions)){
<-rep(qoptions[q],p-pnum) # Choice of categories.
qs# If number to large, we could get
# empty categories. All cat variables
# have same number of categories
<-length(qs)
pcat
for (i in (pnum+1):p){ # Create cat variables by simply evenly cutting based on range
<-cut(Xorig[,i],qs[i-pnum])
X[,i]
}
for (pr in 1:length(presets)){
# For each preset, first create full distance matrix:
if (pr==1){ # This is the real, numerical data: We use manhattan on scaled values:
<-mdist(Xorig,preset=presets[pr],distance_cont = "manhattan",commensurable=FALSE, scaling="std")
Dallelse if (pr==2){ # Additive HL, numerical is scaled, categorical uses HL scaling
} <-mdist(X,preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
Dall
else if (pr==3){ # HL Euclid
} <-mdist(X,
Dalldistance_cont = "euclidean",
commensurable = FALSE,
scaling = "std",
distance_cat = "HLeucl")
else if (pr==4){ # Cat dissimilarity
} <-mdist(X,preset="custom",distance_cat = "cat_dis",commensurable = TRUE)
Dallelse if (pr==5){ # Gower
} <-mdist(X,preset=presets[pr])*ncol(X)
Dallelse { # The other presets are the remaining 3 options
} if (pr==7){ # Catch error if no contvars
if ((pn+pnnoise)==0) {
print("Naive distances error: No numvars")
<-as.numeric(X[,1])
X1<-cbind.data.frame(X1,X[,-1])
X2<-mdist(X2,preset=presets[pr])
Dallelse { #pn+pnnoise !=0, pr==7
} <-mdist(X,preset=presets[pr])
Dall
}else {# pr is not 7
} #print(pr)
<-mdist(X,preset=presets[pr])}
Dall
}
<- cmdscale(Dall,eig=TRUE, k=2)
MDS_all <-CongruenceCoeff(Y,MDS_all$points[,1:2])
CC[rep,pr,q]# Mix dist variants
}
# Number of categories
}
# End replications
}
save(CC,file="boxplot_CC_data.RData")
Code for figure Figure 3
Code
<- 100
reps <-c(2,3,5,9)
qoptions
<-sqrt((1-CC^2)) # We calculated congruences
AC<-c("MAD","MAD Distr","Alienation Coeff", "MAD ranks", "CC ranks", "Relative MADs", "Relative ACs")
MeasNames<-c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind")
variants<-c(1,7,3,2,5,8,4,6)
varreorderstr(AC)
## num [1:100, 1:8, 1:4] 0.2299 0.0585 0.2598 0.2691 0.1671 ...
<-array(NA,c(reps,7,4))
diff_AC
for(q in 1:length(qoptions)){
= matrix(rep(AC[,1,q],7),nrow=reps)
numeric_AC<-AC[,-1,q]-numeric_AC
diff_AC[,,q]
}
= rbind(as_tibble(data.frame(AC[,,1])) |> mutate(categories="two cats"),
AC_tib as_tibble(data.frame(AC[,,2])) |> mutate(categories="three cats"),
as_tibble(data.frame(AC[,,3])) |> mutate(categories="five cats"),
as_tibble(data.frame(AC[,,4])) |> mutate(categories="nine cats")) |>
mutate(categories=fct_inorder(categories))
names(AC_tib) = c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind","categories")
= c(1,7,3,2,5,8,4,6)
variant_ordering
= AC_tib |> select(variant_ordering,categories)
AC_tib
= AC_tib |> pivot_longer(
AC_tib_long cols=c("Num","Naive","HL", "HLa","G","Uind","Ustd","Udep"),
names_to="variants", values_to="diff_AC") |> mutate(variants=fct_inorder(variants))
<- function(n) {
gg_color_hue = seq(15, 375, length = n + 1)
hues hcl(h = hues, l = 65, c = 100)[1:n]
}
= c("darkgrey",gg_color_hue(7)[c(4,2,3,1,5,6,7)])
my_colors
= AC_tib_long |> ggplot(aes(x=(variants), y=diff_AC))+
boxplot_figure_simu geom_boxplot(aes(fill=variants)) +
facet_wrap(~categories)+ theme_bw()+
scale_fill_manual(values=my_colors) + theme(legend.position = "none")+
ylab("Alienation coefficient") + xlab("Variants")
print(boxplot_figure_simu,width=8.5,height=6)
Illustration: FIFA player data
Data preparation and analysis
Code
# Variables are organized: Categorical first, then numerical
#load("Fifa21NL.RData") # Load the data
data(fifa_nl)
#Xorig<-X<-ddata[,-c(1,2)]
<-X<-fifa_nl[,-c(1,2,19:24)] # vars 1 and 2 are a bit useless (too many categories)
Xorig# vars 19:24 interesting but same scale and little variation
# leaving them out makes the plots a bit smaller.
# leaving them in doesn't seem to change a lot
<-nrow(X)
n<-ncol(X)
p<-10 # Number of clusters in cluster analysis
K<-2 # Dimensionality of MDS solution
nd=X[,c(8:1,9:16)] # Order
X=X[,-3] # Remove int rep: 380 have something, 21+7 in other cats
X=X[,c(1:12,14,13,15)]
X<-ncol(X)
p
<-c("HL","HLEuclid","catdis", "gower","unbiased_dependent", "euclidean_onehot","catdissim")#,"gower2","cat dissim")
presets
<-cc_importances<-cc_ranks<-mad_ranks<-matrix(NA,length(presets),p)
mad_importances# ASWs<-array(NA,dim=c(length(presets),p+1,K-1))
#FullClusters<-matrix(NA,n,K-1)
# ARIs<-array(NA,dim=c(length(presets),p,K-1))
<-Sys.time()
t0for (pr in 1:length(presets)){
# For each preset, first create full distance matrix:
if (pr==1){ # This is an HL variant on the manipulated (5 original, 5 discretized) data
# Below we specify that we use manhattan, numerical is scaled, categorical uses HL scaling
<-mdist(X,preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
Dallelse if (pr==2){ # HL Euclidean
} <-mdist(X,
Dalldistance_cont = "euclidean",
commensurable = FALSE,
scaling = "std",
distance_cat = "HLeucl")
else if (pr==3){ # Catdissim
} <-mdist(X,preset="custom",distance_cat = "cat_dis",commensurable = TRUE)
Dallelse if (pr==4){ # Gower
} <-mdist(X,preset=presets[pr])*ncol(X)
Dall# Dall<-mdist(X,distance_cat = "matching",
# distance_cont = "manhattan",
# cat_scaling = "none",
# cont_scaling = "range",
# commensurable = FALSE,
# preset = "custom")
else { # The other presets are the 3 options
} <-mdist(X,preset=presets[pr])}
Dall
<- cmdscale(Dall,eig=TRUE, k=nd) # Full MDS solution
MDS_all # FullClusters<-matrix(NA,n,K-1) # Full cluster solution
# for (k in 2:K){ # we do this for k in 2:K
# pam.out<-pam(Dall,k)
# ASWs[pr,1,k-1]<-pam.out$silinfo$avg.width # if we want to compare asw
# FullClusters[,k-1]<-pam.out$clustering # For each K we get a cluster solution. We compare solutions leaving one-variable out later
# }
#
# Leave-one-out: (still same pr)
for (j in 1:p){
if (pr==1){
<-mdist(X[,-j],preset="custom",distance_cont = "manhattan",distance_cat = "HL", commensurable=FALSE, scaling="std")
Djelse if (pr==2){ # HL 2
} <-mdist(X[,-j],distance_cont = "euclidean",
Djcommensurable = FALSE,
scaling = "std",
distance_cat = "HLeucl")
else if (pr==3){ # Unbiased catdissim
} <-mdist(X[,-j],preset="custom",
Djdistance_cat = "cat_dis",commensurable = TRUE)
else if (pr==4){ # Gower
} <-mdist(X[,-j],preset=presets[pr])*(ncol(X)-1)
Dj# Dall<-mdist(X[,-j],distance_cat = "matching",
# distance_cont = "manhattan",
# cat_scaling = "none",
# cont_scaling = "range",
# commensurable = FALSE,
# preset = "custom")
else { # the remaining presets
} <-mdist(X[,-j],preset=presets[pr])}
Dj
<-mean(abs(Dall-Dj)) # Compare MAD
mad_importances[pr,j]<- cmdscale(Dj,eig=TRUE, k=nd) # Needed to compare MDS
MDS_j <-CongruenceCoeff(MDS_all$points[,1:nd], MDS_j$points[,1:nd])
cc_j<-cc_j # The congruence coefficients
cc_importances[pr,j]
# for (k in 2:K){ # Now compare for each k, the obtained clusters
# pam.out<-pam(Dj,k)
# ASWs[pr,j+1,k-1]<-pam.out$silinfo$avg.width
# ARIs[pr,j,k-1]<-ARI(FullClusters[,k-1],pam.out$clustering) # Compare,
# # for each k, all variable clusters with
# # leave-one-out clusters
#
# } # This gives per pr and k, the effect (ARI) of leaving individual variables out on the clustering
}
# rankings: (Not sure needed)
<-rank(cc_importances[pr,])
cc_ranks[pr,]# mad_ranks[pr,]<-rank(mad_importances[pr,])
}
# (elapsed<-Sys.time()-t0)
<-c("HLa","HL", "Ustd", "G","Udep","Naive","Uind")
variants<-c(6,2,1,4,7,3,5) # To get better sequence
varreorder<-t(mad_importances)
mad_importances
<-sqrt((1-cc_importances^2)) # Change to Alienation
cc_importances<-t(cc_importances)
cc_importances<-rownames(mad_importances) <-rownames(cc_importances)<-names(X)
varnames
#
7]<-"position"
varnames[#varnames[3]<-"int_repu"
1]<-"pref_foot"
varnames[15]<-"clause_eur"
varnames[6]<-"club"
varnames[#
colnames(cc_importances)<-colnames(mad_importances)<-variants
# Reorder appropriate fields:
<-mad_importances[,varreorder]
mad_importances<-cc_importances[,varreorder]
cc_importances# ARIs<-ARIs[varreorder,,]
# ASWs<-ASWs[varreorder,,]
# Create relative importance
<-sweep(mad_importances,2,colSums(mad_importances),`/`)
madrelrownames(mad_importances)<-rownames(cc_importances)<-varnames
Code for Figure 4
Code
= max(mad_importances)
max_y= 0
min_y= max(madrel)
max_y_rel= 0
min_y_rel=0
min_x=p
max_x=max(cc_importances)
max_y_cc= tibble(
rect_data xmin_cont=7.5,xmax_cont=15,ymin_cont=0,ymax_cont=max_y,
xmin_cat=.5,xmax_cat=7.5,ymin_cat=0,ymax_cat=max_y)
= tibble(
rect_data_rel xmin_cont=7.5,xmax_cont=15,ymin_cont=0,ymax_cont=max_y_rel,
xmin_cat=.5,xmax_cat=7.5,ymin_cat=0,ymax_cat=max_y_rel)
= mad_importances |> as_tibble() |>
mad_imp_plot mutate(variable_names=rownames(mad_importances),
var_number=1:nrow(mad_importances)) |>
pivot_longer(cols=Naive:Udep,names_to="variant",values_to="value") |>
ggplot(aes(x=var_number,y=value,color=variant))+
geom_rect(data=rect_data,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line()+
ylab("absolute variable contribution to distance") +
xlab("left-out variable") + theme_bw() + scale_x_continuous(breaks=seq(1,p,1),labels = varnames)+
theme(axis.text.x = element_text(angle = 60, hjust = 1),
legend.position = "none")
= madrel |> as_tibble() |>
madrel_plot mutate(variable_names=rownames(mad_importances),
var_number=1:nrow(mad_importances)) |>
pivot_longer(cols=Naive:Udep,names_to="variant",values_to="value") |>
ggplot(aes(x=var_number,y=value,color=variant))+
geom_rect(data=rect_data_rel,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data_rel,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line()+
ylab("relative variable contribution to distance") +
xlab("left-out variable") + theme_bw() + scale_x_continuous(breaks=seq(1,p,1),labels = varnames)+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
= mad_imp_plot | madrel_plot
fifa_mad
print(fifa_mad, width = 10, height = 5)
Code for Figure 5
Code
=max(cc_importances)
max_y_cc
= tibble(
rect_data_cc xmin_cont=7.5,xmax_cont=15,ymin_cont=0,ymax_cont=max_y_cc,
xmin_cat=.5,xmax_cat=7.5,ymin_cat=0,ymax_cat=max_y_cc)
= cc_importances |> as_tibble() |>
cc_imp_plot mutate(variable_names=rownames(cc_importances),
var_number=1:nrow(cc_importances)) |>
pivot_longer(cols=Naive:Udep,names_to="variant",values_to="value") |>
ggplot(aes(x=var_number,y=value,color=variant))+
geom_rect(data=rect_data_cc,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data_cc,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line()+
ylab("alienation coefficient") +
xlab("left-out variable") + theme_bw() + scale_x_continuous(breaks=seq(1,p,1),labels = varnames)+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
print(cc_imp_plot,width = 8, height = 4)
Appendix D Categorical mean distances: skewed distributions
Data generation
Code
# Let's consider some specific unbalanced scenarios
# One probility is d. The others are (1-d)/(q-1)
# We consider skewed options:
<-c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)
dis
#qi=5
<-c(2,3,5,10)#,18) # Some options for qi
qs
<-length(dis) # Number of options for d (nint-1)
nint<-rsma<-rhl<-sdsm<-rse<-rsof<-rsiof<-sdse<-rsa<-sdsa<-sdsof<-sdsiof<-sddsma<-sdssm<-
rsm<-sddsa<-sddsm<-sdhl<-sddsmca<-sddsof<-sddsiof<-sddshl<- sdshl<-
sddse<-sdsmca<-esm<-em<-ee<-ea<-ehl<-emca<-eof<-eiof<-matrix(NA,nrow=nint,ncol=length(qs))
rsmca
#par(mfrow=c(2,2))
for (qis in 1:length(qs)){
#ds<-NULL
<-qs[qis]
qifor (i in (1:nint)){ # variant 2
<-matrix(1,qi,qi) # Matching
Deltamdiag(Deltam)<-rep(0,qi) # Matching Delta
#d=i/(2*qi) # i=2 corresponds to equal probs if variant 1 is chosen. (Only works for one qi. Not for qs loop)
=dis[i] # variant 2
d
#ds<-rbind(ds,d)
<-c(d,rep((1-d)/(qi-1),(qi-1)))
psum(p)
<-diag(p)
Dp<-p %*% t(p)
pp<-(diag(Dp-pp))^-1
spi<-(diag(Dp-pp))^-.5
spi5
<-(2/(qi^2))*Deltam # Delta Eskin
DEsk
<-log(p)%*%t(log(p)) * Deltam # OF
DOF=160 # For inv OF we need n. Soon gets very large (increases in n) 160 is sufficient
n#n<-ceiling(1/min(p))
<-log(n*p)%*%t(log(n*p)) * Deltam # inverse OF. If some np<1 we can get negatives
DIOF
=160 # For HL we need to find a way to get somewhat consistent values. Let's choose n large
n=p*n
marginals
<-distancefactor(cat=qi,n=n,catsizes=marginals) #
HLemp<-Deltam*HLemp*2#*sqrt(2) # NOTE 13/09: Corrected: Was sqrt(2). But, should be, see paper, 2x
DHL#DE<-sqrt((1/qi)*(spi %*% t(rep(1,qi)) + t(spi %*% t(rep(1,qi)))))
#diag(DE)<-rep(0,qi)
<-sqrt(1/qi)*(spi5 %*% t(rep(1,qi)) + t(spi5 %*% t(rep(1,qi))))
DMdiag(DM)<-rep(0,qi)
<-(1/qi)*diag(spi5)%*%Deltam%*%diag(spi5)
DA<-(1/qi) * diag(p^-.5)%*%Deltam%*%diag(p^-.5)
DMCA
#we<-ee[i,(qis)]<- t(p)%*%DE%*%p
<-esm[i,(qis)]<- t(p)%*%Deltam%*%p # Simple Matching
wsm<-ee[i,(qis)]<- t(p)%*%DEsk%*%p # Eskin
we<-eof[i,(qis)]<-t(p)%*%DOF%*%p # OF
wof<-eiof[i,(qis)]<-t(p)%*%DIOF%*%p # IOF
wiof<-ehl[i,(qis)]<-t(p)%*%DHL%*%p # HL
whl<-em[i,(qis)]<- t(p)%*%DM%*%p # SD scaling
wm<-ea[i,(qis)]<- t(p)%*%DA%*%p # Cat dis scaling
wa<-emca[i,(qis)]<- t(p)%*%DMCA%*%p # MCA scaling
wmca
# We can also estimate the variance among the distances using Deltas: E(X^2)-E(X)^2
# We can do this dividing by expected values so that we look at what happens
# when delta is commensurable. Or not
# if not:
#we<-wm<-wa<-wmca<-wof<-wiof<-whl<-wsm<-1
<- sqrt(t(p)%*%DEsk^2%*%p - ee[i,(qis)]^2)/we # Eskin
sddse[i,(qis)]<- sqrt(t(p)%*%Deltam^2%*%p - esm[i,(qis)]^2)/ wsm # Simple Matching
sddsma[i,(qis)]<- sqrt(t(p)%*%DHL^2%*%p - ehl[i,(qis)]^2)/ whl # HL
sddshl[i,(qis)]<- sqrt(t(p)%*%DM^2%*%p - em[i,(qis)]^2)/ wm # SD
sddsm[i,(qis)]<- sqrt(t(p)%*%DA^2%*%p - ea[i,(qis)]^2)/wa # Cat dissim
sddsa[i,(qis)]<- sqrt(t(p)%*%DMCA^2%*%p - emca[i,(qis)]^2)/wmca # MCA
sddsmca[i,(qis)]<-sqrt(t(p)%*%DOF^2%*%p - eof[i,(qis)]^2)/wof # OF
sddsof[i,(qis)]<-sqrt(t(p)%*%(DIOF/as.numeric(wiof))^2%*%p - 1 ) #IOF
sddsiof[i,(qis)]
# Let's consider spread among cat dissimilarities. But: This only makes sense
# when on similar scales. So, use commensurable deltas: (Or not: see above)
<-sd((1/we)*DEsk[upper.tri(DEsk)]) # Eskin
sdse[i,qis]<-sd((1/wm)*DM[upper.tri(DM)]) # SD
sdsm[i,qis]<-sd((1/whl)*DHL[upper.tri(DHL)]) # HL
sdshl[i,qis]<-sd((1/wsm)*Deltam[upper.tri(Deltam)]) # Simple Matching
sdssm[i,qis]<-sd((1/wa)*DA[upper.tri(DA)]) # Cat dissim,
sdsa[i,qis]<-sd((1/wmca)*DMCA[upper.tri(DMCA)]) # MCA
sdsmca[i,qis]<-sd((1/wof)*DOF[upper.tri(DOF)]) # OF
sdsof[i,qis]<-sd((1/wiof)*DIOF[upper.tri(DIOF)]) # IOF
sdsiof[i,qis]
}
}
<-as.character(qs)
cats<-as.character(dis)#as.character(1:nint) skew
Code for Figure 6
Code
= function(exp_mean_dist,distance_name){
sk_distance_to_plot = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |>
exp_mean_dist rename(`2 cats` = V1,
`3 cats` = V2,
`5 cats` = V3,
`10 cats` = V4) |>
mutate(method=distance_name) |>
pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`),
names_to = "Number of categories", values_to = "Mean distance")
return(exp_mean_dist)
}
=rbind(sk_distance_to_plot(esm,"Simple Matching"),
all_exp_mean_distsk_distance_to_plot(ee,"Eskin"),
sk_distance_to_plot(eof,"OF"),
sk_distance_to_plot(eiof,"IOF")
)
=all_exp_mean_dist |>
paper_plot_skew1mutate(`Number of categories`=fct_inorder(`Number of categories`),
method=fct_inorder(method),) |>
ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
geom_line() + geom_point() + facet_wrap(~method,scales = "free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")
print(paper_plot_skew1,width=10,height=6)
Code for Figure 7
Code
=rbind(sk_distance_to_plot(ehl,"Hennig-Liao"),
all_exp_mean_dist2sk_distance_to_plot(em,"Std. dev."),
sk_distance_to_plot(ea,"Cat. dissim.")
)
=all_exp_mean_dist2 |>
paper_plot_skew2mutate(`Number of categories`=fct_inorder(`Number of categories`),
method=fct_inorder(method),) |>
ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
geom_line() + geom_point() + facet_wrap(~method) + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")
print(paper_plot_skew2)
Code for Figure 8
Code
<-c(2,3,5,10)#,18) # Some options for qs
qs
<-length(dis) # Number of options for d (nint-1)
nint
<-CVtvd<-CVmsd<-CVlhd<-Echi<-Echd<-Etvi<-Etvd<-Emsi<-Elhi<-Elhd<-Emsd<-array(NA,dim=c(length(qs),length(qs),nint,nint))
CVchd
for (q1 in 1:length(qs)){ # cats row
<-qs[q1]
qifor (q2 in q1:length(qs)){ # cats cols
<-qs[q2]
qjfor (pi in 1:nint){ # row distributions
<-dis[pi]
d1<-c(d1,rep((1-d1)/(qj-1),(qj-1))) # nint different marginals: Distribution over colums
p1for (pj in 1:nint){
<-dis[pj]
d2<-c(d2,rep((1-d2)/(qi-1),(qi-1))) # corresponding col marginals: Distribution over rows
p2<-p2 %*% t(p1) # joint table independent
Pi<-Pi/rowSums(Pi) # conditional table independent
Ri# Ci<-Ri/sqrt(p2)
if (qj>qi) {
<-diag(p1[1:qi])
Pd<-matrix(0,(qi-1),(qj-qi))
Addzeros<-p1[qi+1:(qj-qi)]
Addps<-rbind(Addzeros,Addps)
Add<-cbind(Pd,Add)
Pdelse { #qi=qj
} <-p2
pd<-diag(pd) # joint table dependent case (diagonal smallest q)
Pd
}<-Pd/rowSums(Pd)
Rd<-Rd/sqrt(p2) # Chi-square table: cond. prob/sqrt(p)
Cd
# category dissimilarities:
<-LeHo(Rd) # Le Ho
Dlhd# Dmsd<-MousaviSehhati(Pd) # Moussavi
<-dist(Rd,"manhattan")/2 # TVD
Dtvd<-dist(Cd,"euclidean")^2 # Chi2 dist
Dchd# Dchd<-Dchd/(2*(qi-1)) # Chi2 dist corrected for number of categories...
<-t(pd) %*%Dlhd %*%pd
Elhd[q1,q2,pi,pj]# Emsd[q1,q2,pi,pj]<-t(pd) %*%Dmsd %*%pd
<-t(pd) %*%as.matrix(Dtvd) %*%pd
Etvd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dchd) %*%pd
Echd[q1,q2,pi,pj]
# Coefficient of variation
<-sqrt(t(pd) %*%Dlhd^2 %*%pd - (t(pd) %*%Dlhd %*%pd)^2)/(t(pd) %*%Dlhd %*%pd)
CVlhd[q1,q2,pi,pj]# CVmsd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dmsd^2 %*%pd - (t(pd) %*%Dmsd %*%pd)^2)/(t(pd) %*%Dmsd %*%pd)
<-sqrt(t(pd) %*%(as.matrix(Dtvd))^2 %*%pd - (t(pd) %*%as.matrix(Dtvd) %*%pd)^2)/(t(pd) %*%as.matrix(Dtvd) %*%pd)
CVtvd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dchd))^2 %*%pd - (t(pd) %*%as.matrix(Dchd) %*%pd)^2)/(t(pd) %*%as.matrix(Dchd) %*%pd)
CVchd[q1,q2,pi,pj]
<-LeHo(Ri)
Dlhi# Dmsi<-MousaviSehhati(Pi)
<-dist(Ri,"manhattan")/2
Dtvi# Dchi<-dist(Ci,"euclidean")^2
<-t(p2) %*%Dlhi %*%p2
Elhi[q1,q2,pi,pj]# Emsi[q1,q2,pi,pj]<-t(p2) %*%Dmsi %*%p2
<-t(p2) %*%as.matrix(Dtvi) %*%p2
Etvi[q1,q2,pi,pj]# Echi[q1,q2,pi,pj]<-t(p1) %*%as.matrix(Dchi) %*%p1
}
}
}
}
<-as.data.frame(rbind(Elhd[1,1,1,],Elhd[2,2,1,],Elhd[3,3,1,],Elhd[4,4,1,]))
Elhs<-as.data.frame(rbind(Etvd[1,1,1,],Etvd[2,2,1,],Etvd[3,3,1,],Etvd[4,4,1,]))
Etvs<-as.data.frame(rbind(Emsd[1,1,1,],Emsd[2,2,1,],Emsd[3,3,1,],Emsd[4,4,1,]))
Emss<-as.data.frame(rbind(Echd[1,1,1,],Echd[2,2,1,],Echd[3,3,1,],Echd[4,4,1,]))
Echs
<-as.data.frame(rbind(CVlhd[1,1,1,],CVlhd[2,2,1,],CVlhd[3,3,1,],CVlhd[4,4,1,]))
CVlhs<-as.data.frame(rbind(CVtvd[1,1,1,],CVtvd[2,2,1,],CVtvd[3,3,1,],CVtvd[4,4,1,]))
CVtvs<-as.data.frame(rbind(CVmsd[1,1,1,],CVmsd[2,2,1,],CVmsd[3,3,1,],CVmsd[4,4,1,]))
CVmss<-as.data.frame(rbind(CVchd[1,1,1,],CVchd[2,2,1,],CVchd[3,3,1,],CVchd[4,4,1,]))
CVchs
<-as.character(qs)
cats<-as.character(dis)#as.character(1:nint)
skew
= function(exp_mean_dist,distance_name){
sk_distance_to_plot = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |>
exp_mean_dist rename(`2 cats` = V1,
`3 cats` = V2,
`5 cats` = V3,
`10 cats` = V4) |>
mutate(method=distance_name) |>
pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`),
names_to = "Number of categories", values_to = "Mean distance")
return(exp_mean_dist)
}
=rbind(sk_distance_to_plot(t(as.matrix(Elhs)),"Le-Ho"),
all_exp_mean_dist3sk_distance_to_plot(t(as.matrix(Etvs)),"Total Variance Distance")
)
=all_exp_mean_dist3 |>
paper_plot_skew3mutate(`Number of categories`=fct_inorder(`Number of categories`),
method=fct_inorder(method),) |>
ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
geom_line() + geom_point() + facet_wrap(~method,scales="free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")
print(paper_plot_skew3,width=8,height=4)
Appendix E FIFA variable distributions
Code for Figure 9
Code
= X |> as_tibble() |> select(where(is.numeric))
X_tib_num
= X |> as_tibble() |> select(where(is.factor))
X_tib_cat
levels(X_tib_cat$work_rate)=c("H/H", "H/L", "H/M", "L/H", "L/L", "L/M", "M/H", "M/L", "M/M")
levels(X_tib_cat$club_name) = c("ADO","Ajax","AZ","Emmen","Groningen","Twente","Utrecht",
"Feyenoord","Fortuna","Heracles","PEC","PSV","RKC","Heerenveen",
"Sparta","Vitesse","VVV","Willem II")
=X_tib_cat |> rename(`club`=club_name,
X_tib_cat`position`=team_position)
= tibble(values=X_tib_num |> map((~.x)),features=names(X_tib_num))|>
X_tib_num_nest mutate(
hplot=map2(.x=values,.y=features,
function(x=.x,y=.y){
=tibble(a=x)
my_tibnames(my_tib) = y
= my_tib |> ggplot(aes(x)) + geom_histogram(bins=15,alpha=.75)+theme_bw()+ggtitle(y)+
my_plottheme(plot.title = element_text(hjust = 0.5))+xlab("")+ylab("")
}
)
)
= tibble(values=X_tib_cat |> map((~.x)),features=names(X_tib_cat))|>
X_tib_cat_nest mutate(
bplot=map2(.x=values,.y=features,
function(x=.x,y=.y){
=tibble(a=x)
my_tibnames(my_tib) = y
= my_tib |> ggplot(aes(x)) + geom_bar(alpha=.75)+theme_bw()+ggtitle(y)+
my_plottheme(plot.title = element_text(hjust = 0.5),
axis.text.x =element_text(angle=90 ))+xlab("")+ylab("")
}
)
)
=(X_tib_cat_nest$bplot[[1]]/
fifa_descriptive_plots$bplot[[2]]/
X_tib_cat_nest$bplot[[3]]/
X_tib_cat_nest$bplot[[4]]/
X_tib_cat_nest$bplot[[5]]/
X_tib_cat_nest$bplot[[6]]/
X_tib_cat_nest$bplot[[7]])|
X_tib_cat_nest$hplot[[1]]/
(X_tib_num_nest$hplot[[2]]/
X_tib_num_nest$hplot[[3]]/
X_tib_num_nest$hplot[[4]]/
X_tib_num_nest$hplot[[5]]/
X_tib_num_nest$hplot[[6]]/
X_tib_num_nest$hplot[[7]])#+ plot_layout(guides = 'collect')
X_tib_num_nest
print(fifa_descriptive_plots)