Delta for mixed
Two categories, two continuos variables
To measure the impact of two continuous variables on a pair of categories, we look at how well separated are the groups of observations labeled after the two considered categories.
We consider convex spherical clusters and non-convex clusters, with and without overlap. Below is the collapsed code I used to generate the data, the resulting datasets are represented in the plot.
set.seed(1234)
=1000
n=tibble(category=c("a","b"),props = c(.7,.3))
category_margins= round(n*category_margins %>%
n_a filter(category=="a") %>% pull(props),digits=0)
= round(n*category_margins %>%
n_b filter(category=="b") %>% pull(props),digits=0)
$csize=c(n_a,n_b) category_margins
Code
= mlbench.2dnormals(2*n,cl=2,r=3, sd=.75)
convex_data
= cbind(convex_data$x %>% as_tibble(),tibble(category = convex_data$classes)) %>%
convex_data_a group_by(category) %>%nest() %>% ungroup() %>%
mutate(
category=as.character(category),
csize = category_margins %>% pull(csize),
data = map2(.x= data,.y= csize,~slice_sample(.data=.x, n= .y))
%>% unnest(cols=c(data)) %>%
)mutate(overlap="no_overlap")%>% dplyr::select(V1,V2,category,overlap)
= fct_infreq(convex_data_a$category) %>% levels() %>% .[1]
red_clu = fct_infreq(convex_data_a$category) %>% levels() %>% .[2]
blue_clu
= convex_data_a %>%
convex_data_a mutate(category = as.factor(fct_inorder(fct_recode(category, a=red_clu,b=blue_clu))))
= mlbench.2dnormals(2*n,cl=2,r=.25, sd=.75)
convex_data
= cbind(convex_data$x %>% as_tibble(),tibble(category = convex_data$classes)) %>%
convex_data_b group_by(category) %>%nest() %>% ungroup() %>%
mutate(
category=as.character(category),
csize = category_margins %>% pull(csize),
data = map2(.x= data,.y= csize,~slice_sample(.data=.x, n= .y))
%>% unnest(cols=c(data)) %>%
)mutate(overlap="overlap")%>% dplyr::select(V1,V2,category,overlap)
= fct_infreq(convex_data_b$category) %>% levels() %>% .[1]
red_clu = fct_infreq(convex_data_b$category) %>% levels() %>% .[2]
blue_clu
= convex_data_b %>%
convex_data_b mutate(category = as.factor(fct_inorder(fct_recode(category, a=red_clu,b=blue_clu))))
=rbind(convex_data_a,convex_data_b) %>% ggplot(aes(x=V1,y=V2, color=category)) +
spheres_plotgeom_point(alpha=.5) + facet_wrap(~overlap) + theme(legend.position = "bottom")+
labs(title="spherical")
Code
= rmovMF(n_a, c(0, 0))
non_convex_data_a_cl_1 = rmovMF(n_b, c(0, 0))
non_convex_data_a_cl_2
= rbind(tibble(V1=non_convex_data_a_cl_1[,1],V2=non_convex_data_a_cl_1[,2]) %>%
non_convex_data_a_don mutate(V1=(V1*2)+rnorm(n(),sd=.1),
V2=(V2*2)+rnorm(n(),sd=.1),
category="a"),
tibble(V1=non_convex_data_a_cl_2[,1],V2=non_convex_data_a_cl_2[,2]) %>%
mutate(V1=(V1*.5)+rnorm(n(),sd=.075),
V2=(V2*.5)+rnorm(n(),sd=.075),
category="b")
%>% mutate(overlap="no_overlap")%>% dplyr::select(V1,V2,category,overlap)
)
= rbind(tibble(V1=non_convex_data_a_cl_1[,1],V2=non_convex_data_a_cl_1[,2]) %>%
non_convex_data_b_don mutate(V1=(V1*2)+rnorm(n(),sd=.5),
V2=(V2*2)+rnorm(n(),sd=.5),
category="a"),
tibble(V1=non_convex_data_a_cl_2[,1],V2=non_convex_data_a_cl_2[,2]) %>%
mutate(V1=(V1*1.75)+rnorm(n(),sd=.35),
V2=(V2*1.75)+rnorm(n(),sd=.35),
category="b")
%>% mutate(overlap="overlap")%>% dplyr::select(V1,V2,category,overlap)
)
=rbind(non_convex_data_a_don,non_convex_data_b_don) %>% ggplot(aes(x=V1,y=V2, color=category)) +
donut_plotgeom_point(alpha=.5) + facet_wrap(~overlap) + theme(legend.position = "bottom") + labs(title="donut")
Code
= rmovMF(n_a, c(0, 2))
non_convex_data_a_cl_1_moo = rmovMF(n_b, c(0, -2))
non_convex_data_a_cl_2_moo
= rbind(
non_convex_data_a_moo tibble(V1=non_convex_data_a_cl_1_moo[,1],V2=non_convex_data_a_cl_1_moo[,2]) %>%
mutate(V1=(V1+.5)+rnorm(n(),sd=.1),
V2=(V2)+rnorm(n(),sd=.1),
category="a"),
tibble(V1=non_convex_data_a_cl_2_moo[,1],V2=non_convex_data_a_cl_2_moo[,2]) %>%
mutate(V1=(V1-.5)+rnorm(n(),sd=.1),
V2=(V2)+rnorm(n(),sd=.1),
category="b")
%>% mutate(overlap="no_overlap")%>% dplyr::select(V1,V2,category,overlap)
)
= rbind(tibble(V1=non_convex_data_a_cl_1_moo[,1],V2=non_convex_data_a_cl_1_moo[,2]) %>%
non_convex_data_b_moo mutate(V1=(V1)+rnorm(n(),sd=.25),
V2=(V2-1)+rnorm(n(),sd=.25),
category="a"),
tibble(V1=non_convex_data_a_cl_2_moo[,1],V2=non_convex_data_a_cl_2_moo[,2]) %>%
mutate(V1=(V1)+rnorm(n(),sd=.25),
V2=(V2+.5)+rnorm(n(),sd=.25),
category="b")
%>% mutate(overlap="overlap") %>% dplyr::select(V1,V2,category,overlap)
)
= rbind(non_convex_data_a_moo,non_convex_data_b_moo) %>% ggplot(aes(x=V1,y=V2, color=category)) +
moon_plot geom_point(alpha=.5) + facet_wrap(~overlap) + theme(legend.position = "bottom") + labs(title="half moons")
Computing \({\bf\phi}^{ij}_{ab}\)
Consider a pair of categories a and b from the categorical variable \(i\), and their proportion \(\pi_{a}\) and \(\pi_{b}\), computed wrt \(n_{ab}\), the number of observations that present the category \(a\) OR the cateogory \(b\) (this is important when you have more than two categories, not so in this example, where \(n_{ab}=n=1000\)).
For each observation of the category \(a\) and \(b\), with a proportion of occurrence \({\pi}_{a}\) and \({\pi}_{b}\), respectively, we compute a set of neighbors \({\mathcal N}_{a}\) of size \(k{\pi}_{a}\) and a set of neighbors \({\mathcal N}_{b}\) of size \(k{\pi}_{b}\). We refer to \({\hat \pi}_{a}\) and \({\hat \pi}_{b}\) as the observed proportions within \({\mathcal N}_{a}\) and \({\mathcal N}_{b}\). For each observation of the category \(a\), if \({\hat \pi}_{a}\geq.5\), then the observation is well classified . Same goes for the observations of the category \(b\).
\[ \phi_{ab} = \frac{1}{k{\pi}_{a}}\sum_{i\in\mathcal{N}_{a}}I({\hat \pi}_{a(i)}>.5)-.5\] \[ \phi_{ba} = \frac{1}{k{\pi}_{b}}\sum_{i\in\mathcal{N}_{b}}I({\hat \pi}_{b(i)}>.5)-.5\]
finally, the general measure for the category pair \((a,b)\) is
\[{\bf\phi}^{ij}_{ab} = \frac{1}{2}(\phi_{\mathcal{N}_{a}}+\phi_{\mathcal{N}_{b}})\].
Code
set.seed(1234)
=c(20,50,100,200)
neighbors
= crossing(neighbors,tibble(dataset_type = rep(c("spherical", "donut","half moon"),times=2),
synthetic_data overlap=rep(c("no_overlap", "overlap"),each=3),
data=list(convex_data_a,non_convex_data_a_don,non_convex_data_a_moo,
convex_data_b,non_convex_data_b_don,non_convex_data_b_moo)
)%>% mutate(
) distances = data %>% map(~dist(.x %>% dplyr::select(V1,V2)) %>% as.matrix()),
attribute = data %>% map(~.x$category),
delta_comp = pmap(.l=list(..1=attribute,..2=distances,..3=neighbors),
.f= ~cont_delta_single_attr(x=..1, y=..2,neighbors=..3)),
full_delta=map(.x=delta_comp,~.x$delta),
full_delta_structure=map(.x=delta_comp,~.x$structure),
delta_weight=map_dbl(.x=delta_comp,~.x$single_weight)
)
# synthetic_data %>% filter(overlap=="overlap") %>% dplyr::select(dataset_type,neighbors,delta_weight)
Code
%>% ggplot(aes(x=as.factor(neighbors),y=delta_weight,fill=overlap))+
synthetic_data geom_col(position="dodge") +
facet_grid(dataset_type~.)
As expected/wanted the value of \(\phi_{ab}^{ij}\) is high when the groups defined by \(a\) and \(b\) do not overlap. For the overalpping cases, the considered number of neighbors appears to affect the value of \(\phi_{ab}^{ij}\)