Two categories, two continuos variables

To measure the impact of two continuous variables on a pair of categories, we look at how well separated are the groups of observations labeled after the two considered categories.

We consider convex spherical clusters and non-convex clusters, with and without overlap. Below is the collapsed code I used to generate the data, the resulting datasets are represented in the plot.

set.seed(1234)
n=1000
category_margins=tibble(category=c("a","b"),props = c(.7,.3))
n_a = round(n*category_margins %>% 
              filter(category=="a") %>% pull(props),digits=0)
n_b = round(n*category_margins %>% 
              filter(category=="b") %>% pull(props),digits=0)


category_margins$csize=c(n_a,n_b)
Code
convex_data = mlbench.2dnormals(2*n,cl=2,r=3, sd=.75)


convex_data_a = cbind(convex_data$x %>% as_tibble(),tibble(category = convex_data$classes)) %>% 
  group_by(category) %>%nest() %>%  ungroup() %>% 
  mutate(
    category=as.character(category),
    csize = category_margins %>% pull(csize),
    data = map2(.x= data,.y= csize,~slice_sample(.data=.x, n= .y))
    )%>% unnest(cols=c(data)) %>% 
  mutate(overlap="no_overlap")%>% dplyr::select(V1,V2,category,overlap)

red_clu = fct_infreq(convex_data_a$category) %>% levels() %>% .[1]
blue_clu= fct_infreq(convex_data_a$category) %>% levels() %>% .[2]

convex_data_a = convex_data_a %>%  
  mutate(category = as.factor(fct_inorder(fct_recode(category, a=red_clu,b=blue_clu))))

convex_data = mlbench.2dnormals(2*n,cl=2,r=.25, sd=.75)

convex_data_b = cbind(convex_data$x %>% as_tibble(),tibble(category = convex_data$classes)) %>%
  group_by(category) %>%nest() %>%  ungroup() %>% 
  mutate(
    category=as.character(category),
    csize = category_margins %>% pull(csize),
    data = map2(.x= data,.y= csize,~slice_sample(.data=.x, n= .y))
    )%>% unnest(cols=c(data)) %>% 
  mutate(overlap="overlap")%>% dplyr::select(V1,V2,category,overlap)

red_clu = fct_infreq(convex_data_b$category) %>% levels() %>% .[1]
blue_clu= fct_infreq(convex_data_b$category) %>% levels() %>% .[2]

convex_data_b = convex_data_b %>%  
  mutate(category = as.factor(fct_inorder(fct_recode(category, a=red_clu,b=blue_clu)))) 




spheres_plot=rbind(convex_data_a,convex_data_b) %>% ggplot(aes(x=V1,y=V2, color=category)) +
  geom_point(alpha=.5) + facet_wrap(~overlap) + theme(legend.position = "bottom")+
  labs(title="spherical")
Code
non_convex_data_a_cl_1 = rmovMF(n_a,  c(0, 0))
non_convex_data_a_cl_2 = rmovMF(n_b,  c(0, 0))
  
non_convex_data_a_don = rbind(tibble(V1=non_convex_data_a_cl_1[,1],V2=non_convex_data_a_cl_1[,2]) %>%
    mutate(V1=(V1*2)+rnorm(n(),sd=.1),
           V2=(V2*2)+rnorm(n(),sd=.1),
           category="a"),
    tibble(V1=non_convex_data_a_cl_2[,1],V2=non_convex_data_a_cl_2[,2]) %>%
      mutate(V1=(V1*.5)+rnorm(n(),sd=.075),
             V2=(V2*.5)+rnorm(n(),sd=.075),
             category="b")
  ) %>% mutate(overlap="no_overlap")%>% dplyr::select(V1,V2,category,overlap)


non_convex_data_b_don = rbind(tibble(V1=non_convex_data_a_cl_1[,1],V2=non_convex_data_a_cl_1[,2]) %>%
    mutate(V1=(V1*2)+rnorm(n(),sd=.5),
           V2=(V2*2)+rnorm(n(),sd=.5),
           category="a"),
    tibble(V1=non_convex_data_a_cl_2[,1],V2=non_convex_data_a_cl_2[,2]) %>%
      mutate(V1=(V1*1.75)+rnorm(n(),sd=.35),
             V2=(V2*1.75)+rnorm(n(),sd=.35),
             category="b")
  ) %>% mutate(overlap="overlap")%>% dplyr::select(V1,V2,category,overlap)

donut_plot=rbind(non_convex_data_a_don,non_convex_data_b_don) %>% ggplot(aes(x=V1,y=V2, color=category)) +
  geom_point(alpha=.5) + facet_wrap(~overlap) + theme(legend.position = "bottom") + labs(title="donut")
Code
non_convex_data_a_cl_1_moo = rmovMF(n_a,  c(0, 2))
non_convex_data_a_cl_2_moo = rmovMF(n_b,  c(0, -2))
  
non_convex_data_a_moo = rbind(
  tibble(V1=non_convex_data_a_cl_1_moo[,1],V2=non_convex_data_a_cl_1_moo[,2]) %>%
    mutate(V1=(V1+.5)+rnorm(n(),sd=.1),
           V2=(V2)+rnorm(n(),sd=.1),
           category="a"),
  tibble(V1=non_convex_data_a_cl_2_moo[,1],V2=non_convex_data_a_cl_2_moo[,2]) %>%
      mutate(V1=(V1-.5)+rnorm(n(),sd=.1),
             V2=(V2)+rnorm(n(),sd=.1),
             category="b")
) %>% mutate(overlap="no_overlap")%>% dplyr::select(V1,V2,category,overlap)

non_convex_data_b_moo = rbind(tibble(V1=non_convex_data_a_cl_1_moo[,1],V2=non_convex_data_a_cl_1_moo[,2]) %>%
    mutate(V1=(V1)+rnorm(n(),sd=.25),
           V2=(V2-1)+rnorm(n(),sd=.25),
           category="a"),
    tibble(V1=non_convex_data_a_cl_2_moo[,1],V2=non_convex_data_a_cl_2_moo[,2]) %>%
      mutate(V1=(V1)+rnorm(n(),sd=.25),
             V2=(V2+.5)+rnorm(n(),sd=.25),
             category="b")
) %>% mutate(overlap="overlap") %>% dplyr::select(V1,V2,category,overlap)

moon_plot = rbind(non_convex_data_a_moo,non_convex_data_b_moo) %>% ggplot(aes(x=V1,y=V2, color=category)) +
  geom_point(alpha=.5) + facet_wrap(~overlap) + theme(legend.position = "bottom") + labs(title="half moons")

Computing \({\bf\phi}^{ij}_{ab}\)

Consider a pair of categories a and b from the categorical variable \(i\), and their proportion \(\pi_{a}\) and \(\pi_{b}\), computed wrt \(n_{ab}\), the number of observations that present the category \(a\) OR the cateogory \(b\) (this is important when you have more than two categories, not so in this example, where \(n_{ab}=n=1000\)).

For each observation of the category \(a\) and \(b\), with a proportion of occurrence \({\pi}_{a}\) and \({\pi}_{b}\), respectively, we compute a set of neighbors \({\mathcal N}_{a}\) of size \(k{\pi}_{a}\) and a set of neighbors \({\mathcal N}_{b}\) of size \(k{\pi}_{b}\). We refer to \({\hat \pi}_{a}\) and \({\hat \pi}_{b}\) as the observed proportions within \({\mathcal N}_{a}\) and \({\mathcal N}_{b}\). For each observation of the category \(a\), if \({\hat \pi}_{a}\geq.5\), then the observation is well classified . Same goes for the observations of the category \(b\).

\[ \phi_{ab} = \frac{1}{k{\pi}_{a}}\sum_{i\in\mathcal{N}_{a}}I({\hat \pi}_{a(i)}>.5)-.5\] \[ \phi_{ba} = \frac{1}{k{\pi}_{b}}\sum_{i\in\mathcal{N}_{b}}I({\hat \pi}_{b(i)}>.5)-.5\]

finally, the general measure for the category pair \((a,b)\) is

\[{\bf\phi}^{ij}_{ab} = \frac{1}{2}(\phi_{\mathcal{N}_{a}}+\phi_{\mathcal{N}_{b}})\].

Code
set.seed(1234)
neighbors=c(20,50,100,200)

synthetic_data = crossing(neighbors,tibble(dataset_type = rep(c("spherical", "donut","half moon"),times=2),
                        overlap=rep(c("no_overlap", "overlap"),each=3),
                        data=list(convex_data_a,non_convex_data_a_don,non_convex_data_a_moo,
                             convex_data_b,non_convex_data_b_don,non_convex_data_b_moo)
                        )
                        ) %>% mutate(
                          distances = data %>% map(~dist(.x %>% dplyr::select(V1,V2)) %>% as.matrix()),
                          attribute = data %>% map(~.x$category),
                          delta_comp =  pmap(.l=list(..1=attribute,..2=distances,..3=neighbors),
                             .f= ~cont_delta_single_attr(x=..1, y=..2,neighbors=..3)),
                          full_delta=map(.x=delta_comp,~.x$delta),
                          full_delta_structure=map(.x=delta_comp,~.x$structure),
                          delta_weight=map_dbl(.x=delta_comp,~.x$single_weight)
    )


# synthetic_data %>% filter(overlap=="overlap") %>%  dplyr::select(dataset_type,neighbors,delta_weight)
Code
synthetic_data %>% ggplot(aes(x=as.factor(neighbors),y=delta_weight,fill=overlap))+
  geom_col(position="dodge") +
  facet_grid(dataset_type~.)

As expected/wanted the value of \(\phi_{ab}^{ij}\) is high when the groups defined by \(a\) and \(b\) do not overlap. For the overalpping cases, the considered number of neighbors appears to affect the value of \(\phi_{ab}^{ij}\)