Delta for mixed

Two categories, two continuos variables

To measure the impact of two continuous variables on a pair of categories, we look at how well separated are the groups of observations labeled after the two considered categories.

We consider convex spherical clusters and non-convex clusters, with and without overlap. Below is the collapsed code I used to generate the data, the resulting datasets are represented in the plot.

set.seed(1234)
n=1000
category_margins=tibble(category=c("a","b"),props = c(.7,.3))
n_a = round(n*category_margins %>% 
              filter(category=="a") %>% pull(props),digits=0)
n_b = round(n*category_margins %>% 
              filter(category=="b") %>% pull(props),digits=0)


category_margins$csize=c(n_a,n_b)

Code

convex_data = mlbench.2dnormals(2*n,cl=2,r=3, sd=.75)


convex_data_a = cbind(convex_data$x %>% as_tibble(),tibble(category = convex_data$classes)) %>% 
  group_by(category) %>%nest() %>%  ungroup() %>% 
  mutate(
    category=as.character(category),
    csize = category_margins %>% pull(csize),
    data = map2(.x= data,.y= csize,~slice_sample(.data=.x, n= .y))
    )%>% unnest(cols=c(data)) %>% 
  mutate(overlap="no_overlap")%>% dplyr::select(V1,V2,category,overlap)

red_clu = fct_infreq(convex_data_a$category) %>% levels() %>% .[1]
blue_clu= fct_infreq(convex_data_a$category) %>% levels() %>% .[2]

convex_data_a = convex_data_a %>%  
  mutate(category = as.factor(fct_inorder(fct_recode(category, a=red_clu,b=blue_clu))))

convex_data = mlbench.2dnormals(2*n,cl=2,r=.25, sd=.75)

convex_data_b = cbind(convex_data$x %>% as_tibble(),tibble(category = convex_data$classes)) %>%
  group_by(category) %>%nest() %>%  ungroup() %>% 
  mutate(
    category=as.character(category),
    csize = category_margins %>% pull(csize),
    data = map2(.x= data,.y= csize,~slice_sample(.data=.x, n= .y))
    )%>% unnest(cols=c(data)) %>% 
  mutate(overlap="overlap")%>% dplyr::select(V1,V2,category,overlap)

red_clu = fct_infreq(convex_data_b$category) %>% levels() %>% .[1]
blue_clu= fct_infreq(convex_data_b$category) %>% levels() %>% .[2]

convex_data_b = convex_data_b %>%  
  mutate(category = as.factor(fct_inorder(fct_recode(category, a=red_clu,b=blue_clu)))) 




spheres_plot=rbind(convex_data_a,convex_data_b) %>% ggplot(aes(x=V1,y=V2, color=category)) +
  geom_point(alpha=.5) + facet_wrap(~overlap) + theme(legend.position = "bottom")+
  labs(title="spherical")

Code

non_convex_data_a_cl_1 = rmovMF(n_a,  c(0, 0))
non_convex_data_a_cl_2 = rmovMF(n_b,  c(0, 0))
  
non_convex_data_a_don = rbind(tibble(V1=non_convex_data_a_cl_1[,1],V2=non_convex_data_a_cl_1[,2]) %>%
    mutate(V1=(V1*2)+rnorm(n(),sd=.1),
           V2=(V2*2)+rnorm(n(),sd=.1),
           category="a"),
    tibble(V1=non_convex_data_a_cl_2[,1],V2=non_convex_data_a_cl_2[,2]) %>%
      mutate(V1=(V1*.5)+rnorm(n(),sd=.075),
             V2=(V2*.5)+rnorm(n(),sd=.075),
             category="b")
  ) %>% mutate(overlap="no_overlap")%>% dplyr::select(V1,V2,category,overlap)


non_convex_data_b_don = rbind(tibble(V1=non_convex_data_a_cl_1[,1],V2=non_convex_data_a_cl_1[,2]) %>%
    mutate(V1=(V1*2)+rnorm(n(),sd=.5),
           V2=(V2*2)+rnorm(n(),sd=.5),
           category="a"),
    tibble(V1=non_convex_data_a_cl_2[,1],V2=non_convex_data_a_cl_2[,2]) %>%
      mutate(V1=(V1*1.75)+rnorm(n(),sd=.35),
             V2=(V2*1.75)+rnorm(n(),sd=.35),
             category="b")
  ) %>% mutate(overlap="overlap")%>% dplyr::select(V1,V2,category,overlap)

donut_plot=rbind(non_convex_data_a_don,non_convex_data_b_don) %>% ggplot(aes(x=V1,y=V2, color=category)) +
  geom_point(alpha=.5) + facet_wrap(~overlap) + theme(legend.position = "bottom") + labs(title="donut")

Code

non_convex_data_a_cl_1_moo = rmovMF(n_a,  c(0, 2))
non_convex_data_a_cl_2_moo = rmovMF(n_b,  c(0, -2))
  
non_convex_data_a_moo = rbind(
  tibble(V1=non_convex_data_a_cl_1_moo[,1],V2=non_convex_data_a_cl_1_moo[,2]) %>%
    mutate(V1=(V1+.5)+rnorm(n(),sd=.1),
           V2=(V2)+rnorm(n(),sd=.1),
           category="a"),
  tibble(V1=non_convex_data_a_cl_2_moo[,1],V2=non_convex_data_a_cl_2_moo[,2]) %>%
      mutate(V1=(V1-.5)+rnorm(n(),sd=.1),
             V2=(V2)+rnorm(n(),sd=.1),
             category="b")
) %>% mutate(overlap="no_overlap")%>% dplyr::select(V1,V2,category,overlap)

non_convex_data_b_moo = rbind(tibble(V1=non_convex_data_a_cl_1_moo[,1],V2=non_convex_data_a_cl_1_moo[,2]) %>%
    mutate(V1=(V1)+rnorm(n(),sd=.25),
           V2=(V2-1)+rnorm(n(),sd=.25),
           category="a"),
    tibble(V1=non_convex_data_a_cl_2_moo[,1],V2=non_convex_data_a_cl_2_moo[,2]) %>%
      mutate(V1=(V1)+rnorm(n(),sd=.25),
             V2=(V2+.5)+rnorm(n(),sd=.25),
             category="b")
) %>% mutate(overlap="overlap") %>% dplyr::select(V1,V2,category,overlap)

moon_plot = rbind(non_convex_data_a_moo,non_convex_data_b_moo) %>% ggplot(aes(x=V1,y=V2, color=category)) +
  geom_point(alpha=.5) + facet_wrap(~overlap) + theme(legend.position = "bottom") + labs(title="half moons")

Computing ${\bf\phi}^{ij}_{ab}$

Consider a pair of categories a and b from the categorical variable $i$, and their proportion $\pi_{a}$ and $\pi_{b}$, computed wrt $n_{ab}$, the number of observations that present the category $a$ OR the cateogory $b$ (this is important when you have more than two categories, not so in this example, where $n_{ab}=n=1000$).

For each observation of the category $a$ and $b$, with a proportion of occurrence ${\pi}_{a}$ and ${\pi}_{b}$, respectively, we compute a set of neighbors ${\mathcal N}_{a}$ of size $k{\pi}_{a}$ and a set of neighbors ${\mathcal N}_{b}$ of size $k{\pi}_{b}$. We refer to ${\hat \pi}_{a}$ and ${\hat \pi}_{b}$ as the observed proportions within ${\mathcal N}_{a}$ and ${\mathcal N}_{b}$. For each observation of the category $a$, if ${\hat \pi}_{a}\geq.5$, then the observation is well classified . Same goes for the observations of the category $b$.

\[ \phi_{ab} = \frac{1}{k{\pi}_{a}}\sum_{i\in\mathcal{N}_{a}}I({\hat \pi}_{a(i)}>.5)-.5\] \[ \phi_{ba} = \frac{1}{k{\pi}_{b}}\sum_{i\in\mathcal{N}_{b}}I({\hat \pi}_{b(i)}>.5)-.5\]

finally, the general measure for the category pair $(a,b)$ is

\[{\bf\phi}^{ij}_{ab} = \frac{1}{2}(\phi_{\mathcal{N}_{a}}+\phi_{\mathcal{N}_{b}})\].

Code

set.seed(1234)
neighbors=c(20,50,100,200)

synthetic_data = crossing(neighbors,tibble(dataset_type = rep(c("spherical", "donut","half moon"),times=2),
                        overlap=rep(c("no_overlap", "overlap"),each=3),
                        data=list(convex_data_a,non_convex_data_a_don,non_convex_data_a_moo,
                             convex_data_b,non_convex_data_b_don,non_convex_data_b_moo)
                        )
                        ) %>% mutate(
                          distances = data %>% map(~dist(.x %>% dplyr::select(V1,V2)) %>% as.matrix()),
                          attribute = data %>% map(~.x$category),
                          delta_comp =  pmap(.l=list(..1=attribute,..2=distances,..3=neighbors),
                             .f= ~cont_delta_single_attr(x=..1, y=..2,neighbors=..3)),
                          full_delta=map(.x=delta_comp,~.x$delta),
                          full_delta_structure=map(.x=delta_comp,~.x$structure),
                          delta_weight=map_dbl(.x=delta_comp,~.x$single_weight)
    )


# synthetic_data %>% filter(overlap=="overlap") %>%  dplyr::select(dataset_type,neighbors,delta_weight)

Code

synthetic_data %>% ggplot(aes(x=as.factor(neighbors),y=delta_weight,fill=overlap))+
  geom_col(position="dodge") +
  facet_grid(dataset_type~.)

As expected/wanted the value of $\phi_{ab}^{ij}$ is high when the groups defined by $a$ and $b$ do not overlap. For the overalpping cases, the considered number of neighbors appears to affect the value of $\phi_{ab}^{ij}$

--- title: "Delta for mixed" format: html code-fold: true code-tools: true embed-resources: true --- ```{r} #| echo: FALSE #| message: FALSE #| warning: FALSE #| library(Matrix) library(movMF) library(data.table) library(fastDummies) library(tidyverse) library(FactoMineR) library(mlbench) library(kableExtra) source("R/cat_delta.R") source("R/cont_delta_single_attr.R") source("R/cat_custom_delta.R") source("R/z_preproc.R") ``` ## Two categories, two continuos variables To measure the impact of two continuous variables on a pair of categories, we look at how well separated are the groups of observations labeled after the two considered categories. We consider convex spherical clusters and non-convex clusters, with and without overlap. Below is the collapsed code I used to generate the data, the resulting datasets are represented in the plot. ::: {.panel-tabset} ## Cluster sizes ```{r} #| code-fold: false set.seed(1234) n=1000 category_margins=tibble(category=c("a","b"),props = c(.7,.3)) n_a = round(n*category_margins %>% filter(category=="a") %>% pull(props),digits=0) n_b = round(n*category_margins %>% filter(category=="b") %>% pull(props),digits=0) category_margins$csize=c(n_a,n_b) ``` ## Convex cluster cases ```{r, message=FALSE, warning=FALSE,fig.align='center'} convex_data = mlbench.2dnormals(2*n,cl=2,r=3, sd=.75) convex_data_a = cbind(convex_data$x %>% as_tibble(),tibble(category = convex_data$classes)) %>% group_by(category) %>%nest() %>% ungroup() %>% mutate( category=as.character(category), csize = category_margins %>% pull(csize), data = map2(.x= data,.y= csize,~slice_sample(.data=.x, n= .y)) )%>% unnest(cols=c(data)) %>% mutate(overlap="no_overlap")%>% dplyr::select(V1,V2,category,overlap) red_clu = fct_infreq(convex_data_a$category) %>% levels() %>% .[1] blue_clu= fct_infreq(convex_data_a$category) %>% levels() %>% .[2] convex_data_a = convex_data_a %>% mutate(category = as.factor(fct_inorder(fct_recode(category, a=red_clu,b=blue_clu)))) convex_data = mlbench.2dnormals(2*n,cl=2,r=.25, sd=.75) convex_data_b = cbind(convex_data$x %>% as_tibble(),tibble(category = convex_data$classes)) %>% group_by(category) %>%nest() %>% ungroup() %>% mutate( category=as.character(category), csize = category_margins %>% pull(csize), data = map2(.x= data,.y= csize,~slice_sample(.data=.x, n= .y)) )%>% unnest(cols=c(data)) %>% mutate(overlap="overlap")%>% dplyr::select(V1,V2,category,overlap) red_clu = fct_infreq(convex_data_b$category) %>% levels() %>% .[1] blue_clu= fct_infreq(convex_data_b$category) %>% levels() %>% .[2] convex_data_b = convex_data_b %>% mutate(category = as.factor(fct_inorder(fct_recode(category, a=red_clu,b=blue_clu)))) spheres_plot=rbind(convex_data_a,convex_data_b) %>% ggplot(aes(x=V1,y=V2, color=category)) + geom_point(alpha=.5) + facet_wrap(~overlap) + theme(legend.position = "bottom")+ labs(title="spherical") ``` ## Non-convex cluster cases: donuts ```{r, message=FALSE, warning=FALSE,fig.align='center'} non_convex_data_a_cl_1 = rmovMF(n_a, c(0, 0)) non_convex_data_a_cl_2 = rmovMF(n_b, c(0, 0)) non_convex_data_a_don = rbind(tibble(V1=non_convex_data_a_cl_1[,1],V2=non_convex_data_a_cl_1[,2]) %>% mutate(V1=(V1*2)+rnorm(n(),sd=.1), V2=(V2*2)+rnorm(n(),sd=.1), category="a"), tibble(V1=non_convex_data_a_cl_2[,1],V2=non_convex_data_a_cl_2[,2]) %>% mutate(V1=(V1*.5)+rnorm(n(),sd=.075), V2=(V2*.5)+rnorm(n(),sd=.075), category="b") ) %>% mutate(overlap="no_overlap")%>% dplyr::select(V1,V2,category,overlap) non_convex_data_b_don = rbind(tibble(V1=non_convex_data_a_cl_1[,1],V2=non_convex_data_a_cl_1[,2]) %>% mutate(V1=(V1*2)+rnorm(n(),sd=.5), V2=(V2*2)+rnorm(n(),sd=.5), category="a"), tibble(V1=non_convex_data_a_cl_2[,1],V2=non_convex_data_a_cl_2[,2]) %>% mutate(V1=(V1*1.75)+rnorm(n(),sd=.35), V2=(V2*1.75)+rnorm(n(),sd=.35), category="b") ) %>% mutate(overlap="overlap")%>% dplyr::select(V1,V2,category,overlap) donut_plot=rbind(non_convex_data_a_don,non_convex_data_b_don) %>% ggplot(aes(x=V1,y=V2, color=category)) + geom_point(alpha=.5) + facet_wrap(~overlap) + theme(legend.position = "bottom") + labs(title="donut") ``` ## Non-convex cluster cases: moons ```{r, message=FALSE, warning=FALSE,fig.align='center', fig.asp=1} non_convex_data_a_cl_1_moo = rmovMF(n_a, c(0, 2)) non_convex_data_a_cl_2_moo = rmovMF(n_b, c(0, -2)) non_convex_data_a_moo = rbind( tibble(V1=non_convex_data_a_cl_1_moo[,1],V2=non_convex_data_a_cl_1_moo[,2]) %>% mutate(V1=(V1+.5)+rnorm(n(),sd=.1), V2=(V2)+rnorm(n(),sd=.1), category="a"), tibble(V1=non_convex_data_a_cl_2_moo[,1],V2=non_convex_data_a_cl_2_moo[,2]) %>% mutate(V1=(V1-.5)+rnorm(n(),sd=.1), V2=(V2)+rnorm(n(),sd=.1), category="b") ) %>% mutate(overlap="no_overlap")%>% dplyr::select(V1,V2,category,overlap) non_convex_data_b_moo = rbind(tibble(V1=non_convex_data_a_cl_1_moo[,1],V2=non_convex_data_a_cl_1_moo[,2]) %>% mutate(V1=(V1)+rnorm(n(),sd=.25), V2=(V2-1)+rnorm(n(),sd=.25), category="a"), tibble(V1=non_convex_data_a_cl_2_moo[,1],V2=non_convex_data_a_cl_2_moo[,2]) %>% mutate(V1=(V1)+rnorm(n(),sd=.25), V2=(V2+.5)+rnorm(n(),sd=.25), category="b") ) %>% mutate(overlap="overlap") %>% dplyr::select(V1,V2,category,overlap) moon_plot = rbind(non_convex_data_a_moo,non_convex_data_b_moo) %>% ggplot(aes(x=V1,y=V2, color=category)) + geom_point(alpha=.5) + facet_wrap(~overlap) + theme(legend.position = "bottom") + labs(title="half moons") ``` ::: ```{r,echo=FALSE, fig.align='center'} spheres_plot donut_plot moon_plot ``` ## Computing ${\bf\phi}^{ij}_{ab}$ Consider a pair of categories a and b from the categorical variable $i$, and their proportion $\pi_{a}$ and $\pi_{b}$, computed wrt $n_{ab}$, the number of observations that present the category $a$ OR the cateogory $b$ (this is important when you have more than two categories, not so in this example, where $n_{ab}=n=1000$). For each observation of the category $a$ and $b$, with a proportion of occurrence ${\pi}_{a}$ and ${\pi}_{b}$, respectively, we compute a set of neighbors ${\mathcal N}_{a}$ of size $k{\pi}_{a}$ and a set of neighbors ${\mathcal N}_{b}$ of size $k{\pi}_{b}$. We refer to ${\hat \pi}_{a}$ and ${\hat \pi}_{b}$ as the observed proportions within ${\mathcal N}_{a}$ and ${\mathcal N}_{b}$. For each observation of the category $a$, **if ${\hat \pi}_{a}\geq.5$, then the observation is well classified **. Same goes for the observations of the category $b$. $$ \phi_{ab} = \frac{1}{k{\pi}_{a}}\sum_{i\in\mathcal{N}_{a}}I({\hat \pi}_{a(i)}>.5)-.5$$ $$ \phi_{ba} = \frac{1}{k{\pi}_{b}}\sum_{i\in\mathcal{N}_{b}}I({\hat \pi}_{b(i)}>.5)-.5$$ finally, the general measure for the category pair $(a,b)$ is $${\bf\phi}^{ij}_{ab} = \frac{1}{2}(\phi_{\mathcal{N}_{a}}+\phi_{\mathcal{N}_{b}})$$.        ```{r, fig.align='center'} set.seed(1234) neighbors=c(20,50,100,200) synthetic_data = crossing(neighbors,tibble(dataset_type = rep(c("spherical", "donut","half moon"),times=2), overlap=rep(c("no_overlap", "overlap"),each=3), data=list(convex_data_a,non_convex_data_a_don,non_convex_data_a_moo, convex_data_b,non_convex_data_b_don,non_convex_data_b_moo) ) ) %>% mutate( distances = data %>% map(~dist(.x %>% dplyr::select(V1,V2)) %>% as.matrix()), attribute = data %>% map(~.x$category), delta_comp = pmap(.l=list(..1=attribute,..2=distances,..3=neighbors), .f= ~cont_delta_single_attr(x=..1, y=..2,neighbors=..3)), full_delta=map(.x=delta_comp,~.x$delta), full_delta_structure=map(.x=delta_comp,~.x$structure), delta_weight=map_dbl(.x=delta_comp,~.x$single_weight) ) # synthetic_data %>% filter(overlap=="overlap") %>% dplyr::select(dataset_type,neighbors,delta_weight) ``` ```{r} synthetic_data %>% ggplot(aes(x=as.factor(neighbors),y=delta_weight,fill=overlap))+ geom_col(position="dodge") + facet_grid(dataset_type~.) ``` As expected/wanted the value of $\phi_{ab}^{ij}$ is high when the groups defined by $a$ and $b$ do not overlap. For the overalpping cases, the considered number of neighbors appears to affect the value of $\phi_{ab}^{ij}$

Two categories, two continuos variables

Computing \({\bf\phi}^{ij}_{ab}\)