Computational time analysis of divisive algorithms

Initialization

Load the required python packages.

# !pip install HiPart
# !pip install scikit-learn-extra
import numpy as np
import pandas as pd

from HiPart.clustering import BisectingKmeans
from HiPart.clustering import DePDDP
from HiPart.clustering import IPDDP
from HiPart.clustering import KMPDDP
from HiPart.clustering import PDDP
from HiPart.clustering import MDH
from sklearn.datasets import make_blobs
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering as hclust
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids as PAM
from scipy.spatial import distance_matrix

import matplotlib.pyplot as plt
import time
import HiPart.visualizations as viz
from matplotlib.pylab import tick_params

1. Simulation study

This study compares the computational time of the algorithms for different sample sizes, number of features in a Monte Carlo simulation. We will compare the following algorithms: agglomerative hierachical clustering (HCl), PDDP, dePDDP, iPDDP, kMeans-PDDP, Bisecting kMeans and partition around medoids (PAM).

Parameter setting

Code

# number of cluster k to be searched in the data by the algorithms
clusters = 2
centers = [-3, 3]
stdk = 1.5
M = 100
P = [50, 100, 200, 500]
verbose = False

nobs = [100, 200, 500, 1000, 2000, 5000, 10000]
OUTPUT = pd.DataFrame({
      "M" : [],
      "P" : [],
      "Size" : [],
      "Method" : [],
      "CluNr" : [],
      "Secs" : []
      })

A number of 2 clusters are generated in the data. Data are generated from a p-variate Gaussian distribution whose parameters are different in each cluster. Clustered data are centered around the p-variate points (-3, …, -3) and (3, …, 3) respectively, while the covariance matrices of of the clusters are diagonal with homoskedastic standard deviations $\sigma_1$ = 3.0 and $\sigma_2$ = 1.5. The sample sizes are [100, 200, 500, 1000, 2000, 5000, 10000] and the feature sizes are [50, 100, 200, 500]. The algorithms are executed in a loop for each sample size, number of features and Monte Carlo simulation. The number of Monte Carlo simulations for each scenario is 100.

Simulation step

for mm in range(M):
  if verbose: print("MC loop %d of %d" % (mm+1,M))
  for NN in nobs:
    if verbose: print("   sample size = %d" % NN, end="  ")
    for p in P:
    
      # Parameters for the multivariate Gaussian distributions
      N1 = round(0.6*NN) 
      N2 = round(0.4*NN) 
      
      # Define the mean and covariance for the clusters
      mean1 = centers[0]*np.zeros(p)
      cov1 = np.eye(p) * stdk * 2
      
      mean2 = +centers[1]*np.ones(p)
      cov2 = np.eye(p) * stdk
      
      # Generate the clusters
      cluster1 = np.random.multivariate_normal(mean1, cov1, N1)
      cluster2 = np.random.multivariate_normal(mean2, cov2, N2)
      
      # Combine the clusters
      X = np.vstack((cluster1, cluster2))
      y = np.array([0] * N1 + [1] * N2) 
  
      # %% dePDDP algorithm execution --------------------------------------------
      tic = time.perf_counter()
      depddp = DePDDP(
          decomposition_method="pca",
          max_clusters_number=clusters,
          bandwidth_scale=0.5,
          percentile=0.1
      ).fit(X)
      toc = time.perf_counter()
      depddp_elapsed_time = toc - tic
      if verbose: print("1", end="")
      #  ---------------------------------------------------------------------------
  
      # %% iPDDP algorithm execution -----------------------------------------------
      tic = time.perf_counter()
      ipddp = IPDDP(
          decomposition_method="pca",
          max_clusters_number=clusters,
          percentile=0.1
      ).fit(X)
      toc = time.perf_counter()
      ipddp_elapsed_time = toc - tic
      if verbose: print("2", end="")
      #  ---------------------------------------------------------------------------
  
      # %% kMeans-PDDP algorithm execution -----------------------------------------
      tic = time.perf_counter()
      kmpddp = KMPDDP(
          decomposition_method="pca",
          max_clusters_number=clusters
      ).fit(X)
      toc = time.perf_counter()
      kmpddp_elapsed_time = toc - tic
      if verbose: print("3", end="")
      #  ---------------------------------------------------------------------------
  
      # %% PDDP algorithm execution
      tic = time.perf_counter()
      pddp = PDDP(
          decomposition_method="pca",
          max_clusters_number=clusters
      ).fit(X)
      toc = time.perf_counter()
      pddp_elapsed_time = toc - tic
      if verbose: print("4", end="")
      #  ---------------------------------------------------------------------------
  
      # %% Bisecting kMeans algorithm execution ------------------------------------
      tic = time.perf_counter()
      bikmeans = BisectingKmeans(
          max_clusters_number=clusters
      ).fit(X)
      toc = time.perf_counter()
      bikmeans_elapsed_time = toc - tic
      if verbose: print("5", end="")
      #  ---------------------------------------------------------------------------
  
      # %% HIERARCHICAL CLUSTERING algorithm execution ---------------------------
      tic = time.perf_counter()
      hcl_labels_ = hclust(
          n_clusters=clusters,
          metric='euclidean',
          linkage='ward'
      ).fit_predict(X) #  affinity='euclidean'
      toc = time.perf_counter()
      hcl_elapsed_time = toc - tic
      if verbose: print("6", end="")
      #  ---------------------------------------------------------------------------
  
      # %% KMedoids algorithm execution ------------------------------------
      tic = time.perf_counter()
      pam = PAM(n_clusters=clusters).fit(X)
      toc = time.perf_counter()
      pam_elapsed_time = toc - tic
      if verbose: print("7")
      #  ---------------------------------------------------------------------------
  
      results = pd.DataFrame({
          "M" : [mm, mm, mm, mm, mm, mm, mm],
          "P" : [p, p, p, p, p, p, p],
          "Size" : [NN, NN, NN, NN, NN, NN, NN],
          "Method" : ["dePDDP", "iPDDP", "kMeans-PDDP", "PDDP", "Bisecting kMeans", "HCl", "PAM"],
          "CluNr" : [len(np.unique(depddp.labels_)), len(np.unique(ipddp.labels_)), len(np.unique(kmpddp.labels_)), len(np.unique(pddp.labels_)), len(np.unique(bikmeans.labels_)), len(np.unique(hcl_labels_)), len(np.unique(pam.labels_))],
          "Secs" : [depddp_elapsed_time, ipddp_elapsed_time, kmpddp_elapsed_time, pddp_elapsed_time, bikmeans_elapsed_time, hcl_elapsed_time, pam_elapsed_time]
      })
      OUTPUT = pd.concat([OUTPUT, results])
      
      # if verbose: print(results["Secs"])

2. Plotting the results

Code

if verbose: print(OUTPUT)

# Plotting the computational time by method and number of features
for p in P:
  grouped_df = OUTPUT[OUTPUT["P"] == p].groupby(["Method", "Size"]).agg(
    Avg_Secs=("Secs", "mean")
    ).reset_index()
    
  plt.figure(figsize=(7.5, 6.5))
  plt.plot(grouped_df[grouped_df["Method"] == "PDDP"]["Size"], grouped_df[grouped_df["Method"] == "PDDP"]["Avg_Secs"], marker="o", label="PDDP")
  plt.plot(grouped_df[grouped_df["Method"] == "iPDDP"]["Size"], grouped_df[grouped_df["Method"] == "iPDDP"]["Avg_Secs"], marker="o", label="iPDDP")
  plt.plot(grouped_df[grouped_df["Method"] == "dePDDP"]["Size"], grouped_df[grouped_df["Method"] == "dePDDP"]["Avg_Secs"], marker="o", label="dePDDP")
  plt.plot(grouped_df[grouped_df["Method"] == "kMeans-PDDP"]["Size"], grouped_df[grouped_df["Method"] == "kMeans-PDDP"]["Avg_Secs"], marker="o", label="kMeans-PDDP")
  plt.plot(grouped_df[grouped_df["Method"] == "Bisecting kMeans"]["Size"], grouped_df[grouped_df["Method"] == "Bisecting kMeans"]["Avg_Secs"], marker="o", label="Bisecting kMeans")
  plt.plot(grouped_df[grouped_df["Method"] == "PAM"]["Size"], grouped_df[grouped_df["Method"] == "PAM"]["Avg_Secs"], marker="o", label="PAM")
  plt.plot(grouped_df[grouped_df["Method"] == "HCl"]["Size"], grouped_df[grouped_df["Method"] == "HCl"]["Avg_Secs"], marker="o", label="HCl")
  
  
  # Show the plot
  plt.legend(title="Method", fontsize=10)
  plt.title("Computational Time by Method, %s features" % p, fontsize=14)
  plt.ylabel("Time (log-secs)", fontsize=12)
  plt.yscale('log')
  plt.grid(axis='y', linestyle='--', alpha=0.7)
  plt.tight_layout()
  plt.show()

--- title: "Computational time analysis of divisive algorithms" format: html code-fold: true code-tools: true embed-resources: true self-contained: true page-layout: full toc: true toc-title: Contents toc-location: left toc-expand: 2 theme: minty --- ## Initialization Load the required python packages. ```{python} #| code-fold: false # !pip install HiPart # !pip install scikit-learn-extra import numpy as np import pandas as pd from HiPart.clustering import BisectingKmeans from HiPart.clustering import DePDDP from HiPart.clustering import IPDDP from HiPart.clustering import KMPDDP from HiPart.clustering import PDDP from HiPart.clustering import MDH from sklearn.datasets import make_blobs from sklearn import metrics from sklearn.cluster import AgglomerativeClustering as hclust from sklearn.cluster import KMeans from sklearn_extra.cluster import KMedoids as PAM from scipy.spatial import distance_matrix import matplotlib.pyplot as plt import time import HiPart.visualizations as viz from matplotlib.pylab import tick_params ``` ```{python} #| echo: false #| eval: false # Sys.setenv(RETICULATE_PYTHON = "/usr/local/bin/python3.12") ``` ## 1. Simulation study This study compares the computational time of the algorithms for different sample sizes, number of features in a Monte Carlo simulation. We will compare the following algorithms: agglomerative hierachical clustering (HCl), PDDP, dePDDP, iPDDP, kMeans-PDDP, Bisecting kMeans and partition around medoids (PAM). ### Parameter setting ```{python} # number of cluster k to be searched in the data by the algorithms clusters = 2 centers = [-3, 3] stdk = 1.5 M = 100 P = [50, 100, 200, 500] verbose = False nobs = [100, 200, 500, 1000, 2000, 5000, 10000] OUTPUT = pd.DataFrame({ "M" : [], "P" : [], "Size" : [], "Method" : [], "CluNr" : [], "Secs" : [] }) ``` A number of `{python} clusters` clusters are generated in the data. Data are generated from a p-variate Gaussian distribution whose parameters are different in each cluster. Clustered data are centered around the p-variate points (`{python} centers[0]`, ..., `{python} centers[0]`) and (`{python} centers[1]`, ..., `{python} centers[1]`) respectively, while the covariance matrices of of the clusters are diagonal with homoskedastic standard deviations $\sigma_1$ = `{python} stdk*2` and $\sigma_2$ = `{python} stdk*1`. The sample sizes are `{python} nobs` and the feature sizes are `{python} P`. The algorithms are executed in a loop for each sample size, number of features and Monte Carlo simulation. The number of Monte Carlo simulations for each scenario is `{python} M`. ### Simulation step ```{python, results='hide'} #| code-fold: false for mm in range(M): if verbose: print("MC loop %d of %d" % (mm+1,M)) for NN in nobs: if verbose: print(" sample size = %d" % NN, end=" ") for p in P: # Parameters for the multivariate Gaussian distributions N1 = round(0.6*NN) N2 = round(0.4*NN) # Define the mean and covariance for the clusters mean1 = centers[0]*np.zeros(p) cov1 = np.eye(p) * stdk * 2 mean2 = +centers[1]*np.ones(p) cov2 = np.eye(p) * stdk # Generate the clusters cluster1 = np.random.multivariate_normal(mean1, cov1, N1) cluster2 = np.random.multivariate_normal(mean2, cov2, N2) # Combine the clusters X = np.vstack((cluster1, cluster2)) y = np.array([0] * N1 + [1] * N2) # %% dePDDP algorithm execution -------------------------------------------- tic = time.perf_counter() depddp = DePDDP( decomposition_method="pca", max_clusters_number=clusters, bandwidth_scale=0.5, percentile=0.1 ).fit(X) toc = time.perf_counter() depddp_elapsed_time = toc - tic if verbose: print("1", end="") # --------------------------------------------------------------------------- # %% iPDDP algorithm execution ----------------------------------------------- tic = time.perf_counter() ipddp = IPDDP( decomposition_method="pca", max_clusters_number=clusters, percentile=0.1 ).fit(X) toc = time.perf_counter() ipddp_elapsed_time = toc - tic if verbose: print("2", end="") # --------------------------------------------------------------------------- # %% kMeans-PDDP algorithm execution ----------------------------------------- tic = time.perf_counter() kmpddp = KMPDDP( decomposition_method="pca", max_clusters_number=clusters ).fit(X) toc = time.perf_counter() kmpddp_elapsed_time = toc - tic if verbose: print("3", end="") # --------------------------------------------------------------------------- # %% PDDP algorithm execution tic = time.perf_counter() pddp = PDDP( decomposition_method="pca", max_clusters_number=clusters ).fit(X) toc = time.perf_counter() pddp_elapsed_time = toc - tic if verbose: print("4", end="") # --------------------------------------------------------------------------- # %% Bisecting kMeans algorithm execution ------------------------------------ tic = time.perf_counter() bikmeans = BisectingKmeans( max_clusters_number=clusters ).fit(X) toc = time.perf_counter() bikmeans_elapsed_time = toc - tic if verbose: print("5", end="") # --------------------------------------------------------------------------- # %% HIERARCHICAL CLUSTERING algorithm execution --------------------------- tic = time.perf_counter() hcl_labels_ = hclust( n_clusters=clusters, metric='euclidean', linkage='ward' ).fit_predict(X) # affinity='euclidean' toc = time.perf_counter() hcl_elapsed_time = toc - tic if verbose: print("6", end="") # --------------------------------------------------------------------------- # %% KMedoids algorithm execution ------------------------------------ tic = time.perf_counter() pam = PAM(n_clusters=clusters).fit(X) toc = time.perf_counter() pam_elapsed_time = toc - tic if verbose: print("7") # --------------------------------------------------------------------------- results = pd.DataFrame({ "M" : [mm, mm, mm, mm, mm, mm, mm], "P" : [p, p, p, p, p, p, p], "Size" : [NN, NN, NN, NN, NN, NN, NN], "Method" : ["dePDDP", "iPDDP", "kMeans-PDDP", "PDDP", "Bisecting kMeans", "HCl", "PAM"], "CluNr" : [len(np.unique(depddp.labels_)), len(np.unique(ipddp.labels_)), len(np.unique(kmpddp.labels_)), len(np.unique(pddp.labels_)), len(np.unique(bikmeans.labels_)), len(np.unique(hcl_labels_)), len(np.unique(pam.labels_))], "Secs" : [depddp_elapsed_time, ipddp_elapsed_time, kmpddp_elapsed_time, pddp_elapsed_time, bikmeans_elapsed_time, hcl_elapsed_time, pam_elapsed_time] }) OUTPUT = pd.concat([OUTPUT, results]) # if verbose: print(results["Secs"]) ``` ## 2. Plotting the results ```{python} if verbose: print(OUTPUT) # Plotting the computational time by method and number of features for p in P: grouped_df = OUTPUT[OUTPUT["P"] == p].groupby(["Method", "Size"]).agg( Avg_Secs=("Secs", "mean") ).reset_index() plt.figure(figsize=(7.5, 6.5)) plt.plot(grouped_df[grouped_df["Method"] == "PDDP"]["Size"], grouped_df[grouped_df["Method"] == "PDDP"]["Avg_Secs"], marker="o", label="PDDP") plt.plot(grouped_df[grouped_df["Method"] == "iPDDP"]["Size"], grouped_df[grouped_df["Method"] == "iPDDP"]["Avg_Secs"], marker="o", label="iPDDP") plt.plot(grouped_df[grouped_df["Method"] == "dePDDP"]["Size"], grouped_df[grouped_df["Method"] == "dePDDP"]["Avg_Secs"], marker="o", label="dePDDP") plt.plot(grouped_df[grouped_df["Method"] == "kMeans-PDDP"]["Size"], grouped_df[grouped_df["Method"] == "kMeans-PDDP"]["Avg_Secs"], marker="o", label="kMeans-PDDP") plt.plot(grouped_df[grouped_df["Method"] == "Bisecting kMeans"]["Size"], grouped_df[grouped_df["Method"] == "Bisecting kMeans"]["Avg_Secs"], marker="o", label="Bisecting kMeans") plt.plot(grouped_df[grouped_df["Method"] == "PAM"]["Size"], grouped_df[grouped_df["Method"] == "PAM"]["Avg_Secs"], marker="o", label="PAM") plt.plot(grouped_df[grouped_df["Method"] == "HCl"]["Size"], grouped_df[grouped_df["Method"] == "HCl"]["Avg_Secs"], marker="o", label="HCl") # Show the plot plt.legend(title="Method", fontsize=10) plt.title("Computational Time by Method, %s features" % p, fontsize=14) plt.ylabel("Time (log-secs)", fontsize=12) plt.yscale('log') plt.grid(axis='y', linestyle='--', alpha=0.7) plt.tight_layout() plt.show() ```