Computational time analysis of divisive algorithms

Initialization

Load the required python packages.

# !pip install HiPart
# !pip install scikit-learn-extra
import numpy as np
import pandas as pd

from HiPart.clustering import BisectingKmeans
from HiPart.clustering import DePDDP
from HiPart.clustering import IPDDP
from HiPart.clustering import KMPDDP
from HiPart.clustering import PDDP
from HiPart.clustering import MDH
from sklearn.datasets import make_blobs
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering as hclust
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids as PAM
from scipy.spatial import distance_matrix

import matplotlib.pyplot as plt
import time
import HiPart.visualizations as viz
from matplotlib.pylab import tick_params

1. Simulation study

This study compares the computational time of the algorithms for different sample sizes, number of features in a Monte Carlo simulation. We will compare the following algorithms: agglomerative hierachical clustering (HCl), PDDP, dePDDP, iPDDP, kMeans-PDDP, Bisecting kMeans and partition around medoids (PAM).

Parameter setting

Code
# number of cluster k to be searched in the data by the algorithms
clusters = 2
centers = [-3, 3]
stdk = 1.5
M = 100
P = [50, 100, 200, 500]
verbose = False

nobs = [100, 200, 500, 1000, 2000, 5000, 10000]
OUTPUT = pd.DataFrame({
      "M" : [],
      "P" : [],
      "Size" : [],
      "Method" : [],
      "CluNr" : [],
      "Secs" : []
      })

A number of 2 clusters are generated in the data. Data are generated from a p-variate Gaussian distribution whose parameters are different in each cluster. Clustered data are centered around the p-variate points (-3, …, -3) and (3, …, 3) respectively, while the covariance matrices of of the clusters are diagonal with homoskedastic standard deviations \(\sigma_1\) = 3.0 and \(\sigma_2\) = 1.5. The sample sizes are [100, 200, 500, 1000, 2000, 5000, 10000] and the feature sizes are [50, 100, 200, 500]. The algorithms are executed in a loop for each sample size, number of features and Monte Carlo simulation. The number of Monte Carlo simulations for each scenario is 100.

Simulation step

for mm in range(M):
  if verbose: print("MC loop %d of %d" % (mm+1,M))
  for NN in nobs:
    if verbose: print("   sample size = %d" % NN, end="  ")
    for p in P:
    
      # Parameters for the multivariate Gaussian distributions
      N1 = round(0.6*NN) 
      N2 = round(0.4*NN) 
      
      # Define the mean and covariance for the clusters
      mean1 = centers[0]*np.zeros(p)
      cov1 = np.eye(p) * stdk * 2
      
      mean2 = +centers[1]*np.ones(p)
      cov2 = np.eye(p) * stdk
      
      # Generate the clusters
      cluster1 = np.random.multivariate_normal(mean1, cov1, N1)
      cluster2 = np.random.multivariate_normal(mean2, cov2, N2)
      
      # Combine the clusters
      X = np.vstack((cluster1, cluster2))
      y = np.array([0] * N1 + [1] * N2) 
  
      # %% dePDDP algorithm execution --------------------------------------------
      tic = time.perf_counter()
      depddp = DePDDP(
          decomposition_method="pca",
          max_clusters_number=clusters,
          bandwidth_scale=0.5,
          percentile=0.1
      ).fit(X)
      toc = time.perf_counter()
      depddp_elapsed_time = toc - tic
      if verbose: print("1", end="")
      #  ---------------------------------------------------------------------------
  
      # %% iPDDP algorithm execution -----------------------------------------------
      tic = time.perf_counter()
      ipddp = IPDDP(
          decomposition_method="pca",
          max_clusters_number=clusters,
          percentile=0.1
      ).fit(X)
      toc = time.perf_counter()
      ipddp_elapsed_time = toc - tic
      if verbose: print("2", end="")
      #  ---------------------------------------------------------------------------
  
      # %% kMeans-PDDP algorithm execution -----------------------------------------
      tic = time.perf_counter()
      kmpddp = KMPDDP(
          decomposition_method="pca",
          max_clusters_number=clusters
      ).fit(X)
      toc = time.perf_counter()
      kmpddp_elapsed_time = toc - tic
      if verbose: print("3", end="")
      #  ---------------------------------------------------------------------------
  
      # %% PDDP algorithm execution
      tic = time.perf_counter()
      pddp = PDDP(
          decomposition_method="pca",
          max_clusters_number=clusters
      ).fit(X)
      toc = time.perf_counter()
      pddp_elapsed_time = toc - tic
      if verbose: print("4", end="")
      #  ---------------------------------------------------------------------------
  
      # %% Bisecting kMeans algorithm execution ------------------------------------
      tic = time.perf_counter()
      bikmeans = BisectingKmeans(
          max_clusters_number=clusters
      ).fit(X)
      toc = time.perf_counter()
      bikmeans_elapsed_time = toc - tic
      if verbose: print("5", end="")
      #  ---------------------------------------------------------------------------
  
      # %% HIERARCHICAL CLUSTERING algorithm execution ---------------------------
      tic = time.perf_counter()
      hcl_labels_ = hclust(
          n_clusters=clusters,
          metric='euclidean',
          linkage='ward'
      ).fit_predict(X) #  affinity='euclidean'
      toc = time.perf_counter()
      hcl_elapsed_time = toc - tic
      if verbose: print("6", end="")
      #  ---------------------------------------------------------------------------
  
      # %% KMedoids algorithm execution ------------------------------------
      tic = time.perf_counter()
      pam = PAM(n_clusters=clusters).fit(X)
      toc = time.perf_counter()
      pam_elapsed_time = toc - tic
      if verbose: print("7")
      #  ---------------------------------------------------------------------------
  
      results = pd.DataFrame({
          "M" : [mm, mm, mm, mm, mm, mm, mm],
          "P" : [p, p, p, p, p, p, p],
          "Size" : [NN, NN, NN, NN, NN, NN, NN],
          "Method" : ["dePDDP", "iPDDP", "kMeans-PDDP", "PDDP", "Bisecting kMeans", "HCl", "PAM"],
          "CluNr" : [len(np.unique(depddp.labels_)), len(np.unique(ipddp.labels_)), len(np.unique(kmpddp.labels_)), len(np.unique(pddp.labels_)), len(np.unique(bikmeans.labels_)), len(np.unique(hcl_labels_)), len(np.unique(pam.labels_))],
          "Secs" : [depddp_elapsed_time, ipddp_elapsed_time, kmpddp_elapsed_time, pddp_elapsed_time, bikmeans_elapsed_time, hcl_elapsed_time, pam_elapsed_time]
      })
      OUTPUT = pd.concat([OUTPUT, results])
      
      # if verbose: print(results["Secs"])

2. Plotting the results

Code
if verbose: print(OUTPUT)

# Plotting the computational time by method and number of features
for p in P:
  grouped_df = OUTPUT[OUTPUT["P"] == p].groupby(["Method", "Size"]).agg(
    Avg_Secs=("Secs", "mean")
    ).reset_index()
    
  plt.figure(figsize=(7.5, 6.5))
  plt.plot(grouped_df[grouped_df["Method"] == "PDDP"]["Size"], grouped_df[grouped_df["Method"] == "PDDP"]["Avg_Secs"], marker="o", label="PDDP")
  plt.plot(grouped_df[grouped_df["Method"] == "iPDDP"]["Size"], grouped_df[grouped_df["Method"] == "iPDDP"]["Avg_Secs"], marker="o", label="iPDDP")
  plt.plot(grouped_df[grouped_df["Method"] == "dePDDP"]["Size"], grouped_df[grouped_df["Method"] == "dePDDP"]["Avg_Secs"], marker="o", label="dePDDP")
  plt.plot(grouped_df[grouped_df["Method"] == "kMeans-PDDP"]["Size"], grouped_df[grouped_df["Method"] == "kMeans-PDDP"]["Avg_Secs"], marker="o", label="kMeans-PDDP")
  plt.plot(grouped_df[grouped_df["Method"] == "Bisecting kMeans"]["Size"], grouped_df[grouped_df["Method"] == "Bisecting kMeans"]["Avg_Secs"], marker="o", label="Bisecting kMeans")
  plt.plot(grouped_df[grouped_df["Method"] == "PAM"]["Size"], grouped_df[grouped_df["Method"] == "PAM"]["Avg_Secs"], marker="o", label="PAM")
  plt.plot(grouped_df[grouped_df["Method"] == "HCl"]["Size"], grouped_df[grouped_df["Method"] == "HCl"]["Avg_Secs"], marker="o", label="HCl")
  
  
  # Show the plot
  plt.legend(title="Method", fontsize=10)
  plt.title("Computational Time by Method, %s features" % p, fontsize=14)
  plt.ylabel("Time (log-secs)", fontsize=12)
  plt.yscale('log')
  plt.grid(axis='y', linestyle='--', alpha=0.7)
  plt.tight_layout()
  plt.show()