# !pip install HiPart
# !pip install scikit-learn-extra
import numpy as np
import pandas as pd
from HiPart.clustering import BisectingKmeans
from HiPart.clustering import DePDDP
from HiPart.clustering import IPDDP
from HiPart.clustering import KMPDDP
from HiPart.clustering import PDDP
from HiPart.clustering import MDH
from sklearn.datasets import make_blobs
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering as hclust
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids as PAM
from scipy.spatial import distance_matrix
import matplotlib.pyplot as plt
import time
import HiPart.visualizations as viz
from matplotlib.pylab import tick_params
Computational time analysis of divisive algorithms
Initialization
Load the required python packages.
1. Simulation study
This study compares the computational time of the algorithms for different sample sizes, number of features in a Monte Carlo simulation. We will compare the following algorithms: agglomerative hierachical clustering (HCl), PDDP, dePDDP, iPDDP, kMeans-PDDP, Bisecting kMeans and partition around medoids (PAM).
Parameter setting
Code
# number of cluster k to be searched in the data by the algorithms
= 2
clusters = [-3, 3]
centers = 1.5
stdk = 100
M = [50, 100, 200, 500]
P = False
verbose
= [100, 200, 500, 1000, 2000, 5000, 10000]
nobs = pd.DataFrame({
OUTPUT "M" : [],
"P" : [],
"Size" : [],
"Method" : [],
"CluNr" : [],
"Secs" : []
})
A number of 2 clusters are generated in the data. Data are generated from a p-variate Gaussian distribution whose parameters are different in each cluster. Clustered data are centered around the p-variate points (-3, …, -3) and (3, …, 3) respectively, while the covariance matrices of of the clusters are diagonal with homoskedastic standard deviations \(\sigma_1\) = 3.0 and \(\sigma_2\) = 1.5. The sample sizes are [100, 200, 500, 1000, 2000, 5000, 10000] and the feature sizes are [50, 100, 200, 500]. The algorithms are executed in a loop for each sample size, number of features and Monte Carlo simulation. The number of Monte Carlo simulations for each scenario is 100.
Simulation step
for mm in range(M):
if verbose: print("MC loop %d of %d" % (mm+1,M))
for NN in nobs:
if verbose: print(" sample size = %d" % NN, end=" ")
for p in P:
# Parameters for the multivariate Gaussian distributions
= round(0.6*NN)
N1 = round(0.4*NN)
N2
# Define the mean and covariance for the clusters
= centers[0]*np.zeros(p)
mean1 = np.eye(p) * stdk * 2
cov1
= +centers[1]*np.ones(p)
mean2 = np.eye(p) * stdk
cov2
# Generate the clusters
= np.random.multivariate_normal(mean1, cov1, N1)
cluster1 = np.random.multivariate_normal(mean2, cov2, N2)
cluster2
# Combine the clusters
= np.vstack((cluster1, cluster2))
X = np.array([0] * N1 + [1] * N2)
y
# %% dePDDP algorithm execution --------------------------------------------
= time.perf_counter()
tic = DePDDP(
depddp ="pca",
decomposition_method=clusters,
max_clusters_number=0.5,
bandwidth_scale=0.1
percentile
).fit(X)= time.perf_counter()
toc = toc - tic
depddp_elapsed_time if verbose: print("1", end="")
# ---------------------------------------------------------------------------
# %% iPDDP algorithm execution -----------------------------------------------
= time.perf_counter()
tic = IPDDP(
ipddp ="pca",
decomposition_method=clusters,
max_clusters_number=0.1
percentile
).fit(X)= time.perf_counter()
toc = toc - tic
ipddp_elapsed_time if verbose: print("2", end="")
# ---------------------------------------------------------------------------
# %% kMeans-PDDP algorithm execution -----------------------------------------
= time.perf_counter()
tic = KMPDDP(
kmpddp ="pca",
decomposition_method=clusters
max_clusters_number
).fit(X)= time.perf_counter()
toc = toc - tic
kmpddp_elapsed_time if verbose: print("3", end="")
# ---------------------------------------------------------------------------
# %% PDDP algorithm execution
= time.perf_counter()
tic = PDDP(
pddp ="pca",
decomposition_method=clusters
max_clusters_number
).fit(X)= time.perf_counter()
toc = toc - tic
pddp_elapsed_time if verbose: print("4", end="")
# ---------------------------------------------------------------------------
# %% Bisecting kMeans algorithm execution ------------------------------------
= time.perf_counter()
tic = BisectingKmeans(
bikmeans =clusters
max_clusters_number
).fit(X)= time.perf_counter()
toc = toc - tic
bikmeans_elapsed_time if verbose: print("5", end="")
# ---------------------------------------------------------------------------
# %% HIERARCHICAL CLUSTERING algorithm execution ---------------------------
= time.perf_counter()
tic = hclust(
hcl_labels_ =clusters,
n_clusters='euclidean',
metric='ward'
linkage# affinity='euclidean'
).fit_predict(X) = time.perf_counter()
toc = toc - tic
hcl_elapsed_time if verbose: print("6", end="")
# ---------------------------------------------------------------------------
# %% KMedoids algorithm execution ------------------------------------
= time.perf_counter()
tic = PAM(n_clusters=clusters).fit(X)
pam = time.perf_counter()
toc = toc - tic
pam_elapsed_time if verbose: print("7")
# ---------------------------------------------------------------------------
= pd.DataFrame({
results "M" : [mm, mm, mm, mm, mm, mm, mm],
"P" : [p, p, p, p, p, p, p],
"Size" : [NN, NN, NN, NN, NN, NN, NN],
"Method" : ["dePDDP", "iPDDP", "kMeans-PDDP", "PDDP", "Bisecting kMeans", "HCl", "PAM"],
"CluNr" : [len(np.unique(depddp.labels_)), len(np.unique(ipddp.labels_)), len(np.unique(kmpddp.labels_)), len(np.unique(pddp.labels_)), len(np.unique(bikmeans.labels_)), len(np.unique(hcl_labels_)), len(np.unique(pam.labels_))],
"Secs" : [depddp_elapsed_time, ipddp_elapsed_time, kmpddp_elapsed_time, pddp_elapsed_time, bikmeans_elapsed_time, hcl_elapsed_time, pam_elapsed_time]
})= pd.concat([OUTPUT, results])
OUTPUT
# if verbose: print(results["Secs"])
2. Plotting the results
Code
if verbose: print(OUTPUT)
# Plotting the computational time by method and number of features
for p in P:
= OUTPUT[OUTPUT["P"] == p].groupby(["Method", "Size"]).agg(
grouped_df =("Secs", "mean")
Avg_Secs
).reset_index()
=(7.5, 6.5))
plt.figure(figsize"Method"] == "PDDP"]["Size"], grouped_df[grouped_df["Method"] == "PDDP"]["Avg_Secs"], marker="o", label="PDDP")
plt.plot(grouped_df[grouped_df["Method"] == "iPDDP"]["Size"], grouped_df[grouped_df["Method"] == "iPDDP"]["Avg_Secs"], marker="o", label="iPDDP")
plt.plot(grouped_df[grouped_df["Method"] == "dePDDP"]["Size"], grouped_df[grouped_df["Method"] == "dePDDP"]["Avg_Secs"], marker="o", label="dePDDP")
plt.plot(grouped_df[grouped_df["Method"] == "kMeans-PDDP"]["Size"], grouped_df[grouped_df["Method"] == "kMeans-PDDP"]["Avg_Secs"], marker="o", label="kMeans-PDDP")
plt.plot(grouped_df[grouped_df["Method"] == "Bisecting kMeans"]["Size"], grouped_df[grouped_df["Method"] == "Bisecting kMeans"]["Avg_Secs"], marker="o", label="Bisecting kMeans")
plt.plot(grouped_df[grouped_df["Method"] == "PAM"]["Size"], grouped_df[grouped_df["Method"] == "PAM"]["Avg_Secs"], marker="o", label="PAM")
plt.plot(grouped_df[grouped_df["Method"] == "HCl"]["Size"], grouped_df[grouped_df["Method"] == "HCl"]["Avg_Secs"], marker="o", label="HCl")
plt.plot(grouped_df[grouped_df[
# Show the plot
="Method", fontsize=10)
plt.legend(title"Computational Time by Method, %s features" % p, fontsize=14)
plt.title("Time (log-secs)", fontsize=12)
plt.ylabel('log')
plt.yscale(='y', linestyle='--', alpha=0.7)
plt.grid(axis
plt.tight_layout() plt.show()