Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

The execution time is very slow #13

Open
nabilEM opened this issue Dec 3, 2019 · 4 comments
Open

The execution time is very slow #13

nabilEM opened this issue Dec 3, 2019 · 4 comments

Comments

@nabilEM
Copy link

nabilEM commented Dec 3, 2019

Your solution is interesting. Unfortunately, it is not scalable. I made it turn for 200 points of two dimensions, it takes almost 6 seconds. For thousands of points I can't keep it running anymore.

@yuvalshachaf
Copy link

"""
Implimentation of Density-Based Clustering Validation "DBCV"

Citation:
Moulavi, Davoud, et al. "Density-based clustering validation."
Proceedings of the 2014 SIAM International Conference on Data Mining.
Society for Industrial and Applied Mathematics, 2014.
"""

from multiprocessing import Pool
from functools import partial
import numpy as np
from scipy.spatial.distance import euclidean, cdist
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse import csgraph
from time import time

def DBCV(X, labels, dist_function=euclidean):
"""
Density Based clustering validation

Args:
    X (np.ndarray): ndarray with dimensions [n_samples, n_features]
        data to check validity of clustering
    labels (np.array): clustering assignments for data X
    dist_dunction (func): function to determine distance between objects
        func args must be [np.array, np.array] where each array is a point

Returns: cluster_validity (float)
    score in range[-1, 1] indicating validity of clustering assignments
"""
graph = _mutual_reach_dist_graph_multiproc(X, labels, dist_function)
mst = _mutual_reach_dist_MST(graph)
cluster_validity = _clustering_validity_index_multiproc(mst, labels)
return cluster_validity

def _core_dist(point, neighbors, dist_function):
"""
Computes the core distance of a point.
Core distance is the inverse density of an object.

Args:
    point (np.array): array of dimensions (n_features,)
        point to compute core distance of
    neighbors (np.ndarray): array of dimensions (n_neighbors, n_features):
        array of all other points in object class
    dist_dunction (func): function to determine distance between objects
        func args must be [np.array, np.array] where each array is a point

Returns: core_dist (float)
    inverse density of point
"""
n_features = np.shape(point)[0]
n_neighbors = np.shape(neighbors)[0]

distance_vector = cdist(point.reshape(1, -1), neighbors)
distance_vector = distance_vector[distance_vector != 0]
if len(distance_vector) != 0:
    numerator = ((1/distance_vector)**n_features).sum()
    core_dist = (numerator / (n_neighbors - 1)) ** (-1/n_features)
else:
    core_dist = 0.0
return core_dist

def _mutual_reachability_dist(point_i, point_j, neighbors_i,
neighbors_j, dist_function):
""".
Computes the mutual reachability distance between points

Args:
    point_i (np.array): array of dimensions (n_features,)
        point i to compare to point j
    point_j (np.array): array of dimensions (n_features,)
        point i to compare to point i
    neighbors_i (np.ndarray): array of dims (n_neighbors, n_features):
        array of all other points in object class of point i
    neighbors_j (np.ndarray): array of dims (n_neighbors, n_features):
        array of all other points in object class of point j
    dist_dunction (func): function to determine distance between objects
        func args must be [np.array, np.array] where each array is a point

Returns: mutual_reachability (float)
    mutual reachability between points i and j

"""
core_dist_i = _core_dist(point_i, neighbors_i, dist_function)
core_dist_j = _core_dist(point_j, neighbors_j, dist_function)
dist = dist_function(point_i, point_j)
mutual_reachability = np.max([core_dist_i, core_dist_j, dist])
return mutual_reachability

def _mutual_reach_dist_graph(X, labels, dist_function):
"""
Computes the mutual reach distance complete graph.
Graph of all pair-wise mutual reachability distances between points

Args:
    X (np.ndarray): ndarray with dimensions [n_samples, n_features]
        data to check validity of clustering
    labels (np.array): clustering assignments for data X
    dist_dunction (func): function to determine distance between objects
        func args must be [np.array, np.array] where each array is a point

Returns: graph (np.ndarray)
    array of dimensions (n_samples, n_samples)
    Graph of all pair-wise mutual reachability distances between points.

"""
n_samples = np.shape(X)[0]
graph = []
for row in range(n_samples):
    graph_row = []
    for col in range(n_samples):
        point_i = X[row]
        point_j = X[col]
        class_i = labels[row]
        class_j = labels[col]
        members_i = _get_label_members(X, labels, class_i)
        members_j = _get_label_members(X, labels, class_j)
        dist = _mutual_reachability_dist(point_i, point_j,
                                         members_i, members_j,
                                         dist_function)
        graph_row.append(dist)
    graph.append(graph_row)
graph = np.array(graph)
return graph

def _mutual_reach_dist_graph_worker(X, labels, dist_function, point_members_row):
graph_row = []
for i in point_members_row:
point_i = i[0]; point_j = i[1]; class_i = i[2]; class_j = i[3]
members_i = _get_label_members(X, labels, class_i)
members_j = _get_label_members(X, labels, class_j)
graph_row.append(_mutual_reachability_dist(point_i, point_j,
members_i, members_j,
dist_function))
return graph_row

def _mutual_reach_dist_graph_multiproc(X, labels, dist_function):
processes = 2
n_samples = np.shape(X)[0]
point_members = [[((X[row], X[col], labels[row], labels[col])) for col in range(n_samples)] for row in range(n_samples)]
p = Pool(processes)
func = partial(_mutual_reach_dist_graph_worker, X, labels, dist_function)
graph = p.map(func, point_members)
p.close()
p.join()

graph = np.array(graph)
return graph

def _mutual_reach_dist_MST(dist_tree):
"""
Computes minimum spanning tree of the mutual reach distance complete graph

Args:
    dist_tree (np.ndarray): array of dimensions (n_samples, n_samples)
        Graph of all pair-wise mutual reachability distances
        between points.

Returns: minimum_spanning_tree (np.ndarray)
    array of dimensions (n_samples, n_samples)
    minimum spanning tree of all pair-wise mutual reachability
        distances between points.
"""
mst = minimum_spanning_tree(dist_tree).toarray()
return mst + np.transpose(mst)

def _cluster_density_sparseness(MST, labels, cluster):
"""
Computes the cluster density sparseness, the minimum density
within a cluster

Args:
    MST (np.ndarray): minimum spanning tree of all pair-wise
        mutual reachability distances between points.
    labels (np.array): clustering assignments for data X
    cluster (int): cluster of interest

Returns: cluster_density_sparseness (float)
    value corresponding to the minimum density within a cluster
"""
indices = np.where(labels == cluster)[0]
cluster_MST = MST[indices][:, indices]
cluster_density_sparseness = np.max(cluster_MST)
return cluster_density_sparseness

def _cluster_density_separation(MST, labels, cluster_i, cluster_j):
"""
Computes the density separation between two clusters, the maximum
density between clusters.

Args:
    MST (np.ndarray): minimum spanning tree of all pair-wise
        mutual reachability distances between points.
    labels (np.array): clustering assignments for data X
    cluster_i (int): cluster i of interest
    cluster_j (int): cluster j of interest

Returns: density_separation (float):
    value corresponding to the maximum density between clusters
"""
indices_i = np.where(labels == cluster_i)[0]
indices_j = np.where(labels == cluster_j)[0]
shortest_paths = csgraph.dijkstra(MST, indices=indices_i)
relevant_paths = shortest_paths[:, indices_j]
density_separation = np.min(relevant_paths)
return density_separation

def _cluster_validity_index(MST, labels, cluster):
"""
Computes the validity of a cluster (validity of assignmnets)

Args:
    MST (np.ndarray): minimum spanning tree of all pair-wise
        mutual reachability distances between points.
    labels (np.array): clustering assignments for data X
    cluster (int): cluster of interest

Returns: cluster_validity (float)
    value corresponding to the validity of cluster assignments
"""
min_density_separation = np.inf
for cluster_j in np.unique(labels):
    if cluster_j != cluster:
        cluster_density_separation = _cluster_density_separation(MST,
                                                                 labels,
                                                                 cluster,
                                                                 cluster_j)
        if cluster_density_separation < min_density_separation:
            min_density_separation = cluster_density_separation
cluster_density_sparseness = _cluster_density_sparseness(MST,
                                                         labels,
                                                         cluster)
numerator = min_density_separation - cluster_density_sparseness
denominator = np.max([min_density_separation, cluster_density_sparseness])
cluster_validity = numerator / denominator
return cluster_validity

def _cluster_validity_index_worker(MST, labels, n_samples, label):
fraction = np.sum(labels == label) / float(n_samples)
cluster_validity = _cluster_validity_index(MST, labels, label)
return fraction * cluster_validity

def _clustering_validity_index_multiproc(MST, labels):
processes = 2
n_samples = len(labels)
p = Pool(processes)
func = partial(_cluster_validity_index_worker, MST, labels, n_samples)
validity_index_list = p.map(func, np.unique(labels))
p.close()
p.join()
validity_index = np.sum(np.array(validity_index_list))
return validity_index

def _clustering_validity_index(MST, labels):
"""
Computes the validity of all clustering assignments for a
clustering algorithm

Args:
    MST (np.ndarray): minimum spanning tree of all pair-wise
        mutual reachability distances between points.
    labels (np.array): clustering assignments for data X

Returns: validity_index (float):
    score in range[-1, 1] indicating validity of clustering assignments
"""

n_samples = len(labels)
validity_index = 0
for label in np.unique(labels):
    fraction = np.sum(labels == label) / float(n_samples)
    cluster_validity = _cluster_validity_index(MST, labels, label)
    validity_index += fraction * cluster_validity
return validity_index

def _get_label_members(X, labels, cluster):
"""
Helper function to get samples of a specified cluster.

Args:
    X (np.ndarray): ndarray with dimensions [n_samples, n_features]
        data to check validity of clustering
    labels (np.array): clustering assignments for data X
    cluster (int): cluster of interest

Returns: members (np.ndarray)
    array of dimensions (n_samples, n_features) of samples of the
    specified cluster.
"""
indices = np.where(labels == cluster)[0]
members = X[indices]
return members

@yuvalshachaf
Copy link

yuvalshachaf commented Jul 29, 2020

i have included a version that runs in multiprocess and so runs much faster
Look for p lines include p = Pool(processes)
remember you have to run the DBCV with if name == 'main':
This version includes the bug fix by alashkov83

@nabilEM
Copy link
Author

nabilEM commented Jul 30, 2020

i have included a version that runs in multiprocess and so runs much faster
Look for p lines include p = Pool(processes)
remember you have to run the DBCV with if name == 'main':
This version includes the bug fix by alashkov83

Thank you very much ^^

@barblin
Copy link

barblin commented Feb 24, 2021

First, thanks @christopherjenness for this implementation!

You dont need to introduce parallelism to make it faster. There are some steps you can take to instantly improve the runtime. Im currently having a look at it and these modifications:

  def _mutual_reach_dist_graph(X, labels, dist_function):
  
      neighs_lookup = _create_neighbours_lookup(X, labels)
  
      n_samples = np.shape(X)[0]
      graph = np.zeros(shape=(n_samples, n_samples))
      offset = 0
      for row in range(n_samples):
          offset += 1
          for col in range(offset, n_samples):
              point_i = X[row]
              point_j = X[col]
              class_i = labels[row]
              class_j = labels[col]
              members_i = neighs_lookup[class_i]
              members_j = neighs_lookup[class_j]
              dist = _mutual_reachability_dist(point_i, point_j,
                                               members_i, members_j,
                                               dist_function)
              graph[row][col] = dist
              graph[col][row] = dist
      return graph
  
  def _create_neighbours_lookup(X, labels):
      lookup = {}
      for i in labels:
          if i not in lookup.keys():
              indices = np.where(labels == i)[0]
              lookup[i] = X[indices]
  
      return lookup

Already lead to a considerable improvement. The first thing is, to use the already calculated dist to store it for both directions and skip that calculation step with an incresing offset. The other is, to store the neighbors for a specific label and therefore avoid extracting the neighbours in the for loops.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants