import numpy as np
import numba
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils import check_array, check_random_state
from sklearn.utils.validation import check_is_fitted
from .numba_kdtree import build_kdtree
from .boruvka import parallel_boruvka
from .cluster_trees import (
mst_to_linkage_tree,
condense_tree,
mask_condensed_tree,
extract_leaves,
get_cluster_label_vector,
get_point_membership_strength_vector,
)
from .clustering_utilities import (
find_peaks,
_binary_search_for_n_clusters,
binary_search_for_n_clusters,
min_cluster_size_barcode,
compute_total_persistence,
extract_clusters_by_id,
select_diverse_peaks,
build_cluster_tree,
find_duplicates,
)
from .knn_graph import knn_graph
from .label_propagation import label_propagation_init
from .node_embedding import node_embedding
from .graph_construction import neighbor_graph_matrix
[docs]
def build_cluster_layers(
data,
*,
min_samples=5,
base_min_cluster_size=10,
base_n_clusters=None,
reproducible_flag=False,
min_similarity_threshold=0.2,
max_layers=10,
):
"""Build hierarchical cluster layers from embedding data.
Parameters
----------
data : array-like of shape (n_samples, n_features)
The embedding data to cluster. Typically the output of a node embedding
algorithm.
min_samples : int, default=5
The minimum number of samples to use in the density estimation when
performing density based clustering.
base_min_cluster_size : int, default=10
The minimum number of points in a cluster at the base layer of the clustering.
This gives the finest granularity clustering that will be returned.
base_n_clusters : int or None, default=None
If not None, the algorithm will attempt to find the granularity of
clustering that will give exactly this many clusters for the bottom-most layer
of clustering. This affects the base layer computation and allows multiple
layers to be built on top of this base.
reproducible_flag : bool, default=False
Whether to ensure reproducible results by using deterministic algorithms
where possible.
min_similarity_threshold : float, default=0.2
The minimum similarity threshold for cluster layer selection. Peaks that result
in clusterings with Jaccard similarity above this threshold will be filtered out
to ensure diverse cluster layers.
max_layers : int, default=10
The maximum number of cluster layers to return. The algorithm will select up to
this many diverse peaks based on persistence and similarity criteria.
Returns
-------
cluster_layers : list of array-like of shape (n_samples,)
The clustering of the data at each layer of the clustering. Each layer
is a clustering of the data into a different number of clusters.
membership_strength_layers : list of array-like of shape (n_samples,)
The membership strengths of each point in the clustering at each layer.
This gives a measure of how strongly each point belongs to each cluster.
persistence_scores : list of float
The persistence scores for each cluster layer, indicating the quality or
stability of the clustering at that layer.
"""
n_samples = data.shape[0]
min_cluster_size = base_min_cluster_size
cluster_layers = []
membership_strength_layers = []
persistence_scores = []
n_threads = numba.get_num_threads()
numba_tree = build_kdtree(data.astype(np.float32))
edges = parallel_boruvka(
numba_tree,
n_threads,
min_samples=min_cluster_size if min_samples is None else min_samples,
reproducible=reproducible_flag,
)
sorted_mst = edges[np.argsort(edges.T[2])]
uncondensed_tree = mst_to_linkage_tree(sorted_mst)
if base_n_clusters is not None:
leaves, clusters, strengths = _binary_search_for_n_clusters(
uncondensed_tree, base_n_clusters, n_samples=n_samples
)
cluster_sizes = np.bincount(clusters[clusters >= 0])
if len(cluster_sizes) > 0:
min_cluster_size = max(1, np.min(cluster_sizes))
else:
min_cluster_size = base_min_cluster_size
# Still need condensed tree for later processing
condensed_tree = condense_tree(uncondensed_tree, min_cluster_size)
else:
condensed_tree = condense_tree(uncondensed_tree, base_min_cluster_size)
leaves = extract_leaves(condensed_tree)
clusters = get_cluster_label_vector(condensed_tree, leaves, 0.0, n_samples)
strengths = get_point_membership_strength_vector(
condensed_tree, leaves, clusters
)
mask = condensed_tree.child >= n_samples
cluster_tree = mask_condensed_tree(condensed_tree, mask)
# points_tree = mask_condensed_tree(condensed_tree, ~mask)
# Check if cluster_tree is valid before processing
if len(cluster_tree.child) > 0 and cluster_tree.child[-1] >= n_samples:
births, deaths, parents, lambda_deaths = min_cluster_size_barcode(
cluster_tree, n_samples, min_cluster_size
)
sizes, total_persistence = compute_total_persistence(
births, deaths, lambda_deaths
)
peaks = find_peaks(total_persistence)
else:
# Handle empty or invalid cluster tree
births = np.array([])
deaths = np.array([])
parents = np.array([])
lambda_deaths = np.array([])
sizes = np.array([])
total_persistence = np.array([])
peaks = np.array([], dtype=np.int64)
# Always include the base layer (from initial condensed tree)
cluster_layers.append(clusters)
membership_strength_layers.append(strengths)
persistence_scores.append(0.0) # Base layer gets 0 persistence score
# Select diverse peaks using hierarchical selection
selected_peaks = select_diverse_peaks(
peaks,
total_persistence,
sizes,
births,
deaths,
min_similarity_threshold=min_similarity_threshold,
max_layers=max_layers - 1, # Reserve one slot for base layer
)
for peak in selected_peaks:
best_birth = sizes[peak]
persistence = total_persistence[peak]
selected_clusters = (
np.where((births <= best_birth) & (deaths > best_birth))[0] + n_samples
)
labels, strengths = extract_clusters_by_id(condensed_tree, selected_clusters)
cluster_layers.append(labels)
membership_strength_layers.append(strengths)
persistence_scores.append(persistence)
# Sort cluster layers by number of clusters (most clusters first)
n_clusters_per_layer = [layer.max() + 1 for layer in cluster_layers]
sorted_indices = np.argsort(n_clusters_per_layer)[::-1] # Descending order
cluster_layers = [cluster_layers[i] for i in sorted_indices]
membership_strength_layers = [membership_strength_layers[i] for i in sorted_indices]
persistence_scores = [persistence_scores[i] for i in sorted_indices]
return cluster_layers, membership_strength_layers, persistence_scores
[docs]
def evoc_clusters(
data,
noise_level=0.5,
base_min_cluster_size=5,
base_n_clusters=None,
approx_n_clusters=None,
n_neighbors=15,
min_samples=5,
n_epochs=50,
node_embedding_init="label_prop",
symmetrize_graph=True,
return_duplicates=False,
node_embedding_dim=None,
neighbor_scale=1.0,
random_state=None,
reproducible_flag=True,
min_similarity_threshold=0.2,
max_layers=10,
n_label_prop_iter=20,
):
"""Cluster data using the EVoC algorithm.
Parameters
----------
data : array-like of shape (n_samples, n_features)
The data to cluster. If the data is float valued then it is assumed to use
cosine distance as a matric. If the data is int8 valued then it is assumed
that a quantized embedding is being used and a quantized version of cosine
distance is used. If the data is uint8 valued then it is assumed that a
binary embedding is being used, and a bitwise Jaccard distance is used.
noise_level : float, default=0.5
The noise level expected in the data. A value of 0.0 will try to cluster
more data, at the expense of getting less accurate clustering. A value of
1.0 will try for accurate clusters, discarding more data as noise to do so.
base_min_cluster_size : int, default=5
The minimum number of points in a cluster at the base layer of the clustering.
This gives the finest granularity clustering that will be returned, with less
graularity at higher layers.
base_n_clusters : int, default=None
If not None, the algorithm will attempt to find the granularity of
clustering that will give exactly this many clusters for the bottom-most layer
of clustering. This affects the base layer computation and allows multiple
layers to be built on top of this base. Since the actual number of clusters
cannot be guaranteed this is only approximate, but usually the algorithm can
manage to get this exact number, assuming a reasonable clustering into
``base_n_clusters`` exists.
approx_n_clusters : int, default=None
If not None, the algorithm will attempt to find the granularity of
clustering that will give exactly this many clusters as the final output.
Unlike ``base_n_clusters``, when this parameter is set, only a single
clustering layer will be returned -- no hierarchical layers will be produced.
This is useful when you know the exact number of clusters you want and don't
need the multi-layer analysis. Since the actual number of clusters cannot be
guaranteed this is only approximate, but usually the algorithm can manage to
get this exact number, assuming a reasonable clustering into ``approx_n_clusters``
exists.
n_neighbors : int, default=15
The number of neighbors to use in the nearest neighbor graph construction.
min_samples : int, default=5
The minimum number of samples to use in the density estimation when
performing density based clustering on the node embedding.
n_epochs : int, default=50
The number of epochs to use when training the node embedding.
node_embedding_init : str or None, default='label_prop'
The method to use to initialize the node embedding. If None, no initialization
will be used. If 'label_prop', the label propagation method will be used.
symmetrize_graph : bool, default=True
Whether to symmetrize the nearest neighbor graph before using it to
construct the node embedding.
return_duplicates : bool, default=False
Whether to return a set of duplicate pairs of points in the data.
node_embedding_dim : int or None, default=None
The number of dimensions to use in the node embedding. If None, a default
value of min(max(n_neighbors // 4, 4), 15) will be used.
neighbor_scale : float, default=1.0
The scale factor to use when constructing the nearest neighbor graph. This
multiplies the effective number of neighbors used in graph construction
(neighbor_scale * n_neighbors). Values > 1.0 create denser graphs with more
connectivity, potentially capturing more global structure but at increased
computational cost. Values < 1.0 create sparser graphs focused on local
structure.
random_state : np.random.RandomState or None, default=None
The random state to use for the random number generator. If None, the random
number generator will not be seeded and will use the system time as the seed.
reproducible_flag : bool, default=True
Whether to ensure reproducible results by using deterministic algorithms
where possible. When True, the clustering results should be consistent
across runs with the same random_state.
min_similarity_threshold : float, default=0.2
The minimum similarity threshold for cluster layer selection. Peaks that result
in clusterings with Jaccard similarity above this threshold will be filtered out
to ensure diverse cluster layers.
max_layers : int, default=10
The maximum number of cluster layers to return. The algorithm will select up to
this many diverse peaks based on persistence and similarity criteria.
n_label_prop_iter : int, default=20
The number of iterations to use in the label propagation algorithm when
initializing the node embedding.
Returns
-------
cluster_layers : list of array-like of shape (n_samples,)
The clustering of the data at each layer of the clustering. Each layer
is a clustering of the data into a different number of clusters.
membership_strengths : list of array-like of shape (n_samples,)
The membership strengths of each point in the clustering at each layer.
This gives a measure of how strongly each point belongs to each cluster.
nn_inds : array-like of shape (n_samples, n_neighbors)
Indices of nearest neighbors for each sample.
nn_dists : array-like of shape (n_samples, n_neighbors)
Distance from each sample to each nearest neighbor indexed by nn_inds
duplicates : set of tuple of int
Only returned in ``return_duplicates`` is True. A set of pairs of indices of
potential duplicate points in the data.
"""
if random_state is None:
random_state = np.random.RandomState()
nn_inds, nn_dists = knn_graph(
data, n_neighbors=n_neighbors, random_state=random_state
)
graph = neighbor_graph_matrix(
neighbor_scale * n_neighbors, nn_inds, nn_dists, symmetrize_graph
)
n_embedding_components = node_embedding_dim or min(max(n_neighbors // 4, 4), 15)
if node_embedding_init == "label_prop":
init_embedding = label_propagation_init(
graph,
n_components=n_embedding_components,
approx_n_parts=np.clip(int(8 * np.sqrt(data.shape[0])), 256, 16384),
random_scale=0.1,
scaling=0.5,
noise_level=noise_level,
random_state=random_state,
data=data,
n_label_prop_iter=n_label_prop_iter,
)
elif node_embedding_init is None:
init_embedding = None
embedding = node_embedding(
graph,
n_components=n_embedding_components,
n_epochs=n_epochs,
initial_embedding=init_embedding,
negative_sample_rate=1.0,
noise_level=noise_level,
random_state=random_state,
verbose=False,
reproducible_flag=reproducible_flag,
initial_alpha=0.1,
)
if return_duplicates:
duplicates = find_duplicates(nn_inds, nn_dists)
n_threads = numba.get_num_threads()
if approx_n_clusters is not None:
cluster_vector, strengths = binary_search_for_n_clusters(
embedding,
approx_n_clusters,
n_threads,
min_samples=min_samples,
)
if return_duplicates:
return [cluster_vector], [strengths], [0.0], nn_inds, nn_dists, duplicates
else:
return [cluster_vector], [strengths], [0.0], nn_inds, nn_dists
else:
cluster_layers, membership_strengths, persistence_scores = build_cluster_layers(
embedding,
min_samples=min_samples,
base_min_cluster_size=base_min_cluster_size,
base_n_clusters=base_n_clusters,
reproducible_flag=reproducible_flag,
min_similarity_threshold=min_similarity_threshold,
max_layers=max_layers,
)
if return_duplicates:
return (
cluster_layers,
membership_strengths,
persistence_scores,
nn_inds,
nn_dists,
duplicates,
)
else:
return (
cluster_layers,
membership_strengths,
persistence_scores,
nn_inds,
nn_dists,
)
[docs]
class EVoC(BaseEstimator, ClusterMixin):
"""
Embedding Vector Oriented Clustering for efficient clustering of high-dimensional
embedding vectors such as CLIP-vectors, sentence-transformers output, etc. The
clustering uses a combination of a node embedding of a nearest neighbour graph,
related to UMAP, and a density based clustering approach related to HDBSCAN,
improving upon those approaches in efficiency and quality for the specific case
of high-dimensional embedding vectors.
Parameters
----------
noise_level : float, default=0.5
The noise level expected in the data. A value of 0.0 will try to cluster
more data, at the expense of getting less accurate clustering. A value of
1.0 will try for accurate clusters, discarding more data as noise to do so.
base_min_cluster_size : int, default=5
The minimum number of points in a cluster at the base layer of the clustering.
This gives the finest granularity clustering that will be returned, with less
graularity at higher layers.
base_n_clusters : int or None, default=None
If not None, the algorithm will attempt to find the granularity of
clustering that will give exactly this many clusters for the bottom-most layer
of clustering. This affects the base layer computation and allows multiple
layers to be built on top of this base. Since the actual number of clusters
cannot be guaranteed this is only approximate, but usually the algorithm can
manage to get this exact number, assuming a reasonable clustering into
``base_n_clusters`` exists.
approx_n_clusters : int, default=None
If not None, the algorithm will attempt to find the granularity of
clustering that will give exactly this many clusters as the final output.
Unlike ``base_n_clusters``, when this parameter is set, only a single
clustering layer will be returned -- no hierarchical layers will be produced.
This is useful when you know the exact number of clusters you want and don't
need the multi-layer analysis. Since the actual number of clusters cannot be
guaranteed this is only approximate, but usually the algorithm can manage to
get this exact number, assuming a reasonable clustering into ``approx_n_clusters``
exists.
n_neighbors : int, default=15
The number of neighbors to use in the nearest neighbor graph construction.
min_samples : int, default=5
The minimum number of samples to use in the density estimation when
performing density based clustering on the node embedding.
n_epochs : int, default=50
The number of epochs to use when training the node embedding.
node_embedding_init : str or None, default='label_prop'
The method to use to initialize the node embedding. If None, no initialization
will be used. If 'label_prop', the label propagation method will be used.
symmetrize_graph : bool, default=True
Whether to symmetrize the nearest neighbor graph before using it to
construct the node embedding.
node_embedding_dim : int or None, default=None
The number of dimensions to use in the node embedding. If None, a default
value of min(max(n_neighbors // 4, 4), 15) will be used.
neighbor_scale : float, default=1.0
The scale factor to use when constructing the nearest neighbor graph. This
multiplies the effective number of neighbors used in graph construction
(neighbor_scale * n_neighbors). Values > 1.0 create denser graphs with more
connectivity, potentially capturing more global structure but at increased
computational cost. Values < 1.0 create sparser graphs focused on local
structure.
random_state : int or None, default=None
The random seed to use for the random number generator. If None, the random
number generator will not be seeded and will use the system time as the seed.
min_similarity_threshold : float, default=0.2
The minimum similarity threshold for cluster layer selection. Peaks that result
in clusterings with Jaccard similarity above this threshold will be filtered out
to ensure diverse cluster layers.
max_layers : int, default=10
The maximum number of cluster layers to return. The algorithm will select up to
this many diverse peaks based on persistence and similarity criteria.
n_label_prop_iter : int, default=20
The number of iterations to use in the label propagation algorithm when
initializing the node embedding. This parameter controls how many steps
the label propagation process takes to converge when node_embedding_init
is set to 'label_prop'.
Attributes
----------
labels_ : array-like of shape (n_samples,)
An array of labels for the data samples; this is a integer array as per other scikit-learn
clustering algorithms. A value of -1 indicates that a point is a noise point and
not in any cluster.
membership_strengths_ : array-like of shape (n_samples,)
An array of membership strengths for the data samples; this gives a measure of how
strongly each point belongs to each cluster. This is a floating point array with
values between 0 and 1.
cluster_layers_ : list of array-like of shape (n_samples,)
The clustering of the data at each layer of the clustering. Each layer
is a clustering of the data into a different number of clusters; the earlier the
cluster vector is in this list the finer the granularity of clustering.
membership_strength_layers_ : list of array-like of shape (n_samples,)
The membership strengths of each point in the clustering at each layer.
cluster_tree_ : dict
A dictionary representing the hierarchical clustering of the data. The keys are
tuples of (layer, cluster) and the values are lists of tuples of (layer, cluster)
representing the children of the key cluster.
nn_inds_ : array-like of shape (n_samples, n_neighbors)
Indices of nearest neighbors for each sample.
nn_dists_ : array-like of shape (n_samples, n_neighbors)
Distance from each sample to each nearest neighbor (indexed by nn_inds).
duplicates_ : set of tuple of int
A set of pairs of indices of potential duplicate points in the data.
"""
[docs]
def __init__(
self,
noise_level: float = 0.5,
base_min_cluster_size: int = 5,
base_n_clusters: int | None = None,
approx_n_clusters: int | None = None,
n_neighbors: int = 15,
min_samples: int = 5,
n_epochs: int = 50,
node_embedding_init: str | None = "label_prop",
symmetrize_graph: bool = True,
node_embedding_dim: int | None = None,
neighbor_scale: float = 1.0,
random_state: int | None = None,
min_similarity_threshold: float = 0.2,
max_layers: int = 10,
n_label_prop_iter=20,
) -> None:
self.n_neighbors = n_neighbors
self.noise_level = noise_level
self.base_min_cluster_size = base_min_cluster_size
self.base_n_clusters = base_n_clusters
self.approx_n_clusters = approx_n_clusters
self.min_samples = min_samples
self.n_epochs = n_epochs
self.node_embedding_init = node_embedding_init
self.symmetrize_graph = symmetrize_graph
self.node_embedding_dim = node_embedding_dim
self.neighbor_scale = neighbor_scale
self.random_state = random_state
self.min_similarity_threshold = min_similarity_threshold
self.max_layers = max_layers
self.n_label_prop_iter = n_label_prop_iter
[docs]
def fit_predict(self, X, y=None, **fit_params):
"""Fit the model to the data and return the clustering labels.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The data to cluster. If the data is float valued then it is assumed to use
cosine distance as a matric. If the data is int8 valued then it is assumed
that a quantized embedding is being used and a quantized version of cosine
distance is used. If the data is uint8 valued then it is assumed that a
binary embedding is being used, and a bitwise Jaccard distance is used.
y : array-like of shape (n_samples,), default=None
Ignored. This parameter exists only for compatibility with
scikit-learn's fit_predict method.
**fit_params : dict
Additional fit parameters. Currently unused, included for compatibility
with scikit-learn's fit_predict interface.
Returns
-------
labels_ : array-like of shape (n_samples,)
An array of labels for the data samples; this is a integer array as per other scikit-learn
clustering algorithms. A value of -1 indicates that a point is a noise point and
not in any cluster.
"""
X = check_array(X)
current_random_state = check_random_state(self.random_state)
(
self.cluster_layers_,
self.membership_strength_layers_,
self.persistence_scores_,
self.nn_inds_,
self.nn_dists_,
self.duplicates_,
) = evoc_clusters(
X,
n_neighbors=self.n_neighbors,
noise_level=self.noise_level,
base_min_cluster_size=self.base_min_cluster_size,
base_n_clusters=self.base_n_clusters,
approx_n_clusters=self.approx_n_clusters,
min_samples=self.min_samples,
n_epochs=self.n_epochs,
node_embedding_init=self.node_embedding_init,
symmetrize_graph=self.symmetrize_graph,
return_duplicates=True,
node_embedding_dim=self.node_embedding_dim,
neighbor_scale=self.neighbor_scale,
random_state=current_random_state,
reproducible_flag=self.random_state is not None,
min_similarity_threshold=self.min_similarity_threshold,
max_layers=self.max_layers,
n_label_prop_iter=self.n_label_prop_iter,
)
if len(self.cluster_layers_) == 1:
self.labels_ = self.cluster_layers_[0]
self.membership_strengths_ = self.membership_strength_layers_[0]
else:
best_layer = np.argmax(self.persistence_scores_)
self.labels_ = self.cluster_layers_[best_layer]
self.membership_strengths_ = self.membership_strength_layers_[best_layer]
return self.labels_
[docs]
def fit(self, X, y=None, **fit_params):
"""Fit the model to the data.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The data to cluster. If the data is float valued then it is assumed to use
cosine distance as a matric. If the data is int8 valued then it is assumed
that a quantized embedding is being used and a quantized version of cosine
distance is used. If the data is uint8 valued then it is assumed that a
binary embedding is being used, and a bitwise Jaccard distance is used.
y : array-like of shape (n_samples,), default=None
Ignored. This parameter exists only for compatibility with
scikit-learn's fit method.
**fit_params : dict
Additional fit parameters. Currently unused, included for compatibility
with scikit-learn's fit interface.
Returns
-------
self : sklearn Estimator
Returns the instance itself.
"""
self.fit_predict(X, y, **fit_params)
return self
@property
def cluster_tree_(self):
"""dict
A dictionary representing the hierarchical clustering of the data.
The keys are tuples of (layer, cluster) and the values are lists of
tuples of (layer, cluster) representing the children of the key cluster.
This provides a tree structure showing how clusters at different layers
relate to each other hierarchically.
Only available after fitting the model.
Returns
-------
dict
Hierarchical tree structure with (layer, cluster) tuples as keys
and lists of child (layer, cluster) tuples as values.
Raises
------
NotFittedError
If the model has not been fitted yet.
"""
check_is_fitted(
self,
"cluster_layers_",
msg="This %(name)s instance is not fitted yet, and 'cluster_tree_' is not available. "
"Please call 'fit' with appropriate arguments before accessing this attribute.",
)
if not hasattr(self, "_cluster_tree"):
self._cluster_tree = build_cluster_tree(self.cluster_layers_)
return self._cluster_tree