import numpy as np
import numba
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.manifold import SpectralEmbedding, MDS
from .node_embedding import node_embedding
from .common_nndescent import tau_rand, tau_rand_int
INT32_MIN = np.iinfo(np.int32).min + 1
INT32_MAX = np.iinfo(np.int32).max - 1
@numba.njit(fastmath=True, parallel=True, cache=True)
def label_prop_iteration(
indptr,
indices,
data,
labels,
rng_state,
):
n_rows = indptr.shape[0] - 1
result = labels.copy()
for i in numba.prange(n_rows):
current_l = labels[i]
if current_l >= 0:
continue
local_rng_state = rng_state + i
votes = {}
for k in range(indptr[i], indptr[i + 1]):
j = indices[k]
l = labels[j]
if l in votes:
votes[l] += data[k]
else:
votes[l] = data[k]
max_vote = 1
tie_count = 1
for l in votes:
if l == -1:
continue
elif votes[l] > max_vote:
max_vote = votes[l]
result[i] = l
tie_count = 1
elif votes[l] == max_vote:
tie_count += 1
if current_l == -1:
result[i] = l
elif tau_rand(local_rng_state) < 1.0 / tie_count:
result[i] = l
else:
continue
return result
@numba.njit(fastmath=True, parallel=True, cache=True)
def original_label_prop_iteration(
indptr,
indices,
data,
labels,
rng_state,
):
n_rows = indptr.shape[0] - 1
result = labels.copy()
for i in numba.prange(n_rows):
current_l = labels[i]
local_rng_state = rng_state + i
votes = {}
for k in range(indptr[i], indptr[i + 1]):
j = indices[k]
l = labels[j]
if l in votes:
votes[l] += data[k]
else:
votes[l] = data[k]
max_vote = 1
tie_count = 1
for l in votes:
if l == -1:
continue
elif votes[l] > max_vote:
max_vote = votes[l]
result[i] = l
tie_count = 1
elif votes[l] == max_vote:
tie_count += 1
if current_l == -1:
result[i] = l
elif tau_rand(local_rng_state) < 1.0 / tie_count:
result[i] = l
else:
continue
return result
@numba.njit(cache=True)
def label_outliers(indptr, indices, labels, rng_state):
n_rows = indptr.shape[0] - 1
max_label = labels.max()
for i in numba.prange(n_rows):
local_rng_state = rng_state + i
if labels[i] < 0:
node_queue = [i]
unlabelled = True
n_iter = 0
while unlabelled and n_iter < 64 and len(node_queue) > 0:
n_iter += 1
current_node = node_queue.pop()
for k in range(indptr[current_node], indptr[current_node + 1]):
j = indices[k]
if labels[j] >= 0:
labels[i] = labels[j]
unlabelled = False
break
else:
node_queue.append(j)
if n_iter >= 64 or unlabelled:
labels[i] = tau_rand_int(local_rng_state) % (max_label + 1)
return labels
@numba.njit(cache=True)
def remap_labels(labels):
mapping = {}
unique_labels = np.unique(labels)
if unique_labels[0] == -1:
unique_labels = unique_labels[1:]
for i, l in enumerate(unique_labels):
mapping[l] = i
next_label = i + 1
for i in range(labels.shape[0]):
if labels[i] < 0:
labels[i] = next_label
next_label += 1
else:
labels[i] = mapping[labels[i]]
return labels
def label_prop_loop(
indptr, indices, data, labels, random_state, n_iter=20, approx_n_parts=2048
):
rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)
for i in range(approx_n_parts): # range(int(1.25 * approx_n_parts)):
labels[random_state.randint(labels.shape[0])] = i
for i in range(n_iter):
new_labels = label_prop_iteration(indptr, indices, data, labels, rng_state)
labels = new_labels
labels = label_outliers(indptr, indices, labels, rng_state)
return remap_labels(labels)
def original_label_prop_loop(
indptr, indices, data, labels, random_state, n_iter=20, approx_n_parts=2048
):
rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)
for i in range(int(1.25 * approx_n_parts)):
labels[random_state.randint(labels.shape[0])] = i
for i in range(n_iter):
new_labels = original_label_prop_iteration(
indptr, indices, data, labels, rng_state
)
labels = new_labels
labels = label_outliers(indptr, indices, labels, rng_state)
return remap_labels(labels)
[docs]
def label_propagation_init(
graph,
n_label_prop_iter=20,
n_embedding_epochs=50,
approx_n_parts=512,
n_components=2,
scaling=0.1,
random_scale=1.0,
noise_level=0.5,
random_state=None,
data=None,
recursive_init=True,
base_init="pca",
base_init_threshold=64,
upscaling="partition_expander",
):
"""Initialize a node embedding using label propagation on a sparse graph.
This function provides a high-quality initialization for node embeddings by
combining graph-based label propagation with hierarchical partitioning. For
large graphs, it recursively partitions the data and upscales the results.
For small graphs, it uses direct methods (PCA, spectral embedding, or random).
Parameters
----------
graph : scipy.sparse matrix
A sparse adjacency or weighted graph matrix representing connectivity.
n_label_prop_iter : int, default=20
Number of label propagation iterations to perform on the graph.
n_embedding_epochs : int, default=50
Number of epochs when using node embedding for upscaling.
approx_n_parts : int, default=512
Approximate number of partitions to create for recursive partitioning
of large graphs. Useful for controlling memory and computation.
n_components : int, default=2
The number of dimensions in the output embedding.
scaling : float, default=0.1
Scaling factor applied to label propagation distances.
random_scale : float, default=1.0
Scaling factor for random noise in the initialization.
noise_level : float, default=0.5
The noise level parameter passed to node embedding algorithms.
random_state : RandomState instance or None, default=None
Controls the randomness of the algorithm. If None, uses system randomness.
data : array-like of shape (n_samples, n_features) or None, default=None
The original data array. Required if base_init='pca'. Used for direct
initialization methods on small graphs.
recursive_init : bool, default=True
If True, uses recursive partitioning for large graphs. If False, applies
the base initialization method directly.
base_init : {'pca', 'random', 'spectral', 'mds'}, default='pca'
The initialization method to use for small graphs (when graph size is below
base_init_threshold). 'pca' requires the data parameter.
base_init_threshold : int, default=64
The size threshold below which the base_init method is used directly.
Graphs larger than this use recursive partitioning.
upscaling : {'partition_expander', 'node_embedding'}, default='partition_expander'
The method to use when upscaling partitions back to the full graph.
'partition_expander' uses a fast expansion method, 'node_embedding' uses
full node embedding (slower but potentially better quality).
Returns
-------
embedding : array-like of shape (n_vertices, n_components)
The initialized node embedding based on label propagation and graph structure.
"""
if random_state is None:
random_state = np.random.RandomState()
if graph.shape[0] < base_init_threshold:
if base_init == "random":
result = random_state.normal(
loc=0.0, scale=1.0, size=(graph.shape[0], n_components)
)
norms = np.linalg.norm(result, axis=1, keepdims=True)
result = result / norms
return result.astype(np.float32)
elif base_init == "pca":
result = (
PCA(n_components=n_components, random_state=random_state)
.fit_transform(data)
.astype(np.float32, order="C")
)
result -= result.mean()
result /= (result.max() - result.min()) / 2.0
return result
elif base_init == "spectral":
result = (
SpectralEmbedding(n_components=n_components, random_state=random_state)
.fit_transform(data)
.astype(np.float32, order="C")
)
result -= result.mean()
result /= (result.max() - result.min()) / 2.0
return result
elif base_init == "mds":
result = (
MDS(
n_components=n_components,
random_state=random_state,
n_init=1,
max_iter=300,
)
.fit_transform(data)
.astype(np.float32, order="C")
)
result -= result.mean()
result /= (result.max() - result.min()) / 2.0
return result
else:
raise ValueError(
"Unknown base initialization method. Should be one of ['random', 'pca', 'spectral', 'mds']"
)
labels = np.full(graph.shape[0], -1, dtype=np.int64)
partition = label_prop_loop(
graph.indptr,
graph.indices,
graph.data,
labels,
random_state,
n_label_prop_iter,
approx_n_parts,
)
base_reduction_map = csr_matrix(
(np.ones(partition.shape[0]), partition, np.arange(partition.shape[0] + 1)),
shape=(partition.shape[0], partition.max() + 1),
)
normalized_reduction_map = normalize(base_reduction_map, axis=0, norm="l2")
data_reducer = normalize(normalized_reduction_map.T, norm="l1")
if data is not None:
reduced_data = data_reducer @ data
else:
reduced_data = None
reduced_graph = normalized_reduction_map.T * graph * base_reduction_map
reduced_graph.data = np.clip(reduced_graph.data, 0.0, 1.0)
if recursive_init:
reduced_init = label_propagation_init(
reduced_graph,
n_label_prop_iter=n_label_prop_iter,
n_embedding_epochs=min(255, n_embedding_epochs),
approx_n_parts=approx_n_parts // 4,
n_components=n_components,
scaling=scaling,
random_scale=random_scale,
noise_level=noise_level,
random_state=random_state,
data=reduced_data,
recursive_init=True,
upscaling=upscaling,
base_init=base_init,
base_init_threshold=base_init_threshold,
)
else:
reduced_init = None
reduced_layout = node_embedding(
reduced_graph,
n_components,
n_embedding_epochs,
verbose=False,
noise_level=noise_level,
random_state=random_state,
initial_embedding=reduced_init,
initial_alpha=0.001 * n_embedding_epochs,
)
if upscaling == "partition_expander":
data_expander = normalize(
(graph.multiply(graph.T)) @ normalized_reduction_map, norm="l1"
)
result = (
data_expander @ reduced_layout
+ normalize(normalized_reduction_map, norm="l1") @ reduced_layout
) / 2.0
elif upscaling == "jitter_expander":
data_expander = normalize(
(graph.multiply(graph.T)) @ normalized_reduction_map, norm="l1"
)
expanded = (
data_expander @ reduced_layout
+ normalize(normalized_reduction_map, norm="l1") @ reduced_layout
) / 2.0
jittered = reduced_layout[partition]
jittered += random_state.normal(
scale=random_scale / 4.0, size=(partition.shape[0], reduced_layout.shape[1])
)
result = (expanded + jittered) / 2.0
else:
result = reduced_layout[partition]
result += random_state.normal(
scale=random_scale, size=(partition.shape[0], reduced_layout.shape[1])
)
result = (scaling * (result - result.mean(axis=0))).astype(np.float32)
return result