Source code for evoc.label_propagation

import numpy as np
import numba

from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.manifold import SpectralEmbedding, MDS

from .node_embedding import node_embedding
from .common_nndescent import tau_rand, tau_rand_int

INT32_MIN = np.iinfo(np.int32).min + 1
INT32_MAX = np.iinfo(np.int32).max - 1


@numba.njit(fastmath=True, parallel=True, cache=True)
def label_prop_iteration(
    indptr,
    indices,
    data,
    labels,
    rng_state,
):
    n_rows = indptr.shape[0] - 1
    result = labels.copy()

    for i in numba.prange(n_rows):
        current_l = labels[i]
        if current_l >= 0:
            continue
        local_rng_state = rng_state + i
        votes = {}
        for k in range(indptr[i], indptr[i + 1]):
            j = indices[k]
            l = labels[j]
            if l in votes:
                votes[l] += data[k]
            else:
                votes[l] = data[k]

        max_vote = 1
        tie_count = 1
        for l in votes:
            if l == -1:
                continue
            elif votes[l] > max_vote:
                max_vote = votes[l]
                result[i] = l
                tie_count = 1
            elif votes[l] == max_vote:
                tie_count += 1
                if current_l == -1:
                    result[i] = l
                elif tau_rand(local_rng_state) < 1.0 / tie_count:
                    result[i] = l
            else:
                continue

    return result


@numba.njit(fastmath=True, parallel=True, cache=True)
def original_label_prop_iteration(
    indptr,
    indices,
    data,
    labels,
    rng_state,
):
    n_rows = indptr.shape[0] - 1
    result = labels.copy()

    for i in numba.prange(n_rows):
        current_l = labels[i]
        local_rng_state = rng_state + i
        votes = {}
        for k in range(indptr[i], indptr[i + 1]):
            j = indices[k]
            l = labels[j]
            if l in votes:
                votes[l] += data[k]
            else:
                votes[l] = data[k]

        max_vote = 1
        tie_count = 1
        for l in votes:
            if l == -1:
                continue
            elif votes[l] > max_vote:
                max_vote = votes[l]
                result[i] = l
                tie_count = 1
            elif votes[l] == max_vote:
                tie_count += 1
                if current_l == -1:
                    result[i] = l
                elif tau_rand(local_rng_state) < 1.0 / tie_count:
                    result[i] = l
            else:
                continue

    return result


@numba.njit(cache=True)
def label_outliers(indptr, indices, labels, rng_state):
    n_rows = indptr.shape[0] - 1
    max_label = labels.max()

    for i in numba.prange(n_rows):
        local_rng_state = rng_state + i
        if labels[i] < 0:

            node_queue = [i]
            unlabelled = True
            n_iter = 0

            while unlabelled and n_iter < 64 and len(node_queue) > 0:

                n_iter += 1
                current_node = node_queue.pop()
                for k in range(indptr[current_node], indptr[current_node + 1]):
                    j = indices[k]
                    if labels[j] >= 0:
                        labels[i] = labels[j]
                        unlabelled = False
                        break
                    else:
                        node_queue.append(j)

            if n_iter >= 64 or unlabelled:
                labels[i] = tau_rand_int(local_rng_state) % (max_label + 1)

    return labels


@numba.njit(cache=True)
def remap_labels(labels):
    mapping = {}
    unique_labels = np.unique(labels)
    if unique_labels[0] == -1:
        unique_labels = unique_labels[1:]
    for i, l in enumerate(unique_labels):
        mapping[l] = i
    next_label = i + 1
    for i in range(labels.shape[0]):
        if labels[i] < 0:
            labels[i] = next_label
            next_label += 1
        else:
            labels[i] = mapping[labels[i]]

    return labels


def label_prop_loop(
    indptr, indices, data, labels, random_state, n_iter=20, approx_n_parts=2048
):
    rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)
    for i in range(approx_n_parts):  # range(int(1.25 * approx_n_parts)):
        labels[random_state.randint(labels.shape[0])] = i

    for i in range(n_iter):
        new_labels = label_prop_iteration(indptr, indices, data, labels, rng_state)
        labels = new_labels

    labels = label_outliers(indptr, indices, labels, rng_state)
    return remap_labels(labels)


def original_label_prop_loop(
    indptr, indices, data, labels, random_state, n_iter=20, approx_n_parts=2048
):
    rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)
    for i in range(int(1.25 * approx_n_parts)):
        labels[random_state.randint(labels.shape[0])] = i

    for i in range(n_iter):
        new_labels = original_label_prop_iteration(
            indptr, indices, data, labels, rng_state
        )
        labels = new_labels

    labels = label_outliers(indptr, indices, labels, rng_state)
    return remap_labels(labels)



[docs]
def label_propagation_init(
    graph,
    n_label_prop_iter=20,
    n_embedding_epochs=50,
    approx_n_parts=512,
    n_components=2,
    scaling=0.1,
    random_scale=1.0,
    noise_level=0.5,
    random_state=None,
    data=None,
    recursive_init=True,
    base_init="pca",
    base_init_threshold=64,
    upscaling="partition_expander",
):
    """Initialize a node embedding using label propagation on a sparse graph.

    This function provides a high-quality initialization for node embeddings by
    combining graph-based label propagation with hierarchical partitioning. For
    large graphs, it recursively partitions the data and upscales the results.
    For small graphs, it uses direct methods (PCA, spectral embedding, or random).

    Parameters
    ----------
    graph : scipy.sparse matrix
        A sparse adjacency or weighted graph matrix representing connectivity.

    n_label_prop_iter : int, default=20
        Number of label propagation iterations to perform on the graph.

    n_embedding_epochs : int, default=50
        Number of epochs when using node embedding for upscaling.

    approx_n_parts : int, default=512
        Approximate number of partitions to create for recursive partitioning
        of large graphs. Useful for controlling memory and computation.

    n_components : int, default=2
        The number of dimensions in the output embedding.

    scaling : float, default=0.1
        Scaling factor applied to label propagation distances.

    random_scale : float, default=1.0
        Scaling factor for random noise in the initialization.

    noise_level : float, default=0.5
        The noise level parameter passed to node embedding algorithms.

    random_state : RandomState instance or None, default=None
        Controls the randomness of the algorithm. If None, uses system randomness.

    data : array-like of shape (n_samples, n_features) or None, default=None
        The original data array. Required if base_init='pca'. Used for direct
        initialization methods on small graphs.

    recursive_init : bool, default=True
        If True, uses recursive partitioning for large graphs. If False, applies
        the base initialization method directly.

    base_init : {'pca', 'random', 'spectral', 'mds'}, default='pca'
        The initialization method to use for small graphs (when graph size is below
        base_init_threshold). 'pca' requires the data parameter.

    base_init_threshold : int, default=64
        The size threshold below which the base_init method is used directly.
        Graphs larger than this use recursive partitioning.

    upscaling : {'partition_expander', 'node_embedding'}, default='partition_expander'
        The method to use when upscaling partitions back to the full graph.
        'partition_expander' uses a fast expansion method, 'node_embedding' uses
        full node embedding (slower but potentially better quality).

    Returns
    -------
    embedding : array-like of shape (n_vertices, n_components)
        The initialized node embedding based on label propagation and graph structure.
    """
    if random_state is None:
        random_state = np.random.RandomState()

    if graph.shape[0] < base_init_threshold:
        if base_init == "random":
            result = random_state.normal(
                loc=0.0, scale=1.0, size=(graph.shape[0], n_components)
            )
            norms = np.linalg.norm(result, axis=1, keepdims=True)
            result = result / norms
            return result.astype(np.float32)
        elif base_init == "pca":
            result = (
                PCA(n_components=n_components, random_state=random_state)
                .fit_transform(data)
                .astype(np.float32, order="C")
            )
            result -= result.mean()
            result /= (result.max() - result.min()) / 2.0
            return result
        elif base_init == "spectral":
            result = (
                SpectralEmbedding(n_components=n_components, random_state=random_state)
                .fit_transform(data)
                .astype(np.float32, order="C")
            )
            result -= result.mean()
            result /= (result.max() - result.min()) / 2.0
            return result
        elif base_init == "mds":
            result = (
                MDS(
                    n_components=n_components,
                    random_state=random_state,
                    n_init=1,
                    max_iter=300,
                )
                .fit_transform(data)
                .astype(np.float32, order="C")
            )
            result -= result.mean()
            result /= (result.max() - result.min()) / 2.0
            return result
        else:
            raise ValueError(
                "Unknown base initialization method. Should be one of ['random', 'pca', 'spectral', 'mds']"
            )

    labels = np.full(graph.shape[0], -1, dtype=np.int64)
    partition = label_prop_loop(
        graph.indptr,
        graph.indices,
        graph.data,
        labels,
        random_state,
        n_label_prop_iter,
        approx_n_parts,
    )
    base_reduction_map = csr_matrix(
        (np.ones(partition.shape[0]), partition, np.arange(partition.shape[0] + 1)),
        shape=(partition.shape[0], partition.max() + 1),
    )
    normalized_reduction_map = normalize(base_reduction_map, axis=0, norm="l2")
    data_reducer = normalize(normalized_reduction_map.T, norm="l1")
    if data is not None:
        reduced_data = data_reducer @ data
    else:
        reduced_data = None

    reduced_graph = normalized_reduction_map.T * graph * base_reduction_map
    reduced_graph.data = np.clip(reduced_graph.data, 0.0, 1.0)

    if recursive_init:
        reduced_init = label_propagation_init(
            reduced_graph,
            n_label_prop_iter=n_label_prop_iter,
            n_embedding_epochs=min(255, n_embedding_epochs),
            approx_n_parts=approx_n_parts // 4,
            n_components=n_components,
            scaling=scaling,
            random_scale=random_scale,
            noise_level=noise_level,
            random_state=random_state,
            data=reduced_data,
            recursive_init=True,
            upscaling=upscaling,
            base_init=base_init,
            base_init_threshold=base_init_threshold,
        )
    else:
        reduced_init = None

    reduced_layout = node_embedding(
        reduced_graph,
        n_components,
        n_embedding_epochs,
        verbose=False,
        noise_level=noise_level,
        random_state=random_state,
        initial_embedding=reduced_init,
        initial_alpha=0.001 * n_embedding_epochs,
    )

    if upscaling == "partition_expander":
        data_expander = normalize(
            (graph.multiply(graph.T)) @ normalized_reduction_map, norm="l1"
        )
        result = (
            data_expander @ reduced_layout
            + normalize(normalized_reduction_map, norm="l1") @ reduced_layout
        ) / 2.0
    elif upscaling == "jitter_expander":
        data_expander = normalize(
            (graph.multiply(graph.T)) @ normalized_reduction_map, norm="l1"
        )
        expanded = (
            data_expander @ reduced_layout
            + normalize(normalized_reduction_map, norm="l1") @ reduced_layout
        ) / 2.0
        jittered = reduced_layout[partition]
        jittered += random_state.normal(
            scale=random_scale / 4.0, size=(partition.shape[0], reduced_layout.shape[1])
        )
        result = (expanded + jittered) / 2.0
    else:
        result = reduced_layout[partition]
        result += random_state.normal(
            scale=random_scale, size=(partition.shape[0], reduced_layout.shape[1])
        )

    result = (scaling * (result - result.mean(axis=0))).astype(np.float32)
    return result