Source code for evoc.knn_graph

import numpy as np
import numba
import time

from sklearn.utils import check_array, check_random_state

from warnings import warn
from .float_nndescent import (
    make_float_forest,
    nn_descent_float,
    nn_descent_float_sorted,
)
from .uint8_nndescent import (
    make_uint8_forest,
    nn_descent_uint8,
    nn_descent_uint8_sorted,
)
from .int8_nndescent import make_int8_forest, nn_descent_int8, nn_descent_int8_sorted

INT32_MIN = np.iinfo(np.int32).min + 1
INT32_MAX = np.iinfo(np.int32).max - 1


# Generates a timestamp for use in logging messages when verbose=True
def ts():
    return time.ctime(time.time())


def make_forest(
    data,
    n_neighbors,
    n_trees,
    leaf_size,
    random_state,
    input_dtype,
    max_depth=200,
):
    """Build a random projection forest with ``n_trees``.

    Parameters
    ----------
    data
    n_neighbors
    n_trees
    leaf_size
    rng_state
    angular

    Returns
    -------
    forest: list
        A list of random projection trees.
    """

    if leaf_size is None:
        leaf_size = max(10, np.int32(n_neighbors))

    rng_states = random_state.randint(INT32_MIN, INT32_MAX, size=(n_trees, 3)).astype(
        np.int64
    )
    try:
        if input_dtype == np.uint8:
            result = make_uint8_forest(data, rng_states, leaf_size, max_depth)
        elif input_dtype == np.int8:
            result = make_int8_forest(data, rng_states, leaf_size, max_depth)
        else:
            result = make_float_forest(data, rng_states, leaf_size, max_depth)
    except (RuntimeError, RecursionError, SystemError):
        warn(
            "Random Projection forest initialisation failed due to recursion"
            "limit being reached. Something is a little strange with your "
            "graph_data, and this may take longer than normal to compute."
        )
        return np.empty((0, 0), dtype=np.int32)

    # different trees can end up with different max leaf_sizes if the tree depth is insufficient
    max_leaf_size = np.max([leaf_array.shape[1] for leaf_array in result])

    # pad each leaf_array from each tree out to the max_leaf_size from any tree
    # so that vstack can succeed. Check np.pad docs for the specific semantics
    return np.vstack(
        [
            np.pad(
                leaf_array,
                ((0, 0), (0, max_leaf_size - leaf_array.shape[1])),
                constant_values=-1,
            )
            for leaf_array in result
        ]
    )


def nn_descent(
    data,
    n_neighbors,
    rng_state,
    effective_max_candidates,
    n_iters,
    delta,
    input_dtype,
    leaf_array=None,
    verbose=False,
    use_sorted_updates=True,
    delta_improv=None,
):
    if input_dtype == np.uint8:
        if use_sorted_updates:
            neighbor_graph = nn_descent_uint8_sorted(
                data,
                n_neighbors,
                rng_state,
                effective_max_candidates,
                n_iters,
                delta,
                delta_improv=delta_improv,
                leaf_array=leaf_array,
                verbose=verbose,
            )
        else:
            neighbor_graph = nn_descent_uint8(
                data,
                n_neighbors,
                rng_state,
                effective_max_candidates,
                n_iters,
                delta,
                delta_improv=delta_improv,
                leaf_array=leaf_array,
                verbose=verbose,
            )
        neighbor_graph[1][:] = -np.log2(-neighbor_graph[1])
    elif input_dtype == np.int8:
        if use_sorted_updates:
            neighbor_graph = nn_descent_int8_sorted(
                data,
                n_neighbors,
                rng_state,
                effective_max_candidates,
                n_iters,
                delta,
                delta_improv=delta_improv,
                leaf_array=leaf_array,
                verbose=verbose,
            )
        else:
            neighbor_graph = nn_descent_int8(
                data,
                n_neighbors,
                rng_state,
                effective_max_candidates,
                n_iters,
                delta,
                delta_improv=delta_improv,
                leaf_array=leaf_array,
                verbose=verbose,
            )
        neighbor_graph[1][:] = 1.0 / (-neighbor_graph[1])
    else:
        if use_sorted_updates:
            neighbor_graph = nn_descent_float_sorted(
                data,
                n_neighbors,
                rng_state,
                effective_max_candidates,
                n_iters,
                delta,
                delta_improv=delta_improv,
                leaf_array=leaf_array,
                verbose=verbose,
            )
        else:
            neighbor_graph = nn_descent_float(
                data,
                n_neighbors,
                rng_state,
                effective_max_candidates,
                n_iters,
                delta,
                delta_improv=delta_improv,
                leaf_array=leaf_array,
                verbose=verbose,
            )
        neighbor_graph[1][:] = np.maximum(-np.log2(-neighbor_graph[1]), 0.0)

    return neighbor_graph



[docs]
def knn_graph(
    data,
    n_neighbors=30,
    n_trees=None,
    leaf_size=None,
    random_state=None,
    max_candidates=None,
    max_rptree_depth=200,
    n_iters=None,
    delta=0.001,
    delta_improv=0.001,
    n_jobs=None,
    verbose=False,
    use_sorted_updates=True,
):
    """Construct a k-nearest neighbor graph using the NN-Descent algorithm.

    This function builds a k-nearest neighbor graph using random projection trees
    for initialization followed by the NN-Descent algorithm for refinement. It
    supports multiple data types (float32 for normalized embeddings, int8 for
    quantized embeddings, uint8 for binary embeddings) with appropriate distance
    metrics for each.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        The data for which to compute nearest neighbors. If float32, cosine distance
        is used. If int8, quantized cosine distance is used. If uint8, Jaccard
        distance (based on Hamming distance for binary embeddings) is used.

    n_neighbors : int, default=30
        The number of nearest neighbors to compute for each sample.

    n_trees : int or None, default=None
        The number of random projection trees to build. If None, defaults to
        between 4 and 8 depending on the number of available threads.

    leaf_size : int or None, default=None
        The maximum number of points per leaf in the random projection trees.
        If None, defaults to max(10, n_neighbors).

    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the algorithm. Pass an int for reproducible
        output across multiple function calls.

    max_candidates : int or None, default=None
        The maximum number of candidate neighbors to evaluate during NN-Descent.
        If None, defaults to min(60, int(n_neighbors * 1.5)).

    max_rptree_depth : int, default=200
        Maximum depth of the random projection trees.

    n_iters : int or None, default=None
        Number of iterations for the NN-Descent algorithm. If None, defaults to
        max(5, int(round(log2(n_samples)))).

    delta : float, default=0.001
        Convergence threshold for the NN-Descent algorithm.

    delta_improv : float, default=0.001
        Improvement threshold for early stopping in NN-Descent.

    n_jobs : int or None, default=None
        The number of threads to use. If -1, uses all available threads.
        If None, preserves the current numba thread setting.

    verbose : bool, default=False
        If True, print progress messages during computation.

    use_sorted_updates : bool, default=True
        If True, uses a more efficient sorted update strategy in NN-Descent.

    Returns
    -------
    neighbor_graph : tuple of (array, array)
        A tuple containing:
        - indices : array-like of shape (n_samples, n_neighbors)
            The indices of the k-nearest neighbors for each sample.
        - distances : array-like of shape (n_samples, n_neighbors)
            The distances from each sample to its k-nearest neighbors.
            Distances are transformed to a uniform scale based on the input dtype.
    """
    if data.dtype == np.uint8:
        data = check_array(data, dtype=np.uint8, order="C")
        _input_dtype = np.uint8
        _bit_trees = True
    elif data.dtype == np.int8:
        data = check_array(data, dtype=np.int8, order="C")
        _input_dtype = np.int8
        _bit_trees = False
    else:
        norms = np.einsum("ij,ij->i", data, data)
        np.sqrt(norms, norms)
        norms[norms == 0.0] = 1.0
        if np.allclose(norms, 1.0):
            # Data is already normalized, just ensure C-contiguity and float32
            data = np.ascontiguousarray(data, dtype=np.float32)
        else:
            # Efficiently create a modifiable float32 C-contiguous copy
            data = np.array(data, dtype=np.float32, order="C", copy=True)
            data /= norms[:, np.newaxis]
        _input_dtype = np.float32
        _bit_trees = False

    current_random_state = check_random_state(random_state)
    rng_state = current_random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    # Set threading constraints
    _original_num_threads = numba.get_num_threads()
    if n_jobs != -1 and n_jobs is not None:
        numba.set_num_threads(n_jobs)

    if n_trees is None:
        n_trees = numba.get_num_threads()
        n_trees = max(4, min(8, n_trees))  # Only so many trees are useful
    if n_iters is None:
        n_iters = max(5, int(round(np.log2(data.shape[0]))))

    if verbose:
        print(ts(), "Building RP forest with", str(n_trees), "trees")

    leaf_array = make_forest(
        data,
        n_neighbors,
        n_trees,
        leaf_size,
        current_random_state,
        _input_dtype,
        max_depth=max_rptree_depth,
    )

    if max_candidates is None:
        effective_max_candidates = min(60, int(n_neighbors * 1.5))
    else:
        effective_max_candidates = max_candidates

    if verbose:
        print(ts(), "NN descent for", str(n_iters), "iterations")

    neighbor_graph = nn_descent(
        data,
        n_neighbors,
        rng_state,
        effective_max_candidates,
        n_iters,
        delta,
        _input_dtype,
        leaf_array=leaf_array,
        verbose=verbose,
        use_sorted_updates=use_sorted_updates,
        delta_improv=delta_improv,
    )

    if np.any(neighbor_graph[0] < 0):
        warn(
            "Failed to correctly find n_neighbors for some samples."
            " Results may be less than ideal. Try re-running with"
            " different parameters."
        )

    if n_jobs != -1 and n_jobs is not None:
        numba.set_num_threads(_original_num_threads)
    return neighbor_graph