Source code for evoc.knn_graph

import numpy as np
import numba
import time

from sklearn.utils import check_array, check_random_state

from warnings import warn
from .float_nndescent import (
    make_float_forest,
    nn_descent_float,
    nn_descent_float_sorted,
)
from .uint8_nndescent import (
    make_uint8_forest,
    nn_descent_uint8,
    nn_descent_uint8_sorted,
)
from .int8_nndescent import make_int8_forest, nn_descent_int8, nn_descent_int8_sorted

INT32_MIN = np.iinfo(np.int32).min + 1
INT32_MAX = np.iinfo(np.int32).max - 1


# Generates a timestamp for use in logging messages when verbose=True
def ts():
    return time.ctime(time.time())


def make_forest(
    data,
    n_neighbors,
    n_trees,
    leaf_size,
    random_state,
    input_dtype,
    max_depth=200,
):
    """Build a random projection forest with ``n_trees``.

    Parameters
    ----------
    data
    n_neighbors
    n_trees
    leaf_size
    rng_state
    angular

    Returns
    -------
    forest: list
        A list of random projection trees.
    """

    if leaf_size is None:
        leaf_size = max(10, np.int32(n_neighbors))

    rng_states = random_state.randint(INT32_MIN, INT32_MAX, size=(n_trees, 3)).astype(
        np.int64
    )
    try:
        if input_dtype == np.uint8:
            result = make_uint8_forest(data, rng_states, leaf_size, max_depth)
        elif input_dtype == np.int8:
            result = make_int8_forest(data, rng_states, leaf_size, max_depth)
        else:
            result = make_float_forest(data, rng_states, leaf_size, max_depth)
    except (RuntimeError, RecursionError, SystemError):
        warn(
            "Random Projection forest initialisation failed due to recursion"
            "limit being reached. Something is a little strange with your "
            "graph_data, and this may take longer than normal to compute."
        )
        return np.empty((0, 0), dtype=np.int32)

    # different trees can end up with different max leaf_sizes if the tree depth is insufficient
    max_leaf_size = np.max([leaf_array.shape[1] for leaf_array in result])

    # pad each leaf_array from each tree out to the max_leaf_size from any tree
    # so that vstack can succeed. Check np.pad docs for the specific semantics
    return np.vstack(
        [
            np.pad(
                leaf_array,
                ((0, 0), (0, max_leaf_size - leaf_array.shape[1])),
                constant_values=-1,
            )
            for leaf_array in result
        ]
    )


def nn_descent(
    data,
    n_neighbors,
    rng_state,
    effective_max_candidates,
    n_iters,
    delta,
    input_dtype,
    leaf_array=None,
    verbose=False,
    use_sorted_updates=True,
    delta_improv=None,
):
    if input_dtype == np.uint8:
        if use_sorted_updates:
            neighbor_graph = nn_descent_uint8_sorted(
                data,
                n_neighbors,
                rng_state,
                effective_max_candidates,
                n_iters,
                delta,
                delta_improv=delta_improv,
                leaf_array=leaf_array,
                verbose=verbose,
            )
        else:
            neighbor_graph = nn_descent_uint8(
                data,
                n_neighbors,
                rng_state,
                effective_max_candidates,
                n_iters,
                delta,
                delta_improv=delta_improv,
                leaf_array=leaf_array,
                verbose=verbose,
            )
        neighbor_graph[1][:] = -np.log2(-neighbor_graph[1])
    elif input_dtype == np.int8:
        if use_sorted_updates:
            neighbor_graph = nn_descent_int8_sorted(
                data,
                n_neighbors,
                rng_state,
                effective_max_candidates,
                n_iters,
                delta,
                delta_improv=delta_improv,
                leaf_array=leaf_array,
                verbose=verbose,
            )
        else:
            neighbor_graph = nn_descent_int8(
                data,
                n_neighbors,
                rng_state,
                effective_max_candidates,
                n_iters,
                delta,
                delta_improv=delta_improv,
                leaf_array=leaf_array,
                verbose=verbose,
            )
        neighbor_graph[1][:] = 1.0 / (-neighbor_graph[1])
    else:
        if use_sorted_updates:
            neighbor_graph = nn_descent_float_sorted(
                data,
                n_neighbors,
                rng_state,
                effective_max_candidates,
                n_iters,
                delta,
                delta_improv=delta_improv,
                leaf_array=leaf_array,
                verbose=verbose,
            )
        else:
            neighbor_graph = nn_descent_float(
                data,
                n_neighbors,
                rng_state,
                effective_max_candidates,
                n_iters,
                delta,
                delta_improv=delta_improv,
                leaf_array=leaf_array,
                verbose=verbose,
            )
        neighbor_graph[1][:] = np.maximum(-np.log2(-neighbor_graph[1]), 0.0)

    return neighbor_graph


[docs] def knn_graph( data, n_neighbors=30, n_trees=None, leaf_size=None, random_state=None, max_candidates=None, max_rptree_depth=200, n_iters=None, delta=0.001, delta_improv=0.001, n_jobs=None, verbose=False, use_sorted_updates=True, ): """Construct a k-nearest neighbor graph using the NN-Descent algorithm. This function builds a k-nearest neighbor graph using random projection trees for initialization followed by the NN-Descent algorithm for refinement. It supports multiple data types (float32 for normalized embeddings, int8 for quantized embeddings, uint8 for binary embeddings) with appropriate distance metrics for each. Parameters ---------- data : array-like of shape (n_samples, n_features) The data for which to compute nearest neighbors. If float32, cosine distance is used. If int8, quantized cosine distance is used. If uint8, Jaccard distance (based on Hamming distance for binary embeddings) is used. n_neighbors : int, default=30 The number of nearest neighbors to compute for each sample. n_trees : int or None, default=None The number of random projection trees to build. If None, defaults to between 4 and 8 depending on the number of available threads. leaf_size : int or None, default=None The maximum number of points per leaf in the random projection trees. If None, defaults to max(10, n_neighbors). random_state : int, RandomState instance or None, default=None Controls the randomness of the algorithm. Pass an int for reproducible output across multiple function calls. max_candidates : int or None, default=None The maximum number of candidate neighbors to evaluate during NN-Descent. If None, defaults to min(60, int(n_neighbors * 1.5)). max_rptree_depth : int, default=200 Maximum depth of the random projection trees. n_iters : int or None, default=None Number of iterations for the NN-Descent algorithm. If None, defaults to max(5, int(round(log2(n_samples)))). delta : float, default=0.001 Convergence threshold for the NN-Descent algorithm. delta_improv : float, default=0.001 Improvement threshold for early stopping in NN-Descent. n_jobs : int or None, default=None The number of threads to use. If -1, uses all available threads. If None, preserves the current numba thread setting. verbose : bool, default=False If True, print progress messages during computation. use_sorted_updates : bool, default=True If True, uses a more efficient sorted update strategy in NN-Descent. Returns ------- neighbor_graph : tuple of (array, array) A tuple containing: - indices : array-like of shape (n_samples, n_neighbors) The indices of the k-nearest neighbors for each sample. - distances : array-like of shape (n_samples, n_neighbors) The distances from each sample to its k-nearest neighbors. Distances are transformed to a uniform scale based on the input dtype. """ if data.dtype == np.uint8: data = check_array(data, dtype=np.uint8, order="C") _input_dtype = np.uint8 _bit_trees = True elif data.dtype == np.int8: data = check_array(data, dtype=np.int8, order="C") _input_dtype = np.int8 _bit_trees = False else: norms = np.einsum("ij,ij->i", data, data) np.sqrt(norms, norms) norms[norms == 0.0] = 1.0 if np.allclose(norms, 1.0): # Data is already normalized, just ensure C-contiguity and float32 data = np.ascontiguousarray(data, dtype=np.float32) else: # Efficiently create a modifiable float32 C-contiguous copy data = np.array(data, dtype=np.float32, order="C", copy=True) data /= norms[:, np.newaxis] _input_dtype = np.float32 _bit_trees = False current_random_state = check_random_state(random_state) rng_state = current_random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64) # Set threading constraints _original_num_threads = numba.get_num_threads() if n_jobs != -1 and n_jobs is not None: numba.set_num_threads(n_jobs) if n_trees is None: n_trees = numba.get_num_threads() n_trees = max(4, min(8, n_trees)) # Only so many trees are useful if n_iters is None: n_iters = max(5, int(round(np.log2(data.shape[0])))) if verbose: print(ts(), "Building RP forest with", str(n_trees), "trees") leaf_array = make_forest( data, n_neighbors, n_trees, leaf_size, current_random_state, _input_dtype, max_depth=max_rptree_depth, ) if max_candidates is None: effective_max_candidates = min(60, int(n_neighbors * 1.5)) else: effective_max_candidates = max_candidates if verbose: print(ts(), "NN descent for", str(n_iters), "iterations") neighbor_graph = nn_descent( data, n_neighbors, rng_state, effective_max_candidates, n_iters, delta, _input_dtype, leaf_array=leaf_array, verbose=verbose, use_sorted_updates=use_sorted_updates, delta_improv=delta_improv, ) if np.any(neighbor_graph[0] < 0): warn( "Failed to correctly find n_neighbors for some samples." " Results may be less than ideal. Try re-running with" " different parameters." ) if n_jobs != -1 and n_jobs is not None: numba.set_num_threads(_original_num_threads) return neighbor_graph