import numpy as np
import numba
from tqdm import tqdm
INT32_MIN = np.iinfo(np.int32).min + 1
INT32_MAX = np.iinfo(np.int32).max - 1
def make_epochs_per_sample(weights, n_epochs):
result = np.full(weights.shape[0], n_epochs, dtype=np.float32)
n_samples = np.maximum(n_epochs * (weights / weights.max()), 1.0)
result = float(n_epochs) / np.float32(n_samples)
return result
@numba.njit(
"f4(f4[::1],f4[::1])",
fastmath=True,
cache=True,
locals={
"result": numba.types.float32,
"diff": numba.types.float32,
"dim": numba.types.intp,
"i": numba.types.intp,
},
)
def rdist(x, y):
result = 0.0
dim = x.shape[0]
for i in range(dim):
diff = x[i] - y[i]
result += diff * diff
return result
@numba.njit(inline="always")
def clip(val, lo, hi):
if val > hi:
return hi
elif val < lo:
return lo
else:
return val
@numba.njit(
"void(f4[:,::1],u4[::1],u4[::1],u4,f4[::1],u4,u1,f4,f4[::1],f4[::1],f4[::1],u1,f4)",
fastmath=True,
parallel=True,
cache=True,
locals={
"i": numba.uint32,
"j": numba.uint32,
"k": numba.uint32,
"di": numba.uint8,
"p": numba.uint8,
"n_neg_samples": numba.uint8,
"dist_squared": numba.float32,
"grad_coeff": numba.float32,
"current": numba.float32[::1],
"other": numba.float32[::1],
},
)
def node_embedding_epoch(
embedding,
head,
tail,
n_vertices,
epochs_per_sample,
rng_state,
dim,
alpha,
epochs_per_negative_sample,
epoch_of_next_negative_sample,
epoch_of_next_sample,
n,
noise_level,
):
for i in numba.prange(epochs_per_sample.shape[0]):
if epoch_of_next_sample[i] <= n:
j = head[i]
k = tail[i]
current = embedding[j]
other = embedding[k]
dist_squared = rdist(current, other)
if dist_squared > 0.0:
dist = np.sqrt(dist_squared)
grad_coeff = (-2.0 * noise_level * dist - 2.0) / (
2.0 * dist_squared - 0.5 * dist + 1.0
)
for di in range(dim):
grad_d = grad_coeff * (current[di] - other[di])
current[di] += grad_d * alpha
other[di] += -grad_d * alpha
epoch_of_next_sample[i] += epochs_per_sample[i]
n_neg_samples = int(
(n - epoch_of_next_negative_sample[i]) / epochs_per_negative_sample[i]
)
for p in range(n_neg_samples):
k = ((n + p) * i * rng_state) % n_vertices
other = embedding[k]
dist_squared = rdist(current, other)
if dist_squared > 1e-2:
grad_coeff = 4.0 / ((1.0 + 0.25 * dist_squared) * dist_squared)
for di in range(dim):
grad_d = clip(grad_coeff * (current[di] - other[di]), -4, 4)
current[di] += grad_d * alpha
epoch_of_next_negative_sample[i] += (
n_neg_samples * epochs_per_negative_sample[i]
)
@numba.njit(
"void(f4[:, ::1], u4[::1], u4[::1], u4, f4[::1], u4, u1, f4, f4[::1], f4[::1], f4[::1], u1, f4, f4, f4[:, ::1], u4[::1], u4)",
fastmath=True,
parallel=True,
cache=True,
locals={
"updates": numba.types.float32[:, ::1],
"from_node": numba.types.intp,
"to_node": numba.types.intp,
"raw_index": numba.types.intp,
"dist_squared": numba.types.float32,
"dist": numba.types.float32,
"grad_coeff": numba.types.float32,
"grad_d": numba.types.float32,
"current": numba.types.float32[::1],
"other": numba.types.float32[::1],
"block_start": numba.types.intp,
"block_end": numba.types.intp,
"node_idx": numba.types.intp,
"d": numba.types.uint8,
"n": numba.types.uint8,
"p": numba.types.uint8,
"n_neg_samples": numba.types.uint8,
},
)
def node_embedding_epoch_repr(
embedding,
csr_indptr,
csr_indices,
n_vertices,
epochs_per_sample,
rng_state,
dim,
alpha,
epochs_per_negative_sample,
epoch_of_next_negative_sample,
epoch_of_next_sample,
n,
noise_level,
gamma,
updates,
node_order,
block_size=4096,
):
for block_start in range(0, n_vertices, block_size):
block_end = min(block_start + block_size, n_vertices)
for node_idx in numba.prange(block_start, block_end):
from_node = node_order[node_idx]
current = embedding[from_node]
for raw_index in range(csr_indptr[from_node], csr_indptr[from_node + 1]):
if epoch_of_next_sample[raw_index] <= n:
to_node = csr_indices[raw_index]
other = embedding[to_node]
dist_squared = rdist(current, other)
if dist_squared > 0.0:
dist = np.sqrt(dist_squared)
grad_coeff = (-2.0 * noise_level * dist - 2.0) / (
2.0 * dist_squared - 0.5 * dist + 1.0
)
for d in range(dim):
grad_d = grad_coeff * (current[d] - other[d])
updates[from_node, d] += grad_d * alpha
epoch_of_next_sample[raw_index] += epochs_per_sample[raw_index]
n_neg_samples = int(
(n - epoch_of_next_negative_sample[raw_index])
/ epochs_per_negative_sample[raw_index]
)
for p in range(n_neg_samples):
to_node = node_order[
(raw_index * (n + p + 1) * rng_state) % n_vertices
]
other = embedding[to_node]
dist_squared = rdist(current, other)
if dist_squared > 1e-2:
grad_coeff = (
gamma
* 4.0
/ ((1.0 + 0.25 * dist_squared) * dist_squared)
)
# grad_coeff /= n_neg_samples
if grad_coeff > 0.0:
for d in range(dim):
grad_d = clip(
grad_coeff * (current[d] - other[d]), -4, 4
)
updates[from_node, d] += grad_d * alpha
epoch_of_next_negative_sample[raw_index] += (
n_neg_samples * epochs_per_negative_sample[raw_index]
)
for node_idx in numba.prange(block_start, block_end):
from_node = node_order[node_idx]
for d in range(dim):
embedding[from_node, d] += updates[from_node, d]
[docs]
def node_embedding(
graph,
n_components,
n_epochs,
initial_embedding=None,
initial_alpha=0.5,
negative_sample_rate=1.0,
noise_level=0.5,
random_state=None,
reproducible_flag=True,
verbose=False,
tqdm_kwds={},
):
"""Learn a low-dimensional embedding of a graph using a UMAP-like algorithm.
This function performs stochastic gradient descent optimization to learn a
low-dimensional embedding of graph structure. It uses both positive (connected
edges) and negative (random) samples to guide the optimization.
Parameters
----------
graph : scipy.sparse matrix, typically csr_matrix or csc_matrix
A sparse adjacency matrix representing the graph. The weights in the matrix
represent connection strengths between nodes.
n_components : int
The number of dimensions in the output embedding.
n_epochs : int
The number of epochs to train the embedding.
initial_embedding : array-like of shape (n_vertices, n_components) or None, default=None
An initial embedding to use as a starting point. If None, a random
embedding is generated from a normal distribution with scale 0.25.
initial_alpha : float, default=0.5
The initial learning rate. The learning rate decays linearly over epochs.
negative_sample_rate : float, default=1.0
The rate at which negative samples are drawn relative to positive samples.
Controls the ratio of negative to positive updates per epoch.
noise_level : float, default=0.5
Controls the strength of noise in the gradient computation. Higher values
increase the tolerance for larger distances before penalizing in the
embedding space.
random_state : RandomState instance or None, default=None
Random state for reproducibility. If None, uses system randomness.
reproducible_flag : bool, default=True
If True, uses a deterministic (but slower) update strategy that processes
nodes in blocks for reproducibility. If False, uses a faster stochastic
approach.
verbose : bool, default=False
If True, display a progress bar during training.
tqdm_kwds : dict, default={}
Additional keyword arguments to pass to tqdm for progress bar customization.
Returns
-------
embedding : array-like of shape (n_vertices, n_components)
The learned low-dimensional embedding of the graph vertices.
"""
if random_state is None:
random_state = np.random.RandomState()
if initial_embedding is None:
embedding = random_state.normal(
scale=0.25, size=(graph.shape[0], n_components)
).astype(np.float32, order="C")
else:
embedding = initial_embedding
epochs_per_sample = make_epochs_per_sample(graph.data, n_epochs).astype(
np.float32, order="C"
)
epochs_per_negative_sample = epochs_per_sample / negative_sample_rate
if reproducible_flag:
epochs_per_negative_sample *= 1.5
epoch_of_next_negative_sample = epochs_per_negative_sample.copy()
epoch_of_next_sample = epochs_per_sample.copy()
if tqdm_kwds is None:
tqdm_kwds = {}
if "disable" not in tqdm_kwds:
tqdm_kwds["disable"] = not verbose
rng_val = random_state.randint(INT32_MAX, size=n_epochs)
coo_graph = graph.tocoo()
head_u4 = coo_graph.row.astype(np.uint32)
tail_u4 = coo_graph.col.astype(np.uint32)
# New
csr_indptr = graph.indptr.astype(np.uint32)
csr_indices = graph.indices.astype(np.uint32)
updates = np.zeros_like(embedding)
node_order = np.arange(graph.shape[0], dtype=np.uint32)
gamma_schedule = np.linspace(0.5, 1.5, n_epochs)
# End new
n_vertices = np.uint32(graph.shape[0])
block_size = max(1024, n_vertices // 8)
dim = np.uint8(embedding.shape[1])
alpha = np.float32(initial_alpha)
for n in tqdm(range(n_epochs), **tqdm_kwds):
if not reproducible_flag:
node_embedding_epoch(
embedding,
head_u4,
tail_u4,
n_vertices,
epochs_per_sample,
rng_val[n],
dim,
alpha,
epochs_per_negative_sample,
epoch_of_next_negative_sample,
epoch_of_next_sample,
n,
noise_level,
)
else:
node_embedding_epoch_repr(
embedding,
csr_indptr,
csr_indices,
n_vertices,
epochs_per_sample,
np.uint32(rng_val[n]),
dim,
alpha,
epochs_per_negative_sample,
epoch_of_next_negative_sample,
epoch_of_next_sample,
np.uint8(n),
np.float32(noise_level),
gamma_schedule[n],
updates,
node_order,
np.uint32(block_size),
)
updates *= (1.0 - alpha) ** 2 * 0.5
random_state.shuffle(node_order)
alpha = np.float32(initial_alpha * (1.0 - (float(n) / float(n_epochs))))
return embedding