Commit a298a86
Simplified the test comparing sparse to dense to be more approachable, with proper backend adaptation
1 parent 4885368 commit a298a86

File tree

2 files changed: +61 -187 lines changed

ot/utils.py

Lines changed: 29 additions & 68 deletions
@@ -13,7 +13,6 @@
 import numpy as np
 from scipy.spatial.distance import cdist
 from scipy.sparse import coo_array
-from sklearn.neighbors import NearestNeighbors
 import sys
 import warnings
 from inspect import signature
@@ -436,75 +435,37 @@ def dist(
     return cdist(x1, x2, metric=metric)
 
 
-def dist_knn(
-    x1,
-    x2=None,
-    k=10,
-    metric="euclidean",
-    p=2,
-):
-    r"""Compute sparse k-nearest neighbors distance matrix in COO format
-
-    This function efficiently computes a sparse distance matrix containing only
-    the k-nearest neighbors for each sample, which is useful for large-scale
-    optimal transport problems where the full dense distance matrix would be
-    prohibitively large.
-
-    Parameters
-    ----------
-    x1 : array-like, shape (n1, d)
-        Matrix with `n1` samples of size `d`
-    x2 : array-like, shape (n2, d), optional
-        Matrix with `n2` samples of size `d` (if None then :math:`\mathbf{x_2} = \mathbf{x_1}`)
-    k : int, optional (default=10)
-        Number of nearest neighbors to keep for each sample
-    metric : str, optional (default='euclidean')
-        Distance metric to use. Supported metrics include: 'euclidean', 'manhattan',
-        'chebyshev', 'minkowski', 'cityblock', 'cosine', 'l1', 'l2', 'sqeuclidean',
-        and others supported by sklearn.neighbors.NearestNeighbors
-    p : float, optional (default=2)
-        Parameter for the Minkowski metric
-
-    Returns
-    -------
-    M_sparse : scipy.sparse.coo_array, shape (n1, n2)
-        Sparse distance matrix in COO format containing only k-nearest neighbors
-
-    """
-    nx = get_backend(x1, x2)
-
-    # Convert to numpy for k-NN computation
-    x1_np = nx.to_numpy(x1)
-    x2_np = nx.to_numpy(x2) if x2 is not None else x1_np
-
-    n1 = x1_np.shape[0]
-    n2 = x2_np.shape[0]
-    k_actual = min(k, n2)  # Handle case where k > n2
-
-    # Use sklearn's efficient k-NN implementation
-    metric_params = {}
-    if metric == "minkowski":
-        metric_params["p"] = p
-
-    nbrs = NearestNeighbors(
-        n_neighbors=k_actual,
-        algorithm="auto",
-        metric=metric,
-        metric_params=metric_params if metric_params else None,
-    )
-    nbrs.fit(x2_np)
-
-    # Find k-nearest neighbors and their distances
-    distances, indices = nbrs.kneighbors(x1_np)
-
-    # Build sparse matrix in COO format
-    rows = np.repeat(np.arange(n1), k_actual)
-    cols = indices.ravel()
-    data = distances.ravel()
+def get_sparse_test_matrices(n1, n2, k=2, seed=42, nx=None):
+    if nx is None:
+        nx = NumpyBackend()
+
+    rng = np.random.RandomState(seed)
+    M_orig = rng.rand(n1, n2)
+
+    mask = np.zeros((n1, n2))
+    for i in range(n1):
+        j_list = rng.choice(n2, min(k, n2), replace=False)
+        for j in j_list:
+            mask[i, j] = 1
+    for j in range(n2):
+        i_list = rng.choice(n1, min(k, n1), replace=False)
+        for i in i_list:
+            mask[i, j] = 1
+
+    M_sparse_np = coo_array(M_orig * mask)
+    rows, cols, data = M_sparse_np.row, M_sparse_np.col, M_sparse_np.data
+
+    if nx.__name__ == "numpy":
+        M_sparse = M_sparse_np
+    else:
+        rows_b = nx.from_numpy(rows.astype(np.int64))
+        cols_b = nx.from_numpy(cols.astype(np.int64))
+        data_b = nx.from_numpy(data)
+        M_sparse = nx.coo_matrix(data_b, rows_b, cols_b, shape=(n1, n2))
 
-    M_sparse = coo_array((data, (rows, cols)), shape=(n1, n2))
+    M_dense = nx.from_numpy(M_orig + 1e8 * (1 - mask))
 
-    return M_sparse
+    return M_sparse, M_dense
 
 
 def dist0(n, method="lin_square"):
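
For readers skimming the diff, here is a minimal sketch of what the new helper produces, using the default NumPy backend. It assumes a POT build that includes this commit, so that `ot.utils.get_sparse_test_matrices` is available; the sizes below are purely illustrative.

```python
# Minimal sketch, assuming a POT build that includes this commit so that
# ot.utils.get_sparse_test_matrices exists (NumPy backend by default).
import numpy as np
import ot

n1, n2, k = 20, 30, 2
M_sparse, M_dense = ot.utils.get_sparse_test_matrices(n1, n2, k=k, seed=0)

# Under the NumPy backend the sparse matrix is a scipy.sparse.coo_array
print(type(M_sparse).__name__, M_sparse.shape, M_sparse.nnz)

# On kept edges the dense matrix reproduces the sparse entries exactly;
# all masked-out entries carry the ~1e8 penalty so a dense solver avoids them.
assert np.allclose(M_dense[M_sparse.row, M_sparse.col], M_sparse.data)
assert M_dense.max() > 1e7
```

The `1e8` offset plays the role of an effectively infinite cost: the dense matrix describes the same edge set as the sparse one, so the dense and sparse solvers face the same transport problem.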

test/test_ot.py

Lines changed: 32 additions & 119 deletions
@@ -12,7 +12,6 @@
 import ot
 from ot.datasets import make_1D_gauss as gauss
 from ot.backend import torch, tf, get_backend
-from scipy.sparse import coo_array
 
 
 def test_emd_dimension_and_mass_mismatch():
@@ -918,156 +917,70 @@ def test_dual_variables():
 def test_emd_sparse_vs_dense(nx):
     """Test that sparse and dense EMD solvers produce identical results.
 
-    Uses augmented k-NN graph approach: first solves with dense solver to
-    identify needed edges, then compares both solvers on the same graph.
+    Uses random sparse graphs with k=2 edges per row/column, which guarantees
+    feasibility with uniform marginals.
     """
     # Skip for backends that don't support sparse matrices
     backend_name = nx.__class__.__name__.lower()
     if "jax" in backend_name or "tensorflow" in backend_name:
         pytest.skip("Backend does not support sparse matrices")
 
-    n_source = 100
-    n_target = 100
-    k = 10
+    n1 = 100
+    n2 = 100
+    k = 2
 
-    rng = np.random.RandomState(42)
+    M_sparse, M_dense = ot.utils.get_sparse_test_matrices(n1, n2, k=k, seed=42, nx=nx)
 
-    x_source = rng.randn(n_source, 2)
-    x_target = rng.randn(n_target, 2) + 0.5
+    a = ot.utils.unif(n1, type_as=M_dense)
+    b = ot.utils.unif(n2, type_as=M_dense)
 
-    a = ot.utils.unif(n_source)
-    b = ot.utils.unif(n_target)
-
-    C = ot.dist(x_source, x_target)
-
-    # Compute k-NN sparse cost matrix
-    C_knn = ot.utils.dist_knn(x_source, x_target, k=k, metric="sqeuclidean")
-
-    # First pass: solve with k-NN to identify active edges
-    large_cost = 1e8
-    C_dense_infty = np.full((n_source, n_target), large_cost)
-    C_knn_array = C_knn.toarray()
-    C_dense_infty[C_knn_array > 0] = C_knn_array[C_knn_array > 0]
-
-    G_dense_initial = ot.emd(a, b, C_dense_infty)
-    eps = 1e-9
-    active_mask = G_dense_initial > eps
-    knn_mask = C_knn_array > 0
-    extra_edges_mask = active_mask & ~knn_mask
-
-    rows_aug = []
-    cols_aug = []
-    data_aug = []
-
-    knn_rows, knn_cols = np.where(knn_mask)
-    for i, j in zip(knn_rows, knn_cols):
-        rows_aug.append(i)
-        cols_aug.append(j)
-        data_aug.append(C[i, j])
-
-    extra_rows, extra_cols = np.where(extra_edges_mask)
-    for i, j in zip(extra_rows, extra_cols):
-        rows_aug.append(i)
-        cols_aug.append(j)
-        data_aug.append(C[i, j])
-
-    C_augmented = coo_array(
-        (data_aug, (rows_aug, cols_aug)), shape=(n_source, n_target)
-    )
-
-    C_augmented_dense = np.full((n_source, n_target), large_cost)
-    C_augmented_dense[rows_aug, cols_aug] = data_aug
-
-    G_dense, log_dense = ot.emd(a, b, C_augmented_dense, log=True)
-    G_sparse, log_sparse = ot.emd(a, b, C_augmented, log=True)
+    # Solve with both dense and sparse solvers
+    G_dense, log_dense = ot.emd(a, b, M_dense, log=True)
+    G_sparse, log_sparse = ot.emd(a, b, M_sparse, log=True)
 
     cost_dense = log_dense["cost"]
     cost_sparse = log_sparse["cost"]
-
     np.testing.assert_allclose(cost_dense, cost_sparse, rtol=1e-5, atol=1e-7)
 
-    # For dense, G_dense is returned; for sparse, reconstruct from flow edges
     np.testing.assert_allclose(a, G_dense.sum(1), rtol=1e-5, atol=1e-7)
     np.testing.assert_allclose(b, G_dense.sum(0), rtol=1e-5, atol=1e-7)
 
-    # G_sparse is now returned as a sparse matrix
-    from scipy.sparse import issparse
+    assert nx.issparse(G_sparse), "Sparse solver should return a sparse matrix"
 
-    assert issparse(G_sparse), "Sparse solver should return a sparse matrix"
-
-    # Convert to dense for marginal checks
-    G_sparse_dense = G_sparse.toarray()
-    np.testing.assert_allclose(a, G_sparse_dense.sum(1), rtol=1e-5, atol=1e-7)
-    np.testing.assert_allclose(b, G_sparse_dense.sum(0), rtol=1e-5, atol=1e-7)
+    G_sparse_dense = nx.todense(G_sparse)
+    np.testing.assert_allclose(
+        a, nx.to_numpy(nx.sum(G_sparse_dense, 1)), rtol=1e-5, atol=1e-7
+    )
+    np.testing.assert_allclose(
+        b, nx.to_numpy(nx.sum(G_sparse_dense, 0)), rtol=1e-5, atol=1e-7
+    )
 
 
 def test_emd2_sparse_vs_dense(nx):
-    """Test that sparse and dense emd2 solvers produce identical results.
+    """Test that sparse and dense emd2 solvers produce identical costs.
 
-    Uses augmented k-NN graph approach: first solves with dense solver to
-    identify needed edges, then compares both solvers on the same graph.
+    Uses random sparse graphs with k=2 edges per row/column, which guarantees
+    feasibility with uniform marginals.
     """
     # Skip for backends that don't support sparse matrices
     backend_name = nx.__class__.__name__.lower()
     if "jax" in backend_name or "tensorflow" in backend_name:
         pytest.skip("Backend does not support sparse matrices")
 
-    n_source = 100
-    n_target = 100
-    k = 10
-
-    rng = np.random.RandomState(42)
-
-    x_source = rng.randn(n_source, 2)
-    x_target = rng.randn(n_target, 2) + 0.5
-
-    a = ot.utils.unif(n_source)
-    b = ot.utils.unif(n_target)
-
-    C = ot.dist(x_source, x_target)
-
-    # Compute k-NN sparse cost matrix
-    C_knn = ot.utils.dist_knn(x_source, x_target, k=k, metric="sqeuclidean")
-
-    # First pass: solve with k-NN to identify active edges
-    large_cost = 1e8
-    C_dense_infty = np.full((n_source, n_target), large_cost)
-    C_knn_array = C_knn.toarray()
-    C_dense_infty[C_knn_array > 0] = C_knn_array[C_knn_array > 0]
+    n1 = 100
+    n2 = 150
+    k = 2
 
-    G_dense_initial = ot.emd(a, b, C_dense_infty)
+    M_sparse, M_dense = ot.utils.get_sparse_test_matrices(n1, n2, k=k, seed=43, nx=nx)
 
-    eps = 1e-9
-    active_mask = G_dense_initial > eps
-    knn_mask = C_knn_array > 0
-    extra_edges_mask = active_mask & ~knn_mask
+    a = ot.utils.unif(n1, type_as=M_dense)
+    b = ot.utils.unif(n2, type_as=M_dense)
 
-    rows_aug = []
-    cols_aug = []
-    data_aug = []
-
-    knn_rows, knn_cols = np.where(knn_mask)
-    for i, j in zip(knn_rows, knn_cols):
-        rows_aug.append(i)
-        cols_aug.append(j)
-        data_aug.append(C[i, j])
-
-    extra_rows, extra_cols = np.where(extra_edges_mask)
-    for i, j in zip(extra_rows, extra_cols):
-        rows_aug.append(i)
-        cols_aug.append(j)
-        data_aug.append(C[i, j])
-
-    C_augmented = coo_array(
-        (data_aug, (rows_aug, cols_aug)), shape=(n_source, n_target)
-    )
-
-    C_augmented_dense = np.full((n_source, n_target), large_cost)
-    C_augmented_dense[rows_aug, cols_aug] = data_aug
-
-    cost_dense = ot.emd2(a, b, C_augmented_dense)
-    cost_sparse = ot.emd2(a, b, C_augmented)
+    # Solve with both dense and sparse solvers
+    cost_dense = ot.emd2(a, b, M_dense)
+    cost_sparse = ot.emd2(a, b, M_sparse)
 
+    # Check costs match
     np.testing.assert_allclose(cost_dense, cost_sparse, rtol=1e-5, atol=1e-7)
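
To close the loop, here is a standalone sketch of the behaviour the two rewritten tests pin down, run directly under the NumPy backend. It assumes a POT build containing this commit, where `ot.emd` and `ot.emd2` accept a sparse COO cost matrix and `ot.emd` returns the plan in sparse form, as the tests above assert.

```python
# Standalone sketch of the tests' core assertions, using the NumPy backend.
# Assumes a POT build that includes this commit (sparse cost support in
# ot.emd / ot.emd2 and the ot.utils.get_sparse_test_matrices helper).
import numpy as np
import ot

n1, n2 = 50, 50
M_sparse, M_dense = ot.utils.get_sparse_test_matrices(n1, n2, k=2, seed=42)
a = ot.utils.unif(n1)
b = ot.utils.unif(n2)

# Dense solver on the penalized full matrix vs. sparse solver on the COO
# graph: both should report the same optimal cost.
cost_dense = ot.emd2(a, b, M_dense)
cost_sparse = ot.emd2(a, b, M_sparse)
np.testing.assert_allclose(cost_dense, cost_sparse, rtol=1e-5, atol=1e-7)

# The sparse plan stays sparse and satisfies both marginals.
G_sparse, log_sparse = ot.emd(a, b, M_sparse, log=True)
G = G_sparse.toarray()
np.testing.assert_allclose(a, G.sum(axis=1), rtol=1e-5, atol=1e-7)
np.testing.assert_allclose(b, G.sum(axis=0), rtol=1e-5, atol=1e-7)
```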