From fa0c2bc34687176973fd29cc1a32b828a244b4ee Mon Sep 17 00:00:00 2001 From: tdruez Date: Wed, 3 Dec 2025 11:36:20 +0400 Subject: [PATCH] Remove the dependency on scipy #1754 Signed-off-by: tdruez --- pyproject.toml | 1 - scanpipe/pipes/symbolmap.py | 18 ++++++++++++++++-- scanpipe/tests/pipes/test_symbolmap.py | 22 ++++++++++++++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b3fba73d78..83c18ac6a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,7 +100,6 @@ dependencies = [ "aboutcode.hashid==0.2.0", # AboutCode pipeline "aboutcode.pipeline==0.2.1", - "scipy==1.15.3", # 1.16.x requires Python >=3.11 # ScoreCode "scorecode==0.0.4", # Workaround issue https://github.com/aboutcode-org/scancode.io/issues/1885 diff --git a/scanpipe/pipes/symbolmap.py b/scanpipe/pipes/symbolmap.py index c724f70fd5..cf39381623 100644 --- a/scanpipe/pipes/symbolmap.py +++ b/scanpipe/pipes/symbolmap.py @@ -20,12 +20,11 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. 
import math


def jensenshannon(p, q):
    """
    Compute the Jensen-Shannon distance between two probability distributions.

    The distance is the square root of the Jensen-Shannon divergence::

        sqrt((KL(p || m) + KL(q || m)) / 2)    with m = (p + q) / 2

    using the natural logarithm. Identical distributions yield ``0.0`` and
    distributions with no overlap yield ``sqrt(ln(2))`` (about 0.8325546).

    ``p`` and ``q`` are iterables of non-negative probabilities of the same
    length. The values are used as provided: they are not normalized here.

    Return the distance as a float.

    Raise a ValueError when ``p`` and ``q`` do not have the same length.

    >>> jensenshannon([1.0, 0.0, 0.0], [1.0, 0.0, 0.0])
    0.0
    >>> round(jensenshannon([1.0, 0.0], [0.0, 1.0]), 7)
    0.8325546
    """
    # Materialize the inputs so that one-shot iterables are supported and the
    # lengths can be compared.
    p = tuple(p)
    q = tuple(q)

    # zip() would silently drop the extra entries of the longer input and
    # return a meaningless distance.
    if len(p) != len(q):
        raise ValueError(
            f"p and q must have the same length, got {len(p)} and {len(q)}."
        )

    left = 0.0
    right = 0.0
    for pi, qi in zip(p, q):
        mi = (pi + qi) / 2.0
        # Zero-probability entries contribute nothing (0 * log(0) := 0) and
        # are skipped to avoid evaluating log(0).
        if pi > 0:
            left += pi * math.log(pi / mi)
        if qi > 0:
            right += qi * math.log(qi / mi)

    divergence = (left + right) / 2.0

    # The divergence is never negative mathematically, but floating-point
    # rounding can leave a tiny negative value for near-identical inputs,
    # which would make math.sqrt() raise a "math domain error".
    return math.sqrt(max(divergence, 0.0))