Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,6 @@ dependencies = [
"aboutcode.hashid==0.2.0",
# AboutCode pipeline
"aboutcode.pipeline==0.2.1",
"scipy==1.15.3", # 1.16.x requires Python >=3.11
# ScoreCode
"scorecode==0.0.4",
# Workaround issue https://github.com/aboutcode-org/scancode.io/issues/1885
Expand Down
18 changes: 16 additions & 2 deletions scanpipe/pipes/symbolmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,11 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import math
from collections import Counter
from dataclasses import dataclass
from dataclasses import field

from scipy.spatial.distance import jensenshannon

from aboutcode.pipeline import LoopProgress
from scanpipe.models import CodebaseRelation
from scanpipe.pipes import flag
Expand Down Expand Up @@ -292,6 +291,21 @@ def get_symbols_probability_distribution(symbols, unique_symbols):
return probability_dist


def jensenshannon(p, q):
    """
    Compute the Jensen-Shannon distance between two probability distributions.

    ``p`` and ``q`` are iterables of probabilities compared position by position.
    They are consumed in a single pass, so one-shot iterators are supported.
    Pairs are formed with ``zip``: when the inputs differ in length, the extra
    trailing entries of the longer one are ignored.

    Return the square root of the Jensen-Shannon divergence computed with the
    natural logarithm: 0.0 for identical distributions, ``sqrt(ln(2))`` for
    distributions with no overlap.
    """
    left = 0.0
    right = 0.0
    for pi, qi in zip(p, q):
        # Midpoint of the two distributions at this position.
        mi = (pi + qi) / 2.0
        # Zero-probability entries contribute nothing (0 * log(0) is taken as 0),
        # and skipping them avoids calling log(0).
        if pi > 0:
            left += pi * math.log(pi / mi)
        if qi > 0:
            right += qi * math.log(qi / mi)

    divergence = (left + right) / 2.0
    # Rounding errors can leave a tiny negative value for near-identical inputs;
    # clamp it so math.sqrt does not raise a "math domain error".
    return math.sqrt(max(divergence, 0.0))


def get_similarity_between_source_and_deployed_symbols(
source_symbols,
deployed_symbols,
Expand Down
22 changes: 22 additions & 0 deletions scanpipe/tests/pipes/test_symbolmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,28 @@ def test_scanpipe_pipes_symbolmap_get_symbols_probability_distribution(self):
# print(result_prob_dist)
self.assertListEqual(result_prob_dist, expected_prob_dist)

def test_jensenshannon(self):
    """Check symbolmap.jensenshannon against hand-picked distribution pairs."""
    distance = symbolmap.jensenshannon

    # The same one-hot distribution on both sides gives exactly zero.
    one_hot_first = [1.0, 0.0, 0.0]
    one_hot_first_again = [1.0, 0.0, 0.0]
    self.assertEqual(distance(one_hot_first, one_hot_first_again), 0.0)

    # No shared mass at all gives the largest possible distance.
    only_first = [1.0, 0.0]
    only_second = [0.0, 1.0]
    disjoint_distance = distance(only_first, only_second)
    self.assertAlmostEqual(disjoint_distance, 0.8325546, places=5)

    # Half of the mass is shared between the two distributions.
    evenly_split = [0.5, 0.5]
    overlap_distance = distance([1.0, 0.0], evenly_split)
    self.assertAlmostEqual(overlap_distance, 0.46450140402245893, places=5)

    # Two identical uniform distributions also give exactly zero.
    uniform = [0.25, 0.25, 0.25, 0.25]
    uniform_again = [0.25, 0.25, 0.25, 0.25]
    self.assertEqual(distance(uniform, uniform_again), 0.0)

def test_get_similarity_between_source_and_deployed_symbols(
self,
):
Expand Down