Merge pull request #36 from lucasimi/feature/remove-grid-cover

lucasimi · web-flow · commit 165e0528aef9 · 2024-01-26T08:21:45.000+01:00
Feature/remove grid cover
diff --git a/README.md b/README.md
@@ -39,56 +39,13 @@ pip install git+https://github.com/lucasimi/tda-mapper-python.git@develop
 
 ## A worked out example
 
-In order to show how to use this package, we perform some analysis on the the well known dataset of hand written digits (more info [here](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html)), consisting of less than 2000 8x8 pictures represented as arrays of 64 elements.
-
-```python
-import numpy as np
-
-from sklearn.datasets import load_digits
-from sklearn.cluster import AgglomerativeClustering
-from sklearn.decomposition import PCA
-
-from tdamapper.core import *
-from tdamapper.cover import *
-from tdamapper.clustering import *
-from tdamapper.plot import *
-
-import matplotlib
-
-digits = load_digits()
-X, y = [np.array(x) for x in digits.data], digits.target
-lens = PCA(2).fit_transform(X)
-
-mapper_algo = MapperAlgorithm(
-    cover=GridCover(n_intervals=10, overlap_frac=0.65),
-    clustering=AgglomerativeClustering(10),
-    verbose=True,
-    n_jobs=8)
-mapper_graph = mapper_algo.fit_transform(X, lens)
-
-mapper_plot = MapperPlot(X, mapper_graph,
-    colors=y, 
-    cmap='jet', 
-    agg=np.nanmean,
-    dim=2,
-    iterations=400)
-fig_mean = mapper_plot.plot(title='digit (mean)', width=600, height=600)
-fig_mean.show(config={'scrollZoom': True})
-```
+[In this file](tests/example.py) you can find a worked out example that shows how to use this package.
+We perform some analysis on the well-known dataset of [handwritten digits](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html), consisting of fewer than 2000 8x8 pictures represented as arrays of 64 elements.
 
 ![The mapper graph of the digits dataset, colored according to mean value](resources/digits_mean.png)
 
 It's also possible to obtain a new plot colored according to different values, while keeping the same computed geometry. For example, if we want to visualize how much dispersion we have on each cluster, we could plot colors according to the standard deviation
 
-```python
-fig_std = mapper_plot.with_colors(
-    colors=y, 
-    cmap='viridis', 
-    agg=np.nanstd,
-).plot(title='digit (std)', width=600, height=600)
-fig_std.show(config={'scrollZoom': True})
-```
-
 ![The mapper graph of the digits dataset, colored according to std](resources/digits_std.png)
 
 The mapper graph of the digits dataset shows a few interesting patterns. For example, we can make the following observations:
@@ -107,7 +64,7 @@ The mapper graph of the digits dataset shows a few interesting patterns. For exa
     - [x] custom metrics
 
 - [x] Cover algorithms:
-    - [x] `GridCover`
+    - [x] `CubicalCover`
     - [x] `BallCover`
     - [x] `KnnCover`
 
diff --git a/src/tdamapper/clustering.py b/src/tdamapper/clustering.py
@@ -2,7 +2,7 @@
 import numpy as np
 from tdamapper.core import build_labels_par, build_connected_components, MapperAlgorithm
 from tdamapper.utils.unionfind import UnionFind
-from tdamapper.cover import TrivialCover, GridCover, BallCover, KNNCover
+from tdamapper.cover import TrivialCover, CubicalCover, BallCover, KNNCover
 
 
 _logger = logging.getLogger(__name__)
@@ -82,7 +82,7 @@ def fit(self, X, y=None):
 class MapperGraphClustering:
 
     def __init__(self,
-            cover='grid',
+            cover='cubical',
             n_intervals=10,
             overlap_frac=0.25,
             radius=0.5,
@@ -104,8 +104,8 @@ def fit(self, X, y=None):
     def __get_cover(self):
         if self.cover == 'trivial':
             return TrivialCover()
-        elif self.cover == 'grid':
-            return GridCover(n_intervals=self.n_intervals, overlap_frac=self.overlap_frac)
+        elif self.cover == 'cubical':
+            return CubicalCover(n_intervals=self.n_intervals, overlap_frac=self.overlap_frac)
         elif self.cover == 'ball':
             return BallCover(radius=self.radius, metric=self.metric)
         elif self.cover == 'knn':
diff --git a/src/tdamapper/cover.py b/src/tdamapper/cover.py
@@ -64,69 +64,6 @@ def search(self, x):
         return [x for (x, _) in neighs]
 
 
-class GridCover:
-
-    def __init__(self, n_intervals, overlap_frac):
-        self.n_intervals = n_intervals
-        self.overlap_frac = overlap_frac
-
-    def proximity(self):
-        return GridProximity(self.n_intervals, self.overlap_frac)
-
-
-class GridProximity:
-
-    def __init__(self, n_intervals, overlap_frac):
-        self.__n_intervals = n_intervals
-        self.__overlap_frac = overlap_frac
-        self.__radius = (1.0 + self.__overlap_frac) / 2.0
-        self.__minimum = None
-        self.__maximum = None
-        self.__delta = None
-        metric = self._pullback(self._gamma_n, self._l_infty)
-        self.__ball_proximity = BallCover(self.__radius, metric).proximity()
-
-    def _l_infty(self, x, y):
-        return np.max(np.abs(x - y)) # in alternative: np.linalg.norm(x - y, ord=np.inf)
-
-    def _gamma_n(self, x):
-        return self.__n_intervals * (x - self.__minimum) / self.__delta
-
-    def _gamma_n_inv(self, x):
-        return self.__minimum + self.__delta * x / self.__n_intervals
-
-    def _rho(self, x):
-        return x.round()
-
-    def _phi(self, x):
-        return self._gamma_n_inv(self._rho(self._gamma_n(x)))
-
-    def _pullback(self, fun, dist):
-        return lambda x, y: dist(fun(x), fun(y))
-
-    def _set_bounds(self, data):
-        if (data is None) or len(data) == 0:
-            return
-        minimum, maximum = data[0], data[0]
-        eps = np.finfo(np.float64).eps
-        for w in data:
-            minimum = np.minimum(minimum, np.array(w))
-            maximum = np.maximum(maximum, np.array(w))
-        self.__minimum = np.nan_to_num(minimum, nan=-eps)
-        self.__maximum = np.nan_to_num(maximum, nan=eps)
-        delta = self.__maximum - self.__minimum
-        eps = np.finfo(np.float64).eps
-        self.__delta = np.maximum(eps, delta)
-
-    def fit(self, X):
-        self._set_bounds(X)
-        self.__ball_proximity.fit(X)
-        return
-
-    def search(self, x):
-        return self.__ball_proximity.search(self._phi(x))
-
-
 class CubicalCover:
 
     def __init__(self, n_intervals, overlap_frac):
@@ -159,7 +96,7 @@ def _gamma_n_inv(self, x):
         return self.__minimum + self.__delta * x / self.__n_intervals
 
     def _rho(self, x):
-        return x.round() + 0.5
+        return np.floor(x) + 0.5
 
     def _phi(self, x):
         return self._gamma_n_inv(self._rho(self._gamma_n(x)))
diff --git a/tests/example.py b/tests/example.py
@@ -0,0 +1,39 @@
+import numpy as np
+
+from sklearn.datasets import load_digits
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.decomposition import PCA
+
+from tdamapper.core import MapperAlgorithm
+from tdamapper.cover import CubicalCover
+from tdamapper.clustering import PermissiveClustering
+from tdamapper.plot import MapperPlot
+
+X, y = load_digits(return_X_y=True)             # We load a labelled dataset
+lens = PCA(2).fit_transform(X)                  # We compute the lens values
+
+mapper_algo = MapperAlgorithm(
+    cover=CubicalCover(
+        n_intervals=10,
+        overlap_frac=0.65),
+    clustering=PermissiveClustering(            # We prevent clustering failures
+        clustering=AgglomerativeClustering(10),
+        verbose=False),
+    n_jobs=1)
+mapper_graph = mapper_algo.fit_transform(X, lens)
+
+mapper_plot = MapperPlot(X, mapper_graph,
+    colors=y,                                   # We color according to digit values
+    cmap='jet',                                 # Jet colormap, used for classes
+    agg=np.nanmean,                             # We aggregate on graph nodes according to mean
+    dim=2,
+    iterations=400)
+fig_mean = mapper_plot.plot(title='digit (mean)', width=600, height=600)
+#fig_mean.show(config={'scrollZoom': True})     # Uncomment to show the plot
+
+fig_std = mapper_plot.with_colors(              # We reuse the graph plot with the same positions
+    colors=y,
+    cmap='viridis',                             # Viridis colormap, used for ranges
+    agg=np.nanstd,                              # We aggregate on graph nodes according to std
+).plot(title='digit (std)', width=600, height=600)
+#fig_std.show(config={'scrollZoom': True})      # Uncomment to show the plot
diff --git a/tests/test_cover.py b/tests/test_cover.py
@@ -1,7 +1,7 @@
 import unittest
 
 import numpy as np
-from tdamapper.cover import TrivialCover, BallCover, KNNCover, GridCover
+from tdamapper.cover import TrivialCover, BallCover, KNNCover, CubicalCover
 from tdamapper.core import ProximityNet
 
 
@@ -37,10 +37,10 @@ def testKnnCover(self):
         charts = list(ProximityNet(cover).proximity_net(data))
         self.assertEqual(2, len(charts))
 
-    def testGridCover(self):
+    def testCubicalCover(self):
         data = [
             np.array([0.0, 1.0]), np.array([1.1, 0.0]),
             np.array([0.0, 0.0]), np.array([1.1, 1.0])]
-        cover = GridCover(2, 0.5)
+        cover = CubicalCover(2, 0.5)
         charts = list(ProximityNet(cover).proximity_net(data))
         self.assertEqual(4, len(charts))
diff --git a/tests/test_readme.py b/tests/test_readme.py
@@ -0,0 +1,7 @@
+import unittest
+
+
+class TestReadme(unittest.TestCase):
+
+    def testRun(self):
+        import tests.example
diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py
@@ -7,7 +7,7 @@
 from sklearn.cluster import KMeans
 
 from tdamapper.clustering import TrivialClustering, CoverClustering, PermissiveClustering, MapperGraphClustering
-from tdamapper.cover import TrivialCover, BallCover, KNNCover, GridCover
+from tdamapper.cover import TrivialCover, BallCover, KNNCover, CubicalCover
 
 
 def euclidean(x, y):
@@ -77,14 +77,14 @@ def get_cover(self):
         return BallCover(radius=self.radius, metric=self.metric)
 
 
-class GridCoverEstimator(CoverClusteringEstimator):
+class CubicalCoverEstimator(CoverClusteringEstimator):
 
     def __init__(self, n_intervals=10, overlap_frac=0.25):
         self.n_intervals = n_intervals
         self.overlap_frac = overlap_frac
 
     def get_cover(self):
-        return GridCover(n_intervals=self.n_intervals, overlap_frac=self.overlap_frac)
+        return CubicalCover(n_intervals=self.n_intervals, overlap_frac=self.overlap_frac)
 
 
 class KNNCoverEstimator(CoverClusteringEstimator):
@@ -114,8 +114,8 @@ def testClustering(self):
     def testBall(self):
         check_estimator(BallCoverEstimator())
 
-    def testGrid(self):
-        check_estimator(GridCoverEstimator())
+    def testCubical(self):
+        check_estimator(CubicalCoverEstimator())
 
     def testKNN(self):
         check_estimator(KNNCoverEstimator())