Make PyramidMatchingGraphKernel faster (#11)

simonschoelly · web-flow · commit f1d33aff0246 · 2021-04-23T03:57:56.000+02:00
diff --git a/src/graph-kernels/pyramid-match-graph-kernel.jl b/src/graph-kernels/pyramid-match-graph-kernel.jl
@@ -27,54 +27,86 @@ function preprocessed_form(kernel::PyramidMatchGraphKernel, g::AbstractGraph)
     # nv(g) x embedding_dim matrix
     embedding = _embedding(g, d)
 
-    # hist[l, d][i] contains number of points in the i-th bucket
-    # for layer l and dimension d
-    hists = Matrix{SparseVector{Int, Int}}(undef, L + 1, d)
-    for l in 0:L
-        for dd in 1:d
-            counts = zeros(Int, 2^l)
-            for i in 1:2^l
-                lo = -1.0 + (i - 1) * 2 / 2^l
-                hi = -1.0 + i * 2 / 2^l
-
-                for x in embedding[:, dd]
-                    if lo <= x < hi # this currently misses 1.0
-                        counts[i] += 1
-                    end
+    # TODO consider inplace sort!
+
+    # the j-th column of this matrix contains the j-th coordinates
+    # of the embedding vectors sorted in ascending order
+    ordered_points = sort(embedding; dims=1)
+
+    return ordered_points
+end
+
+function apply_preprocessed(kernel::PyramidMatchGraphKernel, points1, points2)
+
+    # TODO the embedding dim might actually be smaller for some graph
+    # we should maybe consider some kind of scaling in such a case
+    d = min(size(points1, 2), size(points2, 2))
+    L = kernel.histogram_levels
+    n1 = size(points1, 1)
+    n2 = size(points2, 1)
+
+    hist_intersect = zeros(Int64, L + 1)
+
+    # TODO ensure no int overflow, same result on 32 bit platform
+
+    # TODO we don't need to store hist_intersect as a vector
+    # TODO add @inbounds
+    hist_intersect[1] = d * min(n1, n2)
+    for l in 1:L
+        cell_boundaries = range(0.0, 1.0, length=2^l + 1)
+        for j in 1:d
+            i1 = 1
+            i2 = 1
+            while i1 <= n1
+                # TODO is is possible that searchsortedlast is not implemented
+                # efficiently on a StepRangeLen
+                cell_num = searchsortedlast(cell_boundaries, points1[i1, j])
+                # TODO maybe we should verify here, that cell_num is a valid index
+                cell_lower = cell_boundaries[cell_num]
+                cell_upper = cell_boundaries[cell_num + 1]
+                # TODO maybe we need some correction for rounding errors here
+
+                i1 += 1
+                num1 = 1
+                # the first loop is just for safety we probably don't need it
+                while i1 <= n1 && points1[i1, j] < cell_lower
+                    i1 += 1
+                end
+                # count number of vertices from g1 that fall into the bucket
+                # specified [cell_lower, cell_upper) along the j-th dimension
+                # of the hypercube
+                while i1 <= n1 && points1[i1, j] < cell_upper
+                    num1 += 1
+                    i1 += 1
+                end
+
+                # count number of vertices from g2 that fall into that bucket
+                # We could also use binary search here
+                while i2 <= n2 && points2[i2, j] < cell_lower
+                    i2 += 1
                 end
+                num2 = 0
+                while i2 <= n2 && points2[i2, j] < cell_upper
+                    num2 += 1
+                    i2 += 1
+                end
+                hist_intersect[l + 1] += min(num1, num2)
             end
-            hists[l + 1, dd] = sparsevec(counts)
         end
     end
-    return hists
-end
 
-function apply_preprocessed(kernel::PyramidMatchGraphKernel, hists1, hists2)
-
-    d = kernel.embedding_dim
-    L = kernel.histogram_levels
-    return _I(hists1, hists2, d, L + 1) +
-        sum(l -> 1 / 2^(L - l) * (_I(hists1, hists2, d, l + 1) -
-                                  _I(hists1, hists2, d, l + 2)), 0:L-1)
+    return hist_intersect[L + 1] + sum(l -> (hist_intersect[l+1] - hist_intersect[l+2]) / 2^(L - l), 0:L-1)
 end
 
 
 function _embedding(g::AbstractGraph, embedding_dim::Int)
 
-    n = nv(g)
-    A = zeros(max(embedding_dim, n), max(embedding_dim, n))
-    A[1:n, 1:n] = adjacency_matrix(g)
-    # TODO should we scale?
-    embedding = eigvecs(A; sortby=x -> -abs(x))[:, 1:embedding_dim]
+    evs = eigvecs(adjacency_matrix(g); sortby=x -> -abs(x))
+    embedding = abs.(@view evs[:, 1:min(size(evs, 2), embedding_dim)])
     # theoretically clamping is not necessary, this is just a precaution
-    # against rounding errors
-    clamp!(embedding, -1.0, 1.0)
+    # against rounding errors. Clamping to prevfloat(-1.0) should make some calculations easier
+    clamp!(embedding, 0.0, prevfloat(1.0))
     return embedding
 end
 
-function _I(hists1, hists2, embedding_dim, level)
-
-    return sum(dd -> sum(min.(hists1[level, dd], hists2[level, dd])), 1:embedding_dim)
-end
-