Commit 6cbf209

add feature: parallel computing of NLE-GP.
1 parent e063999 commit 6cbf209

File tree: 2 files changed, +50 −22 lines


chemml/evaluator.py

Lines changed: 2 additions & 0 deletions
@@ -206,10 +206,12 @@ def set_model(self, args: TrainArgs):
                 batch_size=args.batch_size
             )
         elif args.model_type == 'gpr_nle':
+            n_jobs = 1 if self.args.graph_kernel_type == 'graph' else self.args.n_jobs
             self.model = NLEGPR(
                 kernel=self.kernel,
                 alpha=args.alpha_,
                 n_local=args.n_local,
+                n_jobs=n_jobs
             )
         elif args.model_type == 'gpc':
             self.model = GPC(
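
The evaluator pins prediction to a single process when the kernel operates on raw graphs and otherwise forwards the user-requested worker count into NLEGPR. A minimal sketch of that selection, where the helper name choose_n_jobs is hypothetical and only the graph_kernel_type and n_jobs attributes come from the diff:

# Hypothetical helper mirroring the n_jobs selection in set_model(); only
# the attribute names `graph_kernel_type` and `n_jobs` appear in the diff.
def choose_n_jobs(args):
    # Keep prediction serial for raw graph kernels; otherwise honour the
    # user-requested number of worker processes.
    return 1 if args.graph_kernel_type == 'graph' else args.n_jobs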

chemml/models/regression/ScalableGPR/NLE.py

Lines changed: 48 additions & 22 deletions
@@ -2,18 +2,23 @@
 # -*- coding: utf-8 -*-
 import numpy as np
 from tqdm import tqdm
+import threading
+from joblib import Parallel, delayed
+from sklearn.utils.fixes import _joblib_parallel_args
 from graphdot.linalg.cholesky import CholSolver


 class NaiveLocalExpertGP:
     """Transductive Naive Local Experts of Gaussian process regression.

     """
-    def __init__(self, kernel, alpha=1e-8, n_local=500, normalize_y=False, kernel_options={}):
+    def __init__(self, kernel, alpha=1e-8, n_local=500, normalize_y=False,
+                 n_jobs=1, kernel_options={}):
         self.kernel = kernel
         self.alpha = alpha
         self.n_local = n_local
         self.normalize_y = normalize_y
+        self.n_jobs = n_jobs
         self.kernel_options = kernel_options

     @property
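
The new imports bring in joblib's Parallel and delayed plus scikit-learn's private _joblib_parallel_args helper, which maps keywords such as prefer='processes' onto whatever the installed joblib version understands. With a reasonably recent joblib (0.12 or later), Parallel accepts prefer directly, so an equivalent call, shown here only as a sketch outside the commit, would be:

# Sketch: the same process-preferring fan-out without the private
# scikit-learn helper; assumes joblib >= 0.12, where `prefer` is accepted
# directly by Parallel.
from joblib import Parallel, delayed

out = Parallel(n_jobs=4, verbose=True, prefer='processes')(
    delayed(pow)(i, 2) for i in range(8))
# out == [0, 1, 4, 9, 16, 25, 36, 49]

Newer scikit-learn releases no longer ship _joblib_parallel_args, so passing prefer directly is the more portable spelling.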
@@ -79,27 +84,48 @@ def fit(self, X, y):
         self.X = X
         self.y = y

+    def predict_(self, Z, return_std=False):
+        Ks = self._gramian(Z, self.X)
+        local_idx = np.argsort(-Ks)[:, :min(self.n_local, Ks.shape[1])][0]
+        Ks_local = Ks[:, local_idx]
+        X_local = self.X[local_idx]
+        y_local = self.y[local_idx]
+        K_local = self._gramian(X_local)
+        Kinv_local, _ = self._invert(K_local)
+        Ky_local = Kinv_local @ y_local
+        y_mean = (Ks_local @ Ky_local) * self.y_std + self.y_mean
+        if return_std:
+            Kss = self._gramian(Z, diag=True)
+            y_std = np.sqrt(
+                np.maximum(0, Kss - (Ks_local @ (Kinv_local @ Ks_local.T)).diagonal())
+            )
+            return y_mean, y_std
+        else:
+            return y_mean
+
+    def _accumulate_prediction(self, Z, y_hat, u_hat, lock, return_std=False):
+        if return_std:
+            prediction, uncertainty = self.predict_(Z, return_std=True)
+            with lock:
+                y_hat.append(prediction)
+                u_hat.append(uncertainty)
+        else:
+            prediction = self.predict_(Z, return_std=False)
+            with lock:
+                y_hat.append(prediction)
+
     def predict(self, Z, return_std=False):
-        y_mean = []
-        y_std = []
-        for z in tqdm(Z, total=len(Z)):
-            Z_ = z.reshape(1, -1)
-            Ks = self._gramian(Z_, self.X)
-            local_idx = np.argsort(-Ks)[:, :min(self.n_local, Ks.shape[1])][0]
-            Ks_local = Ks[:, local_idx]
-            X_local = self.X[local_idx]
-            y_local = self.y[local_idx]
-            K_local = self._gramian(X_local)
-            Kinv_local, _ = self._invert(K_local)
-            Ky_local = Kinv_local @ y_local
-            y_mean.append((Ks_local @ Ky_local) * self.y_std + self.y_mean)
-            if return_std is True:
-                Kss = self._gramian(Z_, diag=True)
-                std = np.sqrt(
-                    np.maximum(0, Kss - (Ks_local @ (Kinv_local @ Ks_local.T)).diagonal())
-                )
-                y_std.append(std)
+        results = Parallel(
+            n_jobs=self.n_jobs, verbose=True,
+            **_joblib_parallel_args(prefer='processes'))(
+            delayed(self.predict_)(
+                z.reshape(1, -1),
+                return_std
+            )
+            for z in Z)
+        y_mean = np.asarray([result[0][0] for result in results])
         if return_std:
-            return np.concatenate(y_mean), np.concatenate(y_std)
+            y_std = np.asarray([result[1][0] for result in results])
+            return y_mean, y_std
         else:
-            return np.concatenate(y_mean)
+            return y_mean
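
predict() now farms each query out to worker processes with Parallel/delayed, which works because every query's local expert is solved independently: the query's kernel similarities to the training set select its n_local most similar points, and an exact GP is solved on that subset only. The following self-contained sketch reproduces that logic with a plain RBF kernel on numeric features as a stand-in for the chemml/graphdot kernel (the kernel choice, data, and function names are assumptions made for illustration only):

# Illustrative, self-contained sketch of the per-query local-expert GP that
# predict_() implements, followed by the joblib fan-out used in predict().
# Assumptions for this example only: a plain RBF kernel on numeric features
# stands in for the chemml/graphdot kernel, and the data are synthetic.
import numpy as np
from joblib import Parallel, delayed


def rbf(A, B, length_scale=1.0):
    """Stand-in kernel; the real class delegates to self._gramian()."""
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-0.5 * d2 / length_scale ** 2)


def local_expert_predict(z, X, y, n_local=50, alpha=1e-8):
    """Mean/std prediction for a single query from its n_local most similar points."""
    z = z.reshape(1, -1)
    Ks = rbf(z, X)                                        # similarities to training set
    idx = np.argsort(-Ks)[0, :min(n_local, X.shape[0])]   # most similar training points
    K = rbf(X[idx], X[idx]) + alpha * np.eye(len(idx))    # regularized local Gram matrix
    Kinv = np.linalg.inv(K)
    Ks_loc = Ks[:, idx]
    mean = (Ks_loc @ (Kinv @ y[idx])).item()
    var = 1.0 - (Ks_loc @ Kinv @ Ks_loc.T).item()         # k(z, z) == 1 for this RBF
    return mean, np.sqrt(max(var, 0.0))


if __name__ == '__main__':
    rng = np.random.default_rng(0)
    X = rng.uniform(-3, 3, size=(500, 1))
    y = np.sin(X[:, 0]) + 0.05 * rng.standard_normal(500)
    Z = np.linspace(-3, 3, 40).reshape(-1, 1)

    # Queries are independent, so they can be dispatched to worker processes
    # exactly as predict() does with Parallel/delayed.
    results = Parallel(n_jobs=2, prefer='processes')(
        delayed(local_expert_predict)(z, X, y) for z in Z)
    y_mean = np.asarray([m for m, _ in results])
    y_std = np.asarray([s for _, s in results])
    print(y_mean[:5], y_std[:5])

Because each call touches only an n_local x n_local matrix, the per-query cost stays bounded regardless of training-set size, which is the point of the naive local-expert approximation.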
