-
Notifications
You must be signed in to change notification settings - Fork 9
[WIP] Mp/dispersion smoothing #145
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: development
Are you sure you want to change the base?
Changes from 4 commits
c001fef
9f768db
3fe5524
74155f0
47c8341
6e347f7
235e8a5
66b3ed5
17360a0
6e1e530
21f6570
63444d1
9756c36
9dd2d38
b09b853
0b0dcef
1541d7d
c6dcf99
7701c78
bb78dab
ae5a8bd
5280a08
737e099
ce34637
d307bd4
4547fdd
5d54b1f
8c4e96c
56d9220
281336a
291f2f2
2829929
9b4bb0b
8d4ac3b
0e78480
dc4ad4c
f2e0831
26861d7
dd7c053
2520732
6abe5c9
3d2e1f3
9595dd9
ca7e50d
e1228ee
0a21e4a
0594375
09475f0
0e53b11
5cd4676
8730ba7
a0636ca
5856631
d961166
4a230b5
fbc5e91
16915b7
13a7800
dd63af4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,9 +17,17 @@ | |
| from .external import pkg_constants | ||
| from .model_container import BaseModelContainer | ||
| from .training_strategies import TrainingStrategies | ||
| from .vst import bw_kde, geometric_mean, is_outlier, log_geometric_mean | ||
|
|
||
| logger = logging.getLogger("batchglm") | ||
|
|
||
| try: | ||
| from skfda.preprocessing.smoothing.kernel_smoothers import NadarayaWatsonSmoother as NWSmoother | ||
| from skfda.representation.grid import FDataGrid | ||
| except ImportError: | ||
| FDataGrid = None | ||
| NWSmoother = None | ||
|
|
||
|
|
||
| class EstimatorGlm(metaclass=abc.ABCMeta): | ||
| """ | ||
|
|
@@ -86,8 +94,95 @@ def train_sequence(self, training_strategy, **kwargs): | |
| "overrding %s from training strategy with value %s with new value %s\n" | ||
| % (x, str(d[x]), str(kwargs[x])) | ||
| ) | ||
| if "dispersion_smoothing" in d: | ||
| if d["dispersion_smoothing"] == "sctransform": | ||
| if FDataGrid is None or NWSmoother is None: | ||
| logger.error("Missing optional dependency scikit-fda.") | ||
| self.train(**d, **kwargs) | ||
| logger.debug("Training sequence #%d complete", idx + 1) | ||
| if "dispersion_smoothing" in d: | ||
| if d["dispersion_smoothing"] == "sctransform": | ||
picciama marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| self.perform_vst() | ||
| elif d["dispersion_smoothing"] == "edger": | ||
| raise NotImplementedError() | ||
| elif d["dispersion_smoothing"] == "deseq2": | ||
| raise NotImplementedError() | ||
|
|
||
| def perform_vst( | ||
| self, | ||
| theta_regularization: str = "od_factor", | ||
| use_geometric_mean: bool = True, | ||
| gmean_eps: float = 1.0, | ||
| bw_adjust: float = 1.5, | ||
| ): | ||
| logger.info("Performing Dispersion smoothing with flavour sctransform...") | ||
| # compute geometric log mean counts | ||
| if use_geometric_mean: | ||
| genes_log_gmean = np.log10(geometric_mean(self.model_container.x, axis=0, eps=gmean_eps)) | ||
picciama marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| else: | ||
| genes_log_gmean = np.log10(np.mean(self.model_container.x, axis=0)) | ||
| if isinstance(genes_log_gmean, dask.array.core.Array): | ||
| genes_log_gmean = genes_log_gmean.compute() | ||
|
||
|
|
||
| # specify which kind of regularization is performed | ||
| scale_param = np.exp(self.model_container.theta_scale[0]) # TODO check if this is always correct | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why an exponentiation here? is this meant to be link-function specific?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure how to resolve this (thus the TODO). Indeed, I think this depends on the noise model. I will have to look at that again. |
||
| if theta_regularization == "log_theta": | ||
| dispersion_par = np.log10(scale_param) | ||
| elif theta_regularization == "od_factor": | ||
| dispersion_par = np.log10(1 + np.power(10, genes_log_gmean) / scale_param) | ||
| else: | ||
| raise ValueError(f"Unrecognized regularization method {theta_regularization}") | ||
| if isinstance(dispersion_par, dask.array.core.Array): | ||
| dispersion_par = dispersion_par.compute() | ||
|
|
||
| # downsample because KDE and bandwidth selection would take too long if performed on the entire dataset. | ||
| # It is sufficient to get a general idea of the data distribution | ||
picciama marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if len(genes_log_gmean) > 2000: | ||
| logger.info("Sampling 2000 random features...") | ||
| idx = np.random.choice(np.arange(len(genes_log_gmean)), 2000) | ||
| genes_log_gmean_filtered = genes_log_gmean[idx] | ||
| dispersion_par_filtered = dispersion_par[idx] | ||
| else: | ||
| dispersion_par_filtered = dispersion_par | ||
| genes_log_gmean_filtered = genes_log_gmean | ||
|
|
||
| # check for outliers in the function f(genes_log_gmean_filtered) = dispersion_par_filtered and remove them | ||
| logger.info("Searching for outliers...") | ||
| outliers = is_outlier(model_param=dispersion_par_filtered, means=genes_log_gmean_filtered) | ||
| outliers = outliers["outlier"].values | ||
| if np.any(outliers): | ||
| # toss out the outliers | ||
| logger.info(f"Excluded {np.sum(outliers)} outliers.") | ||
| genes_log_gmean_filtered = genes_log_gmean_filtered[~outliers] | ||
| dispersion_par_filtered = dispersion_par_filtered[~outliers] | ||
|
|
||
| # define a data grid with the downsampled and filtered values | ||
| domain_range = genes_log_gmean_filtered.min(), genes_log_gmean_filtered.max() | ||
| fd = FDataGrid( | ||
| data_matrix=dispersion_par_filtered, grid_points=genes_log_gmean_filtered, domain_range=domain_range | ||
| ) | ||
| # select bandwidth to be used for smoothing | ||
| bw = bw_kde(genes_log_gmean_filtered) * bw_adjust | ||
| # bandwidth = FFTKDE(kernel='gaussian', bw='ISJ').fit(fd).bw * 0.37 * 3 | ||
picciama marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| # define points for evaluation. Ensure x_points is within the range of genes_log_gmean_filtered | ||
| x_points = np.clip(genes_log_gmean, *domain_range) | ||
| # smooth the dispersion_par | ||
| logger.info("Performing smoothing...") | ||
| smoother = NWSmoother(smoothing_parameter=bw, output_points=x_points) | ||
| dispersion_par_smoothed = smoother.fit_transform(fd).data_matrix[0, :, 0] | ||
|
|
||
| # transform back to scale param | ||
| if theta_regularization == "log_theta": | ||
| smoothed_scale = np.power(10, dispersion_par_smoothed) | ||
| elif theta_regularization == "od_factor": | ||
| smoothed_scale = np.power(10, genes_log_gmean) / (np.power(10, dispersion_par_smoothed) - 1) | ||
| else: | ||
| raise ValueError(f"Unrecognized regularization method {theta_regularization}") | ||
|
|
||
| # store smoothed dispersion_par | ||
| self.model_container.theta_scale_smoothed = np.log(smoothed_scale) | ||
| logger.info("Done with dispersion smoothing.") | ||
|
|
||
| def initialize(self): | ||
| pass | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| from typing import Optional, Union | ||
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
| from scipy.stats import gaussian_kde | ||
|
|
||
|
|
||
def log_geometric_mean(a: np.ndarray, axis: Optional[int] = None, eps: Union[int, float] = 1.0):
    """
    Return the log of the geometric mean, defined as mean(log(a + eps)).

    :param a: np.ndarray containing the data.
    :param axis: Axis over which the log geometric mean is taken. If None, the
        reduction runs over the whole array.
    :param eps: Pseudocount added to every value so that log(0) is never computed.
        With the default of 1, log(x) behaves like np.log1p(x).
    :return np.ndarray: Log geometric means, shaped like the remaining axes.
    """
    shifted = a + eps
    return np.log(shifted).mean(axis=axis)
|
|
||
|
|
||
def geometric_mean(a: np.ndarray, axis: Optional[int] = None, eps: Union[int, float] = 1.0):
    r"""
    Return the geometric mean (\prod_{i=1}^{n}(x_{i} + eps))^{1/n} - eps.

    The product is evaluated in log space as exp(mean(log(x + eps))) - eps
    for numerical stability.

    :param a: np.ndarray containing the data.
    :param axis: Axis over which the geometric mean is taken. If None, the
        reduction runs over the whole array.
    :param eps: Pseudocount added to every value so that log(0) is never computed.
        With the default of 1, log(x) behaves like np.log1p(x).
    :return np.ndarray: Geometric means, shaped like the remaining axes.
    """
    return np.exp(log_geometric_mean(a, axis, eps)) - eps
|
|
||
|
|
||
def bw_kde(x: np.ndarray, method: str = "silverman"):
    """
    Estimate a smoothing bandwidth for *x* via Gaussian kernel density estimation.

    :param x: np.ndarray of values for which the KDE is performed.
    :param method: Bandwidth-selection rule; see the scipy ``gaussian_kde``
        documentation for the available methods.
    :return float: The estimated bandwidth (the KDE factor scaled by 0.37).
    """
    kde = gaussian_kde(x, bw_method=method)
    return 0.37 * kde.factor
|
||
|
|
||
|
|
||
def robust_scale(x: pd.Series, c: float = 1.4826, eps: Optional[Union[int, float]] = None):
    r"""
    Median-center *x* and divide by its median absolute deviation (MAD):
    scale_{i} = (x_{i} - median(x)) / (c * median(abs(x_{i} - median(x))) + eps).

    This function is derived from sctransform's implementation of robust scale:
    https://github.com/satijalab/sctransform/blob/7e9c9557222d1e34416e8854ed22da580e533e78/R/utils.R#L162-L163

    :param x: pd.Series containing the values used to compute the scale.
    :param c: Scaling constant used in the computation of mad. The default value is
        equivalent to the R implementation of mad:
        https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/mad
    :param eps: Small value added to the mad. If None, it defaults to
        `np.finfo(float).eps`, matching sctransform's `.Machine$double.eps`.
    :return pd.Series containing the computed scales.
    """
    if eps is None:
        eps = float(np.finfo(float).eps)

    centered = x - x.median()
    mad_value = c * centered.abs().median() + eps
    return centered / mad_value
|
|
||
|
|
||
def is_outlier(model_param: np.ndarray, means: np.ndarray, threshold: Union[int, float] = 10):
    """
    Compute outlier genes based on deviation of model_param from mean counts in individual bins.

    This function is derived from sctransform's implementation of is_outlier:
    https://github.com/satijalab/sctransform/blob/7e9c9557222d1e34416e8854ed22da580e533e78/R/utils.R#L120-L129
    :param model_param: np.ndarray of a specific model_param. This can be the intercept, any batch/condition or
        loc/scale param. This is the param based on which it is determined if a specific gene is an outlier.
    :param means: np.ndarray of genewise mean counts. The means are used to determine the bins within which
        outlier detection of the model_param is performed.
    :param threshold: The minimal robust-scale score required for a model_param to be considered an outlier.
    :return pd.DataFrame with columns "param", "bins1", "bins2", "score1", "score2" and a boolean column
        "outlier" that indicates if a particular gene is an outlier (True) or not (False).
    """
    # bin width proportional to the KDE bandwidth of the means; eps widens the
    # lowest break slightly so min(means) falls inside the first bin
    bin_width = (means.max() - means.min()) * bw_kde(means) / 2
    eps = np.finfo(float).eps * 10

    # two staggered binnings, offset by half a bin, so that genes near a bin edge
    # get scored against two different neighborhoods
    breaks1 = np.arange(means.min() - eps, means.max() + bin_width, bin_width)
    breaks2 = np.arange(means.min() - eps - bin_width / 2, means.max() + bin_width, bin_width)
    bins1 = pd.cut(means, bins=breaks1)
    bins2 = pd.cut(means, bins=breaks2)

    df_tmp = pd.DataFrame({"param": model_param, "bins1": bins1, "bins2": bins2})
    # robust (median/MAD) z-scores of the param within each binning
    df_tmp["score1"] = df_tmp.groupby("bins1")["param"].transform(robust_scale)
    df_tmp["score2"] = df_tmp.groupby("bins2")["param"].transform(robust_scale)

    # a gene counts as an outlier only if it is extreme under BOTH binnings
    # (minimum of the two absolute scores must exceed the threshold)
    df_tmp["outlier"] = df_tmp[["score1", "score2"]].abs().min(axis=1) > threshold

    return df_tmp
Uh oh!
There was an error while loading. Please reload this page.