Commits (59)
c001fef
added sctransform vst wrapper
picciama May 27, 2022
9f768db
fixed trailing whitespace
picciama May 27, 2022
3fe5524
removed return statements
picciama May 27, 2022
74155f0
cast np.finfo to float explicitly
picciama May 27, 2022
47c8341
resolved comments by Ilan Gold in #145
picciama Jun 19, 2022
6e347f7
bugfix: also check for "no_smoothing"
picciama Jun 19, 2022
235e8a5
added nbdeviance from edgeR
picciama Jul 15, 2022
66b3ed5
added aveLogCPM from edgeR
picciama Jul 15, 2022
17360a0
added calcNormFactors from edgeR
picciama Jul 15, 2022
6e1e530
added residDF from edgeR
picciama Jul 15, 2022
21f6570
added wleb wrapper from edgeR
picciama Jul 15, 2022
63444d1
added maximizeInterpolant from edgeR
picciama Jul 15, 2022
9756c36
added external.py for external imports
picciama Jul 15, 2022
9dd2d38
added squeezeVar from limma
picciama Jul 15, 2022
b09b853
added glm_one_group from edgeR
picciama Jul 15, 2022
0b0dcef
fixed bugs in tmmwsp
picciama Jul 19, 2022
1541d7d
added deps for newly included edgeR procedures
picciama Jul 23, 2022
c6dcf99
compute() dask arrays before calculating nb_dev
picciama Jul 23, 2022
7701c78
added levenberg-marquardt estimator as in edgeR
picciama Jul 23, 2022
bb78dab
added qr_decomposition for param init as in edgeR
picciama Jul 23, 2022
ae5a8bd
integrated batchglm model / estimator environment
picciama Aug 6, 2022
5280a08
added documentation, bugfixing and use c_utils
picciama Aug 6, 2022
737e099
added documentation and minor fixes
picciama Aug 6, 2022
ce34637
documentation and cleanup
picciama Aug 6, 2022
d307bd4
cleanup
picciama Aug 6, 2022
4547fdd
documentation / cleanup
picciama Aug 6, 2022
5d54b1f
cleanup / bugfixing /integration in estimator
picciama Aug 6, 2022
8c4e96c
added imports
picciama Aug 6, 2022
56d9220
bugfixing and cleanup
picciama Aug 6, 2022
281336a
added effects function
picciama Aug 6, 2022
291f2f2
added module init
picciama Aug 6, 2022
2829929
added fit_f_dist function
picciama Aug 6, 2022
9b4bb0b
added adjusted profile likelihood
picciama Aug 6, 2022
8d4ac3b
cleanup and integration of one_group_glm
picciama Aug 6, 2022
0e78480
added prior degrees of freedom
picciama Aug 6, 2022
dc4ad4c
added basic estimateDisp function
picciama Aug 6, 2022
f2e0831
added c_utility functions
picciama Aug 6, 2022
26861d7
cleanup
picciama Aug 7, 2022
dd7c053
added support for summed nb_deviance
picciama Aug 7, 2022
2520732
bugfix: leverages not correctly returned
picciama Aug 7, 2022
6abe5c9
removed print statement
picciama Aug 7, 2022
3d2e1f3
ignore irrelevant log division errors
picciama Aug 7, 2022
9595dd9
delayed levenberg theta_loc init + dask support
picciama Aug 7, 2022
ca7e50d
added sizefactor and dask/nodask support
picciama Aug 7, 2022
e1228ee
added newest mypy and pytest versions
picciama Sep 13, 2022
0a21e4a
updated pre-commit yaml
picciama Sep 13, 2022
0594375
bumped version of black to support newer click
picciama Sep 13, 2022
09475f0
typo fixed
picciama Sep 13, 2022
0e53b11
added dask support to typehint
picciama Sep 13, 2022
5cd4676
make constraints a mandatory argument
picciama Sep 13, 2022
8730ba7
fix mypy related problems + black reformatting
picciama Sep 13, 2022
a0636ca
index using np.ndarray for mypy to succeed
picciama Sep 20, 2022
5856631
return all, not just prior_df
picciama Sep 20, 2022
d961166
minor refactoring and bugfixing
picciama Sep 20, 2022
4a230b5
fixed pre-commit hooks and mypy
picciama Sep 22, 2022
fbc5e91
increased the param clipping limits
picciama Sep 22, 2022
16915b7
Merge branch 'development' into mp/dispersion_smoothing
picciama Oct 7, 2022
13a7800
fix mypy / batchglm API after merging development
picciama Oct 7, 2022
dd63af4
added missing import and init file
picciama Oct 8, 2022
95 changes: 95 additions & 0 deletions batchglm/train/numpy/base_glm/estimator.py
@@ -17,9 +17,17 @@
from .external import pkg_constants
from .model_container import BaseModelContainer
from .training_strategies import TrainingStrategies
from .vst import bw_kde, geometric_mean, is_outlier, log_geometric_mean

logger = logging.getLogger("batchglm")

try:
from skfda.preprocessing.smoothing.kernel_smoothers import NadarayaWatsonSmoother as NWSmoother
from skfda.representation.grid import FDataGrid
except ImportError:
FDataGrid = None
NWSmoother = None


class EstimatorGlm(metaclass=abc.ABCMeta):
"""
@@ -86,8 +94,95 @@ def train_sequence(self, training_strategy, **kwargs):
"overrding %s from training strategy with value %s with new value %s\n"
% (x, str(d[x]), str(kwargs[x]))
)
if "dispersion_smoothing" in d:
if d["dispersion_smoothing"] == "sctransform":
if FDataGrid is None or NWSmoother is None:
logger.error("Missing optional dependency scikit-fda.")
self.train(**d, **kwargs)
logger.debug("Training sequence #%d complete", idx + 1)
if "dispersion_smoothing" in d:
if d["dispersion_smoothing"] == "sctransform":
self.perform_vst()
elif d["dispersion_smoothing"] == "edger":
raise NotImplementedError()
elif d["dispersion_smoothing"] == "deseq2":
raise NotImplementedError()

def perform_vst(
self,
theta_regularization: str = "od_factor",
use_geometric_mean: bool = True,
gmean_eps: float = 1.0,
bw_adjust: float = 1.5,
):
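"""
Smooth the fitted per-gene scale parameters against mean expression, following
sctransform's variance-stabilizing transformation (VST): the regularized
dispersion parameter is kernel-smoothed as a function of the (geometric) mean
counts, and the result is stored in model_container.theta_scale_smoothed.
"""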
logger.info("Performing Dispersion smoothing with flavour sctransform...")
# compute geometric log mean counts
if use_geometric_mean:
genes_log_gmean = np.log10(geometric_mean(self.model_container.x, axis=0, eps=gmean_eps))
else:
genes_log_gmean = np.log10(np.mean(self.model_container.x, axis=0))
if isinstance(genes_log_gmean, dask.array.core.Array):
genes_log_gmean = genes_log_gmean.compute()
Collaborator: why?

picciama (author): because I had trouble with this downstream. It was easier to transform to a dense numpy array before. I will check again to see if I can delay this further.

# specify which kind of regularization is performed
scale_param = np.exp(self.model_container.theta_scale[0]) # TODO check if this is always correct
Collaborator: why an exponentiation here? is this meant to be link-function specific?

picciama (author): I'm not sure how to resolve this (thus the TODO). Indeed, I think this depends on the noise model. I will have to look at that again.

if theta_regularization == "log_theta":
dispersion_par = np.log10(scale_param)
elif theta_regularization == "od_factor":
dispersion_par = np.log10(1 + np.power(10, genes_log_gmean) / scale_param)
else:
raise ValueError(f"Unrecognized regularization method {theta_regularization}")
if isinstance(dispersion_par, dask.array.core.Array):
dispersion_par = dispersion_par.compute()

# downsample because KDE and bandwidth selection would take too long if performed on the entire dataset.
# It is sufficient to get a general idea of the data distribution
if len(genes_log_gmean) > 2000:
logger.info("Sampling 2000 random features...")
idx = np.random.choice(np.arange(len(genes_log_gmean)), 2000, replace=False)
genes_log_gmean_filtered = genes_log_gmean[idx]
dispersion_par_filtered = dispersion_par[idx]
else:
dispersion_par_filtered = dispersion_par
genes_log_gmean_filtered = genes_log_gmean

# check for outliers in the function f(genes_log_gmean_filtered) = dispersion_par_filtered and remove them
logger.info("Searching for outliers...")
outliers = is_outlier(model_param=dispersion_par_filtered, means=genes_log_gmean_filtered)
outliers = outliers["outlier"].values
if np.any(outliers):
# toss out the outliers
logger.info(f"Excluded {np.sum(outliers)} outliers.")
genes_log_gmean_filtered = genes_log_gmean_filtered[~outliers]
dispersion_par_filtered = dispersion_par_filtered[~outliers]

# define a data grid with the downsampled and filtered values
domain_range = genes_log_gmean_filtered.min(), genes_log_gmean_filtered.max()
fd = FDataGrid(
data_matrix=dispersion_par_filtered, grid_points=genes_log_gmean_filtered, domain_range=domain_range
)
# select bandwidth to be used for smoothing
bw = bw_kde(genes_log_gmean_filtered) * bw_adjust
# bandwidth = FFTKDE(kernel='gaussian', bw='ISJ').fit(fd).bw * 0.37 * 3

# define points for evaluation. Ensure x_points is within the range of genes_log_gmean_filtered
x_points = np.clip(genes_log_gmean, *domain_range)
# smooth the dispersion_par
logger.info("Performing smoothing...")
smoother = NWSmoother(smoothing_parameter=bw, output_points=x_points)
dispersion_par_smoothed = smoother.fit_transform(fd).data_matrix[0, :, 0]

# transform back to scale param
if theta_regularization == "log_theta":
smoothed_scale = np.power(10, dispersion_par_smoothed)
elif theta_regularization == "od_factor":
smoothed_scale = np.power(10, genes_log_gmean) / (np.power(10, dispersion_par_smoothed) - 1)
else:
raise ValueError(f"Unrecognized regularization method {theta_regularization}")

# store smoothed dispersion_par
self.model_container.theta_scale_smoothed = np.log(smoothed_scale)
logger.info("Done with dispersion smoothing.")

def initialize(self):
pass
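A rough usage sketch of the new method (not part of the diff): the constructor calls are assumed from the docs/api snippet and the test helpers and may differ, and `data_matrix`, `design_loc`, `design_scale` are hypothetical inputs. Only the last two lines exercise the new code.

from batchglm.models.glm_nb import Model as NBModel
from batchglm.train.numpy.glm_nb import Estimator as NBEstimator
from batchglm.utils.input import InputDataGLM

input_data = InputDataGLM(data=data_matrix, design_loc=design_loc, design_scale=design_scale, as_dask=False)
model = NBModel(input_data)
estimator = NBEstimator(model=model, init_location="standard", init_scale="standard")
estimator.train_sequence(training_strategy="DEFAULT")

# smooth the fitted scale parameters; "od_factor" regularizes the overdispersion
# factor, "log_theta" the log10 of the scale parameter itself
estimator.perform_vst(theta_regularization="od_factor", bw_adjust=1.5)
smoothed = estimator.model_container.theta_scale_smoothed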
10 changes: 10 additions & 0 deletions batchglm/train/numpy/base_glm/model_container.py
@@ -162,6 +162,16 @@ def theta_scale_j_setter(self, value, j):
else:
self.params[self.npar_location :, j] = value

# dispersion_smoothing

@property
def theta_scale_smoothed(self) -> np.ndarray:
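"""Smoothed per-gene scale parameters, as set by EstimatorGlm.perform_vst."""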
return self._theta_scale_smoothed

@theta_scale_smoothed.setter
def theta_scale_smoothed(self, value):
self._theta_scale_smoothed = value

# jacobians

@abc.abstractmethod
23 changes: 21 additions & 2 deletions batchglm/train/numpy/base_glm/training_strategies.py
@@ -13,6 +13,25 @@ class TrainingStrategies(Enum):
"max_iter_scale": 1000,
},
]
-    GD = [
-        {"max_steps": 1000, "method_scale": "gd", "update_scale_freq": 5, "ftol_scale": 1e-6, "max_iter_scale": 100},
+    GD = (
+        [
+            {
+                "max_steps": 1000,
+                "method_scale": "gd",
+                "update_scale_freq": 5,
+                "ftol_scale": 1e-6,
+                "max_iter_scale": 100,
+            },
+        ],
+    )

SCTRANSFORM = [
{
"max_steps": 1000,
"method_scale": "brent",
"update_scale_freq": 5,
"ftol_scale": 1e-6,
"max_iter_scale": 1000,
"dispersion_smoothing": "sctransform",
},
]
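A short sketch of triggering the same smoothing through the new strategy rather than calling perform_vst by hand (assuming train_sequence is handed the strategy's list of dicts, as the loop in estimator.py above suggests):

from batchglm.train.numpy.base_glm.training_strategies import TrainingStrategies

# each dict is unpacked into estimator.train(**d); the "dispersion_smoothing" key
# then triggers estimator.perform_vst() once training completes
estimator.train_sequence(training_strategy=TrainingStrategies.SCTRANSFORM.value)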
100 changes: 100 additions & 0 deletions batchglm/train/numpy/base_glm/vst.py
@@ -0,0 +1,100 @@
from typing import Optional, Union

import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde


def log_geometric_mean(a: np.ndarray, axis: Optional[int] = None, eps: Union[int, float] = 1.0):
"""
Returns the log of the geometric mean defined as mean(log(a + eps)).
:param a: np.ndarray containing the data
:param axis: the axis over which the log geometric mean is calculated. If None, computes the log geometric mean
over the entire data.
:param eps: small value added to each value in order to avoid computing log(0). Default is 1, i.e. log(a + eps) is
equivalent to np.log1p(a).
:return np.ndarray: An array with the same length as the axis not used for computation containing the log geometric
means.
"""
log_a = np.log(a + eps)
return np.mean(log_a, axis=axis)


def geometric_mean(a: np.ndarray, axis: Optional[int] = None, eps: Union[int, float] = 1.0):
r"""
Return the geometric mean defined as (\prod_{i=1}^{n}(x_{i} + eps))^{1/n} - eps computed in log space as
exp(1/n(sum_{i=1}^{n}(ln(x_{i} + eps)))) - eps for numerical stability.
:param a: np.ndarray containing the data
:param axis: the axis over which the geometric mean is calculated. If None, computes the geometric mean over the
entire data.
:param eps: small value added to each value in order to avoid computing log(0). Default is 1, i.e. log(a + eps) is
equivalent to np.log1p(a).
:return np.ndarray: An array with the same length as the axis not used for computation containing the geometric
means.
"""
log_geo_mean = log_geometric_mean(a, axis, eps)
return np.exp(log_geo_mean) - eps
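# Worked example (hypothetical values), with the default eps=1:
#   geometric_mean(np.array([0.0, 3.0, 8.0]))
#   = exp(mean(log([1.0, 4.0, 9.0]))) - 1 = (1 * 4 * 9) ** (1 / 3) - 1 ≈ 2.302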


def bw_kde(x: np.ndarray, method: str = "silverman"):
"""
Performs gaussian kernel density estimation using a specified method and returns the estimated bandwidth.
:param x: np.ndarray of values for which the KDE is performed.
:param method: The method used for estimating an optimal bandwidth, see scipy gaussian_kde documentation for
available methods.
:return float: The estimated bandwidth
"""
return gaussian_kde(x, bw_method=method).factor * 0.37
# return FFTKDE(kernel=kernel, bw=method).fit(x).bw


def robust_scale(x: pd.Series, c: float = 1.4826, eps: Optional[Union[int, float]] = None):
r"""
Compute a scale param using the formula scale_{i} = (x_{i} - median(x)) / mad(x) where
mad = c * median(abs(x_{i} - median(x))) + eps is the median absolute deviation.
This function is derived from sctransform's implementation of robust scale:
https://github.com/satijalab/sctransform/blob/7e9c9557222d1e34416e8854ed22da580e533e78/R/utils.R#L162-L163
:param x: pd.Series containing the values used to compute the scale.
:param c: Scaling constant used in the computation of mad. The default value is equivalent to the R implementation
of mad: https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/mad
:param eps: Small value added to the mad. If None, it defaults to `np.finfo(float).eps`. This should be equivalent
to sctransform's `.Machine$double.eps`.
:return pd.Series containing the computed scales.
"""
if eps is None:
eps = float(np.finfo(float).eps)

deviation = x - x.median()
mad = c * deviation.abs().median() + eps
scale = deviation / mad
return scale
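# Worked example (hypothetical values): for x = pd.Series([1.0, 2.0, 3.0, 100.0]),
# median(x) = 2.5 and mad = 1.4826 * median([1.5, 0.5, 0.5, 97.5]) ≈ 1.483, so the
# outlying point scores (100 - 2.5) / 1.483 ≈ 65.8 while the others stay near 0.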


def is_outlier(model_param: np.ndarray, means: np.ndarray, threshold: Union[int, float] = 10):
"""
Compute outlier genes based on deviation of model_param from mean counts in individual bins.
This function is derived from sctransform's implementation of is_outlier:
https://github.com/satijalab/sctransform/blob/7e9c9557222d1e34416e8854ed22da580e533e78/R/utils.R#L120-L129
:param model_param: np.ndarray of a specific model_param. This can be the intercept, any batch/condition or
loc/scale param. This is the param based on which it is determined if a specific gene is an outlier.
:param means: np.ndarray of genewise mean counts. The means are used to determine the bins within which outlier
detection of the model_param is performed.
:param threshold: The minimal score required for a model_param to be considered an outlier.
:return pd.DataFrame containing the per-gene scores and a boolean "outlier" column indicating if a particular gene is an outlier (True) or not (False).
"""
bin_width = (means.max() - means.min()) * bw_kde(means) / 2
eps = np.finfo(float).eps * 10

breaks1 = np.arange(means.min() - eps, means.max() + bin_width, bin_width)
breaks2 = np.arange(means.min() - eps - bin_width / 2, means.max() + bin_width, bin_width)
bins1 = pd.cut(means, bins=breaks1)
bins2 = pd.cut(means, bins=breaks2)

df_tmp = pd.DataFrame({"param": model_param, "bins1": bins1, "bins2": bins2})
df_tmp["score1"] = df_tmp.groupby("bins1")["param"].transform(robust_scale)
df_tmp["score2"] = df_tmp.groupby("bins2")["param"].transform(robust_scale)
df_tmp["outlier"] = df_tmp[["score1", "score2"]].abs().min(axis=1) > threshold

return df_tmp
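A self-contained sketch exercising the helpers above on synthetic data (all inputs hypothetical; the import path follows the `from .vst import ...` statement in estimator.py):

import numpy as np

from batchglm.train.numpy.base_glm.vst import bw_kde, geometric_mean, is_outlier

rng = np.random.default_rng(0)
counts = rng.poisson(lam=5.0, size=(200, 50)).astype(float)  # cells x genes
gmeans = geometric_mean(counts, axis=0)            # per-gene geometric means
log_gmeans = np.log10(gmeans)
bw = bw_kde(log_gmeans)                            # smoothing bandwidth estimate
params = rng.normal(size=50)
params[0] = 50.0                                   # plant one obvious outlier
flags = is_outlier(model_param=params, means=log_gmeans)
print(bw, int(flags["outlier"].sum()))             # the planted gene is likely flagged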
2 changes: 1 addition & 1 deletion docs/api/index.rst
@@ -23,7 +23,7 @@ where `xxxxxx` is the backend desired, like `tf2`, `numpy` or `statsmodel`.
For example, here is a short snippet to give a sense of how the API might work::

from batchglm.models.glm_nb import Model as NBModel
-    from batchglm.train.numpy.glm_nb import Estimator as NBEstimator
+    from batchglm.train.numpy.glm_nb import Estimator as NBEstimator
from batchglm.utils.input import InputDataGLM

input_data = InputDataGLM(data=data_matrix, design_loc=_design_loc, design_scale=_design_scale, as_dask=as_dask)
8 changes: 4 additions & 4 deletions tests/numpy/test_accuracy.py
@@ -64,7 +64,7 @@ def _test_accuracy(self, estimator: EstimatorGlm) -> bool:


class TestAccuracyNB(TestAccuracy):
-    def test_accuracy_rand_theta(self) -> bool:
+    def test_accuracy_rand_theta(self):
"""
This tests randTheta simulated data with 2 conditions and 4 batches sparse and dense.
"""
@@ -94,7 +94,7 @@ def test_accuracy_rand_theta(self) -> bool:
)
assert self._test_accuracy(sparse_estimator)

-    def test_accuracy_const_theta(self) -> bool:
+    def test_accuracy_const_theta(self):
"""
This tests constTheta simulated data with 2 conditions and 0 batches sparse and dense.
"""
@@ -113,7 +113,7 @@
sparse_estimator = get_estimator(
noise_model="nb", model=sparse_model, init_location="standard", init_scale="standard"
)
-        return self._test_accuracy(sparse_estimator)
+        assert self._test_accuracy(sparse_estimator)

dense_estimator = get_estimator(
noise_model="nb", model=dense_model, init_location="standard", init_scale="standard", quick_scale=True
@@ -123,7 +123,7 @@
sparse_estimator = get_estimator(
noise_model="nb", model=sparse_model, init_location="standard", init_scale="standard", quick_scale=True
)
-        return self._test_accuracy(sparse_estimator)
+        assert self._test_accuracy(sparse_estimator)


if __name__ == "__main__":