-
Notifications
You must be signed in to change notification settings - Fork 9
[WIP] Mp/dispersion smoothing #145
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: development
Are you sure you want to change the base?
Changes from 4 commits
c001fef
9f768db
3fe5524
74155f0
47c8341
6e347f7
235e8a5
66b3ed5
17360a0
6e1e530
21f6570
63444d1
9756c36
9dd2d38
b09b853
0b0dcef
1541d7d
c6dcf99
7701c78
bb78dab
ae5a8bd
5280a08
737e099
ce34637
d307bd4
4547fdd
5d54b1f
8c4e96c
56d9220
281336a
291f2f2
2829929
9b4bb0b
8d4ac3b
0e78480
dc4ad4c
f2e0831
26861d7
dd7c053
2520732
6abe5c9
3d2e1f3
9595dd9
ca7e50d
e1228ee
0a21e4a
0594375
09475f0
0e53b11
5cd4676
8730ba7
a0636ca
5856631
d961166
4a230b5
fbc5e91
16915b7
13a7800
dd63af4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,9 +17,17 @@ | |
| from .external import pkg_constants | ||
| from .model_container import BaseModelContainer | ||
| from .training_strategies import TrainingStrategies | ||
| from .vst import bw_kde, geometric_mean, is_outlier, log_geometric_mean | ||
|
|
||
| logger = logging.getLogger("batchglm") | ||
|
|
||
| try: | ||
| from skfda.preprocessing.smoothing.kernel_smoothers import NadarayaWatsonSmoother as NWSmoother | ||
| from skfda.representation.grid import FDataGrid | ||
| except ImportError: | ||
| FDataGrid = None | ||
| NWSmoother = None | ||
|
|
||
|
|
||
| class EstimatorGlm(metaclass=abc.ABCMeta): | ||
| """ | ||
|
|
@@ -86,8 +94,95 @@ def train_sequence(self, training_strategy, **kwargs): | |
| "overrding %s from training strategy with value %s with new value %s\n" | ||
| % (x, str(d[x]), str(kwargs[x])) | ||
| ) | ||
| if "dispersion_smoothing" in d: | ||
| if d["dispersion_smoothing"] == "sctransform": | ||
| if FDataGrid is None or NWSmoother is None: | ||
| logger.error("Missing optional dependency scikit-fda.") | ||
| self.train(**d, **kwargs) | ||
| logger.debug("Training sequence #%d complete", idx + 1) | ||
| if "dispersion_smoothing" in d: | ||
| if d["dispersion_smoothing"] == "sctransform": | ||
picciama marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| self.perform_vst() | ||
| elif d["dispersion_smoothing"] == "edger": | ||
| raise NotImplementedError() | ||
| elif d["dispersion_smoothing"] == "deseq2": | ||
| raise NotImplementedError() | ||
|
|
||
| def perform_vst( | ||
| self, | ||
| theta_regularization: str = "od_factor", | ||
| use_geometric_mean: bool = True, | ||
| gmean_eps: float = 1.0, | ||
| bw_adjust: float = 1.5, | ||
| ): | ||
| logger.info("Performing Dispersion smoothing with flavour sctransform...") | ||
| # compute geometric log mean counts | ||
| if use_geometric_mean: | ||
| genes_log_gmean = np.log10(geometric_mean(self.model_container.x, axis=0, eps=gmean_eps)) | ||
picciama marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| else: | ||
| genes_log_gmean = np.log10(np.mean(self.model_container.x, axis=0)) | ||
| if isinstance(genes_log_gmean, dask.array.core.Array): | ||
| genes_log_gmean = genes_log_gmean.compute() | ||
|
||
|
|
||
| # specify which kind of regularization is performed | ||
| scale_param = np.exp(self.model_container.theta_scale[0]) # TODO check if this is always correct | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why an exponentiation here? is this meant to be link-function specific?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure how to resolve this (thus the TODO). Indeed, I think this depends on the noise model. I will have to look at that again. |
||
| if theta_regularization == "log_theta": | ||
| dispersion_par = np.log10(scale_param) | ||
| elif theta_regularization == "od_factor": | ||
| dispersion_par = np.log10(1 + np.power(10, genes_log_gmean) / scale_param) | ||
| else: | ||
| raise ValueError(f"Unrecognized regularization method {theta_regularization}") | ||
| if isinstance(dispersion_par, dask.array.core.Array): | ||
| dispersion_par = dispersion_par.compute() | ||
|
|
||
| # downsample because KDE and bandwidth selection would take too long if performed on the entire dataset. | ||
| # It is sufficient to get a general idea of the data distribution | ||
picciama marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if len(genes_log_gmean) > 2000: | ||
| logger.info("Sampling 2000 random features...") | ||
| idx = np.random.choice(np.arange(len(genes_log_gmean)), 2000) | ||
| genes_log_gmean_filtered = genes_log_gmean[idx] | ||
| dispersion_par_filtered = dispersion_par[idx] | ||
| else: | ||
| dispersion_par_filtered = dispersion_par | ||
| genes_log_gmean_filtered = genes_log_gmean | ||
|
|
||
| # check for outliers in the function f(genes_log_gmean_filtered) = dispersion_par_filtered and remove them | ||
| logger.info("Searching for outliers...") | ||
| outliers = is_outlier(model_param=dispersion_par_filtered, means=genes_log_gmean_filtered) | ||
| outliers = outliers["outlier"].values | ||
| if np.any(outliers): | ||
| # toss out the outliers | ||
| logger.info(f"Excluded {np.sum(outliers)} outliers.") | ||
| genes_log_gmean_filtered = genes_log_gmean_filtered[~outliers] | ||
| dispersion_par_filtered = dispersion_par_filtered[~outliers] | ||
|
|
||
| # define a data grid with the downsampled and filtered values | ||
| domain_range = genes_log_gmean_filtered.min(), genes_log_gmean_filtered.max() | ||
| fd = FDataGrid( | ||
| data_matrix=dispersion_par_filtered, grid_points=genes_log_gmean_filtered, domain_range=domain_range | ||
| ) | ||
| # select bandwidth to be used for smoothing | ||
| bw = bw_kde(genes_log_gmean_filtered) * bw_adjust | ||
| # bandwidth = FFTKDE(kernel='gaussian', bw='ISJ').fit(fd).bw * 0.37 * 3 | ||
picciama marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| # define points for evaluation. Ensure x_points is within the range of genes_log_gmean_filtered | ||
| x_points = np.clip(genes_log_gmean, *domain_range) | ||
| # smooth the dispersion_par | ||
| logger.info("Performing smoothing...") | ||
| smoother = NWSmoother(smoothing_parameter=bw, output_points=x_points) | ||
| dispersion_par_smoothed = smoother.fit_transform(fd).data_matrix[0, :, 0] | ||
|
|
||
| # transform back to scale param | ||
| if theta_regularization == "log_theta": | ||
| smoothed_scale = np.power(10, dispersion_par_smoothed) | ||
| elif theta_regularization == "od_factor": | ||
| smoothed_scale = np.power(10, genes_log_gmean) / (np.power(10, dispersion_par_smoothed) - 1) | ||
| else: | ||
| raise ValueError(f"Unrecognized regularization method {theta_regularization}") | ||
|
|
||
| # store smoothed dispersion_par | ||
| self.model_container.theta_scale_smoothed = np.log(smoothed_scale) | ||
| logger.info("Done with dispersion smoothing.") | ||
|
|
||
| def initialize(self): | ||
| pass | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| from typing import Optional, Union | ||
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
| from scipy.stats import gaussian_kde | ||
|
|
||
|
|
||
def log_geometric_mean(a: np.ndarray, axis: Optional[int] = None, eps: Union[int, float] = 1.0):
    """
    Return the log of the geometric mean, defined as mean(log(a + eps)).

    :param a: np.ndarray containing the data.
    :param axis: Axis over which the log geometric mean is taken. If None, the
        reduction runs over the whole array.
    :param eps: Pseudocount added to every value so that log(0) is never computed.
        With the default of 1, log(x) behaves like np.log1p(x).
    :return np.ndarray: Log geometric means, shaped like the remaining axes.
    """
    shifted = a + eps
    return np.log(shifted).mean(axis=axis)
|
|
||
|
|
||
def geometric_mean(a: np.ndarray, axis: Optional[int] = None, eps: Union[int, float] = 1.0):
    r"""
    Return the geometric mean (\prod_{i=1}^{n}(x_{i} + eps))^{1/n} - eps.

    The product is evaluated in log space as exp(mean(log(x + eps))) - eps
    for numerical stability.

    :param a: np.ndarray containing the data.
    :param axis: Axis over which the geometric mean is taken. If None, the
        reduction runs over the whole array.
    :param eps: Pseudocount added to every value so that log(0) is never computed.
        With the default of 1, log(x) behaves like np.log1p(x).
    :return np.ndarray: Geometric means, shaped like the remaining axes.
    """
    return np.exp(log_geometric_mean(a, axis, eps)) - eps
|
|
||
|
|
||
def bw_kde(x: np.ndarray, method: str = "silverman"):
    """
    Estimate a smoothing bandwidth for *x* via Gaussian kernel density estimation.

    :param x: np.ndarray of values for which the KDE is performed.
    :param method: Bandwidth-selection rule; see the scipy ``gaussian_kde``
        documentation for the available methods.
    :return float: The estimated bandwidth (the KDE factor scaled by 0.37).
    """
    kde = gaussian_kde(x, bw_method=method)
    return 0.37 * kde.factor
|
||
|
|
||
|
|
||
def robust_scale(x: pd.Series, c: float = 1.4826, eps: Optional[Union[int, float]] = None):
    r"""
    Median-center *x* and divide by its median absolute deviation (MAD):
    scale_{i} = (x_{i} - median(x)) / (c * median(abs(x_{i} - median(x))) + eps).

    This function is derived from sctransform's implementation of robust scale:
    https://github.com/satijalab/sctransform/blob/7e9c9557222d1e34416e8854ed22da580e533e78/R/utils.R#L162-L163

    :param x: pd.Series containing the values used to compute the scale.
    :param c: Scaling constant used in the computation of mad. The default value is
        equivalent to the R implementation of mad:
        https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/mad
    :param eps: Small value added to the mad. If None, it defaults to
        `np.finfo(float).eps`, matching sctransform's `.Machine$double.eps`.
    :return pd.Series containing the computed scales.
    """
    if eps is None:
        eps = float(np.finfo(float).eps)

    centered = x - x.median()
    mad_value = c * centered.abs().median() + eps
    return centered / mad_value
|
|
||
|
|
||
def is_outlier(model_param: np.ndarray, means: np.ndarray, threshold: Union[int, float] = 10):
    """
    Compute outlier genes based on deviation of model_param from mean counts in individual bins.

    This function is derived from sctransform's implementation of is_outlier:
    https://github.com/satijalab/sctransform/blob/7e9c9557222d1e34416e8854ed22da580e533e78/R/utils.R#L120-L129
    :param model_param: np.ndarray of a specific model_param. This can be the intercept, any batch/condition or
        loc/scale param. This is the param based on which it is determined if a specific gene is an outlier.
    :param means: np.ndarray of genewise mean counts. The means are used to determine the bins within which
        outlier detection of the model_param is performed.
    :param threshold: The minimal robust-scale score required for a model_param to be considered an outlier.
    :return pd.DataFrame with columns "param", "bins1", "bins2", "score1", "score2" and a boolean column
        "outlier" that indicates if a particular gene is an outlier (True) or not (False).
    """
    # bin width proportional to the KDE bandwidth of the means; eps widens the
    # lowest break slightly so min(means) falls inside the first bin
    bin_width = (means.max() - means.min()) * bw_kde(means) / 2
    eps = np.finfo(float).eps * 10

    # two staggered binnings, offset by half a bin, so that genes near a bin edge
    # get scored against two different neighborhoods
    breaks1 = np.arange(means.min() - eps, means.max() + bin_width, bin_width)
    breaks2 = np.arange(means.min() - eps - bin_width / 2, means.max() + bin_width, bin_width)
    bins1 = pd.cut(means, bins=breaks1)
    bins2 = pd.cut(means, bins=breaks2)

    df_tmp = pd.DataFrame({"param": model_param, "bins1": bins1, "bins2": bins2})
    # robust (median/MAD) z-scores of the param within each binning
    df_tmp["score1"] = df_tmp.groupby("bins1")["param"].transform(robust_scale)
    df_tmp["score2"] = df_tmp.groupby("bins2")["param"].transform(robust_scale)

    # a gene counts as an outlier only if it is extreme under BOTH binnings
    # (minimum of the two absolute scores must exceed the threshold)
    df_tmp["outlier"] = df_tmp[["score1", "score2"]].abs().min(axis=1) > threshold

    return df_tmp
Uh oh!
There was an error while loading. Please reload this page.