From e5cdd9a7b0a7d5265f727255c883fd9f048dc98d Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 16 Nov 2025 06:47:58 -0500 Subject: [PATCH 01/80] spelling: , and Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/benchmark/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py index 3f826ae7..2e13defa 100644 --- a/qolmat/benchmark/metrics.py +++ b/qolmat/benchmark/metrics.py @@ -842,7 +842,7 @@ def frechet_distance_base( Frechet_distance = || mu_1 - mu_2 ||_2^2 + Tr(Sigma_1 + Sigma_2 - 2(Sigma_1 . Sigma_2)^(1/2)) It is normalized, df1 and df2 are first scaled by a factor - (std(df1) + std(df2)) / 2 and then centered around + (std(df1) + std(df2)) / 2, and then centered around (mean(df1) + mean(df2)) / 2 Based on: Dowson, D. C., and BV666017 Landau. "The Fréchet distance between multivariate normal distributions." From 531a12863e06493a7fc6e803a1f4404b76d2a83a Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:02:07 -0500 Subject: [PATCH 02/80] spelling: ; otherwise, Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/imputers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py index 3ef82bda..a4f95b07 100644 --- a/qolmat/imputations/imputers.py +++ b/qolmat/imputations/imputers.py @@ -92,7 +92,7 @@ def get_hyperparams(self, col: Optional[str] = None): return hyperparams def _check_dataframe(self, X: NDArray): - """Check that the input X is a dataframe, otherwise raises an error. + """Check that the input X is a dataframe; otherwise, raises an error. 
Parameters ---------- From 6790cad37808fd6aafc207b9611573d202e73014 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 19:59:56 -0500 Subject: [PATCH 03/80] spelling: a Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- docs/imputers.rst | 2 +- qolmat/utils/algebra.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/imputers.rst b/docs/imputers.rst index 633024e6..0ba31f28 100644 --- a/docs/imputers.rst +++ b/docs/imputers.rst @@ -42,7 +42,7 @@ See the :class:`~qolmat.imputations.imputers.ImputerRpcaPcp` class for implement **Noisy RPCA** [2, 3, 4] -The class :class:`RpcaNoisy` implements an recommanded improved version, which relies on a decomposition :math:`\mathbf{D} = \mathbf{M} + \mathbf{A} + \mathbf{E}`. The additionnal term encodes a Gaussian noise and makes the numerical convergence more reliable. This class also implements a time-consistency penalization for time series, parametrized by the :math:`\eta_k`and :math:`H_k`. By defining :math:`\Vert \mathbf{MH_k} \Vert_p` is either :math:`\Vert \mathbf{MH_k} \Vert_1` or :math:`\Vert \mathbf{MH_k} \Vert_F^2`, the optimisation problem is the following +The class :class:`RpcaNoisy` implements a recommanded improved version, which relies on a decomposition :math:`\mathbf{D} = \mathbf{M} + \mathbf{A} + \mathbf{E}`. The additionnal term encodes a Gaussian noise and makes the numerical convergence more reliable. This class also implements a time-consistency penalization for time series, parametrized by the :math:`\eta_k`and :math:`H_k`. By defining :math:`\Vert \mathbf{MH_k} \Vert_p` is either :math:`\Vert \mathbf{MH_k} \Vert_1` or :math:`\Vert \mathbf{MH_k} \Vert_F^2`, the optimisation problem is the following .. 
math:: \text{min}_{\mathbf{M, A} \in \mathbb{R}^{m \times n}} \quad \frac 1 2 \Vert P_{\Omega} (\mathbf{D}-\mathbf{M}-\mathbf{A}) \Vert_F^2 + \tau \Vert \mathbf{M} \Vert_* + \lambda \Vert \mathbf{A} \Vert_1 + \sum_{k=1}^K \eta_k \Vert \mathbf{M H_k} \Vert_p diff --git a/qolmat/utils/algebra.py b/qolmat/utils/algebra.py index e78b6bdf..42df0481 100644 --- a/qolmat/utils/algebra.py +++ b/qolmat/utils/algebra.py @@ -19,7 +19,7 @@ def frechet_distance_exact( by a factor (std(df1) + std(df2)) / 2 and then centered around (mean(df1) + mean(df2)) / 2 The result is divided by the number of samples to get - an homogeneous result. + a homogeneous result. Based on: Dowson, D. C., and BV666017 Landau. "The Fréchet distance between multivariate normal distributions." Journal of multivariate analysis 12.3 (1982): 450-455. From 5843aa31f5f86af0dc45c108b1359a1e03a0117e Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:05:19 -0500 Subject: [PATCH 04/80] spelling: access Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_benchmark_TS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorials/plot_tuto_benchmark_TS.py b/examples/tutorials/plot_tuto_benchmark_TS.py index be9e77ff..27b4ec18 100644 --- a/examples/tutorials/plot_tuto_benchmark_TS.py +++ b/examples/tutorials/plot_tuto_benchmark_TS.py @@ -41,7 +41,7 @@ # For the purpose of this notebook, # we corrupt the data, with the ``qolmat.utils.data.add_holes`` function # on three variables: "TEMP", "PRES" and "WSPM" -# and the imputation methods will have acces to two additional features: +# and the imputation methods will have access to two additional features: # "DEWP" and "RAIN". 
df_data = data.get_data("Beijing") From 3620bc9c9b14b1cf34d3e095c8fdb935e840176c Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:05:30 -0500 Subject: [PATCH 05/80] spelling: across Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- docs/analysis.rst | 2 +- qolmat/analysis/holes_characterization.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/analysis.rst b/docs/analysis.rst index 3a8af731..a01b6fbf 100644 --- a/docs/analysis.rst +++ b/docs/analysis.rst @@ -45,7 +45,7 @@ The MCAR missing-data mechanism means that there is independence between the pre a. Little's Test ^^^^^^^^^^^^^^^^ -The best-known MCAR test is the :ref:`Little [1]` test, and it has been implemented in :class:`LittleTest`. Keep in mind that the Little's test is designed to test the homogeneity of means across the missing patterns and won't be efficient to detect the heterogeneity of covariance accross missing patterns. +The best-known MCAR test is the :ref:`Little [1]` test, and it has been implemented in :class:`LittleTest`. Keep in mind that the Little's test is designed to test the homogeneity of means across the missing patterns and won't be efficient to detect the heterogeneity of covariance across missing patterns. b. PKLM Test ^^^^^^^^^^^^ diff --git a/qolmat/analysis/holes_characterization.py b/qolmat/analysis/holes_characterization.py index 221beea1..daa37b71 100644 --- a/qolmat/analysis/holes_characterization.py +++ b/qolmat/analysis/holes_characterization.py @@ -70,7 +70,7 @@ class LittleTest(McarTest): """Little Test class. This class implements the Little's test, which is designed to detect the - heterogeneity accross the missing patterns. The null hypothesis is + heterogeneity across the missing patterns. The null hypothesis is "The missing data mechanism is MCAR". The shortcoming of this test is that it won't detect the heterogeneity of covariance. 
From 3906516180f6a95c3aa6e718ea47355988abde73 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:05:42 -0500 Subject: [PATCH 06/80] spelling: additional Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- docs/imputers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/imputers.rst b/docs/imputers.rst index 0ba31f28..410902a8 100644 --- a/docs/imputers.rst +++ b/docs/imputers.rst @@ -42,7 +42,7 @@ See the :class:`~qolmat.imputations.imputers.ImputerRpcaPcp` class for implement **Noisy RPCA** [2, 3, 4] -The class :class:`RpcaNoisy` implements a recommanded improved version, which relies on a decomposition :math:`\mathbf{D} = \mathbf{M} + \mathbf{A} + \mathbf{E}`. The additionnal term encodes a Gaussian noise and makes the numerical convergence more reliable. This class also implements a time-consistency penalization for time series, parametrized by the :math:`\eta_k`and :math:`H_k`. By defining :math:`\Vert \mathbf{MH_k} \Vert_p` is either :math:`\Vert \mathbf{MH_k} \Vert_1` or :math:`\Vert \mathbf{MH_k} \Vert_F^2`, the optimisation problem is the following +The class :class:`RpcaNoisy` implements a recommanded improved version, which relies on a decomposition :math:`\mathbf{D} = \mathbf{M} + \mathbf{A} + \mathbf{E}`. The additional term encodes a Gaussian noise and makes the numerical convergence more reliable. This class also implements a time-consistency penalization for time series, parametrized by the :math:`\eta_k`and :math:`H_k`. By defining :math:`\Vert \mathbf{MH_k} \Vert_p` is either :math:`\Vert \mathbf{MH_k} \Vert_1` or :math:`\Vert \mathbf{MH_k} \Vert_F^2`, the optimisation problem is the following .. 
math:: \text{min}_{\mathbf{M, A} \in \mathbb{R}^{m \times n}} \quad \frac 1 2 \Vert P_{\Omega} (\mathbf{D}-\mathbf{M}-\mathbf{A}) \Vert_F^2 + \tau \Vert \mathbf{M} \Vert_* + \lambda \Vert \mathbf{A} \Vert_1 + \sum_{k=1}^K \eta_k \Vert \mathbf{M H_k} \Vert_p From 184d0fc8f715a96f27347b5c6f9e32bb9a09a0b2 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:05:54 -0500 Subject: [PATCH 07/80] spelling: address Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorials/plot_tuto_categorical.py b/examples/tutorials/plot_tuto_categorical.py index dfabd812..a49fbe0b 100644 --- a/examples/tutorials/plot_tuto_categorical.py +++ b/examples/tutorials/plot_tuto_categorical.py @@ -57,7 +57,7 @@ # %% # The third approach uses ImputerRegressor which imputes iteratively each column using the other # ones. The function make_robust_MixteHGB provides an underlying model able to: -# - adress both numerical targets (regression) and categorical targets (classification) +# - address both numerical targets (regression) and categorical targets (classification) # - manage categorical features though one hot encoding # - manage missing features (native to the HistGradientBoosting) From 18421871f190381027ac711054e2f569ffd71983 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:08:30 -0500 Subject: [PATCH 08/80] spelling: alternative Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- HISTORY.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.rst b/HISTORY.rst index dd2691bd..77f55382 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -69,7 +69,7 @@ History * VAR(p) EM sampler implemented, founding on a VAR(p) modelization such as the one described in `Lütkepohl (2005) New Introduction to Multiple 
Time Series Analysis` * EM and RPCA matrices transposed in the low-level impelmentation, however the API remains unchanged * Sparse matrices introduced in the RPCA implementation so as to speed up the execution -* Implementation of SoftImpute, which provides a fast but less robust alterantive to RPCA +* Implementation of SoftImpute, which provides a fast but less robust alternative to RPCA * Implementation of TabDDPM and TsDDPM, which are diffusion-based models for tabular data and time-series data, based on Denoising Diffusion Probabilistic Models. Their implementations follow the work of Tashiro et al., (2021) and Kotelnikov et al., (2023). * ImputerDiffusion is an imputer-wrapper of these two models TabDDPM and TsDDPM. * Docstrings and tests improved for the EM sampler From 084a36524b529006e0961dc2403dc60b8329a92a Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 19:58:49 -0500 Subject: [PATCH 09/80] spelling: an Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- docs/explanation.rst | 2 +- qolmat/imputations/imputers.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/explanation.rst b/docs/explanation.rst index fbf3e6f6..b80bffcd 100644 --- a/docs/explanation.rst +++ b/docs/explanation.rst @@ -117,7 +117,7 @@ The observations are said to be Missing at Random (MAR) if the probability of an Finally, the observations are said to be Missing Not at Random (MNAR) in all other cases, i.e. if :math:`P(M | X_{obs}, X_{mis}, \psi)` does not simplify. -Qolmat allows to generate new missing values on a an existing dataset, but only in the MCAR case. +Qolmat allows to generate new missing values on an existing dataset, but only in the MCAR case. Here are the different classes to generate missing data. We recommend the last 3 for time series. 
diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py index a4f95b07..fa2bc2d0 100644 --- a/qolmat/imputations/imputers.py +++ b/qolmat/imputations/imputers.py @@ -232,7 +232,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: def fit_transform( self, X: pd.DataFrame, y: pd.DataFrame = None ) -> pd.DataFrame: - """Return a imputed dataframe. + """Return an imputed dataframe. The retruned df has same shape as `X`, with unchanged values, but all nans are replaced by non-nan values. @@ -1266,7 +1266,7 @@ class ImputerMICE(_Imputer): """MICE imputer. Wrapper of the class sklearn.impute.IterativeImputer in our framework. - This imputer relies on a estimator which is iterative. + This imputer relies on an estimator which is iterative. Parameters ---------- From e955c4d4567a674c71517f46d24f820afcd24744 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:13:08 -0500 Subject: [PATCH 10/80] spelling: approaches Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorials/plot_tuto_categorical.py b/examples/tutorials/plot_tuto_categorical.py index a49fbe0b..62d46e3b 100644 --- a/examples/tutorials/plot_tuto_categorical.py +++ b/examples/tutorials/plot_tuto_categorical.py @@ -68,7 +68,7 @@ # %% # 3. Mixed type model selection # --------------------------------------------------------------- -# Let us now compare these three aproaches by measuring their ability to impute uniformly +# Let us now compare these three approaches by measuring their ability to impute uniformly # distributed holes. 
dict_imputers = { From de1f2256b4370519d1ccb95bde8d0091442af347 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 19:59:47 -0500 Subject: [PATCH 11/80] spelling: are Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/imputers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py index fa2bc2d0..9156908c 100644 --- a/qolmat/imputations/imputers.py +++ b/qolmat/imputations/imputers.py @@ -2068,7 +2068,7 @@ class ImputerEM(_Imputer): method : {'multinormal', 'VAR'}, default='multinormal' Method defining the hypothesis made on the data distribution. Possible values: - - 'multinormal' : the data points a independent and uniformly + - 'multinormal' : the data points are independent and uniformly distributed following a multinormal distribution - 'VAR' : the data is a time series modeled by a VAR(p) process columnwise : bool From 2842132086b7fff7deed51d8511c970621c5a925 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:54:51 -0500 Subject: [PATCH 12/80] spelling: array-like Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/benchmark/missing_patterns.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qolmat/benchmark/missing_patterns.py b/qolmat/benchmark/missing_patterns.py index 097ac9c2..737a57c7 100644 --- a/qolmat/benchmark/missing_patterns.py +++ b/qolmat/benchmark/missing_patterns.py @@ -338,7 +338,7 @@ def generate_hole_sizes( return list_sizes def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame: - """Create missing data in an arraylike object based on a markov chain. + """Create missing data in an array-like object based on a markov chain. States of the MC are the different masks of missing values: there are at most pow(2,X.shape[1]) possible states. 
@@ -729,7 +729,7 @@ def generate_multi_realisation( return realisations def generate_mask(self, X: pd.DataFrame) -> List[pd.DataFrame]: - """Create missing data in an arraylike object based on a markov chain. + """Create missing data in an array-like object based on a markov chain. States of the MC are the different masks of missing values: there are at most pow(2,X.shape[1]) possible states. From 45909a33487349c6fa6a636f172ed803b8d8c35e Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 19:57:05 -0500 Subject: [PATCH 13/80] spelling: at Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_benchmark_TS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorials/plot_tuto_benchmark_TS.py b/examples/tutorials/plot_tuto_benchmark_TS.py index 27b4ec18..299373d5 100644 --- a/examples/tutorials/plot_tuto_benchmark_TS.py +++ b/examples/tutorials/plot_tuto_benchmark_TS.py @@ -51,7 +51,7 @@ df = data.add_holes(df_data, ratio_masked=0.15, mean_size=50) df[["DEWP", "RAIN"]] = df_data[["DEWP", "RAIN"]] # %% -# Let's take a look a one station, for instance "Aotizhongxin" +# Let's take a look at one station, for instance "Aotizhongxin" station = "Aotizhongxin" fig, ax = plt.subplots(len(cols_to_impute), 1, figsize=(13, 8)) From 293f0f7846ebe1f45aac23ca19be0c18923a1e05 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:13:37 -0500 Subject: [PATCH 14/80] spelling: augmented Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/rpca/rpca_pcp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/rpca/rpca_pcp.py b/qolmat/imputations/rpca/rpca_pcp.py index e018dbf9..cdfaf2bc 100644 --- a/qolmat/imputations/rpca/rpca_pcp.py +++ b/qolmat/imputations/rpca/rpca_pcp.py @@ -93,7 +93,7 @@ def decompose(self, D: NDArray, Omega: 
NDArray) -> Tuple[NDArray, NDArray]: """Estimate the relevant parameters. It computes the PCP RPCA decomposition, using the - Augumented Largrangian Multiplier (ALM) + Augmented Largrangian Multiplier (ALM) Parameters ---------- From 998fe9e24edcf15476d97c23d29c5baedd0850b9 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:31:18 -0500 Subject: [PATCH 15/80] spelling: between Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/benchmark/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py index 2e13defa..ea15636d 100644 --- a/qolmat/benchmark/metrics.py +++ b/qolmat/benchmark/metrics.py @@ -245,7 +245,7 @@ def weighted_mean_absolute_percentage_error( def accuracy( df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame ) -> pd.Series: - """Compute the matching ratio beetween the two datasets. + """Compute the matching ratio between the two datasets. Parameters ---------- @@ -271,7 +271,7 @@ def accuracy( def accuracy_1D(values1: pd.Series, values2: pd.Series) -> float: - """Compute the matching ratio beetween the set of values. + """Compute the matching ratio between the set of values. 
Parameters ---------- From 75e111c3b31ba8712f496455434f53ad58bb6ce6 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:31:44 -0500 Subject: [PATCH 16/80] spelling: bias Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- HISTORY.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.rst b/HISTORY.rst index 77f55382..7ee4e16d 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -41,7 +41,7 @@ History * RPCA algorithms now start with a normalizing scaler * The EM algorithms now include a gradient projection step to be more robust to colinearity * The EM algorithm based on the Gaussian model is now initialized using a robust estimation of the covariance matrix -* A bug in the EM algorithm has been patched: the normalizing matrix gamma was creating a sampling biais +* A bug in the EM algorithm has been patched: the normalizing matrix gamma was creating a sampling bias * Speed up of the EM algorithm likelihood maximization, using the conjugate gradient method * The ImputeRegressor class now handles the nans by `row` by default * The metric `frechet` was not correctly called and has been patched From eb8f22ba1dca0d7e6492748771e12e86e355cf14 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:32:09 -0500 Subject: [PATCH 17/80] spelling: building Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/diffusions/ddpms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/diffusions/ddpms.py b/qolmat/imputations/diffusions/ddpms.py index b1c18c01..b19f084d 100644 --- a/qolmat/imputations/diffusions/ddpms.py +++ b/qolmat/imputations/diffusions/ddpms.py @@ -115,7 +115,7 @@ def __init__( self.sqrt_alpha = torch.sqrt(self.alpha) self.std_beta = torch.sqrt(self.beta) - # Hyper-parameters for bulding and training the model + # Hyper-parameters for building and 
training the model self.loss_func = torch.nn.MSELoss(reduction="none") self.lr = lr From 4d4d47ea1621f234a2ecb81d20641cc301e41c72 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:34:07 -0500 Subject: [PATCH 18/80] spelling: class Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/rpca/rpca_noisy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/rpca/rpca_noisy.py b/qolmat/imputations/rpca/rpca_noisy.py index 7809c87a..9239223f 100644 --- a/qolmat/imputations/rpca/rpca_noisy.py +++ b/qolmat/imputations/rpca/rpca_noisy.py @@ -20,7 +20,7 @@ class RpcaNoisy(RPCA): - """Clas for a noisy version of the so-called 'improved RPCA'. + """Class for a noisy version of the so-called 'improved RPCA'. References ---------- From 23db9517b636c7fe32d285f9f19abf7692d041fd Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:34:29 -0500 Subject: [PATCH 19/80] spelling: columns Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/benchmark/comparator.py | 2 +- qolmat/imputations/imputers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/qolmat/benchmark/comparator.py b/qolmat/benchmark/comparator.py index 46bfdb7a..7dfa748f 100644 --- a/qolmat/benchmark/comparator.py +++ b/qolmat/benchmark/comparator.py @@ -224,7 +224,7 @@ def compare( ------- pd.DataFrame DataFrame (2-level index) with results. - Columsn are imputers. + Columns are imputers. 0-level index are the metrics. 1-level index are the column names. diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py index 9156908c..ae5e67ab 100644 --- a/qolmat/imputations/imputers.py +++ b/qolmat/imputations/imputers.py @@ -1383,7 +1383,7 @@ class ImputerRegressor(_Imputer): This class implements a regression imputer in the multivariate case. 
It imputes each column using a single fit-predict for a given estimator, - based on the colunms which have no missing values. + based on the columns which have no missing values. Parameters ---------- From c129699769ebe11dca187f45f54897682ae567ee Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:35:15 -0500 Subject: [PATCH 20/80] spelling: compute Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/em_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py index d6845079..432d9a33 100644 --- a/qolmat/imputations/em_sampler.py +++ b/qolmat/imputations/em_sampler.py @@ -1076,7 +1076,7 @@ def gradient_X_loglik(self, X: NDArray) -> NDArray: return grad_1 + grad_2 def get_gamma(self, n_cols: int) -> NDArray: - """Compue gamma. + """Compute gamma. If the noise matrix is not full-rank, defines the projection matrix keeping the sampling process in the relevant subspace. From 82a15eae3d494b288f1873666e938e9aa9c56bc2 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:36:33 -0500 Subject: [PATCH 21/80] spelling: conditional Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- docs/imputers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/imputers.rst b/docs/imputers.rst index 410902a8..f011fc43 100644 --- a/docs/imputers.rst +++ b/docs/imputers.rst @@ -79,7 +79,7 @@ This process is characterized by a time step :math:`h`. Given an initial station where :math:`Z_n` is a vector of independant standard normal random variables and :math:`L` is the log-likelihood. The sampled distribution tends to the target one in the limit :math:`h \rightarrow 0` and the number of iterations :math:`n \rightarrow \infty`. 
-Sampling from the conditionnal distribution :math:`p(\mathbf{X}_{mis} \vert \mathbf{X}_{obs} ; \theta^{(n)})` (see MCEM [6]) is achieved by projecting the samples at each step. +Sampling from the conditional distribution :math:`p(\mathbf{X}_{mis} \vert \mathbf{X}_{obs} ; \theta^{(n)})` (see MCEM [6]) is achieved by projecting the samples at each step. .. math:: X_n = Proj_{obs} \left( \widetilde X_n \right), From 544dcb89fbf0cc30bd61658f68b00af28d182ab5 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:36:24 -0500 Subject: [PATCH 22/80] spelling: conditionally Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- docs/imputers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/imputers.rst b/docs/imputers.rst index f011fc43..0d58ba03 100644 --- a/docs/imputers.rst +++ b/docs/imputers.rst @@ -71,7 +71,7 @@ Suppose the data :math:`\mathbf{X}` has a density :math:`p_\theta` parametrized **Expectation** -Draw samples of :math:`\mathbf{X}` assuming a fixed :math:`\theta`, conditionnaly on the values of :math:`\mathbf{X}_\mathrm{obs}`. This is done by MCMC using a projected Langevin algorithm. +Draw samples of :math:`\mathbf{X}` assuming a fixed :math:`\theta`, conditionally on the values of :math:`\mathbf{X}_\mathrm{obs}`. This is done by MCMC using a projected Langevin algorithm. This process is characterized by a time step :math:`h`. Given an initial station :math:`X_0`, one can update the state at iteration *t* as .. 
math:: From a45cb6d9852e5f22e110a8ff98247f6f078eb55f Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:36:53 -0500 Subject: [PATCH 23/80] spelling: conditioning Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/em_sampler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py index 432d9a33..2afdf2c4 100644 --- a/qolmat/imputations/em_sampler.py +++ b/qolmat/imputations/em_sampler.py @@ -424,7 +424,7 @@ def fit_X(self, X: NDArray) -> None: # first imputation X_imp = self.init_imputation(X) - self._check_conditionning(X_imp) + self._check_conditioning(X_imp) self.fit_parameters_with_missingness(X) @@ -563,7 +563,7 @@ def pretreatment(self, X, mask_na) -> Tuple[NDArray, NDArray]: """ return X, mask_na - def _check_conditionning(self, X: NDArray): + def _check_conditioning(self, X: NDArray): """Check that the data matrix X is not ill-conditioned. 
Running the EM algorithm on data with colinear columns leads to From d613f6a80ee979c5e3d83a302366c9311820c0b9 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:37:11 -0500 Subject: [PATCH 24/80] spelling: conjugate Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- tests/imputations/test_em_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/imputations/test_em_sampler.py b/tests/imputations/test_em_sampler.py index 37e7520e..03e4a0c6 100644 --- a/tests/imputations/test_em_sampler.py +++ b/tests/imputations/test_em_sampler.py @@ -93,7 +93,7 @@ def generate_varp_process(d=3, n=10000, p=1): "A, mask", [(A, mask)], ) -def test_gradient_conjugue( +def test_gradient_conjugate( A: NDArray, mask: NDArray, ) -> None: From 83672b9edc0a83063f23a9abfd3c98fd885398a1 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:38:01 -0500 Subject: [PATCH 25/80] spelling: consistent Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- HISTORY.rst | 2 +- tests/utils/test_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 7ee4e16d..e313698e 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -5,7 +5,7 @@ History 0.1.10 (2024-??-??) 
------------------ * Long EM and RPCA operations wrapped with tqdm progress bars -* Readme code sample updated, and results table made consistant +* Readme code sample updated, and results table made consistent 0.1.9 (2024-08-29) ------------------ diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 4f048d10..489b607d 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -143,7 +143,7 @@ def test_utils_prepare_data_2D_uneven(X: NDArray): @pytest.mark.parametrize("X", [X_incomplete]) -def test_utils_prepare_data_consistant(X: NDArray): +def test_utils_prepare_data_consistent(X: NDArray): result1 = utils.prepare_data(X, 1) result2 = utils.prepare_data(result1, 2) result3 = utils.prepare_data(X, 2) From fe72ca27da00c6de537a42310b1c9c6aafc616c7 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:38:52 -0500 Subject: [PATCH 26/80] spelling: criteria Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/em_sampler.py | 4 ++-- qolmat/imputations/rpca/rpca_noisy.py | 2 +- qolmat/imputations/rpca/rpca_pcp.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py index 2afdf2c4..45acf212 100644 --- a/qolmat/imputations/em_sampler.py +++ b/qolmat/imputations/em_sampler.py @@ -745,7 +745,7 @@ def get_gamma(self, n_cols: int) -> NDArray: return gamma def update_criteria_stop(self, X: NDArray): - """Update the variables to compute the stopping critera. + """Update the variables to compute the stopping criteria. Parameters ---------- @@ -1103,7 +1103,7 @@ def get_gamma(self, n_cols: int) -> NDArray: return gamma def update_criteria_stop(self, X: NDArray): - """Update the variable to compute the stopping critera. + """Update the variable to compute the stopping criteria. 
Parameters ---------- diff --git a/qolmat/imputations/rpca/rpca_noisy.py b/qolmat/imputations/rpca/rpca_noisy.py index 9239223f..c193d98d 100644 --- a/qolmat/imputations/rpca/rpca_noisy.py +++ b/qolmat/imputations/rpca/rpca_noisy.py @@ -54,7 +54,7 @@ class RpcaNoisy(RPCA): stopping criteria, maximum number of iterations. By default, the value is set to 10_000 tolerance: Optional[float] - stoppign critera, minimum difference between 2 consecutive iterations. + stoppign criteria, minimum difference between 2 consecutive iterations. By default, the value is set to 1e-6 norm: Optional[str] error norm, can be "L1" or "L2". By default, the value is set to "L2" diff --git a/qolmat/imputations/rpca/rpca_pcp.py b/qolmat/imputations/rpca/rpca_pcp.py index cdfaf2bc..0e0f84da 100644 --- a/qolmat/imputations/rpca/rpca_pcp.py +++ b/qolmat/imputations/rpca/rpca_pcp.py @@ -43,7 +43,7 @@ class RpcaPcp(RPCA): stopping criteria, maximum number of iterations. By default, the value is set to 10_000 tolerance: Optional[float] - stoppign critera, minimum difference between 2 consecutive iterations. + stoppign criteria, minimum difference between 2 consecutive iterations. By default, the value is set to 1e-6 verbose: Optional[bool] verbosity level, if False the warnings are silenced From 6d2c67c4cf6b98472bbf7ae5e3209d1e52bb15bd Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:39:41 -0500 Subject: [PATCH 27/80] spelling: dataframes Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_benchmark_TS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorials/plot_tuto_benchmark_TS.py b/examples/tutorials/plot_tuto_benchmark_TS.py index 299373d5..75c90666 100644 --- a/examples/tutorials/plot_tuto_benchmark_TS.py +++ b/examples/tutorials/plot_tuto_benchmark_TS.py @@ -183,7 +183,7 @@ # We can also check the covariance. 
We simply plot one variable versus one another. # One observes the methods provide similar visual resuls: it's difficult to compare # them based on this criterion, except the median imputation that greatly differs. -# Black points and ellipses are original datafames +# Black points and ellipses are original dataframes # whiel colored ones are imputed dataframes. n_columns = len(dfs_imputed_station) From fbb20483e4fdd84b229c994c15e95d6443364f77 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:41:14 -0500 Subject: [PATCH 28/80] spelling: dataset Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/utils/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/utils/data.py b/qolmat/utils/data.py index 5c72482f..fe62075c 100644 --- a/qolmat/utils/data.py +++ b/qolmat/utils/data.py @@ -324,7 +324,7 @@ def get_data( def preprocess_data_beijing(df: pd.DataFrame) -> pd.DataFrame: - """Preprocess data from the "Beijing" datset. + """Preprocess data from the "Beijing" dataset. Parameters ---------- From 1a4522b5ef78726906ee37aba710e67c0d1bfad0 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:40:50 -0500 Subject: [PATCH 29/80] spelling: datetime Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/diffusions/ddpms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/diffusions/ddpms.py b/qolmat/imputations/diffusions/ddpms.py index b19f084d..c540851d 100644 --- a/qolmat/imputations/diffusions/ddpms.py +++ b/qolmat/imputations/diffusions/ddpms.py @@ -1022,7 +1022,7 @@ def fit( if index_datetime == "": raise ValueError( "Please set the params index_datetime " - "(the name of datatime-like index column). " + "(the name of datetime-like index column). 
" f" Suggestions: {x.index.names}" ) self.index_datetime = index_datetime From 419f79266dd144385272202faafe520be9979ee3 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:41:38 -0500 Subject: [PATCH 30/80] spelling: default Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_diffusion_models.py | 6 +++--- examples/tutorials/plot_tuto_mcar.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/tutorials/plot_tuto_diffusion_models.py b/examples/tutorials/plot_tuto_diffusion_models.py index 398712e6..e6b2ec3b 100644 --- a/examples/tutorials/plot_tuto_diffusion_models.py +++ b/examples/tutorials/plot_tuto_diffusion_models.py @@ -54,12 +54,12 @@ # # * ``cols_imputed``: list of columns that need to be imputed. Recall that we train the model on # incomplete data by using the self-supervised learning method. We can set which columns to be -# masked during training. Its defaut value is ``None``. +# masked during training. Its default value is ``None``. # -# * ``epochs`` : a number of iterations, its defaut value ``epochs=10``. In practice, we should +# * ``epochs`` : a number of iterations, its default value ``epochs=10``. In practice, we should # set a larger number of epochs e.g., ``epochs=100``. # -# * ``batch_size`` : a size of batch, its defaut value ``batch_size=100``. +# * ``batch_size`` : a size of batch, its default value ``batch_size=100``. # # The following hyperparams are for validation: # diff --git a/examples/tutorials/plot_tuto_mcar.py b/examples/tutorials/plot_tuto_mcar.py index 8e122007..a9857ce8 100644 --- a/examples/tutorials/plot_tuto_mcar.py +++ b/examples/tutorials/plot_tuto_mcar.py @@ -266,7 +266,7 @@ # To use the PKLM test properly, it may be necessary to understand the use of hyper-parameters. # # * ``nb_projections``: Number of projections on which the test statistic is calculated. 
This -# parameter has the greatest influence on test calculation time. Its defaut value +# parameter has the greatest influence on test calculation time. Its default value # ``nb_projections=100``. # Est-ce qu'on donne des ordres de grandeurs utiles ? J'avais un peu fait ce travail. # From e907399e8791cf60afc5d663174f4e5a5a6d0fc7 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:44:33 -0500 Subject: [PATCH 31/80] spelling: dictionary Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/utils/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/utils/plot.py b/qolmat/utils/plot.py index e9809425..0e4459f4 100644 --- a/qolmat/utils/plot.py +++ b/qolmat/utils/plot.py @@ -381,7 +381,7 @@ def plot_imputations( df : pd.DataFrame original dataframe dict_df_imputed : Dict[str, pd.DataFrame] - dictionnary of imputed dataframe for each imputers + dictionary of imputed dataframe for each imputers """ n_columns = len(df.columns) From b58d27b006f63cc3bb90500a5f78ed2ea3feab42 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:44:44 -0500 Subject: [PATCH 32/80] spelling: different Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- docs/analysis.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/analysis.rst b/docs/analysis.rst index a01b6fbf..5d1e8aad 100644 --- a/docs/analysis.rst +++ b/docs/analysis.rst @@ -16,7 +16,7 @@ Then Qolmat proposes two tests to determine whether the missing data mechanism i 2. How to use the results ------------------------- -At the end of the MCAR test, it can then be assumed whether the missing data mechanism is MCAR or not. This serves three differents purposes: +At the end of the MCAR test, it can then be assumed whether the missing data mechanism is MCAR or not. This serves three different purposes: a. 
Diagnosis ^^^^^^^^^^^^ From 9b669c892649044d65e736dd4e2aee1c57b07195 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:44:51 -0500 Subject: [PATCH 33/80] spelling: distribution Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/benchmark/metrics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py index ea15636d..07fb1152 100644 --- a/qolmat/benchmark/metrics.py +++ b/qolmat/benchmark/metrics.py @@ -902,9 +902,9 @@ def frechet_distance( Parameters ---------- df1 : pd.DataFrame - First empirical ditribution + First empirical distribution df2 : pd.DataFrame - Second empirical ditribution + Second empirical distribution df_mask : pd.DataFrame Mask indicating on which values the distance has to computed on method: str @@ -1093,9 +1093,9 @@ def distance_anticorr_pattern( Parameters ---------- df1 : pd.DataFrame - First empirical ditribution + First empirical distribution df2 : pd.DataFrame - Second empirical ditribution + Second empirical distribution df_mask : pd.DataFrame Mask indicating on which values the distance has to computed on min_n_rows: int From ea575627858888736301d909a8493d48c7515f69 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:59:18 -0500 Subject: [PATCH 34/80] spelling: element-wise Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/imputers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py index ae5e67ab..519f41cc 100644 --- a/qolmat/imputations/imputers.py +++ b/qolmat/imputations/imputers.py @@ -33,7 +33,7 @@ class _Imputer(_BaseImputer): If True, the imputer will be computed for each column, else it will be computed on the whole dataframe, by default False shrink : bool, optional - Indicates if the 
elementwise imputation method returns a single value, + Indicates if the element-wise imputation method returns a single value, by default False random_state : RandomSetting, optional Controls the randomness of the fit_transform, by default None From 5894c636fbf47422698e0b9a95ffb28b777d72b3 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:46:03 -0500 Subject: [PATCH 35/80] spelling: estimated Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/analysis/holes_characterization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/analysis/holes_characterization.py b/qolmat/analysis/holes_characterization.py index daa37b71..331c0866 100644 --- a/qolmat/analysis/holes_characterization.py +++ b/qolmat/analysis/holes_characterization.py @@ -596,7 +596,7 @@ def _parallel_process_permutation( Returns ------- float - esimtated statistic U_hat + estimated statistic U_hat """ y = self._build_label(X, M_perm, features_idx, target_idx) From 79009affff5ae85ed034e60f7b6eda41676e7c3e Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:46:37 -0500 Subject: [PATCH 36/80] spelling: explanation Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/diffusions/ddpms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/diffusions/ddpms.py b/qolmat/imputations/diffusions/ddpms.py index c540851d..b92b7fed 100644 --- a/qolmat/imputations/diffusions/ddpms.py +++ b/qolmat/imputations/diffusions/ddpms.py @@ -94,7 +94,7 @@ def __init__( # Section 2, equation 1, num_noise_steps is T. self.num_noise_steps = num_noise_steps - # Section 2, equation 4 and near explation for alpha, alpha hat, beta. + # Section 2, equation 4 and near explanation for alpha, alpha hat, beta. 
self.beta_start = beta_start self.beta_end = beta_end self.beta = torch.linspace( From 0cb129c79602f39c1b8ed08a3436fbc82e9656a8 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:48:13 -0500 Subject: [PATCH 37/80] spelling: function Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/diffusions/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/diffusions/base.py b/qolmat/imputations/diffusions/base.py index 1b6a9abd..3df13823 100644 --- a/qolmat/imputations/diffusions/base.py +++ b/qolmat/imputations/diffusions/base.py @@ -18,7 +18,7 @@ class ResidualBlock(torch.nn.Module): def __init__( self, dim_input: int, dim_embedding: int = 128, p_dropout: float = 0.0 ): - """Init funciton. + """Init function. Parameters ---------- From da9ee528fbdaa48eab6b1db4034b81ccafb0346f Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:48:41 -0500 Subject: [PATCH 38/80] spelling: globally Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorials/plot_tuto_categorical.py b/examples/tutorials/plot_tuto_categorical.py index 62d46e3b..a911c75d 100644 --- a/examples/tutorials/plot_tuto_categorical.py +++ b/examples/tutorials/plot_tuto_categorical.py @@ -101,5 +101,5 @@ results.loc["rmse"].style.highlight_min(color="lightgreen", axis=1) # %% -# The HGB imputation methods globaly reaches a better accuracy on the categorical data. +# The HGB imputation methods globally reaches a better accuracy on the categorical data. 
results.loc["accuracy"].style.highlight_max(color="lightgreen", axis=1) From 79950008caeea2e2038d74d36b15bd6b90d3bc77 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 22:00:19 -0500 Subject: [PATCH 39/80] spelling: hyperparameters Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_mcar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorials/plot_tuto_mcar.py b/examples/tutorials/plot_tuto_mcar.py index a9857ce8..e0a9a0c8 100644 --- a/examples/tutorials/plot_tuto_mcar.py +++ b/examples/tutorials/plot_tuto_mcar.py @@ -260,7 +260,7 @@ """ # %% -# 2.1 Parameters and Hyperparmaters +# 2.1 Parameters and Hyperparameters # ================================================ # # To use the PKLM test properly, it may be necessary to understand the use of hyper-parameters. From 09c5b5c45fdb059eff3b29414974ac8d54a7c078 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:02:23 -0500 Subject: [PATCH 40/80] spelling: id Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/imputers.py | 42 +++++++++++++------------- qolmat/imputations/imputers_pytorch.py | 8 ++--- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py index 519f41cc..49a999f5 100644 --- a/qolmat/imputations/imputers.py +++ b/qolmat/imputations/imputers.py @@ -408,7 +408,7 @@ def _fit_element( col : str, optional Column on which the imputer is fitted, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -440,7 +440,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group 
on which the method is applied Returns ------- @@ -586,7 +586,7 @@ def _fit_element( col : str, optional Column on which the imputer is fitted, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -620,7 +620,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -696,7 +696,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -779,7 +779,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -855,7 +855,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -952,7 +952,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -1085,7 +1085,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -1201,7 +1201,7 @@ def _fit_element( col : str, optional Column on which the imputer is fitted, by default "__all__" ngroup : int, optional - Id of 
the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -1239,7 +1239,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -1316,7 +1316,7 @@ def _fit_element( col : str, optional Column on which the imputer is fitted, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -1355,7 +1355,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -1503,7 +1503,7 @@ def _fit_element( col : str, optional Column on which the imputer is fitted, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -1557,7 +1557,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -1690,7 +1690,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -1839,7 +1839,7 @@ def _fit_element( col : str, optional Column on which the imputer is fitted, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -1887,7 +1887,7 @@ def _transform_element( col : str, optional Column transformed 
by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -2019,7 +2019,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -2176,7 +2176,7 @@ def _fit_element( col : str, optional Column on which the imputer is fitted, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -2210,7 +2210,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- diff --git a/qolmat/imputations/imputers_pytorch.py b/qolmat/imputations/imputers_pytorch.py index 35e4644f..24c7c8e6 100644 --- a/qolmat/imputations/imputers_pytorch.py +++ b/qolmat/imputations/imputers_pytorch.py @@ -348,7 +348,7 @@ def _fit_element( col : str, optional Column on which the imputer is fitted, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -387,7 +387,7 @@ def _transform_element( col : str, optional Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -793,7 +793,7 @@ def _fit_element( col : str, optional Column on which the imputer is fitted, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- @@ -828,7 +828,7 @@ def _transform_element( col : str, optional 
Column transformed by the imputer, by default "__all__" ngroup : int, optional - Id of the group on which the method is applied + ID of the group on which the method is applied Returns ------- From 33efcac81f8875a3b391e6ee7e32a56659c081bb Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:50:41 -0500 Subject: [PATCH 41/80] spelling: implementation Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- HISTORY.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.rst b/HISTORY.rst index e313698e..30f03569 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -67,7 +67,7 @@ History ------------------- * VAR(p) EM sampler implemented, founding on a VAR(p) modelization such as the one described in `Lütkepohl (2005) New Introduction to Multiple Time Series Analysis` -* EM and RPCA matrices transposed in the low-level impelmentation, however the API remains unchanged +* EM and RPCA matrices transposed in the low-level implementation, however the API remains unchanged * Sparse matrices introduced in the RPCA implementation so as to speed up the execution * Implementation of SoftImpute, which provides a fast but less robust alternative to RPCA * Implementation of TabDDPM and TsDDPM, which are diffusion-based models for tabular data and time-series data, based on Denoising Diffusion Probabilistic Models. Their implementations follow the work of Tashiro et al., (2021) and Kotelnikov et al., (2023). 
From e03c1a98b11998284515832ee314253fe16afc43 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:50:57 -0500 Subject: [PATCH 42/80] spelling: imputation Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/em_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py index 45acf212..18d2c084 100644 --- a/qolmat/imputations/em_sampler.py +++ b/qolmat/imputations/em_sampler.py @@ -119,7 +119,7 @@ def max_diff_Linf( class EM(BaseEstimator, TransformerMixin): - """Abstract class for EM imputatoin. + """Abstract class for EM imputation. It uses imputation through EM optimization and a projected MCMC sampling process. From 78471277f64196c292cb598eb3a791de4ce8dfa5 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:51:17 -0500 Subject: [PATCH 43/80] spelling: independent Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- docs/imputers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/imputers.rst b/docs/imputers.rst index 0d58ba03..1cb24a01 100644 --- a/docs/imputers.rst +++ b/docs/imputers.rst @@ -77,7 +77,7 @@ This process is characterized by a time step :math:`h`. Given an initial station .. math:: \widetilde X_n = X_{n-1} + \Gamma \nabla L_X(X_{n-1}, \theta_n) (X_{n-1} - \mu) h + (2 h \Gamma)^{1/2} Z_n, -where :math:`Z_n` is a vector of independant standard normal random variables and :math:`L` is the log-likelihood. +where :math:`Z_n` is a vector of independent standard normal random variables and :math:`L` is the log-likelihood. The sampled distribution tends to the target one in the limit :math:`h \rightarrow 0` and the number of iterations :math:`n \rightarrow \infty`. 
Sampling from the conditional distribution :math:`p(\mathbf{X}_{mis} \vert \mathbf{X}_{obs} ; \theta^{(n)})` (see MCEM [6]) is achieved by projecting the samples at each step. From dac680527223a432c6acd5858d00c92c78c05212 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 22:00:45 -0500 Subject: [PATCH 44/80] spelling: kullback Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/utils/algebra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/utils/algebra.py b/qolmat/utils/algebra.py index 42df0481..1a998ae1 100644 --- a/qolmat/utils/algebra.py +++ b/qolmat/utils/algebra.py @@ -83,7 +83,7 @@ def kl_divergence_gaussian_exact( Returns ------- float - Kulback-Leibler divergence + Kullback-Leibler divergence """ n_variables = len(means1) From b209d8f326ff31eb4d2c2c264990af518a30e3d0 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:46:05 -0500 Subject: [PATCH 45/80] spelling: libraries Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/benchmark.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/benchmark.md b/examples/benchmark.md index 08eb1c64..09492d62 100644 --- a/examples/benchmark.md +++ b/examples/benchmark.md @@ -17,7 +17,7 @@ jupyter: In Qolmat, a few data imputation methods are implemented as well as a way to evaluate their performance.** -First, import some useful librairies +First, import some useful libraries ```python tags=[] import warnings From fd85f13e31026cef52123671f38ed9f41c2c7eba Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:53:59 -0500 Subject: [PATCH 46/80] spelling: matrix Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/utils/algebra.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qolmat/utils/algebra.py 
b/qolmat/utils/algebra.py index 1a998ae1..18efd61e 100644 --- a/qolmat/utils/algebra.py +++ b/qolmat/utils/algebra.py @@ -74,11 +74,11 @@ def kl_divergence_gaussian_exact( means1: NDArray Mean of the first distribution cov1: NDArray - Covariance matrx of the first distribution + Covariance matrix of the first distribution means2: NDArray Mean of the second distribution cov2: NDArray - Covariance matrx of the second distribution + Covariance matrix of the second distribution Returns ------- From b7192d5340bd6fa72904cc1c9460d6f770a1c3cd Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:54:35 -0500 Subject: [PATCH 47/80] spelling: method Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_hole_generator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/tutorials/plot_tuto_hole_generator.py b/examples/tutorials/plot_tuto_hole_generator.py index 7d52ab56..341c0beb 100644 --- a/examples/tutorials/plot_tuto_hole_generator.py +++ b/examples/tutorials/plot_tuto_hole_generator.py @@ -224,7 +224,7 @@ def plot_cdf( # The holes are generated following a Markov 1D process. # Holes are created column by column. The transition matrix of the # one-dimensional Markov process is learned from the data. -# This metohd is implemented in the +# This method is implemented in the # :class:`~qolmat.benchmark.missing_patterns.UniformHoleGenerator` class. geometric_generator = missing_patterns.GeometricHoleGenerator( @@ -248,7 +248,7 @@ def plot_cdf( # The distribution of holes is learned from the data. # The distributions of holes are learned column by column; so you need to fit # the generator to the data. -# This metohd is implemented in the +# This method is implemented in the # :class:`~qolmat.benchmark.missing_patterns.EmpiricalHoleGenerator` class. 
# We specify ``groups=("station",)`` which means a distribution # is learned on each group: here on each station. @@ -275,7 +275,7 @@ def plot_cdf( # Each line of the dataframe mask (np.nan) represents a state of the Markov chain. # Note it is also more difficult to achieve exactly the required # missing data ratio. -# This metohd is implemented in the +# This method is implemented in the # :class:`~qolmat.benchmark.missing_patterns.MultiMarkovHoleGenerator` class. multi_markov_generator = missing_patterns.MultiMarkovHoleGenerator( @@ -298,7 +298,7 @@ def plot_cdf( # e. Grouped Hole Generator # *************************************************************** # The holes are generated according to the groups defined by the user. -# This metohd is implemented in the +# This method is implemented in the # :class:`~qolmat.benchmark.missing_patterns.GroupedHoleGenerator` class. grouped_generator = missing_patterns.GroupedHoleGenerator( From 3f0626251b916fa9e143a58cf87506f0270f062d Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:56:55 -0500 Subject: [PATCH 48/80] spelling: multi Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_hole_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorials/plot_tuto_hole_generator.py b/examples/tutorials/plot_tuto_hole_generator.py index 341c0beb..7f076bf5 100644 --- a/examples/tutorials/plot_tuto_hole_generator.py +++ b/examples/tutorials/plot_tuto_hole_generator.py @@ -330,6 +330,6 @@ def plot_cdf( multi_markov_mask, grouped_mask, ], - ["uniform", "geometric", "empirical", "mutli markov", "grouped"], + ["uniform", "geometric", "empirical", "multi markov", "grouped"], ["tab:orange", "tab:blue", "tab:green", "tab:pink", "tab:olive"], ) From 181697c00daf15aa6982376970805d85020203dc Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 
21:01:03 -0500 Subject: [PATCH 49/80] spelling: original Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/benchmark/comparator.py | 2 +- qolmat/utils/plot.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/qolmat/benchmark/comparator.py b/qolmat/benchmark/comparator.py index 7dfa748f..77cf5fb3 100644 --- a/qolmat/benchmark/comparator.py +++ b/qolmat/benchmark/comparator.py @@ -69,7 +69,7 @@ def get_errors( Parameters ---------- df_origin : pd.DataFrame - reference/orginal signal + reference/original signal df_imputed : pd.DataFrame imputed signal df_mask : pd.DataFrame diff --git a/qolmat/utils/plot.py b/qolmat/utils/plot.py index 0e4459f4..cead6d7a 100644 --- a/qolmat/utils/plot.py +++ b/qolmat/utils/plot.py @@ -112,7 +112,7 @@ def plot_images( Parameters ---------- M : np.ndarray - orginal array + original array A : np.ndarray background array E : np.ndarray From 469733d5688c18a0172a5d85bdba7948dfd827cb Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:05:17 -0500 Subject: [PATCH 50/80] spelling: percentage Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_hole_generator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/tutorials/plot_tuto_hole_generator.py b/examples/tutorials/plot_tuto_hole_generator.py index 7f076bf5..5d279fa8 100644 --- a/examples/tutorials/plot_tuto_hole_generator.py +++ b/examples/tutorials/plot_tuto_hole_generator.py @@ -206,7 +206,7 @@ def plot_cdf( ) uniform_mask = uniform_generator.split(df)[0] -print("Pourcentage of additional missing values:") +print("Percentage of additional missing values:") print(round((uniform_mask.sum() / len(uniform_mask)) * 100, 2)) visualise_missing_values(df, uniform_mask) @@ -232,7 +232,7 @@ def plot_cdf( ) geometric_mask = geometric_generator.split(df)[0] -print("Pourcentage of additional missing values:") 
+print("Percentage of additional missing values:") print(round((geometric_mask.sum() / len(geometric_mask)) * 100, 2)) visualise_missing_values(df, geometric_mask) @@ -258,7 +258,7 @@ def plot_cdf( ) empirical_mask = empirical_generator.split(df)[0] -print("Pourcentage of additional missing values:") +print("Percentage of additional missing values:") print(round((empirical_mask.sum() / len(empirical_mask)) * 100, 2)) visualise_missing_values(df, empirical_mask) @@ -283,7 +283,7 @@ def plot_cdf( ) multi_markov_mask = multi_markov_generator.split(df)[0] -print("Pourcentage of additional missing values:") +print("Percentage of additional missing values:") print(round((multi_markov_mask.sum() / len(multi_markov_mask)) * 100, 2)) visualise_missing_values(df, multi_markov_mask) @@ -306,7 +306,7 @@ def plot_cdf( ) grouped_mask = grouped_generator.split(df)[0] -print("Pourcentage of additional missing values:") +print("Percentage of additional missing values:") print(round((grouped_mask.sum() / len(grouped_mask)) * 100, 2)) visualise_missing_values(df, grouped_mask) From 7aa1049b97370c3afe09c18c7b5cb670d689dd6f Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:02:41 -0500 Subject: [PATCH 51/80] spelling: performance Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/benchmark/hyperparameters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qolmat/benchmark/hyperparameters.py b/qolmat/benchmark/hyperparameters.py index 7aa4a24b..640e4e42 100644 --- a/qolmat/benchmark/hyperparameters.py +++ b/qolmat/benchmark/hyperparameters.py @@ -38,7 +38,7 @@ def get_objective( Generator creating the masked values in the nested cross validation allowing to measure the imputer performance metric: str - Metric used as perfomance indicator, common values are `mse` and `mae` + Metric used as performance indicator, common values are `mse` and `mae` names_hyperparams: List[str] List 
of the names of the hyperparameters which are being optimized @@ -98,7 +98,7 @@ def optimize( Generator creating the masked values in the nested cross validation allowing to measure the imputer performance metric: str - Metric used as perfomance indicator, common values are `mse` and `mae` + Metric used as performance indicator, common values are `mse` and `mae` dict_config: Dict[str, HyperValue] Search space for the tested hyperparameters max_evals: int From 6b0e5bad94bc9e5ee68cf0b12d2cb2028262ddd8 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:02:54 -0500 Subject: [PATCH 52/80] spelling: perturbation Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- docs/imputers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/imputers.rst b/docs/imputers.rst index 1cb24a01..af34d2ea 100644 --- a/docs/imputers.rst +++ b/docs/imputers.rst @@ -113,7 +113,7 @@ Two parametric distributions are implemented: :class:`~qolmat.imputations.diffusions.ddpms.TabDDPM` is a deep learning imputer based on Denoising Diffusion Probabilistic Models (DDPMs) [8] for handling multivariate tabular data. Our implementation mainly follows the works of [8, 9]. Diffusion models focus on modeling the process of data transitions from noisy and incomplete observations to the underlying true data. They include two main processes: -* Forward process perturbs observed data to noise until all the original data structures are lost. The pertubation is done over a series of steps. Let :math:`X_{obs}` be observed data, :math:`T` be the number of steps that noises :math:`\epsilon \sim N(0,I)` are added into the observed data. Therefore, :math:`X_{obs}^t = \bar{\alpha}_t \times X_{obs} + \sqrt{1-\bar{\alpha}_t} \times \epsilon` where :math:`\bar{\alpha}_t` controls the right amount of noise. +* Forward process perturbs observed data to noise until all the original data structures are lost. 
The perturbation is done over a series of steps. Let :math:`X_{obs}` be observed data, :math:`T` be the number of steps that noises :math:`\epsilon \sim N(0,I)` are added into the observed data. Therefore, :math:`X_{obs}^t = \bar{\alpha}_t \times X_{obs} + \sqrt{1-\bar{\alpha}_t} \times \epsilon` where :math:`\bar{\alpha}_t` controls the right amount of noise. * Reverse process removes noise and reconstructs the observed data. At each step :math:`t`, we train an autoencoder :math:`\epsilon_\theta` based on ResNet [10] to predict the added noise :math:`\epsilon_t` based on the rest of the observed data. The objective function is the error between the noise added in the forward process and the noise predicted by :math:`\epsilon_\theta`. In training phase, we use the self-supervised learning method of [9] to train incomplete data. In detail, our model randomly masks a part of observed data and computes loss from these masked data. Moving on to the inference phase, (1) missing data are replaced by Gaussian noises :math:`\epsilon \sim N(0,I)`, (2) at each noise step from :math:`T` to 0, our model denoises these missing data based on :math:`\epsilon_\theta`. 
From 1c6d12ac0d568aa8b54b2a6c5b3346d5aad96433 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:03:11 -0500 Subject: [PATCH 53/80] spelling: perturbed Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- tests/imputations/test_em_sampler.py | 14 +++++++------- tests/imputations/test_preprocessing.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/imputations/test_em_sampler.py b/tests/imputations/test_em_sampler.py index 03e4a0c6..16b5c674 100644 --- a/tests/imputations/test_em_sampler.py +++ b/tests/imputations/test_em_sampler.py @@ -367,13 +367,13 @@ def test_multinormal_em_minimize_llik(): llikelihood_imputed = imputer.get_loglikelihood(X_imputed) for _ in range(10): Delta = imputer.rng.uniform(0, 1, size=X.shape) - X_perturbated = X_imputed + Delta - llikelihood_perturbated = imputer.get_loglikelihood(X_perturbated) - assert llikelihood_perturbated < llikelihood_imputed - X_perturbated = X - X_perturbated[np.isnan(X)] = 0 - llikelihood_perturbated = imputer.get_loglikelihood(X_perturbated) - assert llikelihood_perturbated < llikelihood_imputed + X_perturbed = X_imputed + Delta + llikelihood_perturbed = imputer.get_loglikelihood(X_perturbed) + assert llikelihood_perturbed < llikelihood_imputed + X_perturbed = X + X_perturbed[np.isnan(X)] = 0 + llikelihood_perturbed = imputer.get_loglikelihood(X_perturbed) + assert llikelihood_perturbed < llikelihood_imputed @pytest.mark.parametrize("method", ["sample", "mle"]) diff --git a/tests/imputations/test_preprocessing.py b/tests/imputations/test_preprocessing.py index a05fffdb..c4500deb 100644 --- a/tests/imputations/test_preprocessing.py +++ b/tests/imputations/test_preprocessing.py @@ -127,10 +127,10 @@ def test_inverse_transform_OneHotEncoderProjector(encoder): df_back = encoder.inverse_transform(df_dum) pd.testing.assert_frame_equal(df, df_back) - df_dum_perturbated = df_dum + np.random.uniform( + 
df_dum_perturbed = df_dum + np.random.uniform( -0.5, 0.5, size=df_dum.shape ) - df_back = encoder.inverse_transform(df_dum_perturbated) + df_back = encoder.inverse_transform(df_dum_perturbed) pd.testing.assert_frame_equal(df, df_back) From afb87da2c7c0ad5c0763475ea15268acdddeb87a Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:07:31 -0500 Subject: [PATCH 54/80] spelling: practice Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/benchmark.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/benchmark.md b/examples/benchmark.md index 09492d62..f3aa0297 100644 --- a/examples/benchmark.md +++ b/examples/benchmark.md @@ -102,7 +102,7 @@ All presented methods are group-wise: here each station is imputed independently **Hyperparameters' search**: Some methods require hyperparameters. The user can directly specify them, or rather determine them through an optimization step using the `search_params` dictionary. The keys are the imputation method's name and the values are a dictionary specifying the minimum, maximum or list of categories and type of values (Integer, Real, Category or a dictionary indexed by the variable names) to search. -In pratice, we rely on a cross validation to find the best hyperparams values minimizing an error reconstruction. +In practice, we rely on a cross validation to find the best hyperparams values minimizing an error reconstruction. 
```python tags=[] ratio_masked = 0.1 From 4d13296be83d4f9c3a425d35577e191e3ffce693 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:07:43 -0500 Subject: [PATCH 55/80] spelling: pressure Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- tests/utils/test_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/utils/test_data.py b/tests/utils/test_data.py index b30e2933..7bffbb9f 100644 --- a/tests/utils/test_data.py +++ b/tests/utils/test_data.py @@ -75,7 +75,7 @@ df_monach_weather = pd.DataFrame( { "series_name": ["T1", "T2", "T3", "T4", "T5"], - "series_type": ["rain", "preasure", "temperature", "humidity", "sun"], + "series_type": ["rain", "pressure", "temperature", "humidity", "sun"], "series_value": [ [1.0, 2.0, 3.0], [4.0, 5.0, 6.0], @@ -94,7 +94,7 @@ ], columns=[ "T1 rain", - "T2 preasure", + "T2 pressure", "T3 temperature", "T4 humidity", "T5 sun", From e418a1ee7e9a45b5b4d5d73294d28d5327a18627 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:08:11 -0500 Subject: [PATCH 56/80] spelling: pretreated Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/em_sampler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py index 18d2c084..b4a84b83 100644 --- a/qolmat/imputations/em_sampler.py +++ b/qolmat/imputations/em_sampler.py @@ -557,7 +557,7 @@ def pretreatment(self, X, mask_na) -> Tuple[NDArray, NDArray]: ------- Tuple[NDArray, NDArray] A tuple containing: - - X the pretreatd data matrix + - X the pretreated data matrix - mask_na the updated mask """ @@ -1225,7 +1225,7 @@ def pretreatment(self, X, mask_na) -> Tuple[NDArray, NDArray]: ------- Tuple[NDArray, NDArray] A tuple containing: - - X the pretreatd data matrix + - X the pretreated data matrix - mask_na the updated 
mask """ From e608b94f14982430f32966542a5a8084ada8d1f5 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:08:28 -0500 Subject: [PATCH 57/80] spelling: probably Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/em_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py index b4a84b83..46692cf5 100644 --- a/qolmat/imputations/em_sampler.py +++ b/qolmat/imputations/em_sampler.py @@ -349,7 +349,7 @@ def fun_jac(x): grad_x = grad_x[mask_na] return grad_x - # the method BFGS is much slower, probabily not adapted + # the method BFGS is much slower, probably not adapted # to the high-dimension setting res = spo.minimize(fun_obj, X[mask_na], jac=fun_jac, method="CG") x = res.x From 7ffc8e6c005b277cb530d1f31b61f46aef5e36f5 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:09:00 -0500 Subject: [PATCH 58/80] spelling: recommended Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- docs/imputers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/imputers.rst b/docs/imputers.rst index af34d2ea..fd20b7a4 100644 --- a/docs/imputers.rst +++ b/docs/imputers.rst @@ -42,7 +42,7 @@ See the :class:`~qolmat.imputations.imputers.ImputerRpcaPcp` class for implement **Noisy RPCA** [2, 3, 4] -The class :class:`RpcaNoisy` implements a recommanded improved version, which relies on a decomposition :math:`\mathbf{D} = \mathbf{M} + \mathbf{A} + \mathbf{E}`. The additional term encodes a Gaussian noise and makes the numerical convergence more reliable. This class also implements a time-consistency penalization for time series, parametrized by the :math:`\eta_k`and :math:`H_k`. 
By defining :math:`\Vert \mathbf{MH_k} \Vert_p` is either :math:`\Vert \mathbf{MH_k} \Vert_1` or :math:`\Vert \mathbf{MH_k} \Vert_F^2`, the optimisation problem is the following +The class :class:`RpcaNoisy` implements a recommended improved version, which relies on a decomposition :math:`\mathbf{D} = \mathbf{M} + \mathbf{A} + \mathbf{E}`. The additional term encodes a Gaussian noise and makes the numerical convergence more reliable. This class also implements a time-consistency penalization for time series, parametrized by the :math:`\eta_k`and :math:`H_k`. By defining :math:`\Vert \mathbf{MH_k} \Vert_p` is either :math:`\Vert \mathbf{MH_k} \Vert_1` or :math:`\Vert \mathbf{MH_k} \Vert_F^2`, the optimisation problem is the following .. math:: \text{min}_{\mathbf{M, A} \in \mathbb{R}^{m \times n}} \quad \frac 1 2 \Vert P_{\Omega} (\mathbf{D}-\mathbf{M}-\mathbf{A}) \Vert_F^2 + \tau \Vert \mathbf{M} \Vert_* + \lambda \Vert \mathbf{A} \Vert_1 + \sum_{k=1}^K \eta_k \Vert \mathbf{M H_k} \Vert_p From e7e20835f269140b232fbc470b9a8342235c98a2 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:09:07 -0500 Subject: [PATCH 59/80] spelling: refactor Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- HISTORY.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.rst b/HISTORY.rst index 30f03569..2e1dc20d 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -100,7 +100,7 @@ been changed into tuple attributes so that all are not immutable 0.0.13 (2023-06-07) ------------------- -* Refacto cross validation +* Refactor cross validation * Fix Readme * Add test utils.plot From 9cdd5af2590c4186a73615d4e5ab301f572cd099 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:09:54 -0500 Subject: [PATCH 60/80] spelling: reproducibility Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/em_sampler.py | 
6 +++--- qolmat/imputations/rpca/rpca_noisy.py | 2 +- qolmat/imputations/rpca/rpca_pcp.py | 2 +- qolmat/imputations/softimpute.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/qolmat/imputations/em_sampler.py b/qolmat/imputations/em_sampler.py index 46692cf5..f4a8b24e 100644 --- a/qolmat/imputations/em_sampler.py +++ b/qolmat/imputations/em_sampler.py @@ -141,7 +141,7 @@ class EM(BaseEstimator, TransformerMixin): or to maximise likelihood (0), by default 1. random_state : int, optional The seed of the pseudo random number generator to use, - for reproductibility. + for reproducibility. dt : float, optional Process integration time step, a large value increases the sample bias and can make the algorithm unstable, but compensates for a @@ -622,7 +622,7 @@ class MultiNormalEM(EM): or to maximise likelihood (0), by default 1. random_state : int, optional The seed of the pseudo random number generator to use, - for reproductibility. + for reproducibility. dt : float Process integration time step, a large value increases the sample bias and can make the algorithm unstable, but compensates for a @@ -951,7 +951,7 @@ class VARpEM(EM): or to maximise likelihood (0), by default 1. random_state : int, optional The seed of the pseudo random number generator to use, - for reproductibility. + for reproducibility. dt : float Process integration time step, a large value increases the sample bias and can make the algorithm unstable, but compensates for diff --git a/qolmat/imputations/rpca/rpca_noisy.py b/qolmat/imputations/rpca/rpca_noisy.py index c193d98d..cbd0772f 100644 --- a/qolmat/imputations/rpca/rpca_noisy.py +++ b/qolmat/imputations/rpca/rpca_noisy.py @@ -36,7 +36,7 @@ class RpcaNoisy(RPCA): ---------- random_state : int, optional The seed of the pseudo random number generator to use, - for reproductibility. + for reproducibility. 
rank: Optional[int] Upper bound of the rank to be estimated mu: Optional[float] diff --git a/qolmat/imputations/rpca/rpca_pcp.py b/qolmat/imputations/rpca/rpca_pcp.py index 0e0f84da..e4854725 100644 --- a/qolmat/imputations/rpca/rpca_pcp.py +++ b/qolmat/imputations/rpca/rpca_pcp.py @@ -30,7 +30,7 @@ class RpcaPcp(RPCA): ---------- random_state : int, optional The seed of the pseudo random number generator to use, - for reproductibility. + for reproducibility. period: Optional[int] number of rows of the reshaped matrix if the signal is a 1D-array rank: Optional[int] diff --git a/qolmat/imputations/softimpute.py b/qolmat/imputations/softimpute.py index 63688812..507faf17 100644 --- a/qolmat/imputations/softimpute.py +++ b/qolmat/imputations/softimpute.py @@ -49,7 +49,7 @@ class SoftImpute(BaseEstimator, TransformerMixin): Maximum number of iterations random_state : int, optional The seed of the pseudo random number generator to use, - for reproductibility + for reproducibility verbose : bool flag for verbosity From 89b33000e3440a6376734a71b7672e5b55bc8b76 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:10:18 -0500 Subject: [PATCH 61/80] spelling: results Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/benchmark.md | 2 +- examples/tutorials/plot_tuto_benchmark_TS.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/benchmark.md b/examples/benchmark.md index f3aa0297..98f0971e 100644 --- a/examples/benchmark.md +++ b/examples/benchmark.md @@ -476,7 +476,7 @@ plt.show() We first check the covariance. We simply plot one variable versus one another. -One observes the methods provide similar visual resuls: it's difficult to compare them based on this criterion. +One observes the methods provide similar visual results: it's difficult to compare them based on this criterion. 
```python fig = plt.figure(figsize=(6 * n_imputers, 6 * n_columns)) diff --git a/examples/tutorials/plot_tuto_benchmark_TS.py b/examples/tutorials/plot_tuto_benchmark_TS.py index 75c90666..332026fe 100644 --- a/examples/tutorials/plot_tuto_benchmark_TS.py +++ b/examples/tutorials/plot_tuto_benchmark_TS.py @@ -181,7 +181,7 @@ # %% # We can also check the covariance. We simply plot one variable versus one another. -# One observes the methods provide similar visual resuls: it's difficult to compare +# One observes the methods provide similar visual results: it's difficult to compare # them based on this criterion, except the median imputation that greatly differs. # Black points and ellipses are original dataframes # whiel colored ones are imputed dataframes. From 4a5e402aff1d685c02c0e6e5c3475556e962a01c Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:10:25 -0500 Subject: [PATCH 62/80] spelling: returned Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/imputers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py index 49a999f5..0e13da3e 100644 --- a/qolmat/imputations/imputers.py +++ b/qolmat/imputations/imputers.py @@ -234,7 +234,7 @@ def fit_transform( ) -> pd.DataFrame: """Return an imputed dataframe. - The retruned df has same shape as `X`, with unchanged values, + The returned df has same shape as `X`, with unchanged values, but all nans are replaced by non-nan values. Depending on the imputer parameters, the dataframe can be imputed with columnwise and/or groupwise methods. 
From a8bb53b6bfea0a4fad737e411d580a84ea2f8b5a Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:10:41 -0500 Subject: [PATCH 63/80] spelling: returns Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/imputers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py index 0e13da3e..c6650392 100644 --- a/qolmat/imputations/imputers.py +++ b/qolmat/imputations/imputers.py @@ -173,7 +173,7 @@ def fit(self, X: pd.DataFrame, y: pd.DataFrame = None) -> "_Imputer": def transform(self, X: pd.DataFrame) -> pd.DataFrame: """Transform/impute a dataframe. - It retruns a dataframe with same shape as `X`, + It returns a dataframe with same shape as `X`, unchanged values, where all nans are replaced by non-nan values. Depending on the imputer parameters, the dataframe can be imputed with columnwise and/or groupwise methods. From 68062e50babb029c88f6d5999f9e8ada195de802 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:26:15 -0500 Subject: [PATCH 64/80] spelling: seasonal Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/benchmark.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/benchmark.md b/examples/benchmark.md index 98f0971e..50421d93 100644 --- a/examples/benchmark.md +++ b/examples/benchmark.md @@ -494,7 +494,7 @@ plt.show() ## Auto-correlation -We are now interested in the auto-correlation function (ACF). As seen before, time series display seaonal patterns. +We are now interested in the auto-correlation function (ACF). As seen before, time series display seasonal patterns. [Autocorrelation](https://en.wikipedia.org/wiki/Autocorrelation) is the correlation of a signal with a delayed copy of itself as a function of delay. 
It measures the similarity between observations of a random variable as a function of the time lag between them. The objective is to have an ACF to be similar between the original dataset and the imputed one. ```python From 54654d4a6ab0da68a7ea42d6d84e54d14e3dfed7 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:11:33 -0500 Subject: [PATCH 65/80] spelling: series Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/benchmark/missing_patterns.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/benchmark/missing_patterns.py b/qolmat/benchmark/missing_patterns.py index 737a57c7..0657340f 100644 --- a/qolmat/benchmark/missing_patterns.py +++ b/qolmat/benchmark/missing_patterns.py @@ -51,7 +51,7 @@ def compute_transition_matrix( Parameters ---------- states : pd.Series - serie of possible states (masks) + series of possible states (masks) ngroups : Optional[List], optional groups, by default None From 67d6ac458a62f49b9cc296cfc203b0286a78356c Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:11:54 -0500 Subject: [PATCH 66/80] spelling: shrunk Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/rpca/rpca_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qolmat/imputations/rpca/rpca_utils.py b/qolmat/imputations/rpca/rpca_utils.py index 0d3b6d5f..55b05dec 100644 --- a/qolmat/imputations/rpca/rpca_utils.py +++ b/qolmat/imputations/rpca/rpca_utils.py @@ -44,7 +44,7 @@ def soft_thresholding( Parameters ---------- X : NDArray - Matrix which elements should be shrinked + Matrix which elements should be shrunk threshold : float Shrinking factor @@ -63,7 +63,7 @@ def svd_thresholding(X: NDArray, threshold: float) -> NDArray: Parameters ---------- X : NDArray - Matrix which singular values should be shrinked + Matrix which singular values should 
be shrunk threshold : float Shrinking factor From dc8164885fd3a48995905b12ebd3110d25c76736 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:32:47 -0500 Subject: [PATCH 67/80] spelling: split Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_diffusion_models.py | 4 ++-- qolmat/imputations/diffusions/ddpms.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/tutorials/plot_tuto_diffusion_models.py b/examples/tutorials/plot_tuto_diffusion_models.py index e6b2ec3b..ced56775 100644 --- a/examples/tutorials/plot_tuto_diffusion_models.py +++ b/examples/tutorials/plot_tuto_diffusion_models.py @@ -198,11 +198,11 @@ # # For TsDDPM, we have two options for splitting data: # -# * ``is_rolling=False`` (default value): the data is splited by using +# * ``is_rolling=False`` (default value): the data is split by using # pandas.DataFrame.resample(rule=freq_str). There is no duplication of row between chunks, # leading a smaller number of chunks than the number of rows in the original data. # -# * ``is_rolling=True``: the data is splited by using pandas.DataFrame.rolling(window=freq_str). +# * ``is_rolling=True``: the data is split by using pandas.DataFrame.rolling(window=freq_str). # The number of chunks is also the number of rows in the original data. # Note that setting ``is_rolling=True`` always produces better quality of imputations # but requires a longer training/inference time. diff --git a/qolmat/imputations/diffusions/ddpms.py b/qolmat/imputations/diffusions/ddpms.py index b92b7fed..7c6ea31f 100644 --- a/qolmat/imputations/diffusions/ddpms.py +++ b/qolmat/imputations/diffusions/ddpms.py @@ -287,7 +287,7 @@ def _impute(self, x: np.ndarray, x_mask_obs: np.ndarray) -> np.ndarray: * i ) if len(x_batch.size()) == 3: - # Data are splited into chunks + # Data are split into chunks # (i.e., Time-series data), # a window of rows # is processed. 
From b97baa7cad552594f81f6cdf4093f0130abc5911 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:34:45 -0500 Subject: [PATCH 68/80] spelling: stopping Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/rpca/rpca_noisy.py | 2 +- qolmat/imputations/rpca/rpca_pcp.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/qolmat/imputations/rpca/rpca_noisy.py b/qolmat/imputations/rpca/rpca_noisy.py index cbd0772f..82a34683 100644 --- a/qolmat/imputations/rpca/rpca_noisy.py +++ b/qolmat/imputations/rpca/rpca_noisy.py @@ -54,7 +54,7 @@ class RpcaNoisy(RPCA): stopping criteria, maximum number of iterations. By default, the value is set to 10_000 tolerance: Optional[float] - stoppign criteria, minimum difference between 2 consecutive iterations. + stopping criteria, minimum difference between 2 consecutive iterations. By default, the value is set to 1e-6 norm: Optional[str] error norm, can be "L1" or "L2". By default, the value is set to "L2" diff --git a/qolmat/imputations/rpca/rpca_pcp.py b/qolmat/imputations/rpca/rpca_pcp.py index e4854725..765d929b 100644 --- a/qolmat/imputations/rpca/rpca_pcp.py +++ b/qolmat/imputations/rpca/rpca_pcp.py @@ -43,7 +43,7 @@ class RpcaPcp(RPCA): stopping criteria, maximum number of iterations. By default, the value is set to 10_000 tolerance: Optional[float] - stoppign criteria, minimum difference between 2 consecutive iterations. + stopping criteria, minimum difference between 2 consecutive iterations. 
By default, the value is set to 1e-6 verbose: Optional[bool] verbosity level, if False the warnings are silenced From a3192c36edacf6b5ad9bc839382cc9e1ffdacd68 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:34:53 -0500 Subject: [PATCH 69/80] spelling: supported Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/imputers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py index c6650392..3b106d97 100644 --- a/qolmat/imputations/imputers.py +++ b/qolmat/imputations/imputers.py @@ -879,7 +879,7 @@ class ImputerInterpolation(_Imputer): """Interpolation imputer. This class implements a way to impute time series using some interpolation - strategies suppoted by pd.Series.interpolate, such as "linear", "slinear", + strategies supported by pd.Series.interpolate, such as "linear", "slinear", "quadratic", ... By default, linear interpolation. As for pd.Series.interpolate, if "method" is "spline" or "polynomial", an "order" has to be passed. From 0bc1c81c4bc05302ba5cf1379fa180a5d8ab0c08 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:35:30 -0500 Subject: [PATCH 70/80] spelling: temporal Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/benchmark.md | 2 +- examples/tutorials/plot_tuto_benchmark_TS.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/benchmark.md b/examples/benchmark.md index 50421d93..37d3cc8f 100644 --- a/examples/benchmark.md +++ b/examples/benchmark.md @@ -98,7 +98,7 @@ plt.show() This part is devoted to the imputation methods. The idea is to try different algorithms and compare them. **Methods**: -All presented methods are group-wise: here each station is imputed independently. 
For example ImputerMean computes the mean of each variable in each station and uses the result for imputation; ImputerInterpolation interpolates termporal signals corresponding to each variable on each station. +All presented methods are group-wise: here each station is imputed independently. For example ImputerMean computes the mean of each variable in each station and uses the result for imputation; ImputerInterpolation interpolates temporal signals corresponding to each variable on each station. **Hyperparameters' search**: Some methods require hyperparameters. The user can directly specify them, or rather determine them through an optimization step using the `search_params` dictionary. The keys are the imputation method's name and the values are a dictionary specifying the minimum, maximum or list of categories and type of values (Integer, Real, Category or a dictionary indexed by the variable names) to search. diff --git a/examples/tutorials/plot_tuto_benchmark_TS.py b/examples/tutorials/plot_tuto_benchmark_TS.py index 332026fe..db267030 100644 --- a/examples/tutorials/plot_tuto_benchmark_TS.py +++ b/examples/tutorials/plot_tuto_benchmark_TS.py @@ -68,7 +68,7 @@ # --------------------------------------------------------------- # All presented methods are group-wise: here each station is imputed independently. # For example ImputerMean computes the mean of each variable in each station and uses -# the result for imputation; ImputerInterpolation interpolates termporal +# the result for imputation; ImputerInterpolation interpolates temporal # signals corresponding to each variable on each station. 
# We consider five imputation methods: # ``median`` for a baseline imputation; From 7fcd3cce643e7e9e5ab406bf1ca3a8cd7f5b47a4 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:00:05 -0500 Subject: [PATCH 71/80] spelling: the Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_mean_median.py | 2 +- qolmat/benchmark/metrics.py | 2 +- qolmat/imputations/rpca/rpca_noisy.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/tutorials/plot_tuto_mean_median.py b/examples/tutorials/plot_tuto_mean_median.py index 021a2a18..36037da8 100644 --- a/examples/tutorials/plot_tuto_mean_median.py +++ b/examples/tutorials/plot_tuto_mean_median.py @@ -6,7 +6,7 @@ (:class:`~qolmat.benchmark.comparator`) to choose the best imputation between two of the simplest imputation methods: mean or median (:class:`~qolmat.imputations.imputers.ImputerSimple`). -The dataset used is the the numerical `superconduct` dataset and +The dataset used is the numerical `superconduct` dataset and contains information on 21263 superconductors. We generate holes uniformly at random via :class:`~qolmat.benchmark.missing_patterns.UniformHoleGenerator` diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py index 07fb1152..5d5ff3eb 100644 --- a/qolmat/benchmark/metrics.py +++ b/qolmat/benchmark/metrics.py @@ -932,7 +932,7 @@ def frechet_distance( def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> float: - """Estimate the the Kullback-Leibler divergence for 1D. + """Estimate the Kullback-Leibler divergence for 1D. Computation between the two 1D empirical distributions given by `df1`and `df2`. 
The samples are binarized using a uniform spacing diff --git a/qolmat/imputations/rpca/rpca_noisy.py b/qolmat/imputations/rpca/rpca_noisy.py index 82a34683..b018a96f 100644 --- a/qolmat/imputations/rpca/rpca_noisy.py +++ b/qolmat/imputations/rpca/rpca_noisy.py @@ -1,4 +1,4 @@ -"""Script for an the noisy RPCA.""" +"""Script for the noisy RPCA.""" from __future__ import annotations From 9e480d50e9a6276bae7cd2673d5f5997eadf2bf3 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:38:11 -0500 Subject: [PATCH 72/80] spelling: transformers Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/diffusions/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/imputations/diffusions/base.py b/qolmat/imputations/diffusions/base.py index 3df13823..1bfe93c1 100644 --- a/qolmat/imputations/diffusions/base.py +++ b/qolmat/imputations/diffusions/base.py @@ -72,7 +72,7 @@ class ResidualBlockTS(torch.nn.Module): (https://arxiv.org/abs/2106.11959). We follow the implementation found in https://github.com/Yura52/rtdl/blob/main/rtdl/nn/_backbones.py - This class is for Time-Series data where we add Tranformers to + This class is for Time-Series data where we add Transformers to encode time-based/feature-based context. 
""" From 8789a6a11637747219d36140cf20e57cdc39dca2 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:38:19 -0500 Subject: [PATCH 73/80] spelling: transition Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/benchmark/missing_patterns.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/benchmark/missing_patterns.py b/qolmat/benchmark/missing_patterns.py index 0657340f..ffb5fe20 100644 --- a/qolmat/benchmark/missing_patterns.py +++ b/qolmat/benchmark/missing_patterns.py @@ -16,7 +16,7 @@ def compute_transition_counts_matrix(states: pd.Series): - """Compute transtion counts matrix. + """Compute transition counts matrix. Parameters ---------- From 323e454e000941f9215145ba27c8e63917bd6f1f Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:40:50 -0500 Subject: [PATCH 74/80] spelling: tutorial Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_mcar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorials/plot_tuto_mcar.py b/examples/tutorials/plot_tuto_mcar.py index e0a9a0c8..069fcf67 100644 --- a/examples/tutorials/plot_tuto_mcar.py +++ b/examples/tutorials/plot_tuto_mcar.py @@ -205,7 +205,7 @@ # %% # Limitations and conclusion # ========================== -# In this tutoriel, we can see that Little's test fails to detect covariance heterogeneity between +# In this tutorial, we can see that Little's test fails to detect covariance heterogeneity between # patterns. 
# # We also note that the Little's test does not handle categorical data or temporally From a452c93d975c778b72f6483b4b0d94f7feaf70c2 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:42:48 -0500 Subject: [PATCH 75/80] spelling: update Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/imputations/softimpute.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qolmat/imputations/softimpute.py b/qolmat/imputations/softimpute.py index 507faf17..763b25e8 100644 --- a/qolmat/imputations/softimpute.py +++ b/qolmat/imputations/softimpute.py @@ -156,7 +156,7 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: V_old = V D_old = D - # Step 2 : Upate on B + # Step 2 : Update on B D2_invreg = (D**2 + tau) ** (-1) Btilde = ( (U * D).T @ np.where(Omega, X - A @ B.T, 0) + (B * D**2).T @@ -168,7 +168,7 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]: D = np.sqrt(D2tilde).reshape(1, -1) B = V * D - # Step 3 : Upate on A + # Step 3 : Update on A D2_invreg = (D**2 + tau) ** (-1) Atilde = ( (V * D).T @ np.where(Omega, X - A @ B.T, 0).T + (A * D**2).T From cbafb26a3724edc01e0ff8c6c91b4c0cfb1036f1 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:43:53 -0500 Subject: [PATCH 76/80] spelling: useful Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- qolmat/benchmark/hyperparameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qolmat/benchmark/hyperparameters.py b/qolmat/benchmark/hyperparameters.py index 640e4e42..242fdd58 100644 --- a/qolmat/benchmark/hyperparameters.py +++ b/qolmat/benchmark/hyperparameters.py @@ -106,7 +106,7 @@ def optimize( Each estimation involves one call to fit_transform per fold returned by the generator. See the n_fold attribute. 
verbose: bool - Verbosity switch, usefull for imputers that can have unstable + Verbosity switch, useful for imputers that can have unstable behavior for some hyperparameters values Returns From c51796425b0391ac5569dde1e98d7960dfe975ad Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:44:10 -0500 Subject: [PATCH 77/80] spelling: variables Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/benchmark.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/benchmark.md b/examples/benchmark.md index 37d3cc8f..551a3aaf 100644 --- a/examples/benchmark.md +++ b/examples/benchmark.md @@ -54,7 +54,7 @@ from qolmat.utils import data, utils, plot The dataset `Beijing` is the Beijing Multi-Site Air-Quality Data Set. It consists in hourly air pollutants data from 12 chinese nationally-controlled air-quality monitoring sites and is available at https://archive.ics.uci.edu/ml/machine-learning-databases/00501/. -This dataset only contains numerical vairables. +This dataset only contains numerical variables. ```python tags=[] df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120) From 050f8d2a0a8f31a409853b4de41816bbe5ae204b Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 20:04:13 -0500 Subject: [PATCH 78/80] spelling: whether or not Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- docs/analysis.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/analysis.rst b/docs/analysis.rst index 5d1e8aad..e3d6e513 100644 --- a/docs/analysis.rst +++ b/docs/analysis.rst @@ -16,7 +16,7 @@ Then Qolmat proposes two tests to determine whether the missing data mechanism i 2. How to use the results ------------------------- -At the end of the MCAR test, it can then be assumed whether the missing data mechanism is MCAR or not. 
This serves three different purposes: +At the end of the MCAR test, it can then be assumed whether or not the missing data mechanism is MCAR. This serves three different purposes: a. Diagnosis ^^^^^^^^^^^^ From 5d881eba98831005fd626f746a81481decd0cede Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:44:46 -0500 Subject: [PATCH 79/80] spelling: while Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- examples/tutorials/plot_tuto_benchmark_TS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorials/plot_tuto_benchmark_TS.py b/examples/tutorials/plot_tuto_benchmark_TS.py index db267030..c2330203 100644 --- a/examples/tutorials/plot_tuto_benchmark_TS.py +++ b/examples/tutorials/plot_tuto_benchmark_TS.py @@ -184,7 +184,7 @@ # One observes the methods provide similar visual results: it's difficult to compare # them based on this criterion, except the median imputation that greatly differs. # Black points and ellipses are original dataframes -# whiel colored ones are imputed dataframes. +# while colored ones are imputed dataframes. n_columns = len(dfs_imputed_station) fig = plt.figure(figsize=(10, 10)) From 2939549d8d242c91acb22b2b8449a7b1827deec3 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 16 Nov 2025 06:58:52 -0500 Subject: [PATCH 80/80] link: scikit-learn API Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- CONTRIBUTING.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index d5f1e3bc..f2d292a2 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -46,7 +46,7 @@ Documenting your change ----------------------- If you're adding a class or a function, then you'll need to add a docstring with a doctest. We follow the `numpy docstring convention `_, so please do too. 
-Any estimator should follow the [scikit-learn API](https://scikit-learn.org/stable/developers/develop.html), so please follow these guidelines. +Any estimator should follow the `scikit-learn API <https://scikit-learn.org/stable/developers/develop.html>`_, so please follow these guidelines. Updating changelog ------------------