soft impute tests updated

Julien Roussel · Julien Roussel · commit aec630739843 · 2024-02-26T19:19:35.000+01:00
diff --git a/docs/imputers.rst b/docs/imputers.rst
@@ -49,7 +49,7 @@ with :math:`\mathbf{E} = \mathbf{D} - \mathbf{M} - \mathbf{A}`.
 SoftImpute is an iterative method for matrix completion that uses nuclear-norm regularization [11]. It is a faster alternative to RPCA, although it is much less robust due to the quadratic penalization. Given a matrix :math:`\mathbf{D} \in \mathbb{R}^{n \times d}` with observed entries indexed by the set :math:`\Omega`, this algorithm solves the following problem:
 
 .. math::
-    \text{minimise}_{\mathbf{L} \in \mathbb{R}^{n \times r}, \mathbf{Q} \in \mathbb{R}^{d \times r}} \quad \Vert P_{\Omega}(\mathbf{D} - \mathbf{L}\mathbf{Q}) \Vert_F^2 + \tau \Vert \mathbf{L} \Vert_F^2 + \tau \Vert \mathbf{Q} \Vert_F^2
+    \text{minimise}_{\mathbf{M} \in \mathbb{R}^{n \times d}, \mathrm{rank}(\mathbf{M}) \leq r} \quad \Vert P_{\Omega}(\mathbf{D} - \mathbf{M}) \Vert_F^2 + \tau \Vert \mathbf{M} \Vert_*
 
 The imputed values are then given by the matrix :math:`M=LQ` on the unobserved data. See the :class:`~qolmat.imputations.imputers.ImputerSoftImpute` class for implementation details.
 
diff --git a/qolmat/imputations/imputers.py b/qolmat/imputations/imputers.py
@@ -1825,7 +1825,7 @@ def get_model(self, **hyperparams) -> rpca_noisy.RpcaNoisy:
 
     def _fit_element(
         self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0
-    ) -> em_sampler.EM:
+    ) -> rpca_noisy.RpcaNoisy:
         """
         Fits the imputer on `df`, at the group and/or column level depending onself.groups and
         self.columnwise.
@@ -1937,9 +1937,9 @@ def __init__(
         columnwise: bool = False,
         random_state: Union[None, int, np.random.RandomState] = None,
         period: int = 1,
-        rank: int = 2,
+        rank: Optional[int] = None,
         tolerance: float = 1e-05,
-        tau: float = 0,
+        tau: Optional[float] = None,
         max_iterations: int = 100,
         verbose: bool = False,
     ):
@@ -2051,7 +2051,6 @@ def _transform_element(
 
         D = utils.prepare_data(X, self.period)
         Omega = ~np.isnan(D)
-        # D = utils.linear_interpolation(D)
 
         M, A = model.decompose(D, Omega)
 
diff --git a/qolmat/imputations/rpca/rpca_noisy.py b/qolmat/imputations/rpca/rpca_noisy.py
@@ -260,6 +260,7 @@ def minimise_loss(
 
         """
 
+        print("minimise_loss")
         rho = 1.1
         n_rows, n_cols = D.shape
 
@@ -338,13 +339,13 @@ def minimise_loss(
             Ac = np.linalg.norm(A - A_temp, np.inf)
             Lc = np.linalg.norm(L - L_temp, np.inf)
             Qc = np.linalg.norm(Q - Q_temp, np.inf)
-            tolerance = max([Mc, Ac, Lc, Qc])  # type: ignore # noqa
+            error_max = max([Mc, Ac, Lc, Qc])  # type: ignore # noqa
             if norm == "L1":
                 for i_period, _ in enumerate(list_periods):
                     Rc = np.linalg.norm(R[i_period] - R_temp[i_period], np.inf)
-                    tolerance = max(tolerance, Rc)  # type: ignore # noqa
+                    error_max = max(error_max, Rc)  # type: ignore # noqa
 
-            if tolerance < tolerance:
+            if error_max < tolerance:
                 break
 
         M = L @ Q
diff --git a/qolmat/imputations/softimpute.py b/qolmat/imputations/softimpute.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from typing import Optional, Tuple, Union
+import warnings
 
 import numpy as np
 from numpy.typing import NDArray
@@ -55,9 +56,9 @@ class SoftImpute(BaseEstimator, TransformerMixin):
     def __init__(
         self,
         period: int = 1,
-        rank: int = 2,
+        rank: Optional[int] = None,
         tolerance: float = 1e-05,
-        tau: float = 0,
+        tau: Optional[float] = None,
         max_iterations: int = 100,
         random_state: Union[None, int, np.random.RandomState] = None,
         verbose: bool = False,
@@ -70,86 +71,38 @@ def __init__(
         self.random_state = sku.check_random_state(random_state)
         self.verbose = verbose
 
-    # def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
-    #     """
-    #     Compute the Soft Impute decomposition
+    def get_params_scale(self, X: NDArray):
+        """
+        Get parameters for scaling in Soft Impute based on the input data.
 
-    #     Parameters
-    #     ----------
-    #     D : NDArray
-    #         Matrix of the observations
-    #     Omega: NDArray
-    #         Matrix of missingness, with boolean data
+        Parameters
+        ----------
+        X : np.ndarray
+            Input data matrix of shape (m, n).
 
-    #     Returns
-    #     -------
-    #     M: NDArray
-    #         Low-rank signal
-    #     A: NDArray
-    #         Anomalies
-    #     """
-    #     print()
-    #     print()
-    #     print(X.shape)
-    #     print()
-    #     X = utils.linear_interpolation(X)
-
-    #     n, m = X.shape
-    #     V = np.zeros((m, self.rank))
-    #     U = self.random_state.normal(0.0, 1.0, (n, self.rank))
-    #     U, _, _ = np.linalg.svd(U, full_matrices=False)
-    #     D2 = np.ones((self.rank, 1))
-    #     col_means = np.nanmean(X, axis=0)
-    #     np.copyto(X, col_means, where=~Omega)
-    #     if self.rank is None:
-    #         self.rank = rpca_utils.approx_rank(X)
-    #     for iter_ in range(self.max_iterations):
-    #         U_old = U
-    #         V_old = V
-    #         D2_old = D2
-
-    #         BDt = U.T @ X
-    #         if self.tau > 0:
-    #             BDt *= D2 / (D2**2 + self.tau)
-    #         Vtilde, D2tilde, Rt = np.linalg.svd(BDt.T, full_matrices=False)
-    #         V = Vtilde
-    #         D2 = D2tilde.reshape(-1, 1)
-    #         U = U @ Rt
-    #         X_hat = U @ (D2 * V.T)
-    #         X[~Omega] = X_hat[~Omega]
-
-    #         A = (X @ V).T
-    #         if self.tau > 0:
-    #             A *= D2 / (D2 + self.tau)
-    #         Lsvd = np.linalg.svd(A.T, full_matrices=False)
-    #         U = Lsvd[0]
-    #         D2 = Lsvd[1][:, np.newaxis]
-    #         V = V @ Lsvd[2]
-    #         X_hat = U @ (D2 * V.T)
-    #         X[~Omega] = X_hat[~Omega]
-
-    #         ratio = self._check_convergence(U_old, D2_old, V_old, U, D2, V)
-    #         if self.verbose:
-    #             print(f"iter {iter_}: ratio = {round(ratio, 4)}")
-    #         if ratio < self.tolerance:
-    #             break
-
-    #     u = U[:, : self.rank]
-    #     d = D2[: self.rank]
-    #     v = V[:, : self.rank]
-
-    #     M = u @ np.diag(d.T[0]) @ (v).T
-    #     A = X - M
-
-    #     return M, A
+        Returns
+        -------
+        dict
+            A dictionary containing the following parameters:
+                - "rank" : int
+                    Rank estimate for low-rank matrix decomposition.
+                - "tau" : float
+                    Parameter for the nuclear norm penalty
+
+        """
+        X = utils.linear_interpolation(X)
+        rank = rpca_utils.approx_rank(X)
+        tau = 1 / np.sqrt(np.max(X.shape))
+        dict_params = {"rank": rank, "tau": tau}
+        return dict_params
 
     def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
         """
         Compute the Soft Impute decomposition
 
         Parameters
         ----------
-        D : NDArray
+        X : NDArray
             Matrix of the observations
         Omega: NDArray
             Matrix of missingness, with boolean data
@@ -161,29 +114,29 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
         A: NDArray
             Anomalies
         """
-        assert self.tau > 0
-        if self.rank is None:
-            self.rank = rpca_utils.approx_rank(X)
-        # X = utils.linear_interpolation(X)
+        params_scale = self.get_params_scale(X)
+        rank = params_scale["rank"] if self.rank is None else self.rank
+        tau = params_scale["tau"] if self.tau is None else self.tau
+        assert tau > 0
 
         # Step 1 : Initializing
         n, m = X.shape
-        V = np.zeros((m, self.rank))
-        U = self.random_state.normal(0.0, 1.0, (n, self.rank))
+        V = np.zeros((m, rank))
+        U = self.random_state.normal(0.0, 1.0, (n, rank))
         U, _, _ = np.linalg.svd(U, full_matrices=False)
-        D = np.ones((1, self.rank))
-        # col_means = np.nanmean(X, axis=0)
-        # np.copyto(X, col_means, where=~Omega)
+        D = np.ones((1, rank))
 
         A = U * D
         B = V * D
+        M = A @ B.T
+        cost_start = self.cost_function(X, M, A, Omega, tau)
         for iter_ in range(self.max_iterations):
             U_old = U
             V_old = V
             D_old = D
 
             # Step 2 : Upate on B
-            D2_invreg = (D**2 + self.tau) ** (-1)
+            D2_invreg = (D**2 + tau) ** (-1)
             Btilde = ((U * D).T @ np.where(Omega, X - A @ B.T, 0) + (B * D**2).T).T
             Btilde = Btilde * D2_invreg
 
@@ -193,8 +146,8 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
             B = V * D
 
             # Step 3 : Upate on A
-            D2_invreg = (D**2 + self.tau) ** (-1)
-            Atilde = ((V * D).T @ np.where(Omega, X.T - B @ A.T, 0) + (A * D**2).T).T
+            D2_invreg = (D**2 + tau) ** (-1)
+            Atilde = ((V * D).T @ np.where(Omega, X - A @ B.T, 0).T + (A * D**2).T).T
             Atilde = Atilde * D2_invreg
 
             Utilde, D2tilde, _ = np.linalg.svd(Atilde * D, full_matrices=False)
@@ -213,85 +166,19 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
         Xstar = np.where(Omega, X - A @ B.T, 0) + A @ B.T
         M = Xstar @ V
         U, D, Rt = np.linalg.svd(M, full_matrices=False)
-        D = rpca_utils.soft_thresholding(D, self.tau)
+        D = rpca_utils.soft_thresholding(D, tau)
         M = (U * D) @ Rt @ V.T
 
         A = np.where(Omega, X - M, 0)
 
-        return M, A
-
-    # def fit(self, D: NDArray, y=None) -> SoftImpute:
-    #     """Fit the imputer on D.
+        cost_end = self.cost_function(X, M, A, Omega, tau)
+        if self.verbose and (cost_end > cost_start + 1e-9):
+            warnings.warn(
+                f"Convergence failed: cost function increased from"
+                f" {cost_start} to {cost_end} instead of decreasing!".format("%.2f")
+            )
 
-    #     Parameters
-    #     ----------
-    #     D : NDArray
-    #         Input data
-
-    #     y : Ignored
-    #         Not used, present here for API consistency by convention.
-
-    #     Returns
-    #     -------
-    #     self : object
-    #         The fitted `SoftImpute` class instance.
-    #     """
-    #     D = D.copy()
-    #     D = utils.prepare_data(D, self.period)
-
-    #     if not isinstance(D, np.ndarray):
-    #         raise AssertionError("Invalid type. D must be a NDArray.")
-
-    #     n, m = D.shape
-    #     mask = np.isnan(D)
-    #     V = np.zeros((m, self.rank))
-    #     U = self.random_state.normal(0.0, 1.0, (n, self.rank))
-    #     U, _, _ = np.linalg.svd(U, full_matrices=False)
-    #     Dsq = np.ones((self.rank, 1))
-    #     col_means = np.nanmean(D, axis=0)
-    #     np.copyto(D, col_means, where=np.isnan(D))
-    #     if self.rank is None:
-    #         self.rank = rpca_utils.approx_rank(D)
-    #     for iter_ in range(self.max_iterations):
-    #         U_old = U
-    #         V_old = V
-    #         Dsq_old = Dsq
-
-    #         Q = U.T @ D
-    #         if self.tau > 0:
-    #             tmp = Dsq / (Dsq + self.tau)
-    #             Q = Q * tmp
-    #         Bsvd = np.linalg.svd(Q.T, full_matrices=False)
-    #         V = Bsvd[0]
-    #         Dsq = Bsvd[1][:, np.newaxis]
-    #         U = U @ Bsvd[2]
-    #         tmp = Dsq * V.T
-    #         D_hat = U @ tmp
-    #         D[mask] = D_hat[mask]
-
-    #         L = (D @ V).T
-    #         if self.tau > 0:
-    #             tmp = Dsq / (Dsq + self.tau)
-    #             L = L * tmp
-    #         Lsvd = np.linalg.svd(L.T, full_matrices=False)
-    #         U = Lsvd[0]
-    #         Dsq = Lsvd[1][:, np.newaxis]
-    #         V = V @ Lsvd[2]
-    #         tmp = Dsq * V.T
-    #         D_hat = U @ tmp
-    #         D[mask] = D_hat[mask]
-
-    #         ratio = self._check_convergence(U_old, Dsq_old, V_old, U, Dsq, V)
-    #         if self.verbose:
-    #             print(f"iter {iter_}: ratio = {round(ratio, 4)}")
-    #         if ratio < self.tolerance:
-    #             break
-
-    #     self.u = U[:, : self.rank]
-    #     self.d = Dsq[: self.rank]
-    #     self.v = V[:, : self.rank]
-
-    #     return self
+        return M, A
 
     def _check_convergence(
         self,
@@ -362,3 +249,36 @@ def _check_convergence(
     #         raise AssertionError("Result contains NaN. This is a bug.")
 
     #     return D_transformed
+
+    @staticmethod
+    def cost_function(
+        X: NDArray,
+        M: NDArray,
+        A: NDArray,
+        Omega: NDArray,
+        tau: float,
+    ):
+        """
+        Compute the cost function minimized by the Soft Impute algorithm
+
+        Parameters
+        ----------
+        X : NDArray
+            Matrix of observations
+        M : NDArray
+            Low-rank signal
+        A : NDArray
+            Anomalies
+        Omega : NDArray
+            Mask for observations
+        tau : float
+            penalizing parameter for the nuclear norm
+
+        Returns
+        -------
+        float
+            Value of the cost function minimized by the Soft Impute algorithm
+        """
+        norm_frobenius = np.sum(np.where(Omega, X - M, 0) ** 2)
+        norm_nuclear = np.linalg.norm(M, "nuc")
+        return norm_frobenius + tau * norm_nuclear
diff --git a/tests/imputations/test_imputers.py b/tests/imputations/test_imputers.py
@@ -350,7 +350,7 @@ def test_models_fit_transform_grouped(imputer):
         imputers.ImputerRegressor(),
         imputers.ImputerRpcaNoisy(tau=0, lam=0),
         imputers.ImputerRpcaPcp(lam=0),
-        imputers.ImputerSoftImpute(tau=0),
+        imputers.ImputerSoftImpute(),
         imputers.ImputerEM(),
     ]
 )
diff --git a/tests/imputations/test_softimpute.py b/tests/imputations/test_softimpute.py

Original file line number	Diff line number	Diff line change
`@@ -350,7 +350,7 @@ def test_models_fit_transform_grouped(imputer):`
`350`	`350`	`imputers.ImputerRegressor(),`
`351`	`351`	`imputers.ImputerRpcaNoisy(tau=0, lam=0),`
`352`	`352`	`imputers.ImputerRpcaPcp(lam=0),`
`353`		`- imputers.ImputerSoftImpute(tau=0),`
	`353`	`+ imputers.ImputerSoftImpute(),`
`354`	`354`	`imputers.ImputerEM(),`
`355`	`355`	`]`
`356`	`356`	`)`