Skip to content

Commit aec6307

Browse files
Julien RousselJulien Roussel
authored and committed
soft impute tests updated
1 parent b10304a commit aec6307

File tree

6 files changed

+125
-201
lines changed

6 files changed

+125
-201
lines changed

docs/imputers.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ with :math:`\mathbf{E} = \mathbf{D} - \mathbf{M} - \mathbf{A}`.
4949
SoftImpute is an iterative method for matrix completion that uses nuclear-norm regularization [11]. It is a faster alternative to RPCA, although it is much less robust due to the quadratic penalization. Given a matrix :math:`\mathbf{D} \in \mathbb{R}^{n \times d}` with observed entries indexed by the set :math:`\Omega`, this algorithm solves the following problem:
5050

5151
.. math::
52-
\text{minimise}_{\mathbf{L} \in \mathbb{R}^{n \times r}, \mathbf{Q} \in \mathbb{R}^{d \times r}} \quad \Vert P_{\Omega}(\mathbf{D} - \mathbf{L}\mathbf{Q}) \Vert_F^2 + \tau \Vert \mathbf{L} \Vert_F^2 + \tau \Vert \mathbf{Q} \Vert_F^2
52+
\text{minimise}_{\mathbf{M} \in \mathbb{R}^{n \times d}, \mathrm{rank}(\mathbf{M}) \leq r} \quad \Vert P_{\Omega}(\mathbf{D} - \mathbf{M}) \Vert_F^2 + \tau \Vert \mathbf{M} \Vert_*
5353
5454
The imputed values are then given by the matrix :math:`M=LQ` on the unobserved data. See the :class:`~qolmat.imputations.imputers.ImputerSoftImpute` class for implementation details.
5555

qolmat/imputations/imputers.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1825,7 +1825,7 @@ def get_model(self, **hyperparams) -> rpca_noisy.RpcaNoisy:
18251825

18261826
def _fit_element(
18271827
self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0
1828-
) -> em_sampler.EM:
1828+
) -> rpca_noisy.RpcaNoisy:
18291829
"""
18301830
Fits the imputer on `df`, at the group and/or column level depending on self.groups and
18311831
self.columnwise.
@@ -1937,9 +1937,9 @@ def __init__(
19371937
columnwise: bool = False,
19381938
random_state: Union[None, int, np.random.RandomState] = None,
19391939
period: int = 1,
1940-
rank: int = 2,
1940+
rank: Optional[int] = None,
19411941
tolerance: float = 1e-05,
1942-
tau: float = 0,
1942+
tau: Optional[float] = None,
19431943
max_iterations: int = 100,
19441944
verbose: bool = False,
19451945
):
@@ -2051,7 +2051,6 @@ def _transform_element(
20512051

20522052
D = utils.prepare_data(X, self.period)
20532053
Omega = ~np.isnan(D)
2054-
# D = utils.linear_interpolation(D)
20552054

20562055
M, A = model.decompose(D, Omega)
20572056

qolmat/imputations/rpca/rpca_noisy.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ def minimise_loss(
260260
261261
"""
262262

263+
print("minimise_loss")
263264
rho = 1.1
264265
n_rows, n_cols = D.shape
265266

@@ -338,13 +339,13 @@ def minimise_loss(
338339
Ac = np.linalg.norm(A - A_temp, np.inf)
339340
Lc = np.linalg.norm(L - L_temp, np.inf)
340341
Qc = np.linalg.norm(Q - Q_temp, np.inf)
341-
tolerance = max([Mc, Ac, Lc, Qc]) # type: ignore # noqa
342+
error_max = max([Mc, Ac, Lc, Qc]) # type: ignore # noqa
342343
if norm == "L1":
343344
for i_period, _ in enumerate(list_periods):
344345
Rc = np.linalg.norm(R[i_period] - R_temp[i_period], np.inf)
345-
tolerance = max(tolerance, Rc) # type: ignore # noqa
346+
error_max = max(error_max, Rc) # type: ignore # noqa
346347

347-
if tolerance < tolerance:
348+
if error_max < tolerance:
348349
break
349350

350351
M = L @ Q

qolmat/imputations/softimpute.py

Lines changed: 79 additions & 159 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
from typing import Optional, Tuple, Union
4+
import warnings
45

56
import numpy as np
67
from numpy.typing import NDArray
@@ -55,9 +56,9 @@ class SoftImpute(BaseEstimator, TransformerMixin):
5556
def __init__(
5657
self,
5758
period: int = 1,
58-
rank: int = 2,
59+
rank: Optional[int] = None,
5960
tolerance: float = 1e-05,
60-
tau: float = 0,
61+
tau: Optional[float] = None,
6162
max_iterations: int = 100,
6263
random_state: Union[None, int, np.random.RandomState] = None,
6364
verbose: bool = False,
@@ -70,86 +71,38 @@ def __init__(
7071
self.random_state = sku.check_random_state(random_state)
7172
self.verbose = verbose
7273

73-
# def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
74-
# """
75-
# Compute the Soft Impute decomposition
74+
def get_params_scale(self, X: NDArray):
75+
"""
76+
Get parameters for scaling in Soft Impute based on the input data.
7677
77-
# Parameters
78-
# ----------
79-
# D : NDArray
80-
# Matrix of the observations
81-
# Omega: NDArray
82-
# Matrix of missingness, with boolean data
78+
Parameters
79+
----------
80+
X : np.ndarray
81+
Input data matrix of shape (m, n).
8382
84-
# Returns
85-
# -------
86-
# M: NDArray
87-
# Low-rank signal
88-
# A: NDArray
89-
# Anomalies
90-
# """
91-
# print()
92-
# print()
93-
# print(X.shape)
94-
# print()
95-
# X = utils.linear_interpolation(X)
96-
97-
# n, m = X.shape
98-
# V = np.zeros((m, self.rank))
99-
# U = self.random_state.normal(0.0, 1.0, (n, self.rank))
100-
# U, _, _ = np.linalg.svd(U, full_matrices=False)
101-
# D2 = np.ones((self.rank, 1))
102-
# col_means = np.nanmean(X, axis=0)
103-
# np.copyto(X, col_means, where=~Omega)
104-
# if self.rank is None:
105-
# self.rank = rpca_utils.approx_rank(X)
106-
# for iter_ in range(self.max_iterations):
107-
# U_old = U
108-
# V_old = V
109-
# D2_old = D2
110-
111-
# BDt = U.T @ X
112-
# if self.tau > 0:
113-
# BDt *= D2 / (D2**2 + self.tau)
114-
# Vtilde, D2tilde, Rt = np.linalg.svd(BDt.T, full_matrices=False)
115-
# V = Vtilde
116-
# D2 = D2tilde.reshape(-1, 1)
117-
# U = U @ Rt
118-
# X_hat = U @ (D2 * V.T)
119-
# X[~Omega] = X_hat[~Omega]
120-
121-
# A = (X @ V).T
122-
# if self.tau > 0:
123-
# A *= D2 / (D2 + self.tau)
124-
# Lsvd = np.linalg.svd(A.T, full_matrices=False)
125-
# U = Lsvd[0]
126-
# D2 = Lsvd[1][:, np.newaxis]
127-
# V = V @ Lsvd[2]
128-
# X_hat = U @ (D2 * V.T)
129-
# X[~Omega] = X_hat[~Omega]
130-
131-
# ratio = self._check_convergence(U_old, D2_old, V_old, U, D2, V)
132-
# if self.verbose:
133-
# print(f"iter {iter_}: ratio = {round(ratio, 4)}")
134-
# if ratio < self.tolerance:
135-
# break
136-
137-
# u = U[:, : self.rank]
138-
# d = D2[: self.rank]
139-
# v = V[:, : self.rank]
140-
141-
# M = u @ np.diag(d.T[0]) @ (v).T
142-
# A = X - M
143-
144-
# return M, A
83+
Returns
84+
-------
85+
dict
86+
A dictionary containing the following parameters:
87+
- "rank" : float
88+
Rank estimate for low-rank matrix decomposition.
89+
- "tau" : float
90+
Parameter for the nuclear norm penalty
91+
92+
"""
93+
X = utils.linear_interpolation(X)
94+
rank = rpca_utils.approx_rank(X)
95+
tau = 1 / np.sqrt(np.max(X.shape))
96+
dict_params = {"rank": rank, "tau": tau}
97+
return dict_params
14598

14699
def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
147100
"""
148101
Compute the Soft Impute decomposition
149102
150103
Parameters
151104
----------
152-
D : NDArray
105+
X : NDArray
153106
Matrix of the observations
154107
Omega: NDArray
155108
Boolean mask of the observed entries (True where the value is observed, False where it is missing)
@@ -161,29 +114,29 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
161114
A: NDArray
162115
Anomalies
163116
"""
164-
assert self.tau > 0
165-
if self.rank is None:
166-
self.rank = rpca_utils.approx_rank(X)
167-
# X = utils.linear_interpolation(X)
117+
params_scale = self.get_params_scale(X)
118+
rank = params_scale["rank"] if self.rank is None else self.rank
119+
tau = params_scale["tau"] if self.tau is None else self.tau
120+
assert tau > 0
168121

169122
# Step 1 : Initializing
170123
n, m = X.shape
171-
V = np.zeros((m, self.rank))
172-
U = self.random_state.normal(0.0, 1.0, (n, self.rank))
124+
V = np.zeros((m, rank))
125+
U = self.random_state.normal(0.0, 1.0, (n, rank))
173126
U, _, _ = np.linalg.svd(U, full_matrices=False)
174-
D = np.ones((1, self.rank))
175-
# col_means = np.nanmean(X, axis=0)
176-
# np.copyto(X, col_means, where=~Omega)
127+
D = np.ones((1, rank))
177128

178129
A = U * D
179130
B = V * D
131+
M = A @ B.T
132+
cost_start = self.cost_function(X, M, A, Omega, tau)
180133
for iter_ in range(self.max_iterations):
181134
U_old = U
182135
V_old = V
183136
D_old = D
184137

185138
# Step 2 : Update on B
186-
D2_invreg = (D**2 + self.tau) ** (-1)
139+
D2_invreg = (D**2 + tau) ** (-1)
187140
Btilde = ((U * D).T @ np.where(Omega, X - A @ B.T, 0) + (B * D**2).T).T
188141
Btilde = Btilde * D2_invreg
189142

@@ -193,8 +146,8 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
193146
B = V * D
194147

195148
# Step 3 : Update on A
196-
D2_invreg = (D**2 + self.tau) ** (-1)
197-
Atilde = ((V * D).T @ np.where(Omega, X.T - B @ A.T, 0) + (A * D**2).T).T
149+
D2_invreg = (D**2 + tau) ** (-1)
150+
Atilde = ((V * D).T @ np.where(Omega, X - A @ B.T, 0).T + (A * D**2).T).T
198151
Atilde = Atilde * D2_invreg
199152

200153
Utilde, D2tilde, _ = np.linalg.svd(Atilde * D, full_matrices=False)
@@ -213,85 +166,19 @@ def decompose(self, X: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
213166
Xstar = np.where(Omega, X - A @ B.T, 0) + A @ B.T
214167
M = Xstar @ V
215168
U, D, Rt = np.linalg.svd(M, full_matrices=False)
216-
D = rpca_utils.soft_thresholding(D, self.tau)
169+
D = rpca_utils.soft_thresholding(D, tau)
217170
M = (U * D) @ Rt @ V.T
218171

219172
A = np.where(Omega, X - M, 0)
220173

221-
return M, A
222-
223-
# def fit(self, D: NDArray, y=None) -> SoftImpute:
224-
# """Fit the imputer on D.
174+
cost_end = self.cost_function(X, M, A, Omega, tau)
175+
if self.verbose and (cost_end > cost_start + 1e-9):
176+
warnings.warn(
177+
f"Convergence failed: cost function increased from"
178+
f" {cost_start} to {cost_end} instead of decreasing!".format("%.2f")
179+
)
225180

226-
# Parameters
227-
# ----------
228-
# D : NDArray
229-
# Input data
230-
231-
# y : Ignored
232-
# Not used, present here for API consistency by convention.
233-
234-
# Returns
235-
# -------
236-
# self : object
237-
# The fitted `SoftImpute` class instance.
238-
# """
239-
# D = D.copy()
240-
# D = utils.prepare_data(D, self.period)
241-
242-
# if not isinstance(D, np.ndarray):
243-
# raise AssertionError("Invalid type. D must be a NDArray.")
244-
245-
# n, m = D.shape
246-
# mask = np.isnan(D)
247-
# V = np.zeros((m, self.rank))
248-
# U = self.random_state.normal(0.0, 1.0, (n, self.rank))
249-
# U, _, _ = np.linalg.svd(U, full_matrices=False)
250-
# Dsq = np.ones((self.rank, 1))
251-
# col_means = np.nanmean(D, axis=0)
252-
# np.copyto(D, col_means, where=np.isnan(D))
253-
# if self.rank is None:
254-
# self.rank = rpca_utils.approx_rank(D)
255-
# for iter_ in range(self.max_iterations):
256-
# U_old = U
257-
# V_old = V
258-
# Dsq_old = Dsq
259-
260-
# Q = U.T @ D
261-
# if self.tau > 0:
262-
# tmp = Dsq / (Dsq + self.tau)
263-
# Q = Q * tmp
264-
# Bsvd = np.linalg.svd(Q.T, full_matrices=False)
265-
# V = Bsvd[0]
266-
# Dsq = Bsvd[1][:, np.newaxis]
267-
# U = U @ Bsvd[2]
268-
# tmp = Dsq * V.T
269-
# D_hat = U @ tmp
270-
# D[mask] = D_hat[mask]
271-
272-
# L = (D @ V).T
273-
# if self.tau > 0:
274-
# tmp = Dsq / (Dsq + self.tau)
275-
# L = L * tmp
276-
# Lsvd = np.linalg.svd(L.T, full_matrices=False)
277-
# U = Lsvd[0]
278-
# Dsq = Lsvd[1][:, np.newaxis]
279-
# V = V @ Lsvd[2]
280-
# tmp = Dsq * V.T
281-
# D_hat = U @ tmp
282-
# D[mask] = D_hat[mask]
283-
284-
# ratio = self._check_convergence(U_old, Dsq_old, V_old, U, Dsq, V)
285-
# if self.verbose:
286-
# print(f"iter {iter_}: ratio = {round(ratio, 4)}")
287-
# if ratio < self.tolerance:
288-
# break
289-
290-
# self.u = U[:, : self.rank]
291-
# self.d = Dsq[: self.rank]
292-
# self.v = V[:, : self.rank]
293-
294-
# return self
181+
return M, A
295182

296183
def _check_convergence(
297184
self,
@@ -362,3 +249,36 @@ def _check_convergence(
362249
# raise AssertionError("Result contains NaN. This is a bug.")
363250

364251
# return D_transformed
252+
253+
@staticmethod
254+
def cost_function(
255+
X: NDArray,
256+
M: NDArray,
257+
A: NDArray,
258+
Omega: NDArray,
259+
tau: float,
260+
):
261+
"""
262+
Compute the cost function minimised by the Soft Impute algorithm
263+
264+
Parameters
265+
----------
266+
X : NDArray
267+
Matrix of observations
268+
M : NDArray
269+
Low-rank signal
270+
A : NDArray
271+
Anomalies
272+
Omega : NDArray
273+
Mask for observations
274+
tau : float
275+
penalizing parameter for the nuclear norm
276+
277+
Returns
278+
-------
279+
float
280+
Value of the cost function minimized by the Soft Impute algorithm
281+
"""
282+
norm_frobenius = np.sum(np.where(Omega, X - M, 0) ** 2)
283+
norm_nuclear = np.linalg.norm(M, "nuc")
284+
return norm_frobenius + tau * norm_nuclear

tests/imputations/test_imputers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ def test_models_fit_transform_grouped(imputer):
350350
imputers.ImputerRegressor(),
351351
imputers.ImputerRpcaNoisy(tau=0, lam=0),
352352
imputers.ImputerRpcaPcp(lam=0),
353-
imputers.ImputerSoftImpute(tau=0),
353+
imputers.ImputerSoftImpute(),
354354
imputers.ImputerEM(),
355355
]
356356
)

0 commit comments

Comments
 (0)