from __future__ import annotations
from typing import Literal, Optional, TYPE_CHECKING

import numpy as np
import pandas as pd
from scipy.stats import chi2

from qolmat.imputations.imputers import ImputerEM

if TYPE_CHECKING:
    from qolmat.imputations.imputers import _Imputer


class MCARTest:
    """
    This class implements statistical tests for the MCAR (Missing Completely At Random)
    case. See the usage example at the end of this module.

    Parameters
    ----------
    method : Literal["little"]
        The name of the statistical test. It must be a test handled by qolmat;
        currently only "little" is supported.
    imputer : Optional[_Imputer], optional
        If the selected test needs an imputer, you can provide the imputer you want.
        Otherwise, a default imputer will be used.
    """

    def __init__(self, method: Literal["little"], imputer: Optional[_Imputer] = None):
        if method not in ["little"]:
            raise ValueError(f"`method` must be handled by qolmat, provided value is '{method}'")

        self.method = method
        self.imputer = imputer

    def test(self, df: pd.DataFrame) -> float:
        """Apply the chosen MCAR test to `df` and return its p-value."""
        if self.method == "little":
            return self.little_mcar_test(df)
        raise ValueError(f"Unknown method '{self.method}'")

    def little_mcar_test(self, df: pd.DataFrame) -> float:
        """
        This method implements Little's test. Use it to test the homogeneity of means
        across all the missing-data patterns.
        The null hypothesis is "the missing-data mechanism is MCAR".
        Be aware that this test won't detect heterogeneity of covariances.

        Parameters
        ----------
        df : pd.DataFrame
            Your input data with missing values.

        Returns
        -------
        float
            The p-value of the test.
        """
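        # Little's statistic compares, for each missing-data pattern j, the means observed
        # on that pattern with the maximum-likelihood estimates of the grand means:
        #     d0 = sum_j n_j * (ybar_j - mu_j)' Sigma_j^{-1} (ybar_j - mu_j)
        # where the means and covariance are restricted to the variables observed in
        # pattern j. Under MCAR, d0 is asymptotically chi-square distributed with
        # (sum_j k_j) - k degrees of freedom, k_j being the number of observed variables
        # in pattern j and k the total number of variables.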
        imputer = self.imputer or ImputerEM()
        fitted_imputer = imputer._fit_element(df)

        # Initialize the test statistic, the degrees of freedom and the ML estimators.
        d0 = 0
        n_rows, degree_f = df.shape
        # Start at -k: each pattern then adds its number of observed variables.
        degree_f = -degree_f
        ml_means = fitted_imputer.means
        ml_cov = n_rows / (n_rows - 1) * fitted_imputer.cov

        # Iterate over the missing-data patterns: each distinct row of the notna mask
        # defines one pattern.
        df_nan = df.notna()
        for tup_pattern, df_nan_pattern in df_nan.groupby(df_nan.columns.tolist()):
            n_rows_pattern, _ = df_nan_pattern.shape
            ind_pattern = df_nan_pattern.index
            df_pattern = df.loc[ind_pattern, list(tup_pattern)]
            obs_mean = df_pattern.mean().to_numpy()

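            # Contribution of this pattern: its squared Mahalanobis distance between the
            # pattern-wise observed means and the ML means, restricted to the observed
            # variables and weighted by the number of rows in the pattern.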
            diff_means = obs_mean - ml_means[list(tup_pattern)]
            inv_sigma_pattern = np.linalg.inv(ml_cov[:, tup_pattern][tup_pattern, :])

            d0 += n_rows_pattern * np.dot(np.dot(diff_means, inv_sigma_pattern), diff_means.T)
            degree_f += tup_pattern.count(True)

        return 1 - chi2.cdf(d0, degree_f)
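

# The following is a minimal, hypothetical usage sketch (not part of the original file):
# it builds a toy DataFrame with values removed completely at random, so the test should
# usually return a large p-value and fail to reject the MCAR null hypothesis. The column
# names, seed and missingness rate are illustrative assumptions.
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    df_demo = pd.DataFrame(rng.normal(size=(200, 2)), columns=["a", "b"])
    # Drop roughly 20% of column "a" completely at random.
    df_demo.loc[rng.random(200) < 0.2, "a"] = np.nan

    mcar_test = MCARTest(method="little")
    p_value = mcar_test.test(df_demo)
    print(f"Little's MCAR test p-value: {p_value:.3f}")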