Skip to content

Commit d70563d

Browse files
committed
✨ Create the audit folder containing the new MCARTest class. Implement Little's test and its unit test.
1 parent 5da1e22 commit d70563d

File tree

2 files changed

+109
-0
lines changed

2 files changed

+109
-0
lines changed
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
from __future__ import annotations
2+
from typing import Literal, Optional, TYPE_CHECKING
3+
4+
import numpy as np
5+
import pandas as pd
6+
from scipy.stats import chi2
7+
8+
from qolmat.imputations.imputers import ImputerEM
9+
10+
if TYPE_CHECKING:
11+
from qolmat.imputations.imputers import _Imputer
12+
13+
14+
class MCARTest:
    """
    Statistical tests of the MCAR (Missing Completely At Random) hypothesis.

    Parameters
    ----------
    method : Literal["little"]
        The name of the statistical test. Only "little" is currently handled.
    imputer : Optional[_Imputer], optional
        Imputer used by the selected test to estimate the mean vector and the covariance
        matrix of the data. If None, a default ``ImputerEM`` is created when the test runs.

    Raises
    ------
    ValueError
        If `method` is not one of the handled tests.
    """

    def __init__(self, method: Literal["little"], imputer: Optional["_Imputer"] = None):
        if method not in ["little"]:
            raise ValueError(f"`method` must be handled by qolmat, provided value is '{method}'")

        self.method = method
        self.imputer = imputer

    def test(self, df: pd.DataFrame) -> float:
        """
        Run the statistical test selected at construction time.

        Parameters
        ----------
        df : pd.DataFrame
            Your input data with missing values.

        Returns
        -------
        float
            The p-value of the test.

        Raises
        ------
        ValueError
            If `self.method` was changed after construction to an unhandled value.
        """
        if self.method == "little":
            return self.little_mcar_test(df)
        # Only reachable if the attribute was mutated after __init__; fail loudly rather than
        # silently returning None.
        raise ValueError(
            f"`method` must be handled by qolmat, provided value is '{self.method}'"
        )

    def little_mcar_test(self, df: pd.DataFrame) -> float:
        """
        Run Little's test, which tests the homogeneity of means between all the missing
        patterns of `df`.

        The null hypothesis is "The missing data mechanism is MCAR".
        Be aware that this test won't detect the heterogeneity of covariance.

        The statistic sums, over every missing pattern, the number of rows of the pattern
        times the quadratic form of (observed mean - estimated mean) with the inverse of the
        estimated covariance restricted to the observed columns. The degrees of freedom are
        the total number of observed columns over all patterns minus the number of columns.

        Parameters
        ----------
        df : pd.DataFrame
            Your input data with missing values.

        Returns
        -------
        float
            The p-value of the test.

        Notes
        -----
        When `df` exhibits a single missing pattern the degrees of freedom are zero;
        the result is then presumably NaN (to be confirmed against scipy's behaviour).
        """
        # Explicit None check: a user-provided imputer must never be discarded because it
        # happens to evaluate as falsy.
        imputer = self.imputer if self.imputer is not None else ImputerEM()
        fitted_imputer = imputer._fit_element(df)

        n_rows, n_cols = df.shape
        ml_means = np.asarray(fitted_imputer.means)
        # Fitted covariance rescaled by n / (n - 1).
        ml_cov = n_rows / (n_rows - 1) * np.asarray(fitted_imputer.cov)

        # Initialise the statistic and the degrees of freedom.
        d0 = 0.0
        degree_f = -n_cols

        # Work with positional row numbers so that duplicated index labels in `df` cannot
        # pull rows belonging to another pattern.
        df_observed = df.notna().reset_index(drop=True)
        for pattern, df_group in df_observed.groupby(df_observed.columns.tolist()):
            # `pattern` may be a scalar for single-column frames on some pandas versions.
            observed_cols = np.flatnonzero(np.atleast_1d(np.asarray(pattern, dtype=bool)))
            if observed_cols.size == 0:
                # Fully missing rows carry no information on the means.
                continue

            rows = df_group.index.to_numpy()
            obs_mean = df.iloc[rows, observed_cols].mean().to_numpy()

            diff_means = obs_mean - ml_means[observed_cols]
            sigma_pattern = ml_cov[np.ix_(observed_cols, observed_cols)]

            # Solve the linear system instead of forming the explicit inverse.
            d0 += len(rows) * float(diff_means @ np.linalg.solve(sigma_pattern, diff_means))
            degree_f += observed_cols.size

        # The survival function keeps precision for tiny p-values, where 1 - cdf rounds to 0.
        return float(chi2.sf(d0, degree_f))
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import numpy as np
2+
import pandas as pd
3+
import pytest
4+
5+
from qolmat.audit.holes_characterization import MCARTest
6+
from qolmat.imputations.imputers import ImputerEM
7+
8+
9+
# Seed the global NumPy generator so that the fixtures below are reproducible.
np.random.seed(11)
matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
matrix_1 = matrix.copy()
matrix_2 = matrix.copy()
matrix_3 = matrix.copy()

# Case 1 : MCAR case detected by Little
# 20 cells drawn uniformly without replacement are blanked out.
matrix_1.flat[np.random.choice(matrix_1.size, size=20, replace=False)] = np.nan
df_1 = pd.DataFrame(matrix_1)

# Case 2 : MAR case detected by Little
# The second column is hidden whenever the first one exceeds 1.96.
matrix_2[matrix_2[:, 0] > 1.96, 1] = np.nan
df_2 = pd.DataFrame(matrix_2)

# Case 3 : MAR case undetected by Little
# The second column is hidden whenever the first one is at least 1.95 in absolute value.
matrix_3[np.abs(matrix_3[:, 0]) >= 1.95, 1] = np.nan
df_3 = pd.DataFrame(matrix_3)
24+
25+
26+
@pytest.mark.parametrize("df_input, expected", [(df_1, True), (df_2, False), (df_3, True)])
def test_little_mcar_test(df_input: pd.DataFrame, expected: bool):
    """Check whether Little's test keeps the MCAR hypothesis at the 5% level, as expected."""
    little_test = MCARTest(method="little", imputer=ImputerEM(random_state=42))
    p_value = little_test.test(df_input)
    # The MCAR hypothesis is kept when the p-value is above the 5% threshold.
    mcar_kept = p_value > 0.05
    assert mcar_kept == expected

0 commit comments

Comments
 (0)