Skip to content

Commit 23aa9b4

Browse files
committed
📝 Add the tuto of the Little's MCAR test. And modify the test file regarding the random generator seed.
1 parent d70563d commit 23aa9b4

File tree

4 files changed

+189
-20
lines changed

4 files changed

+189
-20
lines changed

docs/audit.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
Audit
3+
===============

docs/index.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
.. include:: ../README.rst
22

3+
.. toctree::
4+
:maxdepth: 2
5+
:hidden:
6+
:caption: AUDIT
7+
8+
audit
9+
examples/tutorials/plot_tuto_mcar_test
10+
311
.. toctree::
412
:maxdepth: 2
513
:hidden:
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
"""
2+
============================================
3+
Tutorial for testing the MCAR case
4+
============================================
5+
6+
In this tutorial, we show how to use the MCAR test class and its methods.
7+
8+
Keep in mind that, at this moment, the MCAR tests only handle tabular data.
9+
"""
10+
# %%
11+
# First import some libraries
12+
from matplotlib import pyplot as plt
13+
import random
14+
15+
import numpy as np
16+
import pandas as pd
17+
18+
from qolmat.audit.holes_characterization import MCARTest
19+
20+
# %%
21+
# 1. The Little's test
22+
# ---------------------------------------------------------------
23+
# How to use the Little's test ?
24+
# ==============================
25+
# When we deal with missing data in our dataset it's interesting to know the nature of these holes.
26+
# There exist three types of holes : MCAR, MAR and MNAR.
27+
# (see `Rubin's missing mechanism classification
28+
# <https://qolmat.readthedocs.io/en/latest/explanation.html>`_)
29+
#
30+
# The simplest case to test is the MCAR case. The most famous MCAR statistical test is the
31+
# `Little's test <https://www.tandfonline.com/doi/abs/10.1080/01621459.1988.10478722>`_.
32+
# Keep in mind that the Little's test is designed to test the homogeneity of means between the
33+
# missing patterns and won't be efficient to detect the heterogeneity of covariance between missing
34+
# patterns.
35+
#
36+
# This notebook shows how the Little's test performs and its limitations.
37+
38+
np.random.seed(11)
39+
40+
mcartest = MCARTest(method="little")
41+
42+
# %%
43+
# Case 1 : Normal iid feature with MCAR holes
44+
# ===========================================
45+
46+
matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
47+
matrix.ravel()[np.random.choice(matrix.size, size=20, replace=False)] = np.nan
48+
matrix_masked = matrix[np.argwhere(np.isnan(matrix))]
49+
df_1 = pd.DataFrame(matrix)
50+
51+
plt_1 = plt.scatter(matrix[:, 0], matrix[:, 1])
52+
plt_2 = plt.scatter(matrix_masked[:, 0], matrix_masked[:, 1])
53+
54+
plt.legend(
55+
(plt_1, plt_2),
56+
("observed_values", "masked_vlues"),
57+
scatterpoints=1,
58+
loc="lower left",
59+
ncol=1,
60+
fontsize=8,
61+
)
62+
63+
plt.title("Case 1 : MCAR missingness mechanism")
64+
plt.xlabel("x values (all observed)")
65+
plt.ylabel("y values (with missing ones)")
66+
67+
plt.show()
68+
69+
# %%
70+
71+
mcartest.test(df_1)
72+
# %%
73+
# The p-value is quite high, therefore we don't reject H_0.
74+
# We can then suppose that our missingness mechanism is MCAR.
75+
76+
# %%
77+
# Case 2 : Normal iid feature with MAR holes
78+
# ==========================================
79+
np.random.seed(11)
80+
81+
matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
82+
threshold = random.uniform(0, 1)
83+
matrix[np.argwhere(matrix[:, 0] > 1.96), 1] = np.nan
84+
matrix_masked = matrix[np.argwhere(np.isnan(matrix))]
85+
df_2 = pd.DataFrame(matrix)
86+
87+
plt_1 = plt.scatter(matrix[:, 0], matrix[:, 1])
88+
plt_2 = plt.scatter(matrix_masked[:, 0], matrix_masked[:, 1])
89+
90+
plt.legend(
91+
(plt_1, plt_2),
92+
("observed_values", "masked_vlues"),
93+
scatterpoints=1,
94+
loc="lower left",
95+
ncol=1,
96+
fontsize=8,
97+
)
98+
99+
plt.title("Case 2 : MAR missingness mechanism")
100+
plt.xlabel("x values (all observed)")
101+
plt.ylabel("y values (with missing ones)")
102+
103+
plt.show()
104+
105+
# %%
106+
107+
mcartest.test(df_2)
108+
# %%
109+
# The p-value is lower than the classic threshold (5%).
110+
# H_0 is then rejected and we can suppose that our missingness mechanism is not MCAR (here MAR).
111+
112+
# %%
113+
# Case 3 : Normal iid feature with MAR holes
114+
# ==========================================
115+
# This specific case is designed to emphasize the limits of the Little's test. In this case, we
116+
# generate holes when the absolute value of the first feature is high. This missingness mechanism
117+
# is clearly MAR but the means between missing patterns are not statistically different.
118+
119+
np.random.seed(11)
120+
121+
matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
122+
matrix[np.argwhere(abs(matrix[:, 0]) >= 1.95), 1] = np.nan
123+
matrix_masked = matrix[np.argwhere(np.isnan(matrix))]
124+
df_3 = pd.DataFrame(matrix)
125+
126+
plt_1 = plt.scatter(matrix[:, 0], matrix[:, 1])
127+
plt_2 = plt.scatter(matrix_masked[:, 0], matrix_masked[:, 1])
128+
129+
plt.legend(
130+
(plt_1, plt_2),
131+
("observed_values", "masked_values"),
132+
scatterpoints=1,
133+
loc="lower left",
134+
ncol=1,
135+
fontsize=8,
136+
)
137+
138+
plt.title("Case 3 : MAR missingness mechanism undetected by the Little's test")
139+
plt.xlabel("x values (all observed)")
140+
plt.ylabel("y values (with missing ones)")
141+
142+
plt.show()
143+
144+
# %%
145+
146+
mcartest.test(df_3)
147+
# %%
148+
# The p-value is higher than the classic threshold (5%).
149+
# H_0 is not rejected whereas the missingness mechanism is clearly MAR.

tests/audit/test_holes_characterization.py

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,25 +6,34 @@
66
from qolmat.imputations.imputers import ImputerEM
77

88

9-
np.random.seed(11)
10-
matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
11-
matrix_1, matrix_2, matrix_3 = map(np.copy, [matrix] * 3)
12-
13-
# Case 1 : MCAR case detected by Little
14-
matrix_1.ravel()[np.random.choice(matrix_1.size, size=20, replace=False)] = np.nan
15-
df_1 = pd.DataFrame(matrix_1)
16-
17-
# Case 2 : MAR case detected by Little
18-
matrix_2[np.argwhere(matrix_2[:, 0] > 1.96), 1] = np.nan
19-
df_2 = pd.DataFrame(matrix_2)
20-
21-
# Case 3 : MAR case undetected by Little
22-
matrix_3[np.argwhere(abs(matrix_3[:, 0]) >= 1.95), 1] = np.nan
23-
df_3 = pd.DataFrame(matrix_3)
24-
25-
26-
@pytest.mark.parametrize("df_input, expected", [(df_1, True), (df_2, False), (df_3, True)])
27-
def test_little_mcar_test(df_input: pd.DataFrame, expected: bool):
9+
@pytest.fixture
10+
def mcar_df() -> pd.DataFrame:
11+
rng = np.random.default_rng(42)
12+
matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
13+
matrix.ravel()[rng.choice(matrix.size, size=20, replace=False)] = np.nan
14+
return pd.DataFrame(data=matrix)
15+
16+
17+
@pytest.fixture
18+
def mar_hm_df() -> pd.DataFrame:
19+
rng = np.random.default_rng(42)
20+
matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
21+
matrix[np.argwhere(matrix[:, 0] > 1.96), 1] = np.nan
22+
return pd.DataFrame(data=matrix)
23+
24+
25+
@pytest.fixture
26+
def mcar_hc_df() -> pd.DataFrame:
27+
rng = np.random.default_rng(42)
28+
matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=100)
29+
matrix[np.argwhere(abs(matrix[:, 0]) >= 1.95), 1] = np.nan
30+
return pd.DataFrame(data=matrix)
31+
32+
33+
@pytest.mark.parametrize(
34+
"df_input, expected", [("mcar_df", True), ("mar_hm_df", False), ("mcar_hc_df", True)]
35+
)
36+
def test_little_mcar_test(df_input: pd.DataFrame, expected: bool, request):
2837
mcar_test_little = MCARTest(method="little", imputer=ImputerEM(random_state=42))
29-
result = mcar_test_little.test(df_input)
38+
result = mcar_test_little.test(request.getfixturevalue(df_input))
3039
assert expected == (result > 0.05)

0 commit comments

Comments
 (0)