|
3 | 3 | Tutorial for testing the MCAR case |
4 | 4 | ============================================ |
5 | 5 |
|
6 | | -In this tutorial, we show how to use the mcar test class and its methods. |
7 | | -
|
8 | | -Keep in my mind that, at this moment, the mcar tests only handle tabular data. |
| 6 | +In this tutorial, we show how to test the MCAR case using the Little's test. |
9 | 7 | """ |
10 | 8 | # %% |
11 | 9 | # First import some libraries |
12 | 10 | from matplotlib import pyplot as plt |
13 | | -import random |
14 | 11 |
|
15 | 12 | import numpy as np |
16 | 13 | import pandas as pd |
| 14 | +from scipy.stats import norm |
| 15 | + |
| 16 | +from qolmat.analysis.holes_characterization import LittleTest |
| 17 | +from qolmat.benchmark.missing_patterns import UniformHoleGenerator |
17 | 18 |
|
18 | | -from qolmat.audit.holes_characterization import MCARTest |
| 19 | +plt.rcParams.update({"font.size": 12}) |
19 | 20 |
|
20 | 21 | # %% |
21 | 22 | # 1. The Little's test |
22 | 23 | # --------------------------------------------------------------- |
23 | | -# How to use the Little's test ? |
24 | | -# ============================== |
25 | | -# When we deal with missing data in our dataset it's interesting to know the nature of these holes. |
26 | | -# There exist three types of holes : MCAR, MAR and MNAR. |
27 | | -# (see the: `Rubin's missing mechanism classification |
28 | | -# <https://qolmat.readthedocs.io/en/latest/explanation.html>`_) |
| 24 | +# First, we need to introduce the concept of missing pattern. A missing pattern, also called |
| 25 | +# pattern, is the structure of observed and missing values in a data set. For example, for a |
| 26 | +# dataset with 2 columns, the possible patterns are : (0, 0), (1, 0), (0, 1), (1, 1). The value 1 |
| 27 | +# (0) indicates that the value in the column is missing (observed). |
29 | 28 | # |
30 | | -# The simplest case to test is the MCAR case. The most famous MCAR statistical test is the |
31 | | -# `Little's test <https://www.tandfonline.com/doi/abs/10.1080/01621459.1988.10478722>`_. |
32 | | -# Keep in mind that the Little's test is designed to test the homogeneity of means between the |
33 | | -# missing patterns and won't be efficient to detect the heterogeneity of covariance between missing |
34 | | -# patterns. |
| 29 | +# The null hypothesis, H0, is: "The means of observations within each pattern are similar."
| 30 | +# Against the alternative hypothesis, H1: "The means of the observed variables can vary across the
| 31 | +# patterns."
35 | 32 | # |
36 | | -# The null hypothesis, H0, is : "The data are MCAR". Against, |
37 | | -# The alternative hypothesis : " The data are not MCAR, the means of the observed variables can |
38 | | -# vary across the patterns" |
| 33 | +# If H0 is not rejected, we can assume that the missing data mechanism is MCAR. On the contrary,
| 34 | +# if H0 is rejected, we can assume that the missing data mechanism is MAR.
39 | 35 | # |
40 | | -# We choose to use the classic threshold, equal to 5%. If the test pval is below this threshold, |
| 36 | +# We choose to use the classic threshold, equal to 5%. If the test p_value is below this threshold, |
41 | 37 | # we reject the null hypothesis. |
42 | 38 | # |
43 | 39 | # This notebook shows how the Little's test performs and its limitations. |
44 | 40 |
|
45 | | -np.random.seed(11) |
46 | | - |
47 | | -mcartest = MCARTest(method="little") |
| 41 | +mcartest = LittleTest() |
48 | 42 |
|
49 | 43 | # %% |
50 | | -# Case 1 : Normal iid feature with MCAR holes |
51 | | -# =========================================== |
| 44 | +# Case 1 : Normal iid features with MCAR holes |
| 45 | +# ============================================ |
52 | 46 |
|
| 47 | +np.random.seed(42) |
53 | 48 | matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
54 | | -matrix.ravel()[np.random.choice(matrix.size, size=20, replace=False)] = np.nan |
55 | | -matrix_masked = matrix[np.argwhere(np.isnan(matrix))] |
56 | | -df_1 = pd.DataFrame(matrix) |
| 49 | +df = pd.DataFrame(data=matrix, columns=["Column_1", "Column_2"]) |
| 50 | + |
| 51 | +hole_gen = UniformHoleGenerator(n_splits=1, random_state=42, subset=["Column_2"], ratio_masked=0.2) |
| 52 | +df_mask = hole_gen.generate_mask(df) |
| 53 | +df_unmasked = ~df_mask |
| 54 | +df_unmasked["Column_1"] = False |
57 | 55 |
|
58 | | -plt_1 = plt.scatter(matrix[:, 0], matrix[:, 1]) |
59 | | -plt_2 = plt.scatter(matrix_masked[:, 0], matrix_masked[:, 1]) |
| 56 | +df_observed = df.mask(df_mask).dropna() |
| 57 | +df_hidden = df.mask(df_unmasked).dropna(subset="Column_2") |
| 58 | + |
| 59 | +plt_1 = plt.scatter(df_observed.iloc[:, 0], df_observed.iloc[:, 1], label="Observed values") |
| 60 | +plt_2 = plt.scatter(df_hidden.iloc[:, 0], df_hidden.iloc[:, 1], label="Missing values") |
60 | 61 |
|
61 | 62 | plt.legend( |
62 | | - (plt_1, plt_2), |
63 | | - ("observed_values", "masked_values"), |
64 | | - scatterpoints=1, |
65 | 63 | loc="lower left", |
66 | | - ncol=1, |
67 | 64 | fontsize=8, |
68 | 65 | ) |
69 | | - |
70 | 66 | plt.title("Case 1 : MCAR missingness mechanism") |
71 | | -plt.xlabel("x values (all observed)") |
72 | | -plt.ylabel("y values (with missing ones)") |
73 | | - |
74 | 67 | plt.show() |
75 | 68 |
|
76 | 69 | # %% |
77 | 70 |
|
78 | | -mcartest.test(df_1) |
| 71 | +mcartest.test(df.mask(df_mask)) |
79 | 72 | # %% |
80 | | -# The p-value is quite high, therefore we don't reject H_0. |
| 73 | +# The p-value is quite high, therefore we don't reject H0. |
81 | 74 | # We can then suppose that our missingness mechanism is MCAR. |
82 | 75 |
|
83 | 76 | # %% |
84 | | -# Case 2 : Normal iid feature with MAR holes |
85 | | -# ========================================== |
86 | | -np.random.seed(11) |
| 77 | +# Case 2 : Normal iid features with MAR holes |
| 78 | +# =========================================== |
| 79 | +np.random.seed(42) |
| 80 | +quantile_95 = norm.ppf(0.975) |
87 | 81 |
|
88 | 82 | matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
89 | | -threshold = random.uniform(0, 1) |
90 | | -matrix[np.argwhere(matrix[:, 0] >= 1.96), 1] = np.nan |
91 | | -matrix_masked = matrix[np.argwhere(np.isnan(matrix))] |
92 | | -df_2 = pd.DataFrame(matrix) |
| 83 | +df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"]) |
| 84 | +df_nan = df.copy() |
| 85 | +df_nan.loc[df_nan["Column_1"] > quantile_95, "Column_2"] = np.nan |
| 86 | + |
| 87 | +df_mask = df_nan.isna() |
| 88 | +df_unmasked = ~df_mask |
| 89 | +df_unmasked["Column_1"] = False |
| 90 | + |
| 91 | +df_observed = df.mask(df_mask).dropna() |
| 92 | +df_hidden = df.mask(df_unmasked).dropna(subset="Column_2") |
93 | 93 |
|
94 | | -plt_1 = plt.scatter(matrix[:, 0], matrix[:, 1]) |
95 | | -plt_2 = plt.scatter(matrix_masked[:, 0], matrix_masked[:, 1]) |
| 94 | +plt_1 = plt.scatter(df_observed.iloc[:, 0], df_observed.iloc[:, 1], label="Observed values") |
| 95 | +plt_2 = plt.scatter(df_hidden.iloc[:, 0], df_hidden.iloc[:, 1], label="Missing values") |
96 | 96 |
|
97 | 97 | plt.legend( |
98 | | - (plt_1, plt_2), |
99 | | - ("observed_values", "masked_vlues"), |
100 | | - scatterpoints=1, |
101 | 98 | loc="lower left", |
102 | | - ncol=1, |
103 | 99 | fontsize=8, |
104 | 100 | ) |
105 | | - |
106 | 101 | plt.title("Case 2 : MAR missingness mechanism") |
107 | | -plt.xlabel("x values (all observed)") |
108 | | -plt.ylabel("y values (with missing ones)") |
109 | | - |
110 | 102 | plt.show() |
111 | 103 |
|
112 | 104 | # %% |
113 | 105 |
|
114 | | -mcartest.test(df_2) |
| 106 | +mcartest.test(df.mask(df_mask)) |
115 | 107 | # %% |
116 | 108 | # The p-value is lower than the classic threshold (5%). |
117 | | -# H_0 is then rejected and we can suppose that our missingness mechanism is MAR. |
| 109 | +# H0 is then rejected and we can suppose that our missingness mechanism is MAR. |
118 | 110 |
|
119 | 111 | # %% |
120 | | -# Case 3 : Normal iid feature MAR holes |
121 | | -# ===================================== |
122 | | -# The specific case is design to emphasize the Little's test limits. In the case, we generate holes |
123 | | -# when the value of the first feature is high. This missingness mechanism is clearly MAR but the |
124 | | -# means between missing patterns is not statistically different. |
| 112 | +# Case 3 : Normal iid features with MAR holes |
| 113 | +# =========================================== |
| 114 | +# The specific case is designed to emphasize the Little's test limits. In this case, we generate
| 115 | +# holes when the absolute value of the first feature is high. This missingness mechanism is clearly
| 116 | +# MAR but the means between missing patterns are not statistically different.
125 | 117 |
|
126 | | -np.random.seed(11) |
| 118 | +np.random.seed(42) |
127 | 119 |
|
128 | 120 | matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
129 | | -matrix[np.argwhere(abs(matrix[:, 0]) >= 1.96), 1] = np.nan |
130 | | -matrix_masked = matrix[np.argwhere(np.isnan(matrix))] |
131 | | -df_3 = pd.DataFrame(matrix) |
| 121 | +df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"]) |
| 122 | +df_nan = df.copy() |
| 123 | +df_nan.loc[abs(df_nan["Column_1"]) > quantile_95, "Column_2"] = np.nan |
132 | 124 |
|
133 | | -plt_1 = plt.scatter(matrix[:, 0], matrix[:, 1]) |
134 | | -plt_2 = plt.scatter(matrix_masked[:, 0], matrix_masked[:, 1]) |
| 125 | +df_mask = df_nan.isna() |
| 126 | +df_unmasked = ~df_mask |
| 127 | +df_unmasked["Column_1"] = False |
| 128 | + |
| 129 | +df_observed = df.mask(df_mask).dropna() |
| 130 | +df_hidden = df.mask(df_unmasked).dropna(subset="Column_2") |
| 131 | + |
| 132 | +plt_1 = plt.scatter(df_observed.iloc[:, 0], df_observed.iloc[:, 1], label="Observed values") |
| 133 | +plt_2 = plt.scatter(df_hidden.iloc[:, 0], df_hidden.iloc[:, 1], label="Missing values") |
135 | 134 |
|
136 | 135 | plt.legend( |
137 | | - (plt_1, plt_2), |
138 | | - ("observed_values", "masked_values"), |
139 | | - scatterpoints=1, |
140 | 136 | loc="lower left", |
141 | | - ncol=1, |
142 | 137 | fontsize=8, |
143 | 138 | ) |
144 | | - |
145 | 139 | plt.title("Case 3 : MAR missingness mechanism undetected by the Little's test") |
146 | | -plt.xlabel("x values (all observed)") |
147 | | -plt.ylabel("y values (with missing ones)") |
148 | | - |
149 | 140 | plt.show() |
150 | 141 |
|
151 | 142 | # %% |
152 | 143 |
|
153 | | -mcartest.test(df_3) |
| 144 | +mcartest.test(df.mask(df_mask)) |
154 | 145 | # %% |
155 | 146 | # The p-value is higher than the classic threshold (5%). |
156 | | -# H_0 is not rejected whereas the missingness mechanism is clearly MAR. |
| 147 | +# H0 is not rejected whereas the missingness mechanism is clearly MAR. |
| 148 | + |
| 149 | +# %% |
| 150 | +# Limitations |
| 151 | +# ----------- |
| 152 | +# In this tutorial, we can see that the Little's test fails to detect covariance heterogeneity
| 153 | +# between patterns.
| 154 | +# |
| 155 | +# There exist other limitations. The Little's test only handles quantitative data. And finally, the |
| 156 | +# MCAR tests can only handle tabular data (without correlation in time).
0 commit comments