|
1 | 1 | """ |
2 | 2 | ============================================ |
3 | | -Tutorial for testing the MCAR case |
| 3 | +Tutorial for Testing the MCAR Case |
4 | 4 | ============================================ |
5 | 5 |
|
6 | 6 | In this tutorial, we show how to test the MCAR case using Little's test. |
|
19 | 19 |
|
20 | 20 | plt.rcParams.update({"font.size": 12}) |
21 | 21 |
|
| 22 | +rng = np.random.RandomState(42) |
| 23 | + |
22 | 24 | # %% |
23 | 25 | # 1. The Little's test |
24 | 26 | # --------------------------------------------------------------- |
25 | | -# First, we need to introduce the concept of missing pattern. A missing pattern, also called |
26 | | -# pattern, is the structure of observed and missing values in a data set. For example, for a |
27 | | -# dataset with 2 columns, the possible patterns are : (0, 0), (1, 0), (0, 1), (1, 1). The value 1 |
28 | | -# (0) indicates that the value in the column is missing (observed). |
29 | | -# |
30 | | -# The null hypothesis, H0, is : "The means of observations within each pattern are similar.". |
31 | | -# Against the alternative hypothesis, H1 : "The means of the observed variables can vary across the |
32 | | -# patterns." |
| 27 | +# First, we need to introduce the concept of a missing pattern. A missing pattern, also called a |
| 28 | +# pattern, is the structure of observed and missing values in a dataset. For example, in a |
| 29 | +# dataset with two columns, the possible patterns are: (0, 0), (1, 0), (0, 1), (1, 1). The value 1 |
| 30 | +# (0) indicates that the column value is missing (observed). |
33 | 31 | # |
34 | | -# If H0 is not rejected , we can assume that the missing data mechanism is MCAR. On the contrary, |
35 | | -# if H0 is rejected, we can assume that the missing data mechanism is MAR. |
| 32 | +# The null hypothesis, H0, is: "The means of observations within each pattern are similar." |
36 | 33 | # |
37 | | -# We choose to use the classic threshold, equal to 5%. If the test p_value is below this threshold, |
| 34 | +# We choose to use the classic threshold of 5%. If the test p-value is below this threshold, |
38 | 35 | # we reject the null hypothesis. |
39 | 36 | # |
40 | 37 | # This notebook shows how Little's test performs and what its limitations are. |
41 | 38 |
|
42 | | -mcartest = LittleTest() |
| 39 | +test_mcar = LittleTest(random_state=rng) |
43 | 40 |
|
44 | 41 | # %% |
45 | | -# Case 1 : Normal iid features with MCAR holes |
| 42 | +# Case 1: Normal iid features with MCAR holes |
46 | 43 | # ============================================ |
47 | 44 |
|
48 | | -np.random.seed(42) |
49 | | -matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
50 | | -df = pd.DataFrame(data=matrix, columns=["Column_1", "Column_2"]) |
51 | 45 |
|
52 | | -hole_gen = UniformHoleGenerator(n_splits=1, random_state=42, subset=["Column_2"], ratio_masked=0.2) |
| 46 | +matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
| 47 | +df = pd.DataFrame(data=matrix, columns=["Column 1", "Column 2"]) |
| 48 | + |
| 49 | +hole_gen = UniformHoleGenerator( |
| 50 | + n_splits=1, random_state=rng, subset=["Column 2"], ratio_masked=0.2 |
| 51 | +) |
53 | 52 | df_mask = hole_gen.generate_mask(df) |
54 | | -df_unmasked = ~df_mask |
55 | | -df_unmasked["Column_1"] = False |
56 | 53 |
|
57 | | -df_observed = df.mask(df_mask).dropna() |
58 | | -df_hidden = df.mask(df_unmasked).dropna(subset="Column_2") |
| 54 | +has_nan = df_mask.any(axis=1) |
59 | 55 |
|
60 | | -plt_1 = plt.scatter(df_observed.iloc[:, 0], df_observed.iloc[:, 1], label="Observed values") |
61 | | -plt_2 = plt.scatter(df_hidden.iloc[:, 0], df_hidden.iloc[:, 1], label="Missing values") |
| 56 | +df_observed = df.loc[~has_nan] |
| 57 | +df_hidden = df.loc[has_nan] |
| 58 | + |
| 59 | +plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values") |
| 60 | +plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2") |
62 | 61 |
|
63 | 62 | plt.legend( |
64 | 63 | loc="lower left", |
65 | 64 | fontsize=8, |
66 | 65 | ) |
67 | | -plt.title("Case 1 : MCAR missingness mechanism") |
| 66 | +plt.xlabel("Column 1") |
| 67 | +plt.ylabel("Column 2") |
| 68 | +plt.title("Case 1: MCAR missingness mechanism") |
68 | 69 | plt.show() |
69 | 70 |
|
70 | 71 | # %% |
71 | 72 |
|
72 | | -mcartest.test(df.mask(df_mask)) |
| 73 | +result = test_mcar.test(df.mask(df_mask)) |
| 74 | +print(f"Test p-value: {result:.2%}") |
73 | 75 | # %% |
74 | 76 | # The p-value is quite high; therefore, we do not reject H0. |
75 | 77 | # The data are thus consistent with an MCAR missingness mechanism. |
76 | 78 |
|
77 | 79 | # %% |
78 | | -# Case 2 : Normal iid features with MAR holes |
| 80 | +# Case 2: Normal iid features with MAR holes |
79 | 81 | # =========================================== |
80 | | -np.random.seed(42) |
81 | 82 | quantile_95 = norm.ppf(0.975) |
82 | 83 |
|
83 | 84 | matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
|
99 | 100 | loc="lower left", |
100 | 101 | fontsize=8, |
101 | 102 | ) |
102 | | -plt.title("Case 2 : MAR missingness mechanism") |
| 103 | +plt.title("Case 2: MAR missingness mechanism") |
103 | 104 | plt.show() |
104 | 105 |
|
105 | 106 | # %% |
106 | 107 |
|
107 | | -mcartest.test(df.mask(df_mask)) |
| 108 | +test_mcar.test(df.mask(df_mask)) |
108 | 109 | # %% |
109 | 110 | # The p-value is lower than the classic threshold (5%). |
110 | 111 | # H0 is then rejected: the missingness mechanism is not MCAR (here, MAR by construction). |
111 | 112 |
|
112 | 113 | # %% |
113 | | -# Case 3 : Normal iid features with MAR holes |
| 114 | +# Case 3: Normal iid features with MAR holes |
114 | 115 | # =========================================== |
115 | 116 | # This specific case is designed to emphasize the limits of Little's test. Here, we generate |
116 | 117 | # holes when the absolute value of the first feature is high. This missingness mechanism is clearly |
|
137 | 138 | loc="lower left", |
138 | 139 | fontsize=8, |
139 | 140 | ) |
140 | | -plt.title("Case 3 : MAR missingness mechanism undetected by the Little's test") |
| 141 | +plt.title("Case 3: MAR missingness mechanism undetected by the Little's test") |
141 | 142 | plt.show() |
142 | 143 |
|
143 | 144 | # %% |
144 | 145 |
|
145 | | -mcartest.test(df.mask(df_mask)) |
| 146 | +test_mcar.test(df.mask(df_mask)) |
146 | 147 | # %% |
147 | 148 | # The p-value is higher than the classic threshold (5%). |
148 | 149 | # H0 is not rejected even though the missingness mechanism is clearly MAR. |
|
0 commit comments