|
19 | 19 |
|
20 | 20 | plt.rcParams.update({"font.size": 12}) |
21 | 21 |
|
| 22 | + |
| 23 | +# %% |
| 24 | +# Generating random data |
| 25 | +# ---------------------- |
| 26 | + |
22 | 27 | rng = np.random.RandomState(42) |
| 28 | +data = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
| 29 | +df = pd.DataFrame(data=data, columns=["Column 1", "Column 2"]) |
| 30 | + |
| 31 | +q975 = norm.ppf(0.975) |
23 | 32 |
|
24 | 33 | # %% |
25 | | -# 1. The Little's test |
| 34 | +# The Little's test |
26 | 35 | # --------------------------------------------------------------- |
27 | 36 | # First, we need to introduce the concept of a missing pattern. A missing pattern, also called a |
28 | 37 | # pattern, is the structure of observed and missing values in a dataset. For example, in a |
|
34 | 43 | # We choose to use the classic threshold of 5%. If the test p-value is below this threshold, |
35 | 44 | # we reject the null hypothesis. |
36 | 45 | # |
37 | | -# This notebook shows how the Little's test performs and its limitations. |
| 46 | +# This notebook shows how the Little's test performs on a simplistic case and its limitations. We |
| 47 | +# instantiate a test object with a random state for reproducibility.
38 | 48 |
|
39 | 49 | test_mcar = LittleTest(random_state=rng) |
40 | 50 |
|
41 | 51 | # %% |
42 | | -# Case 1: Normal iid features with MCAR holes |
43 | | -# ============================================ |
| 52 | +# Case 1: MCAR holes (True negative) |
| 53 | +# ================================== |
44 | 54 |
|
45 | 55 |
|
46 | | -matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
47 | | -df = pd.DataFrame(data=matrix, columns=["Column 1", "Column 2"]) |
48 | | - |
49 | 56 | hole_gen = UniformHoleGenerator( |
50 | 57 | n_splits=1, random_state=rng, subset=["Column 2"], ratio_masked=0.2 |
51 | 58 | ) |
52 | 59 | df_mask = hole_gen.generate_mask(df) |
| 60 | +df_nan = df.where(~df_mask, np.nan) |
53 | 61 |
|
54 | 62 | has_nan = df_mask.any(axis=1) |
55 | | - |
56 | 63 | df_observed = df.loc[~has_nan] |
57 | 64 | df_hidden = df.loc[has_nan] |
58 | 65 |
|
|
66 | 73 | plt.xlabel("Column 1") |
67 | 74 | plt.ylabel("Column 2") |
68 | 75 | plt.title("Case 1: MCAR missingness mechanism") |
| 76 | +plt.grid() |
69 | 77 | plt.show() |
70 | 78 |
|
71 | 79 | # %% |
72 | | - |
73 | | -result = test_mcar.test(df.mask(df_mask)) |
| 80 | +result = test_mcar.test(df_nan) |
74 | 81 | print(f"Test p-value: {result:.2%}") |
75 | 82 | # %% |
76 | 83 | # The p-value is quite high, therefore we don't reject H0. |
77 | 84 | # We can then suppose that our missingness mechanism is MCAR. |
78 | 85 |
|
79 | 86 | # %% |
80 | | -# Case 2: Normal iid features with MAR holes |
81 | | -# =========================================== |
82 | | -quantile_95 = norm.ppf(0.975) |
| 87 | +# Case 2: MAR holes with mean bias (True positive) |
| 88 | +# ================================================ |
83 | 89 |
|
84 | | -matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
85 | | -df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"]) |
86 | | -df_nan = df.copy() |
87 | | -df_nan.loc[df_nan["Column_1"] > quantile_95, "Column_2"] = np.nan |
| 90 | +df_mask = pd.DataFrame({"Column 1": False, "Column 2": df["Column 1"] > q975}, index=df.index) |
88 | 91 |
|
89 | | -df_mask = df_nan.isna() |
90 | | -df_unmasked = ~df_mask |
91 | | -df_unmasked["Column_1"] = False |
| 92 | +df_nan = df.where(~df_mask, np.nan) |
92 | 93 |
|
93 | | -df_observed = df.mask(df_mask).dropna() |
94 | | -df_hidden = df.mask(df_unmasked).dropna(subset="Column_2") |
| 94 | +has_nan = df_mask.any(axis=1) |
| 95 | +df_observed = df.loc[~has_nan] |
| 96 | +df_hidden = df.loc[has_nan] |
95 | 97 |
|
96 | | -plt_1 = plt.scatter(df_observed.iloc[:, 0], df_observed.iloc[:, 1], label="Observed values") |
97 | | -plt_2 = plt.scatter(df_hidden.iloc[:, 0], df_hidden.iloc[:, 1], label="Missing values") |
| 98 | +plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values") |
| 99 | +plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2") |
98 | 100 |
|
99 | 101 | plt.legend( |
100 | 102 | loc="lower left", |
101 | 103 | fontsize=8, |
102 | 104 | ) |
| 105 | +plt.xlabel("Column 1") |
| 106 | +plt.ylabel("Column 2") |
103 | 107 | plt.title("Case 2: MAR missingness mechanism") |
| 108 | +plt.grid() |
104 | 109 | plt.show() |
105 | 110 |
|
106 | 111 | # %% |
107 | 112 |
|
108 | | -test_mcar.test(df.mask(df_mask)) |
| 113 | +result = test_mcar.test(df_nan) |
| 114 | +print(f"Test p-value: {result:.2%}") |
109 | 115 | # %% |
110 | 116 | # The p-value is lower than the classic threshold (5%). |
111 | 117 | # H0 is then rejected and we can suppose that our missingness mechanism is MAR. |
112 | 118 |
|
113 | 119 | # %% |
114 | | -# Case 3: Normal iid features with MAR holes |
115 | | -# =========================================== |
| 120 | +# Case 3: MAR holes without any mean bias (False negative)
| 121 | +# =========================================================
| 122 | +# |
116 | 123 | # The specific case is designed to emphasize the Little's test limits. In the case, we generate |
117 | 124 | # holes when the absolute value of the first feature is high. This missingness mechanism is clearly |
118 | 125 | # MAR but the means between missing patterns are not statistically different.
119 | 126 |
|
120 | | -np.random.seed(42) |
121 | | - |
122 | | -matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
123 | | -df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"]) |
124 | | -df_nan = df.copy() |
125 | | -df_nan.loc[abs(df_nan["Column_1"]) > quantile_95, "Column_2"] = np.nan |
| 127 | +df_mask = pd.DataFrame( |
| 128 | + {"Column 1": False, "Column 2": df["Column 1"].abs() > q975}, index=df.index |
| 129 | +) |
126 | 130 |
|
127 | | -df_mask = df_nan.isna() |
128 | | -df_unmasked = ~df_mask |
129 | | -df_unmasked["Column_1"] = False |
| 131 | +df_nan = df.where(~df_mask, np.nan) |
130 | 132 |
|
131 | | -df_observed = df.mask(df_mask).dropna() |
132 | | -df_hidden = df.mask(df_unmasked).dropna(subset="Column_2") |
| 133 | +has_nan = df_mask.any(axis=1) |
| 134 | +df_observed = df.loc[~has_nan] |
| 135 | +df_hidden = df.loc[has_nan] |
133 | 136 |
|
134 | | -plt_1 = plt.scatter(df_observed.iloc[:, 0], df_observed.iloc[:, 1], label="Observed values") |
135 | | -plt_2 = plt.scatter(df_hidden.iloc[:, 0], df_hidden.iloc[:, 1], label="Missing values") |
| 137 | +plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values") |
| 138 | +plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2") |
136 | 139 |
|
137 | 140 | plt.legend( |
138 | 141 | loc="lower left", |
139 | 142 | fontsize=8, |
140 | 143 | ) |
| 144 | +plt.xlabel("Column 1") |
| 145 | +plt.ylabel("Column 2") |
141 | 146 | plt.title("Case 3: MAR missingness mechanism undetected by the Little's test") |
| 147 | +plt.grid() |
142 | 148 | plt.show() |
143 | 149 |
|
144 | 150 | # %% |
145 | 151 |
|
146 | | -test_mcar.test(df.mask(df_mask)) |
| 152 | +result = test_mcar.test(df_nan) |
| 153 | +print(f"Test p-value: {result:.2%}") |
147 | 154 | # %% |
148 | 155 | # The p-value is higher than the classic threshold (5%). |
149 | 156 | # H0 is not rejected whereas the missingness mechanism is clearly MAR. |
|
154 | 161 | # In this tutorial, we can see that Little's test fails to detect covariance heterogeneity between
155 | 162 | # patterns. |
156 | 163 | # |
157 | | -# There exist other limitations. The Little's test only handles quantitative data. And finally, the |
158 | | -# MCAR tests can only handle tabular data (withtout correlation in time). |
| 164 | +# We also note that the Little's test does not handle categorical data or temporally |
| 165 | +# correlated data. |
0 commit comments