Skip to content

Commit 7b95699

Browse files
Julien RousselJulien Roussel
authored andcommitted
tuto refacto
1 parent 93ddbbe commit 7b95699

File tree

3 files changed

+56
-47
lines changed

3 files changed

+56
-47
lines changed

HISTORY.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22
History
33
=======
44

5+
??
6+
------------------
7+
* Little's test implemented in a new hole_characterization module
8+
* Documentation now includes an analysis section with a tutorial
9+
* Hole generators now provide reproducible outputs
10+
511
0.1.3 (2024-03-07)
612
------------------
713

examples/tutorials/plot_tuto_mcar.py

Lines changed: 50 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,19 @@
1919

2020
plt.rcParams.update({"font.size": 12})
2121

22+
23+
# %%
24+
# Generating random data
25+
# ----------------------
26+
2227
rng = np.random.RandomState(42)
28+
data = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
29+
df = pd.DataFrame(data=data, columns=["Column 1", "Column 2"])
30+
31+
q975 = norm.ppf(0.975)
2332

2433
# %%
25-
# 1. The Little's test
34+
# The Little's test
2635
# ---------------------------------------------------------------
2736
# First, we need to introduce the concept of a missing pattern. A missing pattern, also called a
2837
# pattern, is the structure of observed and missing values in a dataset. For example, in a
@@ -34,25 +43,23 @@
3443
# We choose to use the classic threshold of 5%. If the test p-value is below this threshold,
3544
# we reject the null hypothesis.
3645
#
37-
# This notebook shows how the Little's test performs and its limitations.
46+
# This notebook shows how the Little's test performs on a simplistic case and its limitations. We
47+
# instantiate a test object with a random state for reproducibility.
3848

3949
test_mcar = LittleTest(random_state=rng)
4050

4151
# %%
42-
# Case 1: Normal iid features with MCAR holes
43-
# ============================================
52+
# Case 1: MCAR holes (True negative)
53+
# ==================================
4454

4555

46-
matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
47-
df = pd.DataFrame(data=matrix, columns=["Column 1", "Column 2"])
48-
4956
hole_gen = UniformHoleGenerator(
5057
n_splits=1, random_state=rng, subset=["Column 2"], ratio_masked=0.2
5158
)
5259
df_mask = hole_gen.generate_mask(df)
60+
df_nan = df.where(~df_mask, np.nan)
5361

5462
has_nan = df_mask.any(axis=1)
55-
5663
df_observed = df.loc[~has_nan]
5764
df_hidden = df.loc[has_nan]
5865

@@ -66,84 +73,84 @@
6673
plt.xlabel("Column 1")
6774
plt.ylabel("Column 2")
6875
plt.title("Case 1: MCAR missingness mechanism")
76+
plt.grid()
6977
plt.show()
7078

7179
# %%
72-
73-
result = test_mcar.test(df.mask(df_mask))
80+
result = test_mcar.test(df_nan)
7481
print(f"Test p-value: {result:.2%}")
7582
# %%
7683
# The p-value is quite high, therefore we don't reject H0.
7784
# We can then suppose that our missingness mechanism is MCAR.
7885

7986
# %%
80-
# Case 2: Normal iid features with MAR holes
81-
# ===========================================
82-
quantile_95 = norm.ppf(0.975)
87+
# Case 2: MAR holes with mean bias (True positive)
88+
# ================================================
8389

84-
matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
85-
df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"])
86-
df_nan = df.copy()
87-
df_nan.loc[df_nan["Column_1"] > quantile_95, "Column_2"] = np.nan
90+
df_mask = pd.DataFrame({"Column 1": False, "Column 2": df["Column 1"] > q975}, index=df.index)
8891

89-
df_mask = df_nan.isna()
90-
df_unmasked = ~df_mask
91-
df_unmasked["Column_1"] = False
92+
df_nan = df.where(~df_mask, np.nan)
9293

93-
df_observed = df.mask(df_mask).dropna()
94-
df_hidden = df.mask(df_unmasked).dropna(subset="Column_2")
94+
has_nan = df_mask.any(axis=1)
95+
df_observed = df.loc[~has_nan]
96+
df_hidden = df.loc[has_nan]
9597

96-
plt_1 = plt.scatter(df_observed.iloc[:, 0], df_observed.iloc[:, 1], label="Observed values")
97-
plt_2 = plt.scatter(df_hidden.iloc[:, 0], df_hidden.iloc[:, 1], label="Missing values")
98+
plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values")
99+
plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2")
98100

99101
plt.legend(
100102
loc="lower left",
101103
fontsize=8,
102104
)
105+
plt.xlabel("Column 1")
106+
plt.ylabel("Column 2")
103107
plt.title("Case 2: MAR missingness mechanism")
108+
plt.grid()
104109
plt.show()
105110

106111
# %%
107112

108-
test_mcar.test(df.mask(df_mask))
113+
result = test_mcar.test(df_nan)
114+
print(f"Test p-value: {result:.2%}")
109115
# %%
110116
# The p-value is lower than the classic threshold (5%).
111117
# H0 is then rejected: the missingness mechanism is not MCAR (here it is MAR by construction).
112118

113119
# %%
114-
# Case 3: Normal iid features with MAR holes
115-
# ===========================================
120+
# Case 3: MAR holes with no mean bias (False negative)
121+
# =====================================================
122+
#
116123
# This specific case is designed to emphasize the Little's test limits. In this case, we generate
117124
# holes when the absolute value of the first feature is high. This missingness mechanism is clearly
118125
# MAR but the means of the missing patterns are not statistically different.
119126

120-
np.random.seed(42)
121-
122-
matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
123-
df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"])
124-
df_nan = df.copy()
125-
df_nan.loc[abs(df_nan["Column_1"]) > quantile_95, "Column_2"] = np.nan
127+
df_mask = pd.DataFrame(
128+
{"Column 1": False, "Column 2": df["Column 1"].abs() > q975}, index=df.index
129+
)
126130

127-
df_mask = df_nan.isna()
128-
df_unmasked = ~df_mask
129-
df_unmasked["Column_1"] = False
131+
df_nan = df.where(~df_mask, np.nan)
130132

131-
df_observed = df.mask(df_mask).dropna()
132-
df_hidden = df.mask(df_unmasked).dropna(subset="Column_2")
133+
has_nan = df_mask.any(axis=1)
134+
df_observed = df.loc[~has_nan]
135+
df_hidden = df.loc[has_nan]
133136

134-
plt_1 = plt.scatter(df_observed.iloc[:, 0], df_observed.iloc[:, 1], label="Observed values")
135-
plt_2 = plt.scatter(df_hidden.iloc[:, 0], df_hidden.iloc[:, 1], label="Missing values")
137+
plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values")
138+
plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2")
136139

137140
plt.legend(
138141
loc="lower left",
139142
fontsize=8,
140143
)
144+
plt.xlabel("Column 1")
145+
plt.ylabel("Column 2")
141146
plt.title("Case 3: MAR missingness mechanism undetected by the Little's test")
147+
plt.grid()
142148
plt.show()
143149

144150
# %%
145151

146-
test_mcar.test(df.mask(df_mask))
152+
result = test_mcar.test(df_nan)
153+
print(f"Test p-value: {result:.2%}")
147154
# %%
148155
# The p-value is higher than the classic threshold (5%).
149156
# H0 is not rejected whereas the missingness mechanism is clearly MAR.
@@ -154,5 +161,5 @@
154161
# In this tutorial, we can see that Little's test fails to detect covariance heterogeneity between
155162
# patterns.
156163
#
157-
# There exist other limitations. The Little's test only handles quantitative data. And finally, the
158-
# MCAR tests can only handle tabular data (withtout correlation in time).
164+
# We also note that the Little's test does not handle categorical data or temporally
165+
# correlated data.

qolmat/analysis/holes_characterization.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,8 @@ def test(self, df: pd.DataFrame) -> float:
6666
float
6767
The p-value of the test.
6868
"""
69-
print("test")
70-
print(self.random_state.randint(100))
7169
imputer = self.imputer or ImputerEM(random_state=self.random_state)
7270
imputer = imputer._fit_element(df)
73-
print(df[df.notna()].mean().mean())
74-
print("means:", imputer.means)
7571

7672
d0 = 0
7773
n_rows, n_cols = df.shape

0 commit comments

Comments
 (0)