Skip to content

Commit 93ddbbe

Browse files
Julien Roussel
authored and committed
Random state management and tests added to hole_generator
1 parent 67d44cf commit 93ddbbe

File tree

4 files changed

+64
-39
lines changed

4 files changed

+64
-39
lines changed

examples/tutorials/plot_tuto_mcar.py

Lines changed: 34 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
============================================
3-
Tutorial for testing the MCAR case
3+
Tutorial for Testing the MCAR Case
44
============================================
55
66
In this tutorial, we show how to test the MCAR case using Little's test.
@@ -19,65 +19,66 @@
1919

2020
plt.rcParams.update({"font.size": 12})
2121

22+
rng = np.random.RandomState(42)
23+
2224
# %%
2325
# 1. The Little's test
2426
# ---------------------------------------------------------------
25-
# First, we need to introduce the concept of missing pattern. A missing pattern, also called
26-
# pattern, is the structure of observed and missing values in a data set. For example, for a
27-
# dataset with 2 columns, the possible patterns are : (0, 0), (1, 0), (0, 1), (1, 1). The value 1
28-
# (0) indicates that the value in the column is missing (observed).
29-
#
30-
# The null hypothesis, H0, is : "The means of observations within each pattern are similar.".
31-
# Against the alternative hypothesis, H1 : "The means of the observed variables can vary across the
32-
# patterns."
27+
# First, we need to introduce the concept of a missing pattern. A missing pattern, also called a
28+
# pattern, is the structure of observed and missing values in a dataset. For example, in a
29+
# dataset with two columns, the possible patterns are: (0, 0), (1, 0), (0, 1), (1, 1). The value 1
30+
# (0) indicates that the column value is missing (observed).
3331
#
34-
# If H0 is not rejected , we can assume that the missing data mechanism is MCAR. On the contrary,
35-
# if H0 is rejected, we can assume that the missing data mechanism is MAR.
32+
# The null hypothesis, H0, is: "The means of observations within each pattern are similar."
3633
#
37-
# We choose to use the classic threshold, equal to 5%. If the test p_value is below this threshold,
34+
# We choose to use the classic threshold of 5%. If the test p-value is below this threshold,
3835
# we reject the null hypothesis.
3936
#
4037
# This notebook shows how Little's test performs and what its limitations are.
4138

42-
mcartest = LittleTest()
39+
test_mcar = LittleTest(random_state=rng)
4340

4441
# %%
45-
# Case 1 : Normal iid features with MCAR holes
42+
# Case 1: Normal iid features with MCAR holes
4643
# ============================================
4744

48-
np.random.seed(42)
49-
matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
50-
df = pd.DataFrame(data=matrix, columns=["Column_1", "Column_2"])
5145

52-
hole_gen = UniformHoleGenerator(n_splits=1, random_state=42, subset=["Column_2"], ratio_masked=0.2)
46+
matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
47+
df = pd.DataFrame(data=matrix, columns=["Column 1", "Column 2"])
48+
49+
hole_gen = UniformHoleGenerator(
50+
n_splits=1, random_state=rng, subset=["Column 2"], ratio_masked=0.2
51+
)
5352
df_mask = hole_gen.generate_mask(df)
54-
df_unmasked = ~df_mask
55-
df_unmasked["Column_1"] = False
5653

57-
df_observed = df.mask(df_mask).dropna()
58-
df_hidden = df.mask(df_unmasked).dropna(subset="Column_2")
54+
has_nan = df_mask.any(axis=1)
5955

60-
plt_1 = plt.scatter(df_observed.iloc[:, 0], df_observed.iloc[:, 1], label="Observed values")
61-
plt_2 = plt.scatter(df_hidden.iloc[:, 0], df_hidden.iloc[:, 1], label="Missing values")
56+
df_observed = df.loc[~has_nan]
57+
df_hidden = df.loc[has_nan]
58+
59+
plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values")
60+
plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2")
6261

6362
plt.legend(
6463
loc="lower left",
6564
fontsize=8,
6665
)
67-
plt.title("Case 1 : MCAR missingness mechanism")
66+
plt.xlabel("Column 1")
67+
plt.ylabel("Column 2")
68+
plt.title("Case 1: MCAR missingness mechanism")
6869
plt.show()
6970

7071
# %%
7172

72-
mcartest.test(df.mask(df_mask))
73+
result = test_mcar.test(df.mask(df_mask))
74+
print(f"Test p-value: {result:.2%}")
7375
# %%
7476
# The p-value is quite high; therefore, we do not reject H0.
7577
# We can then suppose that our missingness mechanism is MCAR.
7678

7779
# %%
78-
# Case 2 : Normal iid features with MAR holes
80+
# Case 2: Normal iid features with MAR holes
7981
# ===========================================
80-
np.random.seed(42)
8182
quantile_95 = norm.ppf(0.975)
8283

8384
matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
@@ -99,18 +100,18 @@
99100
loc="lower left",
100101
fontsize=8,
101102
)
102-
plt.title("Case 2 : MAR missingness mechanism")
103+
plt.title("Case 2: MAR missingness mechanism")
103104
plt.show()
104105

105106
# %%
106107

107-
mcartest.test(df.mask(df_mask))
108+
test_mcar.test(df.mask(df_mask))
108109
# %%
109110
# The p-value is lower than the classic threshold (5%).
110111
# H0 is then rejected and we can suppose that our missingness mechanism is MAR.
111112

112113
# %%
113-
# Case 3 : Normal iid features with MAR holes
114+
# Case 3: Normal iid features with MAR holes
114115
# ===========================================
115116
# This specific case is designed to emphasize the limits of Little's test. Here, we generate
116117
# holes when the absolute value of the first feature is high. This missingness mechanism is clearly
@@ -137,12 +138,12 @@
137138
loc="lower left",
138139
fontsize=8,
139140
)
140-
plt.title("Case 3 : MAR missingness mechanism undetected by the Little's test")
141+
plt.title("Case 3: MAR missingness mechanism undetected by the Little's test")
141142
plt.show()
142143

143144
# %%
144145

145-
mcartest.test(df.mask(df_mask))
146+
test_mcar.test(df.mask(df_mask))
146147
# %%
147148
# The p-value is higher than the classic threshold (5%).
148149
# H0 is not rejected, even though the missingness mechanism is clearly MAR.

qolmat/analysis/holes_characterization.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,12 @@ def test(self, df: pd.DataFrame) -> float:
6666
float
6767
The p-value of the test.
6868
"""
69+
print("test")
70+
print(self.random_state.randint(100))
6971
imputer = self.imputer or ImputerEM(random_state=self.random_state)
7072
imputer = imputer._fit_element(df)
73+
print(df[df.notna()].mean().mean())
74+
print("means:", imputer.means)
7175

7276
d0 = 0
7377
n_rows, n_cols = df.shape

qolmat/benchmark/missing_patterns.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -185,17 +185,16 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame:
185185
Initial dataframe with a missing pattern to be imitated.
186186
"""
187187

188-
self.rng = sku.check_random_state(self.random_state)
188+
self.random_state = sku.check_random_state(self.random_state)
189189
df_mask = pd.DataFrame(False, index=X.index, columns=X.columns)
190190
n_masked_col = math.ceil(self.ratio_masked * len(X))
191191

192192
for column in self.subset:
193193
indices = np.where(X[column].notna())[0]
194-
indices = resample(
194+
indices = self.random_state.choice(
195195
indices,
196196
replace=False,
197-
n_samples=n_masked_col,
198-
stratify=None,
197+
size=n_masked_col,
199198
)
200199
df_mask[column].iloc[indices] = True
201200

@@ -699,7 +698,7 @@ def split(self, X: pd.DataFrame) -> List[pd.DataFrame]:
699698
list_masks = []
700699

701700
for _ in range(self.n_splits):
702-
shuffled_group_sizes = group_sizes.sample(frac=1)
701+
shuffled_group_sizes = group_sizes.sample(frac=1, random_state=self.random_state)
703702

704703
ratio_masks = shuffled_group_sizes.cumsum() / len(X)
705704
ratio_masks = ratio_masks.reset_index(name="ratio")

tests/benchmark/test_missing_patterns.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@
3232
@pytest.mark.parametrize(
3333
"df, generator",
3434
[
35-
(df_incomplet, list_generators["geo"]),
3635
(df_incomplet, list_generators["unif"]),
36+
(df_incomplet, list_generators["geo"]),
3737
(df_incomplet, list_generators["multi"]),
3838
(df_incomplet_group, list_generators["group"]),
3939
],
@@ -48,6 +48,27 @@ def test_SamplerHoleGenerator_split(df: pd.DataFrame, generator: mp._HoleGenerat
4848
np.testing.assert_allclose(col2_holes, expected_col2_holes, atol=1)
4949

5050

51+
@pytest.mark.parametrize(
52+
"df, generator",
53+
[
54+
(df_incomplet, list_generators["unif"]),
55+
(df_incomplet, list_generators["geo"]),
56+
(df_incomplet, list_generators["multi"]),
57+
(df_incomplet_group, list_generators["group"]),
58+
],
59+
)
60+
def test_SamplerHoleGenerator_reproducible(df: pd.DataFrame, generator: mp._HoleGenerator) -> None:
61+
generator.random_state = 42
62+
mask1 = generator.split(df)[0]
63+
generator.random_state = 43
64+
mask2 = generator.split(df)[0]
65+
generator.random_state = 42
66+
mask3 = generator.split(df)[0]
67+
68+
np.testing.assert_array_equal(mask1, mask3)
69+
assert (mask1 != mask2).any().any()
70+
71+
5172
@pytest.mark.parametrize(
5273
"df, generator",
5374
[

0 commit comments

Comments
 (0)