|
19 | 19 |
|
20 | 20 | plt.rcParams.update({"font.size": 12}) |
21 | 21 |
|
| 22 | + |
| 23 | +# %% |
| 24 | +# Generating random data |
| 25 | +# ---------------------- |
| 26 | + |
22 | 27 | rng = np.random.RandomState(42) |
| 28 | +data = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
| 29 | +df = pd.DataFrame(data=data, columns=["Column 1", "Column 2"]) |
| 30 | + |
| 31 | +q975 = norm.ppf(0.975) |
23 | 32 |
|
24 | 33 | # %% |
25 | | -# 1. The Little's test |
| 34 | +# The Little's test |
26 | 35 | # --------------------------------------------------------------- |
27 | 36 | # First, we need to introduce the concept of a missing pattern. A missing pattern, also called a |
28 | 37 | # pattern, is the structure of observed and missing values in a dataset. For example, in a |
|
34 | 43 | # We choose to use the classic threshold of 5%. If the test p-value is below this threshold, |
35 | 44 | # we reject the null hypothesis. |
36 | 45 | # |
37 | | -# This notebook shows how the Little's test performs and its limitations. |
| 46 | +# This notebook shows how the Little's test performs on a simplistic case and its limitations. We |
| 47 | +# instantiate a test object with a random state for reproducibility.
38 | 48 |
|
39 | 49 | test_mcar = LittleTest(random_state=rng) |
40 | 50 |
|
41 | 51 | # %% |
42 | | -# Case 1: Normal iid features with MCAR holes |
43 | | -# ============================================ |
| 52 | +# Case 1: MCAR holes (True negative) |
| 53 | +# ================================== |
44 | 54 |
|
45 | 55 |
|
46 | | -matrix = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
47 | | -df = pd.DataFrame(data=matrix, columns=["Column 1", "Column 2"]) |
48 | | - |
49 | 56 | hole_gen = UniformHoleGenerator( |
50 | 57 | n_splits=1, random_state=rng, subset=["Column 2"], ratio_masked=0.2 |
51 | 58 | ) |
52 | 59 | df_mask = hole_gen.generate_mask(df) |
| 60 | +df_nan = df.where(~df_mask, np.nan) |
53 | 61 |
|
54 | 62 | has_nan = df_mask.any(axis=1) |
55 | | - |
56 | 63 | df_observed = df.loc[~has_nan] |
57 | 64 | df_hidden = df.loc[has_nan] |
58 | 65 |
|
|
66 | 73 | plt.xlabel("Column 1") |
67 | 74 | plt.ylabel("Column 2") |
68 | 75 | plt.title("Case 1: MCAR missingness mechanism") |
| 76 | +plt.grid() |
69 | 77 | plt.show() |
70 | 78 |
|
71 | 79 | # %% |
72 | | - |
73 | | -result = test_mcar.test(df.mask(df_mask)) |
| 80 | +result = test_mcar.test(df_nan) |
74 | 81 | print(f"Test p-value: {result:.2%}") |
75 | 82 | # %% |
76 | 83 | # The p-value is quite high, therefore we don't reject H0. |
77 | 84 | # We can then suppose that our missingness mechanism is MCAR. |
78 | 85 |
|
79 | 86 | # %% |
80 | | -# Case 2: Normal iid features with MAR holes |
81 | | -# =========================================== |
82 | | -quantile_95 = norm.ppf(0.975) |
| 87 | +# Case 2: MAR holes with mean bias (True positive) |
| 88 | +# ================================================ |
83 | 89 |
|
84 | | -matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
85 | | -df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"]) |
86 | | -df_nan = df.copy() |
87 | | -df_nan.loc[df_nan["Column_1"] > quantile_95, "Column_2"] = np.nan |
| 90 | +df_mask = pd.DataFrame({"Column 1": False, "Column 2": df["Column 1"] > q975}, index=df.index) |
88 | 91 |
|
89 | | -df_mask = df_nan.isna() |
90 | | -df_unmasked = ~df_mask |
91 | | -df_unmasked["Column_1"] = False |
| 92 | +df_nan = df.where(~df_mask, np.nan) |
92 | 93 |
|
93 | | -df_observed = df.mask(df_mask).dropna() |
94 | | -df_hidden = df.mask(df_unmasked).dropna(subset="Column_2") |
| 94 | +has_nan = df_mask.any(axis=1) |
| 95 | +df_observed = df.loc[~has_nan] |
| 96 | +df_hidden = df.loc[has_nan] |
95 | 97 |
|
96 | | -plt_1 = plt.scatter(df_observed.iloc[:, 0], df_observed.iloc[:, 1], label="Observed values") |
97 | | -plt_2 = plt.scatter(df_hidden.iloc[:, 0], df_hidden.iloc[:, 1], label="Missing values") |
| 98 | +plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values") |
| 99 | +plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2") |
98 | 100 |
|
99 | 101 | plt.legend( |
100 | 102 | loc="lower left", |
101 | 103 | fontsize=8, |
102 | 104 | ) |
| 105 | +plt.xlabel("Column 1") |
| 106 | +plt.ylabel("Column 2") |
103 | 107 | plt.title("Case 2: MAR missingness mechanism") |
| 108 | +plt.grid() |
104 | 109 | plt.show() |
105 | 110 |
|
106 | 111 | # %% |
107 | 112 |
|
108 | | -test_mcar.test(df.mask(df_mask)) |
| 113 | +result = test_mcar.test(df_nan) |
| 114 | +print(f"Test p-value: {result:.2%}") |
109 | 115 | # %% |
110 | 116 | # The p-value is lower than the classic threshold (5%). |
111 | 117 | # H0 is then rejected and we can suppose that our missingness mechanism is MAR. |
112 | 118 |
|
113 | 119 | # %% |
114 | | -# Case 3: Normal iid features with MAR holes |
115 | | -# =========================================== |
| 120 | +# Case 3: MAR holes without any mean bias (False negative)
| 121 | +# =========================================================
| 122 | +# |
116 | 123 | # The specific case is designed to emphasize the Little's test limits. In the case, we generate |
117 | 124 | # holes when the absolute value of the first feature is high. This missingness mechanism is clearly |
118 | 125 | # MAR but the means between missing patterns are not statistically different.
119 | 126 |
|
120 | | -np.random.seed(42) |
121 | | - |
122 | | -matrix = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200) |
123 | | -df = pd.DataFrame(matrix, columns=["Column_1", "Column_2"]) |
124 | | -df_nan = df.copy() |
125 | | -df_nan.loc[abs(df_nan["Column_1"]) > quantile_95, "Column_2"] = np.nan |
| 127 | +df_mask = pd.DataFrame( |
| 128 | + {"Column 1": False, "Column 2": df["Column 1"].abs() > q975}, index=df.index |
| 129 | +) |
126 | 130 |
|
127 | | -df_mask = df_nan.isna() |
128 | | -df_unmasked = ~df_mask |
129 | | -df_unmasked["Column_1"] = False |
| 131 | +df_nan = df.where(~df_mask, np.nan) |
130 | 132 |
|
131 | | -df_observed = df.mask(df_mask).dropna() |
132 | | -df_hidden = df.mask(df_unmasked).dropna(subset="Column_2") |
| 133 | +has_nan = df_mask.any(axis=1) |
| 134 | +df_observed = df.loc[~has_nan] |
| 135 | +df_hidden = df.loc[has_nan] |
133 | 136 |
|
134 | | -plt_1 = plt.scatter(df_observed.iloc[:, 0], df_observed.iloc[:, 1], label="Observed values") |
135 | | -plt_2 = plt.scatter(df_hidden.iloc[:, 0], df_hidden.iloc[:, 1], label="Missing values") |
| 137 | +plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values") |
| 138 | +plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2") |
136 | 139 |
|
137 | 140 | plt.legend( |
138 | 141 | loc="lower left", |
139 | 142 | fontsize=8, |
140 | 143 | ) |
| 144 | +plt.xlabel("Column 1") |
| 145 | +plt.ylabel("Column 2") |
141 | 146 | plt.title("Case 3: MAR missingness mechanism undetected by the Little's test") |
| 147 | +plt.grid() |
142 | 148 | plt.show() |
143 | 149 |
|
144 | 150 | # %% |
145 | 151 |
|
146 | | -test_mcar.test(df.mask(df_mask)) |
| 152 | +result = test_mcar.test(df_nan) |
| 153 | +print(f"Test p-value: {result:.2%}") |
147 | 154 | # %% |
148 | 155 | # The p-value is higher than the classic threshold (5%). |
149 | 156 | # H0 is not rejected whereas the missingness mechanism is clearly MAR. |
|
154 | 161 | # In this tutorial, we can see that Little's test fails to detect covariance heterogeneity between
155 | 162 | # patterns. |
156 | 163 | # |
157 | | -# There exist other limitations. The Little's test only handles quantitative data. And finally, the |
158 | | -# MCAR tests can only handle tabular data (withtout correlation in time). |
| 164 | +# We also note that the Little's test does not handle categorical data or temporally |
| 165 | +# correlated data. |
0 commit comments