scikit-learn-contrib
diff --git a/examples/tutorials/plot_tuto_benchmark_TS.py b/examples/tutorials/plot_tuto_benchmark_TS.py
Lines changed: 19 additions & 6 deletions
diff --git a/examples/tutorials/plot_tuto_diffusion_models.py b/examples/tutorials/plot_tuto_diffusion_models.py
Lines changed: 14 additions & 4 deletions
diff --git a/examples/tutorials/plot_tuto_hole_generator.py b/examples/tutorials/plot_tuto_hole_generator.py
Lines changed: 20 additions & 5 deletions
diff --git a/examples/tutorials/plot_tuto_mcar.py b/examples/tutorials/plot_tuto_mcar.py
Lines changed: 51 additions & 23 deletions
@@ -78,7 +78,9 @@
 ratio_masked = 0.1
 
 imputer_median = imputers.ImputerSimple(groups=("station",), strategy="median")
-imputer_interpol = imputers.ImputerInterpolation(groups=("station",), method="linear")
+imputer_interpol = imputers.ImputerInterpolation(
+    groups=("station",), method="linear"
+)
 imputer_residuals = imputers.ImputerResiduals(
     groups=("station",),
     period=365,
@@ -103,7 +105,10 @@
 )
 
 generator_holes = missing_patterns.EmpiricalHoleGenerator(
-    n_splits=4, groups=("station",), subset=cols_to_impute, ratio_masked=ratio_masked
+    n_splits=4,
+    groups=("station",),
+    subset=cols_to_impute,
+    ratio_masked=ratio_masked,
 )
 
 dict_imputers = {
@@ -142,11 +147,17 @@
 # Aotizhongxin
 
 df_plot = df[cols_to_impute]
-dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
+dfs_imputed = {
+    name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()
+}
 station = "Aotizhongxin"
 df_station = df_plot.loc[station]
-dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
-fig, axs = plt.subplots(3, 1, sharex=True, figsize=(10, 3 * len(cols_to_impute)))
+dfs_imputed_station = {
+    name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()
+}
+fig, axs = plt.subplots(
+    3, 1, sharex=True, figsize=(10, 3 * len(cols_to_impute))
+)
 for col, ax in zip(cols_to_impute, axs.flatten()):
     values_orig = df_station[col]
     ax.plot(values_orig, ".", color="black", label="original")
@@ -174,7 +185,9 @@
 fig = plt.figure(figsize=(10, 10))
 i_plot = 1
 for i, col in enumerate(cols_to_impute[:-1]):
-    for i_imputer, (name_imputer, df_imp) in enumerate(dfs_imputed_station.items()):
+    for i_imputer, (name_imputer, df_imp) in enumerate(
+        dfs_imputed_station.items()
+    ):
         ax = fig.add_subplot(n_columns, n_imputers, i_plot)
         plot.compare_covariances(
             df_station,
 
@@ -66,7 +66,11 @@
 df_data_valid = df_data.iloc[:500]
 
 tabddpm = ImputerDiffusion(
-    model=TabDDPM(), epochs=10, batch_size=100, x_valid=df_data_valid, print_valid=True
+    model=TabDDPM(),
+    epochs=10,
+    batch_size=100,
+    x_valid=df_data_valid,
+    print_valid=True,
 )
 tabddpm = tabddpm.fit(df_data)
 
@@ -150,8 +154,12 @@
 # reconstruction errors (mae) but increases distribution distance (KL_columnwise).
 
 dict_imputers = {
-    "num_sampling=5": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100),
-    "num_sampling=10": ImputerDiffusion(model=TabDDPM(num_sampling=10), epochs=10, batch_size=100),
+    "num_sampling=5": ImputerDiffusion(
+        model=TabDDPM(num_sampling=5), epochs=10, batch_size=100
+    ),
+    "num_sampling=10": ImputerDiffusion(
+        model=TabDDPM(num_sampling=10), epochs=10, batch_size=100
+    ),
 }
 
 comparison = comparator.Comparator(
@@ -196,7 +204,9 @@
 #   but requires a longer training/inference time.
 
 dict_imputers = {
-    "tabddpm": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100),
+    "tabddpm": ImputerDiffusion(
+        model=TabDDPM(num_sampling=5), epochs=10, batch_size=100
+    ),
     "tsddpm": ImputerDiffusion(
         model=TsDDPM(num_sampling=5, is_rolling=False),
         epochs=10,
 
@@ -14,6 +14,7 @@
 It consists of hourly air pollutant data from 12 Chinese nationally-controlled
 air-quality monitoring sites.
 """
+
 from typing import List
 
 import matplotlib
@@ -49,7 +50,9 @@
 # Missing values are in white, while observed ones are in black.
 
 plt.figure(figsize=(15, 4))
-plt.imshow(df.notna().values.T, aspect="auto", cmap="binary", interpolation="none")
+plt.imshow(
+    df.notna().values.T, aspect="auto", cmap="binary", interpolation="none"
+)
 plt.yticks(range(len(df.columns)), df.columns)
 plt.xlabel("Samples", fontsize=12)
 plt.grid(False)
@@ -96,7 +99,9 @@ def visualise_missing_values(df_init: pd.DataFrame, df_mask: pd.DataFrame):
     colorsList = [(0.9, 0, 0), (0, 0, 0), (0.8, 0.8, 0.8)]
     custom_cmap = matplotlib.colors.ListedColormap(colorsList)
     plt.figure(figsize=(15, 4))
-    plt.imshow(df_tot.values.T, aspect="auto", cmap=custom_cmap, interpolation="none")
+    plt.imshow(
+        df_tot.values.T, aspect="auto", cmap=custom_cmap, interpolation="none"
+    )
     plt.yticks(range(len(df_tot.columns)), df_tot.columns)
     plt.xlabel("Samples", fontsize=12)
     plt.grid(False)
@@ -156,7 +161,9 @@ def plot_cdf(
     _, axs = plt.subplots(1, df.shape[1], sharey=True, figsize=(15, 3))
 
     hole_sizes_original = get_holes_sizes_column_wise(df.to_numpy())
-    for ind, (hole_original, col) in enumerate(zip(hole_sizes_original, df.columns)):
+    for ind, (hole_original, col) in enumerate(
+        zip(hole_sizes_original, df.columns)
+    ):
         sorted_data = np.sort(hole_original)
         cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
         axs[ind].plot(sorted_data, cdf, c="gray", lw=2, label="original")
@@ -166,7 +173,9 @@ def plot_cdf(
         array_mask[array_mask == True] = np.nan
         hole_sizes_created = get_holes_sizes_column_wise(array_mask.to_numpy())
 
-        for ind, (hole_created, col) in enumerate(zip(hole_sizes_created, df.columns)):
+        for ind, (hole_created, col) in enumerate(
+            zip(hole_sizes_created, df.columns)
+        ):
             sorted_data = np.sort(hole_created)
             cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
             axs[ind].plot(sorted_data, cdf, c=color, lw=2, label=label)
@@ -309,7 +318,13 @@ def plot_cdf(
 
 plot_cdf(
     df,
-    [uniform_mask, geometric_mask, empirical_mask, multi_markov_mask, grouped_mask],
+    [
+        uniform_mask,
+        geometric_mask,
+        empirical_mask,
+        multi_markov_mask,
+        grouped_mask,
+    ],
     ["uniform", "geometric", "empirical", "mutli markov", "grouped"],
     ["tab:orange", "tab:blue", "tab:green", "tab:pink", "tab:olive"],
 )
@@ -73,8 +73,16 @@
 df_observed = df.loc[~has_nan]
 df_hidden = df.loc[has_nan]
 
-plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values")
-plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2")
+plt.scatter(
+    df_observed["Column 1"],
+    df_observed[["Column 2"]],
+    label="Fully observed values",
+)
+plt.scatter(
+    df_hidden[["Column 1"]],
+    df_hidden[["Column 2"]],
+    label="Values with missing C2",
+)
 
 plt.legend(
     loc="lower left",
@@ -99,16 +107,26 @@
 # Case 2: MAR holes with mean bias (True positive)
 # ================================================
 
-df_mask = pd.DataFrame({"Column 1": False, "Column 2": df["Column 1"] > q975}, index=df.index)
+df_mask = pd.DataFrame(
+    {"Column 1": False, "Column 2": df["Column 1"] > q975}, index=df.index
+)
 
 df_nan = df.where(~df_mask, np.nan)
 
 has_nan = df_mask.any(axis=1)
 df_observed = df.loc[~has_nan]
 df_hidden = df.loc[has_nan]
 
-plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values")
-plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2")
+plt.scatter(
+    df_observed["Column 1"],
+    df_observed[["Column 2"]],
+    label="Fully observed values",
+)
+plt.scatter(
+    df_hidden[["Column 1"]],
+    df_hidden[["Column 2"]],
+    label="Values with missing C2",
+)
 
 plt.legend(
     loc="lower left",
@@ -139,7 +157,8 @@
 # MAR but the means between missing patterns are not statistically different.
 
 df_mask = pd.DataFrame(
-    {"Column 1": False, "Column 2": df["Column 1"].abs() > q975}, index=df.index
+    {"Column 1": False, "Column 2": df["Column 1"].abs() > q975},
+    index=df.index,
 )
 
 df_nan = df.where(~df_mask, np.nan)
@@ -148,8 +167,16 @@
 df_observed = df.loc[~has_nan]
 df_hidden = df.loc[has_nan]
 
-plt.scatter(df_observed["Column 1"], df_observed[["Column 2"]], label="Fully observed values")
-plt.scatter(df_hidden[["Column 1"]], df_hidden[["Column 2"]], label="Values with missing C2")
+plt.scatter(
+    df_observed["Column 1"],
+    df_observed[["Column 2"]],
+    label="Fully observed values",
+)
+plt.scatter(
+    df_hidden[["Column 1"]],
+    df_hidden[["Column 2"]],
+    label="Values with missing C2",
+)
 
 plt.legend(
     loc="lower left",
@@ -182,7 +209,7 @@
 #
 # We also note that Little's test does not handle categorical data or temporally
 # correlated data.
-# 
+#
 # This is why we have implemented the PKLM test, which makes up for the shortcomings of Little's
 # test. We present this test in more detail in the next section.
 
@@ -277,21 +304,18 @@
 col1 = rng.rand(n_rows) * 100
 col2 = rng.randint(1, 100, n_rows)
 col3 = rng.choice([True, False], n_rows)
-modalities = ['A', 'B', 'C', 'D']
+modalities = ["A", "B", "C", "D"]
 col4 = rng.choice(modalities, n_rows)
 
-df = pd.DataFrame({
-    'Numeric1': col1,
-    'Numeric2': col2,
-    'Boolean': col3,
-    'Object': col4
-})
+df = pd.DataFrame(
+    {"Numeric1": col1, "Numeric2": col2, "Boolean": col3, "Object": col4}
+)
 
 hole_gen = UniformHoleGenerator(
     n_splits=1,
     ratio_masked=0.2,
-    subset=['Numeric1', 'Numeric2', 'Boolean', 'Object'],
-    random_state=rng
+    subset=["Numeric1", "Numeric2", "Boolean", "Object"],
+    random_state=rng,
 )
 df_mask = hole_gen.generate_mask(df)
 df_nan = df.where(~df_mask, np.nan)
@@ -328,9 +352,11 @@
 data = rng.multivariate_normal(
     mean=[0, 0, 0, 0],
     cov=[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
-    size=400
+    size=400,
+)
+df = pd.DataFrame(
+    data=data, columns=["Column 1", "Column 2", "Column 3", "Column 4"]
 )
-df = pd.DataFrame(data=data, columns=["Column 1", "Column 2", "Column 3", "Column 4"])
 
 df_mask = pd.DataFrame(
     {
@@ -339,7 +365,7 @@
         "Column 3": False,
         "Column 4": False,
     },
-    index=df.index
+    index=df.index,
 )
 df_nan = df.where(~df_mask, np.nan)
 
@@ -360,7 +386,9 @@
 
 # %%
 for col_index, partial_p_v in enumerate(partial_p_values):
-    print(f"The partial p-value for the column index {col_index + 1} is: {partial_p_v:.2%}")
+    print(
+        f"The partial p-value for the column index {col_index + 1} is: {partial_p_v:.2%}"
+    )
 
 # %%
 # As a result, by removing the missing patterns induced by variable 2, the p-value rises
@@ -380,4 +408,4 @@
 # | 10000      | 6          | 20"54                |
 # | 10000      | 10         | 14"48                |
 # | 100000     | 10         | 4'51"                |
-# | 100000     | 15         | 3'06"                |
+# | 100000     | 15         | 3'06"                |