Fréchet distance refactor

Julien Roussel · Julien Roussel · commit 397d26f8d0c7 · 2024-04-15T14:30:17.000+02:00
diff --git a/examples/benchmark.md b/examples/benchmark.md
@@ -16,9 +16,6 @@ jupyter:
 **This notebook aims to present the Qolmat repo through an example of a multivariate time series.
 In Qolmat, a few data imputation methods are implemented as well as a way to evaluate their performance.**
 
-```python
-
-```
 
 First, import some useful libraries
 
@@ -36,26 +33,18 @@ from IPython.display import Image
 import pandas as pd
 from datetime import datetime
 import numpy as np
-import scipy
 import hyperopt as ho
-from hyperopt.pyll.base import Apply as hoApply
 np.random.seed(1234)
-import pprint
 from matplotlib import pyplot as plt
-import matplotlib.image as mpimg
 import matplotlib.ticker as plticker
 
 tab10 = plt.get_cmap("tab10")
 plt.rcParams.update({'font.size': 18})
 
-from typing import Optional
 
 from sklearn.linear_model import LinearRegression
-from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
-
 
-import sys
-from qolmat.benchmark import comparator, missing_patterns, hyperparameters
+from qolmat.benchmark import comparator, missing_patterns
 from qolmat.imputations import imputers
 from qolmat.utils import data, utils, plot
 
@@ -239,10 +228,6 @@ df_plot = data.add_datetime_features(df_plot, col_time="date")
 dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
 ```
 
-```python tags=[]
-dfs_imputed["VAR_max"].groupby("station").min()
-```
-
 ```python tags=[]
 station = df_plot.index.get_level_values("station")[0]
 # station = "Huairou"
diff --git a/qolmat/benchmark/metrics.py b/qolmat/benchmark/metrics.py
@@ -368,12 +368,12 @@ def total_variance_distance(
     pd.Series
         Total variance distance
     """
-    cols_categorical = utils._get_categorical_features(df1)
     return columnwise_metric(
-        df1[cols_categorical],
-        df2[cols_categorical],
-        df_mask[cols_categorical],
+        df1,
+        df2,
+        df_mask,
         _total_variance_distance_1D,
+        type_cols="categorical",
     )
 
 
@@ -792,7 +792,7 @@ def frechet_distance(
         df1,
         df2,
         df_mask,
-        frechet_distance,
+        frechet_distance_base,
         min_n_rows=min_n_rows,
         type_cols="numerical",
     )
@@ -1003,10 +1003,12 @@ def pattern_based_weighted_mean_metric(
         cols = df1.select_dtypes(exclude=["number"]).columns
     else:
         raise ValueError(f"Value {type_cols} is not valid for parameter `type_cols`!")
+
     if np.any(df_mask & df1.isna()):
         raise ValueError("The argument df1 has missing values on the mask!")
     if np.any(df_mask & df2.isna()):
         raise ValueError("The argument df2 has missing values on the mask!")
+
     rows_mask = df_mask.any(axis=1)
     scores = []
     weights = []
@@ -1041,7 +1043,7 @@ def get_metric(name: str) -> Callable:
         "KS_test": kolmogorov_smirnov_test,
         "correlation_diff": mean_difference_correlation_matrix_numerical_features,
         "energy": sum_energy_distances,
-        "frechet_single": partial(frechet_distance, method="single"),
+        "frechet": partial(frechet_distance, method="single"),
         "frechet_pattern": partial(frechet_distance, method="pattern"),
         "dist_corr_pattern": distance_anticorr_pattern,
     }
diff --git a/qolmat/imputations/preprocessing.py b/qolmat/imputations/preprocessing.py
@@ -314,10 +314,13 @@ def make_pipeline_mixte_preprocessing(
 
     ohe = OneHotEncoder(handle_unknown="ignore", use_cat_names=True)
     transformers += [("cat", ohe, selector(dtype_exclude=np.number))]
-    col_transformer = ColumnTransformer(transformers=transformers).set_output(transform="pandas")
+    col_transformer = ColumnTransformer(transformers=transformers, remainder="passthrough")
+    col_transformer = col_transformer.set_output(transform="pandas")
     preprocessor = Pipeline(steps=[("col_transformer", col_transformer)])
+
     if avoid_new:
         preprocessor.steps.append(("bins", BinTransformer()))
+    print(preprocessor)
     return preprocessor
 
 
diff --git a/qolmat/utils/utils.py b/qolmat/utils/utils.py
@@ -288,7 +288,7 @@ def get_shape_original(M: NDArray, shape: tuple) -> NDArray:
 
 
 def create_lag_matrices(X: NDArray, p: int) -> Tuple[NDArray, NDArray]:
-    n_rows, n_cols = X.shape
+    n_rows, _ = X.shape
     n_rows_new = n_rows - p
     list_X_lag = [np.ones((n_rows_new, 1))]
     for lag in range(p):
@@ -304,7 +304,5 @@ def nan_mean_cov(X: NDArray) -> Tuple[NDArray, NDArray]:
     _, n_variables = X.shape
     means = np.nanmean(X, axis=0)
     cov = np.ma.cov(np.ma.masked_invalid(X), rowvar=False).data
-    print(cov.shape)
-    print(X.shape)
     cov = cov.reshape(n_variables, n_variables)
     return means, cov
diff --git a/tests/benchmark/test_metrics.py b/tests/benchmark/test_metrics.py
@@ -383,20 +383,12 @@ def test_pattern_based_weighted_mean_metric(
 
 def test_pattern_mae_comparison(mocker) -> None:
 
-    # def mock_metric(values1: pd.Series, values2: pd.Series) -> float:
-    #     call_count += 1
-    #     return 0
-
     mock_metric = mocker.patch("qolmat.benchmark.metrics.accuracy_1D", return_value=0)
-    # def fun_mean_mae(df_gauss1, df_gauss2, df_mask_gauss) -> float:
-    #     return metrics.mean_squared_error(df_gauss1, df_gauss2, df_mask_gauss).mean()
 
-    print(df_mask)
     df_nonan = df_incomplete.notna()
-    result = metrics.pattern_based_weighted_mean_metric(
+    metrics.pattern_based_weighted_mean_metric(
         df_incomplete, df_imputed, df_nonan, metric=mock_metric, min_n_rows=1
     )
-    print(result)
     assert mock_metric.call_count == 2
 
 
diff --git a/tests/imputations/test_preprocessing.py b/tests/imputations/test_preprocessing.py
@@ -198,6 +198,8 @@ def test_preprocessing_pipeline(preprocessing_pipeline):
     # Test with numerical features
     X_num = pd.DataFrame([[1, 2], [3, 4], [5, 6]])
     X_transformed = preprocessing_pipeline.fit_transform(X_num)
+    print(X_num.shape)
+    print(X_transformed.shape)
     assert isinstance(X_transformed, pd.DataFrame)
     assert X_transformed.shape[1] == X_num.shape[1]