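Tests passing: import preprocessing tests from qolmat.imputations.preprocessing and feed BinTransformer 2D arrays/DataFrames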

Julien Roussel · Julien Roussel · commit 21871a59ff94 · 2024-04-04T16:13:09.000+02:00
diff --git a/examples/tutorials/plot_tuto_categorical.ipynb b/examples/tutorials/plot_tuto_categorical.ipynb
@@ -3,18 +3,39 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "a220df49",
+   "id": "7bec9ffc",
    "metadata": {},
    "outputs": [],
    "source": [
     "%load_ext autoreload\n",
     "%autoreload 2"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
+   "id": "3587aa0a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(1, 5)"
+      ]
+     },
+     "execution_count": 122,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.array([[1, 2, 3, np.nan, 5]]).shape"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "80d3ba10",
+   "id": "b5136ec4",
    "metadata": {},
    "outputs": [
     {
@@ -59,7 +80,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "id": "96667a7c",
+   "id": "55e57e8b",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -69,7 +90,7 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "id": "f5ee81fb",
+   "id": "61bcee05",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -79,7 +100,7 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "id": "ace56085",
+   "id": "88f12b44",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -90,7 +111,7 @@
   {
    "cell_type": "code",
    "execution_count": 6,
-   "id": "dab55dd3",
+   "id": "34cfca81",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -101,7 +122,7 @@
   {
    "cell_type": "code",
    "execution_count": 7,
-   "id": "ba1ea100",
+   "id": "7cefa249",
    "metadata": {},
    "outputs": [
     {
@@ -184,7 +205,7 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "id": "f571ad13",
+   "id": "87e6d8c6",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -197,7 +218,7 @@
   {
    "cell_type": "code",
    "execution_count": 10,
-   "id": "d2a26bd9",
+   "id": "394b40d9",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -212,7 +233,7 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-   "id": "a005e3b6",
+   "id": "915c5caf",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -222,7 +243,7 @@
   {
    "cell_type": "code",
    "execution_count": 12,
-   "id": "d5bdbcb3",
+   "id": "93d0c02d",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -232,7 +253,7 @@
   {
    "cell_type": "code",
    "execution_count": 15,
-   "id": "2ad54886",
+   "id": "ea24e781",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -244,7 +265,7 @@
   {
    "cell_type": "code",
    "execution_count": 16,
-   "id": "711a8e3e",
+   "id": "e5347dfe",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -254,7 +275,7 @@
   {
    "cell_type": "code",
    "execution_count": 17,
-   "id": "e57379ae",
+   "id": "325b7354",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -268,7 +289,7 @@
   {
    "cell_type": "code",
    "execution_count": 61,
-   "id": "c727306a",
+   "id": "5d4c2127",
    "metadata": {},
    "outputs": [
     {
@@ -295,7 +316,7 @@
   {
    "cell_type": "code",
    "execution_count": 111,
-   "id": "7668b17c",
+   "id": "4b0ebe4e",
    "metadata": {},
    "outputs": [
     {
@@ -358,7 +379,7 @@
   {
    "cell_type": "code",
    "execution_count": 112,
-   "id": "edcd6516",
+   "id": "08640c07",
    "metadata": {},
    "outputs": [
     {
@@ -432,7 +453,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "b6127f00",
+   "id": "d8193a27",
    "metadata": {},
    "source": [
     "# Imputation analysis"
@@ -441,7 +462,7 @@
   {
    "cell_type": "code",
    "execution_count": 113,
-   "id": "d6ad8c0c",
+   "id": "4df8e2ce",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -453,7 +474,7 @@
   {
    "cell_type": "code",
    "execution_count": 114,
-   "id": "8834e9e6",
+   "id": "c4681f8e",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -464,7 +485,7 @@
   {
    "cell_type": "code",
    "execution_count": 115,
-   "id": "02cb4a6e",
+   "id": "1537a2a7",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -476,7 +497,7 @@
   {
    "cell_type": "code",
    "execution_count": 116,
-   "id": "b11df2f4",
+   "id": "dad580cc",
    "metadata": {},
    "outputs": [
     {
@@ -600,7 +621,7 @@
   {
    "cell_type": "code",
    "execution_count": 117,
-   "id": "671d6b3c",
+   "id": "12a99c70",
    "metadata": {},
    "outputs": [
     {
@@ -632,7 +653,7 @@
   {
    "cell_type": "code",
    "execution_count": 120,
-   "id": "ccc38665",
+   "id": "8006ba1e",
    "metadata": {},
    "outputs": [
     {
@@ -663,7 +684,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b8c5a4b4",
+   "id": "b8cc543a",
    "metadata": {},
    "outputs": [],
    "source": []
diff --git a/qolmat/imputations/preprocessing.py b/qolmat/imputations/preprocessing.py
diff --git a/tests/imputations/test_preprocessing.py b/tests/imputations/test_preprocessing.py
@@ -2,18 +2,15 @@
 import pandas as pd
 import pytest
 from sklearn.compose import make_column_selector as selector
-from sklearn.ensemble import (
-    HistGradientBoostingClassifier,
-    HistGradientBoostingRegressor,
-)
+
 from sklearn.pipeline import Pipeline
 from sklearn.base import BaseEstimator
 from sklearn.metrics import mean_squared_error
 from sklearn.utils.estimator_checks import check_estimator
 from sklearn.utils.validation import check_X_y, check_array
 from sklearn.model_selection import train_test_split
 from sklearn.compose import ColumnTransformer
-from qolmat.imputations.estimators import (
+from qolmat.imputations.preprocessing import (
     BinTransformer,
     MixteHGBM,
     make_pipeline_mixte_preprocessing,
@@ -71,33 +68,39 @@ def bin_transformer():
 
 
 def test_fit_transform(bin_transformer):
-    X = np.array([1, 2, 3, np.nan, 5])
-    transformed_X = bin_transformer.fit_transform(X)
-    assert np.array_equal(transformed_X, np.array([1, 2, 3, np.nan, 5]), equal_nan=True)
+    X = np.array([[1, 2, 3, np.nan, 5]]).T
+    X_transformed = bin_transformer.fit_transform(X)
+    assert np.array_equal(X_transformed, X, equal_nan=True)
 
 
 def test_transform(bin_transformer):
     bin_transformer.dict_df_bins_ = {
         0: pd.DataFrame({"value": [1, 2, 3, 4, 5], "min": [-np.inf, 1.5, 2.5, 3.5, 4.5]})
     }
-    X = np.array([4.2, -1, 3.0, 4.5, 12])
-    transformed_X = bin_transformer.transform(X)
-    assert np.array_equal(transformed_X, np.array([4, 1, 3, 5, 5]))
+    X = np.array([[4.2, -1, 3.0, 4.5, 12]]).T
+    X_transformed = bin_transformer.transform(X)
+    print(X_transformed)
+    print(X)
+    assert np.array_equal(X_transformed, np.array([[4, 1, 3, 5, 5]]).T)
 
 
-def test_fit_transform_with_series(bin_transformer):
-    X = pd.Series([1, 2, 3, np.nan, 5])
-    transformed_X = bin_transformer.fit_transform(X)
-    pd.testing.assert_series_equal(transformed_X, pd.Series([1, 2, 3, np.nan, 5]))
+def test_fit_transform_with_dataframes(bin_transformer):
+    X = pd.DataFrame({"0": [1, 2, 3, np.nan, 5]})
+    X_transformed = bin_transformer.fit_transform(X)
+    print(X_transformed)
+    print(X)
+    pd.testing.assert_frame_equal(X_transformed, X)
 
 
-def test_transform_with_series(bin_transformer):
+def test_transform_with_dataframes(bin_transformer):
     bin_transformer.dict_df_bins_ = {
         0: pd.DataFrame({"value": [1, 2, 3, 4, 5], "min": [0.5, 1.5, 2.5, 3.5, 4.5]})
     }
-    X = pd.Series([1, 2, 3, 4, 5])
-    transformed_X = bin_transformer.transform(X)
-    pd.testing.assert_series_equal(transformed_X, pd.Series([1, 2, 3, 4, 5], dtype=float))
+    X = pd.DataFrame({"0": [1, 2, 3, 4, 5]})
+    X_transformed = bin_transformer.transform(X)
+    print(X_transformed)
+    print(X)
+    pd.testing.assert_frame_equal(X_transformed, X)
 
 
 # Testing make_pipeline_mixte_preprocessing
@@ -114,21 +117,21 @@ def test_preprocessing_pipeline(preprocessing_pipeline):
 
     # Test with numerical features
     X_num = pd.DataFrame([[1, 2], [3, 4], [5, 6]])
-    transformed_X = preprocessing_pipeline.fit_transform(X_num)
-    assert isinstance(transformed_X, pd.DataFrame)
-    assert transformed_X.shape[1] == X_num.shape[1]
+    X_transformed = preprocessing_pipeline.fit_transform(X_num)
+    assert isinstance(X_transformed, pd.DataFrame)
+    assert X_transformed.shape[1] == X_num.shape[1]
 
     # Test with categorical features
     X_cat = pd.DataFrame([["a", "b"], ["c", "d"], ["e", "f"]])
-    transformed_X = preprocessing_pipeline.fit_transform(X_cat)
-    assert isinstance(transformed_X, pd.DataFrame)
-    assert transformed_X.shape[1] > X_cat.shape[1]
+    X_transformed = preprocessing_pipeline.fit_transform(X_cat)
+    assert isinstance(X_transformed, pd.DataFrame)
+    assert X_transformed.shape[1] > X_cat.shape[1]
 
     # Test with mixed features
     X_mixed = pd.DataFrame([[1, "a"], [2, "b"], [3, "c"]])
-    transformed_X = preprocessing_pipeline.fit_transform(X_mixed)
-    assert isinstance(transformed_X, pd.DataFrame)
-    assert transformed_X.shape[1] > X_mixed.shape[1]
+    X_transformed = preprocessing_pipeline.fit_transform(X_mixed)
+    assert isinstance(X_transformed, pd.DataFrame)
+    assert X_transformed.shape[1] > X_mixed.shape[1]
 
 
 # Testing make_robust_MixteHGB