From 889120f56ec6c741dcf26e74d047a9ee2acb004e Mon Sep 17 00:00:00 2001 From: PaulWestenthanner Date: Thu, 3 Oct 2024 11:08:35 +0200 Subject: [PATCH 1/3] Use poetry for packaging and update tooling - support newer python - introduce ruff rules for linting - refactor some tests --- .github/workflows/docs.yml | 4 +- .github/workflows/pypi-publish.yml | 13 +- .github/workflows/test-docs-build.yml | 2 +- .github/workflows/test-suite.yml | 8 +- CHANGELOG.md | 6 + CONTRIBUTING.md | 8 +- category_encoders/__init__.py | 65 +- category_encoders/backward_difference.py | 42 +- category_encoders/base_contrast_encoder.py | 126 +- category_encoders/basen.py | 180 ++- category_encoders/binary.py | 53 +- category_encoders/cat_boost.py | 118 +- category_encoders/count.py | 173 +-- category_encoders/datasets/__init__.py | 11 +- category_encoders/datasets/_base.py | 17 +- category_encoders/glmm.py | 129 +- category_encoders/gray.py | 98 +- category_encoders/hashing.py | 178 ++- category_encoders/helmert.py | 49 +- category_encoders/james_stein.py | 190 ++- category_encoders/leave_one_out.py | 123 +- category_encoders/m_estimate.py | 99 +- category_encoders/one_hot.py | 146 ++- category_encoders/ordinal.py | 177 ++- category_encoders/polynomial.py | 45 +- category_encoders/quantile_encoder.py | 279 +++-- category_encoders/rankhot.py | 184 ++- category_encoders/sum_coding.py | 43 +- category_encoders/target_encoder.py | 226 +++- category_encoders/utils.py | 393 ++++-- category_encoders/woe.py | 96 +- category_encoders/wrapper.py | 163 ++- docs/source/conf.py | 145 ++- examples/benchmarking/benchmarking.py | 9 +- examples/benchmarking/reporting.py | 11 +- examples/benchmarking_cpu/benchmarking_cpu.py | 32 +- examples/benchmarking_large/arff_loader.py | 11 +- .../benchmarking_large/benchmarking_large.py | 167 ++- .../benchmarking_large/catboost_comparison.py | 214 +++- examples/benchmarking_large/csv_loader.py | 12 +- examples/benchmarking_large/report.py | 30 +- examples/benchmarking_large/report_history.py | 28 +- examples/benchmarking_large/util.py | 56 +- examples/column_transformer_example.py | 9 +- examples/encoding_examples.py | 24 +- examples/grid_search_example.py | 44 +- examples/source_data/loaders.py | 7 +- poetry.lock | 1092 +++++++++++++++++ pyproject.toml | 61 + requirements-dev.txt | 5 - requirements.txt | 8 - setup.cfg | 5 - setup.py | 45 - tests/__init__.py | 1 + tests/helpers.py | 113 +- tests/test_backward_difference.py | 152 +-- tests/test_basen.py | 135 +- tests/test_binary.py | 8 +- tests/test_cat_boost.py | 96 +- tests/test_count.py | 172 +-- tests/test_encoders.py | 497 +++++--- tests/test_feature_names.py | 88 +- tests/test_glmm.py | 24 +- tests/test_gray.py | 49 +- tests/test_hashing.py | 45 +- tests/test_helmert.py | 152 +-- tests/test_helpers.py | 17 +- tests/test_james_stein.py | 141 ++- tests/test_leave_one_out.py | 85 +- tests/test_m_estimate.py | 29 +- tests/test_one_hot.py | 378 +++--- tests/test_ordinal.py | 365 +++--- tests/test_polynomial.py | 185 ++- tests/test_quantile_encoder.py | 65 +- tests/test_rankhot.py | 91 +- tests/test_sum_coding.py | 191 ++- tests/test_target_encoder.py | 431 ++++--- tests/test_utils.py | 88 +- tests/test_woe.py | 111 +- tests/test_wrapper.py | 200 +-- 80 files changed, 6112 insertions(+), 3256 deletions(-) create mode 100644 poetry.lock create mode 100644 pyproject.toml delete mode 100644 requirements-dev.txt delete mode 100644 requirements.txt delete mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/.github/workflows/docs.yml 
b/.github/workflows/docs.yml index 47375452..dcc7ef1f 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -12,8 +12,8 @@ jobs: - name: Dependencies run: | python -m pip install --upgrade pip wheel - pip install -r requirements.txt - pip install -r requirements-dev.txt + python -m pip install poetry + poetry install - name: Build Docs uses: ammaraskar/sphinx-action@master with: diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index c6d89726..5635b97d 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -11,16 +11,15 @@ jobs: steps: - name: Clone uses: actions/checkout@v2 - - name: Set up Python 3.7 + - name: Set up Python 3.12 uses: actions/setup-python@v2 with: - python-version: 3.7 + python-version: 3.12 - name: Build package run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - pip install -r requirements-dev.txt - pip install wheel - python setup.py bdist_wheel sdist + python -m pip install --upgrade pip wheel + python -m pip install poetry + poetry install + poetry build - name: Publish package uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/test-docs-build.yml b/.github/workflows/test-docs-build.yml index 0603c9a4..4a05c378 100644 --- a/.github/workflows/test-docs-build.yml +++ b/.github/workflows/test-docs-build.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.10'] + python-version: ['3.12'] steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index d9c346ef..2d9a9bab 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] + python-version: ['3.9', '3.10', '3.11', '3.12'] steps: - uses: actions/checkout@v2 @@ -26,8 +26,8 @@ - name: Install dependencies run: | python -m pip install --upgrade pip wheel - python -m pip install -r requirements.txt - python -m pip install -r requirements-dev.txt + python -m pip install poetry + poetry install - name: Test with pytest run: | - pytest + poetry run pytest tests diff --git a/CHANGELOG.md b/CHANGELOG.md index 248c128f..d25dbb08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,12 @@ unreleased ========== +* Refactor: Use poetry as packaging tool +* Refactor: Add more typing +* Change `feature_names_in_` and `feature_names_out_` to `np.ndarray` instead of lists. +* Breaking: Do not allow scalar values (of length 1) as target variable anymore +* Breaking: Force dataframe column names to be strings. + v2.6.4 ====== * fixed: Future Warning in Pandas diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 37541026..ff1199f4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,14 +16,18 @@ How to Contribute The preferred workflow to contribute to git-pandas is: 1. Fork this repository into your own github account. - 2. Clone the fork on your account onto your local disk: - + 2. Clone the fork and install the project via poetry: + ``` $ git clone git@github.com:YourLogin/category_encoders.git $ cd category_encoders + $ poetry install + ``` 3. Create a branch for your new awesome feature, do not work in the master branch: + ``` $ git checkout -b new-awesome-feature + ``` 4. Write some code, or docs, or tests. 5. When you are done, submit a pull request.
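For reference, the day-to-day developer commands under the new poetry-based setup look like this. This is a sketch assembled from the workflow files and CONTRIBUTING changes above; the `ruff` invocation is an assumption based on the commit message's mention of ruff lint rules, not a command taken verbatim from this patch:

```
$ poetry install            # resolve and install dependencies from poetry.lock
$ poetry run pytest tests   # run the test suite, as in test-suite.yml
$ poetry run ruff check .   # lint with the ruff rules in pyproject.toml (assumed invocation)
$ poetry build              # build sdist and wheel, as in pypi-publish.yml
```
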
diff --git a/category_encoders/__init__.py b/category_encoders/__init__.py index 5bb411db..97dc5dc5 100644 --- a/category_encoders/__init__.py +++ b/category_encoders/__init__.py @@ -1,4 +1,4 @@ -""" +"""Category encoders library. .. module:: category_encoders :synopsis: @@ -7,51 +7,50 @@ """ from category_encoders.backward_difference import BackwardDifferenceEncoder +from category_encoders.basen import BaseNEncoder from category_encoders.binary import BinaryEncoder -from category_encoders.gray import GrayEncoder +from category_encoders.cat_boost import CatBoostEncoder from category_encoders.count import CountEncoder +from category_encoders.glmm import GLMMEncoder +from category_encoders.gray import GrayEncoder from category_encoders.hashing import HashingEncoder from category_encoders.helmert import HelmertEncoder +from category_encoders.james_stein import JamesSteinEncoder +from category_encoders.leave_one_out import LeaveOneOutEncoder +from category_encoders.m_estimate import MEstimateEncoder from category_encoders.one_hot import OneHotEncoder from category_encoders.ordinal import OrdinalEncoder -from category_encoders.sum_coding import SumEncoder from category_encoders.polynomial import PolynomialEncoder -from category_encoders.basen import BaseNEncoder -from category_encoders.leave_one_out import LeaveOneOutEncoder +from category_encoders.quantile_encoder import QuantileEncoder, SummaryEncoder +from category_encoders.rankhot import RankHotEncoder +from category_encoders.sum_coding import SumEncoder from category_encoders.target_encoder import TargetEncoder from category_encoders.woe import WOEEncoder -from category_encoders.m_estimate import MEstimateEncoder -from category_encoders.james_stein import JamesSteinEncoder -from category_encoders.cat_boost import CatBoostEncoder -from category_encoders.rankhot import RankHotEncoder -from category_encoders.glmm import GLMMEncoder -from category_encoders.quantile_encoder import QuantileEncoder, SummaryEncoder - __version__ = '2.6.4' -__author__ = "willmcginnis", "cmougan", "paulwestenthanner" +__author__ = 'willmcginnis', 'cmougan', 'paulwestenthanner' __all__ = [ - "BackwardDifferenceEncoder", - "BinaryEncoder", - "GrayEncoder", - "CountEncoder", - "HashingEncoder", - "HelmertEncoder", - "OneHotEncoder", - "OrdinalEncoder", - "SumEncoder", - "PolynomialEncoder", - "BaseNEncoder", - "LeaveOneOutEncoder", - "TargetEncoder", - "WOEEncoder", - "MEstimateEncoder", - "JamesSteinEncoder", - "CatBoostEncoder", - "GLMMEncoder", - "QuantileEncoder", - "SummaryEncoder", + 'BackwardDifferenceEncoder', + 'BinaryEncoder', + 'GrayEncoder', + 'CountEncoder', + 'HashingEncoder', + 'HelmertEncoder', + 'OneHotEncoder', + 'OrdinalEncoder', + 'SumEncoder', + 'PolynomialEncoder', + 'BaseNEncoder', + 'LeaveOneOutEncoder', + 'TargetEncoder', + 'WOEEncoder', + 'MEstimateEncoder', + 'JamesSteinEncoder', + 'CatBoostEncoder', + 'GLMMEncoder', + 'QuantileEncoder', + 'SummaryEncoder', 'RankHotEncoder', ] diff --git a/category_encoders/backward_difference.py b/category_encoders/backward_difference.py index 7757a0e7..e9e77311 100644 --- a/category_encoders/backward_difference.py +++ b/category_encoders/backward_difference.py @@ -1,7 +1,7 @@ -"""Backward difference contrast encoding""" +"""Backward difference contrast encoding.""" -from patsy.contrasts import Diff, ContrastMatrix import numpy as np +from patsy.contrasts import ContrastMatrix, Diff from category_encoders.base_contrast_encoder import BaseContrastEncoder @@ -13,7 +13,6 @@ class 
BackwardDifferenceEncoder(BaseContrastEncoder): Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -21,23 +20,32 @@ class BackwardDifferenceEncoder(BaseContrastEncoder): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform matrix + has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has nan values. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform + matrix has nan values. This can cause unexpected changes in dimension in some cases. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... 
] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = BackwardDifferenceEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -46,12 +54,12 @@ class BackwardDifferenceEncoder(BaseContrastEncoder): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 12 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 intercept 1460 non-null int64 + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 intercept 1460 non-null int64 1 Id 1460 non-null float64 2 MSSubClass 1460 non-null float64 - 3 MSZoning 1460 non-null object + 3 MSZoning 1460 non-null object 4 LotFrontage 1201 non-null float64 5 YearBuilt 1460 non-null float64 6 Heating_0 1460 non-null float64 @@ -76,5 +84,5 @@ class BackwardDifferenceEncoder(BaseContrastEncoder): """ def get_contrast_matrix(self, values_to_encode: np.array) -> ContrastMatrix: + """Get the contrast matrix for the backward difference encoder.""" return Diff().code_without_intercept(values_to_encode) - diff --git a/category_encoders/base_contrast_encoder.py b/category_encoders/base_contrast_encoder.py index 0079f3bb..a322dd6f 100644 --- a/category_encoders/base_contrast_encoder.py +++ b/category_encoders/base_contrast_encoder.py @@ -1,22 +1,23 @@ -"""Base encoder for various contrast coding schemes""" +"""Base encoder for various contrast coding schemes.""" + +import warnings from abc import abstractmethod +import numpy as np import pandas as pd from patsy.contrasts import ContrastMatrix -import numpy as np -from category_encoders.ordinal import OrdinalEncoder + import category_encoders.utils as util -import warnings +from category_encoders.ordinal import OrdinalEncoder __author__ = 'paulwestenthanner' class BaseContrastEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): - """Base class for various contrast encoders + """Base class for various contrast encoders. Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -24,15 +25,16 @@ class BaseContrastEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform matrix + has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has nan values. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform matrix + has nan values. This can cause unexpected changes in dimension in some cases. 
References ---------- @@ -44,23 +46,35 @@ class BaseContrastEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): http://ibgwww.colorado.edu/~carey/p5741ndir/Coding_Categorical_Variables.pdf """ + prefit_ordinal = True encoding_relation = util.EncodingRelation.ONE_TO_N_UNIQUE - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, - handle_unknown='value', handle_missing='value'): - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown=handle_unknown, handle_missing=handle_missing) + def __init__( + self, + verbose=0, + cols=None, + mapping=None, + drop_invariant=False, + return_df=True, + handle_unknown='value', + handle_missing='value', + ): + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown=handle_unknown, + handle_missing=handle_missing, + ) self.mapping = mapping self.ordinal_encoder = None def _fit(self, X, y=None, **kwargs): # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( - verbose=self.verbose, - cols=self.cols, - handle_unknown='value', - handle_missing='value' + verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) @@ -68,11 +82,18 @@ def _fit(self, X, y=None, **kwargs): mappings_out = [] for switch in ordinal_mapping: - values = switch.get('mapping') + values: pd.Series = switch.get('mapping') col = switch.get('col') - column_mapping = self.fit_contrast_coding(col, values, self.handle_missing, self.handle_unknown) - mappings_out.append({'col': col, 'mapping': column_mapping, }) + column_mapping = self.fit_contrast_coding( + col, values, self.handle_missing, self.handle_unknown + ) + mappings_out.append( + { + 'col': col, + 'mapping': column_mapping, + } + ) self.mapping = mappings_out @@ -87,9 +108,30 @@ def _transform(self, X) -> pd.DataFrame: @abstractmethod def get_contrast_matrix(self, values_to_encode: np.ndarray) -> ContrastMatrix: + """Get the contrast matrix for the encoder.""" raise NotImplementedError - def fit_contrast_coding(self, col, values, handle_missing, handle_unknown): + def fit_contrast_coding( + self, col: str, values: pd.Series, handle_missing: str, handle_unknown: str + ) -> pd.DataFrame: + """Fit contrast coding for a column. + + Parameters + ---------- + col: str + Column name to fit contrast coding for. + values: pd.Series + Ordinal encoding mapping of column. + handle_missing: str + How to handle missing values. + handle_unknown: str + How to handle unknown values. + + Returns + ------- + pd.DataFrame + Contrast coding matrix.
+ """ if handle_missing == 'value': values = values[values > 0] @@ -102,8 +144,11 @@ def fit_contrast_coding(self, col, values, handle_missing, handle_unknown): values_to_encode = np.append(values_to_encode, -1) contrast_matrix = self.get_contrast_matrix(values_to_encode) - df = pd.DataFrame(data=contrast_matrix.matrix, index=values_to_encode, - columns=[f"{col}_{i}" for i in range(len(contrast_matrix.column_suffixes))]) + df = pd.DataFrame( + data=contrast_matrix.matrix, + index=values_to_encode, + columns=[f'{col}_{i}' for i in range(len(contrast_matrix.column_suffixes))], + ) if handle_unknown == 'return_nan': df.loc[-1] = np.nan @@ -118,13 +163,32 @@ def fit_contrast_coding(self, col, values, handle_missing, handle_unknown): return df @staticmethod - def transform_contrast_coding(X, mapping): + def transform_contrast_coding( + X: pd.DataFrame, mapping: list[dict[str, str | pd.DataFrame]] + ) -> pd.DataFrame: + """Apply contrast coding scheme. + + Parameters + ---------- + X: pd.DataFrame + Data to apply contrast coding to. + mapping: list[dict[str, str | pd.DataFrame]] + List of contrast coding schemes to apply for each column. + + Returns + ------- + pd.DataFrame + Encoded data. + """ cols = X.columns.tolist() # See issue 370 if it is necessary to add an intercept or not. X['intercept'] = pd.Series([1] * X.shape[0], index=X.index) - warnings.warn("Intercept column might not be added anymore in future releases (c.f. issue #370)", - category=FutureWarning) + warnings.warn( + 'Intercept column might not be added anymore in future releases (c.f. issue #370)', + category=FutureWarning, + stacklevel=4, + ) for switch in mapping: col = switch.get('col') @@ -136,7 +200,7 @@ def transform_contrast_coding(X, mapping): X = pd.concat([base_df, X], axis=1) old_column_index = cols.index(col) - cols[old_column_index: old_column_index + 1] = mod.columns + cols[old_column_index : old_column_index + 1] = mod.columns # this could lead to problems if an intercept column is already present # (e.g. if another column has been encoded with another contrast coding scheme) diff --git a/category_encoders/basen.py b/category_encoders/basen.py index b6a99a0b..78a2a530 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -1,11 +1,14 @@ -"""BaseX encoding""" +"""BaseX encoding.""" -import pandas as pd -import numpy as np import re -from category_encoders.ordinal import OrdinalEncoder -import category_encoders.utils as util import warnings +from typing import Any + +import numpy as np +import pandas as pd + +import category_encoders.utils as util +from category_encoders.ordinal import OrdinalEncoder __author__ = 'willmcginnis' @@ -32,13 +35,14 @@ def _ceillogint(n, base): class BaseNEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): - """Base-N encoder encodes the categories into arrays of their base-N representation. A base of 1 is equivalent to - one-hot encoding (not really base-1, but useful), a base of 2 is equivalent to binary encoding. N=number of actual - categories is equivalent to vanilla ordinal encoding. + """Base-N encoder encodes the categories into arrays of their base-N representation. + + A base of 1 is equivalent to one-hot encoding (not really base-1, but useful), + a base of 2 is equivalent to binary encoding. + N=number of actual categories is equivalent to vanilla ordinal encoding. Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. 
cols: list @@ -46,25 +50,35 @@ class BaseNEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). base: int - when the downstream model copes well with nonlinearities (like decision tree), use higher base. + when the downstream model copes well with nonlinearities (like decision tree), + use higher base. handle_unknown: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform matrix + has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has nan values. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform matrix + has nan values. This can cause unexpected changes in dimension in some cases. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... 
] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = BaseNEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -73,18 +87,18 @@ class BaseNEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 10 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 - 5 Heating_0 1460 non-null int64 - 6 Heating_1 1460 non-null int64 - 7 Heating_2 1460 non-null int64 - 8 CentralAir_0 1460 non-null int64 - 9 CentralAir_1 1460 non-null int64 + 5 Heating_0 1460 non-null int64 + 6 Heating_1 1460 non-null int64 + 7 Heating_2 1460 non-null int64 + 8 CentralAir_0 1460 non-null int64 + 9 CentralAir_1 1460 non-null int64 dtypes: float64(4), int64(5), object(1) memory usage: 114.2+ KB None @@ -94,10 +108,25 @@ class BaseNEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): prefit_ordinal = True encoding_relation = util.EncodingRelation.N_TO_M - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, base=2, - handle_unknown='value', handle_missing='value'): - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown=handle_unknown, handle_missing=handle_missing) + def __init__( + self, + verbose=0, + cols=None, + mapping=None, + drop_invariant=False, + return_df=True, + base=2, + handle_unknown='value', + handle_missing='value', + ): + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown=handle_unknown, + handle_missing=handle_missing, + ) self.mapping = mapping self.ordinal_encoder = None self.base = base @@ -105,16 +134,21 @@ def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, ret def _fit(self, X, y=None, **kwargs): # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( - verbose=self.verbose, - cols=self.cols, - handle_unknown='value', - handle_missing='value' + verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) - self.mapping = self.fit_base_n_encoding(X) + self.mapping = self.fit_base_n_encoding() - def fit_base_n_encoding(self, X): + def fit_base_n_encoding(self) -> list[dict[str, Any]]: + """Fit the base n encoder. + + Returns + ------- + list[dict[str, Any]] + List containing encoding mappings for each column. 
+ + """ mappings_out = [] for switch in self.ordinal_encoder.category_mapping: @@ -128,9 +162,11 @@ def fit_base_n_encoding(self, X): col = switch.get('col') values = switch.get('mapping').index if self.handle_missing == 'value': values = values[values > 0] if self.handle_unknown == 'indicator': values = np.append(values, -1) digits = self.calc_required_digits(values) - X_unique = pd.DataFrame(index=values, - columns=[f"{col}_{x}" for x in range(digits)], - data=np.array([self.col_transform(x, digits) for x in range(1, len(values) + 1)])) + X_unique = pd.DataFrame( + index=values, + columns=[f'{col}_{x}' for x in range(digits)], + data=np.array([self.col_transform(x, digits) for x in range(1, len(values) + 1)]), + ) if self.handle_unknown == 'return_nan': X_unique.loc[-1] = np.nan @@ -169,12 +205,12 @@ def inverse_transform(self, X_in): p: array, the same size of X_in """ - # fail fast if self._dim is None: raise ValueError('Must train encoder before it can be used to inverse_transform data') - # unite the type into pandas dataframe (it makes the input size detection code easier...) and make deep copy + # unify the input type into a pandas dataframe. This makes the input size detection code easier + # and makes a deep copy X = util.convert_input(X_in, columns=self.feature_names_out_, deep=True) X = self.basen_to_integer(X, self.cols, self.base) @@ -182,8 +218,10 @@ def inverse_transform(self, X_in): # make sure that it is the right size if X.shape[1] != self._dim: if self.drop_invariant: - raise ValueError(f"Unexpected input dimension {X.shape[1]}, the attribute drop_invariant should " - "be False when transforming the data") + raise ValueError( + f'Unexpected input dimension {X.shape[1]}, the attribute drop_invariant should ' + 'be False when transforming the data' + ) else: raise ValueError(f'Unexpected input dimension {X.shape[1]}, expected {self._dim}') @@ -198,13 +236,27 @@ def inverse_transform(self, X_in): if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan': for col in self.cols: if X[switch.get('col')].isna().any(): - warnings.warn("inverse_transform is not supported because transform impute " - f"the unknown category nan when encode {col}") + warnings.warn( + 'inverse_transform is not supported because transform imputes ' + f'the unknown category nan when encoding {col}', + stacklevel=4, + ) return X if self.return_df else X.to_numpy() - def calc_required_digits(self, values): - # figure out how many digits we need to represent the classes present + def calc_required_digits(self, values: list) -> int: + """Figure out how many digits we need to represent the classes present. + + Parameters + ---------- + values: list + list of values. + + Returns + ------- + int + number of digits necessary for encoding.
+ """ if self.base == 1: digits = len(values) + 1 else: @@ -227,7 +279,6 @@ def basen_encode(self, X_in: pd.DataFrame, cols=None): dummies : DataFrame """ - X = X_in.copy(deep=True) cols = X.columns.tolist() @@ -241,7 +292,7 @@ def basen_encode(self, X_in: pd.DataFrame, cols=None): X = pd.concat([base_df, X], axis=1) old_column_index = cols.index(col) - cols[old_column_index: old_column_index + 1] = mod.columns + cols[old_column_index : old_column_index + 1] = mod.columns return X.reindex(columns=cols) @@ -266,7 +317,9 @@ def basen_to_integer(self, X: pd.DataFrame, cols, base): out_cols = X.columns.tolist() for col in cols: - col_list = [col0 for col0 in out_cols if re.match(re.escape(str(col))+'_\\d+', str(col0))] + col_list = [ + col0 for col0 in out_cols if re.match(re.escape(str(col)) + '_\\d+', str(col0)) + ] insert_at = out_cols.index(col_list[0]) if base == 1: @@ -281,10 +334,7 @@ def basen_to_integer(self, X: pd.DataFrame, cols, base): return X def col_transform(self, col, digits): - """ - The lambda body to transform the column values - """ - + """The lambda body to transform the column values.""" if col is None or float(col) < 0.0: return None else: @@ -295,7 +345,25 @@ def col_transform(self, col, digits): return [0 for _ in range(digits - len(col))] + col @staticmethod - def number_to_base(n, b, limit): + def number_to_base(n: int, b: int, limit: int) -> list[int]: + """Convert number to base n representation (as list of digits). + + The list will be of length `limit`. + + Parameters + ---------- + n: int + number to convert + b: int + base + limit: int + length of representation. + + Returns + ------- + list[int] + base n representation as list of length limit containing the digits. + """ if b == 1: return [0 if n != _ else 1 for _ in range(limit)] diff --git a/category_encoders/binary.py b/category_encoders/binary.py index 9861b10f..cb31aee4 100644 --- a/category_encoders/binary.py +++ b/category_encoders/binary.py @@ -1,5 +1,7 @@ -"""Binary encoding""" +"""Binary encoding.""" + from functools import partialmethod + from category_encoders import utils from category_encoders.basen import BaseNEncoder @@ -7,11 +9,12 @@ class BinaryEncoder(BaseNEncoder): - """Binary encoding for categorical variables, similar to onehot, but stores categories as binary bitstrings. + """Binary encoding for categorical variables. + + This is similar to onehot, but categories are stored as binary bitstrings. Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -19,23 +22,32 @@ class BinaryEncoder(BaseNEncoder): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform matrix + has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str - options are 'error', 'return_nan', 'value', and 'indicator'. 
The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has nan values. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform matrix + has nan values. This can cause unexpected changes in dimension in some cases. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... ] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = BinaryEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -44,22 +56,23 @@ class BinaryEncoder(BaseNEncoder): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 10 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 - 5 Heating_0 1460 non-null int64 - 6 Heating_1 1460 non-null int64 - 7 Heating_2 1460 non-null int64 - 8 CentralAir_0 1460 non-null int64 - 9 CentralAir_1 1460 non-null int64 + 5 Heating_0 1460 non-null int64 + 6 Heating_1 1460 non-null int64 + 7 Heating_2 1460 non-null int64 + 8 CentralAir_0 1460 non-null int64 + 9 CentralAir_1 1460 non-null int64 dtypes: float64(4), int64(5), object(1) memory usage: 114.2+ KB None """ + encoding_relation = utils.EncodingRelation.ONE_TO_M __init__ = partialmethod(BaseNEncoder.__init__, base=2) diff --git a/category_encoders/cat_boost.py b/category_encoders/cat_boost.py index 11b813df..b7b55254 100644 --- a/category_encoders/cat_boost.py +++ b/category_encoders/cat_boost.py @@ -1,37 +1,38 @@ -"""CatBoost coding""" +"""CatBoost coding.""" import numpy as np import pandas as pd -import category_encoders.utils as util from sklearn.utils.random import check_random_state +import category_encoders.utils as util + __author__ = 'Jan Motl' class CatBoostEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): """CatBoost Encoding for categorical features. -    Supported targets: binomial and continuous. For polynomial target support, see PolynomialWrapper. + Supported targets: binomial and continuous. + For polynomial target support, see PolynomialWrapper. -    CatBoostEncoder is the variation of target encoding. It supports -    time-aware encoding, regularization, and online learning. + CatBoostEncoder is the variation of target encoding. It supports + time-aware encoding, regularization, and online learning. -    This implementation is time-aware (similar to CatBoost's parameter 'has_time=True'), -    so no random permutations are used. It makes this encoder sensitive to -    ordering of the data and suitable for time series problems. 
If your data -    does not have time dependency, it should still work just fine, assuming -    sorting of the data won't leak any information outside the training scope -    (i.e., no data leakage). When data leakage is a possibility, it is wise to -    eliminate it first (for example, shuffle or resample the data). + This implementation is time-aware (similar to CatBoost's parameter 'has_time=True'), + so no random permutations are used. It makes this encoder sensitive to + ordering of the data and suitable for time series problems. If your data + does not have time dependency, it should still work just fine, assuming + sorting of the data won't leak any information outside the training scope + (i.e., no data leakage). When data leakage is a possibility, it is wise to + eliminate it first (for example, shuffle or resample the data). -    NOTE: behavior of the transformer would differ in transform and fit_transform -    methods depending if y values are passed. If no target is passed, then -    encoder will map the last value of the running mean to each category. If y is passed -    then it will map all values of the running mean to each category's occurrences. + NOTE: behavior of the transformer would differ in transform and fit_transform + methods depending if y values are passed. If no target is passed, then + encoder will map the last value of the running mean to each category. If y is passed + then it will map all values of the running mean to each category's occurrences. Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -39,24 +40,37 @@ class CatBoostEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_missing: str - options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean. + options are 'error', 'return_nan' and 'value', defaults to 'value', + which returns the target mean. handle_unknown: str - options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean. + options are 'error', 'return_nan' and 'value', defaults to 'value', + which returns the target mean. sigma: float - adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched). + adds normal (Gaussian) distribution noise into training data in order to decrease + overfitting (testing data are untouched). sigma gives the standard deviation (spread or "width") of the normal distribution. a: float - additive smoothing (it is the same variable as "m" in m-probability estimate). By default set to 1. + additive smoothing (it is the same variable as "m" in m-probability estimate). + By default, set to 1. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... 
] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = CatBoostEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -65,11 +79,11 @@ class CatBoostEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 7 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 5 Heating 1460 non-null float64 @@ -88,13 +102,30 @@ class CatBoostEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): https://arxiv.org/abs/1706.09516 """ + prefit_ordinal = False encoding_relation = util.EncodingRelation.ONE_TO_ONE - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, - handle_unknown='value', handle_missing='value', random_state=None, sigma=None, a=1): - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown=handle_unknown, handle_missing=handle_missing) + def __init__( + self, + verbose=0, + cols=None, + drop_invariant=False, + return_df=True, + handle_unknown='value', + handle_missing='value', + random_state=None, + sigma=None, + a=1, + ): + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown=handle_unknown, + handle_missing=handle_missing, + ) self.mapping = None self._mean = None self.random_state = random_state @@ -118,7 +149,9 @@ def _transform(self, X, y=None): level_notunique = colmap['count'] > 1 unique_train = colmap.index - unseen_values = pd.Series([x for x in X[col].unique() if x not in unique_train], dtype=unique_train.dtype) + unseen_values = pd.Series( + [x for x in X[col].unique() if x not in unique_train], dtype=unique_train.dtype + ) is_nan = X[col].isna() is_unknown_value = X[col].isin(unseen_values.dropna().astype(object)) @@ -126,13 +159,18 @@ def _transform(self, X, y=None): if self.handle_unknown == 'error' and is_unknown_value.any(): raise ValueError('Columns to be encoded can not contain new values') - if y is None: # Replace level with its mean target; if level occurs only once, use global mean - level_means = ((colmap['sum'] + self._mean * self.a) / (colmap['count'] + self.a)).where(level_notunique, self._mean) + if ( + y is None + ): # Replace level with its mean target; if level occurs only once, use global mean + level_means = ( + (colmap['sum'] + self._mean * self.a) / (colmap['count'] + self.a) + ).where(level_notunique, self._mean) X[col] = X[col].map(level_means) else: # Simulation of CatBoost implementation, which calculates leave-one-out on the fly. # The nice thing about this is that it helps to prevent overfitting. The bad thing - # is that CatBoost uses many iterations over the data. But we run just one iteration. + # is that CatBoost uses many iterations over the data. + # But we run just one iteration. # Still, it works better than leave-one-out without any noise. # See: # https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/ @@ -151,7 +189,8 @@ def _transform(self, X, y=None): if self.handle_missing == 'value': # only set value if there are actually missing values. - # In case of pd.Categorical columns setting values that are not seen in pd.Categorical gives an error. 
+ # In case of pd.Categorical columns setting values that are not seen in + # pd.Categorical gives an error. nan_cond = is_nan & unseen_values.isna().any() if nan_cond.any(): X.loc[nan_cond, col] = self._mean @@ -159,13 +198,14 @@ def _transform(self, X, y=None): X.loc[is_nan, col] = np.nan if self.sigma is not None and y is not None: - X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0]) + X[col] = X[col] * random_state_.normal(1.0, self.sigma, X[col].shape[0]) return X - def _more_tags(self): + def _more_tags(self) -> dict[str, bool]: + """Set scikit transformer tags.""" tags = super()._more_tags() - tags["predict_depends_on_y"] = True + tags['predict_depends_on_y'] = True return tags def _fit_column_map(self, series, y): diff --git a/category_encoders/count.py b/category_encoders/count.py index cd459433..2cf3e748 100644 --- a/category_encoders/count.py +++ b/category_encoders/count.py @@ -1,24 +1,38 @@ -"""Count Encoder""" +"""Count Encoder.""" + +from copy import copy + import numpy as np import pandas as pd + import category_encoders.utils as util from category_encoders.ordinal import OrdinalEncoder -from copy import copy - - __author__ = 'joshua t. dunn' class CountEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): + """Count encoding for categorical features. + + For a given categorical feature, replace the names of the groups with the group counts. + """ + prefit_ordinal = True encoding_relation = util.EncodingRelation.ONE_TO_ONE - def __init__(self, verbose=0, cols=None, drop_invariant=False, - return_df=True, handle_unknown='value', - handle_missing='value', - min_group_size=None, combine_min_nan_groups=None, - min_group_name=None, normalize=False): + def __init__( + self, + verbose=0, + cols=None, + drop_invariant=False, + return_df=True, + handle_unknown='value', + handle_missing='value', + min_group_size=None, + combine_min_nan_groups=None, + min_group_name=None, + normalize=False, + ): """Count encoding for categorical features. For a given categorical feature, replace the names of the groups @@ -26,7 +40,6 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, Parameters ---------- - verbose: int integer indicating verbosity of output. 0 for none. cols: list @@ -50,7 +63,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, for more details. min_group_size: int, float or dict of {column : option, ...}. the minimal count threshold of a group needed to ensure it is not - combined into a "leftovers" group. Default value is 0.01. + combined into a "leftovers" group. Default value is 0.01. If float in the range (0, 1), `min_group_size` is calculated as int(X.shape[0] * min_group_size). Note: This value may change type based on the `normalize` variable. If True @@ -59,7 +72,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, Set the name of the combined minimum groups when the defaults become too long. Default None. In this case the category names will be joined alphabetically with a `_` delimiter. - Note: The default name can be long and may keep changing, for example, + Note: The default name can be long and may keep changing, for example, in cross-validation. combine_min_nan_groups: bool or dict of {column : bool, ...}. whether to combine the leftovers group with nan group. Default True. 
Can @@ -74,8 +87,16 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, >>> from sklearn.datasets import fetch_openml >>> from category_encoders import CountEncoder - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... ] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = CountEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -84,25 +105,27 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, RangeIndex: 1460 entries, 0 to 1459 Data columns (total 7 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 - 5 Heating 1460 non-null int64 - 6 CentralAir 1460 non-null int64 + 5 Heating 1460 non-null int64 + 6 CentralAir 1460 non-null int64 dtypes: float64(4), int64(2), object(1) memory usage: 80.0+ KB None - - References - ---------- - """ - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown=handle_unknown, handle_missing=handle_missing) + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown=handle_unknown, + handle_missing=handle_missing, + ) self.mapping = None self.normalize = normalize self.min_group_size = min_group_size @@ -122,10 +145,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, def _fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( - verbose=self.verbose, - cols=self.cols, - handle_unknown='value', - handle_missing='value' + verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) @@ -137,8 +157,10 @@ def _fit(self, X, y=None, **kwargs): def _transform(self, X): for col in self.cols: # Treat None as np.nan - X[col] = pd.Series([el if el is not None else np.nan for el in X[col]], index=X[col].index) - if self.handle_missing == "value": + X[col] = pd.Series( + [el if el is not None else np.nan for el in X[col]], index=X[col].index + ) + if self.handle_missing == 'value': if not util.is_category(X[col].dtype): X[col] = X[col].fillna(np.nan) @@ -150,16 +172,14 @@ def _transform(self, X): if isinstance(self._handle_unknown[col], (int, np.integer)): X[col] = X[col].fillna(self._handle_unknown[col]) - elif (self._handle_unknown[col] == 'value' - and X[col].isna().any() - and self._handle_missing[col] != 'return_nan' + elif ( + self._handle_unknown[col] == 'value' + and X[col].isna().any() + and self._handle_missing[col] != 'return_nan' ): X[col] = X[col].replace(np.nan, 0) - elif ( - self._handle_unknown[col] == 'error' - and X[col].isna().any() - ): + elif self._handle_unknown[col] == 'error' and X[col].isna().any(): raise ValueError(f'Missing data found in column {col} at transform time.') return X @@ -174,24 +194,25 @@ def _fit_count_encode(self, X_in, y): for col in self.cols: mapping_values = 
X[col].value_counts(normalize=self._normalize[col]) - ordinal_encoding = [m["mapping"] for m in self.ordinal_encoder.mapping if m["col"] == col][0] + ordinal_encoding = [ + m['mapping'] for m in self.ordinal_encoder.mapping if m['col'] == col + ][0] reversed_ordinal_enc = {v: k for k, v in ordinal_encoding.to_dict().items()} mapping_values.index = mapping_values.index.map(reversed_ordinal_enc) self.mapping[col] = mapping_values if self._handle_missing[col] == 'return_nan': self.mapping[col][np.nan] = np.nan - + # elif self._handle_missing[col] == 'value': - #test_count.py failing self.mapping[col].loc[-2] = 0 + # test_count.py failing self.mapping[col].loc[-2] = 0 - if any([val is not None for val in self._min_group_size.values()]): + if any(val is not None for val in self._min_group_size.values()): self.combine_min_categories(X) def combine_min_categories(self, X): """Combine small categories into a single category.""" for col, mapper in self.mapping.items(): - if self._normalize[col] and isinstance(self._min_group_size[col], int): self._min_group_size[col] = self._min_group_size[col] / X.shape[0] elif not self._normalize and isinstance(self._min_group_size[col], float): @@ -200,15 +221,9 @@ def combine_min_categories(self, X): if self._combine_min_nan_groups[col] is True: min_groups_idx = mapper < self._min_group_size[col] elif self._combine_min_nan_groups[col] == 'force': - min_groups_idx = ( - (mapper < self._min_group_size[col]) - | (mapper.index.isna()) - ) + min_groups_idx = (mapper < self._min_group_size[col]) | (mapper.index.isna()) else: - min_groups_idx = ( - (mapper < self._min_group_size[col]) - & (~mapper.index.isna()) - ) + min_groups_idx = (mapper < self._min_group_size[col]) & (~mapper.index.isna()) min_groups_sum = mapper.loc[min_groups_idx].sum() @@ -220,16 +235,15 @@ if isinstance(self._min_group_name[col], str): min_group_mapper_name = self._min_group_name[col] else: - min_group_mapper_name = '_'.join([ - str(idx) - for idx - in mapper.loc[min_groups_idx].index.astype(str).sort_values() - ]) + min_group_mapper_name = '_'.join( + [ + str(idx) + for idx in mapper.loc[min_groups_idx].index.astype(str).sort_values() + ] + ) self._min_group_categories[col] = { - cat: min_group_mapper_name - for cat - in mapper.loc[min_groups_idx].index.tolist() + cat: min_group_mapper_name for cat in mapper.loc[min_groups_idx].index.tolist() } if not min_groups_idx.all(): @@ -249,33 +263,22 @@ def _check_set_create_attrs(self): "['force', True, False, None] or type dict." ) - if ( - self.handle_missing == 'return_nan' - and self.combine_min_nan_groups == 'force' - ): + if self.handle_missing == 'return_nan' and self.combine_min_nan_groups == 'force': raise ValueError( "Cannot have `handle_missing` == 'return_nan' and " "'combine_min_nan_groups' == 'force' for all columns." ) - - if ( - self.combine_min_nan_groups is not None - and self.min_group_size is None - ): + if self.combine_min_nan_groups is not None and self.min_group_size is None: pass # raise ValueError( # "`combine_min_nan_groups` only works when `min_group_size` " # "is set for all columns." # ) - if ( - self.min_group_name is not None - and self.min_group_size is None - ): + if self.min_group_name is not None and self.min_group_size is None: raise ValueError( - "`min_group_name` only works when `min_group_size` is set " - "for all columns." + '`min_group_name` only works when `min_group_size` is set for all columns.'
) if self.combine_min_nan_groups is None: @@ -314,15 +317,15 @@ def _check_set_create_dict_attrs(self): "Cannot have `handle_missing` == 'return_nan' and " f"'combine_min_nan_groups' == 'force' for columns `{col}`." ) - - if ( - self._combine_min_nan_groups[col] is not True - and self._min_group_size[col] is None - ): - raise ValueError(f"`combine_min_nan_groups` only works when `min_group_size` is set for column {col}.") - if ( - self._min_group_name[col] is not None - and self._min_group_size[col] is None - ): - raise ValueError(f"`min_group_name` only works when `min_group_size` is set for column {col}.") + if self._combine_min_nan_groups[col] is not True and self._min_group_size[col] is None: + raise ValueError( + f'`combine_min_nan_groups` only works when `min_group_size` is set for ' + f'column {col}.' + ) + + if self._min_group_name[col] is not None and self._min_group_size[col] is None: + raise ValueError( + f'`min_group_name` only works when `min_group_size` is set for ' + f'column {col}.' + ) diff --git a/category_encoders/datasets/__init__.py b/category_encoders/datasets/__init__.py index 4c65428f..1ecc7fa1 100644 --- a/category_encoders/datasets/__init__.py +++ b/category_encoders/datasets/__init__.py @@ -1,7 +1,8 @@ -from ._base import load_compass -from ._base import load_postcodes +"""Datasets used for examples.""" + +from ._base import load_compass, load_postcodes __all__ = [ - "load_compass", - "load_postcodes", -] \ No newline at end of file + 'load_compass', + 'load_postcodes', +] diff --git a/category_encoders/datasets/_base.py b/category_encoders/datasets/_base.py index bb168da1..a56dd62c 100644 --- a/category_encoders/datasets/_base.py +++ b/category_encoders/datasets/_base.py @@ -1,14 +1,13 @@ -""" -Base IO code for datasets -""" +"""Base IO code for datasets.""" try: - from importlib.resources import files, as_file + from importlib.resources import as_file, files except ImportError: - from importlib_resources import files, as_file + from importlib_resources import as_file, files import pandas as pd + def load_compass(): """Return a dataframe for target encoding with 16 rows of compass directions. 
@@ -24,8 +23,8 @@ def load_compass(): y: A pandas series containing the target variable """ - data_filename = "data/compass.csv" - stream = files("category_encoders.datasets") / data_filename + data_filename = 'data/compass.csv' + stream = files('category_encoders.datasets') / data_filename with as_file(stream) as f: df = pd.read_csv(f, encoding='latin-1') @@ -59,8 +58,8 @@ def load_postcodes(target_type='binary'): y: A pandas series containing the target variable """ - data_filename = "data/postcode_dataset_100.csv" - stream = files("category_encoders.datasets") / data_filename + data_filename = 'data/postcode_dataset_100.csv' + stream = files('category_encoders.datasets') / data_filename with as_file(stream) as f: df = pd.read_csv(f, encoding='latin-1') diff --git a/category_encoders/glmm.py b/category_encoders/glmm.py index 93eadf1f..a3f59bec 100644 --- a/category_encoders/glmm.py +++ b/category_encoders/glmm.py @@ -1,13 +1,16 @@ -"""Generalized linear mixed model""" -import warnings +"""Generalized linear mixed model.""" + import re +import warnings + import numpy as np import pandas as pd +import statsmodels.formula.api as smf from sklearn.utils.random import check_random_state -from category_encoders.ordinal import OrdinalEncoder +from statsmodels.genmod.bayes_mixed_glm import BinomialBayesMixedGLM + import category_encoders.utils as util -import statsmodels.formula.api as smf -from statsmodels.genmod.bayes_mixed_glm import BinomialBayesMixedGLM as bgmm +from category_encoders.ordinal import OrdinalEncoder __author__ = 'Jan Motl' @@ -15,26 +18,31 @@ class GLMMEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): """Generalized linear mixed model. - Supported targets: binomial and continuous. For polynomial target support, see PolynomialWrapper. + Supported targets: binomial and continuous. + For polynomial target support, see PolynomialWrapper. - This is a supervised encoder similar to TargetEncoder or MEstimateEncoder, but there are some advantages: + This is a supervised encoder similar to TargetEncoder or MEstimateEncoder, + but there are some advantages: - 1. Solid statistical theory behind the technique. Mixed effects models are a mature branch of statistics. - 2. No hyper-parameters to tune. The amount of shrinkage is automatically determined through the estimation - process. In short, the less observations a category has and/or the more the outcome varies for a category - then the higher the regularization towards "the prior" or "grand mean". - 3. The technique is applicable for both continuous and binomial targets. If the target is continuous, - the encoder returns regularized difference of the observation's category from the global mean. + 1. Solid statistical theory behind the technique. + Mixed effects models are a mature branch of statistics. + 2. No hyper-parameters to tune. The amount of shrinkage is automatically determined + through the estimation process. In short, the fewer observations a category has and/or + the more the outcome varies for a category, the higher the regularization + towards "the prior" or "grand mean". + 3. The technique is applicable for both continuous and binomial targets. + If the target is continuous, the encoder returns the regularized difference of the + observation's category from the global mean. If the target is binomial, the encoder returns regularized log odds per category. - In comparison to JamesSteinEstimator, this encoder utilizes generalized linear mixed models from statsmodels library.
+ In comparison to JamesSteinEstimator, this encoder utilizes generalized linear mixed models + from statsmodels library. Note: This is an alpha implementation. The API of the method may change in the future. Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -42,13 +50,15 @@ class GLMMEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): drop_invariant: bool boolean for whether or not to drop encoded columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_missing: str options are 'return_nan', 'error' and 'value', defaults to 'value', which returns 0. handle_unknown: str options are 'return_nan', 'error' and 'value', defaults to 'value', which returns 0. randomized: bool, - adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched). + adds normal (Gaussian) distribution noise into training data in order to decrease + overfitting (testing data are untouched). sigma: float standard deviation (spread or "width") of the normal distribution. binomial_target: bool @@ -61,8 +71,16 @@ class GLMMEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... 
] >>> y = bunch.target > 200000 >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = GLMMEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -71,11 +89,11 @@ class GLMMEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 7 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 5 Heating 1460 non-null float64 @@ -91,13 +109,31 @@ class GLMMEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): https://faculty.psau.edu.sa/filedownload/doc-12-pdf-a1997d0d31f84d13c1cdc44ac39a8f2c-original.pdf """ + prefit_ordinal = True encoding_relation = util.EncodingRelation.ONE_TO_ONE - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', - handle_missing='value', random_state=None, randomized=False, sigma=0.05, binomial_target=None): - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown=handle_unknown, handle_missing=handle_missing) + def __init__( + self, + verbose=0, + cols=None, + drop_invariant=False, + return_df=True, + handle_unknown='value', + handle_missing='value', + random_state=None, + randomized=False, + sigma=0.05, + binomial_target=None, + ): + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown=handle_unknown, + handle_missing=handle_missing, + ) self.ordinal_encoder = None self.mapping = None self.random_state = random_state @@ -109,10 +145,7 @@ def _fit(self, X, y, **kwargs): y = y.astype(float) self.ordinal_encoder = OrdinalEncoder( - verbose=self.verbose, - cols=self.cols, - handle_unknown='value', - handle_missing='value' + verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) @@ -131,9 +164,10 @@ def _transform(self, X, y=None): X = self._score(X, y) return X - def _more_tags(self): + def _more_tags(self) -> dict[str, bool]: + """Set scikit transformer tags.""" tags = super()._more_tags() - tags["predict_depends_on_y"] = True + tags['predict_depends_on_y'] = True return tags def _train(self, X, y): @@ -162,17 +196,25 @@ def _train(self, X, y): try: with warnings.catch_warnings(): - warnings.filterwarnings("ignore") + warnings.filterwarnings('ignore') if binomial_target: - # Classification, returns (regularized) log odds per category as stored in vc_mean - # Note: md.predict() returns: output = fe_mean + vcp_mean + vc_mean[category] - md = bgmm.from_formula('target ~ 1', {'a': '0 + C(feature)'}, data).fit_vb() - index_names = [int(float(re.sub(r'C\(feature\)\[(\S+)\]', r'\1', index_name))) for index_name in md.model.vc_names] + # Classification, returns (regularized) log odds per category as + # stored in vc_mean + # Note: md.predict() returns: + # output = fe_mean + vcp_mean + vc_mean[category] + md = BinomialBayesMixedGLM.from_formula( + 'target ~ 1', {'a': '0 + C(feature)'}, data + ).fit_vb() + index_names = [ + int(float(re.sub(r'C\(feature\)\[(\S+)\]', r'\1', index_name))) + for index_name in md.model.vc_names + ] estimate = pd.Series(md.vc_mean, index=index_names) else: - # Regression, 
returns (regularized) mean deviation of the observation's category from the global mean + # Regression, returns (regularized) mean deviation of the + # observation's category from the global mean md = smf.mixedlm('target ~ 1', data, groups=data['feature']).fit() - tmp = dict() + tmp = {} for key, value in md.random_effects.items(): tmp[key] = value[0] estimate = pd.Series(tmp) @@ -208,13 +250,14 @@ def _score(self, X, y): # Randomization is meaningful only for training data -> we do it only if y is present if self.randomized and y is not None: random_state_generator = check_random_state(self.random_state) - X[col] = (X[col] * random_state_generator.normal(1., self.sigma, X[col].shape[0])) + X[col] = X[col] * random_state_generator.normal(1.0, self.sigma, X[col].shape[0]) return X - def _rename_and_merge(self, X, y, col): - """ - Statsmodels requires: + def _rename_and_merge(self, X: pd.DataFrame, y: pd.Series, col: str) -> pd.DataFrame: + """Create a new DataFrame combining the column and target. + + This is needed as statsmodels requires: 1) unique column names 2) non-numeric columns names Solution: internally rename the columns. diff --git a/category_encoders/gray.py b/category_encoders/gray.py index eaa24feb..df04f3e8 100644 --- a/category_encoders/gray.py +++ b/category_encoders/gray.py @@ -1,24 +1,25 @@ -"""Gray encoding""" +"""Gray encoding.""" + from functools import partialmethod +from typing import List import pandas as pd from category_encoders import utils from category_encoders.basen import BaseNEncoder -from typing import List __author__ = 'paulwestenthanner' class GrayEncoder(BaseNEncoder): """Gray encoding for categorical variables. + Gray encoding is a form of binary encoding where consecutive values only differ by a single bit. Hence, gray encoding only makes sense for ordinal features. This has benefits in privacy preserving data publishing. Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -26,23 +27,32 @@ class GrayEncoder(BaseNEncoder): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform matrix + has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has nan values. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform matrix + has nan values. This can cause unexpected changes in dimension in some cases. 
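Since the defining property of a Gray code is that consecutive codes differ in exactly one bit, the `n ^ (n >> 1)` construction used by `gray_code` further below can be sanity-checked in isolation — a minimal standalone sketch:

```python
def gray(n: int) -> int:
    # Same construction as GrayEncoder.gray_code: n XOR (n >> 1).
    return n ^ (n >> 1)

# Consecutive ordinal values map to codes that differ in exactly one bit.
for i in range(7):
    assert bin(gray(i) ^ gray(i + 1)).count('1') == 1

print([format(gray(i), '03b') for i in range(8)])
# ['000', '001', '011', '010', '110', '111', '101', '100']
```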
Example ------- >>> from category_encoders import GrayEncoder >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... ] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = GrayEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -51,18 +61,18 @@ class GrayEncoder(BaseNEncoder): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 10 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 - 5 Heating_0 1460 non-null int64 - 6 Heating_1 1460 non-null int64 - 7 Heating_2 1460 non-null int64 - 8 CentralAir_0 1460 non-null int64 - 9 CentralAir_1 1460 non-null int64 + 5 Heating_0 1460 non-null int64 + 6 Heating_1 1460 non-null int64 + 7 Heating_2 1460 non-null int64 + 8 CentralAir_0 1460 non-null int64 + 9 CentralAir_1 1460 non-null int64 dtypes: float64(4), int64(5), object(1) memory usage: 114.2+ KB None @@ -71,16 +81,32 @@ class GrayEncoder(BaseNEncoder): ---------- .. [1] https://en.wikipedia.org/wiki/Gray_code - .. [2] Jun Zhang, Graham Cormode, Cecilia M. Procopiuc, Divesh Srivastava, and Xiaokui Xiao. 2017. PrivBayes: - Private Data Release via Bayesian Networks. ACM Trans. Database Syst. 42, 4, Article 25 (October 2017) + .. [2] Jun Zhang, Graham Cormode, Cecilia M. Procopiuc, Divesh Srivastava, and Xiaokui Xiao. + 2017. PrivBayes: Private Data Release via Bayesian Networks. ACM Trans. Database Syst. 42, 4, + Article 25 (October 2017) """ + encoding_relation = utils.EncodingRelation.ONE_TO_M __init__ = partialmethod(BaseNEncoder.__init__, base=2) @staticmethod - def gray_code(n, n_bit) -> List[int]: + def gray_code(n: int, n_bit: int) -> List[int]: + """Calculate the n-bit gray code for a value n. + + Parameters + ---------- + n: int + Value to encode (ordinal value of a category). + n_bit: int + Number of bits to encode to. + + Returns + ------- + List[int] + gray encoding of the input value. 
+ """ gray = n ^ (n >> 1) - gray_formatted = "{0:0{1}b}".format(gray, n_bit) + gray_formatted = '{0:0{1}b}'.format(gray, n_bit) return [int(bit) for bit in gray_formatted] def _fit(self, X, y=None, **kwargs): @@ -88,21 +114,25 @@ def _fit(self, X, y=None, **kwargs): gray_mapping = [] # convert binary mapping to Gray mapping and reorder for col_to_encode in self.mapping: - col = col_to_encode["col"] - bin_mapping = col_to_encode["mapping"] + col = col_to_encode['col'] + bin_mapping = col_to_encode['mapping'] n_cols_out = bin_mapping.shape[1] null_cond = (bin_mapping.index < 0) | (bin_mapping.isna().all(1)) map_null = bin_mapping[null_cond] map_non_null = bin_mapping[~null_cond].copy() - ordinal_mapping = [m for m in self.ordinal_encoder.mapping if m.get("col") == col] + ordinal_mapping = [m for m in self.ordinal_encoder.mapping if m.get('col') == col] if len(ordinal_mapping) != 1: - raise ValueError("Cannot find ordinal encoder mapping of Gray encoder") - ordinal_mapping = ordinal_mapping[0]["mapping"] + raise ValueError('Cannot find ordinal encoder mapping of Gray encoder') + ordinal_mapping = ordinal_mapping[0]['mapping'] reverse_ordinal_mapping = {v: k for k, v in ordinal_mapping.to_dict().items()} - map_non_null["orig_value"] = map_non_null.index.to_series().map(reverse_ordinal_mapping) - map_non_null = map_non_null.sort_values(by="orig_value") - gray_encoding = [self.gray_code(i + 1, n_cols_out) for i in range(map_non_null.shape[0])] - gray_encoding = pd.DataFrame(data=gray_encoding, index=map_non_null.index, columns=bin_mapping.columns) + map_non_null['orig_value'] = map_non_null.index.to_series().map(reverse_ordinal_mapping) + map_non_null = map_non_null.sort_values(by='orig_value') + gray_encoding = [ + self.gray_code(i + 1, n_cols_out) for i in range(map_non_null.shape[0]) + ] + gray_encoding = pd.DataFrame( + data=gray_encoding, index=map_non_null.index, columns=bin_mapping.columns + ) gray_encoding = pd.concat([gray_encoding, map_null]) - gray_mapping.append({"col": col, "mapping": gray_encoding}) + gray_mapping.append({'col': col, 'mapping': gray_encoding}) self.mapping = gray_mapping diff --git a/category_encoders/hashing.py b/category_encoders/hashing.py index 3ce9a536..86f5c063 100644 --- a/category_encoders/hashing.py +++ b/category_encoders/hashing.py @@ -1,20 +1,21 @@ """The hashing module contains all methods and classes related to the hashing trick.""" import hashlib -import category_encoders.utils as util -import multiprocessing -import pandas as pd -import numpy as np import math +import multiprocessing import platform from concurrent.futures import ProcessPoolExecutor +import numpy as np +import pandas as pd + +import category_encoders.utils as util + __author__ = 'willmcginnis', 'LiuShulun' class HashingEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): - - """ A multivariate hashing implementation with configurable dimensionality/precision. + """A multivariate hashing implementation with configurable dimensionality/precision. The advantage of this encoder is that it does not maintain a dictionary of observed categories. Consequently, the encoder does not grow in size and accepts new values during data scoring @@ -23,13 +24,13 @@ class HashingEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): It's important to read about how max_process & max_sample work before setting them manually, inappropriate setting slows down encoding. 
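To illustrate the dictionary-free behaviour described above — unseen values at scoring time still land in the same fixed-width output space — here is a small usage sketch (column name and values are made up):

```python
import pandas as pd
from category_encoders.hashing import HashingEncoder

train = pd.DataFrame({'city': ['Berlin', 'Paris', 'Berlin']})
score = pd.DataFrame({'city': ['Tokyo']})  # value never seen during fit

enc = HashingEncoder(cols=['city'], n_components=8, max_process=1)
enc.fit(train)
# No mapping of observed categories is stored, so 'Tokyo' simply hashes
# into the same 8 output columns; collisions with other values are possible.
print(enc.transform(score).shape)  # -> (1, 8)
```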
- Default value of 'max_process' is 1 on Windows because multiprocessing might cause issues, see in : + Default value of 'max_process' is 1 on Windows because multiprocessing might cause issues, + see in : https://github.com/scikit-learn-contrib/categorical-encoding/issues/215 https://docs.python.org/2/library/multiprocessing.html?highlight=process#windows Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -37,7 +38,8 @@ class HashingEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). hash_method: str which hashing method to use. Any method from hashlib works. max_process: int @@ -58,7 +60,7 @@ class HashingEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): how many bits to use to represent the feature. By default, we use 8 bits. For high-cardinality features, consider using up-to 32 bits. process_creation_method: string - either "fork", "spawn" or "forkserver" (availability depends on your + either "fork", "spawn" or "forkserver" (availability depends on your platform). See https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods for more details and tradeoffs. Defaults to "fork" on linux/macos as it is the fastest option and to "spawn" on windows as it is the only one @@ -69,8 +71,16 @@ class HashingEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): >>> from category_encoders.hashing import HashingEncoder >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... 
] >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> y = bunch.target >>> he = HashingEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -79,19 +89,19 @@ class HashingEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 13 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 col_0 1460 non-null int64 - 1 col_1 1460 non-null int64 - 2 col_2 1460 non-null int64 - 3 col_3 1460 non-null int64 - 4 col_4 1460 non-null int64 - 5 col_5 1460 non-null int64 - 6 col_6 1460 non-null int64 - 7 col_7 1460 non-null int64 + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 col_0 1460 non-null int64 + 1 col_1 1460 non-null int64 + 2 col_2 1460 non-null int64 + 3 col_3 1460 non-null int64 + 4 col_4 1460 non-null int64 + 5 col_5 1460 non-null int64 + 6 col_6 1460 non-null int64 + 7 col_7 1460 non-null int64 8 Id 1460 non-null float64 9 MSSubClass 1460 non-null float64 - 10 MSZoning 1460 non-null object + 10 MSZoning 1460 non-null object 11 LotFrontage 1201 non-null float64 12 YearBuilt 1460 non-null float64 dtypes: float64(4), int64(8), object(1) @@ -106,13 +116,30 @@ class HashingEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087 """ + prefit_ordinal = False encoding_relation = util.EncodingRelation.ONE_TO_M - def __init__(self, max_process=0, max_sample=0, verbose=0, n_components=8, cols=None, drop_invariant=False, - return_df=True, hash_method='md5', process_creation_method='fork'): - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown="does not apply", handle_missing="does not apply") + def __init__( + self, + max_process=0, + max_sample=0, + verbose=0, + n_components=8, + cols=None, + drop_invariant=False, + return_df=True, + hash_method='md5', + process_creation_method='fork', + ): + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown='does not apply', + handle_missing='does not apply', + ) if max_process not in range(1, 128): if platform.system() == 'Windows': @@ -127,7 +154,7 @@ def __init__(self, max_process=0, max_sample=0, verbose=0, n_components=8, cols= self.max_process = max_process self.max_sample = int(max_sample) if platform.system() == 'Windows': - self.process_creation_method = "spawn" + self.process_creation_method = 'spawn' else: self.process_creation_method = process_creation_method self.data_lines = 0 @@ -144,17 +171,14 @@ def _transform(self, X, override_return_df=False): Parameters ---------- - X : array-like, shape = [n_samples, n_features] Returns ------- - p : array, shape = [n_samples, n_numeric + N] Transformed values with encoding applied. """ - if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -169,16 +193,32 @@ def _transform(self, X, override_return_df=False): return X X = self.hashing_trick( - X, + X, hashing_method=self.hash_method, - N=self.n_components, - cols=self.cols, + N=self.n_components, + cols=self.cols, ) - + return X @staticmethod def hash_chunk(hash_method: str, np_df: np.ndarray, N: int) -> np.ndarray: + """Perform hashing on the given numpy array. + + Parameters + ---------- + hash_method: str + Hashlib method to use. + np_df: np.ndarray + Data to hash. + N: int + Number of bits to encode the data. + + Returns + ------- + np.ndarray + Hashed data. 
+ """ # Calling getattr outside the loop saves some time in the loop hasher_constructor = getattr(hashlib, hash_method) # Same when the call to getattr is implicit @@ -188,12 +228,12 @@ def hash_chunk(hash_method: str, np_df: np.ndarray, N: int) -> np.ndarray: for val in row: if val is not None: hasher = hasher_constructor() - # Computes an integer index from the hasher digest. The endian is + # Computes an integer index from the hasher digest. The endian is # "big" as the code use to read: # column_index = int(hasher.hexdigest(), 16) % N # which is implicitly considering the hexdigest to be big endian, # even if the system is little endian. - # Building the index that way is about 30% faster than using the + # Building the index that way is about 30% faster than using the # hexdigest. hasher.update(bytes(str(val), 'utf-8')) column_index = int_from_bytes(hasher.digest(), byteorder='big') % N @@ -201,22 +241,52 @@ def hash_chunk(hash_method: str, np_df: np.ndarray, N: int) -> np.ndarray: return result def hashing_trick_with_np_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame: + """Perform the hashing trick in parallel. + + Parameters + ---------- + df: pd.DataFrame + data to hash. + N: int + how many bits to use to represent the feature. + + Returns + ------- + pd.DataFrame + hashed data. + """ np_df = df.to_numpy() ctx = multiprocessing.get_context(self.process_creation_method) with ProcessPoolExecutor(max_workers=self.max_process, mp_context=ctx) as executor: - result = np.concatenate(list( - executor.map( - self.hash_chunk, - [self.hash_method]*self.max_process, - np.array_split(np_df, self.max_process), - [N]*self.max_process + result = np.concatenate( + list( + executor.map( + self.hash_chunk, + [self.hash_method] * self.max_process, + np.array_split(np_df, self.max_process), + [N] * self.max_process, + ) ) - )) + ) return pd.DataFrame(result, index=df.index) def hashing_trick_with_np_no_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame: + """Perform the hashing trick in a single thread (non-parallel). + + Parameters + ---------- + df: pd.DataFrame + data to hash. + N: int + how many bits to use to represent the feature. + + Returns + ------- + pd.DataFrame + hashed data. + """ np_df = df.to_numpy() result = HashingEncoder.hash_chunk(self.hash_method, np_df, N) @@ -224,15 +294,14 @@ def hashing_trick_with_np_no_parallel(self, df: pd.DataFrame, N: int) -> pd.Data return pd.DataFrame(result, index=df.index) def hashing_trick(self, X_in, hashing_method='md5', N=2, cols=None, make_copy=False): - """A basic hashing implementation with configurable dimensionality/precision + """A basic hashing implementation with configurable dimensionality/precision. - Performs the hashing trick on a pandas dataframe, `X`, using the hashing method from hashlib - identified by `hashing_method`. The number of output dimensions (`N`), and columns to hash (`cols`) are - also configurable. + Performs the hashing trick on a pandas dataframe, `X`, using the hashing method from + hashlib identified by `hashing_method`. + The number of output dimensions (`N`), and columns to hash (`cols`) are also configurable. Parameters ---------- - X_in: pandas dataframe description text hashing_method: string, optional @@ -246,7 +315,6 @@ def hashing_trick(self, X_in, hashing_method='md5', N=2, cols=None, make_copy=Fa Returns ------- - out : dataframe A hashing encoded dataframe. 
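Routing a single value to an output column in `hash_chunk` boils down to three steps: hash the string representation, read the digest as a big-endian integer, and take it modulo `N`. A standalone sketch of that computation (md5 and `N=8` as in the defaults; the builtin `int.from_bytes` stands in for the module's `int_from_bytes`):

```python
import hashlib

def column_index(value, N=8, hash_method='md5'):
    hasher = getattr(hashlib, hash_method)()
    hasher.update(bytes(str(value), 'utf-8'))
    # Big-endian digest -> integer -> modulo, as in hash_chunk above.
    return int.from_bytes(hasher.digest(), byteorder='big') % N

print(column_index('GasA'), column_index('Wall'))
```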
@@ -254,13 +322,15 @@ def hashing_trick(self, X_in, hashing_method='md5', N=2, cols=None, make_copy=Fa ---------- Cite the relevant literature, e.g. [1]_. You may also cite these references in the notes section above. - .. [1] Kilian Weinberger; Anirban Dasgupta; John Langford; Alex Smola; Josh Attenberg (2009). Feature Hashing - for Large Scale Multitask Learning. Proc. ICML. + .. [1] Kilian Weinberger; Anirban Dasgupta; John Langford; Alex Smola; + Josh Attenberg (2009). Feature Hashing for Large Scale Multitask Learning. Proc. ICML. """ if hashing_method not in hashlib.algorithms_available: - raise ValueError(f"Hashing Method: {hashing_method} not Available. " - f"Please use one from: [{', '.join([str(x) for x in hashlib.algorithms_available])}]") + raise ValueError( + f"Hashing Method: {hashing_method} not available. " + f"Please use one of: {', '.join([str(x) for x in hashlib.algorithms_available])}" + ) if make_copy: X = X_in.copy(deep=True) diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py index 42ae7c7e..e236f457 100644 --- a/category_encoders/helmert.py +++ b/category_encoders/helmert.py @@ -1,8 +1,7 @@ -"""Helmert contrast coding""" +"""Helmert contrast coding.""" - -from patsy.contrasts import ContrastMatrix, Helmert import numpy as np +from patsy.contrasts import ContrastMatrix, Helmert from category_encoders.base_contrast_encoder import BaseContrastEncoder @@ -14,7 +13,6 @@ class HelmertEncoder(BaseContrastEncoder): Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -22,37 +20,48 @@ class HelmertEncoder(BaseContrastEncoder): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform matrix + has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has nan values. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform matrix + has nan values. This can cause unexpected changes in dimension in some cases. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... 
] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] - >>> enc = HelmertEncoder(cols=['CentralAir', 'Heating'], handle_unknown='value', handle_missing='value').fit(X, y) + >>> enc = HelmertEncoder( + ... cols=['CentralAir', 'Heating'], handle_unknown='value', handle_missing='value' + ... ).fit(X, y) >>> numeric_dataset = enc.transform(X) >>> print(numeric_dataset.info()) RangeIndex: 1460 entries, 0 to 1459 Data columns (total 12 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 intercept 1460 non-null int64 + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 intercept 1460 non-null int64 1 Id 1460 non-null float64 2 MSSubClass 1460 non-null float64 - 3 MSZoning 1460 non-null object + 3 MSZoning 1460 non-null object 4 LotFrontage 1201 non-null float64 5 YearBuilt 1460 non-null float64 6 Heating_0 1460 non-null float64 @@ -73,7 +82,9 @@ class HelmertEncoder(BaseContrastEncoder): .. [2] Gregory Carey (2003). Coding Categorical Variables, from http://ibgwww.colorado.edu/~carey/p5741ndir/Coding_Categorical_Variables.pdf - + """ + def get_contrast_matrix(self, values_to_encode: np.array) -> ContrastMatrix: + """Get the contrast matrix for the helmert encoder.""" return Helmert().code_without_intercept(values_to_encode) diff --git a/category_encoders/james_stein.py b/category_encoders/james_stein.py index ab16c6f3..eca34140 100644 --- a/category_encoders/james_stein.py +++ b/category_encoders/james_stein.py @@ -1,19 +1,21 @@ -"""James-Stein""" +"""James-Stein.""" + import numpy as np import pandas as pd import scipy -from scipy import optimize -from category_encoders.ordinal import OrdinalEncoder -import category_encoders.utils as util from sklearn.utils.random import check_random_state +import category_encoders.utils as util +from category_encoders.ordinal import OrdinalEncoder + __author__ = 'Jan Motl' class JamesSteinEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): """James-Stein estimator. - Supported targets: binomial and continuous. For polynomial target support, see PolynomialWrapper. + Supported targets: binomial and continuous. + For polynomial target support, see PolynomialWrapper. For feature value `i`, James-Stein estimator returns a weighted average of: @@ -22,7 +24,7 @@ class JamesSteinEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): This can be written as:: - JS_i = (1-B)*mean(y_i) + B*mean(y) + JS_i = (1 - B) * mean(y_i) + B * mean(y) The question is, what should be the weight `B`? If we put too much weight on the conditional mean value, we will overfit. @@ -32,7 +34,7 @@ class JamesSteinEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): The intuition is: If the estimate of `mean(y_i)` is unreliable (`y_i` has high variance), we should put more weight on `mean(y)`. Stein put it into an equation as:: - B = var(y_i) / (var(y_i)+var(y)) + B = var(y_i) / (var(y_i) + var(y)) The only remaining issue is that we do not know `var(y)`, let alone `var(y_i)`. Hence, we have to estimate the variances. But how can we reliably estimate the @@ -64,7 +66,6 @@ class JamesSteinEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -72,15 +73,19 @@ class JamesSteinEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): drop_invariant: bool boolean for whether or not to drop encoded columns with 0 variance. 
return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_missing: str - options are 'return_nan', 'error' and 'value', defaults to 'value', which returns the prior probability. + options are 'return_nan', 'error' and 'value', defaults to 'value', + which returns the prior probability. handle_unknown: str - options are 'return_nan', 'error' and 'value', defaults to 'value', which returns the prior probability. + options are 'return_nan', 'error' and 'value', defaults to 'value', + which returns the prior probability. model: str options are 'pooled', 'beta', 'binary' and 'independent', defaults to 'independent'. randomized: bool, - adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched). + adds normal (Gaussian) distribution noise into training data in order to decrease + overfitting (testing data are untouched). sigma: float standard deviation (spread or "width") of the normal distribution. @@ -90,8 +95,16 @@ class JamesSteinEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... ] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = JamesSteinEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -100,11 +113,11 @@ class JamesSteinEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 7 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 5 Heating 1460 non-null float64 @@ -116,8 +129,8 @@ class JamesSteinEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): References ---------- - .. [1] Parametric empirical Bayes inference: Theory and applications, equations 1.19 & 1.20, from - https://www.jstor.org/stable/2287098 + .. [1] Parametric empirical Bayes inference: Theory and applications, equations 1.19 & 1.20, + from https://www.jstor.org/stable/2287098 .. 
[2] Empirical Bayes for multiple sample sizes, from http://chris-said.io/2017/05/03/empirical-bayes-for-multiple-sample-sizes/ @@ -132,13 +145,31 @@ class JamesSteinEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): http://statweb.stanford.edu/~ckirby/brad/other/Article1977.pdf """ + prefit_ordinal = True encoding_relation = util.EncodingRelation.ONE_TO_ONE - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', - handle_missing='value', model='independent', random_state=None, randomized=False, sigma=0.05): - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown=handle_unknown, handle_missing=handle_missing) + def __init__( + self, + verbose=0, + cols=None, + drop_invariant=False, + return_df=True, + handle_unknown='value', + handle_missing='value', + model='independent', + random_state=None, + randomized=False, + sigma=0.05, + ): + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown=handle_unknown, + handle_missing=handle_missing, + ) self.ordinal_encoder = None self.mapping = None self.random_state = random_state @@ -147,12 +178,8 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, h self.model = model def _fit(self, X, y, **kwargs): - self.ordinal_encoder = OrdinalEncoder( - verbose=self.verbose, - cols=self.cols, - handle_unknown='value', - handle_missing='value' + verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) @@ -168,13 +195,23 @@ def _fit(self, X, y, **kwargs): # The label must be binary with values {0,1} unique = y.unique() if len(unique) != 2: - raise ValueError("The target column y must be binary. But the target contains " + str(len(unique)) + " unique value(s).") + raise ValueError( + 'The target column y must be binary. But the target contains ' + + str(len(unique)) + + ' unique value(s).' + ) if y.isna().any(): - raise ValueError("The target column y must not contain missing values.") + raise ValueError('The target column y must not contain missing values.') if np.max(unique) < 1: - raise ValueError("The target column y must be binary with values {0, 1}. Value 1 was not found in the target.") + raise ValueError( + 'The target column y must be binary with values {0, 1}. ' + 'Value 1 was not found in the target.' + ) if np.min(unique) > 0: - raise ValueError("The target column y must be binary with values {0, 1}. Value 0 was not found in the target.") + raise ValueError( + 'The target column y must be binary with values {0, 1}. ' + 'Value 0 was not found in the target.' + ) # Perform the training self.mapping = self._train_log_odds_ratio(X_ordinal, y) else: @@ -191,9 +228,10 @@ def _transform(self, X, y=None): X = self._score(X, y) return X - def _more_tags(self): + def _more_tags(self) -> dict[str, bool]: + """Set scikit transformer tags.""" tags = super()._more_tags() - tags["predict_depends_on_y"] = True + tags['predict_depends_on_y'] = True return tags def _train_pooled(self, X, y): @@ -214,29 +252,32 @@ def _train_pooled(self, X, y): # Calculate sum and count of the target for each unique value in the feature col stats = y.groupby(X[col]).agg(['mean', 'count']) - # See: Computer Age Statistical Inference: Algorithms, Evidence, and Data Science (Bradley Efron & Trevor Hastie, 2016) - # Equations 7.19 and 7.20. 
+ # See: Computer Age Statistical Inference: Algorithms, Evidence, and Data Science + # (Bradley Efron & Trevor Hastie, 2016) Equations 7.19 and 7.20. # Note: The equations assume normal distribution of the label. But our label is p(y|x), - # which is definitely not normally distributed as probabilities are bound to lie on interval 0..1. + # which is definitely not normally distributed as probabilities are bound to lie on + # interval 0..1. # We make this approximation because Efron does it as well. # Equation 7.19 # Explanation of the equation: # https://stats.stackexchange.com/questions/191444/variance-in-estimating-p-for-a-binomial-distribution # if stats['count'].var() > 0: - # warnings.warn('The pooled model assumes that each category is observed exactly N times. This was violated in "' + str(col) +'" column. Consider comparing the accuracy of this model to "independent" model.') + # warnings.warn('The pooled model assumes that each category is observed + # exactly N times. This was violated in "' + str(col) +'" column. + # Consider comparing the accuracy of this model to "independent" model.') # This is a parametric estimate of var(p) in the binomial distribution. # We do not use it because we also want to support non-binary targets. # The difference in the estimates is small. # variance = prior * (1 - prior) / stats['count'].mean() # This is a squared estimate of standard error of the mean: # https://en.wikipedia.org/wiki/Standard_error - variance = target_var/(stats['count'].mean()) + variance = target_var / (stats['count'].mean()) # Equation 7.20 - SSE = ((stats['mean']-prior)**2).sum() # Sum of Squared Errors + SSE = ((stats['mean'] - prior) ** 2).sum() # Sum of Squared Errors if SSE > 0: # We have to avoid division by zero - B = ((len(stats['count'])-3)*variance) / SSE + B = ((len(stats['count']) - 3) * variance) / SSE B = B.clip(0, 1) estimate = prior + (1 - B) * (stats['mean'] - prior) else: @@ -280,28 +321,34 @@ def _train_independent(self, X, y): # Calculate sum and count of the target for each unique value in the feature col stats = y.groupby(X[col]).agg(['mean', 'var']) - i_var = stats['var'].fillna(0) # When we do not have more than 1 sample, assume 0 variance + i_var = stats['var'].fillna( + 0 + ) # When we do not have more than 1 sample, assume 0 variance unique_cnt = len(X[col].unique()) # See: Parametric Empirical Bayes Inference: Theory and Applications (Morris, 1983) # Equations 1.19 and 1.20. # Note: The equations assume normal distribution of the label. But our label is p(y|x), - # which is definitely not normally distributed as probabilities are bound to lie on interval 0..1. + # which is definitely not normally distributed as probabilities are bound to lie + # on interval 0..1. # Nevertheless, it seems to perform surprisingly well. This is in agreement with: # Data Analysis with Stein's Estimator and Its Generalizations (Efron & Morris, 1975) # The equations are similar to James-Stein estimator, as listed in: # Stein's Paradox in Statistics (Efron & Morris, 1977) # Or: - # Computer Age Statistical Inference: Algorithms, Evidence, and Data Science (Efron & Hastie, 2016) - # Equations 7.19 and 7.20. - # The difference is that they have equal count of observations per estimated variable, while we generally - # do not have that. Nice discussion about that is given at: + # Computer Age Statistical Inference: Algorithms, Evidence, and Data Science + # (Efron & Hastie, 2016) Equations 7.19 and 7.20. 
+ # The difference is that they have equal count of observations per estimated variable, + # while we generally # do not have that. + # Nice discussion about that is given at: # http://chris-said.io/2017/05/03/empirical-bayes-for-multiple-sample-sizes/ - smoothing = i_var / (global_var + i_var) * (unique_cnt-3) / (unique_cnt-1) + smoothing = i_var / (global_var + i_var) * (unique_cnt - 3) / (unique_cnt - 1) smoothing = 1 - smoothing - smoothing = smoothing.clip(lower=0, upper=1) # Smoothing should be in the interval <0,1> + smoothing = smoothing.clip( + lower=0, upper=1 + ) # Smoothing should be in the interval <0,1> - estimate = smoothing*(stats['mean']) + (1-smoothing)*prior + estimate = smoothing * (stats['mean']) + (1 - smoothing) * prior # Ignore unique values. This helps to prevent overfitting on id-like columns if len(stats['mean']) == global_count: @@ -337,10 +384,10 @@ def _train_log_odds_ratio(self, X, y): # https://en.wikipedia.org/wiki/Newton%27s_method # But we just use sklearn minimizer. def get_best_sigma(sigma, mu_k, sigma_k, K): - global mu # Ugly. But I want to be able to read it once the optimization ends. - w_k = 1. / (sigma ** 2 + sigma_k ** 2) # Weights depends on sigma - mu = sum(w_k * mu_k) / sum(w_k) # Mu transitively depends on sigma - total = sum(w_k * (mu_k - mu) ** 2) # We want this to be close to K-1 + global mu # Ugly. But I want to be able to read it once the optimization ends. + w_k = 1.0 / (sigma**2 + sigma_k**2) # Weights depends on sigma + mu = sum(w_k * mu_k) / sum(w_k) # Mu transitively depends on sigma + total = sum(w_k * (mu_k - mu) ** 2) # We want this to be close to K-1 loss = abs(total - (K - 1)) return loss @@ -358,7 +405,9 @@ def get_best_sigma(sigma, mu_k, sigma_k, K): crosstable['E+A-'] = global_sum - stats['sum'] crosstable['E+A+'] = stats['sum'] index = crosstable.index - crosstable = np.array(crosstable, dtype=np.float32) # The argument unites the types into float + crosstable = np.array( + crosstable, dtype=np.float32 + ) # The argument unites the types into float # Count of contingency tables. K = len(crosstable) @@ -369,23 +418,36 @@ def get_best_sigma(sigma, mu_k, sigma_k, K): else: if K > 1: # We want to avoid division by zero in y_k calculation # Estimate log-odds ratios with Yates correction as listed on page 5. - mu_k = np.log((crosstable[:, 0] + 0.5) * (crosstable[:, 3] + 0.5) / ((crosstable[:, 1] + 0.5) * (crosstable[:, 2] + 0.5))) + mu_k = np.log( + (crosstable[:, 0] + 0.5) + * (crosstable[:, 3] + 0.5) + / ((crosstable[:, 1] + 0.5) * (crosstable[:, 2] + 0.5)) + ) # Standard deviation estimate for 2x2 contingency table as given in equation 2. # The explanation of the equation is given in: # https://stats.stackexchange.com/questions/266098/how-do-i-calculate-the-standard-deviation-of-the-log-odds - sigma_k = np.sqrt(np.sum(1. / (crosstable + 0.5), axis=1)) + sigma_k = np.sqrt(np.sum(1.0 / (crosstable + 0.5), axis=1)) # Estimate the sigma and mu. Sigma is non-negative. - result = scipy.optimize.minimize(get_best_sigma, x0=1e-4, args=(mu_k, sigma_k, K), bounds=[(0, np.inf)], method='TNC', tol=1e-12, options={'gtol': 1e-12, 'ftol': 1e-12, 'eps': 1e-12}) + result = scipy.optimize.minimize( + get_best_sigma, + x0=1e-4, + args=(mu_k, sigma_k, K), + bounds=[(0, np.inf)], + method='TNC', + tol=1e-12, + options={'gtol': 1e-12, 'ftol': 1e-12, 'eps': 1e-12}, + ) sigma = result.x[0] # Empirical Bayes follows equation 7. - # However, James-Stein estimator behaves perversely when K < 3. Hence, we clip the B into interval <0,1>. 
+ # However, James-Stein estimator behaves perversely when K < 3. + # Hence, we clip the B into interval <0,1>. # Literature reference for the clipping: - # Estimates of Income for Small Places: An Application of James-Stein Procedures to Census Data (Fay & Harriout, 1979), - # page 270. - B = (K - 3) * sigma_k ** 2 / ((K - 1) * (sigma ** 2 + sigma_k ** 2)) + # Estimates of Income for Small Places: An Application of James-Stein + # Procedures to Census Data (Fay & Harriout, 1979), page 270. + B = (K - 3) * sigma_k**2 / ((K - 1) * (sigma**2 + sigma_k**2)) B = B.clip(0, 1) y_k = mu + (1 - B) * (mu_k - mu) @@ -429,7 +491,7 @@ def _train_beta(self, X, y): # See: Stein's paradox and group rationality (Romeijn, 2017), page 14 smoothing = stats['count'] / (stats['count'] + global_count) - estimate = smoothing*(stats['mean']) + (1-smoothing)*prior + estimate = smoothing * (stats['mean']) + (1 - smoothing) * prior # Ignore unique values. This helps to prevent overfitting on id-like columns if len(stats['mean']) == global_count: @@ -459,6 +521,6 @@ def _score(self, X, y): # Randomization is meaningful only for training data -> we do it only if y is present if self.randomized and y is not None: random_state_generator = check_random_state(self.random_state) - X[col] = (X[col] * random_state_generator.normal(1., self.sigma, X[col].shape[0])) + X[col] = X[col] * random_state_generator.normal(1.0, self.sigma, X[col].shape[0]) return X diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py index 40317f81..ffa068b4 100644 --- a/category_encoders/leave_one_out.py +++ b/category_encoders/leave_one_out.py @@ -1,9 +1,11 @@ -"""Leave one out coding""" +"""Leave one out coding.""" + import numpy as np import pandas as pd -import category_encoders.utils as util from sklearn.utils.random import check_random_state +import category_encoders.utils as util + __author__ = 'hbghhy' @@ -16,7 +18,6 @@ class LeaveOneOutEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -24,16 +25,20 @@ class LeaveOneOutEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_missing: str - options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean. + options are 'error', 'return_nan' and 'value', defaults to 'value', + which returns the target mean. handle_unknown: str - options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean. + options are 'error', 'return_nan' and 'value', defaults to 'value', + which returns the target mean. sigma: float - adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing - data are untouched). Sigma gives the standard deviation (spread or "width") of the normal distribution. - The optimal value is commonly between 0.05 and 0.6. The default is to not add noise, but that leads - to significantly suboptimal results. + adds normal (Gaussian) distribution noise into training data in order to decrease + overfitting (testing data are untouched). Sigma gives the standard deviation + (spread or "width") of the normal distribution. 
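The multiplicative noise documented for `sigma` here is the same trick `_score` applies in the James-Stein hunk above; combined with the shrinkage formulas from that encoder's docstring, the core update fits in a few lines. A toy sketch on made-up data (not the encoder's exact code path):

```python
import pandas as pd
from sklearn.utils import check_random_state

X = pd.Series(['a', 'a', 'a', 'b', 'b', 'c'])
y = pd.Series([1.0, 0.0, 1.0, 0.0, 0.0, 1.0])

# B = var(y_i) / (var(y_i) + var(y));  JS_i = (1 - B) * mean(y_i) + B * mean(y)
prior, global_var = y.mean(), y.var()
stats = y.groupby(X).agg(['mean', 'var'])
i_var = stats['var'].fillna(0)  # singleton categories: assume zero variance
B = (i_var / (i_var + global_var)).clip(0, 1)
js = (1 - B) * stats['mean'] + B * prior

# Training-time regularization: multiply the encoding by Gaussian noise
# centered on 1.0 with spread sigma; test-time data stays untouched.
rng = check_random_state(42)
print(js * rng.normal(1.0, 0.05, len(js)))
```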
The optimal value is commonly + between 0.05 and 0.6. + The default is to not add noise, but that leads to significantly suboptimal results. Example @@ -41,8 +46,16 @@ class LeaveOneOutEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... ] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = LeaveOneOutEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -51,11 +64,11 @@ class LeaveOneOutEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 7 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 5 Heating 1460 non-null float64 @@ -71,13 +84,29 @@ class LeaveOneOutEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): https://datascience.stackexchange.com/questions/10839/what-is-difference-between-one-hot-encoding-and-leave-one-out-encoding """ + prefit_ordinal = False encoding_relation = util.EncodingRelation.ONE_TO_ONE - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, - handle_unknown='value', handle_missing='value', random_state=None, sigma=None): - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown=handle_unknown, handle_missing=handle_missing) + def __init__( + self, + verbose=0, + cols=None, + drop_invariant=False, + return_df=True, + handle_unknown='value', + handle_missing='value', + random_state=None, + sigma=None, + ): + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown=handle_unknown, + handle_missing=handle_missing, + ) self.mapping = None self._mean = None self.random_state = random_state @@ -85,28 +114,26 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, def _fit(self, X, y, **kwargs): y = y.astype(float) - categories = self.fit_leave_one_out( - X, y, - cols=self.cols - ) + categories = self.fit_leave_one_out(X, y, cols=self.cols) self.mapping = categories def _transform(self, X, y=None): if y is not None: y = y.astype(float) - X = self.transform_leave_one_out( - X, y, - mapping=self.mapping - ) + X = self.transform_leave_one_out(X, y, mapping=self.mapping) return X - def _more_tags(self): + def _more_tags(self) -> dict[str, bool]: + """Set scikit transformer tags.""" tags = super()._more_tags() - tags["predict_depends_on_y"] = True + tags['predict_depends_on_y'] = True return tags - def fit_leave_one_out(self, X_in, y, cols=None): + def fit_leave_one_out( + self, X_in: pd.DataFrame, y: pd.Series, cols=None + ) -> dict[str, pd.Series]: + """Fit leave one out encoding.""" X = X_in.copy(deep=True) if cols is None: @@ -114,9 +141,10 @@ def fit_leave_one_out(self, X_in, y, cols=None): 
        self._mean = y.mean()

-        return {col: self.fit_column_map(X[col], y) for col in cols}
+        return {col: self._fit_column_map(X[col], y) for col in cols}

-    def fit_column_map(self, series, y):
+    @staticmethod
+    def _fit_column_map(series: pd.Series, y: pd.Series) -> pd.Series:
         category = pd.Categorical(series)

         categories = category.categories
@@ -125,43 +153,52 @@ def fit_column_map(self, series, y):
         codes[codes == -1] = len(categories)
         categories = np.append(categories, np.nan)

-        return_map = pd.Series({code: category for code, category in enumerate(categories)})
+        return_map = pd.Series(dict(enumerate(categories)))

         result = y.groupby(codes).agg(['sum', 'count'])
         return result.rename(return_map)

-    def transform_leave_one_out(self, X, y, mapping=None):
-        """
-        Leave one out encoding uses a single column of floats to represent the means of the target variables.
-        """
+    def transform_leave_one_out(self, X: pd.DataFrame, y: pd.Series | None, mapping=None):
+        """Apply leave-one-out-encoding to a dataframe.
+        If a target is given, the label mean is calculated without the current row's
+        target (left out). Otherwise, the label mean from the fit step is taken.
+        """
         random_state_ = check_random_state(self.random_state)

         for col, colmap in mapping.items():
             level_notunique = colmap['count'] > 1

             unique_train = colmap.index
-            unseen_values = pd.Series([x for x in X[col].unique() if x not in unique_train], dtype=unique_train.dtype)
+            unseen_values = pd.Series(
+                [x for x in X[col].unique() if x not in unique_train], dtype=unique_train.dtype
+            )

             is_nan = X[col].isna()
             is_unknown_value = X[col].isin(unseen_values.dropna().astype(object))

-            if X[col].dtype.name == 'category':  # Pandas 0.24 tries hard to preserve categorical data type
+            if (
+                X[col].dtype.name == 'category'
+            ):  # Pandas 0.24 tries hard to preserve categorical data type
                 index_dtype = X[col].dtype.categories.dtype
                 X[col] = X[col].astype(index_dtype)

             if self.handle_unknown == 'error' and is_unknown_value.any():
                 raise ValueError('Columns to be encoded can not contain new values')

-            if y is None:  # Replace level with its mean target; if level occurs only once, use global mean
+            if (
+                y is None
+            ):  # Replace level with its mean target; if level occurs only once, use global mean
                 level_means = (colmap['sum'] / colmap['count']).where(level_notunique, self._mean)
                 X[col] = X[col].map(level_means)
-            else:  # Replace level with its mean target, calculated excluding this row's target
+            else:  # Replace level with its mean target, calculated excluding this row's target
                 # The y (target) mean for this level is normally just the sum/count;
                 # excluding this row's y, it's (sum - y) / (count - 1)
                 level_means = (X[col].map(colmap['sum']) - y) / (X[col].map(colmap['count']) - 1)
                 # The 'where' fills in singleton levels (count = 1 -> div by 0) with the global mean
-                X[col] = level_means.where(X[col].map(colmap['count'][level_notunique]).notna(), self._mean)
+                X[col] = level_means.where(
+                    X[col].map(colmap['count'][level_notunique]).notna(), self._mean
+                )

             if self.handle_unknown == 'value':
                 X.loc[is_unknown_value, col] = self._mean
@@ -174,6 +211,6 @@ def transform_leave_one_out(self, X, y, mapping=None):
             X.loc[is_nan, col] = np.nan

             if self.sigma is not None and y is not None:
-                X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0])
+                X[col] = X[col] * random_state_.normal(1.0, self.sigma, X[col].shape[0])

         return X
diff --git a/category_encoders/m_estimate.py b/category_encoders/m_estimate.py
index f195e07f..d6862c5a 100644
--- a/category_encoders/m_estimate.py
+++ 
b/category_encoders/m_estimate.py @@ -1,25 +1,27 @@ -"""M-probability estimate""" +"""M-probability estimate.""" + import numpy as np -from category_encoders.ordinal import OrdinalEncoder -import category_encoders.utils as util from sklearn.utils.random import check_random_state +import category_encoders.utils as util +from category_encoders.ordinal import OrdinalEncoder + __author__ = 'Jan Motl' class MEstimateEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): """M-probability estimate of likelihood. - Supported targets: binomial and continuous. For polynomial target support, see PolynomialWrapper. + Supported targets: binomial and continuous. + For polynomial target support, see PolynomialWrapper. - This is a simplified version of target encoder, which goes under names like m-probability estimate or - additive smoothing with known incidence rates. In comparison to target encoder, m-probability estimate - has only one tunable parameter (`m`), while target encoder has two tunable parameters (`min_samples_leaf` - and `smoothing`). + This is a simplified version of target encoder, which goes under names like m-probability + estimate or additive smoothing with known incidence rates. In comparison to target encoder, + m-probability estimate has only one tunable parameter (`m`), while target encoder has two + tunable parameters (`min_samples_leaf` and `smoothing`). Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -27,26 +29,38 @@ class MEstimateEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): drop_invariant: bool boolean for whether or not to drop encoded columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_missing: str - options are 'return_nan', 'error' and 'value', defaults to 'value', which returns the prior probability. + options are 'return_nan', 'error' and 'value', defaults to 'value', + which returns the prior probability. handle_unknown: str - options are 'return_nan', 'error' and 'value', defaults to 'value', which returns the prior probability. + options are 'return_nan', 'error' and 'value', defaults to 'value', + which returns the prior probability. randomized: bool, - adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched). + adds normal (Gaussian) distribution noise into training data in order to decrease + overfitting (testing data are untouched). sigma: float standard deviation (spread or "width") of the normal distribution. m: float - this is the "m" in the m-probability estimate. Higher value of m results into stronger shrinking. - M is non-negative. + this is the "m" in the m-probability estimate. Higher value of m results into stronger + shrinking. M is non-negative. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... 
] >>> y = bunch.target > 200000 >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = MEstimateEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -55,11 +69,11 @@ class MEstimateEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 7 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 5 Heating 1460 non-null float64 @@ -71,8 +85,8 @@ class MEstimateEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): References ---------- - .. [1] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems, equation 7, from - https://dl.acm.org/citation.cfm?id=507538 + .. [1] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification + and Prediction Problems, equation 7, from https://dl.acm.org/citation.cfm?id=507538 .. [2] On estimating probabilities in tree pruning, equation 1, from https://link.springer.com/chapter/10.1007/BFb0017010 @@ -81,13 +95,31 @@ class MEstimateEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): https://en.wikipedia.org/wiki/Additive_smoothing#Generalized_to_the_case_of_known_incidence_rates """ + prefit_ordinal = True encoding_relation = util.EncodingRelation.ONE_TO_ONE - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, - handle_unknown='value', handle_missing='value', random_state=None, randomized=False, sigma=0.05, m=1.0): - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown=handle_unknown, handle_missing=handle_missing) + def __init__( + self, + verbose=0, + cols=None, + drop_invariant=False, + return_df=True, + handle_unknown='value', + handle_missing='value', + random_state=None, + randomized=False, + sigma=0.05, + m=1.0, + ): + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown=handle_unknown, + handle_missing=handle_missing, + ) self.ordinal_encoder = None self.mapping = None self._sum = None @@ -98,12 +130,8 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, self.m = m def _fit(self, X, y, **kwargs): - self.ordinal_encoder = OrdinalEncoder( - verbose=self.verbose, - cols=self.cols, - handle_unknown='value', - handle_missing='value' + verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) @@ -122,9 +150,10 @@ def _transform(self, X, y=None): X = self._score(X, y) return X - def _more_tags(self): + def _more_tags(self) -> dict[str, bool]: + """Set scikit transformer tags.""" tags = super()._more_tags() - tags["predict_depends_on_y"] = True + tags['predict_depends_on_y'] = True return tags def _train(self, X, y): @@ -134,7 +163,7 @@ def _train(self, X, y): # Calculate global statistics self._sum = y.sum() self._count = y.count() - prior = self._sum/self._count + prior = self._sum / self._count for switch in self.ordinal_encoder.category_mapping: col = switch.get('col') @@ -172,6 +201,6 @@ def _score(self, X, y): # Randomization is meaningful only for training data -> we do it only if y 
is present if self.randomized and y is not None: random_state_generator = check_random_state(self.random_state) - X[col] = (X[col] * random_state_generator.normal(1., self.sigma, X[col].shape[0])) + X[col] = X[col] * random_state_generator.normal(1.0, self.sigma, X[col].shape[0]) return X diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py index 06efe531..7726402a 100644 --- a/category_encoders/one_hot.py +++ b/category_encoders/one_hot.py @@ -1,29 +1,34 @@ -"""One-hot or dummy coding""" +"""One-hot or dummy coding.""" + +import warnings + import numpy as np import pandas as pd -import warnings -from category_encoders.ordinal import OrdinalEncoder + import category_encoders.utils as util +from category_encoders.ordinal import OrdinalEncoder __author__ = 'willmcginnis' class OneHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): - """Onehot (or dummy) coding for categorical features, produces one feature per category, each binary. + """Onehot (or dummy) coding for categorical features, produces a binary feature per category. Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list a list of columns to encode, if None, all string columns will be encoded. drop_invariant: bool - boolean for whether or not to drop columns with 0 variance. + boolean for whether to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). use_cat_names: bool - if True, category values will be included in the encoded column names. Since this can result in duplicate column names, duplicates are suffixed with '#' symbol until a unique name is generated. + if True, category values will be included in the encoded column names. + Since this can result in duplicate column names, duplicates are suffixed with '#' + symbol until a unique name is generated. If False, category indices will be used instead of the category values. handle_unknown: str options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. @@ -40,16 +45,25 @@ class OneHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): 'value' will encode a missing value as 0 in every dummy column. 'indicator' will treat missingness as its own category, adding an additional dummy column (whether there are missing values in the training set or not). - 'ignore' will encode missing values as 0 in every dummy column, NOT adding an additional category. - + 'ignore' will encode missing values as 0 in every dummy column, + NOT adding an additional category. + Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... 
] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = OneHotEncoder(cols=['CentralAir', 'Heating'], handle_unknown='indicator').fit(X, y) @@ -58,23 +72,23 @@ class OneHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 15 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 - 5 Heating_1 1460 non-null int64 - 6 Heating_2 1460 non-null int64 - 7 Heating_3 1460 non-null int64 - 8 Heating_4 1460 non-null int64 - 9 Heating_5 1460 non-null int64 - 10 Heating_6 1460 non-null int64 - 11 Heating_-1 1460 non-null int64 - 12 CentralAir_1 1460 non-null int64 - 13 CentralAir_2 1460 non-null int64 - 14 CentralAir_-1 1460 non-null int64 + 5 Heating_1 1460 non-null int64 + 6 Heating_2 1460 non-null int64 + 7 Heating_3 1460 non-null int64 + 8 Heating_4 1460 non-null int64 + 9 Heating_5 1460 non-null int64 + 10 Heating_6 1460 non-null int64 + 11 Heating_-1 1460 non-null int64 + 12 CentralAir_1 1460 non-null int64 + 13 CentralAir_2 1460 non-null int64 + 14 CentralAir_-1 1460 non-null int64 dtypes: float64(4), int64(10), object(1) memory usage: 171.2+ KB None @@ -87,21 +101,37 @@ class OneHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): .. [2] Gregory Carey (2003). Coding Categorical Variables, from http://ibgwww.colorado.edu/~carey/p5741ndir/Coding_Categorical_Variables.pdf - + """ + prefit_ordinal = True encoding_relation = util.EncodingRelation.ONE_TO_N_UNIQUE - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, - handle_missing='value', handle_unknown='value', use_cat_names=False): - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown=handle_unknown, handle_missing=handle_missing) - self.mapping = None + def __init__( + self, + verbose: int = 0, + cols: list[str] | None = None, + drop_invariant: bool = False, + return_df: bool = True, + handle_missing: str = 'value', + handle_unknown: str = 'value', + use_cat_names: bool = False, + ): + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown=handle_unknown, + handle_missing=handle_missing, + ) + self.mapping: list[dict[str, pd.DataFrame]] | None = None self.ordinal_encoder = None self.use_cat_names = use_cat_names @property - def category_mapping(self): + def category_mapping(self) -> list[dict[str, pd.DataFrame]] | None: + """Return the mapping.""" return self.mapping def _fit(self, X, y=None, **kwargs): @@ -110,7 +140,7 @@ def _fit(self, X, y=None, **kwargs): 'return_nan': 'return_nan', 'value': 'value', 'indicator': 'return_nan', - 'ignore': 'return_nan' + 'ignore': 'return_nan', }[self.handle_missing] self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, @@ -121,7 +151,14 @@ def _fit(self, X, y=None, **kwargs): self.ordinal_encoder = self.ordinal_encoder.fit(X) self.mapping = self.generate_mapping() - def generate_mapping(self): + def generate_mapping(self) -> list[dict[str, str | pd.DataFrame]] | None: + """Generate one-hot-encoding mapping. + + The mapping is a list [ + {'col': '', 'mapping': mapping_df}, + ... 
+ ] + """ mapping = [] found_column_counts = {} @@ -146,18 +183,18 @@ def generate_mapping(self): append_nan_to_index = class_ continue if self.use_cat_names: - n_col_name = f"{col}_{cat_name}" + n_col_name = f'{col}_{cat_name}' found_count = found_column_counts.get(n_col_name, 0) found_column_counts[n_col_name] = found_count + 1 n_col_name += '#' * found_count else: - n_col_name = f"{col}_{class_}" + n_col_name = f'{col}_{class_}' index.append(class_) new_columns.append(n_col_name) if self.handle_unknown == 'indicator': - n_col_name = f"{col}_-1" + n_col_name = f'{col}_-1' if self.use_cat_names: found_count = found_column_counts.get(n_col_name, 0) found_column_counts[n_col_name] = found_count + 1 @@ -178,7 +215,7 @@ def generate_mapping(self): if self.handle_missing == 'return_nan': base_df.loc[-2] = np.nan - elif self.handle_missing in ['value','ignore']: + elif self.handle_missing in ['value', 'ignore']: base_df.loc[-2] = 0 mapping.append({'col': col, 'mapping': base_df}) @@ -195,7 +232,7 @@ def _transform(self, X): X = self.get_dummies(X) return X - def inverse_transform(self, X_in): + def inverse_transform(self, X_in: util.X_type) -> pd.DataFrame | np.ndarray: """ Perform the inverse transformation to encoded data. @@ -208,7 +245,6 @@ def inverse_transform(self, X_in): p: array, the same size of X_in """ - # fail fast if self._dim is None: raise ValueError('Must train encoder before it can be used to inverse_transform data') @@ -221,8 +257,10 @@ def inverse_transform(self, X_in): # then make sure that it is the right size if X.shape[1] != self._dim: if self.drop_invariant: - raise ValueError(f"Unexpected input dimension {X.shape[1]}, the attribute drop_invariant should " - "be False when transforming the data") + raise ValueError( + f'Unexpected input dimension {X.shape[1]}, the attribute drop_invariant should ' + 'be False when transforming the data' + ) else: raise ValueError(f'Unexpected input dimension {X.shape[1]}, expected {self._dim}') @@ -237,14 +275,17 @@ def inverse_transform(self, X_in): if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan': for col in self.cols: if X[switch.get('col')].isna().any(): - warnings.warn("inverse_transform is not supported because transform impute " - f"the unknown category nan when encode {col}") + warnings.warn( + 'inverse_transform is not supported because transform imputes ' + f'the unknown category nan when encoding {col}', + stacklevel=4, + ) return X if self.return_df else X.to_numpy() - def get_dummies(self, X_in): + def get_dummies(self, X_in: pd.DataFrame) -> pd.DataFrame: """ - Convert numerical variable into dummy variables + Convert numerical variable into dummy variables. Parameters ---------- @@ -255,7 +296,6 @@ def get_dummies(self, X_in): dummies : DataFrame """ - X = X_in.copy(deep=True) cols = X.columns.tolist() @@ -269,21 +309,25 @@ def get_dummies(self, X_in): X = pd.concat([base_df, X], axis=1) old_column_index = cols.index(col) - cols[old_column_index: old_column_index + 1] = mod.columns + cols[old_column_index : old_column_index + 1] = mod.columns X = X.reindex(columns=cols) return X - def reverse_dummies(self, X, mapping): + @staticmethod + def reverse_dummies( + X: pd.DataFrame, mapping: list[dict[str, str | pd.DataFrame]] + ) -> pd.DataFrame: """ - Convert dummy variable into numerical variables + Convert dummy variable into numerical variables.
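For orientation, a short round-trip sketch of the dummy helpers above (illustrative data, default settings assumed):

```python
# One-hot encode a column, then map the dummy columns back to the labels.
import pandas as pd
from category_encoders import OneHotEncoder

X = pd.DataFrame({'fuel': ['gas', 'diesel', 'gas', 'electric']})
enc = OneHotEncoder(cols=['fuel'], use_cat_names=True)
dummies = enc.fit_transform(X)  # adds fuel_gas, fuel_diesel, fuel_electric
restored = enc.inverse_transform(dummies)
print(restored['fuel'].tolist())  # ['gas', 'diesel', 'gas', 'electric']
```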
Parameters ---------- X : DataFrame mapping: list-like - Contains mappings of column to be transformed to it's new columns and value represented + Contains mappings of column to be transformed to its new columns and + value represented. Returns ------- diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py index 717611bb..4237d447 100644 --- a/category_encoders/ordinal.py +++ b/category_encoders/ordinal.py @@ -1,10 +1,13 @@ -"""Ordinal or label encoding""" + +"""Ordinal or label encoding.""" + +from __future__ import annotations + +import warnings import numpy as np import pandas as pd + import category_encoders.utils as util -import warnings -from typing import Dict, List, Union __author__ = 'willmcginnis' @@ -12,13 +15,13 @@ class OrdinalEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): """Encodes categorical features as ordinal, in one ordered feature. - Ordinal encoding uses a single column of integers to represent the classes. An optional mapping dict can be passed - in; in this case, we use the knowledge that there is some true order to the classes themselves. Otherwise, the classes + Ordinal encoding uses a single column of integers to represent the classes. + An optional mapping dict can be passed in; in this case, we use the knowledge that there is + some true order to the classes themselves. Otherwise, the classes are assumed to have no true order and integers are selected at random. Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -26,20 +29,24 @@ class OrdinalEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). mapping: list of dicts a mapping of class to label to use for the encoding, optional. the dict contains the keys 'col' and 'mapping'. the value of 'col' should be the feature name. - the value of 'mapping' should be a dictionary or pd.Series of 'original_label' to 'encoded_label'. + the value of 'mapping' should be a dictionary or pd.Series of 'original_label' to + 'encoded_label'. example mapping: [ {'col': 'col1', 'mapping': {None: 0, 'a': 1, 'b': 2}}, {'col': 'col2', 'mapping': {None: 0, 'x': 1, 'y': 2}} ] handle_unknown: str - options are 'error', 'return_nan' and 'value', defaults to 'value', which will impute the category -1. + options are 'error', 'return_nan' and 'value', defaults to 'value', + which will impute the category -1. handle_missing: str - options are 'error', 'return_nan', and 'value, default to 'value', which treat nan as a category at fit time, + options are 'error', 'return_nan', and 'value', defaults to 'value', + which treats nan as a category at fit time, or -2 at transform time if nan is not a category during fit. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 
'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... ] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = OrdinalEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -57,15 +72,15 @@ class OrdinalEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 7 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 - 5 Heating 1460 non-null int64 - 6 CentralAir 1460 non-null int64 + 5 Heating 1460 non-null int64 + 6 CentralAir 1460 non-null int64 dtypes: float64(4), int64(2), object(1) memory usage: 80.0+ KB None @@ -79,23 +94,39 @@ class OrdinalEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): .. [2] Gregory Carey (2003). Coding Categorical Variables, from http://ibgwww.colorado.edu/~carey/p5741ndir/Coding_Categorical_Variables.pdf """ + prefit_ordinal = False encoding_relation = util.EncodingRelation.ONE_TO_ONE - def __init__(self, verbose=0, mapping=None, cols=None, drop_invariant=False, return_df=True, - handle_unknown='value', handle_missing='value'): - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown=handle_unknown, handle_missing=handle_missing) + def __init__( + self, + verbose: int = 0, + mapping: list[dict[str, str | dict | pd.Series]] | None = None, + cols: list[str] = None, + drop_invariant: bool = False, + return_df: bool = True, + handle_unknown: str = 'value', + handle_missing: str = 'value', + ): + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown=handle_unknown, + handle_missing=handle_missing, + ) self.mapping_supplied = mapping is not None if self.mapping_supplied: mapping = self._validate_supplied_mapping(mapping) self.mapping = mapping @property - def category_mapping(self): + def category_mapping(self) -> list[dict[str, str | dict | pd.Series]] | None: + """The underlying category mapping.""" return self.mapping - def _fit(self, X, y=None, **kwargs): + def _fit(self, X: pd.DataFrame, y: pd.Series | None = None, **kwargs) -> None: # reset mapping in case of refit if not self.mapping_supplied: self.mapping = None @@ -104,26 +135,26 @@ def _fit(self, X, y=None, **kwargs): mapping=self.mapping, cols=self.cols, handle_unknown=self.handle_unknown, - handle_missing=self.handle_missing + handle_missing=self.handle_missing, ) self.mapping = categories - def _transform(self, X): - + def _transform(self, X: pd.DataFrame) -> pd.DataFrame: X, _ = self.ordinal_encoding( X, mapping=self.mapping, cols=self.cols, handle_unknown=self.handle_unknown, - handle_missing=self.handle_missing + handle_missing=self.handle_missing, ) return X - def inverse_transform(self, X_in): - """ - Perform the inverse transformation to encoded data. Will attempt best case reconstruction, which means - it will return nan for handle_missing and handle_unknown settings that break the bijection. We issue - warnings when some of those cases occur. + def inverse_transform(self, X_in: util.X_type) -> pd.DataFrame | np.ndarray: + """Perform the inverse transformation to encoded data. 
+ + Will attempt best case reconstruction, which means it will return nan for handle_missing + and handle_unknown settings that break the bijection. + We issue warnings when some of those cases occur. Parameters ---------- @@ -134,7 +165,6 @@ def inverse_transform(self, X_in): p: array, the same size of X_in """ - # fail fast if self._dim is None: raise ValueError('Must train encoder before it can be used to inverse_transform data') @@ -145,8 +175,10 @@ def inverse_transform(self, X_in): # then make sure that it is the right size if X.shape[1] != self._dim: if self.drop_invariant: - raise ValueError(f"Unexpected input dimension {X.shape[1]}, the attribute drop_invariant should " - "be False when transforming the data") + raise ValueError( + f'Unexpected input dimension {X.shape[1]}, the attribute drop_invariant should ' + 'be False when transforming the data' + ) else: raise ValueError(f'Unexpected input dimension {X.shape[1]}, expected {self._dim}') @@ -156,14 +188,20 @@ def inverse_transform(self, X_in): if self.handle_unknown == 'value': for col in self.cols: if any(X[col] == -1): - warnings.warn("inverse_transform is not supported because transform impute " - f"the unknown category -1 when encode {col}") + warnings.warn( + 'inverse_transform is not supported because transform imputes ' + f'the unknown category -1 when encoding {col}', + stacklevel=4, + ) if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan': for col in self.cols: if X[col].isna().any(): - warnings.warn("inverse_transform is not supported because transform impute " - f"the unknown category nan when encode {col}") + warnings.warn( + 'inverse_transform is not supported because transform imputes ' + f'the unknown category nan when encoding {col}', + stacklevel=4, + ) for switch in self.mapping: column_mapping = switch.get('mapping') @@ -173,13 +211,20 @@ def inverse_transform(self, X_in): return X if self.return_df else X.to_numpy() @staticmethod - def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', handle_missing='value'): + def ordinal_encoding( + X_in: pd.DataFrame, + mapping: list[dict[str, str | dict | pd.Series]] | None = None, + cols: list[str] = None, + handle_unknown: str = 'value', + handle_missing: str = 'value', + ) -> tuple[pd.DataFrame, list[dict]]: + """Ordinal encoding uses a single column of integers to represent the classes. + + An optional mapping dict can be passed in, in this case we use the knowledge that there + is some true order to the classes themselves. + Otherwise, the classes are assumed to have no true order and integers are selected + at random. """ - Ordinal encoding uses a single column of integers to represent the classes. An optional mapping dict can be passed - in, in this case we use the knowledge that there is some true order to the classes themselves. Otherwise, the classes - are assumed to have no true order and integers are selected at random. 
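A minimal sketch of the mapping format accepted here and in the constructor (hypothetical column and labels):

```python
import pandas as pd
from category_encoders import OrdinalEncoder

X = pd.DataFrame({'size': ['small', 'large', 'medium', 'small']})
# Supply an explicit order; without a mapping, codes follow order of appearance.
mapping = [{'col': 'size', 'mapping': {None: 0, 'small': 1, 'medium': 2, 'large': 3}}]
enc = OrdinalEncoder(cols=['size'], mapping=mapping)
print(enc.fit_transform(X)['size'].tolist())  # [1, 3, 2, 1]
```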
- """ - return_nan_series = pd.Series(data=[np.nan], index=[-2]) X = X_in.copy(deep=True) @@ -197,16 +242,16 @@ def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', hand # fillna changes None and pd.NA to np.nan try: with pd.option_context('future.no_silent_downcasting', True): - X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping) + X[column] = X[column].astype('object').fillna(np.nan).map(col_mapping) except pd._config.config.OptionError: # old pandas versions - X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping) + X[column] = X[column].astype('object').fillna(np.nan).map(col_mapping) if util.is_category(X[column].dtype): nan_identity = col_mapping.loc[col_mapping.index.isna()].array[0] X[column] = X[column].cat.add_categories(nan_identity) X[column] = X[column].fillna(nan_identity) try: X[column] = X[column].astype(int) - except ValueError as e: + except ValueError: X[column] = X[column].astype(float) if handle_unknown == 'value': @@ -232,7 +277,9 @@ def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', hand if util.is_category(X[col].dtype): # Avoid using pandas category dtype meta-data if possible, see #235, #238. if X[col].dtype.ordered: - category_set = set(categories) # convert to set for faster membership checks c.f. #407 + category_set = set( + categories + ) # convert to set for faster membership checks c.f. #407 categories = [c for c in X[col].dtype.categories if c in category_set] if X[col].isna().any(): categories += [np.nan] @@ -246,31 +293,39 @@ def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', hand elif handle_missing == 'return_nan': data.loc[nan_identity] = -2 - mapping_out.append({'col': col, 'mapping': data, 'data_type': X[col].dtype}, ) + mapping_out.append( + {'col': col, 'mapping': data, 'data_type': X[col].dtype}, ) return X, mapping_out - def _validate_supplied_mapping(self, supplied_mapping: List[Dict[str, Union[str, Dict, pd.Series]]]) -> List[Dict[str, Union[str, pd.Series]]]: + def _validate_supplied_mapping( + self, supplied_mapping: list[dict[str, str | dict | pd.Series]] + ) -> list[dict[str, str | pd.Series]]: """ validate the supplied mapping and convert the actual mapping per column to a pandas series. - :param supplied_mapping: mapping as list of dicts. They actual mapping can be either a dict or pd.Series - :return: the mapping with all actual mappings being pandas series + + :param supplied_mapping: mapping as list of dicts. + The actual mapping can be either a dict or pd.Series + :return: the mapping with all actual mappings being pandas series. """ - msg = "Invalid supplied mapping, must be of type List[Dict[str, Union[Dict, pd.Series]]]." \ "For an example refer to the documentation" + msg = ( + 'Invalid supplied mapping, must be of type List[Dict[str, Union[Dict, pd.Series]]]. ' 
+ 'For an example refer to the documentation' + ) if not isinstance(supplied_mapping, list): raise ValueError(msg) for mapping_el in supplied_mapping: if not isinstance(mapping_el, dict): raise ValueError(msg) - if "col" not in mapping_el: + if 'col' not in mapping_el: raise KeyError("Mapping must contain a key 'col' for each column to encode") - if "mapping" not in mapping_el: + if 'mapping' not in mapping_el: raise KeyError("Mapping must contain a key 'mapping' for each column to encode") - mapping = mapping_el["mapping"] + mapping = mapping_el['mapping'] if isinstance(mapping_el, dict): # convert to dict in order to standardise - mapping_el["mapping"] = pd.Series(mapping) - if "data_type" not in mapping_el: - mapping_el["data_type"] = mapping_el["mapping"].index.dtype + mapping_el['mapping'] = pd.Series(mapping) + if 'data_type' not in mapping_el: + mapping_el['data_type'] = mapping_el['mapping'].index.dtype return supplied_mapping diff --git a/category_encoders/polynomial.py b/category_encoders/polynomial.py index bcef1228..1dd94552 100644 --- a/category_encoders/polynomial.py +++ b/category_encoders/polynomial.py @@ -1,7 +1,7 @@ -"""Polynomial contrast coding""" +"""Polynomial contrast coding.""" -from patsy.contrasts import ContrastMatrix, Poly import numpy as np +from patsy.contrasts import ContrastMatrix, Poly from category_encoders.base_contrast_encoder import BaseContrastEncoder @@ -13,7 +13,6 @@ class PolynomialEncoder(BaseContrastEncoder): Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -21,23 +20,33 @@ class PolynomialEncoder(BaseContrastEncoder): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. + The default is 'value'. Warning: if indicator is used, + an extra column will be added in if the transform matrix has unknown categories. + This can cause unexpected changes in dimension in some cases. handle_missing: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has nan values. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform matrix + has nan values. This can cause unexpected changes in dimension in some cases. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... 
] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = PolynomialEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -46,12 +55,12 @@ class PolynomialEncoder(BaseContrastEncoder): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 12 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 intercept 1460 non-null int64 + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 intercept 1460 non-null int64 1 Id 1460 non-null float64 2 MSSubClass 1460 non-null float64 - 3 MSZoning 1460 non-null object + 3 MSZoning 1460 non-null object 4 LotFrontage 1201 non-null float64 5 YearBuilt 1460 non-null float64 6 Heating_0 1460 non-null float64 @@ -72,7 +81,9 @@ class PolynomialEncoder(BaseContrastEncoder): .. [2] Gregory Carey (2003). Coding Categorical Variables, from http://ibgwww.colorado.edu/~carey/p5741ndir/Coding_Categorical_Variables.pdf - + """ + def get_contrast_matrix(self, values_to_encode: np.array) -> ContrastMatrix: + """Get the contrast matrix for the polynomial encoder.""" return Poly().code_without_intercept(values_to_encode) diff --git a/category_encoders/quantile_encoder.py b/category_encoders/quantile_encoder.py index 50844b8c..a5df8118 100644 --- a/category_encoders/quantile_encoder.py +++ b/category_encoders/quantile_encoder.py @@ -1,10 +1,13 @@ -"""Quantile Encoder""" -__author__ = "david26694", "cmougan" +"""Quantile Encoder.""" + +from __future__ import annotations + +__author__ = 'david26694', 'cmougan' -from functools import reduce import operator -from typing import List import warnings +from functools import reduce +from typing import Sequence import numpy as np import pandas as pd @@ -25,44 +28,55 @@ class QuantileEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. quantile: float float indicating statistical quantile. ´0.5´ for median. m: float - this is the “m” in the m-probability estimate. Higher value of m results into stronger shrinking. M is non-negative. 0 for no smoothing. + this is the “m” in the m-probability estimate. Higher value of m results into + stronger shrinking. M is non-negative. 0 for no smoothing. cols: list a list of columns to encode, if None, all string columns will be encoded. drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_missing: str - options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target quantile. + options are 'error', 'return_nan' and 'value', defaults to 'value', + which returns the target quantile. handle_unknown: str - options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target quantile. + options are 'error', 'return_nan' and 'value', defaults to 'value', + which returns the target quantile. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 
'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... ] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] - >>> enc = QuantileEncoder(cols=["CentralAir", "Heating"], quantile=0.5, m=1.0).fit(X, y) + >>> enc = QuantileEncoder(cols=['CentralAir', 'Heating'], quantile=0.5, m=1.0).fit(X, y) >>> numeric_dataset = enc.transform(X) >>> print(numeric_dataset.info()) RangeIndex: 1460 entries, 0 to 1459 Data columns (total 7 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 5 Heating 1460 non-null float64 @@ -74,88 +88,113 @@ class QuantileEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): References ---------- - .. [1] Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems, https://link.springer.com/chapter/10.1007%2F978-3-030-85529-1_14 - .. [2] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems, equation 7, from https://dl.acm.org/citation.cfm?id=507538 - .. [3] On estimating probabilities in tree pruning, equation 1, from https://link.springer.com/chapter/10.1007/BFb0017010 + .. [1] Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems, + https://link.springer.com/chapter/10.1007%2F978-3-030-85529-1_14 + .. [2] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification + and Prediction Problems, equation 7, from https://dl.acm.org/citation.cfm?id=507538 + .. [3] On estimating probabilities in tree pruning, equation 1, + from https://link.springer.com/chapter/10.1007/BFb0017010 .. [4] Additive smoothing, from https://en.wikipedia.org/wiki/Additive_smoothing#Generalized_to_the_case_of_known_incidence_rates .. 
[5] Target encoding done the right way https://maxhalford.github.io/blog/target-encoding/ """ + prefit_ordinal = True encoding_relation = util.EncodingRelation.ONE_TO_ONE def __init__( self, - verbose=0, - cols=None, - drop_invariant=False, - return_df=True, - handle_missing="value", - handle_unknown="value", - quantile=0.5, - m=1.0, + verbose: int = 0, + cols: list[str] = None, + drop_invariant: bool = False, + return_df: bool = True, + handle_missing: str = 'value', + handle_unknown: str = 'value', + quantile: float = 0.5, + m: float = 1.0, ): - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown=handle_unknown, handle_missing=handle_missing) + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown=handle_unknown, + handle_missing=handle_missing, + ) self.ordinal_encoder = None self.mapping = None self.quantile = quantile self.m = m - def _fit(self, X, y, **kwargs): + def _fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> None: y = y.astype(float) self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown="value", - handle_missing="value", + handle_unknown='value', + handle_missing='value', ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) self.mapping = self.fit_quantile_encoding(X_ordinal, y) - def fit_quantile_encoding(self, X, y): + def fit_quantile_encoding(self, X: pd.DataFrame, y: pd.Series) -> dict[str, pd.Series]: + """Calculate the quantile encoding mapping. + + Parameters + ---------- + X: training data. + y: target data. + + Returns + ------- + mapping col-name -> series with category-label -> quantile mapping. + """ mapping = {} # Calculate global statistics prior = np.quantile(y, self.quantile) for switch in self.ordinal_encoder.category_mapping: - col = switch.get("col") - values = switch.get("mapping") + col = switch.get('col') + values = switch.get('mapping') - # Calculate sum, count and quantile of the target for each unique value in the feature col - stats = y.groupby(X[col]).agg([lambda x: np.quantile(x, self.quantile), "sum", "count"]) - stats.columns = ["quantile", "sum", "count"] + # Calculate sum, count and quantile of the target for each unique value + # in the feature col + stats = y.groupby(X[col]).agg([lambda x: np.quantile(x, self.quantile), 'sum', 'count']) + stats.columns = ['quantile', 'sum', 'count'] # Calculate the m-probability estimate of the quantile - estimate = (stats["count"] * stats["quantile"] + prior * self.m) / (stats["count"] + self.m) + estimate = (stats['count'] * stats['quantile'] + prior * self.m) / ( + stats['count'] + self.m + ) - if self.handle_unknown == "return_nan": + if self.handle_unknown == 'return_nan': estimate.loc[-1] = np.nan - elif self.handle_unknown == "value": + elif self.handle_unknown == 'value': estimate.loc[-1] = prior - if self.handle_missing == "return_nan": + if self.handle_missing == 'return_nan': estimate.loc[values.loc[np.nan]] = np.nan - elif self.handle_missing == "value": + elif self.handle_missing == 'value': estimate.loc[-2] = prior mapping[col] = estimate return mapping - def _transform(self, X, y=None): + def _transform(self, X: pd.DataFrame, y: pd.Series | None = None): X = self.ordinal_encoder.transform(X) - if self.handle_unknown == "error": + if self.handle_unknown == 'error': if X[self.cols].isin([-1]).any().any(): - raise ValueError("Unexpected categories found in dataframe") + raise 
ValueError('Unexpected categories found in dataframe') X = self.quantile_encode(X) return X - def quantile_encode(self, X_in): + def quantile_encode(self, X_in: pd.DataFrame) -> pd.DataFrame: + """Apply quantile encoding.""" X = X_in.copy(deep=True) for col in self.cols: @@ -168,7 +207,8 @@ def quantile_encode(self, X_in): class SummaryEncoder(BaseEstimator, util.TransformerWithTargetMixin): """Summary Encoding for categorical features. - It's an encoder designed for creating richer representations by applying quantile encoding for a set of quantiles. + It's an encoder designed for creating richer representations by applying quantile + encoding for a set of quantiles. Parameters ---------- @@ -177,38 +217,50 @@ class SummaryEncoder(BaseEstimator, util.TransformerWithTargetMixin): quantiles: list list of floats indicating the statistical quantiles. Each element represent a column m: float - this is the “m” in the m-probability estimate. Higher value of m results into stronger shrinking. M is non-negative. 0 for no smoothing. + this is the “m” in the m-probability estimate. Higher value of m results into stronger + shrinking. M is non-negative. 0 for no smoothing. cols: list a list of columns to encode, if None, all string columns will be encoded. drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_missing: str - options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target quantile. + options are 'error', 'return_nan' and 'value', defaults to 'value', + which returns the target quantile. handle_unknown: str - options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target quantile. + options are 'error', 'return_nan' and 'value', defaults to 'value', + which returns the target quantile. Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... ] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] - >>> enc = SummaryEncoder(cols=["CentralAir", "Heating"], quantiles=[0.25, 0.5, 0.75]).fit(X, y) + >>> enc = SummaryEncoder(cols=['CentralAir', 'Heating'], quantiles=[0.25, 0.5, 0.75]).fit(X, y) >>> numeric_dataset = enc.transform(X) >>> print(numeric_dataset.info()) RangeIndex: 1460 entries, 0 to 1459 Data columns (total 11 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 5 Heating_25 1460 non-null float64 @@ -223,31 +275,37 @@ class SummaryEncoder(BaseEstimator, util.TransformerWithTargetMixin): References ---------- - .. 
[1] Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems, https://link.springer.com/chapter/10.1007%2F978-3-030-85529-1_14 - .. [2] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems, equation 7, from https://dl.acm.org/citation.cfm?id=507538 - .. [3] On estimating probabilities in tree pruning, equation 1, from https://link.springer.com/chapter/10.1007/BFb0017010 + .. [1] Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems, + https://link.springer.com/chapter/10.1007%2F978-3-030-85529-1_14 + .. [2] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification + and Prediction Problems, equation 7, from https://dl.acm.org/citation.cfm?id=507538 + .. [3] On estimating probabilities in tree pruning, equation 1, + from https://link.springer.com/chapter/10.1007/BFb0017010 .. [4] Additive smoothing, from https://en.wikipedia.org/wiki/Additive_smoothing#Generalized_to_the_case_of_known_incidence_rates .. [5] Target encoding done the right way https://maxhalford.github.io/blog/target-encoding/ """ + encoding_relation = util.EncodingRelation.ONE_TO_M def __init__( self, - verbose=0, - cols=None, - drop_invariant=False, - return_df=True, - handle_missing="value", - handle_unknown="value", - quantiles=(0.25, 0.75), - m=1.0, + verbose: int = 0, + cols: list[str] = None, + drop_invariant: bool = False, + return_df: bool = True, + handle_missing: str = 'value', + handle_unknown: str = 'value', + quantiles: Sequence[float] = (0.25, 0.75), + m: float = 1.0, ): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.cols = cols - self.use_default_cols = cols is None # if True, even a repeated call of fit() will select string columns from X + self.use_default_cols = ( + cols is None + ) # if True, even a repeated call of fit() will select string columns from X self.ordinal_encoder = None self._dim = None self.mapping = None @@ -257,12 +315,11 @@ def __init__( self.m = m self.encoder_list = None - def fit(self, X, y): + def fit(self, X: util.X_type, y: util.y_type) -> SummaryEncoder: """Fits the encoder according to X and y by fitting the individual encoders. Parameters ---------- - X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. @@ -271,7 +328,6 @@ def fit(self, X, y): Returns ------- - self : encoder Returns self. @@ -287,7 +343,7 @@ def fit(self, X, y): rounded_percentiles = [round(quantile * 100) for quantile in self.quantiles] if len(rounded_percentiles) != len(set(rounded_percentiles)): - raise ValueError("There are two quantiles that belong to the same rounded percentile") + raise ValueError('There are two quantiles that belong to the same rounded percentile') encoder_list = [] for quantile in self.quantiles: @@ -295,7 +351,8 @@ def fit(self, X, y): verbose=self.verbose, cols=self.cols, drop_invariant=self.drop_invariant, - return_df=True, # always return df for individual encoders. If not desired this is handled below. + # always return df for individual encoders. If not desired this is handled below. 
+ return_df=True, handle_missing=self.handle_missing, handle_unknown=self.handle_unknown, quantile=quantile, @@ -307,18 +364,40 @@ def fit(self, X, y): self.feature_names_out_ = reduce( operator.add, [ - [self._get_col_name(c, enc.quantile) for enc in encoder_list if c not in enc.invariant_cols] - if c in self.cols - else [c] + ( + [ + self._get_col_name(c, enc.quantile) + for enc in encoder_list + if c not in enc.invariant_cols + ] + if c in self.cols + else [c] + ) for c in X.columns ], ) self.encoder_list = encoder_list return self - def transform(self, X, y=None, override_return_df=False): + def transform( + self, X: util.X_type, y: util.y_type | None = None, override_return_df: bool = False + ) -> pd.DataFrame | np.ndarray: + """Summary encode new data. + + Parameters + ---------- + X: data to encode. + y: optional target information. + override_return_df: if true return a numpy array instead of a + dataframe regardless of the return_df parameter. + + Returns + ------- + encoded data. + + """ if self.encoder_list is None: - raise ValueError("Must train encoder before it can be used to transform data.") + raise ValueError('Must train encoder before it can be used to transform data.') X, y = util.convert_inputs(X, y) orig_cols = X.columns @@ -339,45 +418,51 @@ def transform(self, X, y=None, override_return_df=False): else: return transformed_df.to_numpy() - def get_feature_names(self) -> List[str]: - warnings.warn("`get_feature_names` is deprecated in all of sklearn. Use `get_feature_names_out` instead.", - category=FutureWarning) + def get_feature_names(self) -> np.ndarray: + """Deprecated method to get feature names. Use `get_feature_names_out` instead.""" + msg = ( + '`get_feature_names` is deprecated in all of sklearn. ' + 'Use `get_feature_names_out` instead.' + ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) return self.get_feature_names_out() def get_feature_names_out(self, input_features=None) -> np.ndarray: - """ - Returns the names of all transformed / added columns. + """Returns the names of all transformed / added columns. - Note that in sklearn the get_feature_names_out function takes the feature_names_in as an argument - and determines the output feature names using the input. A fit is usually not necessary and if so a - NotFittedError is raised. + Note that in sklearn the get_feature_names_out function takes the feature_names_in + as an argument and determines the output feature names using the input. + A fit is usually not necessary and if so a NotFittedError is raised. We just require a fit all the time and return the fitted output columns. Returns ------- feature_names: np.ndarray A list with all feature names transformed or added. - Note: potentially dropped features (because the feature is constant/invariant) are not included! + Note: potentially dropped features (because the feature is constant/invariant) + are not included! """ - out_feats = getattr(self, "feature_names_out_", None) + out_feats = getattr(self, 'feature_names_out_', None) if not isinstance(out_feats, list): - raise NotFittedError("Estimator has to be fitted to return feature names.") + raise NotFittedError('Estimator has to be fitted to return feature names.') else: return np.array(out_feats, dtype=object) - def get_feature_names_in(self) -> List[str]: - """ - Returns the names of all input columns present when fitting. + def get_feature_names_in(self) -> np.ndarray: + """Get the names of all input columns present when fitting. + These columns are necessary for the transform step. 
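As a usage sketch of the feature-name helpers (made-up data; the output column suffixes are the rounded percentiles):

```python
import pandas as pd
from category_encoders import SummaryEncoder

X = pd.DataFrame({'city': ['a', 'a', 'b', 'b']})
y = pd.Series([1.0, 2.0, 3.0, 4.0])
enc = SummaryEncoder(cols=['city'], quantiles=(0.25, 0.75)).fit(X, y)
print(enc.get_feature_names_out())  # ['city_25' 'city_75']
```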
""" - in_feats = getattr(self, "feature_names_in_", None) - if not isinstance(in_feats, list): - raise NotFittedError("Estimator has to be fitted to return feature names.") + in_feats = getattr(self, 'feature_names_in_', None) + if isinstance(in_feats, list): + in_feats = np.array(in_feats) + if not isinstance(in_feats, np.ndarray): + raise NotFittedError('Estimator has to be fitted to return feature names.') else: return in_feats @staticmethod def _get_col_name(col: str, quantile: float) -> str: percentile = round(quantile * 100) - return f"{col}_{percentile}" + return f'{col}_{percentile}' diff --git a/category_encoders/rankhot.py b/category_encoders/rankhot.py index af886188..264ad219 100644 --- a/category_encoders/rankhot.py +++ b/category_encoders/rankhot.py @@ -1,17 +1,23 @@ +"""Rank Hot encoding.""" + +from __future__ import annotations + import numpy as np import pandas as pd -from category_encoders import OrdinalEncoder + import category_encoders.utils as util +from category_encoders import OrdinalEncoder class RankHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): - """The rank-hot encoder is similar to a one-hot encoder, + """Rank Hot Encoder. + + The rank-hot encoder is similar to a one-hot encoder, except every feature up to and including the current rank is hot. This is also called thermometer encoding. Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -38,8 +44,16 @@ class RankHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... 
] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = RankHotEncoder(cols=['CentralAir', 'Heating'], handle_unknown='indicator').fit(X, y) @@ -48,21 +62,21 @@ class RankHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 13 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 - 5 Heating_1 1460 non-null int64 - 6 Heating_2 1460 non-null int64 - 7 Heating_3 1460 non-null int64 - 8 Heating_4 1460 non-null int64 - 9 Heating_5 1460 non-null int64 - 10 Heating_6 1460 non-null int64 - 11 CentralAir_1 1460 non-null int64 - 12 CentralAir_2 1460 non-null int64 + 5 Heating_1 1460 non-null int64 + 6 Heating_2 1460 non-null int64 + 7 Heating_3 1460 non-null int64 + 8 Heating_4 1460 non-null int64 + 9 Heating_5 1460 non-null int64 + 10 Heating_6 1460 non-null int64 + 11 CentralAir_1 1460 non-null int64 + 12 CentralAir_2 1460 non-null int64 dtypes: float64(4), int64(8), object(1) memory usage: 148.4+ KB None @@ -73,14 +87,14 @@ class RankHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin): def __init__( self, - verbose=0, - cols=None, - drop_invariant=False, - return_df=True, - handle_missing="value", - handle_unknown="value", - use_cat_names=None, - ): + verbose: int = 0, + cols: list[str] = None, + drop_invariant: bool = False, + return_df: bool = True, + handle_missing: str = 'value', + handle_unknown: str = 'value', + use_cat_names: bool = False, + ) -> None: super().__init__( verbose=verbose, cols=cols, @@ -93,7 +107,7 @@ def __init__( self.mapping = None self.use_cat_names = use_cat_names - def _fit(self, X, y, **kwargs): + def _fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> RankHotEncoder: oe_missing_strat = { 'error': 'error', 'return_nan': 'return_nan', @@ -103,12 +117,21 @@ def _fit(self, X, y, **kwargs): # supply custom mapping in order to assure order of ordinal variable ordered_mapping = [] for col in self.cols: - oe_col = OrdinalEncoder(verbose=self.verbose, cols=[col], handle_unknown="value", handle_missing=oe_missing_strat) + oe_col = OrdinalEncoder( + verbose=self.verbose, + cols=[col], + handle_unknown='value', + handle_missing=oe_missing_strat, + ) oe_col.fit(X[col].sort_values().to_frame(name=col)) ordered_mapping += oe_col.mapping self.ordinal_encoder = OrdinalEncoder( - verbose=self.verbose, cols=self.cols, handle_unknown="value", handle_missing=oe_missing_strat, mapping=ordered_mapping + verbose=self.verbose, + cols=self.cols, + handle_unknown='value', + handle_missing=oe_missing_strat, + mapping=ordered_mapping, ) self.ordinal_encoder = self.ordinal_encoder.fit(X) @@ -116,108 +139,139 @@ def _fit(self, X, y, **kwargs): return self - def _transform(self, X_in, override_return_df=False): + def _transform(self, X_in: pd.DataFrame, override_return_df: bool = False) -> pd.DataFrame: X = X_in.copy(deep=True) X = self.ordinal_encoder.transform(X) input_cols = X.columns.tolist() - if self.handle_unknown == "error": + if self.handle_unknown == 'error': if X[self.cols].isin([-1]).any().any(): - raise ValueError("Columns to be encoded can not contain new values") + raise ValueError('Columns to be encoded can not contain new values') - for switch, ordinal_switch in 
zip(self.mapping, self.ordinal_encoder.category_mapping): - col = switch.get("col") - mod = switch.get("mapping") + for switch, _ordinal_switch in zip( + self.mapping, self.ordinal_encoder.category_mapping, strict=False + ): + col = switch.get('col') + mod = switch.get('mapping') encode_feature_series = X[col] unknow_elements = encode_feature_series[encode_feature_series == -1] - encoding_dict = {i: list(row.values()) for i, row in mod.to_dict(orient="index").items()} - if self.handle_unknown == "value": + encoding_dict = { + i: list(row.values()) for i, row in mod.to_dict(orient='index').items() + } + if self.handle_unknown == 'value': default_value = [0] * len(encoding_dict) - elif self.handle_unknown == "return_nan": + elif self.handle_unknown == 'return_nan': default_value = [np.nan] * len(encoding_dict) - elif self.handle_unknown == "error": + elif self.handle_unknown == 'error': if not unknow_elements.empty: unknowns_str = ', '.join([str(x) for x in unknow_elements.unique()]) - msg = f"Unseen values {unknowns_str} during transform in column {col}." + msg = f'Unseen values {unknowns_str} during transform in column {col}.' raise ValueError(msg) default_value = [0] * len(encoding_dict) else: - raise ValueError(f"invalid option for 'handle_unknown' parameter: {self.handle_unknown}") + raise ValueError( + f"invalid option for 'handle_unknown' parameter: {self.handle_unknown}" + ) def apply_coding(row: pd.Series): val = row.iloc[0] if pd.isna(val): - if self.handle_missing == "value": + if self.handle_missing == 'value': return default_value - elif self.handle_missing == "return_nan": + elif self.handle_missing == 'return_nan': return [np.nan] * len(default_value) else: - raise ValueError("Unhandled nan") + raise ValueError('Unhandled nan') return encoding_dict.get(row.iloc[0], default_value) - encoded = encode_feature_series.to_frame().apply(apply_coding, axis=1, result_type="expand") + encoded = encode_feature_series.to_frame().apply( + apply_coding, axis=1, result_type='expand' + ) encoded.columns = mod.columns X = pd.concat([encoded, X], axis=1) old_column_index = input_cols.index(col) - input_cols[old_column_index:old_column_index + 1] = mod.columns + input_cols[old_column_index : old_column_index + 1] = mod.columns X = X.reindex(columns=input_cols) return X - def create_dataframe(self, X, encoded, key_col): - + def _create_dataframe(self, X, encoded, key_col) -> pd.DataFrame: + # todo find the correct types here, key col does not seem + # like a list, probably this is not needed if not (isinstance(encoded, pd.DataFrame) or isinstance(encoded, pd.Series)): encoded = pd.DataFrame(encoded, columns=key_col) X_ = pd.concat([encoded, X], axis=1) return X_ - def inverse_transform(self, X_in): + def inverse_transform(self, X_in: pd.DataFrame) -> pd.DataFrame: + """Inverse transformation. + + This takes encoded data and gives back non-encoded data. + + Parameters + ---------- + X_in: data frame with rank-hot-encoded data. + + Returns + ------- + non-encoded data as a data frame. 
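A small round-trip sketch (illustrative data): categories are ranked in sorted order, and every dummy up to and including a category's rank is hot:

```python
import pandas as pd
from category_encoders import RankHotEncoder

X = pd.DataFrame({'grade': ['B', 'A', 'C', 'B']})
enc = RankHotEncoder(cols=['grade'])
encoded = enc.fit_transform(X)  # 'A' -> 1 0 0, 'B' -> 1 1 0, 'C' -> 1 1 1
restored = enc.inverse_transform(encoded)
print(restored['grade'].tolist())  # ['B', 'A', 'C', 'B']
```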
+ + """ X = X_in.copy(deep=True) cols = X.columns.tolist() if self._dim is None: - raise ValueError("Must train encoder before it can be used to inverse_transform data") + raise ValueError('Must train encoder before it can be used to inverse_transform data') - for switch, ordinal_mapping in zip(self.mapping, self.ordinal_encoder.category_mapping): - col = switch.get("col") - cats = switch.get("mapping") - if col != ordinal_mapping.get("col"): - raise ValueError("Column order of OrdinalEncoder and RankHotEncoder do not match") - inv_map = {v: k for k, v in ordinal_mapping.get("mapping").to_dict().items()} + for switch, ordinal_mapping in zip( + self.mapping, self.ordinal_encoder.category_mapping, strict=False + ): + col = switch.get('col') + cats = switch.get('mapping') + if col != ordinal_mapping.get('col'): + raise ValueError('Column order of OrdinalEncoder and RankHotEncoder do not match') + inv_map = {v: k for k, v in ordinal_mapping.get('mapping').to_dict().items()} arrs = X[cats.columns] reencode = arrs.sum(axis=1).rename(col) - orig_dtype = ordinal_mapping.get("data_type") + orig_dtype = ordinal_mapping.get('data_type') reencode2 = reencode.replace(inv_map).astype(orig_dtype) if np.any(reencode2[:] == 0): reencode2[reencode2[:] == 0] = np.nan - X = self.create_dataframe(X, reencode2, col) + X = self._create_dataframe(X, reencode2, col) first_inex = cols.index(cats.columns[0]) last_index = cols.index(cats.columns[-1]) + 1 del cols[first_inex:last_index] - cols.insert(self.ordinal_encoder.feature_names_out_.index(col), col) + cols.insert(self.ordinal_encoder.feature_names_out_.tolist().index(col), col) X = X.reindex(columns=cols) return X - def generate_mapping(self): + def generate_mapping(self) -> list[dict[str, str | pd.DataFrame]]: + """Generate the mapping for rankhot encoding. + + Returns + ------- + List of dict containing colnames and their respective encoding. 
+ + """ mapping = [] found_column_counts = {} for switch in self.ordinal_encoder.mapping: - col: str = switch.get("col") - values: pd.Series = switch.get("mapping").copy(deep=True) + col: str = switch.get('col') + values: pd.Series = switch.get('mapping').copy(deep=True) - if self.handle_missing == "value": + if self.handle_missing == 'value': values = values[values > 0] if len(values) == 0: @@ -228,12 +282,12 @@ def generate_mapping(self): for cat_name, class_ in values.items(): if self.use_cat_names: - n_col_name = f"{col}_{cat_name}" + n_col_name = f'{col}_{cat_name}' found_count = found_column_counts.get(n_col_name, 0) found_column_counts[n_col_name] = found_count + 1 - n_col_name += "#" * found_count + n_col_name += '#' * found_count else: - n_col_name = f"{col}_{class_}" + n_col_name = f'{col}_{class_}' index.append(class_) new_columns.append(n_col_name) @@ -241,5 +295,5 @@ def generate_mapping(self): base_matrix = np.tril(np.ones((len(index), len(index)), dtype=int)) base_df = pd.DataFrame(data=base_matrix, columns=new_columns, index=index) - mapping.append({"col": col, "mapping": base_df}) + mapping.append({'col': col, 'mapping': base_df}) return mapping diff --git a/category_encoders/sum_coding.py b/category_encoders/sum_coding.py index 27e40411..f3e849df 100644 --- a/category_encoders/sum_coding.py +++ b/category_encoders/sum_coding.py @@ -1,19 +1,20 @@ -"""Sum contrast coding""" +"""Sum contrast coding.""" +from __future__ import annotations -from patsy.contrasts import ContrastMatrix, Sum import numpy as np +from patsy.contrasts import ContrastMatrix, Sum from category_encoders.base_contrast_encoder import BaseContrastEncoder __author__ = 'paulwestenthanner' + class SumEncoder(BaseContrastEncoder): """Sum contrast coding for the encoding of categorical features. Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -21,23 +22,32 @@ class SumEncoder(BaseContrastEncoder): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform matrix + has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str - options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has nan values. This can cause - unexpected changes in dimension in some cases. + options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. + Warning: if indicator is used, an extra column will be added in if the transform + matrix has nan values. This can cause unexpected changes in dimension in some cases. 
Example ------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... ] >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = SumEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -46,12 +56,12 @@ class SumEncoder(BaseContrastEncoder): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 12 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 intercept 1460 non-null int64 + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 intercept 1460 non-null int64 1 Id 1460 non-null float64 2 MSSubClass 1460 non-null float64 - 3 MSZoning 1460 non-null object + 3 MSZoning 1460 non-null object 4 LotFrontage 1201 non-null float64 5 YearBuilt 1460 non-null float64 6 Heating_0 1460 non-null float64 @@ -76,4 +86,5 @@ class SumEncoder(BaseContrastEncoder): """ def get_contrast_matrix(self, values_to_encode: np.array) -> ContrastMatrix: + """Get the contrast matrix for the sum encoder.""" return Sum().code_without_intercept(values_to_encode.tolist()) diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py index 8d523c34..f60706ac 100644 --- a/category_encoders/target_encoder.py +++ b/category_encoders/target_encoder.py @@ -1,9 +1,15 @@ -"""Target Encoder""" +"""Target Encoder.""" + +from __future__ import annotations + +from typing import Any + import numpy as np import pandas as pd from scipy.special import expit -from category_encoders.ordinal import OrdinalEncoder + import category_encoders.utils as util +from category_encoders.ordinal import OrdinalEncoder __author__ = 'chappers' @@ -11,17 +17,19 @@ class TargetEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): """Target encoding for categorical features. - Supported targets: binomial and continuous. For polynomial target support, see PolynomialWrapper. + Supported targets: binomial and continuous. + For polynomial target support, see PolynomialWrapper. - For the case of categorical target: features are replaced with a blend of posterior probability of the target - given particular categorical value and the prior probability of the target over all the training data. + For the case of categorical target: features are replaced with a blend of posterior + probability of the target given particular categorical value and the prior probability + of the target over all the training data. - For the case of continuous target: features are replaced with a blend of the expected value of the target - given particular categorical value and the expected value of the target over all the training data. + For the case of continuous target: features are replaced with a blend of the expected value + of the target given particular categorical value and the expected value of the + target over all the training data. Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -29,50 +37,67 @@ class TargetEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. 
return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_missing: str - options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean. + options are 'error', 'return_nan' and 'value', defaults to 'value', + which returns the target mean. handle_unknown: str - options are 'error', 'return_nan' and 'value', defaults to 'value', which returns the target mean. + options are 'error', 'return_nan' and 'value', defaults to 'value', + which returns the target mean. min_samples_leaf: int - For regularization the weighted average between category mean and global mean is taken. The weight is - an S-shaped curve between 0 and 1 with the number of samples for a category on the x-axis. - The curve reaches 0.5 at min_samples_leaf. (parameter k in the original paper) + For regularization the weighted average between category mean and global mean is taken. + The weight is an S-shaped curve between 0 and 1 with the number of samples for a category + on the x-axis. The curve reaches 0.5 at min_samples_leaf. + (parameter k in the original paper) smoothing: float - smoothing effect to balance categorical average vs prior. Higher value means stronger regularization. - The value must be strictly bigger than 0. Higher values mean a flatter S-curve (see min_samples_leaf). + smoothing effect to balance categorical average vs prior. Higher value means stronger + regularization. The value must be strictly bigger than 0. + Higher values mean a flatter S-curve (see min_samples_leaf). hierarchy: dict or dataframe A dictionary or a dataframe to define the hierarchy for mapping. - If a dictionary, this contains a dict of columns to map into hierarchies. Dictionary key(s) should be the column name from X - which requires mapping. For multiple hierarchical maps, this should be a dictionary of dictionaries. + If a dictionary, this contains a dict of columns to map into hierarchies. + Dictionary key(s) should be the column name from X which requires mapping. + For multiple hierarchical maps, this should be a dictionary of dictionaries. - If dataframe: a dataframe defining columns to be used for the hierarchies. Column names must take the form: + If dataframe: a dataframe defining columns to be used for the hierarchies. + Column names must take the form: HIER_colA_1, ... HIER_colA_N, HIER_colB_1, ... HIER_colB_M, ... - where [colA, colB, ...] are given columns in cols list. - 1:N and 1:M define the hierarchy for each column where 1 is the highest hierarchy (top of the tree). A single column or multiple - can be used, as relevant. + where [colA, colB, ...] are given columns in cols list. + 1:N and 1:M define the hierarchy for each column where 1 is the highest hierarchy + (top of the tree). A single column or multiple can be used, as relevant. Examples - ------- + -------- >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] - >>> bunch = fetch_openml(name="house_prices", as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... 
] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) >>> y = bunch.target > 200000 >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] - >>> enc = TargetEncoder(cols=['CentralAir', 'Heating'], min_samples_leaf=20, smoothing=10).fit(X, y) + >>> enc = TargetEncoder(cols=['CentralAir', 'Heating'], min_samples_leaf=20, smoothing=10).fit( + ... X, y + ... ) >>> numeric_dataset = enc.transform(X) >>> print(numeric_dataset.info()) RangeIndex: 1460 entries, 0 to 1459 Data columns (total 7 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 5 Heating 1460 non-null float64 @@ -80,20 +105,24 @@ class TargetEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): dtypes: float64(6), object(1) memory usage: 80.0+ KB None - + >>> from category_encoders.datasets import load_compass >>> X, y = load_compass() >>> hierarchical_map = {'compass': {'N': ('N', 'NE'), 'S': ('S', 'SE'), 'W': 'W'}} - >>> enc = TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=hierarchical_map, cols=['compass']).fit(X.loc[:,['compass']], y) - >>> hierarchy_dataset = enc.transform(X.loc[:,['compass']]) + >>> enc = TargetEncoder( + ... verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=hierarchical_map, cols=['compass'] + ... ).fit(X.loc[:, ['compass']], y) + >>> hierarchy_dataset = enc.transform(X.loc[:, ['compass']]) >>> print(hierarchy_dataset['compass'].values) [0.62263617 0.62263617 0.90382995 0.90382995 0.90382995 0.17660024 0.17660024 0.46051953 0.46051953 0.46051953 0.46051953 0.40332791 0.40332791 0.40332791 0.40332791 0.40332791] >>> X, y = load_postcodes('binary') >>> cols = ['postcode'] - >>> HIER_cols = ['HIER_postcode_1','HIER_postcode_2','HIER_postcode_3','HIER_postcode_4'] - >>> enc = TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=X[HIER_cols], cols=['postcode']).fit(X['postcode'], y) + >>> HIER_cols = ['HIER_postcode_1', 'HIER_postcode_2', 'HIER_postcode_3', 'HIER_postcode_4'] + >>> enc = TargetEncoder( + ... verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=X[HIER_cols], cols=['postcode'] + ... ).fit(X['postcode'], y) >>> hierarchy_dataset = enc.transform(X['postcode']) >>> print(hierarchy_dataset.loc[0:10, 'postcode'].values) [0.75063473 0.90208756 0.88328833 0.77041254 0.68891504 0.85012847 @@ -102,22 +131,40 @@ class TargetEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): References ---------- - .. [1] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems, from - https://dl.acm.org/citation.cfm?id=507538 + .. 
[1] A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification + and Prediction Problems, from https://dl.acm.org/citation.cfm?id=507538 """ + prefit_ordinal = True encoding_relation = util.EncodingRelation.ONE_TO_ONE - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value', - handle_unknown='value', min_samples_leaf=20, smoothing=10, hierarchy=None): - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown=handle_unknown, handle_missing=handle_missing) + def __init__( + self, + verbose: int = 0, + cols: list[str] = None, + drop_invariant: bool = False, + return_df: bool = True, + handle_missing: str = 'value', + handle_unknown: str = 'value', + min_samples_leaf: int = 20, + smoothing: float = 10, + hierarchy: dict = None, + ) -> None: + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown=handle_unknown, + handle_missing=handle_missing, + ) self.ordinal_encoder = None self.min_samples_leaf = min_samples_leaf self.smoothing = smoothing self.mapping = None self._mean = None + # @ToDo create a function to check the hierarchy if isinstance(hierarchy, (dict, pd.DataFrame)) and cols is None: raise ValueError('Hierarchy is defined but no columns are named for encoding') if isinstance(hierarchy, dict): @@ -128,15 +175,23 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, h hierarchy_check = self._check_dict_key_tuples(flattened_hierarchy) self.hierarchy_depth[switch] = hierarchy_check[1] if not hierarchy_check[0]: - raise ValueError('Hierarchy mapping contains different levels for key "' + switch + '"') - self.hierarchy[switch] = {(k if isinstance(t, tuple) else t): v for t, v in flattened_hierarchy.items() for k in t} + raise ValueError( + 'Hierarchy mapping contains different levels for key "' + switch + '"' + ) + self.hierarchy[switch] = { + (k if isinstance(t, tuple) else t): v + for t, v in flattened_hierarchy.items() + for k in t + } elif isinstance(hierarchy, pd.DataFrame): self.hierarchy = hierarchy self.hierarchy_depth = {} for col in self.cols: - HIER_cols = self.hierarchy.columns[self.hierarchy.columns.str.startswith(f'HIER_{col}')].tolist() + HIER_cols = self.hierarchy.columns[ + self.hierarchy.columns.str.startswith(f'HIER_{col}') + ].tolist() HIER_levels = [int(i.replace(f'HIER_{col}_', '')) for i in HIER_cols] - if np.array_equal(sorted(HIER_levels), np.arange(1, max(HIER_levels)+1)): + if np.array_equal(sorted(HIER_levels), np.arange(1, max(HIER_levels) + 1)): self.hierarchy_depth[col] = max(HIER_levels) else: raise ValueError(f'Hierarchy columns are not complete for column {col}') @@ -147,18 +202,36 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, h self.cols_hier = [] - def _check_dict_key_tuples(self, d): - min_tuple_size = min(len(v) for v in d.values()) - max_tuple_size = max(len(v) for v in d.values()) + @staticmethod + def _check_dict_key_tuples(dict_to_check: dict[Any, tuple]) -> tuple[bool, int]: + """Check if all tuples in the dict values have the same length. + + Parameters + ---------- + dict_to_check: dictionary to check + + Returns + ------- + tuple: first entry if all sizes are equal, second minimum size. 
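A short sketch of what passes and fails this check (toy hierarchies):

    consistent = {'N': ('N', 'NE'), 'S': ('S', 'SE')}  # all tuples length 2 -> (True, 2)
    mixed = {'N': ('N', 'NE'), 'W': ('W',)}            # lengths 2 and 1 -> (False, 1)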
+ """ + min_tuple_size = min(len(v) for v in dict_to_check.values()) + max_tuple_size = max(len(v) for v in dict_to_check.values()) return min_tuple_size == max_tuple_size, min_tuple_size - def _fit(self, X, y, **kwargs): + def _fit(self, X: util.X_type, y: util.y_type, **kwargs) -> None: if isinstance(self.hierarchy, dict): X_hier = pd.DataFrame() for switch in self.hierarchy: if switch in self.cols: - colnames = [f'HIER_{str(switch)}_{str(i + 1)}' for i in range(self.hierarchy_depth[switch])] - df = pd.DataFrame(X[str(switch)].map(self.hierarchy[str(switch)]).tolist(), index=X.index, columns=colnames) + colnames = [ + f'HIER_{str(switch)}_{str(i + 1)}' + for i in range(self.hierarchy_depth[switch]) + ] + df = pd.DataFrame( + X[str(switch)].map(self.hierarchy[str(switch)]).tolist(), + index=X.index, + columns=colnames, + ) X_hier = pd.concat([X_hier, df], axis=1) elif isinstance(self.hierarchy, pd.DataFrame): X_hier = self.hierarchy @@ -168,25 +241,36 @@ def _fit(self, X, y, **kwargs): verbose=self.verbose, cols=X_hier.columns, handle_unknown='value', - handle_missing='value' + handle_missing='value', ) enc_hier = enc_hier.fit(X_hier) X_hier_ordinal = enc_hier.transform(X_hier) self.ordinal_encoder = OrdinalEncoder( - verbose=self.verbose, - cols=self.cols, - handle_unknown='value', - handle_missing='value' + verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) if self.hierarchy is not None: - self.mapping = self.fit_target_encoding(pd.concat([X_ordinal, X_hier_ordinal], axis=1), y) + self.mapping = self.fit_target_encoding( + pd.concat([X_ordinal, X_hier_ordinal], axis=1), y + ) else: self.mapping = self.fit_target_encoding(X_ordinal, y) - def fit_target_encoding(self, X, y): + def fit_target_encoding(self, X: util.X_type, y: util.y_type) -> dict[str, np.ndarray]: + """Fit the target encoding mapping. + + Parameters + ---------- + X: training data to fit on. + y: training target. 
+ + Returns + ------- + dictionary: column -> encoding values for column + + """ mapping = {} prior = self._mean = y.mean() @@ -196,17 +280,24 @@ def fit_target_encoding(self, X, y): values = switch.get('mapping') scalar = prior - if (isinstance(self.hierarchy, dict) and col in self.hierarchy) or \ - (isinstance(self.hierarchy, pd.DataFrame)): + if (isinstance(self.hierarchy, dict) and col in self.hierarchy) or ( + isinstance(self.hierarchy, pd.DataFrame) + ): for i in range(self.hierarchy_depth[col]): - col_hier = 'HIER_'+str(col)+'_'+str(i+1) - col_hier_m1 = col if i == self.hierarchy_depth[col]-1 else 'HIER_'+str(col)+'_'+str(i+2) - if not X[col].equals(X[col_hier]) and len(X[col_hier].unique())>1: + col_hier = 'HIER_' + str(col) + '_' + str(i + 1) + col_hier_m1 = ( + col + if i == self.hierarchy_depth[col] - 1 + else 'HIER_' + str(col) + '_' + str(i + 2) + ) + if not X[col].equals(X[col_hier]) and len(X[col_hier].unique()) > 1: stats_hier = y.groupby(X[col_hier]).agg(['count', 'mean']) smoove_hier = self._weighting(stats_hier['count']) - scalar_hier = scalar * (1 - smoove_hier) + stats_hier['mean'] * smoove_hier + scalar_hier = ( + scalar * (1 - smoove_hier) + stats_hier['mean'] * smoove_hier + ) scalar_hier_long = X[[col_hier_m1, col_hier]].drop_duplicates() - scalar_hier_long.index = np.arange(1, scalar_hier_long.shape[0]+1) + scalar_hier_long.index = np.arange(1, scalar_hier_long.shape[0] + 1) scalar = scalar_hier_long[col_hier].map(scalar_hier.to_dict()) stats = y.groupby(X[col]).agg(['count', 'mean']) @@ -228,7 +319,7 @@ def fit_target_encoding(self, X, y): return mapping - def _transform(self, X, y=None): + def _transform(self, X: pd.DataFrame, y: pd.Series | None = None) -> pd.DataFrame: # Now X is the correct dimensions it works with pre fitted ordinal encoder X = self.ordinal_encoder.transform(X) @@ -239,7 +330,8 @@ def _transform(self, X, y=None): X = self.target_encode(X) return X - def target_encode(self, X_in): + def target_encode(self, X_in: pd.DataFrame) -> pd.DataFrame: + """Apply target encoding via encoder mapping.""" X = X_in.copy(deep=True) # Was not mapping extra columns as self.featuer_names_in did not include new column @@ -248,7 +340,7 @@ def target_encode(self, X_in): return X - def _weighting(self, n): + def _weighting(self, n: int) -> float: # monotonically increasing function of n bounded between 0 and 1 # sigmoid in this case, using scipy.expit for numerical stability return expit((n - self.min_samples_leaf) / self.smoothing) diff --git a/category_encoders/utils.py b/category_encoders/utils.py index ba3a6d68..537dc43c 100644 --- a/category_encoders/utils.py +++ b/category_encoders/utils.py @@ -1,23 +1,42 @@ """A collection of shared utilities for all encoders, not intended for external use.""" + +from __future__ import annotations + +import warnings from abc import abstractmethod from enum import Enum, auto -import warnings +from typing import Hashable, Sequence -import pandas as pd import numpy as np +import pandas as pd import sklearn.base -from pandas.api.types import is_object_dtype, is_string_dtype, is_numeric_dtype +from pandas.api.types import is_numeric_dtype, is_object_dtype, is_string_dtype from pandas.core.dtypes.dtypes import CategoricalDtype -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.exceptions import NotFittedError -from typing import Dict, List, Optional, Union from scipy.sparse import csr_matrix +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError from sklearn.preprocessing 
import LabelEncoder __author__ = 'willmcginnis' +X_type = np.ndarray | pd.DataFrame | list | np.generic | csr_matrix +y_type = list | pd.Series | np.ndarray | tuple | pd.DataFrame + + +def convert_cols_to_list( + cols: pd.Series | np.ndarray | set | tuple | CategoricalDtype | str | int, +) -> list: + """Convert columns to list. -def convert_cols_to_list(cols): + Parameters + ---------- + cols: columns as Series, array, set, tuple, ... + + Returns + ------- + columns as list. + + """ if isinstance(cols, pd.Series): return cols.tolist() elif isinstance(cols, np.ndarray): @@ -34,27 +53,59 @@ def convert_cols_to_list(cols): return cols -def get_categorical_cols(df): - """ - Returns names of categorical columns in the DataFrame. These - include columns of types: object, category, string, string[pyarrow]. +def get_categorical_cols(df: pd.DataFrame) -> list[str]: + """Returns names of categorical columns in the DataFrame. + + These include columns of types: object, category, string, string[pyarrow]. + + Parameters + ---------- + df DataFrame + + Returns + ------- + list of columns + """ obj_cols = [] for col, dtype in df.dtypes.items(): if is_object_dtype(dtype) or is_category(dtype) or is_string_dtype(dtype): + # if not isinstance(col, str): + # raise ValueError(f'DataFrame column names must be strings not {col}.') obj_cols.append(col) if not obj_cols: - print("Warning: No categorical columns found. Calling 'transform' will only return input data.") + msg = ( + 'Warning: No categorical columns found. ' + "Calling 'transform' will only return input data." + ) + print(msg) return obj_cols -def is_category(dtype): +def is_category(dtype: pd.core.dtypes.dtypes.ExtensionDtype) -> bool: + """Check if dtype is pandas categorical type. + + Parameters + ---------- + dtype pandas dtype + + Returns + ------- + True if CategoricalDtype, False otherwise. + + """ return isinstance(dtype, CategoricalDtype) -def convert_inputs(X, y, columns=None, index=None, deep=False): +def convert_inputs( + X: X_type, + y: y_type | None, + columns: Sequence = None, + index: Sequence = None, + deep: bool = False, +) -> tuple[pd.DataFrame, pd.Series | None]: """ Unite arraylike `X` and vectorlike `y` into a DataFrame and Series. @@ -84,29 +135,54 @@ def convert_inputs(X, y, columns=None, index=None, deep=False): # N.B.: If either was already pandas, it keeps its index. if any(X.index != y.index): - msg = "`X` and `y` both have indexes, but they do not match. If you are shuffling your input data on " \ - "purpose (e.g. via permutation_test_score) use np arrays instead of data frames / series" + msg = ( + '`X` and `y` both have indexes, but they do not match. If you are shuffling ' + 'your input data on purpose (e.g. via permutation_test_score) use ' + 'np arrays instead of data frames / series' + ) raise ValueError(msg) if X.shape[0] != y.shape[0]: - raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") + raise ValueError( + 'The length of X is ' + + str(X.shape[0]) + + ' but length of y is ' + + str(y.shape[0]) + + '.' + ) return X, y -def convert_input(X, columns=None, deep=False, index=None): - """ - Unite data into a DataFrame. +def convert_input( + X: X_type, columns: Sequence = None, index: Sequence = None, deep: bool = False +) -> pd.DataFrame: + """Unite data into a DataFrame. + Objects that do not contain column names take the names from the argument. Optionally perform deep copy of the data. 
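A sketch of the resulting behaviour on an ndarray input (toy data):

    import numpy as np
    import category_encoders.utils as util

    df = util.convert_input(np.array([[1, 2], [3, 4]]), columns=['a', 'b'])
    # -> DataFrame with columns ['a', 'b'] and a default RangeIndex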
+ + Parameters + ---------- + X: data + columns: column names to assign, ignored if data is already a data frame. + index: index to use for the dataframe. Defaults to range(len(data)). + deep: flag whether the data should be copied when creating the data frame. + + Returns + ------- + A dataframe with the data and columns and index properly set. """ if not isinstance(X, pd.DataFrame): if isinstance(X, pd.Series): X = pd.DataFrame(X, copy=deep) else: if columns is not None and np.size(X, 1) != len(columns): - raise ValueError('The count of the column names does not correspond to the count of the columns') + raise ValueError( + 'The count of the column names does not correspond to the count of the columns' + ) if isinstance(X, list): - X = pd.DataFrame(X, columns=columns, copy=deep, - index=index) # lists are always copied, but for consistency, we still pass the argument + X = pd.DataFrame( + X, columns=columns, copy=deep, index=index + ) # lists are always copied, but for consistency, we still pass the argument elif isinstance(X, (np.generic, np.ndarray)): X = pd.DataFrame(X, columns=columns, copy=deep, index=index) elif isinstance(X, csr_matrix): @@ -119,14 +195,26 @@ def convert_input(X, columns=None, deep=False, index=None): return X -def convert_input_vector(y, index): - """ - Unite target data type into a Series. +def convert_input_vector(y: y_type, index: Sequence) -> pd.Series: + """Unite target data type into a Series. + If the target is a Series or a DataFrame, we preserve its index. But if the target does not contain index attribute, we use the index from the argument. + + Parameters + ---------- + y: target data to convert to series. + index: index to be used for the series. + + Returns + ------- + pd.Series containing the target. + """ if y is None: - raise ValueError('Supervised encoders need a target for the fitting. The target cannot be None') + raise ValueError( + 'Supervised encoders need a target for the fitting. The target cannot be None' + ) if isinstance(y, pd.Series): return y elif isinstance(y, np.ndarray): @@ -139,16 +227,21 @@ def convert_input_vector(y, index): else: raise ValueError(f'Unexpected input shape: {np.shape(y)}') elif np.isscalar(y): - return pd.Series([y], name='target', index=index) + raise ValueError('y must be a list, an np.ndarray or a pd.Series. 
Not a scalar') elif isinstance(y, list): if len(y) == 0: # empty list return pd.Series(y, name='target', index=index, dtype=float) elif len(y) > 0 and not isinstance(y[0], list): # vector return pd.Series(y, name='target', index=index) elif len(y) > 0 and isinstance(y[0], list) and len(y[0]) == 1: # single row in a matrix - flatten = lambda y: [item for sublist in y for item in sublist] + + def flatten(y): + return [item for sublist in y for item in sublist] + return pd.Series(flatten(y), name='target', index=index) - elif len(y) == 1 and len(y[0]) == 0 and isinstance(y[0], list): # single empty column in a matrix + elif ( + len(y) == 1 and len(y[0]) == 0 and isinstance(y[0], list) + ): # single empty column in a matrix return pd.Series(y[0], name='target', index=index, dtype=float) elif len(y) == 1 and isinstance(y[0], list): # single column in a matrix return pd.Series(y[0], name='target', index=index, dtype=type(y[0][0])) @@ -162,10 +255,14 @@ def convert_input_vector(y, index): else: raise ValueError(f'Unexpected input shape: {y.shape}') else: - return pd.Series(y, name='target', index=index) # this covers tuples and other directly convertible types + return pd.Series( + y, name='target', index=index + ) # this covers tuples and other directly convertible types -def get_generated_cols(X_original, X_transformed, to_transform): +def get_generated_cols( + X_original: pd.DataFrame, X_transformed: pd.DataFrame, to_transform: list[Hashable] +) -> list[Hashable]: """ Returns a list of the generated/transformed columns. @@ -175,7 +272,8 @@ def get_generated_cols(X_original, X_transformed, to_transform): X_transformed: df the transformed (current) DataFrame. to_transform: [str] - a list of columns that were transformed (as in the original DataFrame), commonly self.feature_names_in. + a list of columns that were transformed (as in the original DataFrame), + commonly self.feature_names_in. Output: a list of columns that were transformed (as in the current DataFrame). @@ -192,65 +290,104 @@ def get_generated_cols(X_original, X_transformed, to_transform): return current_cols -def flatten_reverse_dict(d): - sep = "___" - [flat_dict] = pd.json_normalize(d, sep=sep).to_dict(orient='records') +def flatten_reverse_dict(dict_to_flatten: dict) -> dict: + """Flatten a dictionary into a tuple of nested keys. + + Parameters + ---------- + dict_to_flatten + + Returns + ------- + the flattened dictionary with tuples as keys indicating the hierarchy. + + """ + sep = '___' + [flat_dict] = pd.json_normalize(dict_to_flatten, sep=sep).to_dict(orient='records') reversed_flat_dict = {v: tuple(k.split(sep)) for k, v in flat_dict.items()} return reversed_flat_dict class EncodingRelation(Enum): + """Relation of how many input features are encoded into how many output features.""" + # one input feature get encoded into one output feature ONE_TO_ONE = auto() # one input feature get encoded into as many output features as it has distinct values ONE_TO_N_UNIQUE = auto() - # one input feature get encoded into m output features that are not the number of distinct values + # one input feature get encoded into m output features + # that are not the number of distinct values ONE_TO_M = auto() # all N input features are encoded into M output features. 
# The encoding is done globally on all the input not on a per-feature basis N_TO_M = auto() -def get_docstring_output_shape(in_out_relation: EncodingRelation): +def get_docstring_output_shape(in_out_relation: EncodingRelation) -> str: + """Find how many encoded features are expected. + + Parameters + ---------- + in_out_relation + + Returns + ------- + A string saying how many features to expect. + + """ if in_out_relation == EncodingRelation.ONE_TO_ONE: - return "n_features" + return 'n_features' elif in_out_relation == EncodingRelation.ONE_TO_N_UNIQUE: - return "n_features * respective cardinality" + return 'n_features * respective cardinality' elif in_out_relation == EncodingRelation.ONE_TO_M: - return "M features (n_features < M)" + return 'M features (n_features < M)' elif in_out_relation == EncodingRelation.N_TO_M: - return "M features (M can be anything)" + return 'M features (M can be anything)' class BaseEncoder(BaseEstimator): - _dim: Optional[int] - cols: List[str] + """BaseEstimator class for all encoders. + + This follows the sklearn estimator / transformer pattern. + """ + + _dim: int | None + cols: list[str] use_default_cols: bool handle_missing: str handle_unknown: str verbose: int drop_invariant: bool - invariant_cols: List[str] = [] + invariant_cols: list[str] = [] return_df: bool supervised: bool encoding_relation: EncodingRelation - INVARIANCE_THRESHOLD = 10e-5 # columns with variance less than this will be considered constant / invariant - - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, - handle_unknown='value', handle_missing='value', **kwargs): - """ + INVARIANCE_THRESHOLD = ( + 10e-5 # columns with variance less than this will be considered constant / invariant + ) + + def __init__( + self, + verbose: int = 0, + cols: list[str] = None, + drop_invariant: bool = False, + return_df: bool = True, + handle_unknown: str = 'value', + handle_missing: str = 'value', + **kwargs, + ): + """Initialize the encoder. Parameters ---------- - verbose: int integer indicating verbosity of output. 0 for none. cols: list a list of columns to encode, if None, all string and categorical columns will be encoded. drop_invariant: bool - boolean for whether or not to drop columns with 0 variance. + boolean for whether to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform and inverse transform (otherwise it will be a numpy array). @@ -269,19 +406,20 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, self.drop_invariant = drop_invariant self.invariant_cols = [] self.verbose = verbose - self.use_default_cols = cols is None # if True, even a repeated call of fit() will select string columns from X - self.cols = cols # note that cols are only the columns to be encoded, feature_names_in_ are all columns + # if True, even a repeated call of fit() will select string columns from X + self.use_default_cols = cols is None + # note that cols are only the columns to be encoded, feature_names_in_ are all columns + self.cols = cols self.mapping = None self.handle_unknown = handle_unknown self.handle_missing = handle_missing self._dim = None - def fit(self, X, y=None, **kwargs): + def fit(self, X: X_type, y: y_type | None = None, **kwargs): """Fits the encoder according to X and y. Parameters ---------- - X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. 
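Because `feature_names_out_` is now stored as an ndarray with invariant columns filtered out, the public accessor behaves as in this sketch (toy data):

    import pandas as pd
    from category_encoders import OneHotEncoder

    X = pd.DataFrame({'colour': ['red', 'blue', 'red']})
    enc = OneHotEncoder(cols=['colour'], drop_invariant=True).fit(X)
    names = enc.get_feature_names_out()  # np.ndarray; constant output columns excluded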
@@ -290,7 +428,6 @@ def fit(self, X, y=None, **kwargs): Returns ------- - self : encoder Returns self. @@ -322,49 +459,59 @@ def fit(self, X, y=None, **kwargs): # for finding invariant columns transform without y (as is done on the test set) self.feature_names_out_ = None # Issue#437 X_transformed = self.transform(X, override_return_df=True) - self.feature_names_out_ = X_transformed.columns.tolist() + self.feature_names_out_ = X_transformed.columns.to_numpy() # drop all output columns with 0 variance. if self.drop_invariant: generated_cols = get_generated_cols(X, X_transformed, self.cols) - self.invariant_cols = [x for x in generated_cols if X_transformed[x].var() <= self.INVARIANCE_THRESHOLD] - self.feature_names_out_ = [x for x in self.feature_names_out_ if x not in self.invariant_cols] + self.invariant_cols = [ + x for x in generated_cols if X_transformed[x].var() <= self.INVARIANCE_THRESHOLD + ] + self.feature_names_out_ = np.fromiter( + (x for x in self.feature_names_out_ if x not in self.invariant_cols), + dtype=self.feature_names_out_.dtype, + ) return self - def _check_fit_inputs(self, X, y): + def _check_fit_inputs(self, X: X_type, y: y_type) -> None: if self._get_tags().get('supervised_encoder'): if y is None: - raise ValueError('Supervised encoders need a target for the fitting. The target cannot be None') + raise ValueError( + 'Supervised encoders need a target for the fitting. The target cannot be None' + ) else: - if y.isna().any(): # Target column should never have missing values - raise ValueError("The target column y must not contain missing values.") + if y.isna().any(): # Target column should never have missing values + raise ValueError('The target column y must not contain missing values.') - def _check_transform_inputs(self, X): + def _check_transform_inputs(self, df: pd.DataFrame) -> None: if self.handle_missing == 'error': - if X[self.cols].isna().any().any(): + if df[self.cols].isna().any().any(): raise ValueError('Columns to be encoded can not contain null') if self._dim is None: raise NotFittedError('Must train encoder before it can be used to transform data.') # then make sure that it is the right size - if X.shape[1] != self._dim: - raise ValueError(f'Unexpected input dimension {X.shape[1]}, expected {self._dim}') + if df.shape[1] != self._dim: + raise ValueError(f'Unexpected input dimension {df.shape[1]}, expected {self._dim}') - def _drop_invariants(self, X: pd.DataFrame, override_return_df: bool) -> Union[np.ndarray, pd.DataFrame]: + def _drop_invariants( + self, df: pd.DataFrame, override_return_df: bool + ) -> np.ndarray | pd.DataFrame: if self.drop_invariant: - X = X.drop(columns=self.invariant_cols) + df = df.drop(columns=self.invariant_cols) if self.return_df or override_return_df: - return X + return df else: - return X.to_numpy() + return df.to_numpy() def _determine_fit_columns(self, X: pd.DataFrame) -> None: - """ Determine columns used by encoder. + """Determine columns used by encoder. - Note that the implementation also deals with re-fitting the same encoder object with different columns. + Note that the implementation also deals with re-fitting the same encoder object + with different columns. :param X: input data frame :return: none, sets self.cols as a side effect @@ -375,64 +522,70 @@ def _determine_fit_columns(self, X: pd.DataFrame) -> None: else: self.cols = convert_cols_to_list(self.cols) - def get_feature_names(self) -> List[str]: - warnings.warn("`get_feature_names` is deprecated in all of sklearn. 
Use `get_feature_names_out` instead.", - category=FutureWarning) + def get_feature_names(self) -> np.ndarray: + """Deprecated method to get feature names. Use `get_feature_names_out` instead.""" + msg = ( + '`get_feature_names` is deprecated in all of sklearn. ' + 'Use `get_feature_names_out` instead.' + ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) return self.get_feature_names_out() def get_feature_names_out(self, input_features=None) -> np.ndarray: - """ - Returns the names of all transformed / added columns. + """Get the names of all transformed / added columns. - Note that in sklearn the get_feature_names_out function takes the feature_names_in as an argument - and determines the output feature names using the input. A fit is usually not necessary and if so a - NotFittedError is raised. + Note that in sklearn the get_feature_names_out function takes the feature_names_in + as an argument and determines the output feature names using the input. + A fit is usually not necessary and if so a NotFittedError is raised. We just require a fit all the time and return the fitted output columns. Returns ------- feature_names: np.ndarray A numpy array with all feature names transformed or added. - Note: potentially dropped features (because the feature is constant/invariant) are not included! + Note: potentially dropped features (because the feature is constant/invariant) + are not included! """ - out_feats = getattr(self, "feature_names_out_", None) - if not isinstance(out_feats, list): - raise NotFittedError("Estimator has to be fitted to return feature names.") + out_feats = getattr(self, 'feature_names_out_', None) + if not isinstance(out_feats, np.ndarray): + raise NotFittedError('Estimator has to be fitted to return feature names.') else: - return np.array(out_feats, dtype=object) + return out_feats + + def get_feature_names_in(self) -> np.ndarray: + """Get the names of all input columns present when fitting. - def get_feature_names_in(self) -> List[str]: - """ - Returns the names of all input columns present when fitting. These columns are necessary for the transform step. """ - in_feats = getattr(self, "feature_names_in_", None) - if not isinstance(in_feats, list): - raise NotFittedError("Estimator has to be fitted to return feature names.") + in_feats = getattr(self, 'feature_names_in_', None) + if isinstance(in_feats, list): + in_feats = np.array(in_feats) + if not isinstance(in_feats, np.ndarray): + raise NotFittedError('Estimator has to be fitted to return feature names.') else: return in_feats @abstractmethod - def _fit(self, X: pd.DataFrame, y: Optional[pd.Series], **kwargs): - ... + def _fit(self, X: pd.DataFrame, y: pd.Series | None, **kwargs): ... class SupervisedTransformerMixin(sklearn.base.TransformerMixin): + """Mixin for supervised transformers (with target).""" - def _more_tags(self): + def _more_tags(self) -> dict[str, bool]: + """Set scikit transformer tags.""" return {'supervised_encoder': True} - def transform(self, X, y=None, override_return_df=False): + def transform(self, X: X_type, y: y_type | None = None, override_return_df: bool = False): """Perform the transformation to new categorical data. - Some encoders behave differently on whether y is given or not. This is mainly due to regularisation - in order to avoid overfitting. + Some encoders behave differently on whether y is given or not. + This is mainly due to regularisation in order to avoid overfitting. On training data transform should be called with y, on test data without. 
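A sketch of that calling convention with a supervised encoder (toy data):

    import pandas as pd
    from category_encoders import TargetEncoder

    X_train = pd.DataFrame({'c': ['a', 'b', 'a', 'b']})
    y_train = pd.Series([1, 0, 1, 1])
    X_test = pd.DataFrame({'c': ['a', 'b']})

    enc = TargetEncoder(cols=['c'])
    train_enc = enc.fit_transform(X_train, y_train)  # y supplied on training data
    test_enc = enc.transform(X_test)                 # no target at inference time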
Parameters ---------- - X : array-like, shape = [n_samples, n_features] y : array-like, shape = [n_samples] or None override_return_df : bool @@ -440,7 +593,6 @@ def transform(self, X, y=None, override_return_df=False): Returns ------- - p : array or DataFrame, shape = [n_samples, n_features_out] Transformed values with encoding applied. @@ -459,36 +611,32 @@ def transform(self, X, y=None, override_return_df=False): return self._drop_invariants(X, override_return_df) @abstractmethod - def _transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame: - ... + def _transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame: ... - def fit_transform(self, X, y=None, **fit_params): - """ - Encoders that utilize the target must make sure that the training data are transformed with: - transform(X, y) - and not with: - transform(X) + def fit_transform(self, X: X_type, y: y_type | None = None, **fit_params): + """Fit and transform using the target information. + + This also uses the target for transforming, not only for training. """ if y is None: - raise TypeError('fit_transform() missing argument: ''y''') + raise TypeError('fit_transform() missing argument: ' 'y' '') return self.fit(X, y, **fit_params).transform(X, y) class UnsupervisedTransformerMixin(sklearn.base.TransformerMixin): + """Mixin for Transformers without target information.""" - def transform(self, X, override_return_df=False): + def transform(self, X: X_type, override_return_df: bool = False): """Perform the transformation to new categorical data. Parameters ---------- - X : array-like, shape = [n_samples, n_features] override_return_df : bool override self.return_df to force to return a data frame Returns ------- - p : array or DataFrame, shape = [n_samples, n_features_out] Transformed values with encoding applied. @@ -504,22 +652,21 @@ def transform(self, X, override_return_df=False): return self._drop_invariants(X, override_return_df) @abstractmethod - def _transform(self, X) -> pd.DataFrame: - ... + def _transform(self, X: pd.DataFrame) -> pd.DataFrame: ... class TransformerWithTargetMixin: + """Mixin for transformers with target information.""" - def _more_tags(self): + def _more_tags(self) -> dict[str, bool]: + """Set scikit transformer tags.""" return {'supervised_encoder': True} - def fit_transform(self, X, y=None, **fit_params): - """ - Encoders that utilize the target must make sure that the training data are transformed with: - transform(X, y) - and not with: - transform(X) + def fit_transform(self, X: X_type, y: y_type | None = None, **fit_params): + """Fit and transform using target. + + This also uses the target for transforming, not only for training. 
""" if y is None: - raise TypeError('fit_transform() missing argument: ''y''') + raise TypeError('fit_transform() missing argument: ' 'y' '') return self.fit(X, y, **fit_params).transform(X, y) diff --git a/category_encoders/woe.py b/category_encoders/woe.py index a49f6fc2..0bba6d10 100644 --- a/category_encoders/woe.py +++ b/category_encoders/woe.py @@ -1,9 +1,13 @@ -"""Weight of Evidence""" +"""Weight of Evidence.""" + +from __future__ import annotations + import numpy as np -from category_encoders.ordinal import OrdinalEncoder -import category_encoders.utils as util -from sklearn.utils.random import check_random_state import pandas as pd +from sklearn.utils.random import check_random_state + +import category_encoders.utils as util +from category_encoders.ordinal import OrdinalEncoder __author__ = 'Jan Motl' @@ -15,7 +19,6 @@ class WOEEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): Parameters ---------- - verbose: int integer indicating verbosity of the output. 0 for none. cols: list @@ -23,13 +26,15 @@ class WOEEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): drop_invariant: bool boolean for whether or not to drop columns with 0 variance. return_df: bool - boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). + boolean for whether to return a pandas DataFrame from transform + (otherwise it will be a numpy array). handle_missing: str options are 'return_nan', 'error' and 'value', defaults to 'value', which will assume WOE=0. handle_unknown: str options are 'return_nan', 'error' and 'value', defaults to 'value', which will assume WOE=0. randomized: bool, - adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched). + adds normal (Gaussian) distribution noise into training data in order to decrease + overfitting (testing data are untouched). sigma: float standard deviation (spread or "width") of the normal distribution. regularization: float @@ -41,8 +46,16 @@ class WOEEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): >>> from category_encoders import * >>> import pandas as pd >>> from sklearn.datasets import fetch_openml - >>> bunch = fetch_openml(name="house_prices", as_frame=True) - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... 
] >>> y = bunch.target > 200000 >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = WOEEncoder(cols=['CentralAir', 'Heating']).fit(X, y) @@ -51,11 +64,11 @@ class WOEEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 7 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 5 Heating 1460 non-null float64 @@ -71,13 +84,31 @@ class WOEEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html """ + prefit_ordinal = True encoding_relation = util.EncodingRelation.ONE_TO_ONE - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, - handle_unknown='value', handle_missing='value', random_state=None, randomized=False, sigma=0.05, regularization=1.0): - super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, - handle_unknown=handle_unknown, handle_missing=handle_missing) + def __init__( + self, + verbose=0, + cols=None, + drop_invariant=False, + return_df=True, + handle_unknown='value', + handle_missing='value', + random_state=None, + randomized=False, + sigma=0.05, + regularization=1.0, + ): + super().__init__( + verbose=verbose, + cols=cols, + drop_invariant=drop_invariant, + return_df=return_df, + handle_unknown=handle_unknown, + handle_missing=handle_missing, + ) self.ordinal_encoder = None self._sum = None self._count = None @@ -91,19 +122,28 @@ def _fit(self, X, y, **kwargs): y = pd.Series(y) unique = y.unique() if len(unique) != 2: - raise ValueError("The target column y must be binary. But the target contains " + str(len(unique)) + " unique value(s).") + raise ValueError( + 'The target column y must be binary. But the target contains ' + + str(len(unique)) + + ' unique value(s).' + ) if y.isna().any(): - raise ValueError("The target column y must not contain missing values.") + raise ValueError('The target column y must not contain missing values.') if np.max(unique) < 1: - raise ValueError("The target column y must be binary with values {0, 1}. Value 1 was not found in the target.") + msg = ( + 'The target column y must be binary with values {0, 1}. ' + 'Value 1 was not found in the target.' + ) + raise ValueError(msg) if np.min(unique) > 0: - raise ValueError("The target column y must be binary with values {0, 1}. Value 0 was not found in the target.") + msg = ( + 'The target column y must be binary with values {0, 1}. ' + 'Value 0 was not found in the target.' + ) + raise ValueError(msg) self.ordinal_encoder = OrdinalEncoder( - verbose=self.verbose, - cols=self.cols, - handle_unknown='value', - handle_missing='value' + verbose=self.verbose, cols=self.cols, handle_unknown='value', handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) X_ordinal = self.ordinal_encoder.transform(X) @@ -139,8 +179,10 @@ def _train(self, X, y): # Create a new column with regularized WOE. # Regularization helps to avoid division by zero. # Pre-calculate WOEs because logarithms are slow. 
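An equivalent standalone formulation of the regularised statistic computed below, for a category with `pos` positive rows out of `n` (names illustrative):

    import numpy as np

    def woe(pos, n, total_pos, total_n, reg=1.0):
        p_pos = (pos + reg) / (total_pos + 2 * reg)                # regularised P(x | y=1)
        p_neg = (n - pos + reg) / (total_n - total_pos + 2 * reg)  # regularised P(x | y=0)
        return np.log(p_pos / p_neg)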
- nominator = (stats['sum'] + self.regularization) / (self._sum + 2*self.regularization) - denominator = ((stats['count'] - stats['sum']) + self.regularization) / (self._count - self._sum + 2*self.regularization) + nominator = (stats['sum'] + self.regularization) / (self._sum + 2 * self.regularization) + denominator = ((stats['count'] - stats['sum']) + self.regularization) / ( + self._count - self._sum + 2 * self.regularization + ) woe = np.log(nominator / denominator) # Ignore unique values. This helps to prevent overfitting on id-like columns. @@ -169,6 +211,6 @@ def _score(self, X, y): # Randomization is meaningful only for training data -> we do it only if y is present if self.randomized and y is not None: random_state_generator = check_random_state(self.random_state) - X[col] = (X[col] * random_state_generator.normal(1., self.sigma, X[col].shape[0])) + X[col] = X[col] * random_state_generator.normal(1.0, self.sigma, X[col].shape[0]) return X diff --git a/category_encoders/wrapper.py b/category_encoders/wrapper.py index 015b602a..adb7c1b7 100644 --- a/category_encoders/wrapper.py +++ b/category_encoders/wrapper.py @@ -1,10 +1,15 @@ +"""Module for wrappers that add extra functionality to encoders.""" + +from __future__ import annotations + import copy -from category_encoders import utils + +import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin from sklearn.model_selection import StratifiedKFold + import category_encoders as encoders -import pandas as pd -from typing import Dict, Optional +from category_encoders import utils class PolynomialWrapper(BaseEstimator, TransformerMixin): @@ -23,7 +28,6 @@ class PolynomialWrapper(BaseEstimator, TransformerMixin): Parameters ---------- - feature_encoder: Object an instance of a supervised encoder. @@ -34,10 +38,18 @@ class PolynomialWrapper(BaseEstimator, TransformerMixin): >>> import pandas as pd >>> from sklearn.datasets import fetch_openml >>> from category_encoders.wrapper import PolynomialWrapper - >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"] - >>> bunch = fetch_openml(name="house_prices", as_frame=True) + >>> display_cols = [ + ... 'Id', + ... 'MSSubClass', + ... 'MSZoning', + ... 'LotFrontage', + ... 'YearBuilt', + ... 'Heating', + ... 'CentralAir', + ... 
] + >>> bunch = fetch_openml(name='house_prices', as_frame=True) >>> # need more than one column - >>> y = bunch.target.map(lambda x: int(min([x, 300000])/50000)) + >>> y = bunch.target.map(lambda x: int(min([x, 300000]) / 50000)) >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols] >>> enc = TargetEncoder(cols=['CentralAir', 'Heating']) >>> wrapper = PolynomialWrapper(enc) @@ -46,11 +58,11 @@ class PolynomialWrapper(BaseEstimator, TransformerMixin): RangeIndex: 1460 entries, 0 to 1459 Data columns (total 17 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 1460 non-null float64 1 MSSubClass 1460 non-null float64 - 2 MSZoning 1460 non-null object + 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 YearBuilt 1460 non-null float64 5 CentralAir_3 1460 non-null float64 @@ -71,21 +83,25 @@ class PolynomialWrapper(BaseEstimator, TransformerMixin): """ def __init__(self, feature_encoder: utils.BaseEncoder): + """Init the polynomial wrapper.""" self.feature_encoder: utils.BaseEncoder = feature_encoder - self.feature_encoders: Dict[str, utils.BaseEncoder] = {} - self.label_encoder: Optional[encoders.OneHotEncoder] = None + self.feature_encoders: dict[str, utils.BaseEncoder] = {} + self.label_encoder: encoders.OneHotEncoder | None = None def fit(self, X, y, **kwargs): + """Fit a multi-label encoder.""" # unite the input into pandas types X, y = utils.convert_inputs(X, y) y = pd.DataFrame(y.rename('target')) # apply one-hot-encoder on the label - self.label_encoder = encoders.OneHotEncoder(handle_missing='error', - handle_unknown='error', - cols=['target'], - drop_invariant=True, - use_cat_names=True) + self.label_encoder = encoders.OneHotEncoder( + handle_missing='error', + handle_unknown='error', + cols=['target'], + drop_invariant=True, + use_cat_names=True, + ) labels = self.label_encoder.fit_transform(y) labels.columns = [column[7:] for column in labels.columns] labels = labels.iloc[:, 1:] # drop one label @@ -96,6 +112,7 @@ def fit(self, X, y, **kwargs): self.feature_encoders[class_name] = copy.deepcopy(self.feature_encoder).fit(X, label) def transform(self, X, y=None): + """Encode new data.""" # unite the input into pandas types X = utils.convert_input(X) @@ -106,42 +123,50 @@ def transform(self, X, y=None): # transform the features if y is not None: - y = self.label_encoder.transform(pd.DataFrame({"target": y})) + y = self.label_encoder.transform(pd.DataFrame({'target': y})) for class_name, feature_encoder in self.feature_encoders.items(): if y is not None: - y_transform = y[f"target_{class_name}"] + y_transform = y[f'target_{class_name}'] else: y_transform = None encoded = feature_encoder.transform(X, y_transform) # decorate the encoded features with the label class suffix new_features = encoded[feature_encoder.cols] - new_features.columns = [str(column) + '_' + class_name for column in new_features.columns] + new_features.columns = [ + str(column) + '_' + class_name for column in new_features.columns + ] all_new_features = pd.concat((all_new_features, new_features), axis=1) # add features that were not encoded - result = pd.concat((encoded[encoded.columns[~encoded.columns.isin(feature_encoder.cols)]], - all_new_features), axis=1) + result = pd.concat( + ( + encoded[encoded.columns[~encoded.columns.isin(feature_encoder.cols)]], + all_new_features, + ), + axis=1, + ) return result def fit_transform(self, X, y=None, **fit_params): + """Fit encoder and 
encode the training data."""
         self.fit(X, y, **fit_params)
         return self.transform(X, y)


 class NestedCVWrapper(BaseEstimator, TransformerMixin):
-    """
-    Extends supervised encoders with the nested cross validation on the training data to minimise overfitting.
+    """Cross validate supervised encoders to avoid overfitting.

     For a validation or a test set, supervised encoders can be used as follows:

         X_train_encoded = encoder.fit_transform(X_train, y_train)
         X_valid_encoded = encoder.transform(X_valid)

-    However, the downstream model will be overfitting to the encoded training data due to target leakage.
-    Using out-of-fold encodings is an effective way to prevent target leakage. This is equivalent to:
+    However, the downstream model will overfit to the encoded training data due to
+    target leakage. Using out-of-fold encodings is an effective way to prevent target leakage.
+    This is equivalent to:

         X_train_encoded = np.zeros(X.shape)
         for trn, val in kfold.split(X, y):
@@ -153,20 +178,22 @@ class NestedCVWrapper(BaseEstimator, TransformerMixin):

     See README.md for a list of supervised encoders.

-    Discussion: Although leave-one-out encoder internally performs leave-one-out cross-validation, it is
-    actually the most overfitting supervised model in our library. To illustrate the issue, let's imagine we
-    have a totally unpredictive nominal feature and a perfectly balanced binary label. A supervised encoder
-    should encode the feature into a constant vector as the feature is unpredictive of the label. But when we
-    use leave-one-out cross-validation, the label ratio cease to be perfectly balanced and the wrong class
-    label always becomes the majority in the training fold. Leave-one-out encoder returns a seemingly
-    predictive feature. And the downstream model starts to overfit to the encoded feature. Unfortunately,
-    even 10-fold cross-validation is not immune to this effect:
+    Discussion: Although leave-one-out encoder internally performs leave-one-out cross-validation,
+    it is actually the most overfitting supervised model in our library.
+    To illustrate the issue, let's imagine we have a totally unpredictive nominal feature
+    and a perfectly balanced binary label. A supervised encoder should encode the feature into a
+    constant vector as the feature is unpredictive of the label.
+    But when we use leave-one-out cross-validation, the label ratio ceases to be perfectly balanced
+    and the wrong class label always becomes the majority in the training fold.
+    Leave-one-out encoder returns a seemingly predictive feature.
+    The downstream model then starts to overfit to the encoded feature.
+    Unfortunately, even 10-fold cross-validation is not immune to this effect:

     http://www.kdd.org/exploration_files/v12-02-4-UR-Perlich.pdf

-    To decrease the effect, it is recommended to use a low count of the folds. And that is the reason why
-    this wrapper uses 5 folds by default.
+    To decrease the effect, it is recommended to use a small number of folds.
+    That is why this wrapper uses 5 folds by default.

-    Based on the empirical results, only LeaveOneOutEncoder benefits greatly from this wrapper. The remaining
-    encoders can be used without this wrapper.
+    Based on the empirical results, only LeaveOneOutEncoder benefits greatly from this wrapper.
+    The remaining encoders can be used without this wrapper.


     Parameters
@@ -175,13 +202,16 @@ class NestedCVWrapper(BaseEstimator, TransformerMixin):
         an instance of a supervised encoder.

cv: int or sklearn cv Object
-        if an int is given, StratifiedKFold is used by default, where the int is the number of folds.
+        if an int is given, StratifiedKFold is used by default, where the int
+        is the number of folds.

     shuffle: boolean, optional
-        whether to shuffle each classes samples before splitting into batches. Ignored if a CV method is provided.
+        whether to shuffle each class's samples before splitting into batches.
+        Ignored if a CV method is provided.

     random_state: int, RandomState instance or None, optional, default=None
-        if int, random_state is the seed used by the random number generator. Ignored if a CV method is provided.
+        if int, random_state is the seed used by the random number generator.
+        Ignored if a CV method is provided.


     Example
     -------
     >>> from category_encoders import *
     >>> import pandas as pd
     >>> from category_encoders.wrapper import NestedCVWrapper
     >>> from sklearn.datasets import fetch_openml
     >>> from sklearn.model_selection import GroupKFold, train_test_split
-    >>> bunch = fetch_openml(name="house_prices", as_frame=True)
-    >>> display_cols = ["Id", "MSSubClass", "MSZoning", "LotFrontage", "YearBuilt", "Heating", "CentralAir"]
+    >>> bunch = fetch_openml(name='house_prices', as_frame=True)
+    >>> display_cols = [
+    ...     'Id',
+    ...     'MSSubClass',
+    ...     'MSZoning',
+    ...     'LotFrontage',
+    ...     'YearBuilt',
+    ...     'Heating',
+    ...     'CentralAir',
+    ... ]
     >>> y = bunch.target > 200000
     >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)[display_cols]
     >>> X_train, X_test, y_train, _ = train_test_split(X, y, random_state=42)
@@ -200,16 +238,18 @@ class NestedCVWrapper(BaseEstimator, TransformerMixin):
     >>> # Define the nested CV encoder for a supervised encoder
     >>> enc_nested = NestedCVWrapper(TargetEncoder(cols=['CentralAir', 'Heating']), random_state=42)
     >>> # Encode the X data for train, valid & test
-    >>> X_train_enc, X_valid_enc, X_test_enc = enc_nested.fit_transform(X_train, y_train, X_test=(X_valid, X_test))
+    >>> X_train_enc, X_valid_enc, X_test_enc = enc_nested.fit_transform(
+    ...     X_train, y_train, X_test=(X_valid, X_test)
+    ... 
) >>> print(X_train_enc.info()) Int64Index: 821 entries, 1390 to 896 Data columns (total 7 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- + # Column Non-Null Count Dtype + --- ------ -------------- ----- 0 Id 821 non-null float64 1 MSSubClass 821 non-null float64 - 2 MSZoning 821 non-null object + 2 MSZoning 821 non-null object 3 LotFrontage 672 non-null float64 4 YearBuilt 821 non-null float64 5 Heating 821 non-null float64 @@ -220,6 +260,7 @@ class NestedCVWrapper(BaseEstimator, TransformerMixin): """ def __init__(self, feature_encoder, cv=5, shuffle=True, random_state=None): + """Init wrapper.""" self.feature_encoder = feature_encoder self.__name__ = feature_encoder.__class__.__name__ self.shuffle = shuffle @@ -231,32 +272,32 @@ def __init__(self, feature_encoder, cv=5, shuffle=True, random_state=None): self.cv = cv def fit(self, X, y, **kwargs): - """ - Calls fit on the base feature_encoder without nested cross validation - """ + """Fit on the base feature_encoder without nested cross validation.""" self.feature_encoder.fit(X, y, **kwargs) def transform(self, X): - """ - Calls transform on the base feature_encoder without nested cross validation - """ + """Transform on the base feature_encoder without nested cross validation.""" return self.feature_encoder.transform(X) def fit_transform(self, X, y=None, X_test=None, groups=None, **fit_params): - """ - Creates unbiased encodings from a supervised encoder as well as infer encodings on a test set + """Unbiased encodings from a supervised encoder and inference on test set. + :param X: array-like, shape = [n_samples, n_features] - Training vectors for the supervised encoder, where n_samples is the number of samples + Training vectors for the supervised encoder, + where n_samples is the number of samples and n_features is the number of features. :param y: array-like, shape = [n_samples] Target values for the supervised encoder. - :param X_test, optional: array-like, shape = [m_samples, n_features] or a tuple of array-likes (X_test, X_valid...) - Vectors to be used for inference by an encoder (e.g. test or validation sets) trained on the + :param X_test, optional: array-like, shape = [m_samples, n_features] + or a tuple of array-likes (X_test, X_valid...) + Vectors to be used for inference by an encoder + (e.g. test or validation sets) trained on the full X & y sets. No nested folds are used here :param groups: Groups to be passed to the cv method, e.g. for GroupKFold :param fit_params: :return: array, shape = [n_samples, n_numeric + N] - Transformed values with encoding applied. Returns multiple arrays if X_test is not None + Transformed values with encoding applied. + Returns multiple arrays if X_test is not None """ X, y = utils.convert_inputs(X, y) @@ -276,9 +317,9 @@ def fit_transform(self, X, y=None, X_test=None, groups=None, **fit_params): return out_of_fold else: if isinstance(X_test, tuple): - encoded_data = (out_of_fold, ) + encoded_data = (out_of_fold,) for dataset in X_test: - encoded_data = encoded_data + (self.feature_encoder.transform(dataset), ) + encoded_data = encoded_data + (self.feature_encoder.transform(dataset),) return encoded_data else: return out_of_fold, self.feature_encoder.transform(X_test) diff --git a/docs/source/conf.py b/docs/source/conf.py index 66344bbe..4026e18d 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,20 +13,27 @@ # All configuration values have a default; values that are commented out # serve to show the default. 
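The out-of-fold scheme that `NestedCVWrapper` implements above can also be spelled out by hand. A minimal sketch, assuming a toy nominal column and a binary target (the data and column names here are illustrative, not taken from the library):

```python
import pandas as pd
import category_encoders as encoders
from sklearn.model_selection import StratifiedKFold

# illustrative toy data: one nominal feature, binary target
X = pd.DataFrame({'color': ['red', 'blue', 'red', 'green'] * 25})
y = pd.Series([0, 1, 1, 0] * 25)

# each row is encoded by an encoder that never saw that row's target during fitting
out_of_fold = pd.DataFrame(index=X.index, columns=['color'], dtype=float)
for trn, val in StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X, y):
    enc = encoders.TargetEncoder(cols=['color'])
    enc.fit(X.iloc[trn], y.iloc[trn])
    out_of_fold.iloc[val] = enc.transform(X.iloc[val]).to_numpy()
```

The wrapper does essentially this in `fit_transform`, and then refits the encoder on the full training data for transforming `X_test`.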
-import sys import os -import shlex +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, str(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + os.sep + '') -sys.path.insert(0, str(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + os.sep + 'category_encoders') +# sys.path.insert(0, os.path.abspath('.')) +sys.path.insert( + 0, + str(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + os.sep + '', +) +sys.path.insert( + 0, + str(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + + os.sep + + 'category_encoders', +) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom @@ -37,7 +44,7 @@ 'sphinx.ext.viewcode', 'sphinx.ext.coverage', 'sphinx.ext.doctest', - 'numpydoc' + 'numpydoc', ] # Add any paths that contain templates here, relative to this directory. @@ -49,15 +56,15 @@ source_suffix = '.rst' # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. master_doc = 'index' # General information about the project. project = 'Category Encoders' -copyright = '2022, Will McGinnis' -author = 'Will McGinnis' +copyright = '2024, Paul Westenthanner, Will McGinnis' +author = 'Paul Westenthanner, Will McGinnis' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -65,6 +72,7 @@ # # The short X.Y version. from category_encoders import __version__ + version = __version__ # The full version, including alpha/beta/rc tags. release = __version__ @@ -78,9 +86,9 @@ # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -88,7 +96,7 @@ # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. add_function_parentheses = True @@ -105,10 +113,10 @@ pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False +# keep_warnings = False # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -123,26 +131,26 @@ # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. 
-#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -152,34 +160,34 @@ # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. html_show_sphinx = True @@ -190,24 +198,24 @@ # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Language to be used for generating the HTML full-text search index. # Sphinx supports the following languages: # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' -#html_search_language = 'en' +# html_search_language = 'en' # A dictionary with options for the search language support, empty by default. # Now only 'ja' uses this config value -#html_search_options = {'type': 'default'} +# html_search_options = {'type': 'default'} # The name of a javascript file (relative to the configuration directory) that # implements a search results scorer. If empty, the default will be used. -#html_search_scorer = 'scorer.js' +# html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. 
htmlhelp_basename = 'CategoryEncodersdoc' @@ -215,59 +223,58 @@ # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', - -# Latex figure (float) alignment -#'figure_align': 'htbp', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', + # Latex figure (float) alignment + #'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'CategoryEncoders.tex', 'Category Encoders Documentation', - 'Will McGinnis', 'manual'), + ( + master_doc, + 'CategoryEncoders.tex', + 'Category Encoders Documentation', + 'Will McGinnis', + 'manual', + ), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'categoryencoders', 'Category Encoders Documentation', - [author], 1) -] +man_pages = [(master_doc, 'categoryencoders', 'Category Encoders Documentation', [author], 1)] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -276,19 +283,25 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'CategoryEncoders', 'Category Encoders Documentation', - author, 'CategoryEncoders', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + 'CategoryEncoders', + 'Category Encoders Documentation', + author, + 'CategoryEncoders', + 'One line description of project.', + 'Miscellaneous', + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. 
-#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False diff --git a/examples/benchmarking/benchmarking.py b/examples/benchmarking/benchmarking.py index a2d6171a..53d5fcd7 100644 --- a/examples/benchmarking/benchmarking.py +++ b/examples/benchmarking/benchmarking.py @@ -1,7 +1,9 @@ -from memory_profiler import profile import gc + import category_encoders as ce -from examples.source_data.loaders import get_mushroom_data, get_cars_data, get_splice_data +from memory_profiler import profile + +from examples.source_data.loaders import get_mushroom_data __author__ = 'willmcginnis' @@ -110,7 +112,8 @@ def control(): X, _, _ = get_mushroom_data() del X + if __name__ == '__main__': gc.collect() leaveoneout() - gc.collect() \ No newline at end of file + gc.collect() diff --git a/examples/benchmarking/reporting.py b/examples/benchmarking/reporting.py index 71902c29..dfb7ec88 100644 --- a/examples/benchmarking/reporting.py +++ b/examples/benchmarking/reporting.py @@ -15,5 +15,12 @@ # normalize compression df['compression (smaller better)'] = df['final_df_size(MB)'] / df['initial_df_size(MB)'] - df = df.reindex(columns=['dataset', 'version', 'memory_factor (smaller better)', 'compression (smaller better)']) - print(df) \ No newline at end of file + df = df.reindex( + columns=[ + 'dataset', + 'version', + 'memory_factor (smaller better)', + 'compression (smaller better)', + ] + ) + print(df) diff --git a/examples/benchmarking_cpu/benchmarking_cpu.py b/examples/benchmarking_cpu/benchmarking_cpu.py index f4e7bc20..29f9f435 100644 --- a/examples/benchmarking_cpu/benchmarking_cpu.py +++ b/examples/benchmarking_cpu/benchmarking_cpu.py @@ -1,9 +1,10 @@ -import psutil -import time -import pandas as pd -import numpy as np import multiprocessing +import time + import category_encoders as encoders +import numpy as np +import pandas as pd +import psutil import tests.helpers as th __author__ = 'LiuShulun' @@ -28,7 +29,15 @@ data_lines = 10000 # benchmarking result format -result_cols = ['encoder', 'used_processes', 'X_shape', 'min_time(s)', 'average_time(s)', 'max_cpu_utilization(%)', 'average_cpu_utilization(%)'] +result_cols = [ + 'encoder', + 'used_processes', + 'X_shape', + 'min_time(s)', + 'average_time(s)', + 'max_cpu_utilization(%)', + 'average_cpu_utilization(%)', +] results = [] cpu_utilization = multiprocessing.Manager().Queue() @@ -38,7 +47,16 @@ X = th.create_dataset(n_rows=data_lines) X_t = th.create_dataset(n_rows=int(data_lines / 2), extras=True) -cols = ['unique_str', 'underscore', 'extra', 'none', 'invariant', 321, 'categorical', 'na_categorical'] +cols = [ + 'unique_str', + 'underscore', + 'extra', + 'none', + 'invariant', + 321, + 'categorical', + 'na_categorical', +] def get_cpu_utilization(): @@ -65,7 +83,7 @@ def get_cpu_utilization(): rsl = [encoder_name, index + 1, X.shape] if encoder_name == 'HashingEncoder': - enc = encoders.HashingEncoder(max_process=index+1, cols=cols) + enc = encoders.HashingEncoder(max_process=index + 1, cols=cols) else: enc = getattr(encoders, encoder_name)(cols=cols) diff --git a/examples/benchmarking_large/arff_loader.py b/examples/benchmarking_large/arff_loader.py index c5d17b78..2a6824e6 100644 --- a/examples/benchmarking_large/arff_loader.py +++ b/examples/benchmarking_large/arff_loader.py @@ -3,16 +3,20 @@ import pandas as pd import requests - """ Read data in arff format from URL. 
E.g.: arff_loader.load('car.arff') """ + + def load(file_name): # Load ARFF from web - response = requests.get('https://raw.githubusercontent.com/renatopp/arff-datasets/master/classification/' + file_name) + response = requests.get( + 'https://raw.githubusercontent.com/renatopp/arff-datasets/master/classification/' + + file_name + ) html = response.text arff_f = arff.loads(html) @@ -80,7 +84,7 @@ def load(file_name): # Justification: OneHotEncoding and TargetEncoder work only with binary numerical output. # Approach: Take a majority class as 1 and the rest as 0. majority_class = y_unique[np.argmax(y_counts)] - df[target] = (df[target]==majority_class).astype('uint8') + df[target] = (df[target] == majority_class).astype('uint8') # Determine the count of folds that is not going to cause issues. # We identify the least common class label and then return min(10, minority_class_count). @@ -101,4 +105,3 @@ def load(file_name): pass return X, y, fold_count - diff --git a/examples/benchmarking_large/benchmarking_large.py b/examples/benchmarking_large/benchmarking_large.py index d5d5a7d9..e4582520 100644 --- a/examples/benchmarking_large/benchmarking_large.py +++ b/examples/benchmarking_large/benchmarking_large.py @@ -5,38 +5,52 @@ Note: A reasonably recent version of sklearn is required to run GradientBoostingClassifier and MLPClassifier. """ + import os -import pandas as pd +import category_encoders import numpy as np +import pandas as pd +from examples.benchmarking_large import arff_loader +from examples.benchmarking_large.util import train_encoder, train_model from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier -from sklearn.linear_model import LogisticRegression -from sklearn.linear_model import SGDClassifier +from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.naive_bayes import GaussianNB from sklearn.neighbors import KNeighborsClassifier from sklearn.neural_network import MLPClassifier from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier -import category_encoders -from examples.benchmarking_large import arff_loader -from examples.benchmarking_large.util import train_model, train_encoder - # The settings are taken from: # Data-driven advice for applying machine learning to bioinformatics problems, Olson et al. # Following models have high variance of results: SGD, SVC and DecisionTree. That is not a big deal. # Just be careful during the result interpretation. # Also, following models are slow because of their configuration: GradientBoosting and RandomForest. # SGD and DecisionTree benefit from stronger regularization. 
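One concrete reading of the "stronger regularization" remark above: for `SGDClassifier`, that would mean raising `alpha` above scikit-learn's default of `1e-4`. A hypothetical variant, not one of the configurations the benchmark defines below:

```python
from sklearn.linear_model import SGDClassifier

# same settings as the benchmark's SGD entry, but with a 10x larger penalty term
sgd_stronger_reg = SGDClassifier(
    loss='modified_huber', alpha=1e-3, max_iter=50, tol=1e-3, random_state=2001
)
```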
-models = [SGDClassifier(loss='modified_huber', max_iter=50, tol=1e-3, random_state=2001), - LogisticRegression(C=1.5, penalty='l1', fit_intercept=True, solver='liblinear'), # ElasticNet would rid us of overflows in the model - SVC(kernel='poly', probability=True, C=0.01, gamma=0.1, degree=3, coef0=10.0, random_state=2001), - KNeighborsClassifier(), - GaussianNB(), - DecisionTreeClassifier(max_depth=4, random_state=2001), - GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=500, max_depth=3, max_features='log2', random_state=2001), - RandomForestClassifier(n_estimators=500, max_features=0.25, criterion='entropy', random_state=2001), - MLPClassifier(max_iter=200, n_iter_no_change=2, tol=1e-3, random_state=2001)] +models = [ + SGDClassifier(loss='modified_huber', max_iter=50, tol=1e-3, random_state=2001), + LogisticRegression( + C=1.5, penalty='l1', fit_intercept=True, solver='liblinear' + ), # ElasticNet would rid us of overflows in the model + SVC( + kernel='poly', probability=True, C=0.01, gamma=0.1, degree=3, coef0=10.0, random_state=2001 + ), + KNeighborsClassifier(), + GaussianNB(), + DecisionTreeClassifier(max_depth=4, random_state=2001), + GradientBoostingClassifier( + loss='deviance', + learning_rate=0.1, + n_estimators=500, + max_depth=3, + max_features='log2', + random_state=2001, + ), + RandomForestClassifier( + n_estimators=500, max_features=0.25, criterion='entropy', random_state=2001 + ), + MLPClassifier(max_iter=200, n_iter_no_change=2, tol=1e-3, random_state=2001), +] # We use Arff datasets on GitHub. But once OpenML loader will be part of scikit-learn: # https://github.com/scikit-learn/scikit-learn/pull/11419 @@ -69,30 +83,61 @@ # sponge.arff Large impact # tic-tac-toe.arff # trains.arff Medium impact (tiny dataset -> with high variance) -datasets = ['audiology.arff', 'autos.arff', 'breast.cancer.arff', 'bridges.version1.arff', 'bridges.version2.arff', 'car.arff', - 'colic.arff', 'credit.a.arff', 'credit.g.arff', 'cylinder.bands.arff', 'flags.arff', 'heart.c.arff', 'heart.h.arff', - 'hepatitis.arff', 'hypothyroid.arff', 'kr.vs.kp.arff', 'labor.arff', 'lymph.arff', 'mushroom.arff', 'nursery.arff', - 'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff', - 'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff'] +datasets = [ + 'audiology.arff', + 'autos.arff', + 'breast.cancer.arff', + 'bridges.version1.arff', + 'bridges.version2.arff', + 'car.arff', + 'colic.arff', + 'credit.a.arff', + 'credit.g.arff', + 'cylinder.bands.arff', + 'flags.arff', + 'heart.c.arff', + 'heart.h.arff', + 'hepatitis.arff', + 'hypothyroid.arff', + 'kr.vs.kp.arff', + 'labor.arff', + 'lymph.arff', + 'mushroom.arff', + 'nursery.arff', + 'postoperative.patient.data.arff', + 'primary.tumor.arff', + 'sick.arff', + 'solar.flare1.arff', + 'solar.flare2.arff', + 'soybean.arff', + 'spectrometer.arff', + 'sponge.arff', + 'tic-tac-toe.arff', + 'trains.arff', + 'vote.arff', + 'vowel.arff', +] # We painstakingly initialize each encoder here because that gives us the freedom to initialize the # encoders with any setting we want. 
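As a hedged illustration of what "any setting we want" can look like (the values here are arbitrary, unlike the list below, which initializes every encoder with its defaults):

```python
import category_encoders

# purely illustrative non-default settings
custom_encoders = [
    category_encoders.TargetEncoder(smoothing=10.0, min_samples_leaf=20),
    category_encoders.BaseNEncoder(base=4),
    category_encoders.HashingEncoder(n_components=16),
]
```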
-encoders = [category_encoders.BackwardDifferenceEncoder(), - category_encoders.BaseNEncoder(), - category_encoders.BinaryEncoder(), - category_encoders.CatBoostEncoder(), - category_encoders.CountEncoder(), - category_encoders.HashingEncoder(), - category_encoders.HelmertEncoder(), - category_encoders.JamesSteinEncoder(), - category_encoders.LeaveOneOutEncoder(), - category_encoders.MEstimateEncoder(), - category_encoders.OneHotEncoder(), - category_encoders.OrdinalEncoder(), - category_encoders.PolynomialEncoder(), - category_encoders.SumEncoder(), - category_encoders.TargetEncoder(), - category_encoders.WOEEncoder()] +encoders = [ + category_encoders.BackwardDifferenceEncoder(), + category_encoders.BaseNEncoder(), + category_encoders.BinaryEncoder(), + category_encoders.CatBoostEncoder(), + category_encoders.CountEncoder(), + category_encoders.HashingEncoder(), + category_encoders.HelmertEncoder(), + category_encoders.JamesSteinEncoder(), + category_encoders.LeaveOneOutEncoder(), + category_encoders.MEstimateEncoder(), + category_encoders.OneHotEncoder(), + category_encoders.OrdinalEncoder(), + category_encoders.PolynomialEncoder(), + category_encoders.SumEncoder(), + category_encoders.TargetEncoder(), + category_encoders.WOEEncoder(), +] # Initialization if os.path.isfile('./output/result.csv'): @@ -110,21 +155,53 @@ # X, y, fold_count, nominal_columns = csv_loader.load(dataset_name) non_numeric = list(X.select_dtypes(exclude=[np.number]).columns) for encoder in encoders: - print("Encoding:", dataset_name, y.name, encoder.__class__.__name__) + print('Encoding:', dataset_name, y.name, encoder.__class__.__name__) folds, fit_encoder_time, score_encoder_time = train_encoder(X, y, fold_count, encoder) for model in models: print('Evaluating:', dataset_name, encoder.__class__.__name__, model.__class__.__name__) scores, fit_model_time, score_model_time = train_model(folds, model) # Log into csv - result = pd.DataFrame([dataset_name, y.name, encoder.__class__.__name__, encoder.__dict__, model.__class__.__name__, X.shape[1], - folds[0][0].shape[1], fit_encoder_time, score_encoder_time, fit_model_time, score_model_time] - + list(scores)).T + result = pd.DataFrame( + [ + dataset_name, + y.name, + encoder.__class__.__name__, + encoder.__dict__, + model.__class__.__name__, + X.shape[1], + folds[0][0].shape[1], + fit_encoder_time, + score_encoder_time, + fit_model_time, + score_model_time, + ] + + list(scores) + ).T if not os.path.isfile('./output/result.csv'): - result.to_csv('./output/result.csv', - header=['dataset', 'target', 'encoder', 'encoder_setting', 'model', 'input_features', 'output_features', - 'fit_encoder_time', 'score_encoder_time', 'fit_model_time', 'score_model_time', 'test_matthews', - 'train_matthews', 'test_auc', 'train_auc', 'test_brier', 'train_brier'], index=False) + result.to_csv( + './output/result.csv', + header=[ + 'dataset', + 'target', + 'encoder', + 'encoder_setting', + 'model', + 'input_features', + 'output_features', + 'fit_encoder_time', + 'score_encoder_time', + 'fit_model_time', + 'score_model_time', + 'test_matthews', + 'train_matthews', + 'test_auc', + 'train_auc', + 'test_brier', + 'train_brier', + ], + index=False, + ) else: result.to_csv('./output/result.csv', mode='a', header=False, index=False) diff --git a/examples/benchmarking_large/catboost_comparison.py b/examples/benchmarking_large/catboost_comparison.py index 99c2a141..29ae2abd 100644 --- a/examples/benchmarking_large/catboost_comparison.py +++ b/examples/benchmarking_large/catboost_comparison.py @@ 
-2,15 +2,15 @@ Compare performance of CatBoost internal categorical encoding with our categorical encoding. Conclusion: CatBoost beats our encoders by large margin. """ -import os -import pandas as pd -import numpy as np -from catboost import Pool, cv, CatBoostClassifier +import os import category_encoders -from examples.benchmarking_large import arff_loader, csv_loader -from examples.benchmarking_large.util import train_model, train_encoder +import numpy as np +import pandas as pd +from catboost import CatBoostClassifier, Pool, cv +from examples.benchmarking_large import csv_loader +from examples.benchmarking_large.util import train_encoder, train_model # The settings are taken from: # Data-driven advice for applying machine learning to bioinformatics problems, Olson et al. @@ -47,34 +47,75 @@ # sponge.arff Large impact # tic-tac-toe.arff # trains.arff Medium impact (tiny dataset -> with high variance) -datasets = [#'audiology.arff', - 'autos.arff', 'breast.cancer.arff', 'bridges.version1.arff', 'bridges.version2.arff', 'car.arff', - 'colic.arff', 'credit.a.arff', 'credit.g.arff', 'cylinder.bands.arff', 'flags.arff', 'heart.c.arff', 'heart.h.arff', - 'hepatitis.arff', 'hypothyroid.arff', 'kr.vs.kp.arff', 'labor.arff', 'lymph.arff', 'mushroom.arff', 'nursery.arff', - 'postoperative.patient.data.arff', 'primary.tumor.arff', 'sick.arff', 'solar.flare1.arff', 'solar.flare2.arff', - 'soybean.arff', 'spectrometer.arff', 'sponge.arff', 'tic-tac-toe.arff', 'trains.arff', 'vote.arff', 'vowel.arff'] - -datasets = ['carvana.csv', 'erasmus.csv', 'internetusage.csv', 'ipumsla97small.csv', 'kobe.csv', 'pbcseq.csv', 'phpvcoG8S.csv', 'westnile.csv'] # amazon is too large... +datasets = [ #'audiology.arff', + 'autos.arff', + 'breast.cancer.arff', + 'bridges.version1.arff', + 'bridges.version2.arff', + 'car.arff', + 'colic.arff', + 'credit.a.arff', + 'credit.g.arff', + 'cylinder.bands.arff', + 'flags.arff', + 'heart.c.arff', + 'heart.h.arff', + 'hepatitis.arff', + 'hypothyroid.arff', + 'kr.vs.kp.arff', + 'labor.arff', + 'lymph.arff', + 'mushroom.arff', + 'nursery.arff', + 'postoperative.patient.data.arff', + 'primary.tumor.arff', + 'sick.arff', + 'solar.flare1.arff', + 'solar.flare2.arff', + 'soybean.arff', + 'spectrometer.arff', + 'sponge.arff', + 'tic-tac-toe.arff', + 'trains.arff', + 'vote.arff', + 'vowel.arff', +] + +datasets = [ + 'carvana.csv', + 'erasmus.csv', + 'internetusage.csv', + 'ipumsla97small.csv', + 'kobe.csv', + 'pbcseq.csv', + 'phpvcoG8S.csv', + 'westnile.csv', +] # amazon is too large... # We painstakingly initialize each encoder here because that gives us the freedom to initialize the # encoders with any setting we want. 
-encoders = [ #category_encoders.BackwardDifferenceEncoder(), - category_encoders.BaseNEncoder(), - category_encoders.BinaryEncoder(), - category_encoders.HashingEncoder(), - # category_encoders.HelmertEncoder(), - category_encoders.JamesSteinEncoder(), - category_encoders.LeaveOneOutEncoder(), - category_encoders.MEstimateEncoder(), - category_encoders.OneHotEncoder(), - category_encoders.OrdinalEncoder(), - # category_encoders.PolynomialEncoder(), - # category_encoders.SumEncoder(), - category_encoders.TargetEncoder(), - category_encoders.WOEEncoder()] - -encoders = [category_encoders.TargetEncoder(), category_encoders.JamesSteinEncoder(), category_encoders.WOEEncoder()] +encoders = [ # category_encoders.BackwardDifferenceEncoder(), + category_encoders.BaseNEncoder(), + category_encoders.BinaryEncoder(), + category_encoders.HashingEncoder(), + # category_encoders.HelmertEncoder(), + category_encoders.JamesSteinEncoder(), + category_encoders.LeaveOneOutEncoder(), + category_encoders.MEstimateEncoder(), + category_encoders.OneHotEncoder(), + category_encoders.OrdinalEncoder(), + # category_encoders.PolynomialEncoder(), + # category_encoders.SumEncoder(), + category_encoders.TargetEncoder(), + category_encoders.WOEEncoder(), +] + +encoders = [ + category_encoders.TargetEncoder(), + category_encoders.JamesSteinEncoder(), + category_encoders.WOEEncoder(), +] # Initialization if os.path.isfile('./output/result.csv'): @@ -97,43 +138,114 @@ # Perform cross-validation pool = Pool(X, y, categorical_indexes) - params = {'iterations': 50, - 'depth': 3, - 'loss_function': 'Logloss', - 'eval_metric': 'AUC', - 'verbose': False} + params = { + 'iterations': 50, + 'depth': 3, + 'loss_function': 'Logloss', + 'eval_metric': 'AUC', + 'verbose': False, + } scores = cv(pool, params, logging_level='Silent') - auc = scores.iloc[-1,0] - + auc = scores.iloc[-1, 0] # Log into csv - result = pd.DataFrame([dataset_name, y.name, 'CatBoost', 'default', model.__class__.__name__, X.shape[1], - '', '', '', '', '', '', '', auc, '', '', '']).T + result = pd.DataFrame( + [ + dataset_name, + y.name, + 'CatBoost', + 'default', + model.__class__.__name__, + X.shape[1], + '', + '', + '', + '', + '', + '', + '', + auc, + '', + '', + '', + ] + ).T if not os.path.isfile('./output/result.csv'): - result.to_csv('./output/result.csv', - header=['dataset', 'target', 'encoder', 'encoder_setting', 'model', 'input_features', 'output_features', - 'fit_encoder_time', 'score_encoder_time', 'fit_model_time', 'score_model_time', 'test_matthews', - 'train_matthews', 'test_auc', 'train_auc', 'test_brier', 'train_brier'], index=False) + result.to_csv( + './output/result.csv', + header=[ + 'dataset', + 'target', + 'encoder', + 'encoder_setting', + 'model', + 'input_features', + 'output_features', + 'fit_encoder_time', + 'score_encoder_time', + 'fit_model_time', + 'score_model_time', + 'test_matthews', + 'train_matthews', + 'test_auc', + 'train_auc', + 'test_brier', + 'train_brier', + ], + index=False, + ) else: result.to_csv('./output/result.csv', mode='a', header=False, index=False) # Our encoding for encoder in encoders: - print("Encoding:", dataset_name, y.name, encoder.__class__.__name__) + print('Encoding:', dataset_name, y.name, encoder.__class__.__name__) folds, fit_encoder_time, score_encoder_time = train_encoder(X, y, fold_count, encoder) print('Evaluating:', dataset_name, encoder.__class__.__name__, model.__class__.__name__) scores, fit_model_time, score_model_time = train_model(folds, model) # Log into csv - result = 
pd.DataFrame([dataset_name, y.name, encoder.__class__.__name__, encoder.__dict__, model.__class__.__name__, X.shape[1], - folds[0][0].shape[1], fit_encoder_time, score_encoder_time, fit_model_time, score_model_time] - + list(scores)).T + result = pd.DataFrame( + [ + dataset_name, + y.name, + encoder.__class__.__name__, + encoder.__dict__, + model.__class__.__name__, + X.shape[1], + folds[0][0].shape[1], + fit_encoder_time, + score_encoder_time, + fit_model_time, + score_model_time, + ] + + list(scores) + ).T if not os.path.isfile('./output/result.csv'): - result.to_csv('./output/result.csv', - header=['dataset', 'target', 'encoder', 'encoder_setting', 'model', 'input_features', 'output_features', - 'fit_encoder_time', 'score_encoder_time', 'fit_model_time', 'score_model_time', 'test_matthews', - 'train_matthews', 'test_auc', 'train_auc', 'test_brier', 'train_brier'], index=False) + result.to_csv( + './output/result.csv', + header=[ + 'dataset', + 'target', + 'encoder', + 'encoder_setting', + 'model', + 'input_features', + 'output_features', + 'fit_encoder_time', + 'score_encoder_time', + 'fit_model_time', + 'score_model_time', + 'test_matthews', + 'train_matthews', + 'test_auc', + 'train_auc', + 'test_brier', + 'train_brier', + ], + index=False, + ) else: result.to_csv('./output/result.csv', mode='a', header=False, index=False) diff --git a/examples/benchmarking_large/csv_loader.py b/examples/benchmarking_large/csv_loader.py index 2806205a..9ab1d5d8 100644 --- a/examples/benchmarking_large/csv_loader.py +++ b/examples/benchmarking_large/csv_loader.py @@ -1,8 +1,5 @@ -import arff import numpy as np import pandas as pd -import requests - """ Read data in arff format from URL. @@ -10,9 +7,11 @@ E.g.: csv_loader.load('car.csv') """ + + def load(file_name): # Load CSV from file - df = pd.read_csv('./datasets/article/' + file_name, sep = None, na_values='?') + df = pd.read_csv('./datasets/article/' + file_name, sep=None, na_values='?') # Target column estimation if 'class' in list(df): @@ -83,7 +82,7 @@ def load(file_name): # Justification: OneHotEncoding and TargetEncoder work only with binary numerical output. # Approach: Take a majority class as 1 and the rest as 0. majority_class = y_unique[np.argmax(y_counts)] - df[target] = (df[target]==majority_class).astype('uint8') + df[target] = (df[target] == majority_class).astype('uint8') # Determine the count of folds that is not going to cause issues. # We identify the least common class label and then return min(10, minority_class_count). @@ -98,7 +97,7 @@ def load(file_name): # Estimate, which columns are nominal. If there is no string column in the data, assume that all integers are nominal. 
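The nominal-column heuristic described in the comment above, demonstrated on a hypothetical frame (the loader's actual implementation follows in the diff below):

```python
import pandas as pd

# no string columns here, so the integer column is assumed to be nominal
X = pd.DataFrame({'zip_code': [90210, 10001, 60614], 'price': [1.5, 2.0, 3.5]})

nominal_columns = [key for key, value in X.dtypes.items() if 'object' in value.name]
if len(nominal_columns) == 0:
    nominal_columns = [key for key, value in X.dtypes.items() if 'int' in value.name]

print(nominal_columns)  # ['zip_code']
```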
nominal_columns = [key for key, value in X.dtypes.items() if ('object' in value.name)] - if len(nominal_columns)==0: + if len(nominal_columns) == 0: nominal_columns = [key for key, value in X.dtypes.items() if ('int' in value.name)] # Data type estimation @@ -109,4 +108,3 @@ def load(file_name): pass return X, y, fold_count, nominal_columns - diff --git a/examples/benchmarking_large/report.py b/examples/benchmarking_large/report.py index b26aac4c..86aa2d4e 100644 --- a/examples/benchmarking_large/report.py +++ b/examples/benchmarking_large/report.py @@ -1,6 +1,6 @@ import matplotlib.pyplot as plt -import seaborn as sb import pandas as pd +import seaborn as sb results_df = pd.read_csv('./output/result.csv') @@ -9,7 +9,7 @@ f = plt.figure(figsize=(9, 9)) sb.boxplot(data=results_df, y='encoder', x='test_auc', notch=True) plt.grid(True, axis='x') -f.savefig("./output/auc.pdf", bbox_inches='tight') +f.savefig('./output/auc.pdf', bbox_inches='tight') # AUC grouped by encoder + classifier @@ -17,7 +17,9 @@ for index, clf in enumerate(results_df['model'].unique()): plt.subplot(3, 3, index + 1) plt.title(clf) - sb.boxplot(data=results_df.loc[results_df['model'] == clf], y='encoder', x='test_auc', notch=True) + sb.boxplot( + data=results_df.loc[results_df['model'] == clf], y='encoder', x='test_auc', notch=True + ) plt.grid(True, axis='x') plt.ylabel('') if index < 6 != 0: @@ -26,18 +28,24 @@ plt.yticks([]) plt.tight_layout() plt.xlim(0.0, 1.0) -f.savefig("./output/auc_model.pdf", bbox_inches='tight') +f.savefig('./output/auc_model.pdf', bbox_inches='tight') # Overfitting -df_overfitting = pd.melt(results_df, col_level=0, id_vars=['encoder', 'model'], value_vars=['train_auc', 'test_auc'], value_name='auc') +df_overfitting = pd.melt( + results_df, + col_level=0, + id_vars=['encoder', 'model'], + value_vars=['train_auc', 'test_auc'], + value_name='auc', +) # Clustered AUC grouped by encoder f = plt.figure(figsize=(9, 9)) sb.boxplot(data=df_overfitting, y='encoder', x='auc', hue='variable', notch=True) plt.grid(True, axis='x') -f.savefig("./output/overfitting.pdf", bbox_inches='tight') +f.savefig('./output/overfitting.pdf', bbox_inches='tight') # Clustered AUC grouped by encoder + classifier @@ -45,7 +53,13 @@ for index, clf in enumerate(df_overfitting['model'].unique()): plt.subplot(3, 3, index + 1) plt.title(clf) - sb.boxplot(data=df_overfitting.loc[df_overfitting['model'] == clf], y='encoder', x='auc', hue='variable', notch=True) + sb.boxplot( + data=df_overfitting.loc[df_overfitting['model'] == clf], + y='encoder', + x='auc', + hue='variable', + notch=True, + ) plt.grid(True, axis='x') plt.ylabel('') if index < 6 != 0: @@ -54,4 +68,4 @@ plt.yticks([]) plt.tight_layout() plt.xlim(0.0, 1.0) -f.savefig("./output/overfitting_model.pdf", bbox_inches='tight') +f.savefig('./output/overfitting_model.pdf', bbox_inches='tight') diff --git a/examples/benchmarking_large/report_history.py b/examples/benchmarking_large/report_history.py index 210ef4cc..032ca3f4 100644 --- a/examples/benchmarking_large/report_history.py +++ b/examples/benchmarking_large/report_history.py @@ -1,6 +1,6 @@ import matplotlib.pyplot as plt -import seaborn as sb import pandas as pd +import seaborn as sb results_df = pd.read_csv('./output/result_2019-01-05.csv') previous_df = pd.read_csv('./output/result_2018-09-02.csv') @@ -13,22 +13,36 @@ # Fit runtime by encoder f = plt.figure(figsize=(15, 9)) -sb.barplot(data=filtered_df, x="encoder", y="fit_encoder_time", hue="version", hue_order=['2018-09', '2019-01']) +sb.barplot( + 
data=filtered_df, + x='encoder', + y='fit_encoder_time', + hue='version', + hue_order=['2018-09', '2019-01'], +) plt.xlabel('Encoder') plt.ylabel('Encoder fit runtime [s]') -f.savefig("./output/fit_runtime.pdf", bbox_inches='tight') +f.savefig('./output/fit_runtime.pdf', bbox_inches='tight') # Score runtime by encoder f = plt.figure(figsize=(15, 9)) -sb.barplot(data=filtered_df, x="encoder", y="score_encoder_time", hue="version", hue_order=['2018-09', '2019-01']) +sb.barplot( + data=filtered_df, + x='encoder', + y='score_encoder_time', + hue='version', + hue_order=['2018-09', '2019-01'], +) plt.xlabel('Encoder') plt.ylabel('Encoder score runtime [s]') -f.savefig("./output/score_runtime.pdf", bbox_inches='tight') +f.savefig('./output/score_runtime.pdf', bbox_inches='tight') # Test AUC by encoder f = plt.figure(figsize=(15, 9)) -sb.barplot(data=filtered_df, x="encoder", y="test_auc", hue="version", hue_order=['2018-09', '2019-01']) +sb.barplot( + data=filtered_df, x='encoder', y='test_auc', hue='version', hue_order=['2018-09', '2019-01'] +) plt.xlabel('Encoder') plt.ylabel('Testing AUC') plt.show() -f.savefig("./output/test_auc.pdf", bbox_inches='tight') \ No newline at end of file +f.savefig('./output/test_auc.pdf', bbox_inches='tight') diff --git a/examples/benchmarking_large/util.py b/examples/benchmarking_large/util.py index 8d07ffc5..9c225e31 100644 --- a/examples/benchmarking_large/util.py +++ b/examples/benchmarking_large/util.py @@ -5,10 +5,11 @@ import numpy as np import sklearn from sklearn.exceptions import ConvergenceWarning -from sklearn.utils.testing import ignore_warnings from sklearn.impute import SimpleImputer from sklearn.model_selection import StratifiedKFold from sklearn.preprocessing import StandardScaler +from sklearn.utils.testing import ignore_warnings + def train_encoder(X, y, fold_count, encoder): """ @@ -25,7 +26,9 @@ def train_encoder(X, y, fold_count, encoder): https://github.com/scikit-learn/scikit-learn/issues/11832 """ kf = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=2001) - encoder = deepcopy(encoder) # Because of https://github.com/scikit-learn-contrib/categorical-encoding/issues/106 + encoder = deepcopy( + encoder + ) # Because of https://github.com/scikit-learn-contrib/categorical-encoding/issues/106 imputer = SimpleImputer(strategy='mean') scaler = StandardScaler() folds = [] @@ -34,8 +37,14 @@ def train_encoder(X, y, fold_count, encoder): for train_index, test_index in kf.split(X, y): # Split data - X_train, X_test = X.iloc[train_index, :].reset_index(drop=True), X.iloc[test_index, :].reset_index(drop=True) - y_train, y_test = y[train_index].reset_index(drop=True), y[test_index].reset_index(drop=True) + X_train, X_test = ( + X.iloc[train_index, :].reset_index(drop=True), + X.iloc[test_index, :].reset_index(drop=True), + ) + y_train, y_test = ( + y[train_index].reset_index(drop=True), + y[test_index].reset_index(drop=True), + ) # Training start_time = time.time() @@ -53,7 +62,8 @@ def train_encoder(X, y, fold_count, encoder): folds.append([X_train, y_train, X_test, y_test]) - return folds, fit_encoder_time/fold_count, score_encoder_time/fold_count + return folds, fit_encoder_time / fold_count, score_encoder_time / fold_count + def train_model(folds, model): """ @@ -63,13 +73,19 @@ def train_model(folds, model): Brier score: represents calibration measures """ scores = [] - fit_model_time = 0 # Sum of all the time spend on fitting the training data, later on normalized - score_model_time = 0 # Sum of all the time spend on scoring the 
testing data, later on normalized
+    fit_model_time = (
+        0  # Sum of all the time spent on fitting the training data, later on normalized
+    )
+    score_model_time = (
+        0  # Sum of all the time spent on scoring the testing data, later on normalized
+    )

     for X_train, y_train, X_test, y_test in folds:
         # Training
         start_time = time.time()
-        with ignore_warnings(category=ConvergenceWarning): # Yes, neural networks do not always converge
+        with ignore_warnings(
+            category=ConvergenceWarning
+        ):  # Yes, neural networks do not always converge
             model.fit(X_train, y_train)
         fit_model_time += time.time() - start_time
         prediction_train_proba = model.predict_proba(X_train)[:, 1]
@@ -83,14 +99,16 @@ def train_model(folds, model):

     # When all the predictions are of a single class, we get a RuntimeWarning in matthews_corr
     with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        scores.append([
-            sklearn.metrics.matthews_corrcoef(y_test, prediction_test),
-            sklearn.metrics.matthews_corrcoef(y_train, prediction_train),
-            sklearn.metrics.roc_auc_score(y_test, prediction_test_proba),
-            sklearn.metrics.roc_auc_score(y_train, prediction_train_proba),
-            sklearn.metrics.brier_score_loss(y_test, prediction_test_proba),
-            sklearn.metrics.brier_score_loss(y_train, prediction_train_proba)
-        ])
-
-    return np.mean(scores, axis=0), fit_model_time/len(folds), score_model_time/len(folds)
+        warnings.simplefilter('ignore')
+        scores.append(
+            [
+                sklearn.metrics.matthews_corrcoef(y_test, prediction_test),
+                sklearn.metrics.matthews_corrcoef(y_train, prediction_train),
+                sklearn.metrics.roc_auc_score(y_test, prediction_test_proba),
+                sklearn.metrics.roc_auc_score(y_train, prediction_train_proba),
+                sklearn.metrics.brier_score_loss(y_test, prediction_test_proba),
+                sklearn.metrics.brier_score_loss(y_train, prediction_train_proba),
+            ]
+        )
+
+    return np.mean(scores, axis=0), fit_model_time / len(folds), score_model_time / len(folds)
diff --git a/examples/column_transformer_example.py b/examples/column_transformer_example.py
index 368281e7..8ee92881 100644
--- a/examples/column_transformer_example.py
+++ b/examples/column_transformer_example.py
@@ -1,15 +1,14 @@
-from examples.source_data.loaders import get_mushroom_data
-from sklearn.compose import ColumnTransformer
 from category_encoders import TargetEncoder
+from sklearn.compose import ColumnTransformer
+
+from examples.source_data.loaders import get_mushroom_data

 # get data from the mushroom dataset
 X, y, _ = get_mushroom_data()

 # encode the specified columns
 ct = ColumnTransformer(
-    [
-        ('Target encoding', TargetEncoder(), ['bruises', 'odor'])
-    ], remainder='passthrough'
+    [('Target encoding', TargetEncoder(), ['bruises', 'odor'])], remainder='passthrough'
 )

 encoded = ct.fit_transform(X=X, y=y)
diff --git a/examples/encoding_examples.py b/examples/encoding_examples.py
index 4a55fda8..7c8f7203 100644
--- a/examples/encoding_examples.py
+++ b/examples/encoding_examples.py
@@ -6,6 +6,7 @@
 import time
 import warnings

+import category_encoders
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@@ -14,7 +15,6 @@
 from sklearn.model_selection import cross_validate
 from sklearn.preprocessing import StandardScaler

-import category_encoders
 from examples.source_data.loaders import get_cars_data

 warnings.filterwarnings(action='ignore', category=DataConversionWarning)
@@ -29,7 +29,6 @@ def score_models(clf, X, y, encoder, runs=1):
     Takes in a classifier that supports multiclass classification, and X and a y,
     and returns a cross validation score. 
""" - scores = [] X_test = None @@ -54,17 +53,20 @@ def main(loader, name): Here we iterate through the datasets and score them with a classifier using different encodings. """ - scores = [] raw_scores_ds = {} # first get the dataset X, y, mapping = loader() - clf = linear_model.LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=200, random_state=0) + clf = linear_model.LogisticRegression( + solver='lbfgs', multi_class='auto', max_iter=200, random_state=0 + ) # try each encoding method available, which works on multiclass problems - encoders = (set(category_encoders.__all__) - {'WOEEncoder'}) # WoE is currently only for binary targets + encoders = set(category_encoders.__all__) - { + 'WOEEncoder' + } # WoE is currently only for binary targets for encoder_name in encoders: encoder = getattr(category_encoders, encoder_name) @@ -74,7 +76,17 @@ def main(loader, name): raw_scores_ds[encoder_name] = raw_scores gc.collect() - results = pd.DataFrame(scores, columns=['Encoding', 'Dataset', 'Dimensionality', 'Avg. Score', 'Score StDev', 'Elapsed Time']) + results = pd.DataFrame( + scores, + columns=[ + 'Encoding', + 'Dataset', + 'Dimensionality', + 'Avg. Score', + 'Score StDev', + 'Elapsed Time', + ], + ) raw = pd.DataFrame.from_dict(raw_scores_ds) ax = raw.plot(kind='box', return_type='axes') diff --git a/examples/grid_search_example.py b/examples/grid_search_example.py index 9eb6ff33..99f5c93d 100644 --- a/examples/grid_search_example.py +++ b/examples/grid_search_example.py @@ -3,17 +3,17 @@ Tested to work with scikit-learn 0.20.2 """ +import warnings -from sklearn.model_selection import GridSearchCV -from sklearn.model_selection import train_test_split +from category_encoders.basen import BaseNEncoder +from sklearn.exceptions import DataConversionWarning +from sklearn.linear_model import LogisticRegression from sklearn.metrics import classification_report +from sklearn.model_selection import GridSearchCV, train_test_split from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler -from category_encoders.basen import BaseNEncoder + from examples.source_data.loaders import get_mushroom_data -from sklearn.linear_model import LogisticRegression -import warnings -from sklearn.exceptions import DataConversionWarning warnings.filterwarnings(action='ignore', category=DataConversionWarning) @@ -28,34 +28,34 @@ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) # create a pipeline -ppl = Pipeline([ - ('enc', BaseNEncoder(base=2, return_df=False, verbose=True)), - ('norm', StandardScaler()), - ('clf', LogisticRegression(solver='lbfgs', random_state=0)) -]) +ppl = Pipeline( + [ + ('enc', BaseNEncoder(base=2, return_df=False, verbose=True)), + ('norm', StandardScaler()), + ('clf', LogisticRegression(solver='lbfgs', random_state=0)), + ] +) # set the parameters by cross-validation -tuned_parameters = { - 'enc__base': [1, 2, 3, 4, 5, 6] -} +tuned_parameters = {'enc__base': [1, 2, 3, 4, 5, 6]} scores = ['precision', 'recall'] for score in scores: - print("# Tuning hyper-parameters for %s\n" % score) + print('# Tuning hyper-parameters for %s\n' % score) clf = GridSearchCV(ppl, tuned_parameters, cv=5, scoring='%s_macro' % score) clf.fit(X_train, y_train) - print("Best parameters set found on development set:\n") + print('Best parameters set found on development set:\n') print(clf.best_params_) - print("\nGrid scores on development set:\n") + print('\nGrid scores on development set:\n') means = clf.cv_results_['mean_test_score'] stds = 
clf.cv_results_['std_test_score'] - for mean, std, params in zip(means, stds, clf.cv_results_['params']): - print("%s (+/-%s) for %s" % (mean, std * 2, params)) + for mean, std, params in zip(means, stds, clf.cv_results_['params'], strict=False): + print('%s (+/-%s) for %s' % (mean, std * 2, params)) - print("\nDetailed classification report:\n") - print("The model is trained on the full development set.") - print("The scores are computed on the full evaluation set.\n") + print('\nDetailed classification report:\n') + print('The model is trained on the full development set.') + print('The scores are computed on the full evaluation set.\n') y_true, y_pred = y_test, clf.predict(X_test) print(classification_report(y_true, y_pred)) diff --git a/examples/source_data/loaders.py b/examples/source_data/loaders.py index 7277db10..65677989 100644 --- a/examples/source_data/loaders.py +++ b/examples/source_data/loaders.py @@ -10,7 +10,6 @@ def get_cars_data(): :return: """ - df = pd.read_csv('source_data/cars/car.data.txt') X = df.reindex(columns=[x for x in df.columns if x != 'class']) y = df.reindex(columns=['class']) @@ -34,7 +33,6 @@ def get_mushroom_data(): :return: """ - df = pd.read_csv('source_data/mushrooms/agaricus-lepiota.csv') X = df.reindex(columns=[x for x in df.columns if x != 'class']) y = df.reindex(columns=['class']) @@ -52,12 +50,11 @@ def get_splice_data(): :return: """ - df = pd.read_csv('source_data/splice/splice.csv') X = df.reindex(columns=[x for x in df.columns if x != 'class']) X['dna'] = X['dna'].map(lambda x: list(str(x).strip())) for idx in range(60): - X['dna_%d' % (idx, )] = X['dna'].map(lambda x: x[idx]) + X['dna_%d' % (idx,)] = X['dna'].map(lambda x: x[idx]) del X['dna'] y = df.reindex(columns=['class']) @@ -66,4 +63,4 @@ def get_splice_data(): # this data is truly categorical, with no known concept of ordering mapping = None - return X, y, mapping \ No newline at end of file + return X, y, mapping diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 00000000..ed84cfc2 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,1092 @@ +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
+ +[[package]] +name = "alabaster" +version = "0.7.16" +description = "A light, configurable Sphinx theme" +optional = false +python-versions = ">=3.9" +files = [ + {file = "alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92"}, + {file = "alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65"}, +] + +[[package]] +name = "attrs" +version = "24.3.0" +description = "Classes Without Boilerplate" +optional = false +python-versions = ">=3.8" +files = [ + {file = "attrs-24.3.0-py3-none-any.whl", hash = "sha256:ac96cd038792094f438ad1f6ff80837353805ac950cd2aa0e0625ef19850c308"}, + {file = "attrs-24.3.0.tar.gz", hash = "sha256:8f5c07333d543103541ba7be0e2ce16eeee8130cb0b3f9238ab904ce1e85baff"}, +] + +[package.extras] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] + +[[package]] +name = "babel" +version = "2.16.0" +description = "Internationalization utilities" +optional = false +python-versions = ">=3.8" +files = [ + {file = "babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b"}, + {file = "babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316"}, +] + +[package.extras] +dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] + +[[package]] +name = "certifi" +version = "2024.8.30" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, + {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.3.2" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, + {file = 
"charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, + {file = 
"charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = 
"sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, + {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, +] + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "docutils" +version = "0.20.1" +description = "Docutils -- Python Documentation Utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6"}, + {file = "docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.2" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "idna" +version = "3.10" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.6" +files = [ + {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, + {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, +] + +[package.extras] +all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] + +[[package]] +name = "imagesize" +version = "1.4.1" +description = "Getting image size from png/jpeg/jpeg2000/gif file" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b"}, + {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, +] + +[[package]] +name = "importlib-metadata" +version = "8.5.0" +description = "Read metadata from Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"}, + {file = "importlib_metadata-8.5.0.tar.gz", hash = 
"sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7"}, +] + +[package.dependencies] +zipp = ">=3.20" + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +perf = ["ipython"] +test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] +type = ["pytest-mypy"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "jinja2" +version = "3.1.4" +description = "A very fast and expressive template engine." +optional = false +python-versions = ">=3.7" +files = [ + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, +] + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + +[[package]] +name = "joblib" +version = "1.4.2" +description = "Lightweight pipelining with Python functions" +optional = false +python-versions = ">=3.8" +files = [ + {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, + {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, +] + +[[package]] +name = "markupsafe" +version = "2.1.5" +description = "Safely add untrusted strings to HTML/XML markup." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"}, + {file = 
"MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"}, + {file = 
"MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"}, + {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, +] + +[[package]] +name = "numpy" +version = "2.0.2" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece"}, + {file = "numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04"}, + {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66"}, + {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b"}, + {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd"}, + {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318"}, + {file = "numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8"}, + {file = "numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326"}, + {file = "numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97"}, + {file = "numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131"}, + {file = "numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448"}, + {file = "numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195"}, + {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57"}, + {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a"}, + {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669"}, + {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951"}, + {file = "numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9"}, + {file = "numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15"}, + {file = "numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4"}, + {file = "numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc"}, + {file = "numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b"}, + {file = "numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e"}, + {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c"}, + {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c"}, + {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692"}, + {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a"}, + {file = "numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c"}, + {file = "numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded"}, + {file = "numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5"}, + {file = "numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a"}, + {file = "numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c"}, + 
{file = "numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd"}, + {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b"}, + {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729"}, + {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1"}, + {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd"}, + {file = "numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d"}, + {file = "numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d"}, + {file = "numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa"}, + {file = "numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73"}, + {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8"}, + {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4"}, + {file = "numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c"}, + {file = "numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385"}, + {file = "numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78"}, +] + +[[package]] +name = "numpydoc" +version = "1.8.0" +description = "Sphinx extension to support docstrings in Numpy format" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpydoc-1.8.0-py3-none-any.whl", hash = "sha256:72024c7fd5e17375dec3608a27c03303e8ad00c81292667955c6fea7a3ccf541"}, + {file = "numpydoc-1.8.0.tar.gz", hash = "sha256:022390ab7464a44f8737f79f8b31ce1d3cfa4b4af79ccaa1aac5e8368db587fb"}, +] + +[package.dependencies] +sphinx = ">=6" +tabulate = ">=0.8.10" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} + +[package.extras] +developer = ["pre-commit (>=3.3)", "tomli"] +doc = ["intersphinx-registry", "matplotlib (>=3.5)", "numpy (>=1.22)", "pydata-sphinx-theme (>=0.13.3)", "sphinx (>=7)"] +test = ["matplotlib", "pytest", "pytest-cov"] + +[[package]] +name = "packaging" +version = "24.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, +] + +[[package]] +name = "pandas" +version = "2.2.3" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"}, + {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"}, + {file = "pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed"}, + {file = "pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57"}, + {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42"}, + {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f"}, + {file = "pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645"}, + {file = "pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039"}, + {file = "pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd"}, + {file = "pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698"}, + {file = "pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc"}, + {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3"}, + {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32"}, + {file = "pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5"}, + {file = "pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9"}, + {file = "pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4"}, + {file = "pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3"}, + {file = "pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319"}, + {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8"}, + {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a"}, + {file = "pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13"}, + {file = "pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015"}, + {file = "pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28"}, + {file = "pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0"}, + 
{file = "pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24"}, + {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659"}, + {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb"}, + {file = "pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d"}, + {file = "pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468"}, + {file = "pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18"}, + {file = "pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2"}, + {file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"}, + {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"}, + {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"}, + {file = "pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"}, + {file = "pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"}, + {file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"}, + {file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"}, + {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"}, + {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"}, + {file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"}, + {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.7" + +[package.extras] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql 
(>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] + +[[package]] +name = "patsy" +version = "0.5.6" +description = "A Python package for describing statistical models and for building design matrices." +optional = false +python-versions = "*" +files = [ + {file = "patsy-0.5.6-py2.py3-none-any.whl", hash = "sha256:19056886fd8fa71863fa32f0eb090267f21fb74be00f19f5c70b2e9d76c883c6"}, + {file = "patsy-0.5.6.tar.gz", hash = "sha256:95c6d47a7222535f84bff7f63d7303f2e297747a598db89cf5c67f0c0c7d2cdb"}, +] + +[package.dependencies] +numpy = ">=1.4" +six = "*" + +[package.extras] +test = ["pytest", "pytest-cov", "scipy"] + +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pygments" +version = "2.18.0" +description = "Pygments is a syntax highlighting package written in Python." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a"}, + {file = "pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199"}, +] + +[package.extras] +windows-terminal = ["colorama (>=0.4.6)"] + +[[package]] +name = "pytest" +version = "8.3.3" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2"}, + {file = "pytest-8.3.3.tar.gz", hash = "sha256:70b98107bd648308a7952b06e6ca9a50bc660be218d53c257cc1fc94fda10181"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=1.5,<2" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "pytest-subtests" +version = "0.14.1" +description = "unittest subTest() support and subtests fixture" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pytest_subtests-0.14.1-py3-none-any.whl", hash = "sha256:e92a780d98b43118c28a16044ad9b841727bd7cb6a417073b38fd2d7ccdf052d"}, + {file = "pytest_subtests-0.14.1.tar.gz", hash = "sha256:350c00adc36c3aff676a66135c81aed9e2182e15f6c3ec8721366918bbbf7580"}, +] + +[package.dependencies] +attrs = ">=19.2.0" +pytest = ">=7.4" + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2024.2" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725"}, + {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"}, +] + +[[package]] +name = "requests" +version = "2.32.3" +description = "Python HTTP for Humans." +optional = false +python-versions = ">=3.8" +files = [ + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "ruff" +version = "0.6.8" +description = "An extremely fast Python linter and code formatter, written in Rust." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "ruff-0.6.8-py3-none-linux_armv6l.whl", hash = "sha256:77944bca110ff0a43b768f05a529fecd0706aac7bcce36d7f1eeb4cbfca5f0f2"}, + {file = "ruff-0.6.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:27b87e1801e786cd6ede4ada3faa5e254ce774de835e6723fd94551464c56b8c"}, + {file = "ruff-0.6.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:cd48f945da2a6334f1793d7f701725a76ba93bf3d73c36f6b21fb04d5338dcf5"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:677e03c00f37c66cea033274295a983c7c546edea5043d0c798833adf4cf4c6f"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9f1476236b3eacfacfc0f66aa9e6cd39f2a624cb73ea99189556015f27c0bdeb"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f5a2f17c7d32991169195d52a04c95b256378bbf0de8cb98478351eb70d526f"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:5fd0d4b7b1457c49e435ee1e437900ced9b35cb8dc5178921dfb7d98d65a08d0"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8034b19b993e9601f2ddf2c517451e17a6ab5cdb1c13fdff50c1442a7171d87"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6cfb227b932ba8ef6e56c9f875d987973cd5e35bc5d05f5abf045af78ad8e098"}, + {file = "ruff-0.6.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ef0411eccfc3909269fed47c61ffebdcb84a04504bafa6b6df9b85c27e813b0"}, + {file = "ruff-0.6.8-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:007dee844738c3d2e6c24ab5bc7d43c99ba3e1943bd2d95d598582e9c1b27750"}, + {file = "ruff-0.6.8-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:ce60058d3cdd8490e5e5471ef086b3f1e90ab872b548814e35930e21d848c9ce"}, + {file = "ruff-0.6.8-py3-none-musllinux_1_2_i686.whl", hash = "sha256:1085c455d1b3fdb8021ad534379c60353b81ba079712bce7a900e834859182fa"}, + {file = "ruff-0.6.8-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:70edf6a93b19481affd287d696d9e311388d808671bc209fb8907b46a8c3af44"}, + {file = "ruff-0.6.8-py3-none-win32.whl", hash = "sha256:792213f7be25316f9b46b854df80a77e0da87ec66691e8f012f887b4a671ab5a"}, + {file = "ruff-0.6.8-py3-none-win_amd64.whl", hash = "sha256:ec0517dc0f37cad14a5319ba7bba6e7e339d03fbf967a6d69b0907d61be7a263"}, + {file = "ruff-0.6.8-py3-none-win_arm64.whl", hash = "sha256:8d3bb2e3fbb9875172119021a13eed38849e762499e3cfde9588e4b4d70968dc"}, + {file = "ruff-0.6.8.tar.gz", hash = "sha256:a5bf44b1aa0adaf6d9d20f86162b34f7c593bfedabc51239953e446aefc8ce18"}, +] + +[[package]] +name = "scikit-learn" +version = "1.5.2" +description = "A set of python modules for machine learning and data mining" +optional = false +python-versions = ">=3.9" +files = [ + {file = "scikit_learn-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:299406827fb9a4f862626d0fe6c122f5f87f8910b86fe5daa4c32dcd742139b6"}, + {file = "scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2d4cad1119c77930b235579ad0dc25e65c917e756fe80cab96aa3b9428bd3fb0"}, + {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c412ccc2ad9bf3755915e3908e677b367ebc8d010acbb3f182814524f2e5540"}, + {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a686885a4b3818d9e62904d91b57fa757fc2bed3e465c8b177be652f4dd37c8"}, + {file = 
"scikit_learn-1.5.2-cp310-cp310-win_amd64.whl", hash = "sha256:c15b1ca23d7c5f33cc2cb0a0d6aaacf893792271cddff0edbd6a40e8319bc113"}, + {file = "scikit_learn-1.5.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:03b6158efa3faaf1feea3faa884c840ebd61b6484167c711548fce208ea09445"}, + {file = "scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1ff45e26928d3b4eb767a8f14a9a6efbf1cbff7c05d1fb0f95f211a89fd4f5de"}, + {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f763897fe92d0e903aa4847b0aec0e68cadfff77e8a0687cabd946c89d17e675"}, + {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8b0ccd4a902836493e026c03256e8b206656f91fbcc4fde28c57a5b752561f1"}, + {file = "scikit_learn-1.5.2-cp311-cp311-win_amd64.whl", hash = "sha256:6c16d84a0d45e4894832b3c4d0bf73050939e21b99b01b6fd59cbb0cf39163b6"}, + {file = "scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f932a02c3f4956dfb981391ab24bda1dbd90fe3d628e4b42caef3e041c67707a"}, + {file = "scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3b923d119d65b7bd555c73be5423bf06c0105678ce7e1f558cb4b40b0a5502b1"}, + {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, + {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, + {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, + {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, + {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, + {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, + {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca64b3089a6d9b9363cd3546f8978229dcbb737aceb2c12144ee3f70f95684b7"}, + {file = "scikit_learn-1.5.2-cp39-cp39-win_amd64.whl", hash = "sha256:3bed4909ba187aca80580fe2ef370d9180dcf18e621a27c4cf2ef10d279a7efe"}, + {file = "scikit_learn-1.5.2.tar.gz", hash = "sha256:b4237ed7b3fdd0a4882792e68ef2545d5baa50aca3bb45aa7df468138ad8f94d"}, +] + +[package.dependencies] +joblib = ">=1.2.0" +numpy = ">=1.19.5" +scipy = ">=1.6.0" +threadpoolctl = ">=3.1.0" + +[package.extras] +benchmark = 
["matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "pandas (>=1.1.5)"] +build = ["cython (>=3.0.10)", "meson-python (>=0.16.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pydata-sphinx-theme (>=0.15.3)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)", "sphinx (>=7.3.7)", "sphinx-copybutton (>=0.5.2)", "sphinx-design (>=0.5.0)", "sphinx-design (>=0.6.0)", "sphinx-gallery (>=0.16.0)", "sphinx-prompt (>=1.4.0)", "sphinx-remove-toctrees (>=1.0.0.post1)", "sphinxcontrib-sass (>=0.3.4)", "sphinxext-opengraph (>=0.9.1)"] +examples = ["matplotlib (>=3.3.4)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)"] +install = ["joblib (>=1.2.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)", "threadpoolctl (>=3.1.0)"] +maintenance = ["conda-lock (==2.5.6)"] +tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pyarrow (>=12.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.2.1)", "scikit-image (>=0.17.2)"] + +[[package]] +name = "scipy" +version = "1.13.1" +description = "Fundamental algorithms for scientific computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca"}, + {file = "scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f"}, + {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfa31f1def5c819b19ecc3a8b52d28ffdcc7ed52bb20c9a7589669dd3c250989"}, + {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26264b282b9da0952a024ae34710c2aff7d27480ee91a2e82b7b7073c24722f"}, + {file = "scipy-1.13.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:eccfa1906eacc02de42d70ef4aecea45415f5be17e72b61bafcfd329bdc52e94"}, + {file = "scipy-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:2831f0dc9c5ea9edd6e51e6e769b655f08ec6db6e2e10f86ef39bd32eb11da54"}, + {file = "scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:27e52b09c0d3a1d5b63e1105f24177e544a222b43611aaf5bc44d4a0979e32f9"}, + {file = "scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:54f430b00f0133e2224c3ba42b805bfd0086fe488835effa33fa291561932326"}, + {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89369d27f9e7b0884ae559a3a956e77c02114cc60a6058b4e5011572eea9299"}, + {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a78b4b3345f1b6f68a763c6e25c0c9a23a9fd0f39f5f3d200efe8feda560a5fa"}, + {file = "scipy-1.13.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:45484bee6d65633752c490404513b9ef02475b4284c4cfab0ef946def50b3f59"}, + {file = "scipy-1.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:5713f62f781eebd8d597eb3f88b8bf9274e79eeabf63afb4a737abc6c84ad37b"}, + {file = "scipy-1.13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5d72782f39716b2b3509cd7c33cdc08c96f2f4d2b06d51e52fb45a19ca0c86a1"}, + {file = "scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl", hash = 
"sha256:017367484ce5498445aade74b1d5ab377acdc65e27095155e448c88497755a5d"}, + {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:949ae67db5fa78a86e8fa644b9a6b07252f449dcf74247108c50e1d20d2b4627"}, + {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ade0e53bc1f21358aa74ff4830235d716211d7d077e340c7349bc3542e884"}, + {file = "scipy-1.13.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2ac65fb503dad64218c228e2dc2d0a0193f7904747db43014645ae139c8fad16"}, + {file = "scipy-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:cdd7dacfb95fea358916410ec61bbc20440f7860333aee6d882bb8046264e949"}, + {file = "scipy-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:436bbb42a94a8aeef855d755ce5a465479c721e9d684de76bf61a62e7c2b81d5"}, + {file = "scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:8335549ebbca860c52bf3d02f80784e91a004b71b059e3eea9678ba994796a24"}, + {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d533654b7d221a6a97304ab63c41c96473ff04459e404b83275b60aa8f4b7004"}, + {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637e98dcf185ba7f8e663e122ebf908c4702420477ae52a04f9908707456ba4d"}, + {file = "scipy-1.13.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a014c2b3697bde71724244f63de2476925596c24285c7a637364761f8710891c"}, + {file = "scipy-1.13.1-cp39-cp39-win_amd64.whl", hash = "sha256:392e4ec766654852c25ebad4f64e4e584cf19820b980bc04960bca0b0cd6eaa2"}, + {file = "scipy-1.13.1.tar.gz", hash = "sha256:095a87a0312b08dfd6a6155cbbd310a8c51800fc931b8c0b84003014b874ed3c"}, +] + +[package.dependencies] +numpy = ">=1.22.4,<2.3" + +[package.extras] +dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pydevtool", "rich-click", "ruff", "types-psutil", "typing_extensions"] +doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.12.0)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0)", "sphinx-design (>=0.4.0)"] +test = ["array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "snowballstemmer" +version = "2.2.0" +description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." 
+optional = false +python-versions = "*" +files = [ + {file = "snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"}, + {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"}, +] + +[[package]] +name = "sphinx" +version = "7.4.7" +description = "Python documentation generator" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinx-7.4.7-py3-none-any.whl", hash = "sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239"}, + {file = "sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe"}, +] + +[package.dependencies] +alabaster = ">=0.7.14,<0.8.0" +babel = ">=2.13" +colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\""} +docutils = ">=0.20,<0.22" +imagesize = ">=1.3" +importlib-metadata = {version = ">=6.0", markers = "python_version < \"3.10\""} +Jinja2 = ">=3.1" +packaging = ">=23.0" +Pygments = ">=2.17" +requests = ">=2.30.0" +snowballstemmer = ">=2.2" +sphinxcontrib-applehelp = "*" +sphinxcontrib-devhelp = "*" +sphinxcontrib-htmlhelp = ">=2.0.0" +sphinxcontrib-jsmath = "*" +sphinxcontrib-qthelp = "*" +sphinxcontrib-serializinghtml = ">=1.1.9" +tomli = {version = ">=2", markers = "python_version < \"3.11\""} + +[package.extras] +docs = ["sphinxcontrib-websupport"] +lint = ["flake8 (>=6.0)", "importlib-metadata (>=6.0)", "mypy (==1.10.1)", "pytest (>=6.0)", "ruff (==0.5.2)", "sphinx-lint (>=0.9)", "tomli (>=2)", "types-docutils (==0.21.0.20240711)", "types-requests (>=2.30.0)"] +test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=8.0)", "setuptools (>=70.0)", "typing_extensions (>=4.9)"] + +[[package]] +name = "sphinx-rtd-theme" +version = "2.0.0" +description = "Read the Docs theme for Sphinx" +optional = false +python-versions = ">=3.6" +files = [ + {file = "sphinx_rtd_theme-2.0.0-py2.py3-none-any.whl", hash = "sha256:ec93d0856dc280cf3aee9a4c9807c60e027c7f7b461b77aeffed682e68f0e586"}, + {file = "sphinx_rtd_theme-2.0.0.tar.gz", hash = "sha256:bd5d7b80622406762073a04ef8fadc5f9151261563d47027de09910ce03afe6b"}, +] + +[package.dependencies] +docutils = "<0.21" +sphinx = ">=5,<8" +sphinxcontrib-jquery = ">=4,<5" + +[package.extras] +dev = ["bump2version", "sphinxcontrib-httpdomain", "transifex-client", "wheel"] + +[[package]] +name = "sphinxcontrib-applehelp" +version = "2.0.0" +description = "sphinxcontrib-applehelp is a Sphinx extension which outputs Apple help books" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5"}, + {file = "sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-devhelp" +version = "2.0.0" +description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp documents" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2"}, + {file = "sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad"}, +] + +[package.extras] +lint = ["mypy", "ruff 
(==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-htmlhelp" +version = "2.1.0" +description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8"}, + {file = "sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["html5lib", "pytest"] + +[[package]] +name = "sphinxcontrib-jquery" +version = "4.1" +description = "Extension to include jQuery on newer Sphinx releases" +optional = false +python-versions = ">=2.7" +files = [ + {file = "sphinxcontrib-jquery-4.1.tar.gz", hash = "sha256:1620739f04e36a2c779f1a131a2dfd49b2fd07351bf1968ced074365933abc7a"}, + {file = "sphinxcontrib_jquery-4.1-py2.py3-none-any.whl", hash = "sha256:f936030d7d0147dd026a4f2b5a57343d233f1fc7b363f68b3d4f1cb0993878ae"}, +] + +[package.dependencies] +Sphinx = ">=1.8" + +[[package]] +name = "sphinxcontrib-jsmath" +version = "1.0.1" +description = "A sphinx extension which renders display math in HTML via JavaScript" +optional = false +python-versions = ">=3.5" +files = [ + {file = "sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"}, + {file = "sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178"}, +] + +[package.extras] +test = ["flake8", "mypy", "pytest"] + +[[package]] +name = "sphinxcontrib-qthelp" +version = "2.0.0" +description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp documents" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb"}, + {file = "sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["defusedxml (>=0.7.1)", "pytest"] + +[[package]] +name = "sphinxcontrib-serializinghtml" +version = "2.0.0" +description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331"}, + {file = "sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d"}, +] + +[package.extras] +lint = ["mypy", "ruff (==0.5.5)", "types-docutils"] +standalone = ["Sphinx (>=5)"] +test = ["pytest"] + +[[package]] +name = "statsmodels" +version = "0.14.3" +description = "Statistical computations and models for Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "statsmodels-0.14.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7372c92f18b8afb06355e067285abb94e8b214afd9f2fda6d3c26f3ea004cbdf"}, + {file = "statsmodels-0.14.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:42459cdaafe217f455e6b95c05d9e089caf02dd53295aebe63bc1e0206f83176"}, + {file = 
"statsmodels-0.14.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a72d3d9fe61f70baf18667bc9cf2e68b6bdd8f5cce4f7b21f9e662e19d2ffdf"}, + {file = "statsmodels-0.14.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9050e5817f23a5adcb87822406b5260758795c42c41fa2fa60816023f0a0d8ef"}, + {file = "statsmodels-0.14.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f12d74743936323165dae648f75193ee4a47381a85610be661d34de56c7634e0"}, + {file = "statsmodels-0.14.3-cp310-cp310-win_amd64.whl", hash = "sha256:53212f597747534bed475bbd89f4bc39a3757c20692bb7664021e30fbd967c53"}, + {file = "statsmodels-0.14.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e49a63757e12269ef02841f05906e91bdb70f5bc358cbaca97f171f4a4de09c4"}, + {file = "statsmodels-0.14.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:de4b989f0fea684f89bdf5ff641f9acb7acddfd712459f28365904a974afaeff"}, + {file = "statsmodels-0.14.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45a5ae177e92348532bf2522f27feecd0589b88b243709b28e2b068631c9c181"}, + {file = "statsmodels-0.14.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a736ac24da1388e444bb2b0d381a7307b29074b237acef040a793cfdd508e160"}, + {file = "statsmodels-0.14.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ea8491b6a36fca738403037709e9469412a9d3e8a8e54db482c20e8dd70efa1f"}, + {file = "statsmodels-0.14.3-cp311-cp311-win_amd64.whl", hash = "sha256:efb946ced8243923eb78909834699be55442172cea3dc37158e3e1c5370e4189"}, + {file = "statsmodels-0.14.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9bf3690f71ebacff0c976c1584994174bc1bb72785b5a35645b385a00a5107e0"}, + {file = "statsmodels-0.14.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:197bcb1aeaaa5c7e9ba4ad87c2369f9600c6cd69d6e2db829eb46d3d9fe534c9"}, + {file = "statsmodels-0.14.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:492b8fd867687f9539b1f7f111dafb2464e04f65fa834585c08725b8aa1a3d98"}, + {file = "statsmodels-0.14.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a849e78dcb3ed6416bb9043b9549415f1f8cd00426deb467ff4dfe0acbaaad8e"}, + {file = "statsmodels-0.14.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e8a82aa8a99a428f39a9ead1b03fbd2339e40908412371abe089239d21467fd5"}, + {file = "statsmodels-0.14.3-cp312-cp312-win_amd64.whl", hash = "sha256:5724e51a370227655679f1a487f429919f03de325d7b5702e919526353d0cb1d"}, + {file = "statsmodels-0.14.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:78f579f8416b91b971fb0f27e18c3dec6946b4471ac2456a98dbfd24c72d180c"}, + {file = "statsmodels-0.14.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb84759e3c1a7b77cae4e7dfdc2ea09b1f1790446fd8476951680eb79e4a568d"}, + {file = "statsmodels-0.14.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e7681296373de33d775b01201c51e340d01afb70c6a5ac9b7c66a9e120564967"}, + {file = "statsmodels-0.14.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:988346db6319f0c12e6137df674e10ebf551adb42445e05eea2e1d900898f670"}, + {file = "statsmodels-0.14.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c69b82b4f2a794199d1100ab4406f761516f71826856fa6bfc474a3189b77785"}, + {file = "statsmodels-0.14.3-cp313-cp313-win_amd64.whl", hash = "sha256:5114e5c0f10ce30616ef4a91dc24e66e1033c242589208e604d80a7931537f12"}, + {file = "statsmodels-0.14.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:280e69721925a936493153dba692b53a2fe4e3f46e5fafd32a453f5d9fa2a344"}, + {file = "statsmodels-0.14.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:97f28958e456aea788d4ffd83d7ade82d2a4a3bd5c7e8eabf791f224cddef2bf"}, + {file = "statsmodels-0.14.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ef24d6350a15f5d25f7c6cb774fce89dff77e3687181ce4410cafd6a4004f04"}, + {file = "statsmodels-0.14.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ddbd07b7d05e16d1a2ea6df3d7e2255dfb3e0363b91d859623d9fc3aff32b4a"}, + {file = "statsmodels-0.14.3-cp39-cp39-win_amd64.whl", hash = "sha256:42dfb9084a5520342248441904357bd5d7fcf01ec05c9bdc7dd764a88e15a9c4"}, + {file = "statsmodels-0.14.3.tar.gz", hash = "sha256:ecf3502643fa93aabe5f0bdf238efb59609517c4d60a811632d31fcdce86c2d2"}, +] + +[package.dependencies] +numpy = ">=1.22.3,<3" +packaging = ">=21.3" +pandas = ">=1.4,<2.1.0 || >2.1.0" +patsy = ">=0.5.6" +scipy = ">=1.8,<1.9.2 || >1.9.2" + +[package.extras] +build = ["cython (>=3.0.10)"] +develop = ["colorama", "cython (>=3.0.10)", "cython (>=3.0.10,<4)", "flake8", "isort", "joblib", "matplotlib (>=3)", "pytest (>=7.3.0,<8)", "pytest-cov", "pytest-randomly", "pytest-xdist", "pywinpty", "setuptools-scm[toml] (>=8.0,<9.0)"] +docs = ["ipykernel", "jupyter-client", "matplotlib", "nbconvert", "nbformat", "numpydoc", "pandas-datareader", "sphinx"] + +[[package]] +name = "tabulate" +version = "0.9.0" +description = "Pretty-print tabular data" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, + {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, +] + +[package.extras] +widechars = ["wcwidth"] + +[[package]] +name = "threadpoolctl" +version = "3.5.0" +description = "threadpoolctl" +optional = false +python-versions = ">=3.8" +files = [ + {file = "threadpoolctl-3.5.0-py3-none-any.whl", hash = "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467"}, + {file = "threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"}, +] + +[[package]] +name = "tomli" +version = "2.0.2" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38"}, + {file = "tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed"}, +] + +[[package]] +name = "tzdata" +version = "2024.2" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd"}, + {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"}, +] + +[[package]] +name = "urllib3" +version = "2.2.3" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, + {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +h2 = ["h2 (>=4,<5)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "zipp" +version = "3.20.2" +description = "Backport of pathlib-compatible object wrapper for zip files" +optional = false +python-versions = ">=3.8" +files = [ + {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"}, + {file = "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"}, +] + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] +type = ["pytest-mypy"] + +[metadata] +lock-version = "2.0" +python-versions = ">=3.9" +content-hash = "2237110d20a33b24e0f73322962a1ff2cc1411386309302759151579a3466605" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..de9c70b7 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,61 @@ +[tool.poetry] +name = "category_encoders" +version = "2.6.4" +description = "A package for encoding categorical variables for machine learning" +authors = ["PaulWestenthanner "] +license = "BSD-3" +readme = "README.md" + +[tool.poetry.dependencies] +python = ">=3.9" +numpy = ">=1.14.0" +scikit-learn = ">=1.0.0" +scipy = ">=1.0.0" +statsmodels = ">=0.9.0" +pandas = ">=1.0.5" +patsy = ">=0.5.1" + + +[tool.poetry.dev-dependencies] +numpydoc = "^1.8.0" +ruff = "^0.6.8" +sphinx = "^7.4.7" +sphinx_rtd_theme = "^2.0.0" +pytest = "^8.3.3" +pytest-subtests = "^0.14.1" + +[tool.ruff] +line-length = 100 +target-version = "py311" +src = ["category_encoders", "tests"] + +[tool.ruff.format] +quote-style = "single" +indent-style = "space" +docstring-code-format = true + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "N", # PEP8 naming conventions + "D" # pydocstyle +] +ignore = [ + "D401", # imperative mood + "N803", # allow X as a name for data + "N806", # allow X as a name for data + "N816", # allow mixed case names such as np_X_t as a name for data + "B023", # todo re-add this rule +] + +[tool.ruff.lint.pydocstyle] +convention = "numpy" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index 7cda0836..00000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,5 +0,0 @@ -sphinx -sphinx_rtd_theme -pytest -numpydoc -packaging \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index e558dff0..00000000 --- a/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -numpy>=1.14.0 -scikit-learn>=1.0.0 -scipy>=1.0.0 -statsmodels>=0.9.0 -pandas>=1.0.5 -patsy>=0.5.1 -unittest2 -importlib_resources ; 
python_version<"3.9" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index cb4a338e..00000000 --- a/setup.cfg +++ /dev/null @@ -1,5 +0,0 @@ -[bdist_wheel] -universal=1 - -[metadata] -description-file=README.md \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 406f79f7..00000000 --- a/setup.py +++ /dev/null @@ -1,45 +0,0 @@ -from setuptools import setup, find_packages -from codecs import open -from os import path -import re - -here = path.abspath(path.dirname(__file__)) - -# Get the long description from the README file -with open(path.join(here, 'README.md'), encoding='utf-8') as f: - long_description = f.read() - -# Get the version from the __init__.py file -with open(path.join(here, 'category_encoders/__init__.py'), encoding='utf-8') as f: - __version__ = re.findall('''__version__ = ['"](.*)['"]''', f.read())[0] - -setup( - name='category_encoders', - version=__version__, - description='A collection of sklearn transformers to encode categorical variables as numeric', - long_description=long_description, - long_description_content_type='text/markdown', - url='https://github.com/scikit-learn-contrib/category_encoders', - download_url='https://github.com/scikit-learn-contrib/category_encoders/tarball/' + __version__, - license='BSD', - classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'Programming Language :: Python :: 3', - ], - keywords='python data science machine learning pandas sklearn', - packages=find_packages(include=['category_encoders', 'category_encoders.datasets']), - include_package_data=True, - author='Will McGinnis', - install_requires=[ - 'numpy>=1.14.0', - 'scikit-learn>=0.20.0', - 'scipy>=1.0.0', - 'statsmodels>=0.9.0', - 'pandas>=1.0.5', - 'patsy>=0.5.1', - 'importlib_resources ; python_version<"3.9"', - ], - author_email='will@pedalwrencher.com', - package_data={'': ['datasets/data/*.csv']}, -) diff --git a/tests/__init__.py b/tests/__init__.py index 465906b7..327bcea2 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +1,2 @@ +"""Unit tests.""" __author__ = 'willmcginnis' diff --git a/tests/helpers.py b/tests/helpers.py index 555cf1ce..d67d3fb0 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -1,57 +1,87 @@ -"""Helper functions that are used exclusively in the tests""" +"""Helper functions that are used exclusively in the tests.""" -import numpy as np +import math import random + +import numpy as np import pandas as pd -import math def verify_numeric(X_test): - """ - Test that all attributes in the DataFrame are numeric. - """ + """Test that all attributes in the DataFrame are numeric.""" _NUMERIC_KINDS = set('buifc') - + for dt in X_test.dtypes: - assert(dt.kind in _NUMERIC_KINDS) + assert dt.kind in _NUMERIC_KINDS def create_array(n_rows=1000, extras=False, has_none=True): - """ - Creates a numpy dataset with some categorical variables. 
- """ - ds = [[ - random.random(), - random.random(), - random.choice(['A', 'B', 'C']), - random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']), - random.choice(['A', 'B', 'C', None, np.nan]) if has_none else random.choice(['A', 'B', 'C']), - random.choice(['A']) - ] for _ in range(n_rows)] + """Creates a numpy dataset with some categorical variables.""" + ds = [ + [ + random.random(), + random.random(), + random.choice(['A', 'B', 'C']), + random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']), + ( + random.choice(['A', 'B', 'C', None, np.nan]) + if has_none + else random.choice(['A', 'B', 'C']) + ), + random.choice(['A']), + ] + for _ in range(n_rows) + ] return np.array(ds) def create_dataset(n_rows=1000, extras=False, has_missing=True, random_seed=2001): - """ - Creates a dataset with some categorical variables. - """ + """Creates a dataset with some categorical variables.""" random.seed(random_seed) - ds = [[ - random.random(), # Floats - random.choice([float('nan'), float('inf'), float('-inf'), -0, 0, 1, -1, math.pi]), # Floats with edge scenarios - row, # Unique integers - str(row), # Unique strings - random.choice(['A', 'B']) if extras else 'A', # Invariant in the training data - random.choice(['A', 'B_b', 'C_c_c']), # Strings with underscores to test reverse_dummies() - random.choice(['A', 'B', 'C', np.nan]) if has_missing else random.choice(['A', 'B', 'C']), # None - random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']), # With a new string value - random.choice(['A', 'B', 'C']), # What is going to become the categorical column - random.choice(['A', 'B', 'C', np.nan]), # Categorical with missing values - random.choice([1, 2, 3]) # Ordinal integers - ] for row in range(n_rows)] - - df = pd.DataFrame(ds, columns=['float', 'float_edge', 'unique_int', 'unique_str', 'invariant', 'underscore', 'none', 'extra', 'categorical', 'na_categorical', 'categorical_int']) + ds = [ + [ + random.random(), # Floats + random.choice( + [float('nan'), float('inf'), float('-inf'), -0, 0, 1, -1, math.pi] + ), # Floats with edge scenarios + row, # Unique integers + str(row), # Unique strings + random.choice(['A', 'B']) if extras else 'A', # Invariant in the training data + random.choice( + ['A', 'B_b', 'C_c_c'] + ), # Strings with underscores to test reverse_dummies() + ( + random.choice(['A', 'B', 'C', np.nan]) + if has_missing + else random.choice(['A', 'B', 'C']) + ), # None + ( + random.choice(['A', 'B', 'C', 'D']) if extras else random.choice(['A', 'B', 'C']) + ), # With a new string value + random.choice(['A', 'B', 'C']), # What is going to become the categorical column + random.choice(['A', 'B', 'C', np.nan]), # Categorical with missing values + random.choice([1, 2, 3]), # Ordinal integers + ] + for row in range(n_rows) + ] + + df = pd.DataFrame( + ds, + columns=[ + 'float', + 'float_edge', + 'unique_int', + 'unique_str', + 'invariant', + 'underscore', + 'none', + 'extra', + 'categorical', + 'na_categorical', + 'categorical_int', + ], + ) df['categorical'] = pd.Categorical(df['categorical'], categories=['A', 'B', 'C']) df['na_categorical'] = pd.Categorical(df['na_categorical'], categories=['A', 'B', 'C']) df['categorical_int'] = pd.Categorical(df['categorical_int'], categories=[1, 2, 3]) @@ -59,14 +89,13 @@ def create_dataset(n_rows=1000, extras=False, has_missing=True, random_seed=2001 def verify_inverse_transform(x, x_inv): - """ - Verify x is equal to x_inv. The test returns true for nan.equals(nan) as it should. 
- """ + """Verify x is equal to x_inv. The test returns true for nan.equals(nan) as it should.""" assert x.equals(x_inv) def deep_round(A, ndigits=5): - """ - Rounds numbers in a list of lists. Useful for approximate equality testing. + """Rounds numbers in a list of lists. + + Useful for approximate equality testing. """ return [[round(val, ndigits) for val in sublst] for sublst in A] diff --git a/tests/test_backward_difference.py b/tests/test_backward_difference.py index d2ff4cea..89f432b8 100644 --- a/tests/test_backward_difference.py +++ b/tests/test_backward_difference.py @@ -1,145 +1,17 @@ -import pandas as pd +"""Tests for the BackwardDifferenceEncoder.""" from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ -import numpy as np + import category_encoders as encoders +import numpy as np class TestBackwardsEncoder(TestCase): - - def test_backwards_difference_encoder_preserve_dimension_1(self): - train = ['A', 'B', 'C'] - test = ['A', 'D', 'E'] - - encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') - encoder.fit(train) - test_t = encoder.transform(test) - - expected = [[1, -2 / 3.0, -1 / 3.0], - [1, 0, 0], - [1, 0, 0]] - self.assertEqual(test_t.to_numpy().tolist(), expected) - - def test_backwards_difference_encoder_preserve_dimension_2(self): - train = ['A', 'B', 'C'] - test = ['B', 'D', 'E'] - - encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') - encoder.fit(train) - test_t = encoder.transform(test) - - expected = [[1, 1 / 3.0, -1 / 3.0], - [1, 0, 0], - [1, 0, 0]] - self.assertEqual(test_t.to_numpy().tolist(), expected) - - def test_backwards_difference_encoder_preserve_dimension_3(self): - train = ['A', 'B', 'C'] - test = ['A', 'B', 'C', None] - - encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') - encoder.fit(train) - test_t = encoder.transform(test) - - expected = [[1, -2 / 3.0, -1 / 3.0], - [1, 1 / 3.0, -1 / 3.0], - [1, 1 / 3.0, 2 / 3.0], - [1, 0, 0]] - self.assertEqual(test_t.to_numpy().tolist(), expected) - - def test_backwards_difference_encoder_preserve_dimension_4(self): - train = ['A', 'B', 'C'] - test = ['D', 'B', 'C', None] - - encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') - encoder.fit(train) - test_t = encoder.transform(test) - - expected = [[1, 0, 0], - [1, 1 / 3.0, -1 / 3.0], - [1, 1 / 3.0, 2 / 3.0], - [1, 0, 0]] - self.assertEqual(test_t.to_numpy().tolist(), expected) - - def test_backwards_difference_encoder_2cols(self): - train = [['A', 'A'], ['B', 'B'], ['C', 'C']] - - encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') - encoder.fit(train) - obtained = encoder.transform(train) - - expected = [[1, -2 / 3.0, -1 / 3.0, -2 / 3.0, -1 / 3.0], - [1, 1 / 3.0, -1 / 3.0, 1 / 3.0, -1 / 3.0], - [1, 1 / 3.0, 2 / 3.0, 1 / 3.0, 2 / 3.0]] - self.assertEqual(obtained.to_numpy().tolist(), expected) - - def test_backwards_difference_encoder_2StringCols_ExpectCorrectOrder(self): - train = pd.DataFrame({'col1': [1, 2, 3, 4], - 'col2': ['A', 'B', 'C', 'D'], - 'col3': [1, 2, 3, 4], - 'col4': ['A', 'B', 'C', 'A'] - }, - columns=['col1', 'col2', 'col3', 'col4']) - expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3', 'col4_0', 'col4_1'] - encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') - - encoder.fit(train) - columns = encoder.transform(train).columns - - 
self.assertTrue(np.array_equal(expected_columns, columns)) - - def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): - train = ['A', 'B', np.nan] - - encoder = encoders.BackwardDifferenceEncoder(handle_missing='indicator') - result = encoder.fit_transform(train) - - expected = [[1, -2 / 3.0, -1 / 3.0], - [1, 1 / 3.0, -1 / 3.0], - [1, 1 / 3.0, 2 / 3.0]] - self.assertTrue(np.array_equal(result.to_numpy().tolist(), expected)) - - def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): - train = ['A', 'B'] - - encoder = encoders.BackwardDifferenceEncoder(handle_missing='indicator') - result = encoder.fit_transform(train) - - expected = [[1, -2 / 3.0, -1 / 3.0], - [1, 1 / 3.0, -1 / 3.0]] - self.assertEqual(result.to_numpy().tolist(), expected) - - def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): - train = ['A', 'B'] - test = ['A', 'B', np.nan] - - encoder = encoders.BackwardDifferenceEncoder(handle_missing='indicator') - encoder.fit(train) - result = encoder.transform(test) - - expected = [[1, -2 / 3.0, -1 / 3.0], - [1, 1 / 3.0, -1 / 3.0], - [1, 1 / 3.0, 2 / 3.0]] - self.assertEqual(result.to_numpy().tolist(), expected) - - def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): - train = ['A', 'B'] - test = ['A', 'B', 'C'] - - encoder = encoders.BackwardDifferenceEncoder(handle_unknown='indicator') - encoder.fit(train) - result = encoder.transform(test) - - expected = [[1, -2 / 3.0, -1 / 3.0], - [1, 1 / 3.0, -1 / 3.0], - [1, 1 / 3.0, 2 / 3.0]] - self.assertEqual(result.to_numpy().tolist(), expected) - - def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): - train = ['A', 'B'] - - encoder = encoders.BackwardDifferenceEncoder(handle_unknown='indicator') - result = encoder.fit_transform(train) - - expected = [[1, -2 / 3.0, -1 / 3.0], - [1, 1 / 3.0, -1 / 3.0]] - self.assertEqual(result.to_numpy().tolist(), expected) + """Unit tests for the BackwardDifferenceEncoder.""" + + def test_get_contrast_matrix(self): + """Test the BackwardDifferenceEncoder get_contrast_matrix method.""" + train = np.array([('A',), ('B',), ('C',)]) + encoder = encoders.BackwardDifferenceEncoder() + matrix = encoder.get_contrast_matrix(train) + expected_matrix = np.array([[-2 / 3, -1 / 3], [1 / 3, -1 / 3], [1 / 3, 2 / 3]]) + np.testing.assert_array_equal(matrix.matrix, expected_matrix) diff --git a/tests/test_basen.py b/tests/test_basen.py index dc2b50a7..f608e8bb 100644 --- a/tests/test_basen.py +++ b/tests/test_basen.py @@ -1,12 +1,16 @@ -import pandas as pd +"""Tests for the BaseNEncoder class.""" from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ + import category_encoders as encoders +import numpy as np +import pandas as pd class TestBaseNEncoder(TestCase): + """Tests for the BaseNEncoder class.""" - def test_fit_transform_have_base_2_expect_Correct_Encoding(self): + def test_fit_transform_have_base_2_expect_correct_encoding(self): + """Test the BaseNEncoder with base 2.""" train = pd.Series(['a', 'b', 'c', 'd']) result = encoders.BaseNEncoder(base=2).fit_transform(train) @@ -17,7 +21,8 @@ def test_fit_transform_have_base_2_expect_Correct_Encoding(self): self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) - def test_inverse_transform_HaveData_ExpectResultReturned(self): + def test_inverse_transform(self): + """Test the BaseNEncoder inverse_transform method.""" train = pd.Series(list('abcd')).to_frame('letter') enc = 
encoders.BaseNEncoder(base=2) @@ -26,7 +31,8 @@ def test_inverse_transform_HaveData_ExpectResultReturned(self): pd.testing.assert_frame_equal(train, inversed_result) - def test_HaveIndicatorAndNanValue_ExpectNewColumn(self): + def test_handle_missing_indicator_with_nan(self): + """Test the BaseNEncoder with handle_missing='indicator'.""" train = pd.Series(['a', 'b', 'c', np.nan]) result = encoders.BaseNEncoder(handle_missing='indicator', base=2).fit_transform(train) @@ -37,31 +43,34 @@ def test_HaveIndicatorAndNanValue_ExpectNewColumn(self): self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) - def test_HandleMissingIndicator_HaveNoNan_ExpectThirdColumn(self): + def test_handle_missing_indicator_without_nan(self): + """Test the BaseNEncoder with handle_missing='indicator'. + + This should still add an indicator column for predict time, even if there was no missing value in the training set. + """ train = pd.Series(['a', 'b', 'c']) - result = encoders.BaseNEncoder(handle_missing='indicator', base=2).fit_transform(train) + encoder = encoders.BaseNEncoder(handle_missing='indicator', base=2) + result = encoder.fit_transform(train) self.assertEqual(3, result.shape[0]) self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) - def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): - train = pd.Series(['a', 'b', 'c']) - test = pd.Series(['a', 'b', 'c', np.nan]) + with self.subTest("should work with a missing value in the test set"): + test = pd.Series(['a', 'b', 'c', np.nan]) - encoder = encoders.BaseNEncoder(handle_missing='indicator') - encoder.fit(train) - result = encoder.transform(test) + result = encoder.transform(test) - self.assertEqual(4, result.shape[0]) - self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) - self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) - self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) - self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) + self.assertEqual(4, result.shape[0]) + self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) + self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) + self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) + self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) - def test_HandleUnknown_HaveUnknown_ExpectIndicatorInTest(self): + def test_handle_unknown_indicator(self): + """Test the BaseNEncoder with handle_unknown='indicator'.""" train = ['A', 'B', 'C'] test = ['A', 'B', 'C', 'D']
pd.DataFrame({'city': ['chicago', np.nan]}) - enc = encoders.BaseNEncoder(handle_missing='value', handle_unknown='value') - result = enc.fit_transform(train) - original = enc.inverse_transform(result) + handle_missing = ["value", "return_nan"] + for handle_missing_strategy in handle_missing: + with self.subTest(f"Should work for handle_missing='{handle_missing_strategy}'"): + enc = encoders.BaseNEncoder(handle_missing=handle_missing_strategy, + handle_unknown='value') + result = enc.fit_transform(train) + original = enc.inverse_transform(result) + pd.testing.assert_frame_equal(train, original) - pd.testing.assert_frame_equal(train, original) + def test_inverse_transform_not_supported_with_unknown_values(self): + """Test that inverse_transform is not supported if a nan could be either missing or unknown. - def test_inverse_transform_HaveNanInTrainAndHandleMissingReturnNan_ExpectReturnedWithNan(self): - train = pd.DataFrame({'city': ['chicago', np.nan]}) - - enc = encoders.BaseNEncoder(handle_missing='return_nan', handle_unknown='value') - result = enc.fit_transform(train) - original = enc.inverse_transform(result) - - pd.testing.assert_frame_equal(train, original) - - def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self): + This happens if both handle_missing and handle_unknown are set to 'return_nan'. + """ train = pd.DataFrame({'city': ['chicago', np.nan]}) test = pd.DataFrame({'city': ['chicago', 'los angeles']}) enc = encoders.BaseNEncoder(handle_missing='return_nan', handle_unknown='return_nan') enc.fit(train) result = enc.transform(test) - - message = 'inverse_transform is not supported because transform impute '\ - 'the unknown category nan when encode city' - with self.assertWarns(UserWarning, msg=message) as w: - enc.inverse_transform(result) + message = ( + 'inverse_transform is not supported because transform impute ' + 'the unknown category nan when encode city' + ) - def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self): - train = pd.DataFrame({'city': ['chicago', np.nan]}) - test = pd.DataFrame({'city': ['chicago', 'los angeles']}) - - enc = encoders.BaseNEncoder(handle_missing='value', handle_unknown='return_nan') - enc.fit(train) - result = enc.transform(test) - original = enc.inverse_transform(result) + with self.assertWarns(UserWarning, msg=message): + enc.inverse_transform(result) - pd.testing.assert_frame_equal(train, original) + def test_inverse_transform_with_missing_and_unknown(self): + """Test the BaseNEncoder inverse_transform method with missing and unknown values. - def test_inverse_transform_HaveHandleMissingValueAndHandleUnknownReturnNan_ExpectBestInverse(self): + In the case of handle_missing='value' and handle_unknown='return_nan', + the inverse_transform can distinguish between missing and unknown values and + hence should work. Unknown values are encoded as nan in the inverse. 
+ """ train = pd.DataFrame({'city': ['chicago', np.nan]}) - test = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']}) - expected = pd.DataFrame({'city': ['chicago', np.nan, np.nan]}) enc = encoders.BaseNEncoder(handle_missing='value', handle_unknown='return_nan') enc.fit(train) - result = enc.transform(test) - original = enc.inverse_transform(result) - - pd.testing.assert_frame_equal(expected, original) - - def test_inverse_transform_HaveRegexMetacharactersInColumnName_ExpectInversed(self): + with self.subTest("should work with only unknown values"): + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + result = enc.transform(test) + original = enc.inverse_transform(result) + pd.testing.assert_frame_equal(train, original) + + with self.subTest("should inverse transform unknowns and missing values to NaN"): + test = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']}) + expected = pd.DataFrame({'city': ['chicago', np.nan, np.nan]}) + result = enc.transform(test) + original = enc.inverse_transform(result) + pd.testing.assert_frame_equal(expected, original) + + def test_inverse_transform_have_regex_metacharacters_in_column_name(self): + """Test the inverse_transform method with regex metacharacters in column name.""" train = pd.DataFrame({'state (2-letter code)': ['il', 'ny', 'ca']}) enc = encoders.BaseNEncoder() @@ -151,8 +164,7 @@ def test_inverse_transform_HaveRegexMetacharactersInColumnName_ExpectInversed(se pd.testing.assert_frame_equal(train, original) def test_num_cols(self): - """ - Test that BaseNEncoder produces the correct number of output columns. + """Test that BaseNEncoder produces the correct number of output columns. Since the value 0 is reserved for encoding unseen values, there need to be enough digits to represent up to nvals + 1 distinct encodings, where nvals is the number of distinct input @@ -161,8 +173,9 @@ def test_num_cols(self): This test specifically checks the case where BaseNEncoder is initialized with handle_unknown='value' and handle_missing='value' (i.e. the defaults). 
""" + def num_cols(nvals, base): - """Returns the number of columns output for a given number of distinct input values""" + """Returns the number of columns output for a given number of distinct input values.""" vals = [str(i) for i in range(nvals)] df = pd.DataFrame({'vals': vals}) encoder = encoders.BaseNEncoder(base=base) diff --git a/tests/test_binary.py b/tests/test_binary.py index f315274c..1d9051bd 100644 --- a/tests/test_binary.py +++ b/tests/test_binary.py @@ -1,18 +1,22 @@ -import pandas as pd +"""Tests for the BinaryEncoder.""" from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ -import numpy as np import category_encoders as encoders +import numpy as np +import pandas as pd class TestBinaryEncoder(TestCase): + """Unit tests for the BinaryEncoder.""" def test_binary_bin(self): + """Test the BinaryEncoder with only two values.""" data = np.array(['a', 'ba', 'ba']) out = encoders.BinaryEncoder().fit_transform(data) self.assertTrue(pd.DataFrame([[0, 1], [1, 0], [1, 0]], columns=['0_0', '0_1']).equals(out)) def test_binary_dist(self): + """Test the BinaryEncoder with a all distinct values.""" data = np.array(['apple', 'orange', 'peach', 'lemon']) encoder = encoders.BinaryEncoder() encoder.fit(data) diff --git a/tests/test_cat_boost.py b/tests/test_cat_boost.py index b149b906..cc53bc35 100644 --- a/tests/test_cat_boost.py +++ b/tests/test_cat_boost.py @@ -1,59 +1,99 @@ -import pandas as pd -import numpy as np +"""Tests for cat boost encoder.""" from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ import category_encoders as encoders +import numpy as np +import pandas as pd class TestCatBoostEncoder(TestCase): + """Tests for the CatBoostEncoder.""" - def test_catBoost(self): + def test_cat_boost(self): + """Test the CatBoostEncoder.""" X = pd.DataFrame({'col1': ['A', 'B', 'B', 'C', 'A']}) y = pd.Series([1, 0, 1, 0, 1]) enc = encoders.CatBoostEncoder() obtained = enc.fit_transform(X, y) - self.assertEqual(list(obtained['col1']), [0.6, 0.6, 0.6/2, 0.6, 1.6/2], 'The nominator is incremented by the prior. The denominator by 1.') + self.assertEqual( + list(obtained['col1']), + [0.6, 0.6, 0.6 / 2, 0.6, 1.6 / 2], + 'The nominator is incremented by the prior. The denominator by 1.', + ) # For testing set, use statistics calculated on all the training data. # See: CatBoost: unbiased boosting with categorical features, page 4. X_t = pd.DataFrame({'col1': ['B', 'B', 'A']}) obtained = enc.transform(X_t) - self.assertEqual(list(obtained['col1']), [1.6/3, 1.6/3, 2.6/3]) + self.assertEqual(list(obtained['col1']), [1.6 / 3, 1.6 / 3, 2.6 / 3]) + + def test_cat_boost_missing(self): + """Test the CatBoostEncoder with missing values. - def test_catBoost_missing(self): + Should impute according to cat boost for missing values. 
+ """ X = pd.DataFrame({'col1': ['A', 'B', 'B', 'C', 'A', np.nan, np.nan, np.nan]}) y = pd.Series([1, 0, 1, 0, 1, 0, 1, 0]) enc = encoders.CatBoostEncoder(handle_missing='value') obtained = enc.fit_transform(X, y) - self.assertEqual(list(obtained['col1']), [0.5, 0.5, 0.5/2, 0.5, 1.5/2, 0.5, 0.5/2, 1.5/3], 'We treat None as another category.') + self.assertEqual( + list(obtained['col1']), + [0.5, 0.5, 0.5 / 2, 0.5, 1.5 / 2, 0.5, 0.5 / 2, 1.5 / 3], + 'We treat None as another category.', + ) X_t = pd.DataFrame({'col1': ['B', 'B', 'A', np.nan]}) obtained = enc.transform(X_t) - self.assertEqual(list(obtained['col1']), [1.5/3, 1.5/3, 2.5/3, 1.5/4]) - - def test_catBoost_reference(self): - # The reference is from: - # https://catboost.ai/docs/concepts/algorithm-main-stages_cat-to-numberic.html - # paragraph: - # Transforming categorical features to numerical features in classification - # as obtained on 17 Aug 2019. + self.assertEqual(list(obtained['col1']), [1.5 / 3, 1.5 / 3, 2.5 / 3, 1.5 / 4]) + + def test_cat_boost_reference(self): + """Test a specific case mentioned on catboost website. + + The reference is from: + https://catboost.ai/docs/concepts/algorithm-main-stages_cat-to-numberic.html + paragraph: + Transforming categorical features to numerical features in classification + as obtained on 17 Aug 2019. + """ X = pd.DataFrame({'col1': ['rock', 'indie', 'rock', 'rock', 'pop', 'indie', 'rock']}) y = pd.Series([0, 0, 1, 1, 1, 0, 0]) enc = encoders.CatBoostEncoder() obtained = enc.fit_transform(X, y) - prior = 3./7 # Since we do not support prior passing, we replace the prior in the reference = 0.05 with the sample prior = 3/7. - self.assertEqual(list(obtained['col1']), [prior, prior, prior/2, (1+prior)/3, prior, prior/2, (2+prior)/4]) - - def test_catBoost_reference2(self): - # The reference is from: - # https://www.youtube.com/watch?v=hqYQ8Yj9vB0 - # time: - # 35:03 - # as obtained on 21 Aug 2019. - # Note: they have an error at line [smooth 6 4.3 4.1]. It should be [smooth 6 4 4.1 3.9] - X = pd.DataFrame({'col1': ['fuzzy', 'soft', 'smooth', 'fuzzy', 'smooth', 'soft', 'smooth', 'smooth']}) + # Since we do not support prior passing, we replace the prior in the reference = 0.05 + # with the sample prior = 3/7. + prior = 3.0 / 7 + self.assertEqual( + list(obtained['col1']), + [prior, prior, prior / 2, (1 + prior) / 3, prior, prior / 2, (2 + prior) / 4], + ) + + def test_cat_boost_reference_2(self): + """Test another reference case stated in a video. + + The reference is from: + https://www.youtube.com/watch?v=hqYQ8Yj9vB0 + time: + 35:03 + as obtained on 21 Aug 2019. + Note: they have an error at line [smooth 6 4.3 4.1]. 
It should be [smooth 6 4 4.1 3.9] + """ + X = pd.DataFrame( + {'col1': ['fuzzy', 'soft', 'smooth', 'fuzzy', 'smooth', 'soft', 'smooth', 'smooth']} + ) y = pd.Series([4, 1, 4, 3, 6, 0, 7, 5]) enc = encoders.CatBoostEncoder() obtained = enc.fit_transform(X, y) - prior = 30./8 - self.assertEqual(list(obtained['col1']), [prior, prior, prior, (4+prior)/2, (4+prior)/2, (1+prior)/2, (10+prior)/3, (17+prior)/4]) + prior = 30.0 / 8 + self.assertEqual( + list(obtained['col1']), + [ + prior, + prior, + prior, + (4 + prior) / 2, + (4 + prior) / 2, + (1 + prior) / 2, + (10 + prior) / 3, + (17 + prior) / 4, + ], + ) diff --git a/tests/test_count.py b/tests/test_count.py index f15739c8..746ca1ac 100644 --- a/tests/test_count.py +++ b/tests/test_count.py @@ -1,36 +1,91 @@ -import pandas as pd +"""Tests for the CountEncoder.""" from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ -import numpy as np + import category_encoders as encoders +import numpy as np +import pandas as pd +X = pd.DataFrame( + { + 'none': [ + 'A', + 'A', + 'B', + None, + None, + 'C', + None, + 'C', + None, + 'B', + 'A', + 'A', + 'C', + 'B', + 'B', + 'A', + 'A', + None, + 'B', + None, + ], + 'na_categorical': [ + 'A', + 'A', + 'C', + 'A', + 'B', + 'C', + 'C', + 'A', + np.nan, + 'B', + 'A', + 'C', + 'C', + 'A', + 'B', + 'C', + np.nan, + 'A', + np.nan, + np.nan, + ], + } +) + +X_t = pd.DataFrame( + { + 'none': ['A', 'C', None, 'B', 'C', 'C', None, None, 'A', 'A', 'C', 'A', 'B', 'A', 'A'], + 'na_categorical': [ + 'C', + 'C', + 'A', + 'B', + 'C', + 'A', + np.nan, + 'B', + 'A', + 'A', + 'B', + np.nan, + 'A', + np.nan, + 'A', + ], + } +) -X = pd.DataFrame({ - 'none': [ - 'A', 'A', 'B', None, None, 'C', None, 'C', None, 'B', - 'A', 'A', 'C', 'B', 'B', 'A', 'A', None, 'B', None - ], - 'na_categorical': [ - 'A', 'A', 'C', 'A', 'B', 'C', 'C', 'A', np.nan, 'B', 'A', - 'C', 'C', 'A', 'B', 'C', np.nan, 'A', np.nan, np.nan - ] -}) - -X_t = pd.DataFrame({ - 'none': [ - 'A', 'C', None, 'B', 'C', 'C', None, None, 'A', - 'A', 'C', 'A', 'B', 'A', 'A' - ], - 'na_categorical': [ - 'C', 'C', 'A', 'B', 'C', 'A', np.nan, 'B', 'A', 'A', - 'B', np.nan, 'A', np.nan, 'A' - ] -}) class TestCountEncoder(TestCase): + """Unit tests for the CountEncoder.""" def test_count_defaults(self): - """Test the defaults are working as expected on 'none' and 'categorical' - which are the most extreme edge cases for the count encoder.""" + """Test the defaults are working as expected on 'none' and 'categorical'. + + These are the most extreme edge cases for the count encoder. + """ enc = encoders.CountEncoder(verbose=1) enc.fit(X) out = enc.transform(X_t) @@ -44,9 +99,7 @@ def test_count_defaults(self): def test_count_handle_missing_string(self): """Test the handle_missing string on 'none' and 'na_categorical'.""" - enc = encoders.CountEncoder( - handle_missing='return_nan' - ) + enc = encoders.CountEncoder(handle_missing='return_nan') enc.fit(X) out = enc.transform(X_t) @@ -61,11 +114,11 @@ def test_count_handle_missing_string(self): self.assertTrue(out['na_categorical'].isna().sum() == 3) def test_count_handle_missing_dict(self): - """Test the handle_missing dict on 'none' and 'na_categorical'. - We want to see differing behaviour between 'none' and 'na_cat' cols.""" - enc = encoders.CountEncoder( - handle_missing={'na_categorical': 'return_nan'} - ) + """Test the handle_missing dict on 'none' and 'na_categorical'. + + We want to see differing behaviour between 'none' and 'na_cat' cols. 
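+
+        Here only 'na_categorical' is mapped to 'return_nan', so its missing values come back
+        as NaN, while the 'none' column keeps the default behaviour of counting NaN as its
+        own group.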
+ """ + enc = encoders.CountEncoder(handle_missing={'na_categorical': 'return_nan'}) enc.fit(X) out = enc.transform(X_t) @@ -81,8 +134,10 @@ def test_count_handle_missing_dict(self): def test_count_handle_unknown_string(self): """Test the handle_unknown string on 'none' and 'na_categorical'. + The 'handle_missing' must be set to 'return_nan' in order to test - 'handle_unkown' correctly.""" + 'handle_unkown' correctly. + """ enc = encoders.CountEncoder( handle_missing='return_nan', handle_unknown='return_nan', @@ -103,10 +158,7 @@ def test_count_handle_unknown_dict(self): """Test the 'handle_unkown' dict with all non-default options.""" enc = encoders.CountEncoder( handle_missing='return_nan', - handle_unknown={ - 'none': -1, - 'na_categorical': 'return_nan' - }, + handle_unknown={'none': -1, 'na_categorical': 'return_nan'}, ) enc.fit(X) @@ -137,9 +189,7 @@ def test_count_min_group_size_int(self): def test_count_min_group_size_dict(self): """Test the min_group_size dict on 'none' and 'na_categorical'.""" - enc = encoders.CountEncoder( - min_group_size={'none': 6, 'na_categorical': 7} - ) + enc = encoders.CountEncoder(min_group_size={'none': 6, 'na_categorical': 7}) enc.fit(X) out = enc.transform(X_t) @@ -155,10 +205,7 @@ def test_count_min_group_size_dict(self): def test_count_combine_min_nan_groups_bool(self): """Test the min_nan_groups_bool on 'none' and 'na_categorical'.""" - enc = encoders.CountEncoder( - min_group_size=7, - combine_min_nan_groups=False - ) + enc = encoders.CountEncoder(min_group_size=7, combine_min_nan_groups=False) enc.fit(X) out = enc.transform(X_t) @@ -174,14 +221,8 @@ def test_count_combine_min_nan_groups_bool(self): def test_count_combine_min_nan_groups_dict(self): """Test the combine_min_nan_groups dict on 'none' and 'na_categorical'.""" enc = encoders.CountEncoder( - min_group_size={ - 'none': 6, - 'na_categorical': 7 - }, - combine_min_nan_groups={ - 'none': 'force', - 'na_categorical': False - } + min_group_size={'none': 6, 'na_categorical': 7}, + combine_min_nan_groups={'none': 'force', 'na_categorical': False}, ) enc.fit(X) @@ -198,10 +239,7 @@ def test_count_combine_min_nan_groups_dict(self): def test_count_min_group_name_string(self): """Test the min_group_name string on 'none' and 'na_categorical'.""" - enc = encoders.CountEncoder( - min_group_size=6, - min_group_name='dave' - ) + enc = encoders.CountEncoder(min_group_size=6, min_group_name='dave') enc.fit(X) @@ -213,12 +251,8 @@ def test_count_min_group_name_string(self): def test_count_min_group_name_dict(self): """Test the min_group_name dict on 'none' and 'na_categorical'.""" enc = encoders.CountEncoder( - min_group_size={ - 'none': 6, 'na_categorical': 6 - }, - min_group_name={ - 'none': 'dave', 'na_categorical': None - } + min_group_size={'none': 6, 'na_categorical': 6}, + min_group_name={'none': 'dave', 'na_categorical': None}, ) enc.fit(X) @@ -231,10 +265,7 @@ def test_count_min_group_name_dict(self): def test_count_normalize_bool(self): """Test the normalize bool on 'none' and 'na_categorical'.""" - enc = encoders.CountEncoder( - min_group_size=6, - normalize=True - ) + enc = encoders.CountEncoder(min_group_size=6, normalize=True) enc.fit(X) out = enc.transform(X_t) @@ -250,17 +281,14 @@ def test_count_normalize_bool(self): def test_count_normalize_dict(self): """Test the normalize dict on 'none' and 'na_categorical'.""" enc = encoders.CountEncoder( - min_group_size=7, - normalize={ - 'none': True, 'na_categorical': False - } + min_group_size=7, normalize={'none': True, 'na_categorical': False} ) 
enc.fit(X) out = enc.transform(X_t) self.assertIn('none', enc._normalize) - self.assertTrue(out['none'].round(5).isin([0.3 , 0.15, 0.25]).all()) + self.assertTrue(out['none'].round(5).isin([0.3, 0.15, 0.25]).all()) self.assertEqual(out['none'].unique().shape[0], 3) self.assertEqual(out['none'].isna().sum(), 0) self.assertTrue(pd.Series([13, 7]).isin(out['na_categorical']).all()) diff --git a/tests/test_encoders.py b/tests/test_encoders.py index f0dab7e1..28487131 100644 --- a/tests/test_encoders.py +++ b/tests/test_encoders.py @@ -1,17 +1,21 @@ +"""Tests for the encoders.""" import warnings +from copy import deepcopy from datetime import timedelta +from unittest import TestCase +import category_encoders as encoders import numpy as np -from numpy.testing import assert_array_equal import pandas as pd -import sklearn -import tests.helpers as th -from sklearn.utils.estimator_checks import check_transformer_general, check_transformers_unfitted, check_n_features_in +from numpy.testing import assert_array_equal from sklearn.compose import ColumnTransformer -from unittest import TestCase -from copy import deepcopy +from sklearn.utils.estimator_checks import ( + check_n_features_in, + check_transformer_general, + check_transformers_unfitted, +) -import category_encoders as encoders +import tests.helpers as th __author__ = 'willmcginnis' @@ -34,22 +38,38 @@ warnings.filterwarnings('error') -# this class utilises parametrised tests where we loop over different encoders class TestEncoders(TestCase): + """Tests for the encoders. + + This is more of functional and property-based testing than unit testing. + """ def test_np(self): + """Test all encoders with numpy arrays as input.""" for encoder_name in encoders.__all__: with self.subTest(encoder_name=encoder_name): - # Encode a numpy array enc = getattr(encoders, encoder_name)() enc.fit(np_X, np_y) th.verify_numeric(enc.transform(np_X_t)) def test_classification(self): + """Perform some basic testing of all encoders. + + This includes running the pipeline on various data types and with different parameters. + """ for encoder_name in encoders.__all__: with self.subTest(encoder_name=encoder_name): - cols = ['unique_str', 'underscore', 'extra', 'none', 'invariant', 'categorical', 'na_categorical', 'categorical_int'] + cols = [ + 'unique_str', + 'underscore', + 'extra', + 'none', + 'invariant', + 'categorical', + 'na_categorical', + 'categorical_int', + ] enc = getattr(encoders, encoder_name)(cols=cols) enc.fit(X, np_y) @@ -66,11 +86,15 @@ def test_classification(self): enc = getattr(encoders, encoder_name)(return_df=False) enc.fit(X, np_y) self.assertTrue(isinstance(enc.transform(X_t), np.ndarray)) - self.assertEqual(enc.transform(X_t).shape[0], X_t.shape[0], 'Row count must not change') + self.assertEqual( + enc.transform(X_t).shape[0], X_t.shape[0], 'Row count must not change' + ) # encoders should be re-fittable (c.f. 
issue 122)                  X_a = pd.DataFrame(data=['1', '2', '2', '2', '2', '2'], columns=['col_a']) -                X_b = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_b'])  # different values and name +                X_b = pd.DataFrame( +                    data=['1', '1', '1', '2', '2', '2'], columns=['col_b'] +                )  # different values and name                  y_dummy = [True, False, True, False, True, False]                  enc = getattr(encoders, encoder_name)()                  enc.fit(X_a, y_dummy) @@ -78,20 +102,22 @@ def test_classification(self):                  th.verify_numeric(enc.transform(X_b))      def test_deepcopy(self): -        # Generate instance of evert encoder and test deepcopyable -        # See: https://github.com/scikit-learn-contrib/categorical-encoding/pull/194 +        """Generate instance of every encoder and test if it is deepcopy-able. + +        See: https://github.com/scikit-learn-contrib/categorical-encoding/pull/194 +        """          for encoder_name in encoders.__all__:              with self.subTest(encoder_name=encoder_name):                  enc = getattr(encoders, encoder_name)() -                enc2 = deepcopy(enc) +                _ = deepcopy(enc)      def test_impact_encoders(self): +        """Test that supervised encoders use a target variable."""          for encoder_name in encoders.__all__:              enc = getattr(encoders, encoder_name)() -            if not enc._get_tags().get("supervised_encoder"): +            if not enc._get_tags().get('supervised_encoder'):                  continue              with self.subTest(encoder_name=encoder_name): -                # encode a numpy array and transform with the help of the target                  enc.fit(np_X, np_y)                  th.verify_numeric(enc.transform(np_X_t, np_y_t)) @@ -101,15 +127,16 @@ def test_impact_encoders(self):                  enc.fit(X, y)                  th.verify_numeric(enc.transform(X_t, y_t)) -                # when we run transform(X, y) and there is a new value in X, something is wrong and we raise an error +                # when we run transform(X, y) and there is a new value in X, +                # something is wrong and we raise an error                  enc = getattr(encoders, encoder_name)(handle_unknown='error', cols=['extra'])                  enc.fit(X, y)                  self.assertRaises(ValueError, enc.transform, (X_t, y_t))      def test_error_handling(self): +        """Test that the encoder raises an error if the input is wrong."""          for encoder_name in encoders.__all__:              with self.subTest(encoder_name=encoder_name): -                # we exclude some columns                  X = th.create_dataset(n_rows=100)                  X = X.drop(['unique_str', 'none'], axis=1) @@ -133,13 +160,14 @@ def test_error_handling(self):                  self.assertTrue(enc.transform(X_t).equals(X_t))      def test_handle_unknown_error(self): +        """The encoder should raise an error if there is a new value and handle_unknown='error'."""          # BaseN has problems with None -> ignore None          X = th.create_dataset(n_rows=100, has_missing=False)          X_t = th.create_dataset(n_rows=50, extras=True, has_missing=False) -        for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}):  # HashingEncoder supports new values by design -> excluded +        # HashingEncoder supports new values by design -> excluded +        for encoder_name in set(encoders.__all__) - {'HashingEncoder'}:              with self.subTest(encoder_name=encoder_name): -                # new value during scoring                  enc = getattr(encoders, encoder_name)(handle_unknown='error')                  enc.fit(X, y) @@ -147,14 +175,19 @@ def test_handle_unknown_error(self):                      _ = enc.transform(X_t)      def test_handle_missing_error(self): -        non_null = pd.DataFrame({'city': ['chicago', 'los angeles'], 'color': ['red', np.nan]})  # only 'city' column is going to be transformed +        """The encoder should raise an error if there is a NaN value and handle_missing='error'.""" +        non_null = pd.DataFrame( +            {'city': ['chicago', 'los angeles'], 'color': ['red', np.nan]} +        )  # only 'city' column is going to be transformed          has_null = pd.DataFrame({'city':
['chicago', np.nan], 'color': ['red', np.nan]}) -        has_null_pd = pd.DataFrame({'city': ['chicago', pd.NA], 'color': ['red', pd.NA]}, dtype="string") +        has_null_pd = pd.DataFrame( +            {'city': ['chicago', pd.NA], 'color': ['red', pd.NA]}, dtype='string' +        )          y = pd.Series([1, 0]) -        for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}):  # HashingEncoder supports new values by design -> excluded +        # HashingEncoder supports new values by design -> excluded +        for encoder_name in set(encoders.__all__) - {'HashingEncoder'}:              with self.subTest(encoder_name=encoder_name): -                enc = getattr(encoders, encoder_name)(handle_missing='error', cols='city')                  with self.assertRaises(ValueError):                      enc.fit(has_null, y) @@ -162,37 +195,53 @@ def test_handle_missing_error_2cols(self):                  with self.assertRaises(ValueError):                      enc.fit(has_null_pd, y) -                enc.fit(non_null, y)  # we raise an error only if a missing value is in one of the transformed columns +                # we raise an error only if a missing value is in one of the transformed columns +                enc.fit(non_null, y)                  with self.assertRaises(ValueError):                      enc.transform(has_null)      def test_handle_missing_error_2cols(self): -        # See issue #213 -        non_null = pd.DataFrame({'country': ['us', 'uk'], 'city': ['chicago', 'los angeles'], 'color': ['red', np.nan]})  # only 'city' column is going to be transformed -        has_null = pd.DataFrame({'country': ['us', 'uk'], 'city': ['chicago', np.nan], 'color': ['red', np.nan]}) +        """The encoder should raise an error if there is a NaN value and handle_missing='error'. + +        See issue #213. +        This test covers the case of multiple columns. +        """ +        non_null = pd.DataFrame( +            {'country': ['us', 'uk'], 'city': ['chicago', 'los angeles'], 'color': ['red', np.nan]} +        )  # only 'city' column is going to be transformed +        has_null = pd.DataFrame( +            {'country': ['us', 'uk'], 'city': ['chicago', np.nan], 'color': ['red', np.nan]} +        )          y = pd.Series([1, 0]) -        for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}):  # HashingEncoder supports new values by design -> excluded +        # HashingEncoder supports new values by design -> excluded +        for encoder_name in set(encoders.__all__) - {'HashingEncoder'}:              with self.subTest(encoder_name=encoder_name): - -                enc = getattr(encoders, encoder_name)(handle_missing='error', cols=['country', 'city']) +                enc = getattr(encoders, encoder_name)( +                    handle_missing='error', cols=['country', 'city'] +                )                  with self.assertRaises(ValueError):                      enc.fit(has_null, y) -                enc.fit(non_null, y)  # we raise an error only if a missing value is in one of the transformed columns +                # we raise an error only if a missing value is in one of the transformed columns +                enc.fit(non_null, y)                  with self.assertRaises(ValueError):                      enc.transform(has_null)      def test_handle_unknown_return_nan(self): +        """Test that the encoder implements a handle_unknown='return_nan' strategy."""          train = pd.DataFrame({'city': ['chicago', 'los angeles']})          test = pd.DataFrame({'city': ['chicago', 'denver']})          y = pd.Series([1, 0]) -        for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}):  # HashingEncoder supports new values by design -> excluded +        # HashingEncoder supports new values by design -> excluded +        for encoder_name in set(encoders.__all__) - {'HashingEncoder'}:              with self.subTest(encoder_name=encoder_name): -                enc = getattr(encoders, encoder_name)(handle_unknown='return_nan')                  enc.fit(train, y)                  result = enc.transform(test).iloc[1, :] @@ -203,11 +252,14 @@ def test_handle_unknown_return_nan(self):                  self.assertTrue(result[1:].isna().all())      def
test_handle_missing_return_nan_train(self): +        """Test that the encoder implements a handle_missing='return_nan' strategy."""          X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.nan]}) -        X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype="string") +        X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype='string')          y = pd.Series([1, 0, 1]) -        for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}):  # HashingEncoder supports new values by design -> excluded +        # HashingEncoder supports new values by design -> excluded +        for encoder_name in set(encoders.__all__) - {'HashingEncoder'}:              for X in (X_np, X_pd):                  with self.subTest(encoder_name=encoder_name):                      enc = getattr(encoders, encoder_name)(handle_missing='return_nan') @@ -219,12 +271,14 @@ def test_handle_missing_return_nan_train(self):                      self.assertTrue(result[1:].isna().all())      def test_handle_missing_return_nan_test(self): +        """Test that the encoder implements a handle_missing='return_nan' strategy."""          X = pd.DataFrame({'city': ['chicago', 'los angeles', 'chicago']})          X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.nan]}) -        X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype="string") +        X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype='string')          y = pd.Series([1, 0, 1]) -        for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}):  # HashingEncoder supports new values by design -> excluded +        # HashingEncoder supports new values by design -> excluded +        for encoder_name in set(encoders.__all__) - {'HashingEncoder'}:              for X_na in (X_np, X_pd):                  with self.subTest(encoder_name=encoder_name):                      enc = getattr(encoders, encoder_name)(handle_missing='return_nan') @@ -236,11 +290,13 @@ def test_handle_missing_return_nan_test(self):                      self.assertTrue(result[1:].isna().all())      def test_handle_unknown_value(self): +        """Test that each encoder implements a handle_unknown='value' strategy."""          train = pd.DataFrame({'city': ['chicago', 'los angeles']})          test = pd.DataFrame({'city': ['chicago', 'denver']})          y = pd.Series([1, 0]) -        for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}):  # HashingEncoder supports new values by design -> excluded +        # HashingEncoder supports new values by design -> excluded +        for encoder_name in set(encoders.__all__) - {'HashingEncoder'}:              with self.subTest(encoder_name=encoder_name):                  enc = getattr(encoders, encoder_name)(handle_unknown='value')                  enc.fit(train, y) @@ -248,6 +304,7 @@ def test_handle_unknown_value(self):                  self.assertFalse(result.iloc[1, :].isna().all())      def test_sklearn_compliance(self): +        """Test that the encoders are sklearn compliant."""          for encoder_name in encoders.__all__:              with self.subTest(encoder_name=encoder_name):                  encoder = getattr(encoders, encoder_name)() @@ -257,28 +314,30 @@ def test_sklearn_compliance(self):                  train = pd.DataFrame({'city': ['chicago', 'los angeles']})                  y = pd.Series([1, 0])                  encoder.fit(train, y) -                self.assertTrue(hasattr(encoder, "feature_names_out_")) -                self.assertListEqual(encoder.feature_names_in_, ["city"]) +                self.assertTrue(hasattr(encoder, 'feature_names_out_')) +                assert_array_equal(encoder.feature_names_in_, ['city'])                  self.assertEqual(encoder.n_features_in_, 1)                  self.assertIsInstance(encoder.get_feature_names_out(), np.ndarray) -                self.assertIsInstance(encoder.get_feature_names_in(), list) +                self.assertIsInstance(encoder.get_feature_names_in(), np.ndarray)      def test_inverse_transform(self): -        # we do not allow None in these data (but "none" column without any missing value is
ok) + """Test that the inverse transform works. + + We do not allow None in these data (but "none" column without any missing value is ok). + """ X = th.create_dataset(n_rows=100, has_missing=False) X_t = th.create_dataset(n_rows=50, has_missing=False) cols = ['underscore', 'none', 'categorical', 'categorical_int'] for encoder_name in ['BaseNEncoder', 'BinaryEncoder', 'OneHotEncoder', 'OrdinalEncoder']: with self.subTest(encoder_name=encoder_name): - # simple run enc = getattr(encoders, encoder_name)(verbose=1, cols=cols) enc.fit(X) th.verify_inverse_transform(X_t, enc.inverse_transform(enc.transform(X_t))) def test_inverse_uninitialized(self): - # raise an error when we call inverse_transform() before the encoder is fitted + """Raise an error when we call inverse_transform() before the encoder is fitted.""" # @ToDo parametrize for encoder_name in {'BaseNEncoder', 'BinaryEncoder', 'OrdinalEncoder', 'OneHotEncoder'}: with self.subTest(encoder_name=encoder_name): @@ -286,6 +345,7 @@ def test_inverse_uninitialized(self): self.assertRaises(ValueError, enc.inverse_transform, X) def test_inverse_wrong_feature_count(self): + """Test that the inverse transform raises an error if the feature count is wrong.""" x1 = [['A', 'B', 'C'], ['D', 'E', 'F'], ['G', 'H', 'I']] x2 = [['A', 'B'], ['C', 'D']] # @ToDo parametrize @@ -296,6 +356,7 @@ def test_inverse_wrong_feature_count(self): self.assertRaises(ValueError, enc.inverse_transform, x2) def test_inverse_wrong_feature_count_drop_invariant(self): + """Test that the inverse transform works with drop_invariant=True.""" x1 = [['A', 'B', 'C'], ['D', 'E', 'F'], ['G', 'H', 'I']] x2 = [['A', 'B'], ['C', 'D']] # @ToDo parametrize @@ -306,6 +367,7 @@ def test_inverse_wrong_feature_count_drop_invariant(self): self.assertRaises(ValueError, enc.inverse_transform, x2) def test_inverse_numeric(self): + """Test that the inverse transform works with numeric data.""" x = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] y = [0, 0, 1] @@ -318,8 +380,11 @@ def test_inverse_numeric(self): self.assertTrue((x == result.to_numpy()).all()) def test_inverse_numpy(self): - # See issue #196 - # @ToDo parametrize + """Test that the inverse transform works with numpy arrays. 
+ + See issue #196 + @ToDo parametrize + """ for encoder_name in {'BaseNEncoder', 'BinaryEncoder', 'OrdinalEncoder', 'OneHotEncoder'}: with self.subTest(encoder_name=encoder_name): arr = np.array([['A'], ['B'], ['B'], ['C']]) @@ -332,23 +397,39 @@ def test_inverse_numpy(self): assert np.array_equal(arr, arr_decoded) def test_types(self): - X = pd.DataFrame({ - 'Int': [1, 2, 1, 2], - 'Float': [1.1, 2.2, 3.3, 4.4], - 'Complex': [3.45J, 3.45J, 3.45J, 3.45J], - 'None': [None, None, None, None], - 'Str': ['a', 'c', 'c', 'd'], - 'PdTimestamp': [pd.Timestamp('2012-05-01'), pd.Timestamp('2012-05-02'), pd.Timestamp('2012-05-03'), pd.Timestamp('2012-05-06')], - 'PdTimedelta': [pd.Timedelta('1 days'), pd.Timedelta('2 days'), pd.Timedelta('1 days'), pd.Timedelta('1 days')], - 'TimeDelta': [timedelta(-9999), timedelta(-9), timedelta(-1), timedelta(999)], - 'Bool': [False, True, True, False], - 'Tuple': [('a', 'tuple'), ('a', 'tuple'), ('a', 'tuple'), ('b', 'tuple')], - 'Categorical': pd.Categorical(list('bbea'), categories=['e', 'a', 'b'], ordered=True), - # 'List': [[1,2], [2,3], [3,4], [4,5]], - # 'Dictionary': [{1: "a", 2: "b"}, {1: "a", 2: "b"}, {1: "a", 2: "b"}, {1: "a", 2: "b"}], - # 'Set': [{'John', 'Jane'}, {'John', 'Jane'}, {'John', 'Jane'}, {'John', 'Jane'}], - # 'Array': [array('i'), array('i'), array('i'), array('i')] - }) + """Test that the encoder can handle different data types.""" + X = pd.DataFrame( + { + 'Int': [1, 2, 1, 2], + 'Float': [1.1, 2.2, 3.3, 4.4], + 'Complex': [3.45j, 3.45j, 3.45j, 3.45j], + 'None': [None, None, None, None], + 'Str': ['a', 'c', 'c', 'd'], + 'PdTimestamp': [ + pd.Timestamp('2012-05-01'), + pd.Timestamp('2012-05-02'), + pd.Timestamp('2012-05-03'), + pd.Timestamp('2012-05-06'), + ], + 'PdTimedelta': [ + pd.Timedelta('1 days'), + pd.Timedelta('2 days'), + pd.Timedelta('1 days'), + pd.Timedelta('1 days'), + ], + 'TimeDelta': [timedelta(-9999), timedelta(-9), timedelta(-1), timedelta(999)], + 'Bool': [False, True, True, False], + 'Tuple': [('a', 'tuple'), ('a', 'tuple'), ('a', 'tuple'), ('b', 'tuple')], + 'Categorical': pd.Categorical( + list('bbea'), categories=['e', 'a', 'b'], ordered=True + ), + # 'List': [[1,2], [2,3], [3,4], [4,5]], + # 'Dictionary': [{1: "a", 2: "b"}, {1: "a", 2: "b"}, + # {1: "a", 2: "b"}, {1: "a", 2: "b"}], + # 'Set': [{'John', 'Jane'}, {'John', 'Jane'}, {'John', 'Jane'}, {'John', 'Jane'}], + # 'Array': [array('i'), array('i'), array('i'), array('i')] + } + ) y = [1, 0, 0, 1] for encoder_name in encoders.__all__: @@ -356,25 +437,37 @@ def test_types(self): encoder.fit_transform(X, y) def test_preserve_column_order(self): + """Test that the encoder preserves the column order.""" binary_cat_example = pd.DataFrame( - {'Trend': ['UP', 'UP', 'DOWN', 'FLAT', 'DOWN', 'UP', 'DOWN', 'FLAT', 'FLAT', 'FLAT'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}, columns=['Trend', 'target']) + { + 'Trend': ['UP', 'UP', 'DOWN', 'FLAT', 'DOWN', 'UP', 'DOWN', 'FLAT', 'FLAT', 'FLAT'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1], + }, + columns=['Trend', 'target'], + ) for encoder_name in encoders.__all__: with self.subTest(encoder_name=encoder_name): - encoder = getattr(encoders, encoder_name)() result = encoder.fit_transform(binary_cat_example, binary_cat_example['target']) columns = result.columns - self.assertTrue('target' in columns[-1], - "Target must be the last column as in the input. This is a tricky test because 'y' is named 'target' as well.") + self.assertTrue( + 'target' in columns[-1], + "Target must be the last column as in the input. 
" + "This is a tricky test because 'y' is named 'target' as well.", + ) def test_tmp_column_name(self): + """Test that the encoder can handle a temporary column name.""" binary_cat_example = pd.DataFrame( - {'Trend': ['UP', 'UP', 'DOWN', 'FLAT'], - 'Trend_tmp': ['UP', 'UP', 'DOWN', 'FLAT'], - 'target': [1, 1, 0, 0]}, columns=['Trend', 'Trend_tmp', 'target']) + { + 'Trend': ['UP', 'UP', 'DOWN', 'FLAT'], + 'Trend_tmp': ['UP', 'UP', 'DOWN', 'FLAT'], + 'target': [1, 1, 0, 0], + }, + columns=['Trend', 'Trend_tmp', 'target'], + ) for encoder_name in encoders.__all__: enc = getattr(encoders, encoder_name)() if not enc._get_tags().get('supervised_encoder'): @@ -383,10 +476,15 @@ def test_tmp_column_name(self): _ = enc.fit_transform(binary_cat_example, binary_cat_example['target']) def test_preserve_names(self): + """Test that the encoder preserves the column names.""" binary_cat_example = pd.DataFrame( - {'ignore': ['UP', 'UP', 'DOWN', 'FLAT'], - 'feature': ['UP', 'UP', 'DOWN', 'FLAT'], - 'target': [1, 1, 0, 0]}, columns=['ignore', 'feature', 'target']) + { + 'ignore': ['UP', 'UP', 'DOWN', 'FLAT'], + 'feature': ['UP', 'UP', 'DOWN', 'FLAT'], + 'target': [1, 1, 0, 0], + }, + columns=['ignore', 'feature', 'target'], + ) for encoder_name in encoders.__all__: with self.subTest(encoder_name=encoder_name): @@ -394,35 +492,44 @@ def test_preserve_names(self): result = encoder.fit_transform(binary_cat_example, binary_cat_example['target']) columns = result.columns - self.assertTrue('ignore' in columns, "Column 'ignore' is missing in: " + str(columns)) + self.assertTrue( + 'ignore' in columns, "Column 'ignore' is missing in: " + str(columns) + ) def test_unique_column_is_not_predictive(self): - # @ToDo not sure how useful this test is. TargetEncoders set the value to the default if there is only + """Test that the unique column is not predictive of the label.""" + # @ToDo not sure how useful this test is. + # TargetEncoders set the value to the default if there is only # one category but they probably should not. 
See discussion in issue 327 - test_encoders = ['LeaveOneOutEncoder', 'WOEEncoder', 'MEstimateEncoder', - 'JamesSteinEncoder', 'CatBoostEncoder', 'GLMMEncoder'] + test_encoders = [ + 'LeaveOneOutEncoder', + 'WOEEncoder', + 'MEstimateEncoder', + 'JamesSteinEncoder', + 'CatBoostEncoder', + 'GLMMEncoder', + ] for encoder_name in test_encoders: enc = getattr(encoders, encoder_name)() with self.subTest(encoder_name=encoder_name): result = enc.fit_transform(X[['unique_str']], y) - self.assertTrue(all(result.var() < 0.001), 'The unique string column must not be predictive of the label') - - # # beware: for some reason doctest does not raise exceptions - you have to read the text output - # def test_doc(self): - # suite = TestSuite() - # - # for filename in os.listdir('../'): - # if filename.endswith(".py"): - # suite.addTest(doctest.DocFileSuite('../' + filename)) - # - # runner = TextTestRunner(verbosity=2) - # runner.run(suite) + self.assertTrue( + all(result.var() < 0.001), + 'The unique string column must not be predictive of the label', + ) def test_cols(self): - # Test cols argument with different data types, which are array-like or scalars + """Test cols argument with different data types, which are array-like or scalars.""" cols_list = ['extra', 'invariant'] - cols_types = [cols_list, pd.Series(cols_list), np.array(cols_list), 'extra', set(cols_list), - ('extra', 'invariant'), pd.Categorical(cols_list, categories=cols_list)] + cols_types = [ + cols_list, + pd.Series(cols_list), + np.array(cols_list), + 'extra', + set(cols_list), + ('extra', 'invariant'), + pd.Categorical(cols_list, categories=cols_list), + ] for encoder_name in encoders.__all__: for cols in cols_types: @@ -431,23 +538,29 @@ def test_cols(self): enc.fit(X, y) enc.transform(X_t) - def test_noncontiguous_index(self): + def test_non_contiguous_index(self): + """Test if the encoder can handle non-contiguous index values.""" for encoder_name in encoders.__all__: with self.subTest(encoder_name=encoder_name): - enc = getattr(encoders, encoder_name)(cols=['x']) - data = pd.DataFrame({'x': ['a', 'b', np.nan, 'd', 'e'], 'y': [1, 0, 1, 0, 1]}).dropna() + data = pd.DataFrame( + {'x': ['a', 'b', np.nan, 'd', 'e'], 'y': [1, 0, 1, 0, 1]} + ).dropna() _ = enc.fit_transform(data[['x']], data['y']) def test_duplicate_index_value(self): + """Test if the encoder can handle duplicate index values.""" for encoder_name in encoders.__all__: with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(cols=['x']) - data = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e'], 'y': [1, 0, 1, 0, 1]}, index=[1, 2, 2, 3, 4]) + data = pd.DataFrame( + {'x': ['a', 'b', 'c', 'd', 'e'], 'y': [1, 0, 1, 0, 1]}, index=[1, 2, 2, 3, 4] + ) result = enc.fit_transform(data[['x']], data['y']) self.assertEqual(5, len(result)) def test_string_index(self): + """Test if the encoder can handle string indices.""" train = pd.DataFrame({'city': ['chicago', 'denver']}) target = [0, 1] train.index = train.index.astype(str) @@ -456,9 +569,12 @@ def test_string_index(self): with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)() result = enc.fit_transform(train, target) - self.assertFalse(result.isna().any(axis=None), 'There should not be any missing value!') + self.assertFalse( + result.isna().any(axis=None), 'There should not be any missing value!' 
+                )      def test_get_feature_names_out(self): +        """Should return correct column names."""          for encoder_name in encoders.__all__:              with self.subTest(encoder_name=encoder_name):                  enc = getattr(encoders, encoder_name)() @@ -472,6 +588,7 @@ def test_get_feature_names_out(self):                  assert_array_equal(obtained, expected)      def test_get_feature_names_out_drop_invariant(self): +        """Should return correct column names when dropping invariant columns."""          # TODO: What could a DF look like that results in constant          # columns for all encoders?          for encoder_name in encoders.__all__: @@ -487,12 +604,14 @@ def test_get_feature_names_out_drop_invariant(self):                  assert_array_equal(obtained, expected)      def test_get_feature_names_out_not_set(self): +        """Test if get_feature_names_out() raises an error if the encoder is not fitted."""          for encoder_name in encoders.__all__:              with self.subTest(encoder_name=encoder_name):                  enc = getattr(encoders, encoder_name)()                  self.assertRaises(ValueError, enc.get_feature_names_out)      def test_get_feature_names_out_after_transform(self): +        """Test if get_feature_names_out() returns the correct column names after transform."""          for encoder_name in encoders.__all__:              with self.subTest(encoder_name=encoder_name):                  enc = getattr(encoders, encoder_name)() @@ -501,11 +620,14 @@ def test_get_feature_names_out_after_transform(self):                  self.assertEqual(set(enc.get_feature_names_out()), set(out.columns))      def test_truncated_index(self): -        # see: https://github.com/scikit-learn-contrib/categorical-encoding/issues/152 +        """Test if an encoder can be trained on the slice of a dataframe. + +        See: https://github.com/scikit-learn-contrib/categorical-encoding/issues/152 +        """          data = pd.DataFrame(data={'x': ['A', 'B', 'C', 'A', 'B'], 'y': [1, 0, 1, 0, 1]})          data = data.iloc[2:5]          data2 = pd.DataFrame(data={'x': ['C', 'A', 'B'], 'y': [1, 0, 1]}) -        for encoder_name in set(encoders.__all__) - {"HashingEncoder"}: +        for encoder_name in set(encoders.__all__) - {'HashingEncoder'}:              with self.subTest(encoder_name=encoder_name):                  enc = getattr(encoders, encoder_name)()                  result = enc.fit_transform(data.x, data.y) @@ -514,27 +636,42 @@ def test_truncated_index(self):                  self.assertTrue((result.to_numpy() == result2.to_numpy()).all())      def test_column_transformer(self): -        # see issue #169 -        for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}):  # HashingEncoder does not accept handle_missing parameter -            with self.subTest(encoder_name=encoder_name): +        """Test if the sklearn ColumnTransformer works with the encoders. + +        See issue #169. +        """ +        # HashingEncoder does not accept handle_missing parameter +        for encoder_name in set(encoders.__all__) - {'HashingEncoder'}: +            with self.subTest(encoder_name=encoder_name):                  # we can only test one data type at once. Here, we test string columns.
tested_columns = ['unique_str', 'invariant', 'underscore', 'none', 'extra'] -                # ColumnTransformer instantiates the encoder twice -> we have to make sure the encoder settings are correctly passed -                ct = ColumnTransformer([ -                    ("dummy_encoder_name", getattr(encoders, encoder_name)(handle_missing="return_nan"), tested_columns) -                ]) +                # ColumnTransformer instantiates the encoder twice -> +                # we have to make sure the encoder settings are correctly passed +                ct = ColumnTransformer( +                    [ +                        ( +                            'dummy_encoder_name', +                            getattr(encoders, encoder_name)(handle_missing='return_nan'), +                            tested_columns, +                        ) +                    ] +                )                  obtained = ct.fit_transform(X, y)                  # the old-school approach -                enc = getattr(encoders, encoder_name)(handle_missing="return_nan", return_df=False) +                enc = getattr(encoders, encoder_name)(handle_missing='return_nan', return_df=False)                  expected = enc.fit_transform(X[tested_columns], y)                  np.testing.assert_array_equal(obtained, expected)      def test_error_messages(self): -        # when the count of features change between training and scoring, we raise an exception +        """Test if the error messages are meaningful. + +        Case 1: The count of features must be the same in training and scoring. +        Case 2: Supervised encoders must obtain 'y' of the same length as 'x' during training. +        """ +        # Case 1          data = pd.DataFrame(data={'x': ['A', 'B', 'C', 'A', 'B'], 'y': [1, 0, 1, 0, 1]})          data2 = pd.DataFrame(data={'x': ['C', 'A', 'B'], 'x2': ['C', 'A', 'B']})          for encoder_name in encoders.__all__: @@ -543,7 +680,7 @@ def test_error_messages(self):                  enc.fit(data.x, data.y)                  self.assertRaises(ValueError, enc.transform, data2) -        # supervised encoders must obtain 'y' of the same length as 'x' during training... +        # Case 2          x = ['A', 'B', 'C']          y_good = pd.Series([1, 0, 1])          y_bad = pd.Series([1, 0, 1, 0]) @@ -559,10 +696,15 @@ def test_error_messages(self):                  self.assertRaises(ValueError, enc.transform, x, y_bad)      def test_drop_invariant(self): -        x = pd.DataFrame([['A', 'B', 'C'], ['A', 'B', 'C'], ['A', 'B', 'C'], ['D', 'E', 'C'], ['A', 'B', 'C']]) +        """Should drop invariant columns when drop_invariant=True.""" +        x = pd.DataFrame( +            [['A', 'B', 'C'], ['A', 'B', 'C'], ['A', 'B', 'C'], ['D', 'E', 'C'], ['A', 'B', 'C']] +        )          y = [0, 0, 1, 1, 1] -        for encoder_name in set(encoders.__all__) - {'CatBoostEncoder'}:  # CatBoost does not generally deliver a constant column when the feature is constant +        # CatBoost does not generally deliver a constant column when the feature is constant +        for encoder_name in set(encoders.__all__) - {'CatBoostEncoder'}:              with self.subTest(encoder_name=encoder_name):                  enc1 = getattr(encoders, encoder_name)(drop_invariant=False)                  enc2 = getattr(encoders, encoder_name)(drop_invariant=True) @@ -573,26 +715,31 @@ def test_drop_invariant(self):                  self.assertTrue(len(result1.columns) > len(result2.columns))      def test_target_encoders(self): -        # See issue #206 +        """Should raise an error when the target is not provided for supervised encoders.
+ +        See issue #206 +        """          for encoder_name in encoders.__all__:              enc = getattr(encoders, encoder_name)()              if not enc._get_tags().get('supervised_encoder'):                  continue              with self.subTest(encoder_name=encoder_name):                  enc = getattr(encoders, encoder_name)(return_df=False) -                # an attempt to fit_transform() a supervised encoder without the target should result into a meaningful error message +                # an attempt to fit_transform() a supervised encoder without the target should +                # result in a meaningful error message                  self.assertRaises(TypeError, enc.fit_transform, X)      def test_missing_values(self): -        # by default, treat missing values as another valid value +        """Should by default treat missing values as another valid value."""          x_placeholder = pd.Series(['a', 'b', 'b', 'c', 'c'])          x_nan = pd.Series(['a', 'b', 'b', np.nan, np.nan])          x_float = pd.DataFrame({'col1': [1.0, 2.0, 2.0, np.nan, np.nan]})          y = [0, 1, 1, 1, 1] -        for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}):  # HashingEncoder currently violates it +        # HashingEncoder currently violates it +        for encoder_name in set(encoders.__all__) - {'HashingEncoder'}:              with self.subTest(encoder_name=encoder_name): -                enc = getattr(encoders, encoder_name)()                  result_placeholder = enc.fit_transform(x_placeholder, y) @@ -606,18 +753,42 @@ def test_missing_values(self):              np.testing.assert_equal(result_placeholder.values, result_float.values)      def test_metamorphic(self): -        # When we only slightly alter the input data or an irrelevant argument, the output should remain unchanged. -        x1 = ['A', 'B', 'B']  # Baseline -        x2 = ['Apple', 'Banana', 'Banana']  # Different strings, but with the same alphabetic ordering -        x3 = pd.DataFrame(data={'x': ['A', 'B', 'B']})  # DataFrame -        x4 = pd.Series(['A', 'B', 'B'], dtype='category')  # Series with category data type -        x5 = np.array(['A', 'B', 'B'])  # Numpy -        x6 = ['Z', 'Y', 'Y']  # Different strings, reversed alphabetic ordering (it works because we look at the order of appearance, not at alphabetic order) +        """Test the metamorphic property of the encoders. + +        This means that the output should remain unchanged when we slightly alter the input data +        (e.g. other labels) or an irrelevant argument. + +        We include the following cases: +        - Baseline +        - Different strings, but with the same alphabetic ordering +        - Input as DataFrame +        - Input as Series with category data type +        - Input as Numpy +        - Different strings, reversed alphabetic ordering (it works because we look at +          the order of appearance, not at alphabetic order) + +        Note that the hashing encoder is not expected to be metamorphic.
+        """ +        x1 = ['A', 'B', 'B'] +        x2 = ['Apple', 'Banana', 'Banana'] +        x3 = pd.DataFrame(data={'x': ['A', 'B', 'B']}) +        x4 = pd.Series(['A', 'B', 'B'], dtype='category') +        x5 = np.array(['A', 'B', 'B']) +        x6 = ['Z', 'Y', 'Y']          y = [1, 1, 0] -        for encoder_name in ( -                set(encoders.__all__) - {'HashingEncoder'}):  # Hashing encoder is, by definition, not invariant to data changes +        # Hashing encoder is, by definition, not invariant to data changes +        for encoder_name in set(encoders.__all__) - {'HashingEncoder'}:              with self.subTest(encoder_name=encoder_name):                  enc1 = getattr(encoders, encoder_name)()                  result1 = enc1.fit_transform(x1, y) @@ -628,7 +799,7 @@ def test_metamorphic(self):                  enc3 = getattr(encoders, encoder_name)()                  result3 = enc3.fit_transform(x3, y) -                self.assertTrue( (result1.to_numpy() == result3.to_numpy()).all() ) +                self.assertTrue((result1.to_numpy() == result3.to_numpy()).all())                  enc4 = getattr(encoders, encoder_name)()                  result4 = enc4.fit_transform(x4, y) @@ -640,7 +811,7 @@ def test_metamorphic(self):                  # gray encoder actually does re-order inputs                  # rankhot encoder respects order, in this example the order is switched -                if encoder_name not in ["GrayEncoder", "RankHotEncoder"]: +                if encoder_name not in ['GrayEncoder', 'RankHotEncoder']:                      enc6 = getattr(encoders, encoder_name)()                      result6 = enc6.fit_transform(x6, y)                      self.assertTrue(result1.equals(result6)) @@ -654,35 +825,54 @@ def test_metamorphic(self):                  result10 = enc10.fit_transform(x1, y)                  self.assertTrue(result1.equals(result10)) -                # Note: If the encoder does not support these arguments/argument values, it is OK/expected to fail. -                # Note: The indicator approach is not tested because it adds columns -> the encoders that support it are expected to fail. +                # Note: If the encoder does not support these arguments/argument values, +                # it is OK/expected to fail. +                # Note: The indicator approach is not tested because it adds columns -> the +                # encoders that support it are expected to fail.
- # enc11 = getattr(encoders, encoder_name)(handle_unknown='return_nan', handle_missing='return_nan') # Quite a few algorithms fail here because of handle_missing + # enc11 = getattr(encoders, encoder_name)(handle_unknown='return_nan', + # handle_missing='return_nan') + # Quite a few algorithms fail here because of handle_missing # result11 = enc11.fit_transform(x1, y) - # self.assertTrue((result1.values == result11.values).all(), 'The data do not contain any missing or new value -> the result should be unchanged.') + # self.assertTrue((result1.values == result11.values).all(), + # 'The data do not contain any missing or new value -> the result should + # be unchanged.') - enc12 = getattr(encoders, encoder_name)(handle_unknown='value', handle_missing='value') + enc12 = getattr(encoders, encoder_name)( + handle_unknown='value', handle_missing='value' + ) result12 = enc12.fit_transform(x1, y) - self.assertTrue(result1.equals(result12), 'The data do not contain any missing or new value -> the result should be unchanged.') - - # enc13 = getattr(encoders, encoder_name)(handle_unknown='error', handle_missing='error', cols=['x']) # Quite a few algorithms fail here because of handle_missing + self.assertTrue( + result1.equals(result12), + 'The data do not contain any missing or new value -> ' + 'the result should be unchanged.', + ) + + # enc13 = getattr(encoders, encoder_name)(handle_unknown='error', + # handle_missing='error', cols=['x']) + # Quite a few algorithms fail here because of handle_missing # result13 = enc13.fit_transform(x3, y) - # self.assertTrue((result1.values == result13.values).all(), 'The data do not contain any missing or new value -> the result should be unchanged.') + # self.assertTrue((result1.values == result13.values).all(), + # 'The data do not contain any missing or new value -> + # the result should be unchanged.') def test_pandas_index(self): - # see https://github.com/scikit-learn-contrib/categorical-encoding/pull/224 - df = pd.DataFrame({ - 'hello': ['a', 'b', 'c'], - 'world': [0, 1, 0] - }, columns=pd.Index(['hello', 'world'])) + """Should work with pandas index. + + See https://github.com/scikit-learn-contrib/categorical-encoding/pull/224 + """ + df = pd.DataFrame( + {'hello': ['a', 'b', 'c'], 'world': [0, 1, 0]}, columns=pd.Index(['hello', 'world']) + ) cols = df.select_dtypes(include='object').columns - for encoder_name in (set(encoders.__all__) - {"HashingEncoder"}): + for encoder_name in set(encoders.__all__) - {'HashingEncoder'}: with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(cols=cols) enc.fit_transform(df, df['world']) def test_mismatched_indexes(self): + """Should work with mismatched indexes.""" df = pd.DataFrame({'x': ['a', 'b', 'b']}, index=[7, 5, 9]) y_list = [1, 0, 1] for encoder_name in encoders.__all__: @@ -692,12 +882,19 @@ def test_mismatched_indexes(self): self.assertFalse(out.isna().any().any()) def test_numbers_as_strings_with_numpy_output(self): - # see issue #229 + """Should work with numbers as strings. + + See issue #229. + """ X = np.array(['11', '12', '13', '14', '15']) oe = encoders.OrdinalEncoder(return_df=False) oe.fit(X) def test_columns(self): + """Should convert only selected columns. + + If no selection is made all columns of type object should be converted. + """ # Convert only selected columns. Leave the remaining string columns untouched. 
oe = encoders.OrdinalEncoder(cols=['underscore']) result = oe.fit_transform(X) @@ -721,12 +918,14 @@ def test_columns(self): self.assertTrue(result['unique_int'].min() < 1, 'should still be a number and untouched') def test_ignored_columns_are_untouched(self): - # Make sure None values in ignored columns are preserved. - # See: https://github.com/scikit-learn-contrib/category_encoders/pull/261 + """Should not change None values of ignored columns. + + See: https://github.com/scikit-learn-contrib/category_encoders/pull/261 + """ X = pd.DataFrame({'col1': ['A', 'B', None], 'col2': ['C', 'D', None]}) y = [1, 0, 1] - for encoder_name in (set(encoders.__all__)): + for encoder_name in set(encoders.__all__): with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(cols=['col1']) out = enc.fit_transform(X, y) diff --git a/tests/test_feature_names.py b/tests/test_feature_names.py index 2afca053..314cdae7 100644 --- a/tests/test_feature_names.py +++ b/tests/test_feature_names.py @@ -1,14 +1,16 @@ +"""Tests for the feature names of the encoders.""" +from unittest import TestCase + +import category_encoders as encoders import numpy as np import pandas as pd -import tests.helpers as th -from numpy.testing import assert_array_equal import sklearn +from numpy.testing import assert_array_equal +from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer -from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline -from sklearn.compose import ColumnTransformer -import category_encoders as encoders -from unittest import TestCase + +import tests.helpers as th __author__ = 'JaimeArboleda' @@ -20,81 +22,75 @@ np_y = np.random.randn(X.shape[0]) > 0.5 y = pd.DataFrame(np_y) + class TestEncodersFeaturesOut(TestCase): + """Tests for the feature names of the encoders.""" def test_feature_names_out(self): + """Test the feature names out of the encoders.""" for encoder_name in encoders.__all__: - if sklearn.__version__ < "1.2.0": + if sklearn.__version__ < '1.2.0': continue else: - sklearn.set_config(transform_output="pandas") + sklearn.set_config(transform_output='pandas') with self.subTest(encoder_name=encoder_name): encoder = getattr(encoders, encoder_name)() X_t = encoder.fit_transform(X, y) categorical_preprocessor_start = Pipeline( - steps=[ - ("encoder", getattr(encoders, encoder_name)()) - ] + steps=[('encoder', getattr(encoders, encoder_name)())] ) categorical_preprocessor_middle = Pipeline( steps=[ - ("imputation_constant", SimpleImputer(fill_value="missing", strategy="constant")), - ("encoder", getattr(encoders, encoder_name)()) + ( + 'imputation_constant', + SimpleImputer(fill_value='missing', strategy='constant'), + ), + ('encoder', getattr(encoders, encoder_name)()), ] ) numerical_preprocessor = Pipeline( steps=[ - ("imputation_constant", SimpleImputer(fill_value=0, strategy="constant")) + ('imputation_constant', SimpleImputer(fill_value=0, strategy='constant')) ] ) preprocessor = ColumnTransformer( [ - ("categorical_prep_start", categorical_preprocessor_start, ["categorical", "na_categorical"]), - ("categorical_prep_middle", categorical_preprocessor_middle, ["categorical", "na_categorical"]), - ("numerical_prep", numerical_preprocessor, ["float"]) + ( + 'categorical_prep_start', + categorical_preprocessor_start, + ['categorical', 'na_categorical'], + ), + ( + 'categorical_prep_middle', + categorical_preprocessor_middle, + ['categorical', 'na_categorical'], + ), + ('numerical_prep', numerical_preprocessor, ['float']), ] ) 
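+                # sklearn's ColumnTransformer prefixes each output column with the +                # transformer name ('<name>__<column>'), so the assertions below strip +                # that prefix before comparing against the bare encoder output.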
X_tt = preprocessor.fit_transform(X, y) +                assert_array_equal(np.array(X_t.columns), encoder.get_feature_names_out()) +                assert_array_equal(np.array(X_tt.columns), preprocessor.get_feature_names_out())                  assert_array_equal( -                    np.array(X_t.columns), -                    encoder.get_feature_names_out() -                ) -                assert_array_equal( -                    np.array(X_tt.columns), -                    preprocessor.get_feature_names_out() -                ) -                assert_array_equal( -                    np.array( -                        [ -                            c -                            for c in X_t.columns -                            if c not in num_columns -                        ] -                    ), +                    np.array([c for c in X_t.columns if c not in num_columns]),                      np.array(                          [ -                            c[len("categorical_prep_start__"):] +                            c[len('categorical_prep_start__') :]                              for c in X_tt.columns -                            if "categorical_prep_start" in c +                            if 'categorical_prep_start' in c                          ] -                    ) +                    ),                  )                  assert_array_equal( +                    np.array([c for c in X_t.columns if c not in num_columns]),                      np.array(                          [ -                            c -                            for c in X_t.columns -                            if c not in num_columns -                        ] -                    ), -                    np.array( -                        [ -                            c[len("categorical_prep_middle__"):] +                            c[len('categorical_prep_middle__') :]                              for c in X_tt.columns -                            if "categorical_prep_middle" in c +                            if 'categorical_prep_middle' in c                          ] -                    ) +                    ),                  ) -        sklearn.set_config(transform_output="default") +        sklearn.set_config(transform_output='default') diff --git a/tests/test_glmm.py b/tests/test_glmm.py index fef9c216..53840e12 100644 --- a/tests/test_glmm.py +++ b/tests/test_glmm.py @@ -1,20 +1,18 @@ +"""Tests for the GLMMEncoder.""" +import unittest  from unittest import TestCase -import numpy as np -import category_encoders as encoders -import tests.helpers as th   class TestGLMMEncoder(TestCase): +    """Unit tests for the GLMMEncoder. + +    They are still missing. +    """ + +    @unittest.skip('TODO: fix this test IRL')      def test_continuous(self): -        cols = ['unique_str', 'underscore', 'extra', 'none', 'invariant', 321, 'categorical', 'na_categorical', 'categorical_int'] -        enc = encoders.GLMMEncoder(cols=cols, binomial_target=False) -        # TODO: fix this test IRL -        # enc.fit(X, np_y) -        #th.verify_numeric(enc.transform(X)) +        """Test the continuous target."""  +    @unittest.skip('TODO: fix this test IRL')      def test_binary(self): -        cols = ['unique_str', 'underscore', 'extra', 'none', 'invariant', 321, 'categorical', 'na_categorical', 'categorical_int'] -        enc = encoders.GLMMEncoder(cols=cols, binomial_target=True) -        # TODO: fix this test IRL -        #enc.fit(X, np_y) -        #th.verify_numeric(enc.transform(X)) +        """Test the binary target.""" diff --git a/tests/test_gray.py b/tests/test_gray.py index 1e500a61..bc9f5d61 100644 --- a/tests/test_gray.py +++ b/tests/test_gray.py @@ -1,26 +1,30 @@ -import pandas as pd +"""Unit tests for the GrayEncoder."""  from unittest import TestCase -import numpy as np  import category_encoders as encoders +import numpy as np +import pandas as pd   class TestGrayEncoder(TestCase): +    """Unit tests for the GrayEncoder."""      def test_gray_sorting(self): +        """Test the GrayEncoder sorting."""          data = np.array(['ba', 'ba', 'aa'])          out = encoders.GrayEncoder().fit_transform(data)          expected = pd.DataFrame([[1, 1], [1, 1], [0, 1]], columns=['0_0', '0_1'])          pd.testing.assert_frame_equal(out, expected)      def test_gray_mapping(self): +        """Test the GrayEncoder mapping."""          train_data = pd.DataFrame() -        train_data["cat_col"] = np.array([4, 9, 6, 7, 7, 9]) -        train_data["other_col"] = range(train_data.shape[0]) -        encoder = encoders.GrayEncoder(cols=["cat_col"]) +        train_data['cat_col'] = np.array([4, 9, 6, 7, 7, 9]) +        train_data['other_col'] = range(train_data.shape[0]) +        encoder = encoders.GrayEncoder(cols=['cat_col'])          encoder.fit(train_data) -        expected_ordinal_mapping = {4.0: 1, 9.0: 2, 6.0: 3, 7.0: 4, "nan": -2} +        expected_ordinal_mapping = {4.0: 1,
9.0: 2, 6.0: 3, 7.0: 4, 'nan': -2} expected_mapping = pd.DataFrame( [ [0, 0, 1], @@ -29,18 +33,20 @@ def test_gray_mapping(self): [1, 1, 0], [0, 0, 0], [0, 0, 0], - ], columns=[f"cat_col_{i}" for i in range(3)], index=[1, 3, 4, 2, -1, -2] + ], + columns=[f'cat_col_{i}' for i in range(3)], + index=[1, 3, 4, 2, -1, -2], ) self.assertEqual(len(encoder.mapping), 1) self.assertEqual(len(encoder.mapping[0].keys()), 2) - actual_ordinal_encoding = encoder.ordinal_encoder.mapping[0]["mapping"] - actual_ordinal_encoding.index = actual_ordinal_encoding.index.fillna("nan") + actual_ordinal_encoding = encoder.ordinal_encoder.mapping[0]['mapping'] + actual_ordinal_encoding.index = actual_ordinal_encoding.index.fillna('nan') self.assertDictEqual(actual_ordinal_encoding.to_dict(), expected_ordinal_mapping) - pd.testing.assert_frame_equal(encoder.mapping[0]["mapping"], expected_mapping) + pd.testing.assert_frame_equal(encoder.mapping[0]['mapping'], expected_mapping) train_transformed = encoder.transform(train_data) - train_data["cat_col"] = np.array([4, 9, 6, 7, 7, 9]) + train_data['cat_col'] = np.array([4, 9, 6, 7, 7, 9]) expected_train_transformed = [ [0, 0, 1, 0], [1, 1, 0, 1], @@ -49,26 +55,31 @@ def test_gray_mapping(self): [0, 1, 0, 4], [1, 1, 0, 5], ] - expected_train_transformed = pd.DataFrame(expected_train_transformed, - columns=[f"cat_col_{i}" for i in range(3)] + ["other_col"], - index=train_data.index) + expected_train_transformed = pd.DataFrame( + expected_train_transformed, + columns=[f'cat_col_{i}' for i in range(3)] + ['other_col'], + index=train_data.index, + ) pd.testing.assert_frame_equal(train_transformed, expected_train_transformed) test_data = pd.DataFrame() - test_data["cat_col"] = np.array([4, 3, None, np.nan]) - test_data["other_col"] = range(test_data.shape[0]) + test_data['cat_col'] = np.array([4, 3, None, np.nan]) + test_data['other_col'] = range(test_data.shape[0]) expected_test_transformed = [ [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 0, 2], [0, 0, 0, 3], ] - expected_test_transformed = pd.DataFrame(expected_test_transformed, - columns=[f"cat_col_{i}" for i in range(3)] + ["other_col"], - index=test_data.index) + expected_test_transformed = pd.DataFrame( + expected_test_transformed, + columns=[f'cat_col_{i}' for i in range(3)] + ['other_col'], + index=test_data.index, + ) test_transformed = encoder.transform(test_data) pd.testing.assert_frame_equal(test_transformed, expected_test_transformed) def test_gray_code(self): + """Test the Gray code generation.""" input_expected_output = { (0, 0): [0], (0, 1): [0], diff --git a/tests/test_hashing.py b/tests/test_hashing.py index c7458874..fbcd59c7 100644 --- a/tests/test_hashing.py +++ b/tests/test_hashing.py @@ -1,13 +1,16 @@ +"""Tests for the HashingEncoder.""" from unittest import TestCase +import category_encoders as encoders import pandas as pd from pandas.testing import assert_frame_equal, assert_index_equal -import category_encoders as encoders - class TestHashingEncoder(TestCase): + """Tests for the HashingEncoder.""" + def test_must_not_reset_index(self): + """Test that the HashingEncoder does not reset the index.""" columns = ['column1', 'column2', 'column3', 'column4'] df = pd.DataFrame([[i, i, i, i] for i in range(10)], columns=columns) df = df.iloc[2:8, :] @@ -17,17 +20,19 @@ def test_must_not_reset_index(self): single_process_encoder.fit(df, None) df_encoded_single_process = single_process_encoder.transform(df) assert_index_equal(df.index, df_encoded_single_process.index) - assert df.shape[0] == pd.concat([df, 
df_encoded_single_process], axis=1).shape[0] +        self.assertEqual(df.shape[0], +                         pd.concat([df, df_encoded_single_process], axis=1).shape[0])          multi_process_encoder = encoders.HashingEncoder(cols=target_columns)          multi_process_encoder.fit(df, None)          df_encoded_multi_process = multi_process_encoder.transform(df)          assert_index_equal(df.index, df_encoded_multi_process.index) -        assert df.shape[0] == pd.concat([df, df_encoded_multi_process], axis=1).shape[0] +        self.assertEqual(df.shape[0], pd.concat([df, df_encoded_multi_process], axis=1).shape[0])          assert_frame_equal(df_encoded_single_process, df_encoded_multi_process)      def test_transform_works_with_single_row_df(self): +        """Test that the HashingEncoder works with a single row DataFrame."""          columns = ['column1', 'column2', 'column3', 'column4']          df = pd.DataFrame([[i, i, i, i] for i in range(10)], columns=columns)          df = df.iloc[2:8, :] @@ -36,23 +41,23 @@ def test_transform_works_with_single_row_df(self):          multi_process_encoder = encoders.HashingEncoder(cols=target_columns)          multi_process_encoder.fit(df, None)          df_encoded_multi_process = multi_process_encoder.transform(df.sample(1)) - -        assert (multi_process_encoder.n_components + -                len(list(set(columns) - -                         set(target_columns)) -                    ) == df_encoded_multi_process.shape[1] -                ) + +        self.assertEqual( +            multi_process_encoder.n_components + len(list(set(columns) - set(target_columns))), +            df_encoded_multi_process.shape[1] +        )      def test_simple_example(self): -        df = pd.DataFrame({ -            'strings': ["aaaa", "bbbb", "cccc"], -            "more_strings": ["aaaa", "dddd", "eeee"], -        }) +        """Test the HashingEncoder with a simple example.""" +        df = pd.DataFrame( +            { +                'strings': ['aaaa', 'bbbb', 'cccc'], +                'more_strings': ['aaaa', 'dddd', 'eeee'], +            } +        )          encoder = encoders.HashingEncoder(n_components=4, max_process=2)          encoder.fit(df) -        assert encoder.transform(df).equals(pd.DataFrame({ -            "col_0": [0,1,1], -            "col_1": [2,0,1], -            "col_2": [0,1,0], -            "col_3": [0,0,0] -        })) +        expected_df = pd.DataFrame( +            {'col_0': [0, 1, 1], 'col_1': [2, 0, 1], 'col_2': [0, 1, 0], 'col_3': [0, 0, 0]} +        ) +        pd.testing.assert_frame_equal(encoder.transform(df), expected_df) diff --git a/tests/test_helmert.py b/tests/test_helmert.py index 8c88a8fd..e2abcc92 100644 --- a/tests/test_helmert.py +++ b/tests/test_helmert.py @@ -1,145 +1,17 @@ -import pandas as pd +"""Tests for the HelmertEncoder."""  from unittest import TestCase -import numpy as np +  import category_encoders as encoders +import numpy as np   class TestHelmertEncoder(TestCase): - -    def test_helmert_preserve_dimension_1(self): -        train = ['A', 'B', 'C'] -        test = ['A', 'D', 'E'] - -        encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') -        encoder.fit(train) -        test_t = encoder.transform(test) - -        expected = [[1, -1, -1], -                    [1, 0, 0], -                    [1, 0, 0]] -        self.assertEqual(test_t.to_numpy().tolist(), expected) - -    def test_helmert_preserve_dimension_2(self): -        train = ['A', 'B', 'C'] -        test = ['B', 'D', 'E'] - -        encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') -        encoder.fit(train) -        test_t = encoder.transform(test) - -        expected = [[1, 1, -1], -                    [1, 0, 0], -                    [1, 0, 0]] -        self.assertEqual(test_t.to_numpy().tolist(), expected) - -    def test_helmert_preserve_dimension_3(self): -        train = ['A', 'B', 'C'] -        test = ['A', 'B', 'C', None] - -        encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') -        encoder.fit(train) -        test_t = encoder.transform(test) - -        expected = [[1, -1, -1], -                    [1, 1,
-1], - [1, 0, 2], - [1, 0, 0]] - self.assertEqual(test_t.to_numpy().tolist(), expected) - - def test_helmert_preserve_dimension_4(self): - train = ['A', 'B', 'C'] - test = ['D', 'B', 'C', None] - - encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') - encoder.fit(train) - test_t = encoder.transform(test) - - expected = [[1, 0, 0], - [1, 1, -1], - [1, 0, 2], - [1, 0, 0]] - self.assertEqual(test_t.to_numpy().tolist(), expected) - - def test_helmert_2cols(self): - train = [['A', 'A'], ['B', 'B'], ['C', 'C']] - - encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') - encoder.fit(train) - obtained = encoder.transform(train) - - expected = [[1, -1, -1, -1, -1], - [1, 1, -1, 1, -1], - [1, 0, 2, 0, 2]] - self.assertEqual(obtained.to_numpy().tolist(), expected) - - def test_helmert_2StringCols_ExpectCorrectOrder(self): - train = pd.DataFrame({'col1': [1, 2, 3, 4], - 'col2': ['A', 'B', 'C', 'D'], - 'col3': [1, 2, 3, 4], - 'col4': ['A', 'B', 'C', 'A'] - }, - columns=['col1', 'col2', 'col3', 'col4']) - expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3', 'col4_0', 'col4_1'] - encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') - - encoder.fit(train) - columns = encoder.transform(train).columns.to_numpy() - - self.assertTrue(np.array_equal(expected_columns, columns)) - - def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): - train = ['A', 'B', np.nan] - - encoder = encoders.HelmertEncoder(handle_missing='indicator', handle_unknown='value') - result = encoder.fit_transform(train) - - expected = [[1, -1, -1], - [1, 1, -1], - [1, 0, 2]] - self.assertTrue(np.array_equal(result.to_numpy().tolist(), expected)) - - def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): - train = ['A', 'B'] - - encoder = encoders.HelmertEncoder(handle_missing='indicator', handle_unknown='value') - result = encoder.fit_transform(train) - - expected = [[1, -1, -1], - [1, 1, -1]] - self.assertEqual(result.to_numpy().tolist(), expected) - - def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): - train = ['A', 'B'] - test = ['A', 'B', np.nan] - - encoder = encoders.HelmertEncoder(handle_missing='indicator', handle_unknown='value') - encoder.fit(train) - result = encoder.transform(test) - - expected = [[1, -1, -1], - [1, 1, -1], - [1, 0, 2]] - self.assertEqual(result.to_numpy().tolist(), expected) - - def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): - train = ['A', 'B'] - test = ['A', 'B', 'C'] - - encoder = encoders.HelmertEncoder(handle_unknown='indicator', handle_missing='value') - encoder.fit(train) - result = encoder.transform(test) - - expected = [[1, -1, -1], - [1, 1, -1], - [1, 0, 2]] - self.assertEqual(result.to_numpy().tolist(), expected) - - def test_HandleUnknown_HaveOnlyKnown_ExpectExtraColumn(self): - train = ['A', 'B'] - - encoder = encoders.HelmertEncoder(handle_unknown='indicator', handle_missing='value') - result = encoder.fit_transform(train) - - expected = [[1, -1, -1], - [1, 1, -1]] - self.assertEqual(result.to_numpy().tolist(), expected) + """Unit tests for the HelmertEncoder.""" + + def test_get_contrast_matrix(self): + """Should return the correct contrast matrix for helmert.""" + train = np.array([('A', ), ('B', ), ('C', )]) + encoder = encoders.HelmertEncoder() + matrix = encoder.get_contrast_matrix(train) + expected_matrix = np.array([[-1, -1], [1, -1], [0, 2]]) + np.testing.assert_array_equal(matrix.matrix, expected_matrix) 
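+ +        # For reference: with levels ordered A < B < C, each column of this Helmert +        # matrix contrasts one level with all preceding ones: column 0 compares B +        # with A, column 1 compares C with the mean of A and B.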
diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 137649fd..01533cc3 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -1,14 +1,17 @@ +"""Tests for the helpers module.""" +from unittest import TestCase + import numpy as np import pandas as pd -from unittest import TestCase  # or `from unittest import ...` if on Python 3.4+  from tests.helpers import verify_numeric   class TestHelpers(TestCase): +    """Tests for the helpers module."""      def test_is_numeric_pandas(self): -        # Whole numbers, regardless of the byte length, should not raise AssertionError +        """Whole numbers, regardless of the byte length, should not raise AssertionError."""          X = pd.DataFrame(np.ones([5, 5]), dtype='int32')          verify_numeric(pd.DataFrame(X)) @@ -17,11 +20,11 @@ def test_is_numeric_pandas(self):          # Strings should raise AssertionError          X = pd.DataFrame([['a', 'b', 'c'], ['d', 'e', 'f']]) -        with self.assertRaises(Exception): +        with self.assertRaises(AssertionError):              verify_numeric(pd.DataFrame(X))      def test_is_numeric_numpy(self): -        # Whole numbers, regardless of the byte length, should not raise AssertionError +        """Whole numbers, regardless of the byte length, should not raise AssertionError."""          X = np.ones([5, 5], dtype='int32')          verify_numeric(pd.DataFrame(X)) @@ -35,8 +38,8 @@ def test_is_numeric_numpy(self):          X = np.ones([5, 5], dtype='float64')          verify_numeric(pd.DataFrame(X)) -    def test_verify_raises_AssertionError_on_categories(self): -        # Categories should raise AssertionError +    def test_verify_raises_assertion_error_on_categories(self): +        """Categories should raise AssertionError."""          X = pd.DataFrame([['a', 'b', 'c'], ['d', 'e', 'f']], dtype='category') -        with self.assertRaises(Exception): +        with self.assertRaises(AssertionError):              verify_numeric(pd.DataFrame(X)) diff --git a/tests/test_james_stein.py b/tests/test_james_stein.py index c26a4497..b1dc24e9 100644 --- a/tests/test_james_stein.py +++ b/tests/test_james_stein.py @@ -1,137 +1,194 @@ +"""Unit tests for the James-Stein encoder."""  from unittest import TestCase -import numpy as np  import category_encoders as encoders +import numpy as np   class TestJamesSteinEncoder(TestCase): +    """Unit tests for the James-Stein encoder."""      def test_small_samples_independent(self): +        """Test the James-Stein encoder with small samples."""          X = np.array(['a', 'b', 'b'])          y = np.array([1, 0, 1])          out = encoders.JamesSteinEncoder(return_df=False, model='independent').fit_transform(X, y) -        self.assertEqual([1, 0.5, 0.5], -                         list(out), -                         'When the count of unique values in the column is <4 (here it is 2), James-Stein estimator returns (unbiased) sample means') +        self.assertEqual( +            [1, 0.5, 0.5], +            list(out), +            'When the count of unique values in the column is <4 (here it is 2), ' +            'the James-Stein estimator returns (unbiased) sample means', +        )      def test_large_samples(self): +        """Test the James-Stein encoder with large samples."""          X = np.array(['a', 'b', 'b', 'c', 'd'])          y = np.array([1, 0, 1, 0, 0])          out = encoders.JamesSteinEncoder(return_df=False, model='independent').fit_transform(X, y) -        self.assertNotEqual([1, 0.5, 0.5, 0, 0], -                            list(out), -                            'Shrinkage should kick in with 4 or more unique values') +        self.assertNotEqual( +            [1, 0.5, 0.5, 0, 0], list(out), +            'Shrinkage should kick in with 4 or more unique values' +        )          self.assertTrue(np.max(out) <= 1, 'This should still be a probability')          self.assertTrue(np.min(out) >= 0, 'This should still be a probability')      def
test_zero_variance(self): + """Test the James-Stein encoder with zero variance.""" X = np.array(['a', 'b', 'c', 'd', 'd']) y = np.array([0, 1, 1, 1, 1]) out = encoders.JamesSteinEncoder(return_df=False, model='independent').fit_transform(X, y) - self.assertEqual([0, 1, 1, 1, 1], - list(out), - 'Should not result into division by zero') + self.assertEqual([0, 1, 1, 1, 1], list(out), 'Should not result into division by zero') def test_continuous_target(self): + """Test the James-Stein encoder with continuous target.""" X = np.array(['a', 'b', 'b', 'c']) y = np.array([-10, 0, 0, 10]) out = encoders.JamesSteinEncoder(return_df=False, model='independent').fit_transform(X, y) - self.assertEqual([-10, 0, 0, 10], - list(out), - 'The model assumes normal distribution -> we support real numbers') + self.assertEqual( + [-10, 0, 0, 10], + list(out), + 'The model assumes normal distribution -> we support real numbers', + ) # Pooled def test_continuous_target_pooled(self): + """Test the James-Stein encoder with continuous target and pooled model.""" X = np.array(['a', 'b', 'b', 'c']) y = np.array([-10, 0, 0, 10]) out = encoders.JamesSteinEncoder(return_df=False, model='pooled').fit_transform(X, y) - self.assertEqual([-10, 0, 0, 10], - list(out), - 'The model assumes normal distribution -> we support real numbers') + self.assertEqual( + [-10, 0, 0, 10], + list(out), + 'The model assumes normal distribution -> we support real numbers', + ) def test_large_samples_pooled(self): + """Test the James-Stein encoder with large samples and pooled model.""" X = np.array(['a', 'b', 'b', 'c', 'd']) y = np.array([1, 0, 1, 0, 0]) out = encoders.JamesSteinEncoder(return_df=False, model='pooled').fit_transform(X, y) - self.assertNotEqual([1, 0.5, 0.5, 0, 0], - list(out), - 'Shrinkage should kick in with 4 or more unique values') + self.assertNotEqual( + [1, 0.5, 0.5, 0, 0], list(out), 'Shrinkage should kick in with 4 or more unique values' + ) self.assertTrue(np.max(out) <= 1, 'This should still be a probability') self.assertTrue(np.min(out) >= 0, 'This should still be a probability') def test_ids_small_pooled(self): + """Test the James-Stein encoder with small samples and pooled model.""" X = np.array(['a', 'b', 'c']) y = np.array([1, 0, 1]) out = encoders.JamesSteinEncoder(model='pooled').fit_transform(X, y) - self.assertTrue(all(np.var(out, axis=0) == 0), - 'This is not a standard behaviour of James-Stein estimator. But it helps a lot if we treat id-like attributes as non-predictive.') + self.assertTrue( + all(np.var(out, axis=0) == 0), + 'This is not a standard behaviour of James-Stein estimator. ' + 'But it helps a lot if we treat id-like attributes as non-predictive.', + ) def test_ids_large_pooled(self): + """Test the James-Stein encoder with large samples and pooled model.""" X = np.array(['a', 'b', 'c', 'd', 'e']) y = np.array([1, 0, 1, 0, 1]) out = encoders.JamesSteinEncoder(model='pooled').fit_transform(X, y) - self.assertTrue(all(np.var(out, axis=0) == 0), - 'This is not a standard behaviour of James-Stein estimator. But it helps a lot if we treat id-like attributes as non-predictive.') + self.assertTrue( + all(np.var(out, axis=0) == 0), + 'This is not a standard behaviour of James-Stein estimator. 
' + 'But it helps a lot if we treat id-like attributes as non-predictive.', + ) # Beta def test_continuous_target_beta(self): + """Test the James-Stein encoder with continuous target and beta model.""" X = np.array(['a', 'b', 'b', 'c']) y = np.array([-10, 0, 0, 10]) out = encoders.JamesSteinEncoder(return_df=False, model='beta').fit_transform(X, y) - self.assertEqual([-2, 0, 0, 2], - list(out), - 'The model assumes normal distribution -> we support real numbers') + self.assertEqual( + [-2, 0, 0, 2], + list(out), + 'The model assumes normal distribution -> we support real numbers', + ) def test_large_samples_beta(self): + """Test the James-Stein encoder with large samples and beta model.""" X = np.array(['a', 'b', 'b', 'c', 'd']) y = np.array([1, 0, 1, 0, 0]) out = encoders.JamesSteinEncoder(return_df=False, model='beta').fit_transform(X, y) - self.assertNotEqual([1, 0.5, 0.5, 0, 0], - list(out), - 'Shrinkage should kick in with 4 or more unique values') + self.assertNotEqual( + [1, 0.5, 0.5, 0, 0], list(out), 'Shrinkage should kick in with 4 or more unique values' + ) self.assertTrue(np.max(out) <= 1, 'This should still be a probability') self.assertTrue(np.min(out) >= 0, 'This should still be a probability') def test_ids_small_beta(self): + """Test the James-Stein encoder with small samples and beta model.""" X = np.array(['a', 'b', 'c']) y = np.array([1, 0, 1]) out = encoders.JamesSteinEncoder(model='beta').fit_transform(X, y) - self.assertTrue(all(np.var(out, axis=0) == 0), - 'This is not a standard behaviour of James-Stein estimator. But it helps a lot if we treat id-like attributes as non-predictive.') + self.assertTrue( + all(np.var(out, axis=0) == 0), + 'This is not a standard behaviour of James-Stein estimator. ' + 'But it helps a lot if we treat id-like attributes as non-predictive.', + ) def test_ids_large_beta(self): + """Test the James-Stein encoder with large samples and beta model.""" X = np.array(['a', 'b', 'c', 'd', 'e']) y = np.array([1, 0, 1, 0, 1]) out = encoders.JamesSteinEncoder(model='beta').fit_transform(X, y) - self.assertTrue(all(np.var(out, axis=0) == 0), - 'This is not a standard behaviour of James-Stein estimator. But it helps a lot if we treat id-like attributes as non-predictive.') + self.assertTrue( + all(np.var(out, axis=0) == 0), + 'This is not a standard behaviour of James-Stein estimator. 
' + 'But it helps a lot if we treat id-like attributes as non-predictive.', + ) # Binary def test_small_samples_binary(self): + """Test the James-Stein encoder with small samples and binary model.""" X = np.array(['a', 'b', 'b']) y = np.array([1, 0, 1]) out = encoders.JamesSteinEncoder(return_df=False, model='binary').fit_transform(X, y) - self.assertTrue(np.sum(np.abs([np.log((1.5*1.5)/(0.5*1.5)), np.log((0.5*1.5)/(1.5*1.5)), np.log((0.5*1.5)/(1.5*1.5))] - np.transpose(out))) < 0.001, - 'When the count of unique values in the column is <4 (here it is 2), James-Stein estimator returns (unbiased) sample means') + self.assertTrue( + np.sum( + np.abs( + [ + np.log((1.5 * 1.5) / (0.5 * 1.5)), + np.log((0.5 * 1.5) / (1.5 * 1.5)), + np.log((0.5 * 1.5) / (1.5 * 1.5)), + ] + - np.transpose(out) + ) + ) + < 0.001, + 'When the count of unique values in the column is <4 (here it is 2), ' + 'James-Stein estimator returns (unbiased) sample means', + ) def test_large_samples_binary(self): + """Test the James-Stein encoder with large samples and binary model.""" X = np.array(['a', 'b', 'b', 'c', 'd']) y = np.array([1, 0, 1, 0, 0]) out = encoders.JamesSteinEncoder(return_df=False, model='binary').fit_transform(X, y) - self.assertNotEqual([1, 0.5, 0.5, 0, 0], - list(out), - 'Shrinkage should kick in with 4 or more unique values') + self.assertNotEqual( + [1, 0.5, 0.5, 0, 0], list(out), 'Shrinkage should kick in with 4 or more unique values' + ) def test_identifiers_small_binary(self): + """Test the James-Stein encoder with small samples and binary model on an id column.""" X = np.array(['a', 'b', 'c']) y = np.array([1, 0, 1]) out = encoders.JamesSteinEncoder(model='binary').fit_transform(X, y) - self.assertTrue(all(np.var(out, axis=0) == 0), - 'This is not a standard behaviour of James-Stein estimator. But it helps a lot if we treat id-like attributes as non-predictive.') + self.assertTrue( + all(np.var(out, axis=0) == 0), + 'This is not a standard behaviour of James-Stein estimator. ' + 'But it helps a lot if we treat id-like attributes as non-predictive.', + ) def test_identifiers_large_binary(self): + """Test the James-Stein encoder with large samples and binary model on an id column.""" X = np.array(['a', 'b', 'c', 'd', 'e']) y = np.array([1, 0, 1, 0, 1]) out = encoders.JamesSteinEncoder(model='binary').fit_transform(X, y) - self.assertTrue(all(np.var(out, axis=0) == 0), - 'This is not a standard behaviour of James-Stein estimator. But it helps a lot if we treat id-like attributes as non-predictive.') + self.assertTrue( + all(np.var(out, axis=0) == 0), + 'This is not a standard behaviour of James-Stein estimator. 
' + 'But it helps a lot if we treat id-like attributes as non-predictive.', + ) diff --git a/tests/test_leave_one_out.py b/tests/test_leave_one_out.py index b63d1b6f..90449c2f 100644 --- a/tests/test_leave_one_out.py +++ b/tests/test_leave_one_out.py @@ -1,14 +1,18 @@ -import pandas as pd +"""Unit tests for the LeaveOneOutEncoder.""" from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ -import tests.helpers as th -import numpy as np import category_encoders as encoders +import numpy as np +import pandas as pd + +import tests.helpers as th class TestLeaveOneOutEncoder(TestCase): + """Unit tests for the LeaveOneOutEncoder.""" def test_leave_one_out(self): + """Test basic functionality on a diverse dataset.""" np_X = th.create_array(n_rows=100) np_X_t = th.create_array(n_rows=50, extras=True) np_y = np.random.randn(np_X.shape[0]) > 0.5 @@ -23,9 +27,8 @@ def test_leave_one_out(self): th.verify_numeric(enc.transform(X_t, y_t)) def test_leave_one_out_values(self): - df = pd.DataFrame({ - 'color': ["a", "a", "a", "b", "b", "b"], - 'outcome': [1, 0, 0, 1, 0, 1]}) + """Test that the fitted values are correct.""" + df = pd.DataFrame({'color': ['a', 'a', 'a', 'b', 'b', 'b'], 'outcome': [1, 0, 0, 1, 0, 1]}) X = df.drop('outcome', axis=1) y = df.drop('color', axis=1) @@ -35,20 +38,26 @@ def test_leave_one_out_values(self): self.assertEqual([0.0, 0.5, 0.5, 0.5, 1.0, 0.5], list(obtained['color'])) - def test_leave_one_out_fit_callTwiceOnDifferentData_ExpectRefit(self): + def test_refit(self): + """Test that the encoder can be refit if fit is called twice with different data.""" x_a = pd.DataFrame(data=['1', '2', '2', '2', '2', '2'], columns=['col_a']) - x_b = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_b']) # different values and name + x_b = pd.DataFrame( + data=['1', '1', '1', '2', '2', '2'], columns=['col_b'] + ) # different values and name y_dummy = [True, False, True, False, True, False] encoder = encoders.LeaveOneOutEncoder() encoder.fit(x_a, y_dummy) encoder.fit(x_b, y_dummy) mapping = encoder.mapping self.assertEqual(1, len(mapping)) - self.assertIn('col_b', mapping) # the model should have the updated mapping - expected = pd.DataFrame({'sum': [2.0, 1.0], 'count': [3, 3]}, index=['1', '2'], columns=['sum', 'count']) + self.assertIn('col_b', mapping) # the model should have the updated mapping + expected = pd.DataFrame( + {'sum': [2.0, 1.0], 'count': [3, 3]}, index=['1', '2'], columns=['sum', 'count'] + ) np.testing.assert_equal(expected.values, mapping['col_b'].values) def test_leave_one_out_unique(self): + """Test that unique levels are encoded as the global mean.""" X = pd.DataFrame(data=['1', '2', '2', '2', '3'], columns=['col']) y = np.array([1, 0, 1, 0, 1]) @@ -59,10 +68,11 @@ def test_leave_one_out_unique(self): expected = pd.DataFrame(data=[y.mean(), 0.5, 0, 0.5, y.mean()], columns=['col']) pd.testing.assert_frame_equal(expected, result) - def test_HandleMissingIsValueAndNanInTrain_ExpectAtValueSet(self): - df = pd.DataFrame({ - 'color': [np.nan, np.nan, np.nan, "b", "b", "b"], - 'outcome': [2, 2, 0, 1, 0, 1]}) + def test_handle_missing_value_nan_in_training(self): + """Should encode missing values with the mean in training.""" + df = pd.DataFrame( + {'color': [np.nan, np.nan, np.nan, 'b', 'b', 'b'], 'outcome': [2, 2, 0, 1, 0, 1]} + ) X = df.drop('outcome', axis=1) y = df.drop('color', axis=1) @@ -72,10 +82,11 @@ def test_HandleMissingIsValueAndNanInTrain_ExpectAtValueSet(self): self.assertEqual([1, 1, 2, 0.5, 1.0, 0.5], 
list(obtained['color'])) - def test_HandleMissingIsValueAndNanInTest_ExpectMean(self): - df = pd.DataFrame({ - 'color': ["a", "a", "a", "b", "b", "b"], - 'outcome': [1.6, 0, 0, 1, 0, 1]}) + def test_handle_missing_value_nan_not_in_training(self): + """Should encode missing values with the global mean if not present in training.""" + df = pd.DataFrame( + {'color': ['a', 'a', 'a', 'b', 'b', 'b'], 'outcome': [1.6, 0, 0, 1, 0, 1]} + ) train = df.drop('outcome', axis=1) target = df.drop('color', axis=1) @@ -86,12 +97,11 @@ def test_HandleMissingIsValueAndNanInTest_ExpectMean(self): ce_leave.fit(train, target['outcome']) obtained = ce_leave.transform(test, test_target) - self.assertEqual([.6, 1.0], list(obtained['color'])) + self.assertEqual([0.6, 1.0], list(obtained['color'])) - def test_HandleMissingIsValueAndNanInTestAndNoTestTarget_ExpectMean(self): - df = pd.DataFrame({ - 'color': ["a", "a", "a", "b", "b", "b"], - 'outcome': [1, 0, 0, 1, 0, 1]}) + def test_handle_missing_value(self): + """Should encode missing values with the global mean.""" + df = pd.DataFrame({'color': ['a', 'a', 'a', 'b', 'b', 'b'], 'outcome': [1, 0, 0, 1, 0, 1]}) train = df.drop('outcome', axis=1) target = df.drop('color', axis=1) @@ -101,10 +111,11 @@ def test_HandleMissingIsValueAndNanInTestAndNoTestTarget_ExpectMean(self): ce_leave.fit(train, target['outcome']) obtained = ce_leave.transform(test) - self.assertEqual([.5, 2/3.0], list(obtained['color'])) + self.assertEqual([0.5, 2 / 3.0], list(obtained['color'])) - def test_HandleUnknownValue_HaveUnknownInTest_ExpectMean(self): - train = pd.Series(["a", "a", "a", "b", "b", "b"], name='color') + def test_handle_unknown(self): + """Should encode unknown values with the global mean.""" + train = pd.Series(['a', 'a', 'a', 'b', 'b', 'b'], name='color') target = pd.Series([1.6, 0, 0, 1, 0, 1], name='target') test = pd.Series(['b', 'c'], name='color') test_target = pd.Series([0, 0]) @@ -113,21 +124,21 @@ def test_HandleUnknownValue_HaveUnknownInTest_ExpectMean(self): ce_leave.fit(train, target) obtained = ce_leave.transform(test, test_target) - self.assertEqual([1.0, .6], list(obtained['color'])) + self.assertEqual([1.0, 0.6], list(obtained['color'])) def test_leave_one_out_categorical(self): - """ - test that if the input is a pd.Categorical the output is the same as for string columns - :return: - """ - df = pd.DataFrame({ - 'color_str': ["a", "a", "a", "b", "b", "b"], - 'color_num_cat': pd.Categorical([1.0, 1.0, 1.0, 2.0, 2.0, 2.0]), - 'color_str_cat': pd.Categorical(["a", "a", "a", "b", "b", "b"]), - 'outcome': [1, 0, 0, 1, 0, 1]}) + """Test that pd.Categorical columns work the same way as string columns.""" + df = pd.DataFrame( + { + 'color_str': ['a', 'a', 'a', 'b', 'b', 'b'], + 'color_num_cat': pd.Categorical([1.0, 1.0, 1.0, 2.0, 2.0, 2.0]), + 'color_str_cat': pd.Categorical(['a', 'a', 'a', 'b', 'b', 'b']), + 'outcome': [1, 0, 0, 1, 0, 1], + } + ) X = df.drop('outcome', axis=1) - y = df["outcome"] + y = df['outcome'] ce_leave = encoders.LeaveOneOutEncoder() obtained = ce_leave.fit_transform(X, y) diff --git a/tests/test_m_estimate.py b/tests/test_m_estimate.py index a1b26f4c..9bb7e0db 100644 --- a/tests/test_m_estimate.py +++ b/tests/test_m_estimate.py @@ -1,33 +1,34 @@ +"""Tests for the MEstimateEncoder.""" from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ + import category_encoders as encoders +x = ['A', 'A', 'B', 'B'] +y = [1, 1, 0, 1] +x_t = ['A', 'B', 'C'] + class TestMEstimateEncoder(TestCase): + """Tests for the MEstimateEncoder.""" def 
test_reference_m0(self): - x = ['A', 'A', 'B', 'B'] - y = [1, 1, 0, 1] - x_t = ['A', 'B', 'C'] - + """Test the MEstimateEncoder with m=0, i.e. no shrinkage.""" encoder = encoders.MEstimateEncoder(m=0, handle_unknown='value', handle_missing='value') encoder.fit(x, y) scored = encoder.transform(x_t) - expected = [[1], - [0.5], - [3./4.]] # The prior probability + expected = [[1], [0.5], [3.0 / 4.0]] # The prior probability self.assertEqual(scored.to_numpy().tolist(), expected) def test_reference_m1(self): - x = ['A', 'A', 'B', 'B'] - y = [1, 1, 0, 1] - x_t = ['A', 'B', 'C'] - + """Test the MEstimateEncoder with m=1.""" encoder = encoders.MEstimateEncoder(m=1, handle_unknown='value', handle_missing='value') encoder.fit(x, y) scored = encoder.transform(x_t) - expected = [[(2+3./4.)/(2+1)], - [(1+3./4.)/(2+1)], - [3./4.]] # The prior probability + # m-estimate: (n_positive + m * prior) / (n + m), with prior = 3/4 + expected = [ + [(2 + 3.0 / 4.0) / (2 + 1)], + [(1 + 3.0 / 4.0) / (2 + 1)], + [3.0 / 4.0], # The prior probability + ] self.assertEqual(scored.to_numpy().tolist(), expected) diff --git a/tests/test_one_hot.py b/tests/test_one_hot.py index ce280d07..82ffbcfb 100644 --- a/tests/test_one_hot.py +++ b/tests/test_one_hot.py @@ -1,21 +1,27 @@ -import pandas as pd +"""Tests for the OneHotEncoder.""" from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ -import numpy as np -import tests.helpers as th import category_encoders as encoders +import numpy as np +import pandas as pd +import tests.helpers as th -class TestOneHotEncoderTestCase(TestCase): + +class TestOneHotEncoder(TestCase): + """Tests for the OneHotEncoder.""" def test_one_hot(self): + """Test basic functionality.""" X = th.create_dataset(n_rows=100) X_t = th.create_dataset(n_rows=50, extras=True) enc = encoders.OneHotEncoder(verbose=1, return_df=False) enc.fit(X) - self.assertEqual(enc.transform(X_t).shape[1], - enc.transform(X).shape[1], - 'We have to get the same count of columns despite the presence of a new value') + self.assertEqual( + enc.transform(X_t).shape[1], + enc.transform(X).shape[1], + 'We have to get the same count of columns despite the presence of a new value', + ) enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='indicator') enc.fit(X) @@ -34,12 +40,16 @@ def test_one_hot(self): with self.assertRaises(ValueError): enc.transform(X_t) - enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='return_nan', use_cat_names=True) + enc = encoders.OneHotEncoder( + verbose=1, return_df=True, handle_unknown='return_nan', use_cat_names=True + ) enc.fit(X) out = enc.transform(X_t) self.assertIn('extra_A', out.columns) - enc = encoders.OneHotEncoder(verbose=1, return_df=True, use_cat_names=True, handle_unknown='indicator') + enc = encoders.OneHotEncoder( + verbose=1, return_df=True, use_cat_names=True, handle_unknown='indicator' + ) enc.fit(X) out = enc.transform(X_t) self.assertIn('extra_-1', out.columns) @@ -54,104 +64,114 @@ def test_one_hot(self): obtained = enc.inverse_transform(enc.transform(X_i_t)) th.verify_inverse_transform(X_i_t, obtained) - def test_fit_transform_HaveMissingValuesAndUseCatNames_ExpectCorrectValue(self): - encoder = encoders.OneHotEncoder(cols=[0], use_cat_names=True, handle_unknown='indicator', return_df=False) + def test_fit_transform_use_cat_names(self): + """Test that use_cat_names works as expected. + + @ToDo: This test is not very useful as it seems to be covered by other tests already. 
+ """ + encoder = encoders.OneHotEncoder( + cols=[0], use_cat_names=True, handle_unknown='indicator', return_df=False + ) result = encoder.fit_transform([[-1]]) self.assertListEqual([[1, 0]], result.tolist()) - def test_inverse_transform_HaveDedupedColumns_ExpectCorrectInverseTransform(self): - encoder = encoders.OneHotEncoder(cols=['match', 'match_box'], use_cat_names=True) - value = pd.DataFrame({'match': pd.Series('box_-1'), 'match_box': pd.Series(-1)}) - - transformed = encoder.fit_transform(value) - inverse_transformed = encoder.inverse_transform(transformed) - - assert value.equals(inverse_transformed) - - def test_inverse_transform_HaveNoCatNames_ExpectCorrectInverseTransform(self): - encoder = encoders.OneHotEncoder(cols=['match', 'match_box'], use_cat_names=False) - value = pd.DataFrame({'match': pd.Series('box_-1'), 'match_box': pd.Series(-1)}) - - transformed = encoder.fit_transform(value) - inverse_transformed = encoder.inverse_transform(transformed) - - assert value.equals(inverse_transformed) - - def test_fit_transform_HaveColumnAppearTwice_ExpectColumnsDeduped(self): - encoder = encoders.OneHotEncoder(cols=['match', 'match_box'], use_cat_names=True, handle_unknown='indicator') + def test_inverse_transform_duplicated_cat_names(self): + """Test that inverse_transform works with duplicated cat names. + + This can happen if use_cat_names is True and the two new column names coincide because + col_1 + label_A is the same as col_2 + label_B. + """ + cases = {"should work if use_cat_names is True": True, + "should work if use_cat_names is False": False} + for case, use_cat_names in cases.items(): + with self.subTest(case=case): + encoder = encoders.OneHotEncoder(cols=['match', 'match_box'], + use_cat_names=use_cat_names) + value = pd.DataFrame({'match': pd.Series('box_-1'), 'match_box': pd.Series(-1)}) + + transformed = encoder.fit_transform(value) + inverse_transformed = encoder.inverse_transform(transformed) + + pd.testing.assert_frame_equal(value, inverse_transformed) + + def test_fit_transform_duplicated_column_rename(self): + """Check that # is added to duplicated column names. + + Column names can be duplicated either by use_cat_names=True or by having the label -1 + and adding an indicator column. 
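+ + For example, with use_cat_names=True, column 'match' with label 'box_-1' and column 'match_box' with label '-1' both produce the column name 'match_box_-1'.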
+ """ + encoder = encoders.OneHotEncoder( + cols=['match', 'match_box'], use_cat_names=True, handle_unknown='indicator' + ) value = pd.DataFrame({'match': pd.Series('box_-1'), 'match_box': pd.Series('-1')}) result = encoder.fit_transform(value) columns = result.columns.tolist() - self.assertSetEqual({'match_box_-1', 'match_-1', 'match_box_-1#', 'match_box_-1##'}, set(columns)) - - def test_fit_transform_HaveHandleUnknownValueAndUnseenValues_ExpectAllZeroes(self): - train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) - test = pd.DataFrame({'city': ['Chicago', 'Detroit']}) - expected_result = pd.DataFrame({'city_1': [1, 0], - 'city_2': [0, 0]}, - columns=['city_1', 'city_2']) - - enc = encoders.OneHotEncoder(handle_unknown='value') - result = enc.fit(train).transform(test) - - pd.testing.assert_frame_equal(expected_result, result) + self.assertSetEqual( + {'match_box_-1', 'match_-1', 'match_box_-1#', 'match_box_-1##'}, set(columns) + ) - def test_fit_transform_HaveHandleUnknownValueAndSeenValues_ExpectMappingUsed(self): + def test_fit_transform_handle_unknown_value(self): + """Test that unseen values are encoded as all zeroes.""" train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) - expected_result = pd.DataFrame({'city_1': [1, 0], - 'city_2': [0, 1]}, - columns=['city_1', 'city_2']) - enc = encoders.OneHotEncoder(handle_unknown='value') - result = enc.fit(train).transform(train) - - pd.testing.assert_frame_equal(expected_result, result) - - def test_fit_transform_HaveHandleUnknownIndicatorAndNoMissingValue_ExpectExtraColumn(self): - train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) - expected_result = pd.DataFrame({'city_1': [1, 0], - 'city_2': [0, 1], - 'city_-1': [0, 0]}, - columns=['city_1', 'city_2', 'city_-1']) - - enc = encoders.OneHotEncoder(handle_unknown='indicator') - result = enc.fit(train).transform(train) - - pd.testing.assert_frame_equal(expected_result, result) - - def test_fit_transform_HaveHandleUnknownIndicatorAndMissingValue_ExpectValueSet(self): + enc.fit(train) + with self.subTest("should encode unseen values as all zeroes"): + test = pd.DataFrame({'city': ['Chicago', 'Detroit']}) + expected_result = pd.DataFrame( + {'city_1': [1, 0], 'city_2': [0, 0]}, columns=['city_1', 'city_2'] + ) + result = enc.transform(test) + pd.testing.assert_frame_equal(expected_result, result) + + with self.subTest("should work if no unseen data"): + expected_result = pd.DataFrame( + {'city_1': [1, 0], 'city_2': [0, 1]}, columns=['city_1', 'city_2'] + ) + result = enc.transform(train) + pd.testing.assert_frame_equal(expected_result, result) + + def test_fit_transform_handle_unknown_indicator(self): + """Test that unseen values are encoded with an indicator column.""" train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) - test = pd.DataFrame({'city': ['Chicago', 'Detroit']}) - expected_result = pd.DataFrame({'city_1': [1, 0], - 'city_2': [0, 0], - 'city_-1': [0, 1]}, - columns=['city_1', 'city_2', 'city_-1']) - enc = encoders.OneHotEncoder(handle_unknown='indicator') - result = enc.fit(train).transform(test) - - pd.testing.assert_frame_equal(expected_result, result) - - def test_HandleMissingError(self): + enc.fit(train) + with self.subTest("Should create a column even if no unseen value in transform stage"): + expected_result = pd.DataFrame( + {'city_1': [1, 0], 'city_2': [0, 1], 'city_-1': [0, 0]}, + columns=['city_1', 'city_2', 'city_-1'], + ) + result = enc.transform(train) + pd.testing.assert_frame_equal(expected_result, result) + with self.subTest("Should create a column if 
unseen value in transform stage"): + test = pd.DataFrame({'city': ['Chicago', 'Detroit']}) + expected_result = pd.DataFrame( + {'city_1': [1, 0], 'city_2': [0, 0], 'city_-1': [0, 1]}, + columns=['city_1', 'city_2', 'city_-1'], + ) + result = enc.transform(test) + pd.testing.assert_frame_equal(expected_result, result) + + def test_handle_missing_error(self): + """Test that missing values raise an error.""" data_no_missing = ['A', 'B', 'B'] data_w_missing = [np.nan, 'B', 'B'] - encoder = encoders.OneHotEncoder(handle_missing="error") + encoder = encoders.OneHotEncoder(handle_missing='error') result = encoder.fit_transform(data_no_missing) - expected = [[1, 0], - [0, 1], - [0, 1]] + expected = [[1, 0], [0, 1], [0, 1]] self.assertEqual(result.to_numpy().tolist(), expected) self.assertRaisesRegex(ValueError, '.*null.*', encoder.transform, data_w_missing) self.assertRaisesRegex(ValueError, '.*null.*', encoder.fit, data_w_missing) - def test_HandleMissingReturnNan(self): + def test_handle_missing_return_nan(self): + """Test that missing values are encoded as NaN in each dummy column.""" train = pd.DataFrame({'x': ['A', np.nan, 'B']}) encoder = encoders.OneHotEncoder(handle_missing='return_nan', use_cat_names=True) result = encoder.fit_transform(train) @@ -160,100 +180,87 @@ def test_HandleMissingReturnNan(self): pd.DataFrame({'x_A': [1, np.nan, 0], 'x_B': [0, np.nan, 1]}), ) - def test_HandleMissingIgnore(self): - train = pd.DataFrame({'x': ['A', 'B', np.nan], - 'y': ['A', None, 'A'], - 'z': [np.nan, 'B', 'B']}) + def test_handle_missing_ignore(self): + """Test that missing values are encoded as 0 in each dummy column.""" + train = pd.DataFrame( + {'x': ['A', 'B', np.nan], 'y': ['A', None, 'A'], 'z': [np.nan, 'B', 'B']} + ) train['z'] = train['z'].astype('category') - expected_result = pd.DataFrame({'x_A': [1, 0, 0], - 'x_B': [0, 1, 0], - 'y_A': [1, 0, 1], - 'z_B': [0, 1, 1]}) + expected_result = pd.DataFrame( + {'x_A': [1, 0, 0], 'x_B': [0, 1, 0], 'y_A': [1, 0, 1], 'z_B': [0, 1, 1]} + ) encoder = encoders.OneHotEncoder(handle_missing='ignore', use_cat_names=True) result = encoder.fit_transform(train) pd.testing.assert_frame_equal(result, expected_result) - def test_HandleMissingIgnore_ExpectMappingUsed(self): + def test_handle_missing_ignore_test_mapping(self): + """Test that the mapping is correct if handle_missing='ignore'.""" train = pd.DataFrame({'city': ['Chicago', np.nan, 'Geneva']}) - expected_result = pd.DataFrame({'city_1': [1, 0, 0], - 'city_2': [0, 0, 1]}) + expected_result = pd.DataFrame({'city_1': [1, 0, 0], 'city_2': [0, 0, 1]}) encoder = encoders.OneHotEncoder(handle_missing='ignore') result = encoder.fit(train).transform(train) - expected_mapping = pd.DataFrame([ - [1, 0], - [0, 1], - [0, 0], - [0, 0], - ], columns=["city_1", "city_2"], index=[1, 2, -2, -1]) + expected_mapping = pd.DataFrame( + [ + [1, 0], + [0, 1], + [0, 0], + [0, 0], + ], + columns=['city_1', 'city_2'], + index=[1, 2, -2, -1], + ) pd.testing.assert_frame_equal(expected_result, result) - pd.testing.assert_frame_equal(expected_mapping, encoder.category_mapping[0]["mapping"]) - - def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): - train = ['A', 'B', np.nan] - - encoder = encoders.OneHotEncoder(handle_missing='indicator', handle_unknown='value') - result = encoder.fit_transform(train) - - expected = [[1, 0, 0], - [0, 1, 0], - [0, 0, 1]] - self.assertEqual(result.to_numpy().tolist(), expected) - - def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): - train = ['A', 'B'] - - encoder = 
encoders.OneHotEncoder(handle_missing='indicator', handle_unknown='value') - result = encoder.fit_transform(train) - - expected = [[1, 0, 0], - [0, 1, 0]] - self.assertEqual(result.to_numpy().tolist(), expected) - - def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): + pd.testing.assert_frame_equal(expected_mapping, encoder.category_mapping[0]['mapping']) + + def test_handle_missing_indicator(self): + """Test that missing values are encoded with an indicator column.""" + with self.subTest("Should create a column if NaN in training set"): + train = ['A', 'B', np.nan] + encoder = encoders.OneHotEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + expected = [[1, 0, 0], [0, 1, 0], [0, 0, 1]] + self.assertEqual(result.to_numpy().tolist(), expected) + + with self.subTest("should create a column if NaN not in training set"): + train = ['A', 'B'] + + encoder = encoders.OneHotEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [[1, 0, 0], [0, 1, 0]] + self.assertEqual(result.to_numpy().tolist(), expected) + + # if NaN occurs in prediction it should be encoded as a new column + test = ['A', 'B', np.nan] + encoded_test = encoder.transform(test) + expected_test = [[1, 0, 0], [0, 1, 0], [0, 0, 1]] + self.assertEqual(encoded_test.to_numpy().tolist(), expected_test) + + def test_handle_unknown_indicator(self): + """Test that unseen values are encoded with an indicator column.""" train = ['A', 'B'] - test = ['A', 'B', np.nan] - - encoder = encoders.OneHotEncoder(handle_missing='indicator', handle_unknown='value') - encoded_train = encoder.fit_transform(train) - encoded_test = encoder.transform(test) - - expected_1 = [[1, 0, 0], - [0, 1, 0]] - self.assertEqual(encoded_train.to_numpy().tolist(), expected_1) - - expected_2 = [[1, 0, 0], - [0, 1, 0], - [0, 0, 1]] - self.assertEqual(encoded_test.to_numpy().tolist(), expected_2) - - def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): - train = ['A', 'B'] - test = ['A', 'B', 'C'] - encoder = encoders.OneHotEncoder(handle_unknown='indicator', handle_missing='value') encoder.fit(train) - result = encoder.transform(test) - - expected = [[1, 0, 0], - [0, 1, 0], - [0, 0, 1]] - self.assertEqual(result.to_numpy().tolist(), expected) - - def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): - train = ['A', 'B'] - - encoder = encoders.OneHotEncoder(handle_unknown='indicator', handle_missing='value') - result = encoder.fit_transform(train) - - expected = [[1, 0, 0], - [0, 1, 0]] - self.assertEqual(result.to_numpy().tolist(), expected) - - def test_inverse_transform_HaveNanInTrainAndHandleMissingValue_ExpectReturnedWithNan(self): + with self.subTest("should create a column if unseen value in transform stage"): + test = ['A', 'B', 'C'] + result = encoder.transform(test) + expected = [[1, 0, 0], [0, 1, 0], [0, 0, 1]] + self.assertEqual(result.to_numpy().tolist(), expected) + + with self.subTest("should also create a column if no unseen value in transform"): + result = encoder.transform(train) + expected = [[1, 0, 0], [0, 1, 0]] + self.assertEqual(result.to_numpy().tolist(), expected) + + def test_inverse_transform_missing_value(self): + """Test the inverse transform with handle_missing='value'. + + This should output the original data if the input data is inverse transformed. 
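+ + For example, ['chicago', nan] should survive a transform/inverse_transform round trip unchanged.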
+ """ train = pd.DataFrame({'city': ['chicago', np.nan]}) enc = encoders.OneHotEncoder(handle_missing='value', handle_unknown='value') @@ -262,7 +269,11 @@ def test_inverse_transform_HaveNanInTrainAndHandleMissingValue_ExpectReturnedWit pd.testing.assert_frame_equal(train, original) - def test_inverse_transform_HaveNanInTrainAndHandleMissingReturnNan_ExpectReturnedWithNan(self): + def test_inverse_transform_missing_return_nan(self): + """Test the inverse transform with handle_missing='return_nan'. + + This should output the original data if the input data is inverse transformed. + """ train = pd.DataFrame({'city': ['chicago', np.nan]}) enc = encoders.OneHotEncoder(handle_missing='return_nan', handle_unknown='value') @@ -271,7 +282,11 @@ def test_inverse_transform_HaveNanInTrainAndHandleMissingReturnNan_ExpectReturne pd.testing.assert_frame_equal(train, original) - def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self): + def test_inverse_transform_missing_and_unknown_return_nan(self): + """Test the inverse transform with handle_missing and handle_unknown='return_nan'. + + This should raise a warning as the unknown category cannot be inverted. + """ train = pd.DataFrame({'city': ['chicago', np.nan]}) test = pd.DataFrame({'city': ['chicago', 'los angeles']}) @@ -279,31 +294,30 @@ def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self): enc.fit(train) result = enc.transform(test) - message = 'inverse_transform is not supported because transform impute '\ - 'the unknown category nan when encode city' + message = ( + 'inverse_transform is not supported because transform impute ' + 'the unknown category nan when encode city' + ) - with self.assertWarns(UserWarning, msg=message) as w: + with self.assertWarns(UserWarning, msg=message): enc.inverse_transform(result) - def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self): - train = pd.DataFrame({'city': ['chicago', np.nan]}) - test = pd.DataFrame({'city': ['chicago', 'los angeles']}) - - enc = encoders.OneHotEncoder(handle_missing='value', handle_unknown='return_nan') - enc.fit(train) - result = enc.transform(test) - original = enc.inverse_transform(result) - - pd.testing.assert_frame_equal(train, original) - - def test_inverse_transform_HaveHandleMissingValueAndHandleUnknownReturnNan_ExpectBestInverse(self): + def test_inverse_transform_handle_missing_value(self): + """Test inverse transform if missing values are encoded with strategy 'value'.""" train = pd.DataFrame({'city': ['chicago', np.nan]}) - test = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']}) - expected = pd.DataFrame({'city': ['chicago', np.nan, np.nan]}) enc = encoders.OneHotEncoder(handle_missing='value', handle_unknown='return_nan') enc.fit(train) - result = enc.transform(test) - original = enc.inverse_transform(result) - pd.testing.assert_frame_equal(expected, original) + test_data_case_1 = pd.DataFrame({'city': ['chicago', 'los angeles']}) + test_data_case_2 = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']}) + expected_case_2 = pd.DataFrame({'city': ['chicago', np.nan, np.nan]}) + cases = {"should encode unknown into nan": (test_data_case_1, train), + "should encode unknown into nan and missing into nan": (test_data_case_2, + expected_case_2), + } + for case, (test_data, expected) in cases.items(): + with self.subTest(case=case): + result = enc.transform(test_data) + original = enc.inverse_transform(result) + pd.testing.assert_frame_equal(expected, original) diff --git a/tests/test_ordinal.py 
b/tests/test_ordinal.py index c34be0ad..a37a939e 100644 --- a/tests/test_ordinal.py +++ b/tests/test_ordinal.py @@ -1,9 +1,11 @@ -import pandas as pd +"""Tests for the Ordinal encoder.""" from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ -import tests.helpers as th -import numpy as np + import category_encoders as encoders +import numpy as np +import pandas as pd +import tests.helpers as th np_X = th.create_array(n_rows=100) np_X_t = th.create_array(n_rows=50, extras=True) @@ -16,48 +18,54 @@ class TestOrdinalEncoder(TestCase): - def test_ordinal(self): + """Unit tests for the Ordinal encoder.""" + def test_ordinal(self): + """Test some basic functionality.""" enc = encoders.OrdinalEncoder(verbose=1, return_df=True) enc.fit(X) out = enc.transform(X_t) - self.assertEqual(len(set(out["extra"].values)), 4) - self.assertIn(-1, set(out["extra"].values)) + self.assertEqual(len(set(out['extra'].values)), 4) + self.assertIn(-1, set(out['extra'].values)) self.assertFalse(enc.mapping is None) self.assertTrue(len(enc.mapping) > 0) enc = encoders.OrdinalEncoder(verbose=1, mapping=enc.mapping, return_df=True) enc.fit(X) out = enc.transform(X_t) - self.assertEqual(len(set(out["extra"].values)), 4) - self.assertIn(-1, set(out["extra"].values)) + self.assertEqual(len(set(out['extra'].values)), 4) + self.assertIn(-1, set(out['extra'].values)) self.assertTrue(len(enc.mapping) > 0) - enc = encoders.OrdinalEncoder(verbose=1, return_df=True, handle_unknown="return_nan") + enc = encoders.OrdinalEncoder(verbose=1, return_df=True, handle_unknown='return_nan') enc.fit(X) out = enc.transform(X_t) - out_cats = [x for x in set(out["extra"].values) if np.isfinite(x)] + out_cats = [x for x in set(out['extra'].values) if np.isfinite(x)] self.assertEqual(len(out_cats), 3) self.assertFalse(enc.mapping is None) def test_ordinal_dist(self): - data = np.array([["apple", "lemon"], ["peach", None]]) + """Test that the encoder works with multiple columns and all encodings are distinct.""" + data = np.array([['apple', 'lemon'], ['peach', None]]) encoder = encoders.OrdinalEncoder() result = encoder.fit_transform(data) - self.assertEqual(2, len(result[0].unique()), "We expect two unique values in the column") - self.assertEqual(2, len(result[1].unique()), "We expect two unique values in the column") + self.assertEqual(2, len(result[0].unique())) + self.assertEqual(2, len(result[1].unique())) self.assertFalse(np.isnan(result.iloc[1, 1])) - encoder = encoders.OrdinalEncoder(handle_missing="return_nan") + encoder = encoders.OrdinalEncoder(handle_missing='return_nan') result = encoder.fit_transform(data) - self.assertEqual(2, len(result[0].unique()), "We expect two unique values in the column") - self.assertEqual(2, len(result[1].unique()), "We expect two unique values in the column") + self.assertEqual(2, len(result[0].unique())) + self.assertEqual(2, len(result[1].unique())) def test_pandas_categorical(self): + """Test that the encoder works with pandas Categorical data.""" X = pd.DataFrame( { - "Str": ["a", "c", "c", "d"], - "Categorical": pd.Categorical(list("bbea"), categories=["e", "a", "b"], ordered=True), + 'Str': ['a', 'c', 'c', 'd'], + 'Categorical': pd.Categorical( + list('bbea'), categories=['e', 'a', 'b'], ordered=True + ), } ) @@ -65,292 +73,324 @@ def test_pandas_categorical(self): out = enc.fit_transform(X) th.verify_numeric(out) - self.assertEqual(3, out["Categorical"][0]) - self.assertEqual(3, out["Categorical"][1]) - self.assertEqual(1, out["Categorical"][2]) - self.assertEqual(2, 
out["Categorical"][3]) + self.assertEqual(3, out['Categorical'][0]) + self.assertEqual(3, out['Categorical'][1]) + self.assertEqual(1, out['Categorical'][2]) + self.assertEqual(2, out['Categorical'][3]) def test_handle_missing_have_nan_fit_time_expect_as_category(self): + """Test that missing values get their own category code if handle_missing='value'.""" train = pd.DataFrame( { - "city": ["chicago", np.nan], - "city_cat": pd.Categorical(["chicago", np.nan]), + 'city': ['chicago', np.nan], + 'city_cat': pd.Categorical(['chicago', np.nan]), } ) - enc = encoders.OrdinalEncoder(handle_missing="value") + enc = encoders.OrdinalEncoder(handle_missing='value') out = enc.fit_transform(train) - self.assertListEqual([1, 2], out["city"].tolist()) - self.assertListEqual([1, 2], out["city_cat"].tolist()) + self.assertListEqual([1, 2], out['city'].tolist()) + self.assertListEqual([1, 2], out['city_cat'].tolist()) def test_handle_missing_have_nan_transform_time_expect_negative_2(self): + """Test that missing values in the test set are encoded with -2 if not seen in training. + + This is for handle_missing='value'. + """ train = pd.DataFrame( { - "city": ["chicago", "st louis"], - "city_cat": pd.Categorical(["chicago", "st louis"]), + 'city': ['chicago', 'st louis'], + 'city_cat': pd.Categorical(['chicago', 'st louis']), } ) test = pd.DataFrame( { - "city": ["chicago", np.nan], - "city_cat": pd.Categorical(["chicago", np.nan]), + 'city': ['chicago', np.nan], + 'city_cat': pd.Categorical(['chicago', np.nan]), } ) - enc = encoders.OrdinalEncoder(handle_missing="value") + enc = encoders.OrdinalEncoder(handle_missing='value') enc.fit(train) out = enc.transform(test) - self.assertListEqual([1, -2], out["city"].tolist()) - self.assertListEqual([1, -2], out["city_cat"].tolist()) + self.assertListEqual([1, -2], out['city'].tolist()) + self.assertListEqual([1, -2], out['city_cat'].tolist()) def test_handle_unknown_have_new_value_expect_negative_1(self): - # See #238 - train = pd.DataFrame({"city": ["chicago", "st louis"]}) - test = pd.DataFrame({"city": ["chicago", "los angeles"]}) + """Test that unknown values are encoded with -1 when handle_missing='return_nan'.""" + # See issue #238 + train = pd.DataFrame({'city': ['chicago', 'st louis']}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) expected = [1.0, -1.0] - enc = encoders.OrdinalEncoder(handle_missing="return_nan") + enc = encoders.OrdinalEncoder(handle_missing='return_nan') enc.fit(train) - result = enc.transform(test)["city"].tolist() + result = enc.transform(test)['city'].tolist() self.assertEqual(expected, result) def test_handle_unknown_have_new_value_expect_negative_1_categorical(self): - cities = ["st louis", "chicago", "los angeles"] - train = pd.DataFrame({"city": pd.Categorical(cities[:-1], categories=cities)}) - test = pd.DataFrame({"city": pd.Categorical(cities[1:], categories=cities)}) + """Test that unknown values are encoded with -1.""" + cities = ['st louis', 'chicago', 'los angeles'] + train = pd.DataFrame({'city': pd.Categorical(cities[:-1], categories=cities)}) + test = pd.DataFrame({'city': pd.Categorical(cities[1:], categories=cities)}) expected = [2.0, -1.0] - enc = encoders.OrdinalEncoder(handle_missing="return_nan") + enc = encoders.OrdinalEncoder(handle_missing='return_nan') enc.fit(train) - result = enc.transform(test)["city"].tolist() + result = enc.transform(test)['city'].tolist() self.assertEqual(expected, result) def test_custom_mapping(self): + """Test that custom mapping is correctly applied.""" # See issue 193 
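+        # For example, with the col1 mapping below, ['a', 'a', 'b', nan] is encoded as [1, 1, 2, 0].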
custom_mapping = [ { - "col": "col1", - "mapping": {np.nan: 0, "a": 1, "b": 2}, + 'col': 'col1', + 'mapping': {np.nan: 0, 'a': 1, 'b': 2}, }, # The mapping from the documentation - {"col": "col2", "mapping": {np.nan: -3, "x": 11, "y": 2}}, + {'col': 'col2', 'mapping': {np.nan: -3, 'x': 11, 'y': 2}}, ] custom_mapping_series = [ { - "col": "col1", - "mapping": pd.Series({np.nan: 0, "a": 1, "b": 2}), + 'col': 'col1', + 'mapping': pd.Series({np.nan: 0, 'a': 1, 'b': 2}), }, # The mapping from the documentation - {"col": "col2", "mapping": pd.Series({np.nan: -3, "x": 11, "y": 2})}, + {'col': 'col2', 'mapping': pd.Series({np.nan: -3, 'x': 11, 'y': 2})}, ] - train = pd.DataFrame({"col1": ["a", "a", "b", np.nan], "col2": ["x", "y", np.nan, np.nan]}) + train = pd.DataFrame({'col1': ['a', 'a', 'b', np.nan], 'col2': ['x', 'y', np.nan, np.nan]}) for mapping in [custom_mapping, custom_mapping_series]: with self.subTest(): - enc = encoders.OrdinalEncoder(handle_missing="value", mapping=mapping) + enc = encoders.OrdinalEncoder(handle_missing='value', mapping=mapping) + # We have to first 'fit' before 'transform' out = enc.fit_transform( train - ) # We have to first 'fit' before 'transform' even if we do nothing during the fit... + ) - self.assertListEqual([1, 1, 2, 0], out["col1"].tolist()) - self.assertListEqual([11, 2, -3, -3], out["col2"].tolist()) + self.assertListEqual([1, 1, 2, 0], out['col1'].tolist()) + self.assertListEqual([11, 2, -3, -3], out['col2'].tolist()) - def test_HaveNegativeOneInTrain_ExpectCodedAsOne(self): - train = pd.DataFrame({"city": [-1]}) + def test_integers_are_encoded(self): + """Should encode integers, including negative ones, as categories.""" + train = pd.DataFrame({'city': [-1]}) expected = [1] - enc = encoders.OrdinalEncoder(cols=["city"]) - result = enc.fit_transform(train)["city"].tolist() + enc = encoders.OrdinalEncoder(cols=['city']) + result = enc.fit_transform(train)['city'].tolist() self.assertEqual(expected, result) - def test_HavenanInTrain_ExpectCodedAsOne(self): - train = pd.DataFrame({"city": [np.nan]}) + def test_nan_in_training(self): + """Test that NaN values are encoded the same way as non-missing values in the default setting.""" + train = pd.DataFrame({'city': [np.nan]}) expected = [1] - enc = encoders.OrdinalEncoder(cols=["city"]) - result = enc.fit_transform(train)["city"].tolist() + enc = encoders.OrdinalEncoder(cols=['city']) + result = enc.fit_transform(train)['city'].tolist() self.assertEqual(expected, result) - def test_Timestamp(self): + def test_timestamp(self): + """Test that the ordinal encoder works with pandas timestamps.""" df = pd.DataFrame( { - "timestamps": { - 0: pd.Timestamp("1997-09-03 00:00:00"), - 1: pd.Timestamp("1997-09-03 00:00:00"), - 2: pd.Timestamp("2000-09-03 00:00:00"), - 3: pd.Timestamp("1997-09-03 00:00:00"), - 4: pd.Timestamp("1999-09-04 00:00:00"), - 5: pd.Timestamp("2001-09-03 00:00:00"), + 'timestamps': { + 0: pd.Timestamp('1997-09-03 00:00:00'), + 1: pd.Timestamp('1997-09-03 00:00:00'), + 2: pd.Timestamp('2000-09-03 00:00:00'), + 3: pd.Timestamp('1997-09-03 00:00:00'), + 4: pd.Timestamp('1999-09-04 00:00:00'), + 5: pd.Timestamp('2001-09-03 00:00:00'), }, } ) - enc = encoders.OrdinalEncoder(cols=["timestamps"]) + enc = encoders.OrdinalEncoder(cols=['timestamps']) encoded_df = enc.fit_transform(df) - expected_index = [pd.Timestamp("1997-09-03 00:00:00"), - pd.Timestamp("2000-09-03 00:00:00"), - pd.Timestamp("1999-09-04 00:00:00"), - pd.Timestamp("2001-09-03 00:00:00"), - pd.NaT - ] + expected_index = [ + pd.Timestamp('1997-09-03 
00:00:00'), + pd.Timestamp('2000-09-03 00:00:00'), + pd.Timestamp('1999-09-04 00:00:00'), + pd.Timestamp('2001-09-03 00:00:00'), + pd.NaT, + ] expected_mapping = pd.Series([1, 2, 3, 4, -2], index=expected_index) expected_values = [1, 1, 2, 1, 3, 4] - pd.testing.assert_series_equal(expected_mapping, enc.mapping[0]["mapping"]) - self.assertListEqual(expected_values, encoded_df["timestamps"].tolist()) + pd.testing.assert_series_equal(expected_mapping, enc.mapping[0]['mapping']) + self.assertListEqual(expected_values, encoded_df['timestamps'].tolist()) - def test_NoGaps(self): - train = pd.DataFrame({"city": ["New York", np.nan, "Rio", None, "Rosenheim"]}) - expected_mapping_value = pd.Series([1, 2, 3, 4], index=["New York", "Rio", "Rosenheim", np.nan]) - expected_mapping_return_nan = pd.Series([1, 2, 3, -2], index=["New York", "Rio", "Rosenheim", np.nan]) + def test_no_gaps(self): + """Test that the ordinal mapping does not have gaps.""" + train = pd.DataFrame({'city': ['New York', np.nan, 'Rio', None, 'Rosenheim']}) + expected_mapping_value = pd.Series( + [1, 2, 3, 4], index=['New York', 'Rio', 'Rosenheim', np.nan] + ) + expected_mapping_return_nan = pd.Series( + [1, 2, 3, -2], index=['New York', 'Rio', 'Rosenheim', np.nan] + ) - enc_value = encoders.OrdinalEncoder(cols=["city"], handle_missing="value") + enc_value = encoders.OrdinalEncoder(cols=['city'], handle_missing='value') enc_value.fit(train) - pd.testing.assert_series_equal(expected_mapping_value, enc_value.mapping[0]["mapping"]) - enc_return_nan = encoders.OrdinalEncoder(cols=["city"], handle_missing="return_nan") + pd.testing.assert_series_equal(expected_mapping_value, enc_value.mapping[0]['mapping']) + enc_return_nan = encoders.OrdinalEncoder(cols=['city'], handle_missing='return_nan') enc_return_nan.fit(train) - pd.testing.assert_series_equal(expected_mapping_return_nan, enc_return_nan.mapping[0]["mapping"]) + pd.testing.assert_series_equal( + expected_mapping_return_nan, enc_return_nan.mapping[0]['mapping'] + ) - def test_HaveNoneAndNan_ExpectCodesAsOne(self): - train = pd.DataFrame({"city": [np.nan, None]}) + def test_nan_and_none_is_encoded_the_same(self): + """Test that NaN and None are encoded the same.""" + train = pd.DataFrame({'city': [np.nan, None]}) expected = [1, 1] - enc = encoders.OrdinalEncoder(cols=["city"]) - result = enc.fit_transform(train)["city"].tolist() + enc = encoders.OrdinalEncoder(cols=['city']) + result = enc.fit_transform(train)['city'].tolist() self.assertEqual(expected, result) new_nan = pd.DataFrame( { - "city": [ + 'city': [ np.nan, ] } ) - result_new_nan = enc.transform(new_nan)["city"].tolist() + result_new_nan = enc.transform(new_nan)['city'].tolist() expected_new_nan = [1] self.assertEqual(expected_new_nan, result_new_nan) new_none = pd.DataFrame( { - "city": [ + 'city': [ None, ] } ) - result_new_none = enc.transform(new_none)["city"].tolist() + result_new_none = enc.transform(new_none)['city'].tolist() expected_new_none = [1] self.assertEqual(expected_new_none, result_new_none) - def test_inverse_transform_HaveUnknown_ExpectWarning(self): - train = pd.DataFrame({"city": ["chicago", "st louis"]}) - test = pd.DataFrame({"city": ["chicago", "los angeles"]}) + def test_inverse_transform_unknown_value(self): + """Test the inverse transform with handle_unknown='value'. + + This should raise a warning as the unknown category cannot be inverted. 
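+ + Transform imputes the code -1 for unknown values, so the original category cannot be recovered.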
+ """ + train = pd.DataFrame({'city': ['chicago', 'st louis']}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) - enc = encoders.OrdinalEncoder(handle_missing="value", handle_unknown="value") + enc = encoders.OrdinalEncoder(handle_missing='value', handle_unknown='value') enc.fit(train) result = enc.transform(test) message = ( - "inverse_transform is not supported because transform impute " "the unknown category -1 when encode city" + 'inverse_transform is not supported because transform impute ' + 'the unknown category -1 when encode city' ) - with self.assertWarns(UserWarning, msg=message) as w: + with self.assertWarns(UserWarning, msg=message): enc.inverse_transform(result) - def test_inverse_transform_HaveNanInTrainAndHandleMissingValue_ExpectReturnedWithNan( - self, - ): - train = pd.DataFrame({"city": ["chicago", np.nan]}) + def test_inverse_transform_missing_value(self): + """Test the inverse transform with handle_missing='value'. - enc = encoders.OrdinalEncoder(handle_missing="value", handle_unknown="value") + This should output the original data if the input data is inverse transformed. + """ + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OrdinalEncoder(handle_missing='value', handle_unknown='value') result = enc.fit_transform(train) original = enc.inverse_transform(result) pd.testing.assert_frame_equal(train, original) - def test_inverse_transform_HaveNanInTrainAndHandleMissingReturnNan_ExpectReturnedWithNan( - self, - ): - train = pd.DataFrame({"city": ["chicago", np.nan]}) + def test_inverse_transform_missing_return_nan(self): + """Test the inverse transform with handle_missing='return_nan'. + + This should output the original data if the input data is inverse transformed. + """ + train = pd.DataFrame({'city': ['chicago', np.nan]}) - enc = encoders.OrdinalEncoder(handle_missing="return_nan", handle_unknown="value") + enc = encoders.OrdinalEncoder(handle_missing='return_nan', handle_unknown='value') result = enc.fit_transform(train) original = enc.inverse_transform(result) pd.testing.assert_frame_equal(train, original) - def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self): - train = pd.DataFrame({"city": ["chicago", np.nan]}) - test = pd.DataFrame({"city": ["chicago", "los angeles"]}) + def test_inverse_transform_missing_and_unknown_return_nan(self): + """Test the inverse transform with handle_missing and handle_unknown='return_nan'. - enc = encoders.OrdinalEncoder(handle_missing="return_nan", handle_unknown="return_nan") + This should raise a warning as the unknown category cannot be inverted. 
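+ + Both missing and unknown values are mapped to NaN here, so the inverse cannot tell them apart.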
+ """ + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.OrdinalEncoder(handle_missing='return_nan', handle_unknown='return_nan') enc.fit(train) result = enc.transform(test) message = ( - "inverse_transform is not supported because transform impute " "the unknown category nan when encode city" + 'inverse_transform is not supported because transform impute ' + 'the unknown category nan when encode city' ) - with self.assertWarns(UserWarning, msg=message) as w: + with self.assertWarns(UserWarning, msg=message): enc.inverse_transform(result) - def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self): - train = pd.DataFrame({"city": ["chicago", np.nan]}) - test = pd.DataFrame({"city": ["chicago", "los angeles"]}) - - enc = encoders.OrdinalEncoder(handle_missing="value", handle_unknown="return_nan") - enc.fit(train) - result = enc.transform(test) - original = enc.inverse_transform(result) - - pd.testing.assert_frame_equal(train, original) - - def test_inverse_transform_HaveHandleMissingValueAndHandleUnknownReturnNan_ExpectBestInverse( - self, - ): - train = pd.DataFrame({"city": ["chicago", np.nan]}) - test = pd.DataFrame({"city": ["chicago", np.nan, "los angeles"]}) - expected = pd.DataFrame({"city": ["chicago", np.nan, np.nan]}) - - enc = encoders.OrdinalEncoder(handle_missing="value", handle_unknown="return_nan") + def test_inverse_transform_handle_missing_value(self): + """Test that the inverse transform works with handle_missing='value'.""" + train = pd.DataFrame({'city': ['chicago', np.nan]}) + enc = encoders.OrdinalEncoder(handle_missing='value', handle_unknown='return_nan') enc.fit(train) - result = enc.transform(test) - original = enc.inverse_transform(result) - - pd.testing.assert_frame_equal(expected, original) + with self.subTest("Should treat unknown values as NaN values in the inverse."): + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + result = enc.transform(test) + original = enc.inverse_transform(result) + pd.testing.assert_frame_equal(train, original) + + with self.subTest("Should treat unknown and NaN values as NaN in the inverse."): + test = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']}) + expected = pd.DataFrame({'city': ['chicago', np.nan, np.nan]}) + result = enc.transform(test) + original = enc.inverse_transform(result) + pd.testing.assert_frame_equal(expected, original) def test_inverse_with_mapping(self): + """Test that the inverse transform works with a custom mapping.""" df = X.copy(deep=True) categoricals = [ - "unique_int", - "unique_str", - "invariant", - "underscore", - "none", - "extra", + 'unique_int', + 'unique_str', + 'invariant', + 'underscore', + 'none', + 'extra', ] mappings = { - "as Series": [ + 'as Series': [ { - "col": c, - "mapping": pd.Series(data=range(len(df[c].unique())), index=df[c].unique()), - "data_type": X[c].dtype, + 'col': c, + 'mapping': pd.Series(data=range(len(df[c].unique())), index=df[c].unique()), + 'data_type': X[c].dtype, } for c in categoricals ], - "as Dict": [{"col": c, "mapping": {k: idx for idx, k in enumerate(df[c].unique())}} for c in categoricals], + 'as Dict': [ + {'col': c, 'mapping': {k: idx for idx, k in enumerate(df[c].unique())}} + for c in categoricals + ], } for msg, mapping in mappings.items(): with self.subTest(msg): df = X.copy(deep=True) enc = encoders.OrdinalEncoder( cols=categoricals, - handle_unknown="ignore", + handle_unknown='ignore', mapping=mapping, return_df=True, ) @@ -359,23 +399,26 
@@ def test_inverse_with_mapping(self): pd.testing.assert_frame_equal(X[categoricals], recovered) def test_validate_mapping(self): + """Test that the mapping is validated correctly.""" custom_mapping = [ { - "col": "col1", - "mapping": {np.nan: 0, "a": 1, "b": 2}, + 'col': 'col1', + 'mapping': {np.nan: 0, 'a': 1, 'b': 2}, }, # The mapping from the documentation - {"col": "col2", "mapping": {np.nan: -3, "x": 11, "y": 2}}, + {'col': 'col2', 'mapping': {np.nan: -3, 'x': 11, 'y': 2}}, ] expected_valid_mapping = [ { - "col": "col1", - "mapping": pd.Series({np.nan: 0, "a": 1, "b": 2}), + 'col': 'col1', + 'mapping': pd.Series({np.nan: 0, 'a': 1, 'b': 2}), }, # The mapping from the documentation - {"col": "col2", "mapping": pd.Series({np.nan: -3, "x": 11, "y": 2})}, + {'col': 'col2', 'mapping': pd.Series({np.nan: -3, 'x': 11, 'y': 2})}, ] enc = encoders.OrdinalEncoder() actual_valid_mapping = enc._validate_supplied_mapping(custom_mapping) self.assertEqual(len(actual_valid_mapping), len(expected_valid_mapping)) for idx in range(len(actual_valid_mapping)): - self.assertEqual(actual_valid_mapping[idx]["col"], expected_valid_mapping[idx]["col"]) - pd.testing.assert_series_equal(actual_valid_mapping[idx]["mapping"], expected_valid_mapping[idx]["mapping"]) + self.assertEqual(actual_valid_mapping[idx]['col'], expected_valid_mapping[idx]['col']) + pd.testing.assert_series_equal( + actual_valid_mapping[idx]['mapping'], expected_valid_mapping[idx]['mapping'] + ) diff --git a/tests/test_polynomial.py b/tests/test_polynomial.py index a5edd3cb..e2edd32f 100644 --- a/tests/test_polynomial.py +++ b/tests/test_polynomial.py @@ -1,7 +1,10 @@ -import pandas as pd +"""Tests for the PolynomialEncoder.""" from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ -import numpy as np + import category_encoders as encoders +import numpy as np +import pandas as pd + from tests.helpers import deep_round a_encoding = [1, -0.7071067811865476, 0.40824829046386313] @@ -10,81 +13,64 @@ class TestPolynomialEncoder(TestCase): + """Tests for the PolynomialEncoder.""" - def test_polynomial_encoder_preserve_dimension_1(self): - train = ['A', 'B', 'C'] - test = ['A', 'D', 'E'] - - encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') - encoder.fit(train) - test_t = encoder.transform(test) - - expected = [a_encoding, - [1, 0, 0], - [1, 0, 0]] - self.assertEqual(deep_round(test_t.to_numpy().tolist()), deep_round(expected)) - - def test_polynomial_encoder_preserve_dimension_2(self): - train = ['A', 'B', 'C'] - test = ['B', 'D', 'E'] - - encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') - encoder.fit(train) - test_t = encoder.transform(test) - - expected = [b_encoding, - [1, 0, 0], - [1, 0, 0]] - self.assertEqual(deep_round(test_t.to_numpy().tolist()), deep_round(expected)) - - def test_polynomial_encoder_preserve_dimension_3(self): - train = ['A', 'B', 'C'] - test = ['A', 'B', 'C', None] - - encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') - encoder.fit(train) - test_t = encoder.transform(test) - - expected = [a_encoding, - b_encoding, - c_encoding, - [1, 0, 0]] - self.assertEqual(deep_round(test_t.to_numpy().tolist()), deep_round(expected)) - - def test_polynomial_encoder_preserve_dimension_4(self): + def test_handle_missing_and_unknown(self): + """Test that missing and unknown values are treated as values.""" train = ['A', 'B', 'C'] - test = ['D', 'B', 'C', None] - - encoder = 
encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') - encoder.fit(train) - test_t = encoder.transform(test) - - expected = [[1, 0, 0], - b_encoding, - c_encoding, - [1, 0, 0]] - self.assertEqual(deep_round(test_t.to_numpy().tolist()), deep_round(expected)) + expected_encoding_unknown = [1, 0, 0] + expected_1 = [a_encoding, expected_encoding_unknown, expected_encoding_unknown] + expected_2 = [b_encoding, expected_encoding_unknown, expected_encoding_unknown] + expected_3 = [a_encoding, b_encoding, c_encoding, expected_encoding_unknown] + expected_4 = [expected_encoding_unknown, b_encoding, c_encoding, expected_encoding_unknown] + cases = {"should preserve dimension 1": (['A', 'D', 'E'], expected_1), + "should preserve dimension 2": (['B', 'D', 'E'], expected_2), + "should preserve dimension 3": (['A', 'B', 'C', None], expected_3), + "should preserve dimension 4": (['D', 'B', 'C', None], expected_4), + } + for case, (test_data, expected) in cases.items(): + with self.subTest(case=case): + encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') + encoder.fit(train) + test_t = encoder.transform(test_data) + self.assertEqual(deep_round(test_t.to_numpy().tolist()), deep_round(expected)) def test_polynomial_encoder_2cols(self): + """Test the PolynomialEncoder with two columns.""" train = [['A', 'A'], ['B', 'B'], ['C', 'C']] encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) obtained = encoder.transform(train) - expected = [[1, a_encoding[1], a_encoding[2], a_encoding[1], a_encoding[2]], - [1, b_encoding[1], b_encoding[2], b_encoding[1], b_encoding[2]], - [1, c_encoding[1], c_encoding[2], c_encoding[1], c_encoding[2]]] + expected = [ + [1, a_encoding[1], a_encoding[2], a_encoding[1], a_encoding[2]], + [1, b_encoding[1], b_encoding[2], b_encoding[1], b_encoding[2]], + [1, c_encoding[1], c_encoding[2], c_encoding[1], c_encoding[2]], + ] self.assertEqual(deep_round(obtained.to_numpy().tolist()), deep_round(expected)) - def test_polynomial_encoder_2StringCols_ExpectCorrectOrder(self): - train = pd.DataFrame({'col1': [1, 2, 3, 4], - 'col2': ['A', 'B', 'C', 'D'], - 'col3': [1, 2, 3, 4], - 'col4': ['A', 'B', 'C', 'A'] - }, - columns=['col1', 'col2', 'col3', 'col4']) - expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3', 'col4_0', 'col4_1'] + def test_correct_order(self): + """Test that the order is correct when auto-detecting multiple columns.""" + train = pd.DataFrame( + { + 'col1': [1, 2, 3, 4], + 'col2': ['A', 'B', 'C', 'D'], + 'col3': [1, 2, 3, 4], + 'col4': ['A', 'B', 'C', 'A'], + }, + columns=['col1', 'col2', 'col3', 'col4'], + ) + expected_columns = [ + 'intercept', + 'col1', + 'col2_0', + 'col2_1', + 'col2_2', + 'col3', + 'col4_0', + 'col4_1', + ] encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) @@ -92,59 +78,40 @@ def test_polynomial_encoder_2StringCols_ExpectCorrectOrder(self): self.assertTrue(np.array_equal(expected_columns, columns)) - def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): - train = ['A', 'B', np.nan] + def test_handle_missing_is_indicator(self): + """Test that missing values are encoded with an indicator column.""" + with self.subTest("missing values in the training set are encoded with an " + "indicator column"): + train = ['A', 'B', np.nan] - encoder = encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='value') - result = encoder.fit_transform(train) + encoder = 
encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) - expected = [a_encoding, - b_encoding, - c_encoding] - self.assertTrue(np.array_equal(deep_round(result.to_numpy().tolist()), deep_round(expected))) + expected = [a_encoding, b_encoding, c_encoding] + self.assertListEqual(deep_round(result.to_numpy().tolist()), deep_round(expected)) - def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): - train = ['A', 'B'] + with self.subTest("should fit an indicator column for missing values " + "even if not present in the training set"): + train = ['A', 'B'] - encoder = encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='value') - result = encoder.fit_transform(train) + encoder = encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) - expected = [a_encoding, - b_encoding] - self.assertEqual(deep_round(result.to_numpy().tolist()), deep_round(expected)) + expected = [a_encoding, b_encoding] + self.assertEqual(deep_round(result.to_numpy().tolist()), deep_round(expected)) - def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): - train = ['A', 'B'] - test = ['A', 'B', np.nan] + test = ['A', 'B', np.nan] + result = encoder.transform(test) + expected = [a_encoding, b_encoding, c_encoding] + self.assertEqual(deep_round(result.to_numpy().tolist()), deep_round(expected)) - encoder = encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='value') - encoder.fit(train) - result = encoder.transform(test) - - expected = [a_encoding, - b_encoding, - c_encoding] - self.assertEqual(deep_round(result.to_numpy().tolist()), deep_round(expected)) + # unknown value is encoded as zeros (only one at indicator) + test = ['A', 'B', 'C'] + result = encoder.transform(test) + expected = [a_encoding, b_encoding, [1, 0, 0]] + self.assertEqual(deep_round(result.to_numpy().tolist()), deep_round(expected)) - def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): - train = ['A', 'B'] - test = ['A', 'B', 'C'] - - encoder = encoders.PolynomialEncoder(handle_unknown='indicator') - encoder.fit(train) - result = encoder.transform(test) - expected = [a_encoding, - b_encoding, - c_encoding] - self.assertEqual(deep_round(result.to_numpy().tolist()), deep_round(expected)) - def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): - train = ['A', 'B'] - encoder = encoders.PolynomialEncoder(handle_unknown='indicator') - result = encoder.fit_transform(train) - expected = [a_encoding, - b_encoding] - self.assertEqual(deep_round(result.to_numpy().tolist()), deep_round(expected)) diff --git a/tests/test_quantile_encoder.py b/tests/test_quantile_encoder.py index c8281a25..78a14cdd 100644 --- a/tests/test_quantile_encoder.py +++ b/tests/test_quantile_encoder.py @@ -1,65 +1,61 @@ +"""Tests for quantile encoder.""" import unittest -import pandas as pd import category_encoders as encoders import numpy as np +import pandas as pd class TestQuantileEncoder(unittest.TestCase): """Tests for percentile encoder.""" def setUp(self): - """Create dataframe with categories and a target variable""" - - self.df = pd.DataFrame({"categories": ["a", "b", "c", "a", "b", "c", "a", "b"]}) + """Create dataframe with categories and a target variable.""" + self.df = pd.DataFrame({'categories': ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b']}) self.target = np.array([1, 2, 0, 4, 5, 0, 6, 7]) def test_median_works(self): - """ + """Test that median encoder works. 
+ Expected output of percentile 50 in df: - a median is 4 (a values are 1, 4, 6) - b median is 5 (b values are 2, 5, 7) - c median is 0 (c values are 0) """ - - expected_output_median = pd.DataFrame( - {"categories": [4.0, 5, 0, 4, 5, 0, 4, 5]} - ) + expected_output_median = pd.DataFrame({'categories': [4.0, 5, 0, 4, 5, 0, 4, 5]}) pd.testing.assert_frame_equal( - encoders.QuantileEncoder(quantile=0.5, m=0.0).fit_transform( - self.df, self.target - ), + encoders.QuantileEncoder(quantile=0.5, m=0.0).fit_transform(self.df, self.target), expected_output_median, ) def test_max_works(self): - """ + """Test that maximum (=percentile 100) encoder works. + Expected output of percentile 100 in df: - a max is 6 - b max is 7 - c max is 0 """ - expected_output_max = pd.DataFrame({"categories": [6.0, 7, 0, 6, 7, 0, 6, 7]}) + expected_output_max = pd.DataFrame({'categories': [6.0, 7, 0, 6, 7, 0, 6, 7]}) pd.testing.assert_frame_equal( - encoders.QuantileEncoder(quantile=1, m=0.0).fit_transform( - self.df, self.target - ), + encoders.QuantileEncoder(quantile=1, m=0.0).fit_transform(self.df, self.target), expected_output_max, ) def test_new_category(self): - """ + """Test that unknown values are encoded with global mean. + The global median of the target is 3. If new categories are passed to the transformer, then the output should be 3 """ transformer_median = encoders.QuantileEncoder(quantile=0.5, m=0.0) transformer_median.fit(self.df, self.target) - new_df = pd.DataFrame({"categories": ["d", "e"]}) + new_df = pd.DataFrame({'categories': ['d', 'e']}) - new_medians = pd.DataFrame({"categories": [3.0, 3.0]}) + new_medians = pd.DataFrame({'categories': [3.0, 3.0]}) pd.testing.assert_frame_equal(transformer_median.transform(new_df), new_medians) @@ -68,18 +64,13 @@ class TestSummaryEncoder(unittest.TestCase): """Tests for summary encoder.""" def setUp(self): - """Create dataframe with categories and a target variable""" - - self.df = pd.DataFrame({"categories": ["a", "b", "c", "a", "b", "c", "a", "b"]}) + """Create dataframe with categories and a target variable.""" + self.df = pd.DataFrame({'categories': ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b']}) self.target = np.array([1, 2, 0, 4, 5, 0, 6, 7]) - self.col = "categories" + self.col = 'categories' def assert_same_quantile(self, quantile): - """ - Given a quantile, compares if the summary encoder and the quantile encoder provide the same results - - """ - + """Check the summary encoder with a single quantile coincides with the quantile encoder.""" quantile_results = encoders.QuantileEncoder( cols=[self.col], quantile=quantile ).fit_transform(self.df, self.target) @@ -89,26 +80,19 @@ def assert_same_quantile(self, quantile): ).fit_transform(self.df, self.target) percentile = round(quantile * 100) - col_name = str(self.col)+'_'+str(percentile) + col_name = str(self.col) + '_' + str(percentile) np.testing.assert_allclose( quantile_results[self.col].values, summary_results[col_name].values, ) def test_several_quantiles(self): - """ - Check that all quantiles of the QE are in the summary encoder - """ - + """Check that all quantiles of the QE are in the summary encoder.""" for quantile in [0.1, 0.5, 0.9]: self.assert_same_quantile(quantile) def test_several_quantiles_reverse(self): - """ - Checks that all quantiles of summary encoder are in the quantile encoder - - """ - + """Check that all quantiles of summary encoder are in the quantile encoder.""" quantile_list = [0.2, 0.1, 0.8] summary_results = encoders.SummaryEncoder( @@ -116,13 +100,12 @@ def 
test_several_quantiles_reverse(self): ).fit_transform(self.df, self.target) for quantile in quantile_list: - quantile_results = encoders.QuantileEncoder( cols=[self.col], quantile=quantile ).fit_transform(self.df, self.target) percentile = round(quantile * 100) - col_name = str(self.col)+'_'+str(percentile) + col_name = str(self.col) + '_' + str(percentile) np.testing.assert_allclose( quantile_results[self.col].values, diff --git a/tests/test_rankhot.py b/tests/test_rankhot.py index 5cb114c5..c73343ae 100644 --- a/tests/test_rankhot.py +++ b/tests/test_rankhot.py @@ -1,8 +1,11 @@ -import pandas as pd +"""Tests for the RankHotEncoder.""" from unittest import TestCase -import tests.helpers as th -import numpy as np + import category_encoders as encoders +import numpy as np +import pandas as pd + +import tests.helpers as th np_X = th.create_array(n_rows=100) np_X_t = th.create_array(n_rows=50, extras=True) @@ -15,8 +18,11 @@ class TestRankHotEncoder(TestCase): + """Tests for the RankHotEncoder.""" - def test_handlenanvalue(self): + def test_handle_nan_value(self): + """Test that NaN values are handled correctly.""" + # @ToDo this test checks the behaviour of handle_unknown rather than handle_missing. enc = encoders.RankHotEncoder(handle_unknown='value', cols=['none']) enc.fit(X) t_f = enc.transform(X) @@ -24,7 +30,8 @@ def test_handlenanvalue(self): self.assertEqual(t_f.shape[1] - (X.shape[1] - 1), len(X.none.unique())) self.assertTupleEqual(inv_tf.shape, X.shape) - def test_handleCategoricalValue(self): + def test_handle_pandas_categorical(self): + """Test that the RankHotEncoder works with pandas Categorical data.""" enc = encoders.RankHotEncoder(cols=['categorical']) enc.fit(X) t_f = enc.transform(X) @@ -32,23 +39,48 @@ def test_handleCategoricalValue(self): self.assertEqual(t_f.shape[1] - (X.shape[1] - 1), len(X.categorical.unique())) self.assertTupleEqual(inv_tf.shape, X.shape) - def test_naCatagoricalValue(self): + def test_na_categorical(self): + """Test that NAs in pandas categorical columns are handled correctly.""" enc = encoders.RankHotEncoder(handle_unknown='value', cols=['na_categorical']) enc.fit(X) t_f = enc.transform(X) inv_tf = enc.inverse_transform(t_f) self.assertTupleEqual(inv_tf.shape, X.shape) - def test_extraValue(self): - train = pd.DataFrame({'city': ['chicago', 'st louis', 'chicago', "st louis"]}) + def test_unknown_value(self): + """Test that unknown values are handled correctly.""" + train = pd.DataFrame({'city': ['chicago', 'st louis', 'chicago', 'st louis']}) test = pd.DataFrame({'city': ['chicago', 'los angeles']}) enc = encoders.RankHotEncoder(handle_unknown='value') train_out = enc.fit_transform(train) - expected_mapping = pd.DataFrame([[1, 0], [1, 1], ], columns=["city_1", "city_2"], index=[1, 2]) - expected_out_train = pd.DataFrame([[1, 0], [1, 1], [1, 0], [1, 1], ], columns=["city_1", "city_2"]) - expected_out_test = pd.DataFrame([[1, 0], [0, 0], ], columns=["city_1", "city_2"]) + expected_mapping = pd.DataFrame( + [ + [1, 0], + [1, 1], + ], + columns=['city_1', 'city_2'], + index=[1, 2], + ) + expected_out_train = pd.DataFrame( + [ + [1, 0], + [1, 1], + [1, 0], + [1, 1], + ], + columns=['city_1', 'city_2'], + ) + expected_out_test = pd.DataFrame( + [ + [1, 0], + [0, 0], + ], + columns=['city_1', 'city_2'], + ) pd.testing.assert_frame_equal(train_out, expected_out_train) - pd.testing.assert_frame_equal(enc.mapping[0]["mapping"], expected_mapping, check_dtype=False) + pd.testing.assert_frame_equal( + enc.mapping[0]['mapping'], expected_mapping,
check_dtype=False + ) t_f = enc.transform(test) pd.testing.assert_frame_equal(t_f, expected_out_test) inv_tf = enc.inverse_transform(t_f) @@ -56,19 +88,22 @@ def test_extraValue(self): th.verify_inverse_transform(expected_inverse_test, inv_tf) def test_invariant(self): + """Test that the invariant columns are dropped.""" enc = encoders.RankHotEncoder(cols=['invariant'], drop_invariant=True) enc.fit(X) - self.assertFalse(any([c.startswith("invariant") for c in enc.feature_names_out_])) - self.assertTrue(any([c.startswith("invariant") for c in enc.invariant_cols])) + self.assertFalse(any(c.startswith('invariant') for c in enc.feature_names_out_)) + self.assertTrue(any(c.startswith('invariant') for c in enc.invariant_cols)) - def test_categoricalNaming(self): + def test_categorical_naming(self): + """Test that the categorical names are used in the output.""" train = pd.DataFrame({'city': ['chicago', 'st louis']}) enc = encoders.RankHotEncoder(use_cat_names=True) enc.fit(train) tf = enc.transform(train) self.assertListEqual(['city_chicago', 'city_st louis'], list(tf.columns)) - def test_rankhot(self): + def test_inverse_transform(self): + """Test that the inverse transform is the inverse of the transform.""" enc = encoders.RankHotEncoder(verbose=1) enc.fit(X) t_f = enc.transform(X) @@ -76,18 +111,22 @@ def test_rankhot(self): th.verify_inverse_transform(X, inv_tf) def test_order(self): + """Test that the mapping is independent of the order of the input data. + + Since RankHotEncoding respects the order in ordinal variables, + the mapping should be independent of input order """ - Since RankHotEncoding respects the order in ordinal variables, the mapping should be independent of input order - """ - train_order_1 = pd.DataFrame({'grade': ['B', 'A', 'C', 'F', 'D', 'C', 'F', 'D'], - "ord_var": [1, 3, 2, 2, 2, 1, 3, 1]}) - train_order_2 = pd.DataFrame({'grade': ['A', 'D', 'C', 'B', 'C', 'F', 'F', 'D'], - "ord_var": [3, 1, 2, 2, 2, 1, 3, 1]}) - enc = encoders.RankHotEncoder(cols=["grade", "ord_var"]) + train_order_1 = pd.DataFrame( + {'grade': ['B', 'A', 'C', 'F', 'D', 'C', 'F', 'D'], 'ord_var': [1, 3, 2, 2, 2, 1, 3, 1]} + ) + train_order_2 = pd.DataFrame( + {'grade': ['A', 'D', 'C', 'B', 'C', 'F', 'F', 'D'], 'ord_var': [3, 1, 2, 2, 2, 1, 3, 1]} + ) + enc = encoders.RankHotEncoder(cols=['grade', 'ord_var']) enc.fit(train_order_1) mapping_order_1 = enc.ordinal_encoder.mapping enc.fit(train_order_2) mapping_order_2 = enc.ordinal_encoder.mapping - for m1, m2 in zip(mapping_order_1, mapping_order_2): - self.assertEqual(m1["col"], m2["col"]) - pd.testing.assert_series_equal(m1["mapping"], m2["mapping"]) + for m1, m2 in zip(mapping_order_1, mapping_order_2, strict=False): + self.assertEqual(m1['col'], m2['col']) + pd.testing.assert_series_equal(m1['mapping'], m2['mapping']) diff --git a/tests/test_sum_coding.py b/tests/test_sum_coding.py index 991cc171..ee5dcfd5 100644 --- a/tests/test_sum_coding.py +++ b/tests/test_sum_coding.py @@ -1,7 +1,9 @@ -import pandas as pd -from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ -import numpy as np +"""Unit tests for the SumEncoder.""" +from unittest import TestCase + import category_encoders as encoders +import numpy as np +import pandas as pd a_encoding = [1, 1, 0] b_encoding = [1, 0, 1] @@ -9,81 +11,69 @@ class TestSumEncoder(TestCase): + """Unit tests for the SumEncoder.""" - def test_sum_encoder_preserve_dimension_1(self): - train = ['A', 'B', 'C'] - test = ['A', 'D', 'E'] - - encoder = encoders.SumEncoder(handle_unknown='value', 
handle_missing='value') - encoder.fit(train) - test_t = encoder.transform(test) - - expected = [a_encoding, - [1, 0, 0], - [1, 0, 0]] - self.assertEqual(test_t.to_numpy().tolist(), expected) - - def test_sum_encoder_preserve_dimension_2(self): - train = ['A', 'B', 'C'] - test = ['B', 'D', 'E'] - - encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') - encoder.fit(train) - test_t = encoder.transform(test) - - expected = [b_encoding, - [1, 0, 0], - [1, 0, 0]] - self.assertEqual(test_t.to_numpy().tolist(), expected) - - def test_sum_encoder_preserve_dimension_3(self): + def test_unknown_and_missing(self): + """Test the SumEncoder with the handle unknown = 'value' strategy.""" train = ['A', 'B', 'C'] - test = ['A', 'B', 'C', None] encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) - test_t = encoder.transform(test) - - expected = [a_encoding, - b_encoding, - c_encoding, - [1, 0, 0]] - self.assertEqual(test_t.to_numpy().tolist(), expected) - - def test_sum_encoder_preserve_dimension_4(self): - train = ['A', 'B', 'C'] - test = ['D', 'B', 'C', None] - - encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') - encoder.fit(train) - test_t = encoder.transform(test) - - expected = [[1, 0, 0], - b_encoding, - c_encoding, - [1, 0, 0]] - self.assertEqual(test_t.to_numpy().tolist(), expected) + dim_1_test = ['A', 'D', 'E'] + dim_1_expected = [a_encoding, [1, 0, 0], [1, 0, 0]] + dim_2_test = ['B', 'D', 'E'] + dim_2_expected = [b_encoding, [1, 0, 0], [1, 0, 0]] + dim_3_test = ['A', 'B', 'C', None] + dim_3_expected = [a_encoding, b_encoding, c_encoding, [1, 0, 0]] + + dim_4_test = ['D', 'B', 'C', None] + dim_4_expected = [[1, 0, 0], b_encoding, c_encoding, [1, 0, 0]] + cases = {"should preserve dimension 1": (dim_1_test, dim_1_expected), + "should preserve dimension 2": (dim_2_test, dim_2_expected), + "should preserve dimension 3": (dim_3_test, dim_3_expected), + "should preserve dimension 4": (dim_4_test, dim_4_expected), + } + for case, (test_data, expected) in cases.items(): + with self.subTest(case=case): + test_t = encoder.transform(test_data) + self.assertEqual(test_t.to_numpy().tolist(), expected) def test_sum_encoder_2cols(self): + """Test the SumEncoder with two columns.""" train = [['A', 'A'], ['B', 'B'], ['C', 'C']] encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) obtained = encoder.transform(train) - expected = [[1, a_encoding[1], a_encoding[2], a_encoding[1], a_encoding[2]], - [1, b_encoding[1], b_encoding[2], b_encoding[1], b_encoding[2]], - [1, c_encoding[1], c_encoding[2], c_encoding[1], c_encoding[2]]] + expected = [ + [1, a_encoding[1], a_encoding[2], a_encoding[1], a_encoding[2]], + [1, b_encoding[1], b_encoding[2], b_encoding[1], b_encoding[2]], + [1, c_encoding[1], c_encoding[2], c_encoding[1], c_encoding[2]], + ] self.assertEqual(obtained.to_numpy().tolist(), expected) - def test_sum_encoder_2StringCols_ExpectCorrectOrder(self): - train = pd.DataFrame({'col1': [1, 2, 3, 4], - 'col2': ['A', 'B', 'C', 'D'], - 'col3': [1, 2, 3, 4], - 'col4': ['A', 'B', 'C', 'A'] - }, - columns=['col1', 'col2', 'col3', 'col4']) - expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3', 'col4_0', 'col4_1'] + def test_multiple_columns_correct_order(self): + """Test that the order is correct when auto-detecting multiple columns.""" + train = pd.DataFrame( + { + 'col1': [1, 2, 3, 4], + 'col2': ['A', 'B', 'C', 'D'], + 'col3': [1, 2, 3, 4], + 'col4': 
['A', 'B', 'C', 'A'], + }, + columns=['col1', 'col2', 'col3', 'col4'], + ) + expected_columns = [ + 'intercept', + 'col1', + 'col2_0', + 'col2_1', + 'col2_2', + 'col3', + 'col4_0', + 'col4_1', + ] encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) @@ -91,59 +81,36 @@ def test_sum_encoder_2StringCols_ExpectCorrectOrder(self): self.assertTrue(np.array_equal(expected_columns, columns)) - def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): - train = ['A', 'B', np.nan] - - encoder = encoders.SumEncoder(handle_missing='indicator', handle_unknown='value') - result = encoder.fit_transform(train) + def test_handle_missing_is_indicator(self): + """Test that missing values are encoded with an indicator column.""" + with self.subTest("missing values in the training set are encoded with an " + "indicator column"): + train = ['A', 'B', np.nan] - expected = [a_encoding, - b_encoding, - c_encoding] - self.assertTrue(np.array_equal(result.to_numpy().tolist(), expected)) + encoder = encoders.SumEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) - def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): - train = ['A', 'B'] + expected = [a_encoding, b_encoding, c_encoding] + self.assertListEqual(result.to_numpy().tolist(), expected) - encoder = encoders.SumEncoder(handle_missing='indicator', handle_unknown='value') - result = encoder.fit_transform(train) - - expected = [a_encoding, - b_encoding] - self.assertEqual(result.to_numpy().tolist(), expected) - - def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): - train = ['A', 'B'] - test = ['A', 'B', np.nan] - - encoder = encoders.SumEncoder(handle_missing='indicator', handle_unknown='value') - encoder.fit(train) - result = encoder.transform(test) - - expected = [a_encoding, - b_encoding, - c_encoding] - self.assertEqual(result.to_numpy().tolist(), expected) - - def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): - train = ['A', 'B'] - test = ['A', 'B', 'C'] - - encoder = encoders.SumEncoder(handle_unknown='indicator', handle_missing='value') - encoder.fit(train) - result = encoder.transform(test) + with self.subTest("should fit an indicator column for missing values " + "even if not present in the training set"): + train = ['A', 'B'] - expected = [a_encoding, - b_encoding, - c_encoding] - self.assertEqual(result.to_numpy().tolist(), expected) + encoder = encoders.SumEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) - def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): - train = ['A', 'B'] + expected = [a_encoding, b_encoding] + self.assertEqual(result.to_numpy().tolist(), expected) - encoder = encoders.SumEncoder(handle_unknown='indicator', handle_missing='value') - result = encoder.fit_transform(train) + test = ['A', 'B', np.nan] + result = encoder.transform(test) + expected = [a_encoding, b_encoding, c_encoding] + self.assertEqual(result.to_numpy().tolist(), expected) - expected = [a_encoding, - b_encoding] - self.assertEqual(result.to_numpy().tolist(), expected) + # unknown value should be encoded with value strategy, + # i.e. 
indicator 1 and all other columns zeros + test = ['A', 'B', 'C'] + result = encoder.transform(test) + expected = [a_encoding, b_encoding, [1, 0, 0]] + self.assertEqual(result.to_numpy().tolist(), expected) diff --git a/tests/test_target_encoder.py b/tests/test_target_encoder.py index 154476dd..4196a52c 100644 --- a/tests/test_target_encoder.py +++ b/tests/test_target_encoder.py @@ -1,43 +1,108 @@ -import pandas as pd +"""Tests for the TargetEncoder class.""" from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ -import tests.helpers as th -import numpy as np -from category_encoders.datasets import load_postcodes, load_compass import category_encoders as encoders +import numpy as np +import pandas as pd +from category_encoders.datasets import load_compass, load_postcodes + +import tests.helpers as th class TestTargetEncoder(TestCase): + """Unit tests for the Target Encoder.""" def setUp(self): + """Set up the test case.""" self.hierarchical_cat_example = pd.DataFrame( { - 'Compass': ['N', 'N', 'NE', 'NE', 'NE', 'SE', 'SE', 'S', 'S', 'S', 'S', 'W', 'W', 'W', 'W', 'W'], - 'Speed': ['slow', 'slow', 'slow', 'slow', 'medium', 'medium', 'medium', 'fast', 'fast', 'fast', 'fast', - 'fast', 'fast', 'fast', 'fast', 'fast'], - 'Animal': ['Cat', 'Cat', 'Cat', 'Cat', 'Cat', 'Dog', 'Dog', 'Dog', 'Dog', - 'Dog', 'Dog', 'Tiger', 'Tiger', 'Wolf', 'Wolf', 'Cougar'], - 'Plant': ['Rose', 'Rose', 'Rose', 'Rose', 'Daisy', 'Daisy', 'Daisy', 'Daisy', 'Daffodil', - 'Daffodil', 'Daffodil', 'Daffodil', 'Bluebell', 'Bluebell', 'Bluebell', 'Bluebell'], - 'target': [1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1] - }, columns=['Compass', 'Speed', 'Animal', 'Plant', 'target']) - self.hierarchical_map = { - 'Compass': { - 'N': ('N', 'NE'), - 'S': ('S', 'SE'), - 'W': 'W' - }, - 'Animal': { - 'Feline': ('Cat', 'Tiger', 'Cougar'), - 'Canine': ('Dog', 'Wolf') + 'Compass': [ + 'N', + 'N', + 'NE', + 'NE', + 'NE', + 'SE', + 'SE', + 'S', + 'S', + 'S', + 'S', + 'W', + 'W', + 'W', + 'W', + 'W', + ], + 'Speed': [ + 'slow', + 'slow', + 'slow', + 'slow', + 'medium', + 'medium', + 'medium', + 'fast', + 'fast', + 'fast', + 'fast', + 'fast', + 'fast', + 'fast', + 'fast', + 'fast', + ], + 'Animal': [ + 'Cat', + 'Cat', + 'Cat', + 'Cat', + 'Cat', + 'Dog', + 'Dog', + 'Dog', + 'Dog', + 'Dog', + 'Dog', + 'Tiger', + 'Tiger', + 'Wolf', + 'Wolf', + 'Cougar', + ], + 'Plant': [ + 'Rose', + 'Rose', + 'Rose', + 'Rose', + 'Daisy', + 'Daisy', + 'Daisy', + 'Daisy', + 'Daffodil', + 'Daffodil', + 'Daffodil', + 'Daffodil', + 'Bluebell', + 'Bluebell', + 'Bluebell', + 'Bluebell', + ], + 'target': [1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1], }, + columns=['Compass', 'Speed', 'Animal', 'Plant', 'target'], + ) + self.hierarchical_map = { + 'Compass': {'N': ('N', 'NE'), 'S': ('S', 'SE'), 'W': 'W'}, + 'Animal': {'Feline': ('Cat', 'Tiger', 'Cougar'), 'Canine': ('Dog', 'Wolf')}, 'Plant': { 'Flower': ('Rose', 'Daisy', 'Daffodil', 'Bluebell'), - 'Tree': ('Ash', 'Birch') + 'Tree': ('Ash', 'Birch'), }, } def test_target_encoder(self): + """Test that no error occurs when calling the target encoder.""" np_X = th.create_array(n_rows=100) np_X_t = th.create_array(n_rows=50, extras=True) np_y = np.random.randn(np_X.shape[0]) > 0.5 @@ -51,12 +116,16 @@ def test_target_encoder(self): th.verify_numeric(enc.transform(X_t)) th.verify_numeric(enc.transform(X_t, y_t)) - def test_target_encoder_fit_HaveConstructorSetSmoothingAndMinSamplesLeaf_ExpectUsedInFit(self): + def test_fit(self): + """Test the fit method and correct values are fitted.""" k = 2 
f = 10 binary_cat_example = pd.DataFrame( - {'Trend': ['UP', 'UP', 'DOWN', 'FLAT', 'DOWN', 'UP', 'DOWN', 'FLAT', 'FLAT', 'FLAT'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + { + 'Trend': ['UP', 'UP', 'DOWN', 'FLAT', 'DOWN', 'UP', 'DOWN', 'FLAT', 'FLAT', 'FLAT'], + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1], + } + ) encoder = encoders.TargetEncoder(cols=['Trend'], min_samples_leaf=k, smoothing=f) encoder.fit(binary_cat_example, binary_cat_example['target']) trend_mapping = encoder.mapping['Trend'] @@ -66,41 +135,45 @@ def test_target_encoder_fit_HaveConstructorSetSmoothingAndMinSamplesLeaf_ExpectU self.assertEqual(0.5, trend_mapping[ordinal_mapping.loc['FLAT']]) self.assertAlmostEqual(0.5874, trend_mapping[ordinal_mapping.loc['UP']], delta=1e-4) - def test_target_encoder_fit_transform_HaveConstructorSetSmoothingAndMinSamplesLeaf_ExpectCorrectValueInResult(self): - k = 2 - f = 10 - binary_cat_example = pd.DataFrame( - {'Trend': ['UP', 'UP', 'DOWN', 'FLAT', 'DOWN', 'UP', 'DOWN', 'FLAT', 'FLAT', 'FLAT'], - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - encoder = encoders.TargetEncoder(cols=['Trend'], min_samples_leaf=k, smoothing=f) - result = encoder.fit_transform(binary_cat_example, binary_cat_example['target']) - values = result['Trend'].array - self.assertAlmostEqual(0.5874, values[0], delta=1e-4) - self.assertAlmostEqual(0.5874, values[1], delta=1e-4) - self.assertAlmostEqual(0.4125, values[2], delta=1e-4) - self.assertEqual(0.5, values[3]) - def test_target_encoder_fit_transform_HaveCategoricalColumn_ExpectCorrectValueInResult(self): + def test_fit_transform(self): + """Test the good case without unknowns or NaN values.""" k = 2 f = 10 - binary_cat_example = pd.DataFrame( - {'Trend': pd.Categorical(['UP', 'UP', 'DOWN', 'FLAT', 'DOWN', 'UP', 'DOWN', 'FLAT', 'FLAT', 'FLAT'], - categories=['UP', 'FLAT', 'DOWN']), - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) - encoder = encoders.TargetEncoder(cols=['Trend'], min_samples_leaf=k, smoothing=f) - result = encoder.fit_transform(binary_cat_example, binary_cat_example['target']) - values = result['Trend'].array - self.assertAlmostEqual(0.5874, values[0], delta=1e-4) - self.assertAlmostEqual(0.5874, values[1], delta=1e-4) - self.assertAlmostEqual(0.4125, values[2], delta=1e-4) - self.assertEqual(0.5, values[3]) - - def test_target_encoder_fit_transform_HaveNanValue_ExpectCorrectValueInResult(self): + training_data = ['UP', 'UP', 'DOWN', 'FLAT', 'DOWN', 'UP', 'DOWN', 'FLAT', 'FLAT', 'FLAT'] + cases = {"with list input": training_data, + "with pd categorical input": pd.Categorical(training_data, + categories=['UP', 'FLAT', 'DOWN'])} + target = [1, 1, 0, 0, 1, 0, 0, 0, 1, 1] + + for case, input_data in cases.items(): + with self.subTest(case): + binary_cat_example = pd.DataFrame( + { + 'Trend': input_data, + 'target': target + } + ) + encoder = encoders.TargetEncoder(cols=['Trend'], min_samples_leaf=k, smoothing=f) + result = encoder.fit_transform(binary_cat_example, binary_cat_example['target']) + values = result['Trend'].array + self.assertAlmostEqual(0.5874, values[0], delta=1e-4) + self.assertAlmostEqual(0.5874, values[1], delta=1e-4) + self.assertAlmostEqual(0.4125, values[2], delta=1e-4) + self.assertEqual(0.5, values[3]) + + def test_fit_transform_with_nan(self): + """Test that the encoder works with NaN values.""" k = 2 f = 10 binary_cat_example = pd.DataFrame( - {'Trend': pd.Series([np.nan, np.nan, 'DOWN', 'FLAT', 'DOWN', np.nan, 'DOWN', 'FLAT', 'FLAT', 'FLAT']), - 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + { + 'Trend': pd.Series( + [np.nan, 
np.nan, 'DOWN', 'FLAT', 'DOWN', np.nan, 'DOWN', 'FLAT', 'FLAT', 'FLAT'] + ), + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1], + } + ) encoder = encoders.TargetEncoder(cols=['Trend'], min_samples_leaf=k, smoothing=f) result = encoder.fit_transform(binary_cat_example, binary_cat_example['target']) values = result['Trend'].array @@ -109,10 +182,11 @@ def test_target_encoder_fit_transform_HaveNanValue_ExpectCorrectValueInResult(se self.assertAlmostEqual(0.4125, values[2], delta=1e-4) self.assertEqual(0.5, values[3]) - def test_HandleMissingIsValueAndNanInTest_ExpectMean(self): - df = pd.DataFrame({ - 'color': ["a", "a", "a", "b", "b", "b"], - 'outcome': [1.6, 0, 0, 1, 0, 1]}) + def test_handle_missing_value(self): + """Should set the global mean value for missing values if handle_missing=value.""" + df = pd.DataFrame( + {'color': ['a', 'a', 'a', 'b', 'b', 'b'], 'outcome': [1.6, 0, 0, 1, 0, 1]} + ) train = df.drop('outcome', axis=1) target = df.drop('color', axis=1) @@ -123,10 +197,11 @@ def test_HandleMissingIsValueAndNanInTest_ExpectMean(self): enc.fit(train, target['outcome']) obtained = enc.transform(test, test_target) - self.assertEqual(.6, list(obtained['color'])[0]) + self.assertEqual(0.6, list(obtained['color'])[0]) - def test_HandleUnknownValue_HaveUnknownInTest_ExpectMean(self): - train = pd.Series(["a", "a", "a", "b", "b", "b"], name='color') + def test_handle_unknown_value(self): + """Test that encoder sets the global mean value for unknown values.""" + train = pd.Series(['a', 'a', 'a', 'b', 'b', 'b'], name='color') target = pd.Series([1.6, 0, 0, 1, 0, 1], name='target') test = pd.Series(['c', 'b'], name='color') test_target = pd.Series([0, 0]) @@ -135,12 +210,20 @@ def test_HandleUnknownValue_HaveUnknownInTest_ExpectMean(self): enc.fit(train, target) obtained = enc.transform(test, test_target) - self.assertEqual(.6, list(obtained['color'])[0]) + self.assertEqual(0.6, list(obtained['color'])[0]) def test_hierarchical_smoothing(self): - - enc = encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=self.hierarchical_map, cols=['Compass']) - result = enc.fit_transform(self.hierarchical_cat_example, self.hierarchical_cat_example['target']) + """Test that encoder works with a hierarchical mapping.""" + enc = encoders.TargetEncoder( + verbose=1, + smoothing=2, + min_samples_leaf=2, + hierarchy=self.hierarchical_map, + cols=['Compass'], + ) + result = enc.fit_transform( + self.hierarchical_cat_example, self.hierarchical_cat_example['target'] + ) values = result['Compass'].array self.assertAlmostEqual(0.6226, values[0], delta=1e-4) self.assertAlmostEqual(0.9038, values[2], delta=1e-4) @@ -149,9 +232,17 @@ def test_hierarchical_smoothing(self): self.assertAlmostEqual(0.4033, values[11], delta=1e-4) def test_hierarchical_smoothing_multi(self): - - enc = encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=self.hierarchical_map, cols=['Compass', 'Speed', 'Animal']) - result = enc.fit_transform(self.hierarchical_cat_example, self.hierarchical_cat_example['target']) + """Test that the encoder works with multiple columns.""" + enc = encoders.TargetEncoder( + verbose=1, + smoothing=2, + min_samples_leaf=2, + hierarchy=self.hierarchical_map, + cols=['Compass', 'Speed', 'Animal'], + ) + result = enc.fit_transform( + self.hierarchical_cat_example, self.hierarchical_cat_example['target'] + ) values = result['Compass'].array self.assertAlmostEqual(0.6226, values[0], delta=1e-4) @@ -173,9 +264,17 @@ def test_hierarchical_smoothing_multi(self): 
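For anyone cross-checking the constants asserted in `test_fit` and `test_fit_transform` above (`min_samples_leaf=2`, `smoothing=10`): the target encoder blends each category's mean target with the global prior through a sigmoid weight. The sketch below reproduces the expected values; it assumes the sigmoid interpolation formula used by `TargetEncoder`, with the counts and means read off the ten-row `Trend` fixture.

```python
# Sanity check for the values asserted in test_fit: prior = 5 positives / 10 rows = 0.5,
# and per category (count n, mean target): UP (3, 2/3), FLAT (4, 1/2), DOWN (3, 1/3).
import numpy as np

k, f, prior = 2, 10, 0.5  # min_samples_leaf, smoothing, global target mean
for cat, (n, mean) in {'UP': (3, 2 / 3), 'FLAT': (4, 1 / 2), 'DOWN': (3, 1 / 3)}.items():
    weight = 1 / (1 + np.exp(-(n - k) / f))  # sigmoid shrinkage towards the prior
    print(cat, prior * (1 - weight) + mean * weight)
# UP -> 0.58749..., FLAT -> 0.5, DOWN -> 0.41250...,
# matching the asserted 0.5874 / 0.5 / 0.4125 within the 1e-4 tolerance used above.
```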
self.assertAlmostEqual(0.8370, values[15], delta=1e-4) def test_hierarchical_part_named_cols(self): - - enc = encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=self.hierarchical_map, cols=['Compass']) - result = enc.fit_transform(self.hierarchical_cat_example, self.hierarchical_cat_example['target']) + """Test that the encoder works with a partial hierarchy.""" + enc = encoders.TargetEncoder( + verbose=1, + smoothing=2, + min_samples_leaf=2, + hierarchy=self.hierarchical_map, + cols=['Compass'], + ) + result = enc.fit_transform( + self.hierarchical_cat_example, self.hierarchical_cat_example['target'] + ) values = result['Compass'].array self.assertAlmostEqual(0.6226, values[0], delta=1e-4) @@ -188,20 +287,23 @@ def test_hierarchical_part_named_cols(self): self.assertEqual('slow', values[0]) def test_hierarchy_pandas_index(self): - df = pd.DataFrame({ - 'hello': ['a', 'b', 'c', 'a', 'a', 'b', 'c', 'd', 'd'], - 'world': [0, 1, 0, 0, 1, 0, 0, 1, 1] - }, columns=pd.Index(['hello', 'world'])) + """Test that the encoder works with a pandas index.""" + df = pd.DataFrame( + { + 'hello': ['a', 'b', 'c', 'a', 'a', 'b', 'c', 'd', 'd'], + 'world': [0, 1, 0, 0, 1, 0, 0, 1, 1], + }, + columns=pd.Index(['hello', 'world']), + ) cols = df.select_dtypes(include='object').columns self.hierarchical_map = { - 'hello': { - 'A': ('a', 'b'), - 'B': ('c', 'd') - }, + 'hello': {'A': ('a', 'b'), 'B': ('c', 'd')}, } - enc = encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=self.hierarchical_map, cols=cols) + enc = encoders.TargetEncoder( + verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=self.hierarchical_map, cols=cols + ) result = enc.fit_transform(df, df['world']) values = result['hello'].array @@ -211,10 +313,17 @@ def test_hierarchy_pandas_index(self): self.assertAlmostEqual(0.7425, values[7], delta=1e-4) def test_hierarchy_single_mapping(self): - - enc = encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=self.hierarchical_map, - cols=['Plant']) - result = enc.fit_transform(self.hierarchical_cat_example, self.hierarchical_cat_example['target']) + """Test that mapping a single column works.""" + enc = encoders.TargetEncoder( + verbose=1, + smoothing=2, + min_samples_leaf=2, + hierarchy=self.hierarchical_map, + cols=['Plant'], + ) + result = enc.fit_transform( + self.hierarchical_cat_example, self.hierarchical_cat_example['target'] + ) values = result['Plant'].array self.assertAlmostEqual(0.6828, values[0], delta=1e-4) @@ -223,18 +332,22 @@ def test_hierarchy_single_mapping(self): self.assertAlmostEqual(0.3172, values[12], delta=1e-4) def test_hierarchy_no_mapping(self): + """Test that a trivial hierarchy mapping label to itself works.""" hierarchical_map = { 'Plant': { 'Rose': 'Rose', 'Daisy': 'Daisy', 'Daffodil': 'Daffodil', - 'Bluebell': 'Bluebell' + 'Bluebell': 'Bluebell', } } - enc = encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=hierarchical_map, - cols=['Plant']) - result = enc.fit_transform(self.hierarchical_cat_example, self.hierarchical_cat_example['target']) + enc = encoders.TargetEncoder( + verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=hierarchical_map, cols=['Plant'] + ) + result = enc.fit_transform( + self.hierarchical_cat_example, self.hierarchical_cat_example['target'] + ) values = result['Plant'].array self.assertAlmostEqual(0.6828, values[0], delta=1e-4) @@ -243,52 +356,83 @@ def test_hierarchy_no_mapping(self): self.assertAlmostEqual(0.3172, values[12], delta=1e-4) def 
test_hierarchy_error(self): - hierarchical_map = { - 'Plant': { - 'Flower': {'Rose': ('Pink', 'Yellow', 'Red')}, - 'Tree': 'Ash' - } - } + """Test that an error is raised when the hierarchy dictionary is invalid.""" + hierarchical_map = {'Plant': {'Flower': {'Rose': ('Pink', 'Yellow', 'Red')}, 'Tree': 'Ash'}} with self.assertRaises(ValueError): - encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=hierarchical_map, - cols=['Plant']) + encoders.TargetEncoder( + verbose=1, + smoothing=2, + min_samples_leaf=2, + hierarchy=hierarchical_map, + cols=['Plant'], + ) def test_trivial_hierarchy(self): - trivial_hierarchical_map = { - 'Plant': { - 'Plant': ('Rose', 'Daisy', 'Daffodil', 'Bluebell') - } - } - - enc_hier = encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=trivial_hierarchical_map, - cols=['Plant']) - result_hier = enc_hier.fit_transform(self.hierarchical_cat_example, self.hierarchical_cat_example['target']) - enc_no_hier = encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, cols=['Plant']) - result_no_hier = enc_no_hier.fit_transform(self.hierarchical_cat_example, self.hierarchical_cat_example['target']) - pd.testing.assert_series_equal(result_hier["Plant"], result_no_hier["Plant"]) + """Test that a trivial hierarchy works.""" + trivial_hierarchical_map = {'Plant': {'Plant': ('Rose', 'Daisy', 'Daffodil', 'Bluebell')}} + + enc_hier = encoders.TargetEncoder( + verbose=1, + smoothing=2, + min_samples_leaf=2, + hierarchy=trivial_hierarchical_map, + cols=['Plant'], + ) + result_hier = enc_hier.fit_transform( + self.hierarchical_cat_example, self.hierarchical_cat_example['target'] + ) + enc_no_hier = encoders.TargetEncoder( + verbose=1, smoothing=2, min_samples_leaf=2, cols=['Plant'] + ) + result_no_hier = enc_no_hier.fit_transform( + self.hierarchical_cat_example, self.hierarchical_cat_example['target'] + ) + pd.testing.assert_series_equal(result_hier['Plant'], result_no_hier['Plant']) def test_hierarchy_multi_level(self): + """Test that hierarchy works with a multi-level hierarchy.""" hierarchy_multi_level_df = pd.DataFrame( { - 'Animal': ['Cat', 'Cat', 'Dog', 'Dog', 'Dog', 'Osprey', 'Kite', 'Kite', 'Carp', 'Carp', 'Carp', - 'Clownfish', 'Clownfish', 'Lizard', 'Snake', 'Snake'], - 'target': [1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1] - }, columns=['Animal', 'target']) + 'Animal': [ + 'Cat', + 'Cat', + 'Dog', + 'Dog', + 'Dog', + 'Osprey', + 'Kite', + 'Kite', + 'Carp', + 'Carp', + 'Carp', + 'Clownfish', + 'Clownfish', + 'Lizard', + 'Snake', + 'Snake', + ], + 'target': [1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1], + }, + columns=['Animal', 'target'], + ) hierarchy_multi_level = { 'Animal': { - 'Warm-Blooded': - {'Mammals': ('Cat', 'Dog'), - 'Birds': ('Osprey', 'Kite'), - 'Fish': ('Carp', 'Clownfish') - }, - 'Cold-Blooded': - {'Reptiles': ('Lizard'), - 'Amphibians': ('Snake', 'Frog') - } - }} + 'Warm-Blooded': { + 'Mammals': ('Cat', 'Dog'), + 'Birds': ('Osprey', 'Kite'), + 'Fish': ('Carp', 'Clownfish'), + }, + 'Cold-Blooded': {'Reptiles': ('Lizard'), 'Amphibians': ('Snake', 'Frog')}, + } + } - enc = encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=hierarchy_multi_level, - cols=['Animal']) + enc = encoders.TargetEncoder( + verbose=1, + smoothing=2, + min_samples_leaf=2, + hierarchy=hierarchy_multi_level, + cols=['Animal'], + ) result = enc.fit_transform(hierarchy_multi_level_df, hierarchy_multi_level_df['target']) values = result['Animal'].array @@ -301,11 +445,13 @@ def 
test_hierarchy_multi_level(self): self.assertAlmostEqual(0.4741, values[14], delta=1e-4) def test_hierarchy_columnwise_compass(self): + """Test that hierarchy works with a columnwise hierarchy.""" X, y = load_compass() cols = X.columns[~X.columns.str.startswith('HIER')] HIER_cols = X.columns[X.columns.str.startswith('HIER')] - enc = encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=X[HIER_cols], - cols=['compass']) + enc = encoders.TargetEncoder( + verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=X[HIER_cols], cols=['compass'] + ) result = enc.fit_transform(X[cols], y) values = result['compass'].array @@ -315,38 +461,47 @@ def test_hierarchy_columnwise_compass(self): self.assertAlmostEqual(0.4605, values[7], delta=1e-4) self.assertAlmostEqual(0.4033, values[11], delta=1e-4) - def test_hierarchy_columnwise_postcodes(self): + """Test that hierarchy works with a columnwise hierarchy.""" X, y = load_postcodes('binary') cols = X.columns[~X.columns.str.startswith('HIER')] HIER_cols = X.columns[X.columns.str.startswith('HIER')] - enc = encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=X[HIER_cols], - cols=['postcode']) + enc = encoders.TargetEncoder( + verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=X[HIER_cols], cols=['postcode'] + ) result = enc.fit_transform(X[cols], y) values = result['postcode'].array self.assertAlmostEqual(0.8448, values[0], delta=1e-4) - def test_hierarchy_columnwise_missing_level(self): + """Test that an error is raised when a hierarchy is given but a sub-column is missing.""" X, y = load_postcodes('binary') HIER_cols = ['HIER_postcode_1', 'HIER_postcode_2', 'HIER_postcode_4'] with self.assertRaises(ValueError): - encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=X[HIER_cols], - cols=['postcode']) - + encoders.TargetEncoder( + verbose=1, + smoothing=2, + min_samples_leaf=2, + hierarchy=X[HIER_cols], + cols=['postcode'], + ) def test_hierarchy_mapping_no_cols(self): + """Test that an error is raised when the hierarchy is given but no columns to encode.""" hierarchical_map = {'Compass': {'N': ('N', 'NE'), 'S': ('S', 'SE'), 'W': 'W'}} with self.assertRaises(ValueError): - encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=hierarchical_map) - + encoders.TargetEncoder( + verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=hierarchical_map + ) def test_hierarchy_mapping_cols_missing(self): + """Test that an error is raised when the dataframe is missing the hierarchy column.""" X = ['N', 'N', 'NE', 'NE', 'NE', 'SE', 'SE', 'S', 'S', 'S', 'S', 'W', 'W', 'W', 'W', 'W'] hierarchical_map = {'Compass': {'N': ('N', 'NE'), 'S': ('S', 'SE'), 'W': 'W'}} y = [1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1] - enc = encoders.TargetEncoder(verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=hierarchical_map, - cols=['Compass']) + enc = encoders.TargetEncoder( + verbose=1, smoothing=2, min_samples_leaf=2, hierarchy=hierarchical_map, cols=['Compass'] + ) with self.assertRaises(ValueError): enc.fit_transform(X, y) diff --git a/tests/test_utils.py b/tests/test_utils.py index 65799ca2..b6e52cbd 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,16 +1,25 @@ +"""Tests for the utils module.""" from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ -import pytest -from category_encoders.utils import convert_input_vector, convert_inputs, get_categorical_cols, BaseEncoder -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn 
import __version__ as skl_version -from packaging.version import Version -import pandas as pd import numpy as np +import pandas as pd +import pytest +from category_encoders.utils import ( + BaseEncoder, + convert_input_vector, + convert_inputs, + get_categorical_cols, +) +from packaging.version import Version +from sklearn import __version__ as skl_version +from sklearn.base import BaseEstimator, TransformerMixin class TestUtils(TestCase): + """Tests for the utils module.""" + def test_convert_input_vector(self): + """Test the convert_input_vector function.""" index = [2, 3, 4] result = convert_input_vector([0, 1, 0], index) # list @@ -46,40 +55,50 @@ def test_convert_input_vector(self): result = convert_input_vector(pd.Series([0, 1, 0], index=[4, 5, 6]), index) # series self.assertTrue(isinstance(result, pd.Series)) self.assertEqual(3, len(result)) - np.testing.assert_array_equal(result.index, [4, 5, 6], 'We want to preserve the original index') + np.testing.assert_array_equal( + result.index, [4, 5, 6], 'We want to preserve the original index' + ) - result = convert_input_vector(pd.DataFrame({'y': [0, 1, 0]}, index=[4, 5, 6]), index) # dataFrame + result = convert_input_vector( + pd.DataFrame({'y': [0, 1, 0]}, index=[4, 5, 6]), index + ) # dataFrame self.assertTrue(isinstance(result, pd.Series)) self.assertEqual(3, len(result)) - np.testing.assert_array_equal(result.index, [4, 5, 6], 'We want to preserve the original index') + np.testing.assert_array_equal( + result.index, [4, 5, 6], 'We want to preserve the original index' + ) result = convert_input_vector((0, 1, 0), index) # tuple self.assertTrue(isinstance(result, pd.Series)) self.assertEqual(3, len(result)) np.testing.assert_array_equal(result.index, [2, 3, 4]) - result = convert_input_vector(0, [2]) # scalar - self.assertTrue(isinstance(result, pd.Series)) - self.assertEqual(1, len(result)) - self.assertTrue(result.index == [2]) - - result = convert_input_vector('a', [2]) # scalar - self.assertTrue(isinstance(result, pd.Series)) - self.assertEqual(1, len(result)) - self.assertTrue(result.index == [2]) - - # multiple columns and rows should cause an error because it is unclear which column/row to use as the target - self.assertRaises(ValueError, convert_input_vector, (pd.DataFrame({'col1': [0, 1, 0], 'col2': [1, 0, 1]})), index) - self.assertRaises(ValueError, convert_input_vector, (np.array([[0, 1], [1, 0], [0, 1]])), index) + # should not work for scalars + self.assertRaises(ValueError, convert_input_vector, 0, [2]) + self.assertRaises(ValueError, convert_input_vector, "a", [2]) + + # multiple columns and rows should cause an error because it is unclear + # which column/row to use as the target + self.assertRaises( + ValueError, + convert_input_vector, + (pd.DataFrame({'col1': [0, 1, 0], 'col2': [1, 0, 1]})), + index, + ) + self.assertRaises( + ValueError, convert_input_vector, (np.array([[0, 1], [1, 0], [0, 1]])), index + ) self.assertRaises(ValueError, convert_input_vector, ([[0, 1], [1, 0], [0, 1]]), index) - # edge scenarios (it is ok to raise an exception but please, provide then a helpful exception text) + # edge scenarios (it is ok to raise an exception but please, + # provide then a helpful exception text) _ = convert_input_vector(pd.Series(dtype=float), []) _ = convert_input_vector([], []) _ = convert_input_vector([[]], []) _ = convert_input_vector(pd.DataFrame(), []) def test_convert_inputs(self): + """Test the convert_inputs function.""" aindex = [2, 4, 5] bindex = [1, 3, 4] alist = [5, 3, 6] @@ -121,14 +140,18 @@ def 
test_convert_inputs(self): self.assertRaises(ValueError, convert_inputs, barray, [1, 2, 3, 4]) def test_get_categorical_cols(self): - df = pd.DataFrame({"col": ["a", "b"]}) - self.assertEqual(get_categorical_cols(df.astype("object")), ["col"]) - self.assertEqual(get_categorical_cols(df.astype("category")), ["col"]) - self.assertEqual(get_categorical_cols(df.astype("string")), ["col"]) + """Test the get_categorical_cols function.""" + df = pd.DataFrame({'col': ['a', 'b']}) + self.assertEqual(get_categorical_cols(df.astype('object')), ['col']) + self.assertEqual(get_categorical_cols(df.astype('category')), ['col']) + self.assertEqual(get_categorical_cols(df.astype('string')), ['col']) class TestBaseEncoder(TestCase): + """Tests for the BaseEncoder class.""" + def setUp(self): + """Set up the tests.""" class DummyEncoder(BaseEncoder, BaseEstimator, TransformerMixin): def _fit(self, X, y=None): return self @@ -138,13 +161,12 @@ def transform(self, X, y=None, override_return_df=False): self.encoder = DummyEncoder() - @pytest.mark.skipif(Version(skl_version) < Version('1.2'), reason="requires sklean > 1.2") + @pytest.mark.skipif(Version(skl_version) < Version('1.2'), reason='requires sklearn >= 1.2') def test_sklearn_pandas_out_refit(self): + """Test that the encoder can be refit with sklearn and pandas.""" # Thanks to Issue#437 - df = pd.DataFrame({"C1": ["a", "a"], "C2": ["c", "d"]}) - self.encoder.set_output(transform="pandas") + df = pd.DataFrame({'C1': ['a', 'a'], 'C2': ['c', 'd']}) + self.encoder.set_output(transform='pandas') self.encoder.fit_transform(df.iloc[:1]) - out = self.encoder.fit_transform( - df.rename(columns={'C1': 'X1', 'C2': 'X2'}) - ) + out = self.encoder.fit_transform(df.rename(columns={'C1': 'X1', 'C2': 'X2'})) self.assertTrue(list(out.columns) == ['X1', 'X2']) diff --git a/tests/test_woe.py b/tests/test_woe.py index f6e6e45a..21e1a050 100644 --- a/tests/test_woe.py +++ b/tests/test_woe.py @@ -1,9 +1,11 @@ -import pandas as pd +"""Unit tests for the Weight of Evidence encoder.""" from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ -import tests.helpers as th -import numpy as np import category_encoders as encoders +import numpy as np +import pandas as pd + +import tests.helpers as th np_X = th.create_array(n_rows=100) np_X_t = th.create_array(n_rows=50, extras=True) @@ -16,8 +18,20 @@ class TestWeightOfEvidenceEncoder(TestCase): + """Unit tests for the Weight of Evidence encoder.""" + def test_woe(self): - cols = ['unique_str', 'underscore', 'extra', 'none', 'invariant', 'categorical', 'na_categorical', 'categorical_int'] + """Test the Weight of Evidence encoder.""" + cols = [ + 'unique_str', + 'underscore', + 'extra', + 'none', + 'invariant', + 'categorical', + 'na_categorical', + 'categorical_int', + ] # balanced label with balanced features X_balanced = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col1']) @@ -25,37 +39,53 @@ def test_woe(self): enc = encoders.WOEEncoder() enc.fit(X_balanced, y_balanced) X1 = enc.transform(X_balanced) - self.assertTrue(all(X1.sum() < 0.001), - "When the class label is balanced, WoE should sum to 0 in each transformed column") + self.assertTrue( + all(X1.sum() < 0.001), + 'When the class label is balanced, WoE should sum to 0 in each transformed column', + ) enc = encoders.WOEEncoder(cols=cols) enc.fit(X, np_y) X1 = enc.transform(X_t) th.verify_numeric(X1[cols]) - self.assertTrue(np.isfinite(X1[cols].to_numpy()).all(), - 'There must not be any nan, inf or -inf in the transformed columns') +
self.assertTrue( + np.isfinite(X1[cols].to_numpy()).all(), + 'There must not be any nan, inf or -inf in the transformed columns', + ) self.assertEqual(len(list(X_t)), len(list(X1)), 'The count of attributes must not change') self.assertEqual(len(X_t), len(X1), 'The count of rows must not change') X2 = enc.transform(X_t, np_y_t) th.verify_numeric(X2) - self.assertTrue(np.isfinite(X2[cols].to_numpy()).all(), - 'There must not be any nan, inf or -inf in the transformed columns') + self.assertTrue( + np.isfinite(X2[cols].to_numpy()).all(), + 'There must not be any nan, inf or -inf in the transformed columns', + ) self.assertEqual(len(list(X_t)), len(list(X2)), 'The count of attributes must not change') self.assertEqual(len(X_t), len(X2), 'The count of rows must not change') X3 = enc.transform(X, np_y) th.verify_numeric(X3) - self.assertTrue(np.isfinite(X3[cols].to_numpy()).all(), - 'There must not be any nan, inf or -inf in the transformed columns') + self.assertTrue( + np.isfinite(X3[cols].to_numpy()).all(), + 'There must not be any nan, inf or -inf in the transformed columns', + ) self.assertEqual(len(list(X)), len(list(X3)), 'The count of attributes must not change') self.assertEqual(len(X), len(X3), 'The count of rows must not change') - self.assertTrue(X3['unique_str'].var() < 0.001, 'The unique string column must not be predictive of the label') + self.assertTrue( + X3['unique_str'].var() < 0.001, + 'The unique string column must not be predictive of the label', + ) X4 = enc.fit_transform(X, np_y) th.verify_numeric(X4) - self.assertTrue(np.isfinite(X4[cols].to_numpy()).all(), - 'There must not be any nan, inf or -inf in the transformed columns') + self.assertTrue( + np.isfinite(X4[cols].to_numpy()).all(), + 'There must not be any nan, inf or -inf in the transformed columns', + ) self.assertEqual(len(list(X)), len(list(X4)), 'The count of attributes must not change') self.assertEqual(len(X), len(X4), 'The count of rows must not change') - self.assertTrue(X4['unique_str'].var() < 0.001, 'The unique string column must not be predictive of the label') + self.assertTrue( + X4['unique_str'].var() < 0.001, + 'The unique string column must not be predictive of the label', + ) enc = encoders.WOEEncoder() enc.fit(X, np_y) @@ -73,7 +103,7 @@ def test_woe(self): enc.fit(X, np_y) X1 = enc.transform(X_t, np_y_t) X2 = enc.transform(X_t, np_y_t) - self.assertTrue(X1.equals(X2), "When the seed is given, the results must be identical") + self.assertTrue(X1.equals(X2), 'When the seed is given, the results must be identical') th.verify_numeric(X1) th.verify_numeric(X2) @@ -109,39 +139,48 @@ def test_woe(self): self.assertEqual(len(list(X_t)), len(list(X2)), 'The count of attributes must not change') self.assertEqual(len(X_t), len(X2), 'The count of rows must not change') - def test_HaveArrays_ExpectCalculatedProperly(self): + def test_expect_calculated_properly(self): + """Test that the expected value for the following tests is calculated properly.""" X = ['a', 'a', 'b', 'b'] y = [1, 0, 0, 0] enc = encoders.WOEEncoder() result = enc.fit_transform(X, y) - expected = pd.Series([0.5108256237659906, .5108256237659906, -0.587786664902119, -0.587786664902119], name=0) + expected = pd.Series( + [0.5108256237659906, 0.5108256237659906, -0.587786664902119, -0.587786664902119], name=0 + ) pd.testing.assert_series_equal(expected, result[0]) - def test_HandleMissingValue_HaveMissingInTrain_ExpectEncoded(self): - X = ['a', 'a', np.nan, np.nan] - y = [1, 0, 0, 0] - enc = encoders.WOEEncoder(handle_missing='value') + def 
test_handle_missing_value(self): + """Test that missing values in the training set are encoded with the mean of the target.""" + with self.subTest("with NaN in the training set."): + X = ['a', 'a', np.nan, np.nan] + y = [1, 0, 0, 0] + enc = encoders.WOEEncoder(handle_missing='value') - result = enc.fit_transform(X, y) + result = enc.fit_transform(X, y) - expected = pd.Series([0.5108256237659906, .5108256237659906, -0.587786664902119, -0.587786664902119], name=0) - pd.testing.assert_series_equal(expected, result[0]) + expected = pd.Series( + [0.5108256237659906, 0.5108256237659906, -0.587786664902119, -0.587786664902119], + name=0 + ) + pd.testing.assert_series_equal(expected, result[0]) - def test_HandleMissingValue_HaveMissingInTest_ExpectEncodedWithZero(self): - X = ['a', 'a', 'b', 'b'] - y = [1, 0, 0, 0] - test = ['a', np.nan] - enc = encoders.WOEEncoder(handle_missing='value') + with self.subTest("without NaN in the training set."): + X = ['a', 'a', 'b', 'b'] + y = [1, 0, 0, 0] + test = ['a', np.nan] + enc = encoders.WOEEncoder(handle_missing='value') - enc.fit(X, y) - result = enc.transform(test) + enc.fit(X, y) + result = enc.transform(test) - expected = pd.Series([0.5108256237659906, 0], name=0) - pd.testing.assert_series_equal(expected, result[0]) + expected = pd.Series([0.5108256237659906, 0], name=0) + pd.testing.assert_series_equal(expected, result[0]) - def test_HandleUnknownValue_HaveUnknown_ExpectEncodedWithZero(self): + def test_unknown_value_is_zero(self): + """Test that unknown values are encoded with zero.""" X = ['a', 'a', 'b', 'b'] y = [1, 0, 0, 0] test = ['a', 'c'] diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py index 4eb6961b..7b785a5d 100644 --- a/tests/test_wrapper.py +++ b/tests/test_wrapper.py @@ -1,135 +1,179 @@ +"""Tests for the wrapper module.""" +from unittest import TestCase + +import category_encoders as encoders import numpy as np import pandas as pd -from unittest import TestCase +from category_encoders.wrapper import NestedCVWrapper, PolynomialWrapper from sklearn.model_selection import GroupKFold -import category_encoders as encoders import tests.helpers as th -from category_encoders.wrapper import PolynomialWrapper, NestedCVWrapper class TestMultiClassWrapper(TestCase): + """Tests for the PolynomialWrapper class.""" + def test_invariance_to_data_types(self): - x = np.array([ - ['a', 'b', 'c'], - ['a', 'b', 'c'], - ['b', 'b', 'c'], - ['b', 'b', 'b'], - ['b', 'b', 'b'], - ['a', 'b', 'a'], - ]) + """Test that the wrapper is invariant to data types.""" + x = np.array( + [ + ['a', 'b', 'c'], + ['a', 'b', 'c'], + ['b', 'b', 'c'], + ['b', 'b', 'b'], + ['b', 'b', 'b'], + ['a', 'b', 'a'], + ] + ) y = [1, 2, 3, 3, 3, 3] wrapper = PolynomialWrapper(encoders.TargetEncoder()) result = wrapper.fit_transform(x, y) th.verify_numeric(result) - x = pd.DataFrame([ - ['a', 'b', 'c'], - ['a', 'b', 'c'], - ['b', 'b', 'c'], - ['b', 'b', 'b'], - ['b', 'b', 'b'], - ['a', 'b', 'a'], - ], columns=['f1', 'f2', 'f3']) + x = pd.DataFrame( + [ + ['a', 'b', 'c'], + ['a', 'b', 'c'], + ['b', 'b', 'c'], + ['b', 'b', 'b'], + ['b', 'b', 'b'], + ['a', 'b', 'a'], + ], + columns=['f1', 'f2', 'f3'], + ) y = ['bee', 'cat', 'dog', 'dog', 'dog', 'dog'] wrapper = PolynomialWrapper(encoders.TargetEncoder()) result2 = wrapper.fit_transform(x, y) - self.assertTrue((result.to_numpy() == result2.to_numpy()).all(), 'The content should be the same regardless whether we pass Numpy or Pandas data type.') + self.assertTrue( + (result.to_numpy() == result2.to_numpy()).all(), + 'The content should 

     def test_transform_only_selected(self):
-        x = pd.DataFrame([
-            ['a', 'b', 'c'],
-            ['a', 'a', 'c'],
-            ['b', 'a', 'c'],
-            ['b', 'c', 'b'],
-            ['b', 'b', 'b'],
-            ['a', 'b', 'a'],
-        ], columns=['f1', 'f2', 'f3'])
+        """Test that the wrapper only transforms the selected columns."""
+        x = pd.DataFrame(
+            [
+                ['a', 'b', 'c'],
+                ['a', 'a', 'c'],
+                ['b', 'a', 'c'],
+                ['b', 'c', 'b'],
+                ['b', 'b', 'b'],
+                ['a', 'b', 'a'],
+            ],
+            columns=['f1', 'f2', 'f3'],
+        )
         y = ['bee', 'cat', 'dog', 'dog', 'dog', 'dog']
         wrapper = PolynomialWrapper(encoders.LeaveOneOutEncoder(cols=['f2']))

         # combination fit() + transform()
         wrapper.fit(x, y)
         result = wrapper.transform(x, y)
-        self.assertEqual(len(result.columns), 4, 'We expect 2 untouched features + f2 target encoded into 2 features')
+        self.assertEqual(
+            len(result.columns),
+            4,
+            'We expect 2 untouched features + f2 target encoded into 2 features',
+        )

         # directly fit_transform()
         wrapper = PolynomialWrapper(encoders.LeaveOneOutEncoder(cols=['f2']))
         result2 = wrapper.fit_transform(x, y)
-        self.assertEqual(len(result2.columns), 4, 'We expect 2 untouched features + f2 target encoded into 2 features')
+        self.assertEqual(
+            len(result2.columns),
+            4,
+            'We expect 2 untouched features + f2 target encoded into 2 features',
+        )

         pd.testing.assert_frame_equal(result, result2)

     def test_refit_stateless(self):
-        # test that when the encoder is fitted multiple times no old state is carried
-        x = pd.DataFrame([
-            ['a', 'b', 'c'],
-            ['a', 'b', 'c'],
-            ['b', 'b', 'c'],
-            ['b', 'b', 'b'],
-            ['b', 'b', 'b'],
-            ['a', 'b', 'a'],
-        ], columns=['f1', 'f2', 'f3'])
+        """Test that when the encoder is fitted multiple times no old state is carried."""
+        x = pd.DataFrame(
+            [
+                ['a', 'b', 'c'],
+                ['a', 'b', 'c'],
+                ['b', 'b', 'c'],
+                ['b', 'b', 'b'],
+                ['b', 'b', 'b'],
+                ['a', 'b', 'a'],
+            ],
+            columns=['f1', 'f2', 'f3'],
+        )
         y1 = ['bee', 'cat', 'dog', 'dog', 'dog', 'dog']
         y2 = ['bee', 'cat', 'duck', 'duck', 'duck', 'duck']
         wrapper = PolynomialWrapper(encoders.TargetEncoder())
-        result_first_fit = wrapper.fit_transform(x, y1)
-        expected_categories_1 = {"cat", "dog"}  # 'bee' is dropped since first label is always dropped
-        expected_categories_2 = {"cat", "duck"}
-        self.assertEqual(set(wrapper.label_encoder.ordinal_encoder.category_mapping[0]["mapping"].index), {"bee", "cat", "dog"})
+        _ = wrapper.fit_transform(x, y1)
+        expected_categories_1 = {
+            'cat',
+            'dog',
+        }  # 'bee' is dropped since first label is always dropped
+        expected_categories_2 = {'cat', 'duck'}
+        self.assertEqual(
+            set(wrapper.label_encoder.ordinal_encoder.category_mapping[0]['mapping'].index),
+            {'bee', 'cat', 'dog'},
+        )
         self.assertEqual(set(wrapper.feature_encoders.keys()), expected_categories_1)

-        result_second_fit = wrapper.fit_transform(x, y2)
-        self.assertEqual(set(wrapper.label_encoder.ordinal_encoder.category_mapping[0]["mapping"].index), {"bee", "cat", "duck"})
+        _ = wrapper.fit_transform(x, y2)
+        self.assertEqual(
+            set(wrapper.label_encoder.ordinal_encoder.category_mapping[0]['mapping'].index),
+            {'bee', 'cat', 'duck'},
+        )
         self.assertEqual(set(wrapper.feature_encoders.keys()), expected_categories_2)


 class TestNestedCVWrapper(TestCase):
+    """Tests for the NestedCVWrapper class."""
+
     def test_train_not_equal_to_valid(self):
-        x = np.array([
-            ['a', 'b', 'c'],
-            ['a', 'b', 'c'],
-            ['a', 'b', 'c'],
-            ['a', 'b', 'c'],
-            ['b', 'b', 'c'],
-            ['b', 'b', 'c'],
-            ['b', 'b', 'b'],
-            ['b', 'b', 'b'],
-            ['b', 'b', 'b'],
-            ['b', 'b', 'b'],
-            ['a', 'b', 'a'],
-            ['a', 'b', 'a'],
-        ])
+        """Test that the train and valid results are not equal."""
+        x = np.array(
+            [
+                ['a', 'b', 'c'],
+                ['a', 'b', 'c'],
+                ['a', 'b', 'c'],
+                ['a', 'b', 'c'],
+                ['b', 'b', 'c'],
+                ['b', 'b', 'c'],
+                ['b', 'b', 'b'],
+                ['b', 'b', 'b'],
+                ['b', 'b', 'b'],
+                ['b', 'b', 'b'],
+                ['a', 'b', 'a'],
+                ['a', 'b', 'a'],
+            ]
+        )
         y = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]
         wrapper = NestedCVWrapper(encoders.TargetEncoder(), cv=3)
         result_train, result_valid = wrapper.fit_transform(x, y, X_test=x)

-        # We would expect result_train != result_valid since result_train has been generated using nested
-        # folds and result_valid is generated by fitting the encoder on all of the x & y daya
+        # We would expect result_train != result_valid since result_train has been generated using
+        # nested folds and result_valid is generated by fitting the encoder on all the x & y data
         self.assertFalse(np.allclose(result_train, result_valid))

-    def test_custom_cv(self):
-        x = np.array([
-            ['a', 'b', 'c'],
-            ['a', 'b', 'c'],
-            ['a', 'b', 'c'],
-            ['a', 'b', 'c'],
-            ['b', 'b', 'c'],
-            ['b', 'b', 'c'],
-            ['b', 'b', 'b'],
-            ['b', 'b', 'b'],
-            ['b', 'b', 'b'],
-            ['b', 'b', 'b'],
-            ['a', 'b', 'a'],
-            ['a', 'b', 'a'],
-        ])
+    def test_custom_cv(self):
+        """Test custom cross validation."""
+        x = np.array(
+            [
+                ['a', 'b', 'c'],
+                ['a', 'b', 'c'],
+                ['a', 'b', 'c'],
+                ['a', 'b', 'c'],
+                ['b', 'b', 'c'],
+                ['b', 'b', 'c'],
+                ['b', 'b', 'b'],
+                ['b', 'b', 'b'],
+                ['b', 'b', 'b'],
+                ['b', 'b', 'b'],
+                ['a', 'b', 'a'],
+                ['a', 'b', 'a'],
+            ]
+        )
         groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]
         y = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]
         gkfold = GroupKFold(n_splits=3)
         wrapper = NestedCVWrapper(encoders.TargetEncoder(), cv=gkfold)
         result_train, result_valid = wrapper.fit_transform(x, y, X_test=x, groups=groups)
-        # We would expect result_train != result_valid since result_train has been generated using nested
-        # folds and result_valid is generated by fitting the encoder on all of the x & y daya
+        # We would expect result_train != result_valid since result_train has been generated using
+        # nested folds and result_valid is generated by fitting the encoder on all the x & y data
        self.assertFalse(np.allclose(result_train, result_valid))

From 86d4ce0650fb604ef58858892db18c366ee6dd48 Mon Sep 17 00:00:00 2001
From: PaulWestenthanner
Date: Tue, 7 Jan 2025 17:43:10 +0100
Subject: [PATCH 2/3] Remove intercept from Contrast Coding Schemes.
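
The contrast coding encoders (BackwardDifferenceEncoder, HelmertEncoder,
PolynomialEncoder and SumEncoder) no longer prepend a constant `intercept`
column of ones to the transformed output; they now return only the
contrast-coded feature columns. A minimal sketch of the new behaviour
(data and column names are illustrative):

    import pandas as pd
    import category_encoders as encoders

    X = pd.DataFrame({'cat': ['a', 'b', 'c', 'a']})
    out = encoders.SumEncoder(cols=['cat']).fit_transform(X)

    # Previously the result carried a leading constant `intercept` column
    # of ones; now only the contrast columns remain, e.g. `cat_0` and
    # `cat_1` for the three-level feature `cat`.
    assert 'intercept' not in out.columns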
Fixes #370 --- category_encoders/backward_difference.py | 1 - category_encoders/base_contrast_encoder.py | 13 - category_encoders/helmert.py | 1 - category_encoders/polynomial.py | 1 - category_encoders/sum_coding.py | 1 - poetry.lock | 512 +++++++++++---------- pyproject.toml | 2 +- tests/test_encoders.py | 10 +- tests/test_polynomial.py | 19 +- tests/test_sum_coding.py | 26 +- 10 files changed, 301 insertions(+), 285 deletions(-) diff --git a/category_encoders/backward_difference.py b/category_encoders/backward_difference.py index e9e77311..8bc4a923 100644 --- a/category_encoders/backward_difference.py +++ b/category_encoders/backward_difference.py @@ -56,7 +56,6 @@ class BackwardDifferenceEncoder(BaseContrastEncoder): Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 intercept 1460 non-null int64 1 Id 1460 non-null float64 2 MSSubClass 1460 non-null float64 3 MSZoning 1460 non-null object diff --git a/category_encoders/base_contrast_encoder.py b/category_encoders/base_contrast_encoder.py index a322dd6f..937c8150 100644 --- a/category_encoders/base_contrast_encoder.py +++ b/category_encoders/base_contrast_encoder.py @@ -1,6 +1,5 @@ """Base encoder for various contrast coding schemes.""" -import warnings from abc import abstractmethod import numpy as np @@ -182,14 +181,6 @@ def transform_contrast_coding( """ cols = X.columns.tolist() - # See issue 370 if it is necessary to add an intercept or not. - X['intercept'] = pd.Series([1] * X.shape[0], index=X.index) - warnings.warn( - 'Intercept column might not be added anymore in future releases (c.f. issue #370)', - category=FutureWarning, - stacklevel=4, - ) - for switch in mapping: col = switch.get('col') mod = switch.get('mapping') @@ -202,8 +193,4 @@ def transform_contrast_coding( old_column_index = cols.index(col) cols[old_column_index : old_column_index + 1] = mod.columns - # this could lead to problems if an intercept column is already present - # (e.g. 
if another column has been encoded with another contrast coding scheme) - cols = ['intercept'] + cols - return X.reindex(columns=cols) diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py index e236f457..f3d4653b 100644 --- a/category_encoders/helmert.py +++ b/category_encoders/helmert.py @@ -58,7 +58,6 @@ class HelmertEncoder(BaseContrastEncoder): Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 intercept 1460 non-null int64 1 Id 1460 non-null float64 2 MSSubClass 1460 non-null float64 3 MSZoning 1460 non-null object diff --git a/category_encoders/polynomial.py b/category_encoders/polynomial.py index 1dd94552..7a05e506 100644 --- a/category_encoders/polynomial.py +++ b/category_encoders/polynomial.py @@ -57,7 +57,6 @@ class PolynomialEncoder(BaseContrastEncoder): Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 intercept 1460 non-null int64 1 Id 1460 non-null float64 2 MSSubClass 1460 non-null float64 3 MSZoning 1460 non-null object diff --git a/category_encoders/sum_coding.py b/category_encoders/sum_coding.py index f3e849df..b873a69c 100644 --- a/category_encoders/sum_coding.py +++ b/category_encoders/sum_coding.py @@ -58,7 +58,6 @@ class SumEncoder(BaseContrastEncoder): Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 intercept 1460 non-null int64 1 Id 1460 non-null float64 2 MSSubClass 1460 non-null float64 3 MSZoning 1460 non-null object diff --git a/poetry.lock b/poetry.lock index ed84cfc2..0e6a82fa 100644 --- a/poetry.lock +++ b/poetry.lock @@ -46,112 +46,114 @@ dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] [[package]] name = "certifi" -version = "2024.8.30" +version = "2024.12.14" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, - {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, + {file = "certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56"}, + {file = "certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db"}, ] [[package]] name = "charset-normalizer" -version = "3.3.2" +version = "3.4.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
optional = false -python-versions = ">=3.7.0" +python-versions = ">=3.7" files = [ - {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, - {file = 
"charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, - {file = 
"charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = 
"sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, - {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e218488cd232553829be0664c2292d3af2eeeb94b32bea483cf79ac6a694e037"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80ed5e856eb7f30115aaf94e4a08114ccc8813e6ed1b5efa74f9f82e8509858f"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b010a7a4fd316c3c484d482922d13044979e78d1861f0e0650423144c616a46a"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4532bff1b8421fd0a320463030c7520f56a79c9024a4e88f01c537316019005a"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d973f03c0cb71c5ed99037b870f2be986c3c05e63622c017ea9816881d2dd247"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3a3bd0dcd373514dcec91c411ddb9632c0d7d92aed7093b8c3bbb6d69ca74408"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d9c3cdf5390dcd29aa8056d13e8e99526cda0305acc038b96b30352aff5ff2bb"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2bdfe3ac2e1bbe5b59a1a63721eb3b95fc9b6817ae4a46debbb4e11f6232428d"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:eab677309cdb30d047996b36d34caeda1dc91149e4fdca0b1a039b3f79d9a807"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-win32.whl", hash = "sha256:c0429126cf75e16c4f0ad00ee0eae4242dc652290f940152ca8c75c3a4b6ee8f"}, + {file = "charset_normalizer-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:9f0b8b1c6d84c8034a44893aba5e767bf9c7a211e313a9605d9c617d7083829f"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8bfa33f4f2672964266e940dd22a195989ba31669bd84629f05fab3ef4e2d125"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28bf57629c75e810b6ae989f03c0828d64d6b26a5e205535585f96093e405ed1"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f08ff5e948271dc7e18a35641d2f11a4cd8dfd5634f55228b691e62b37125eb3"}, + {file = 
"charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:234ac59ea147c59ee4da87a0c0f098e9c8d169f4dc2a159ef720f1a61bbe27cd"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd4ec41f914fa74ad1b8304bbc634b3de73d2a0889bd32076342a573e0779e00"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eea6ee1db730b3483adf394ea72f808b6e18cf3cb6454b4d86e04fa8c4327a12"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c96836c97b1238e9c9e3fe90844c947d5afbf4f4c92762679acfe19927d81d77"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d86f7aff21ee58f26dcf5ae81a9addbd914115cdebcbb2217e4f0ed8982e146"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:09b5e6733cbd160dcc09589227187e242a30a49ca5cefa5a7edd3f9d19ed53fd"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:5777ee0881f9499ed0f71cc82cf873d9a0ca8af166dfa0af8ec4e675b7df48e6"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:237bdbe6159cff53b4f24f397d43c6336c6b0b42affbe857970cefbb620911c8"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-win32.whl", hash = "sha256:8417cb1f36cc0bc7eaba8ccb0e04d55f0ee52df06df3ad55259b9a323555fc8b"}, + {file = "charset_normalizer-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:d7f50a1f8c450f3925cb367d011448c39239bb3eb4117c36a6d354794de4ce76"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1"}, + {file = 
"charset_normalizer-3.4.1-cp312-cp312-win32.whl", hash = "sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35"}, + {file = "charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d074908e1aecee37a7635990b2c6d504cd4766c7bc9fc86d63f9c09af3fa11b"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:955f8851919303c92343d2f66165294848d57e9bba6cf6e3625485a70a038d11"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:44ecbf16649486d4aebafeaa7ec4c9fed8b88101f4dd612dcaf65d5e815f837f"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0924e81d3d5e70f8126529951dac65c1010cdf117bb75eb02dd12339b57749dd"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2967f74ad52c3b98de4c3b32e1a44e32975e008a9cd2a8cc8966d6a5218c5cb2"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c75cb2a3e389853835e84a2d8fb2b81a10645b503eca9bcb98df6b5a43eb8886"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:09b26ae6b1abf0d27570633b2b078a2a20419c99d66fb2823173d73f188ce601"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407"}, + {file = "charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f30bf9fd9be89ecb2360c7d94a711f00c09b976258846efe40db3d05828e8089"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:97f68b8d6831127e4787ad15e6757232e14e12060bec17091b85eb1486b91d8d"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7974a0b5ecd505609e3b19742b60cee7aa2aa2fb3151bc917e6e2646d7667dcf"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc54db6c8593ef7d4b2a331b58653356cf04f67c960f584edb7c3d8c97e8f39e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:311f30128d7d333eebd7896965bfcfbd0065f1716ec92bd5638d7748eb6f936a"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:7d053096f67cd1241601111b698f5cad775f97ab25d81567d3f59219b5f1adbd"}, + {file = 
"charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:807f52c1f798eef6cf26beb819eeb8819b1622ddfeef9d0977a8502d4db6d534"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:dccbe65bd2f7f7ec22c4ff99ed56faa1e9f785482b9bbd7c717e26fd723a1d1e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:2fb9bd477fdea8684f78791a6de97a953c51831ee2981f8e4f583ff3b9d9687e"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:01732659ba9b5b873fc117534143e4feefecf3b2078b0a6a2e925271bb6f4cfa"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-win32.whl", hash = "sha256:7a4f97a081603d2050bfaffdefa5b02a9ec823f8348a572e39032caa8404a487"}, + {file = "charset_normalizer-3.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:7b1bef6280950ee6c177b326508f86cad7ad4dff12454483b51d8b7d673a2c5d"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-win32.whl", hash = "sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e"}, + {file = "charset_normalizer-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75832c08354f595c760a804588b9357d34ec00ba1c940c15e31e96d902093770"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0af291f4fe114be0280cdd29d533696a77b5b49cfde5467176ecab32353395c4"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0167ddc8ab6508fe81860a57dd472b2ef4060e8d378f0cc555707126830f2537"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2a75d49014d118e4198bcee5ee0a6f25856b29b12dbf7cd012791f8a6cc5c496"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:363e2f92b0f0174b2f8238240a1a30142e3db7b957a5dd5689b0e75fb717cc78"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ab36c8eb7e454e34e60eb55ca5d241a5d18b2c6244f6827a30e451c42410b5f7"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:4c0907b1928a36d5a998d72d64d8eaa7244989f7aaaf947500d3a800c83a3fd6"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:04432ad9479fa40ec0f387795ddad4437a2b50417c69fa275e212933519ff294"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-win32.whl", hash = "sha256:3bed14e9c89dcb10e8f3a29f9ccac4955aebe93c71ae803af79265c9ca5644c5"}, + {file = "charset_normalizer-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:49402233c892a461407c512a19435d1ce275543138294f7ef013f0b63d5d3765"}, + {file = "charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85"}, + {file = "charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3"}, ] [[package]] @@ -251,13 +253,13 @@ files = [ [[package]] name = "jinja2" -version = "3.1.4" +version = "3.1.5" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" files = [ - {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, - {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, + {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, + {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, ] [package.dependencies] @@ -279,71 +281,72 @@ files = [ [[package]] name = "markupsafe" -version = "2.1.5" +version = "3.0.2" description = "Safely add untrusted strings to HTML/XML markup." 
optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"}, - {file = 
"MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"}, - {file = 
"MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"}, - {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c"}, + {file = 
"MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win32.whl", hash = "sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = 
"sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff"}, + {file = 
"MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"}, + {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, ] [[package]] @@ -423,13 +426,13 @@ test = ["matplotlib", "pytest", "pytest-cov"] [[package]] name = "packaging" -version = "24.1" +version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, - {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] [[package]] @@ -520,18 +523,17 @@ xml = ["lxml (>=4.9.2)"] [[package]] name = "patsy" -version = "0.5.6" +version = "1.0.1" description = "A Python package for describing statistical models and for building design matrices." optional = false -python-versions = "*" +python-versions = ">=3.6" files = [ - {file = "patsy-0.5.6-py2.py3-none-any.whl", hash = "sha256:19056886fd8fa71863fa32f0eb090267f21fb74be00f19f5c70b2e9d76c883c6"}, - {file = "patsy-0.5.6.tar.gz", hash = "sha256:95c6d47a7222535f84bff7f63d7303f2e297747a598db89cf5c67f0c0c7d2cdb"}, + {file = "patsy-1.0.1-py2.py3-none-any.whl", hash = "sha256:751fb38f9e97e62312e921a1954b81e1bb2bcda4f5eeabaf94db251ee791509c"}, + {file = "patsy-1.0.1.tar.gz", hash = "sha256:e786a9391eec818c054e359b737bbce692f051aee4c661f4141cc88fb459c0c4"}, ] [package.dependencies] numpy = ">=1.4" -six = "*" [package.extras] test = ["pytest", "pytest-cov", "scipy"] @@ -553,13 +555,13 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "pygments" -version = "2.18.0" +version = "2.19.1" description = "Pygments is a syntax highlighting package written in Python." 
optional = false python-versions = ">=3.8" files = [ - {file = "pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a"}, - {file = "pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199"}, + {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, + {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, ] [package.extras] @@ -567,13 +569,13 @@ windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pytest" -version = "8.3.3" +version = "8.3.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" files = [ - {file = "pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2"}, - {file = "pytest-8.3.3.tar.gz", hash = "sha256:70b98107bd648308a7952b06e6ca9a50bc660be218d53c257cc1fc94fda10181"}, + {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, + {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, ] [package.dependencies] @@ -650,29 +652,29 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "ruff" -version = "0.6.8" +version = "0.6.9" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.6.8-py3-none-linux_armv6l.whl", hash = "sha256:77944bca110ff0a43b768f05a529fecd0706aac7bcce36d7f1eeb4cbfca5f0f2"}, - {file = "ruff-0.6.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:27b87e1801e786cd6ede4ada3faa5e254ce774de835e6723fd94551464c56b8c"}, - {file = "ruff-0.6.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:cd48f945da2a6334f1793d7f701725a76ba93bf3d73c36f6b21fb04d5338dcf5"}, - {file = "ruff-0.6.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:677e03c00f37c66cea033274295a983c7c546edea5043d0c798833adf4cf4c6f"}, - {file = "ruff-0.6.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9f1476236b3eacfacfc0f66aa9e6cd39f2a624cb73ea99189556015f27c0bdeb"}, - {file = "ruff-0.6.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f5a2f17c7d32991169195d52a04c95b256378bbf0de8cb98478351eb70d526f"}, - {file = "ruff-0.6.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:5fd0d4b7b1457c49e435ee1e437900ced9b35cb8dc5178921dfb7d98d65a08d0"}, - {file = "ruff-0.6.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8034b19b993e9601f2ddf2c517451e17a6ab5cdb1c13fdff50c1442a7171d87"}, - {file = "ruff-0.6.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6cfb227b932ba8ef6e56c9f875d987973cd5e35bc5d05f5abf045af78ad8e098"}, - {file = "ruff-0.6.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ef0411eccfc3909269fed47c61ffebdcb84a04504bafa6b6df9b85c27e813b0"}, - {file = "ruff-0.6.8-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:007dee844738c3d2e6c24ab5bc7d43c99ba3e1943bd2d95d598582e9c1b27750"}, - {file = "ruff-0.6.8-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:ce60058d3cdd8490e5e5471ef086b3f1e90ab872b548814e35930e21d848c9ce"}, - {file = "ruff-0.6.8-py3-none-musllinux_1_2_i686.whl", hash = 
"sha256:1085c455d1b3fdb8021ad534379c60353b81ba079712bce7a900e834859182fa"}, - {file = "ruff-0.6.8-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:70edf6a93b19481affd287d696d9e311388d808671bc209fb8907b46a8c3af44"}, - {file = "ruff-0.6.8-py3-none-win32.whl", hash = "sha256:792213f7be25316f9b46b854df80a77e0da87ec66691e8f012f887b4a671ab5a"}, - {file = "ruff-0.6.8-py3-none-win_amd64.whl", hash = "sha256:ec0517dc0f37cad14a5319ba7bba6e7e339d03fbf967a6d69b0907d61be7a263"}, - {file = "ruff-0.6.8-py3-none-win_arm64.whl", hash = "sha256:8d3bb2e3fbb9875172119021a13eed38849e762499e3cfde9588e4b4d70968dc"}, - {file = "ruff-0.6.8.tar.gz", hash = "sha256:a5bf44b1aa0adaf6d9d20f86162b34f7c593bfedabc51239953e446aefc8ce18"}, + {file = "ruff-0.6.9-py3-none-linux_armv6l.whl", hash = "sha256:064df58d84ccc0ac0fcd63bc3090b251d90e2a372558c0f057c3f75ed73e1ccd"}, + {file = "ruff-0.6.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:140d4b5c9f5fc7a7b074908a78ab8d384dd7f6510402267bc76c37195c02a7ec"}, + {file = "ruff-0.6.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:53fd8ca5e82bdee8da7f506d7b03a261f24cd43d090ea9db9a1dc59d9313914c"}, + {file = "ruff-0.6.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:645d7d8761f915e48a00d4ecc3686969761df69fb561dd914a773c1a8266e14e"}, + {file = "ruff-0.6.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eae02b700763e3847595b9d2891488989cac00214da7f845f4bcf2989007d577"}, + {file = "ruff-0.6.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d5ccc9e58112441de8ad4b29dcb7a86dc25c5f770e3c06a9d57e0e5eba48829"}, + {file = "ruff-0.6.9-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:417b81aa1c9b60b2f8edc463c58363075412866ae4e2b9ab0f690dc1e87ac1b5"}, + {file = "ruff-0.6.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c866b631f5fbce896a74a6e4383407ba7507b815ccc52bcedabb6810fdb3ef7"}, + {file = "ruff-0.6.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b118afbb3202f5911486ad52da86d1d52305b59e7ef2031cea3425142b97d6f"}, + {file = "ruff-0.6.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a67267654edc23c97335586774790cde402fb6bbdb3c2314f1fc087dee320bfa"}, + {file = "ruff-0.6.9-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:3ef0cc774b00fec123f635ce5c547dac263f6ee9fb9cc83437c5904183b55ceb"}, + {file = "ruff-0.6.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:12edd2af0c60fa61ff31cefb90aef4288ac4d372b4962c2864aeea3a1a2460c0"}, + {file = "ruff-0.6.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:55bb01caeaf3a60b2b2bba07308a02fca6ab56233302406ed5245180a05c5625"}, + {file = "ruff-0.6.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:925d26471fa24b0ce5a6cdfab1bb526fb4159952385f386bdcc643813d472039"}, + {file = "ruff-0.6.9-py3-none-win32.whl", hash = "sha256:eb61ec9bdb2506cffd492e05ac40e5bc6284873aceb605503d8494180d6fc84d"}, + {file = "ruff-0.6.9-py3-none-win_amd64.whl", hash = "sha256:785d31851c1ae91f45b3d8fe23b8ae4b5170089021fbb42402d811135f0b7117"}, + {file = "ruff-0.6.9-py3-none-win_arm64.whl", hash = "sha256:a9641e31476d601f83cd602608739a0840e348bda93fec9f1ee816f8b6798b93"}, + {file = "ruff-0.6.9.tar.gz", hash = "sha256:b076ef717a8e5bc819514ee1d602bbdca5b4420ae13a9cf61a0c0a4f53a2baa2"}, ] [[package]] @@ -769,13 +771,13 @@ test = ["array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "mpmath", "po [[package]] name = "six" -version = "1.16.0" +version = "1.17.0" description = "Python 2 and 3 compatibility 
utilities" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ - {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, - {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, + {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, + {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, ] [[package]] @@ -954,41 +956,41 @@ test = ["pytest"] [[package]] name = "statsmodels" -version = "0.14.3" +version = "0.14.4" description = "Statistical computations and models for Python" optional = false python-versions = ">=3.9" files = [ - {file = "statsmodels-0.14.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7372c92f18b8afb06355e067285abb94e8b214afd9f2fda6d3c26f3ea004cbdf"}, - {file = "statsmodels-0.14.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:42459cdaafe217f455e6b95c05d9e089caf02dd53295aebe63bc1e0206f83176"}, - {file = "statsmodels-0.14.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a72d3d9fe61f70baf18667bc9cf2e68b6bdd8f5cce4f7b21f9e662e19d2ffdf"}, - {file = "statsmodels-0.14.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9050e5817f23a5adcb87822406b5260758795c42c41fa2fa60816023f0a0d8ef"}, - {file = "statsmodels-0.14.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f12d74743936323165dae648f75193ee4a47381a85610be661d34de56c7634e0"}, - {file = "statsmodels-0.14.3-cp310-cp310-win_amd64.whl", hash = "sha256:53212f597747534bed475bbd89f4bc39a3757c20692bb7664021e30fbd967c53"}, - {file = "statsmodels-0.14.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e49a63757e12269ef02841f05906e91bdb70f5bc358cbaca97f171f4a4de09c4"}, - {file = "statsmodels-0.14.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:de4b989f0fea684f89bdf5ff641f9acb7acddfd712459f28365904a974afaeff"}, - {file = "statsmodels-0.14.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45a5ae177e92348532bf2522f27feecd0589b88b243709b28e2b068631c9c181"}, - {file = "statsmodels-0.14.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a736ac24da1388e444bb2b0d381a7307b29074b237acef040a793cfdd508e160"}, - {file = "statsmodels-0.14.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ea8491b6a36fca738403037709e9469412a9d3e8a8e54db482c20e8dd70efa1f"}, - {file = "statsmodels-0.14.3-cp311-cp311-win_amd64.whl", hash = "sha256:efb946ced8243923eb78909834699be55442172cea3dc37158e3e1c5370e4189"}, - {file = "statsmodels-0.14.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9bf3690f71ebacff0c976c1584994174bc1bb72785b5a35645b385a00a5107e0"}, - {file = "statsmodels-0.14.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:197bcb1aeaaa5c7e9ba4ad87c2369f9600c6cd69d6e2db829eb46d3d9fe534c9"}, - {file = "statsmodels-0.14.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:492b8fd867687f9539b1f7f111dafb2464e04f65fa834585c08725b8aa1a3d98"}, - {file = "statsmodels-0.14.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a849e78dcb3ed6416bb9043b9549415f1f8cd00426deb467ff4dfe0acbaaad8e"}, - {file = "statsmodels-0.14.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e8a82aa8a99a428f39a9ead1b03fbd2339e40908412371abe089239d21467fd5"}, 
- {file = "statsmodels-0.14.3-cp312-cp312-win_amd64.whl", hash = "sha256:5724e51a370227655679f1a487f429919f03de325d7b5702e919526353d0cb1d"}, - {file = "statsmodels-0.14.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:78f579f8416b91b971fb0f27e18c3dec6946b4471ac2456a98dbfd24c72d180c"}, - {file = "statsmodels-0.14.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb84759e3c1a7b77cae4e7dfdc2ea09b1f1790446fd8476951680eb79e4a568d"}, - {file = "statsmodels-0.14.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e7681296373de33d775b01201c51e340d01afb70c6a5ac9b7c66a9e120564967"}, - {file = "statsmodels-0.14.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:988346db6319f0c12e6137df674e10ebf551adb42445e05eea2e1d900898f670"}, - {file = "statsmodels-0.14.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c69b82b4f2a794199d1100ab4406f761516f71826856fa6bfc474a3189b77785"}, - {file = "statsmodels-0.14.3-cp313-cp313-win_amd64.whl", hash = "sha256:5114e5c0f10ce30616ef4a91dc24e66e1033c242589208e604d80a7931537f12"}, - {file = "statsmodels-0.14.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:280e69721925a936493153dba692b53a2fe4e3f46e5fafd32a453f5d9fa2a344"}, - {file = "statsmodels-0.14.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:97f28958e456aea788d4ffd83d7ade82d2a4a3bd5c7e8eabf791f224cddef2bf"}, - {file = "statsmodels-0.14.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ef24d6350a15f5d25f7c6cb774fce89dff77e3687181ce4410cafd6a4004f04"}, - {file = "statsmodels-0.14.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ddbd07b7d05e16d1a2ea6df3d7e2255dfb3e0363b91d859623d9fc3aff32b4a"}, - {file = "statsmodels-0.14.3-cp39-cp39-win_amd64.whl", hash = "sha256:42dfb9084a5520342248441904357bd5d7fcf01ec05c9bdc7dd764a88e15a9c4"}, - {file = "statsmodels-0.14.3.tar.gz", hash = "sha256:ecf3502643fa93aabe5f0bdf238efb59609517c4d60a811632d31fcdce86c2d2"}, + {file = "statsmodels-0.14.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7a62f1fc9086e4b7ee789a6f66b3c0fc82dd8de1edda1522d30901a0aa45e42b"}, + {file = "statsmodels-0.14.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:46ac7ddefac0c9b7b607eed1d47d11e26fe92a1bc1f4d9af48aeed4e21e87981"}, + {file = "statsmodels-0.14.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a337b731aa365d09bb0eab6da81446c04fde6c31976b1d8e3d3a911f0f1e07b"}, + {file = "statsmodels-0.14.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:631bb52159117c5da42ba94bd94859276b68cab25dc4cac86475bc24671143bc"}, + {file = "statsmodels-0.14.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3bb2e580d382545a65f298589809af29daeb15f9da2eb252af8f79693e618abc"}, + {file = "statsmodels-0.14.4-cp310-cp310-win_amd64.whl", hash = "sha256:9729642884147ee9db67b5a06a355890663d21f76ed608a56ac2ad98b94d201a"}, + {file = "statsmodels-0.14.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ed7e118e6e3e02d6723a079b8c97eaadeed943fa1f7f619f7148dfc7862670f"}, + {file = "statsmodels-0.14.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f5f537f7d000de4a1708c63400755152b862cd4926bb81a86568e347c19c364b"}, + {file = "statsmodels-0.14.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa74aaa26eaa5012b0a01deeaa8a777595d0835d3d6c7175f2ac65435a7324d2"}, + {file = "statsmodels-0.14.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:e332c2d9b806083d1797231280602340c5c913f90d4caa0213a6a54679ce9331"}, + {file = "statsmodels-0.14.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d9c8fa28dfd75753d9cf62769ba1fecd7e73a0be187f35cc6f54076f98aa3f3f"}, + {file = "statsmodels-0.14.4-cp311-cp311-win_amd64.whl", hash = "sha256:a6087ecb0714f7c59eb24c22781491e6f1cfffb660b4740e167625ca4f052056"}, + {file = "statsmodels-0.14.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5221dba7424cf4f2561b22e9081de85f5bb871228581124a0d1b572708545199"}, + {file = "statsmodels-0.14.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:17672b30c6b98afe2b095591e32d1d66d4372f2651428e433f16a3667f19eabb"}, + {file = "statsmodels-0.14.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ab5e6312213b8cfb9dca93dd46a0f4dccb856541f91d3306227c3d92f7659245"}, + {file = "statsmodels-0.14.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4bbb150620b53133d6cd1c5d14c28a4f85701e6c781d9b689b53681effaa655f"}, + {file = "statsmodels-0.14.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb695c2025d122a101c2aca66d2b78813c321b60d3a7c86bb8ec4467bb53b0f9"}, + {file = "statsmodels-0.14.4-cp312-cp312-win_amd64.whl", hash = "sha256:7f7917a51766b4e074da283c507a25048ad29a18e527207883d73535e0dc6184"}, + {file = "statsmodels-0.14.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b5a24f5d2c22852d807d2b42daf3a61740820b28d8381daaf59dcb7055bf1a79"}, + {file = "statsmodels-0.14.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:df4f7864606fa843d7e7c0e6af288f034a2160dba14e6ccc09020a3cf67cb092"}, + {file = "statsmodels-0.14.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91341cbde9e8bea5fb419a76e09114e221567d03f34ca26e6d67ae2c27d8fe3c"}, + {file = "statsmodels-0.14.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1322286a7bfdde2790bf72d29698a1b76c20b8423a55bdcd0d457969d0041f72"}, + {file = "statsmodels-0.14.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e31b95ac603415887c9f0d344cb523889cf779bc52d68e27e2d23c358958fec7"}, + {file = "statsmodels-0.14.4-cp313-cp313-win_amd64.whl", hash = "sha256:81030108d27aecc7995cac05aa280cf8c6025f6a6119894eef648997936c2dd0"}, + {file = "statsmodels-0.14.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4793b01b7a5f5424f5a1dbcefc614c83c7608aa2b035f087538253007c339d5d"}, + {file = "statsmodels-0.14.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d330da34f59f1653c5193f9fe3a3a258977c880746db7f155fc33713ea858db5"}, + {file = "statsmodels-0.14.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e9ddefba1d4e1107c1f20f601b0581421ea3ad9fd75ce3c2ba6a76b6dc4682c"}, + {file = "statsmodels-0.14.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f43da7957e00190104c5dd0f661bfc6dfc68b87313e3f9c4dbd5e7d222e0aeb"}, + {file = "statsmodels-0.14.4-cp39-cp39-win_amd64.whl", hash = "sha256:8286f69a5e1d0e0b366ffed5691140c83d3efc75da6dbf34a3d06e88abfaaab6"}, + {file = "statsmodels-0.14.4.tar.gz", hash = "sha256:5d69e0f39060dc72c067f9bb6e8033b6dccdb0bae101d76a7ef0bcc94e898b67"}, ] [package.dependencies] @@ -1030,13 +1032,43 @@ files = [ [[package]] name = "tomli" -version = "2.0.2" +version = "2.2.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" files = [ - {file = "tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38"}, - {file = "tomli-2.0.2.tar.gz", hash = 
"sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed"}, + {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, + {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee"}, + {file = "tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106"}, + {file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8"}, + {file = "tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff"}, + {file = "tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b"}, + {file = "tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea"}, + {file = "tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222"}, + {file = "tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd"}, + {file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e"}, + {file = "tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98"}, + {file = "tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4"}, + {file = "tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7"}, + {file = "tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281"}, + {file = "tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2"}, + {file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744"}, + {file = "tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec"}, + {file = "tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69"}, + {file = "tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc"}, + {file = "tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff"}, ] [[package]] @@ -1052,13 +1084,13 @@ files = [ [[package]] name = "urllib3" -version = "2.2.3" +version = "2.3.0" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, - {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, + {file = "urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df"}, + {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, ] [package.extras] @@ -1069,13 +1101,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "zipp" -version = "3.20.2" +version = "3.21.0" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"}, - {file = "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"}, + {file = "zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931"}, + {file = "zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4"}, ] [package.extras] @@ -1089,4 +1121,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.9" -content-hash = "2237110d20a33b24e0f73322962a1ff2cc1411386309302759151579a3466605" +content-hash = "1f78a5242fda08ef8214967206a01644c613f94ef5111fc39ad8616fa925f8eb" diff --git a/pyproject.toml b/pyproject.toml index de9c70b7..854b4c41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ readme = "README.md" [tool.poetry.dependencies] python = ">=3.9" numpy = ">=1.14.0" -scikit-learn = ">=1.0.0" +scikit-learn = ">=1.0.0, <1.6.0" scipy = ">=1.0.0" statsmodels = ">=0.9.0" pandas = 
">=1.0.5" diff --git a/tests/test_encoders.py b/tests/test_encoders.py index 28487131..45d22d3d 100644 --- a/tests/test_encoders.py +++ b/tests/test_encoders.py @@ -702,9 +702,13 @@ def test_drop_invariant(self): ) y = [0, 0, 1, 1, 1] - for encoder_name in set(encoders.__all__) - { - 'CatBoostEncoder' - }: # CatBoost does not generally deliver a constant column when the feature is constant + # CatBoost does not generally deliver a constant column when the feature is constant + # ContrastCoding schemes will always ignore invariant columns, even if set to false + encoders_to_ignore = { + 'CatBoostEncoder', 'PolynomialEncoder', 'SumEncoder', + 'BackwardDifferenceEncoder', 'HelmertEncoder' + } + for encoder_name in set(encoders.__all__) - encoders_to_ignore: with self.subTest(encoder_name=encoder_name): enc1 = getattr(encoders, encoder_name)(drop_invariant=False) enc2 = getattr(encoders, encoder_name)(drop_invariant=True) diff --git a/tests/test_polynomial.py b/tests/test_polynomial.py index e2edd32f..e8370c7c 100644 --- a/tests/test_polynomial.py +++ b/tests/test_polynomial.py @@ -7,9 +7,9 @@ from tests.helpers import deep_round -a_encoding = [1, -0.7071067811865476, 0.40824829046386313] -b_encoding = [1, -5.551115123125783e-17, -0.8164965809277261] -c_encoding = [1, 0.7071067811865475, 0.4082482904638631] +a_encoding = [-0.7071067811865476, 0.40824829046386313] +b_encoding = [-5.551115123125783e-17, -0.8164965809277261] +c_encoding = [0.7071067811865475, 0.4082482904638631] class TestPolynomialEncoder(TestCase): @@ -18,7 +18,7 @@ class TestPolynomialEncoder(TestCase): def test_handle_missing_and_unknown(self): """Test that missing and unknown values are treated as values.""" train = ['A', 'B', 'C'] - expected_encoding_unknown = [1, 0, 0] + expected_encoding_unknown = [0, 0] expected_1 = [a_encoding, expected_encoding_unknown, expected_encoding_unknown] expected_2 = [b_encoding, expected_encoding_unknown, expected_encoding_unknown] expected_3 = [a_encoding, b_encoding, c_encoding, expected_encoding_unknown] @@ -44,9 +44,9 @@ def test_polynomial_encoder_2cols(self): obtained = encoder.transform(train) expected = [ - [1, a_encoding[1], a_encoding[2], a_encoding[1], a_encoding[2]], - [1, b_encoding[1], b_encoding[2], b_encoding[1], b_encoding[2]], - [1, c_encoding[1], c_encoding[2], c_encoding[1], c_encoding[2]], + a_encoding* 2, + b_encoding* 2, + c_encoding* 2, ] self.assertEqual(deep_round(obtained.to_numpy().tolist()), deep_round(expected)) @@ -62,7 +62,6 @@ def test_correct_order(self): columns=['col1', 'col2', 'col3', 'col4'], ) expected_columns = [ - 'intercept', 'col1', 'col2_0', 'col2_1', @@ -105,10 +104,10 @@ def test_handle_missing_is_indicator(self): expected = [a_encoding, b_encoding, c_encoding] self.assertEqual(deep_round(result.to_numpy().tolist()), deep_round(expected)) - # unknown value is encoded as zeros (only one at indicator) + # unknown value is encoded as zeros test = ['A', 'B', 'C'] result = encoder.transform(test) - expected = [a_encoding, b_encoding, [1, 0, 0]] + expected = [a_encoding, b_encoding, [0, 0]] self.assertEqual(deep_round(result.to_numpy().tolist()), deep_round(expected)) diff --git a/tests/test_sum_coding.py b/tests/test_sum_coding.py index ee5dcfd5..46347c05 100644 --- a/tests/test_sum_coding.py +++ b/tests/test_sum_coding.py @@ -5,9 +5,9 @@ import numpy as np import pandas as pd -a_encoding = [1, 1, 0] -b_encoding = [1, 0, 1] -c_encoding = [1, -1, -1] +a_encoding = [1, 0] +b_encoding = [0, 1] +c_encoding = [-1, -1] class TestSumEncoder(TestCase): @@ 
-20,14 +20,14 @@ def test_unknown_and_missing(self): encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) dim_1_test = ['A', 'D', 'E'] - dim_1_expected = [a_encoding, [1, 0, 0], [1, 0, 0]] + dim_1_expected = [a_encoding, [0, 0], [0, 0]] dim_2_test = ['B', 'D', 'E'] - dim_2_expected = [b_encoding, [1, 0, 0], [1, 0, 0]] + dim_2_expected = [b_encoding, [0, 0], [0, 0]] dim_3_test = ['A', 'B', 'C', None] - dim_3_expected = [a_encoding, b_encoding, c_encoding, [1, 0, 0]] + dim_3_expected = [a_encoding, b_encoding, c_encoding, [0, 0]] dim_4_test = ['D', 'B', 'C', None] - dim_4_expected = [[1, 0, 0], b_encoding, c_encoding, [1, 0, 0]] + dim_4_expected = [[0, 0], b_encoding, c_encoding, [0, 0]] cases = {"should preserve dimension 1": (dim_1_test, dim_1_expected), "should preserve dimension 2": (dim_2_test, dim_2_expected), "should preserve dimension 3": (dim_3_test, dim_3_expected), @@ -47,9 +47,9 @@ def test_sum_encoder_2cols(self): obtained = encoder.transform(train) expected = [ - [1, a_encoding[1], a_encoding[2], a_encoding[1], a_encoding[2]], - [1, b_encoding[1], b_encoding[2], b_encoding[1], b_encoding[2]], - [1, c_encoding[1], c_encoding[2], c_encoding[1], c_encoding[2]], + a_encoding * 2, + b_encoding * 2, + c_encoding * 2, ] self.assertEqual(obtained.to_numpy().tolist(), expected) @@ -65,7 +65,6 @@ def test_multiple_columns_correct_order(self): columns=['col1', 'col2', 'col3', 'col4'], ) expected_columns = [ - 'intercept', 'col1', 'col2_0', 'col2_1', @@ -108,9 +107,8 @@ def test_handle_missing_is_indicator(self): expected = [a_encoding, b_encoding, c_encoding] self.assertEqual(result.to_numpy().tolist(), expected) - # unknown value should be encoded with value strategy, - # i.e. indicator 1 and all other columns zeros + # unknown value should be encoded with value strategy, i.e. 
zeros for all columns test = ['A', 'B', 'C'] result = encoder.transform(test) - expected = [a_encoding, b_encoding, [1, 0, 0]] + expected = [a_encoding, b_encoding, [0, 0]] self.assertEqual(result.to_numpy().tolist(), expected) From e4980b52aef926b7bc841f11144c9d44cafa6140 Mon Sep 17 00:00:00 2001 From: PaulWestenthanner Date: Tue, 7 Jan 2025 17:50:07 +0100 Subject: [PATCH 3/3] Deprecate Python 3.9, add Python 3.13 --- .github/workflows/docs.yml | 8 +- .github/workflows/test-docs-build.yml | 7 +- .github/workflows/test-suite.yml | 2 +- poetry.lock | 228 ++++++++++++-------------- pyproject.toml | 2 +- 5 files changed, 115 insertions(+), 132 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index dcc7ef1f..84126aaf 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -14,10 +14,10 @@ jobs: python -m pip install --upgrade pip wheel python -m pip install poetry poetry install - - name: Build Docs - uses: ammaraskar/sphinx-action@master - with: - docs-folder: "docs/" + - name: Directly build docs + run: | + pip install -r docs/requirements.txt + sphinx-build docs/source ./docs/build/html/ - name: Deploy Docs uses: peaceiris/actions-gh-pages@v3 with: diff --git a/.github/workflows/test-docs-build.yml b/.github/workflows/test-docs-build.yml index 4a05c378..5335a910 100644 --- a/.github/workflows/test-docs-build.yml +++ b/.github/workflows/test-docs-build.yml @@ -13,6 +13,7 @@ jobs: - uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - uses: ammaraskar/sphinx-action@master - with: - docs-folder: "docs/" + - name: Directly build docs (sphinx-action plugin only supports Python 3.8) + run: | + pip install -r docs/requirements.txt + sphinx-build docs/source ./docs/build/html/ diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index 2d9a9bab..6ccdec7c 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.9', '3.10', '3.11', '3.12'] + python-version: ['3.10', '3.11', '3.12', '3.13'] steps: - uses: actions/checkout@v2 diff --git a/poetry.lock b/poetry.lock index 0e6a82fa..8b318f57 100644 --- a/poetry.lock +++ b/poetry.lock @@ -217,29 +217,6 @@ files = [ {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, ] -[[package]] -name = "importlib-metadata" -version = "8.5.0" -description = "Read metadata from Python packages" -optional = false -python-versions = ">=3.8" -files = [ - {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"}, - {file = "importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7"}, -] - -[package.dependencies] -zipp = ">=3.20" - -[package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] -cover = ["pytest-cov"] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -enabler = ["pytest-enabler (>=2.2)"] -perf = ["ipython"] -test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] -type = ["pytest-mypy"] - [[package]] name = "iniconfig" version = "2.0.0" @@ -351,56 +328,66 @@ files = [ [[package]] name = "numpy" -version = "2.0.2" +version = "2.2.1" description = "Fundamental package 
for array computing in Python" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" files = [ - {file = "numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece"}, - {file = "numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04"}, - {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66"}, - {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b"}, - {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd"}, - {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318"}, - {file = "numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8"}, - {file = "numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326"}, - {file = "numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97"}, - {file = "numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131"}, - {file = "numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448"}, - {file = "numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195"}, - {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57"}, - {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a"}, - {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669"}, - {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951"}, - {file = "numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9"}, - {file = "numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15"}, - {file = "numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4"}, - {file = "numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc"}, - {file = "numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b"}, - {file = "numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e"}, - {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c"}, - {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = 
"sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c"}, - {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692"}, - {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a"}, - {file = "numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c"}, - {file = "numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded"}, - {file = "numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5"}, - {file = "numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a"}, - {file = "numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c"}, - {file = "numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd"}, - {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b"}, - {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729"}, - {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1"}, - {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd"}, - {file = "numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d"}, - {file = "numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d"}, - {file = "numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa"}, - {file = "numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73"}, - {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8"}, - {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4"}, - {file = "numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c"}, - {file = "numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385"}, - {file = "numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78"}, + {file = "numpy-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5edb4e4caf751c1518e6a26a83501fda79bff41cc59dac48d70e6d65d4ec4440"}, + {file = "numpy-2.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aa3017c40d513ccac9621a2364f939d39e550c542eb2a894b4c8da92b38896ab"}, + {file = "numpy-2.2.1-cp310-cp310-macosx_14_0_arm64.whl", hash = 
"sha256:61048b4a49b1c93fe13426e04e04fdf5a03f456616f6e98c7576144677598675"}, + {file = "numpy-2.2.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:7671dc19c7019103ca44e8d94917eba8534c76133523ca8406822efdd19c9308"}, + {file = "numpy-2.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4250888bcb96617e00bfa28ac24850a83c9f3a16db471eca2ee1f1714df0f957"}, + {file = "numpy-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7746f235c47abc72b102d3bce9977714c2444bdfaea7888d241b4c4bb6a78bf"}, + {file = "numpy-2.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:059e6a747ae84fce488c3ee397cee7e5f905fd1bda5fb18c66bc41807ff119b2"}, + {file = "numpy-2.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f62aa6ee4eb43b024b0e5a01cf65a0bb078ef8c395e8713c6e8a12a697144528"}, + {file = "numpy-2.2.1-cp310-cp310-win32.whl", hash = "sha256:48fd472630715e1c1c89bf1feab55c29098cb403cc184b4859f9c86d4fcb6a95"}, + {file = "numpy-2.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:b541032178a718c165a49638d28272b771053f628382d5e9d1c93df23ff58dbf"}, + {file = "numpy-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:40f9e544c1c56ba8f1cf7686a8c9b5bb249e665d40d626a23899ba6d5d9e1484"}, + {file = "numpy-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f9b57eaa3b0cd8db52049ed0330747b0364e899e8a606a624813452b8203d5f7"}, + {file = "numpy-2.2.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:bc8a37ad5b22c08e2dbd27df2b3ef7e5c0864235805b1e718a235bcb200cf1cb"}, + {file = "numpy-2.2.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:9036d6365d13b6cbe8f27a0eaf73ddcc070cae584e5ff94bb45e3e9d729feab5"}, + {file = "numpy-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51faf345324db860b515d3f364eaa93d0e0551a88d6218a7d61286554d190d73"}, + {file = "numpy-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38efc1e56b73cc9b182fe55e56e63b044dd26a72128fd2fbd502f75555d92591"}, + {file = "numpy-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:31b89fa67a8042e96715c68e071a1200c4e172f93b0fbe01a14c0ff3ff820fc8"}, + {file = "numpy-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4c86e2a209199ead7ee0af65e1d9992d1dce7e1f63c4b9a616500f93820658d0"}, + {file = "numpy-2.2.1-cp311-cp311-win32.whl", hash = "sha256:b34d87e8a3090ea626003f87f9392b3929a7bbf4104a05b6667348b6bd4bf1cd"}, + {file = "numpy-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:360137f8fb1b753c5cde3ac388597ad680eccbbbb3865ab65efea062c4a1fd16"}, + {file = "numpy-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:694f9e921a0c8f252980e85bce61ebbd07ed2b7d4fa72d0e4246f2f8aa6642ab"}, + {file = "numpy-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3683a8d166f2692664262fd4900f207791d005fb088d7fdb973cc8d663626faa"}, + {file = "numpy-2.2.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:780077d95eafc2ccc3ced969db22377b3864e5b9a0ea5eb347cc93b3ea900315"}, + {file = "numpy-2.2.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:55ba24ebe208344aa7a00e4482f65742969a039c2acfcb910bc6fcd776eb4355"}, + {file = "numpy-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b1d07b53b78bf84a96898c1bc139ad7f10fda7423f5fd158fd0f47ec5e01ac7"}, + {file = "numpy-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5062dc1a4e32a10dc2b8b13cedd58988261416e811c1dc4dbdea4f57eea61b0d"}, + {file = "numpy-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:fce4f615f8ca31b2e61aa0eb5865a21e14f5629515c9151850aa936c02a1ee51"}, + {file = "numpy-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:67d4cda6fa6ffa073b08c8372aa5fa767ceb10c9a0587c707505a6d426f4e046"}, + {file = "numpy-2.2.1-cp312-cp312-win32.whl", hash = "sha256:32cb94448be47c500d2c7a95f93e2f21a01f1fd05dd2beea1ccd049bb6001cd2"}, + {file = "numpy-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:ba5511d8f31c033a5fcbda22dd5c813630af98c70b2661f2d2c654ae3cdfcfc8"}, + {file = "numpy-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f1d09e520217618e76396377c81fba6f290d5f926f50c35f3a5f72b01a0da780"}, + {file = "numpy-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3ecc47cd7f6ea0336042be87d9e7da378e5c7e9b3c8ad0f7c966f714fc10d821"}, + {file = "numpy-2.2.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f419290bc8968a46c4933158c91a0012b7a99bb2e465d5ef5293879742f8797e"}, + {file = "numpy-2.2.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:5b6c390bfaef8c45a260554888966618328d30e72173697e5cabe6b285fb2348"}, + {file = "numpy-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:526fc406ab991a340744aad7e25251dd47a6720a685fa3331e5c59fef5282a59"}, + {file = "numpy-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f74e6fdeb9a265624ec3a3918430205dff1df7e95a230779746a6af78bc615af"}, + {file = "numpy-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:53c09385ff0b72ba79d8715683c1168c12e0b6e84fb0372e97553d1ea91efe51"}, + {file = "numpy-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f3eac17d9ec51be534685ba877b6ab5edc3ab7ec95c8f163e5d7b39859524716"}, + {file = "numpy-2.2.1-cp313-cp313-win32.whl", hash = "sha256:9ad014faa93dbb52c80d8f4d3dcf855865c876c9660cb9bd7553843dd03a4b1e"}, + {file = "numpy-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:164a829b6aacf79ca47ba4814b130c4020b202522a93d7bff2202bfb33b61c60"}, + {file = "numpy-2.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4dfda918a13cc4f81e9118dea249e192ab167a0bb1966272d5503e39234d694e"}, + {file = "numpy-2.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:733585f9f4b62e9b3528dd1070ec4f52b8acf64215b60a845fa13ebd73cd0712"}, + {file = "numpy-2.2.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:89b16a18e7bba224ce5114db863e7029803c179979e1af6ad6a6b11f70545008"}, + {file = "numpy-2.2.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:676f4eebf6b2d430300f1f4f4c2461685f8269f94c89698d832cdf9277f30b84"}, + {file = "numpy-2.2.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f5cdf9f493b35f7e41e8368e7d7b4bbafaf9660cba53fb21d2cd174ec09631"}, + {file = "numpy-2.2.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1ad395cf254c4fbb5b2132fee391f361a6e8c1adbd28f2cd8e79308a615fe9d"}, + {file = "numpy-2.2.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:08ef779aed40dbc52729d6ffe7dd51df85796a702afbf68a4f4e41fafdc8bda5"}, + {file = "numpy-2.2.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:26c9c4382b19fcfbbed3238a14abf7ff223890ea1936b8890f058e7ba35e8d71"}, + {file = "numpy-2.2.1-cp313-cp313t-win32.whl", hash = "sha256:93cf4e045bae74c90ca833cba583c14b62cb4ba2cba0abd2b141ab52548247e2"}, + {file = "numpy-2.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:bff7d8ec20f5f42607599f9994770fa65d76edca264a87b5e4ea5629bce12268"}, + {file = "numpy-2.2.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = 
"sha256:7ba9cc93a91d86365a5d270dee221fdc04fb68d7478e6bf6af650de78a8339e3"}, + {file = "numpy-2.2.1-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:3d03883435a19794e41f147612a77a8f56d4e52822337844fff3d4040a142964"}, + {file = "numpy-2.2.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4511d9e6071452b944207c8ce46ad2f897307910b402ea5fa975da32e0102800"}, + {file = "numpy-2.2.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5c5cc0cbabe9452038ed984d05ac87910f89370b9242371bd9079cb4af61811e"}, + {file = "numpy-2.2.1.tar.gz", hash = "sha256:45681fd7128c8ad1c379f0ca0776a8b0c6583d2f69889ddac01559dfe4390918"}, ] [[package]] @@ -729,45 +716,60 @@ tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc ( [[package]] name = "scipy" -version = "1.13.1" +version = "1.15.0" description = "Fundamental algorithms for scientific computing in Python" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" files = [ - {file = "scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca"}, - {file = "scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f"}, - {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfa31f1def5c819b19ecc3a8b52d28ffdcc7ed52bb20c9a7589669dd3c250989"}, - {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26264b282b9da0952a024ae34710c2aff7d27480ee91a2e82b7b7073c24722f"}, - {file = "scipy-1.13.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:eccfa1906eacc02de42d70ef4aecea45415f5be17e72b61bafcfd329bdc52e94"}, - {file = "scipy-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:2831f0dc9c5ea9edd6e51e6e769b655f08ec6db6e2e10f86ef39bd32eb11da54"}, - {file = "scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:27e52b09c0d3a1d5b63e1105f24177e544a222b43611aaf5bc44d4a0979e32f9"}, - {file = "scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:54f430b00f0133e2224c3ba42b805bfd0086fe488835effa33fa291561932326"}, - {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89369d27f9e7b0884ae559a3a956e77c02114cc60a6058b4e5011572eea9299"}, - {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a78b4b3345f1b6f68a763c6e25c0c9a23a9fd0f39f5f3d200efe8feda560a5fa"}, - {file = "scipy-1.13.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:45484bee6d65633752c490404513b9ef02475b4284c4cfab0ef946def50b3f59"}, - {file = "scipy-1.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:5713f62f781eebd8d597eb3f88b8bf9274e79eeabf63afb4a737abc6c84ad37b"}, - {file = "scipy-1.13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5d72782f39716b2b3509cd7c33cdc08c96f2f4d2b06d51e52fb45a19ca0c86a1"}, - {file = "scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:017367484ce5498445aade74b1d5ab377acdc65e27095155e448c88497755a5d"}, - {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:949ae67db5fa78a86e8fa644b9a6b07252f449dcf74247108c50e1d20d2b4627"}, - {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ade0e53bc1f21358aa74ff4830235d716211d7d077e340c7349bc3542e884"}, - {file = "scipy-1.13.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:2ac65fb503dad64218c228e2dc2d0a0193f7904747db43014645ae139c8fad16"}, - {file = "scipy-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:cdd7dacfb95fea358916410ec61bbc20440f7860333aee6d882bb8046264e949"}, - {file = "scipy-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:436bbb42a94a8aeef855d755ce5a465479c721e9d684de76bf61a62e7c2b81d5"}, - {file = "scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:8335549ebbca860c52bf3d02f80784e91a004b71b059e3eea9678ba994796a24"}, - {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d533654b7d221a6a97304ab63c41c96473ff04459e404b83275b60aa8f4b7004"}, - {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637e98dcf185ba7f8e663e122ebf908c4702420477ae52a04f9908707456ba4d"}, - {file = "scipy-1.13.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a014c2b3697bde71724244f63de2476925596c24285c7a637364761f8710891c"}, - {file = "scipy-1.13.1-cp39-cp39-win_amd64.whl", hash = "sha256:392e4ec766654852c25ebad4f64e4e584cf19820b980bc04960bca0b0cd6eaa2"}, - {file = "scipy-1.13.1.tar.gz", hash = "sha256:095a87a0312b08dfd6a6155cbbd310a8c51800fc931b8c0b84003014b874ed3c"}, + {file = "scipy-1.15.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:aeac60d3562a7bf2f35549bdfdb6b1751c50590f55ce7322b4b2fc821dc27fca"}, + {file = "scipy-1.15.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:5abbdc6ede5c5fed7910cf406a948e2c0869231c0db091593a6b2fa78be77e5d"}, + {file = "scipy-1.15.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:eb1533c59f0ec6c55871206f15a5c72d1fae7ad3c0a8ca33ca88f7c309bbbf8c"}, + {file = "scipy-1.15.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:de112c2dae53107cfeaf65101419662ac0a54e9a088c17958b51c95dac5de56d"}, + {file = "scipy-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2240e1fd0782e62e1aacdc7234212ee271d810f67e9cd3b8d521003a82603ef8"}, + {file = "scipy-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d35aef233b098e4de88b1eac29f0df378278e7e250a915766786b773309137c4"}, + {file = "scipy-1.15.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1b29e4fc02e155a5fd1165f1e6a73edfdd110470736b0f48bcbe48083f0eee37"}, + {file = "scipy-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:0e5b34f8894f9904cc578008d1a9467829c1817e9f9cb45e6d6eeb61d2ab7731"}, + {file = "scipy-1.15.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:46e91b5b16909ff79224b56e19cbad65ca500b3afda69225820aa3afbf9ec020"}, + {file = "scipy-1.15.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:82bff2eb01ccf7cea8b6ee5274c2dbeadfdac97919da308ee6d8e5bcbe846443"}, + {file = "scipy-1.15.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:9c8254fe21dd2c6c8f7757035ec0c31daecf3bb3cffd93bc1ca661b731d28136"}, + {file = "scipy-1.15.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:c9624eeae79b18cab1a31944b5ef87aa14b125d6ab69b71db22f0dbd962caf1e"}, + {file = "scipy-1.15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d13bbc0658c11f3d19df4138336e4bce2c4fbd78c2755be4bf7b8e235481557f"}, + {file = "scipy-1.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdca4c7bb8dc41307e5f39e9e5d19c707d8e20a29845e7533b3bb20a9d4ccba0"}, + {file = "scipy-1.15.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f376d7c767731477bac25a85d0118efdc94a572c6b60decb1ee48bf2391a73b"}, + {file = "scipy-1.15.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:61513b989ee8d5218fbeb178b2d51534ecaddba050db949ae99eeb3d12f6825d"}, + {file = "scipy-1.15.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5beb0a2200372b7416ec73fdae94fe81a6e85e44eb49c35a11ac356d2b8eccc6"}, + {file = "scipy-1.15.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:fde0f3104dfa1dfbc1f230f65506532d0558d43188789eaf68f97e106249a913"}, + {file = "scipy-1.15.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:35c68f7044b4e7ad73a3e68e513dda946989e523df9b062bd3cf401a1a882192"}, + {file = "scipy-1.15.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:52475011be29dfcbecc3dfe3060e471ac5155d72e9233e8d5616b84e2b542054"}, + {file = "scipy-1.15.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5972e3f96f7dda4fd3bb85906a17338e65eaddfe47f750e240f22b331c08858e"}, + {file = "scipy-1.15.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00169cf875bed0b3c40e4da45b57037dc21d7c7bf0c85ed75f210c281488f1"}, + {file = "scipy-1.15.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:161f80a98047c219c257bf5ce1777c574bde36b9d962a46b20d0d7e531f86863"}, + {file = "scipy-1.15.0-cp312-cp312-win_amd64.whl", hash = "sha256:327163ad73e54541a675240708244644294cb0a65cca420c9c79baeb9648e479"}, + {file = "scipy-1.15.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0fcb16eb04d84670722ce8d93b05257df471704c913cb0ff9dc5a1c31d1e9422"}, + {file = "scipy-1.15.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:767e8cf6562931f8312f4faa7ddea412cb783d8df49e62c44d00d89f41f9bbe8"}, + {file = "scipy-1.15.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:37ce9394cdcd7c5f437583fc6ef91bd290014993900643fdfc7af9b052d1613b"}, + {file = "scipy-1.15.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:6d26f17c64abd6c6c2dfb39920f61518cc9e213d034b45b2380e32ba78fde4c0"}, + {file = "scipy-1.15.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e2448acd79c6374583581a1ded32ac71a00c2b9c62dfa87a40e1dd2520be111"}, + {file = "scipy-1.15.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36be480e512d38db67f377add5b759fb117edd987f4791cdf58e59b26962bee4"}, + {file = "scipy-1.15.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ccb6248a9987193fe74363a2d73b93bc2c546e0728bd786050b7aef6e17db03c"}, + {file = "scipy-1.15.0-cp313-cp313-win_amd64.whl", hash = "sha256:952d2e9eaa787f0a9e95b6e85da3654791b57a156c3e6609e65cc5176ccfe6f2"}, + {file = "scipy-1.15.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:b1432102254b6dc7766d081fa92df87832ac25ff0b3d3a940f37276e63eb74ff"}, + {file = "scipy-1.15.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:4e08c6a36f46abaedf765dd2dfcd3698fa4bd7e311a9abb2d80e33d9b2d72c34"}, + {file = "scipy-1.15.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:ec915cd26d76f6fc7ae8522f74f5b2accf39546f341c771bb2297f3871934a52"}, + {file = "scipy-1.15.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:351899dd2a801edd3691622172bc8ea01064b1cada794f8641b89a7dc5418db6"}, + {file = "scipy-1.15.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9baff912ea4f78a543d183ed6f5b3bea9784509b948227daaf6f10727a0e2e5"}, + {file = "scipy-1.15.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cd9d9198a7fd9a77f0eb5105ea9734df26f41faeb2a88a0e62e5245506f7b6df"}, + {file = "scipy-1.15.0-cp313-cp313t-win_amd64.whl", hash = "sha256:129f899ed275c0515d553b8d31696924e2ca87d1972421e46c376b9eb87de3d2"}, + {file = "scipy-1.15.0.tar.gz", hash = 
"sha256:300742e2cc94e36a2880ebe464a1c8b4352a7b0f3e36ec3d2ac006cdbe0219ac"}, ] [package.dependencies] -numpy = ">=1.22.4,<2.3" +numpy = ">=1.23.5,<2.5" [package.extras] -dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pydevtool", "rich-click", "ruff", "types-psutil", "typing_extensions"] -doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.12.0)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0)", "sphinx-design (>=0.4.0)"] -test = ["array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] +dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"] +doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.16.5)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.0.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)"] +test = ["Cython", "array-api-strict (>=2.0,<2.1.1)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] [[package]] name = "six" @@ -808,7 +810,6 @@ babel = ">=2.13" colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\""} docutils = ">=0.20,<0.22" imagesize = ">=1.3" -importlib-metadata = {version = ">=6.0", markers = "python_version < \"3.10\""} Jinja2 = ">=3.1" packaging = ">=23.0" Pygments = ">=2.17" @@ -1099,26 +1100,7 @@ h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] -[[package]] -name = "zipp" -version = "3.21.0" -description = "Backport of pathlib-compatible object wrapper for zip files" -optional = false -python-versions = ">=3.9" -files = [ - {file = "zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931"}, - {file = "zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4"}, -] - -[package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] -cover = ["pytest-cov"] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -enabler = ["pytest-enabler (>=2.2)"] -test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] -type = ["pytest-mypy"] - [metadata] lock-version = "2.0" -python-versions = ">=3.9" -content-hash = "1f78a5242fda08ef8214967206a01644c613f94ef5111fc39ad8616fa925f8eb" +python-versions = ">=3.10" +content-hash = "87a16457e15b5435f18630e5972acbc923ebf55cd3654e5ffa4786cba5e340b0" diff --git a/pyproject.toml b/pyproject.toml index 854b4c41..d2ff53eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ license = "BSD-3" readme = "README.md" [tool.poetry.dependencies] -python = ">=3.9" +python = ">=3.10" numpy = ">=1.14.0" scikit-learn = ">=1.0.0, <1.6.0" scipy = ">=1.0.0"