Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
- name: Directly build docs
run: |
pip install -r docs/requirements.txt
sphinx-build -D docs/source ./docs/build/html/
sphinx-build docs/source ./docs/build/html/
- name: Deploy Docs
uses: peaceiris/actions-gh-pages@v3
with:
Expand Down
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
v.2.8.0
=======

* Fix: Support new concept of sklearn tags, now requiring sklearn >= 1.6.0
* Fix: Docs deployment

v.2.7.0
==========
=======

* Refactor: Use poetry as packaging tool
* Refactor: Add more typing
Expand Down
10 changes: 8 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,25 @@ The preferred workflow to contribute to git-pandas is:
Guidelines
==========

This is still a very young project, but we do have a few guiding principles:

1. Maintain semantics of the scikit-learn API
2. Write detailed docstrings in numpy format
3. Support pandas dataframes and numpy arrays as inputs
4. Write tests

Styleguide:

We're using ruff for linting. Rules are implemented in the `pyproject.toml` file. To run the linter, use:

$ poetry run ruff check category_encoders --fix


Running Tests
=============

To run the tests, use:

$ pytest
$ poetry run pytest tests/

Easy Issues / Getting Started
=============================
Expand Down
2 changes: 1 addition & 1 deletion category_encoders/base_contrast_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
__author__ = 'paulwestenthanner'


class BaseContrastEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
class BaseContrastEncoder(util.UnsupervisedTransformerMixin, util.BaseEncoder):
"""Base class for various contrast encoders.

Parameters
Expand Down
2 changes: 1 addition & 1 deletion category_encoders/basen.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def _ceillogint(n, base):
return ret


class BaseNEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
class BaseNEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
"""Base-N encoder encodes the categories into arrays of their base-N representation.

A base of 1 is equivalent to one-hot encoding (not really base-1, but useful),
Expand Down
8 changes: 4 additions & 4 deletions category_encoders/cat_boost.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
__author__ = 'Jan Motl'


class CatBoostEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
class CatBoostEncoder(util.SupervisedTransformerMixin, util.BaseEncoder):
"""CatBoost Encoding for categorical features.

Supported targets: binomial and continuous.
Expand Down Expand Up @@ -202,10 +202,10 @@ def _transform(self, X, y=None):

return X

def _more_tags(self) -> dict[str, bool]:
def __sklearn_tags__(self) -> util.EncoderTags:
"""Set scikit transformer tags."""
tags = super()._more_tags()
tags['predict_depends_on_y'] = True
tags = super().__sklearn_tags__()
tags.predict_depends_on_y = True
return tags

def _fit_column_map(self, series, y):
Expand Down
2 changes: 1 addition & 1 deletion category_encoders/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
__author__ = 'joshua t. dunn'


class CountEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
class CountEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
"""Count encoding for categorical features.

For a given categorical feature, replace the names of the groups with the group counts.
Expand Down
8 changes: 4 additions & 4 deletions category_encoders/glmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
__author__ = 'Jan Motl'


class GLMMEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
class GLMMEncoder( util.SupervisedTransformerMixin ,util.BaseEncoder):
"""Generalized linear mixed model.

Supported targets: binomial and continuous.
Expand Down Expand Up @@ -164,10 +164,10 @@ def _transform(self, X, y=None):
X = self._score(X, y)
return X

def _more_tags(self) -> dict[str, bool]:
def __sklearn_tags__(self) -> util.EncoderTags:
"""Set scikit transformer tags."""
tags = super()._more_tags()
tags['predict_depends_on_y'] = True
tags = super().__sklearn_tags__()
tags.predict_depends_on_y = True
return tags

def _train(self, X, y):
Expand Down
2 changes: 1 addition & 1 deletion category_encoders/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
__author__ = 'willmcginnis', 'LiuShulun'


class HashingEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
class HashingEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
"""A multivariate hashing implementation with configurable dimensionality/precision.

The advantage of this encoder is that it does not maintain a dictionary of observed categories.
Expand Down
8 changes: 4 additions & 4 deletions category_encoders/james_stein.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
__author__ = 'Jan Motl'


class JamesSteinEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
class JamesSteinEncoder( util.SupervisedTransformerMixin,util.BaseEncoder):
"""James-Stein estimator.

Supported targets: binomial and continuous.
Expand Down Expand Up @@ -228,10 +228,10 @@ def _transform(self, X, y=None):
X = self._score(X, y)
return X

def _more_tags(self) -> dict[str, bool]:
def __sklearn_tags__(self) -> util.EncoderTags:
"""Set scikit transformer tags."""
tags = super()._more_tags()
tags['predict_depends_on_y'] = True
tags = super().__sklearn_tags__()
tags.predict_depends_on_y = True
return tags

def _train_pooled(self, X, y):
Expand Down
8 changes: 4 additions & 4 deletions category_encoders/leave_one_out.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
__author__ = 'hbghhy'


class LeaveOneOutEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
class LeaveOneOutEncoder( util.SupervisedTransformerMixin,util.BaseEncoder):
"""Leave one out coding for categorical features.

This is very similar to target encoding but excludes the current row's
Expand Down Expand Up @@ -124,10 +124,10 @@ def _transform(self, X, y=None):
X = self.transform_leave_one_out(X, y, mapping=self.mapping)
return X

def _more_tags(self) -> dict[str, bool]:
def __sklearn_tags__(self) -> util.EncoderTags:
"""Set scikit transformer tags."""
tags = super()._more_tags()
tags['predict_depends_on_y'] = True
tags = super().__sklearn_tags__()
tags.predict_depends_on_y = True
return tags

def fit_leave_one_out(
Expand Down
8 changes: 4 additions & 4 deletions category_encoders/m_estimate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
__author__ = 'Jan Motl'


class MEstimateEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
class MEstimateEncoder( util.SupervisedTransformerMixin,util.BaseEncoder):
"""M-probability estimate of likelihood.

Supported targets: binomial and continuous.
Expand Down Expand Up @@ -150,10 +150,10 @@ def _transform(self, X, y=None):
X = self._score(X, y)
return X

def _more_tags(self) -> dict[str, bool]:
def __sklearn_tags__(self) -> util.EncoderTags:
"""Set scikit transformer tags."""
tags = super()._more_tags()
tags['predict_depends_on_y'] = True
tags = super().__sklearn_tags__()
tags.predict_depends_on_y = True
return tags

def _train(self, X, y):
Expand Down
2 changes: 1 addition & 1 deletion category_encoders/one_hot.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
__author__ = 'willmcginnis'


class OneHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
class OneHotEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
"""Onehot (or dummy) coding for categorical features, produces a binary feature per category.

Parameters
Expand Down
2 changes: 1 addition & 1 deletion category_encoders/ordinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
__author__ = 'willmcginnis'


class OrdinalEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
class OrdinalEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
"""Encodes categorical features as ordinal, in one ordered feature.

Ordinal encoding uses a single column of integers to represent the classes.
Expand Down
20 changes: 18 additions & 2 deletions category_encoders/quantile_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from category_encoders.ordinal import OrdinalEncoder


class QuantileEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
class QuantileEncoder(util.SupervisedTransformerMixin, util.BaseEncoder):
"""Quantile Encoding for categorical features.

This a statistically modified version of target MEstimate encoder where selected features
Expand Down Expand Up @@ -204,7 +204,7 @@ def quantile_encode(self, X_in: pd.DataFrame) -> pd.DataFrame:


# todo does not fit in schema since it is an ensemble of other encoders
class SummaryEncoder(BaseEstimator, util.TransformerWithTargetMixin):
class SummaryEncoder(BaseEstimator):
"""Summary Encoding for categorical features.

It's an encoder designed for creating richer representations by applying quantile
Expand Down Expand Up @@ -418,6 +418,22 @@ def transform(
else:
return transformed_df.to_numpy()

def __sklearn_tags__(self) -> util.EncoderTags:
    """Return the sklearn tags for this encoder, marking the target as required."""
    base_tags = super().__sklearn_tags__()
    # Promote the plain sklearn Tags object to the encoder-specific variant.
    encoder_tags = util.EncoderTags.from_sk_tags(base_tags)
    encoder_tags.target_tags.required = True
    return encoder_tags

def fit_transform(self, X: util.X_type, y: util.y_type | None = None):
    """Fit and transform using target.

    This also uses the target for transforming, not only for training,
    so the target may not be omitted.
    """
    if y is None:
        raise TypeError('fit_transform() missing argument: y')
    fitted = self.fit(X, y)
    return fitted.transform(X, y)

def get_feature_names(self) -> np.ndarray:
"""Deprecated method to get feature names. Use `get_feature_names_out` instead."""
msg = (
Expand Down
2 changes: 1 addition & 1 deletion category_encoders/rankhot.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from category_encoders import OrdinalEncoder


class RankHotEncoder(util.BaseEncoder, util.UnsupervisedTransformerMixin):
class RankHotEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
"""Rank Hot Encoder.

The rank-hot encoder is similar to a one-hot encoder,
Expand Down
2 changes: 1 addition & 1 deletion category_encoders/target_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
__author__ = 'chappers'


class TargetEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
class TargetEncoder( util.SupervisedTransformerMixin,util.BaseEncoder):
"""Target encoding for categorical features.

Supported targets: binomial and continuous.
Expand Down
45 changes: 24 additions & 21 deletions category_encoders/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import warnings
from abc import abstractmethod
from dataclasses import dataclass, fields
from enum import Enum, auto
from typing import Hashable, Sequence

Expand All @@ -16,6 +17,7 @@
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import Tags

__author__ = 'willmcginnis'

Expand Down Expand Up @@ -345,6 +347,21 @@ def get_docstring_output_shape(in_out_relation: EncodingRelation) -> str:
return 'M features (M can be anything)'


@dataclass
class EncoderTags(Tags):
    """Sklearn ``Tags`` extended with encoder-specific flags."""

    # True when transform() output depends on the target values as well
    # (e.g. leave-one-out style encoders).
    predict_depends_on_y: bool = False

    @classmethod
    def from_sk_tags(cls, tags: Tags) -> EncoderTags:
        """Create an EncoderTags instance by shallow-copying every field of *tags*.

        A shallow per-field copy is used on purpose: ``dataclasses.asdict``
        would recurse into nested tag dataclasses and turn them into dicts.
        """
        kwargs = {}
        for field in fields(tags):
            kwargs[field.name] = getattr(tags, field.name)
        return cls(**kwargs)

class BaseEncoder(BaseEstimator):
"""BaseEstimator class for all encoders.

Expand Down Expand Up @@ -437,7 +454,7 @@ def fit(self, X: X_type, y: y_type | None = None, **kwargs):
self.feature_names_in_ = X.columns.tolist()
self.n_features_in_ = len(self.feature_names_in_)

if self._get_tags().get('supervised_encoder'):
if self.__sklearn_tags__().target_tags.required:
if not is_numeric_dtype(y):
self.lab_encoder_ = LabelEncoder()
y = self.lab_encoder_.fit_transform(y)
Expand Down Expand Up @@ -475,7 +492,7 @@ def fit(self, X: X_type, y: y_type | None = None, **kwargs):
return self

def _check_fit_inputs(self, X: X_type, y: y_type) -> None:
if self._get_tags().get('supervised_encoder'):
if self.__sklearn_tags__().target_tags.required:
if y is None:
raise ValueError(
'Supervised encoders need a target for the fitting. The target cannot be None'
Expand Down Expand Up @@ -573,9 +590,12 @@ def _fit(self, X: pd.DataFrame, y: pd.Series | None, **kwargs): ...
class SupervisedTransformerMixin(sklearn.base.TransformerMixin):
"""Mixin for supervised transformers (with target)."""

def _more_tags(self) -> dict[str, bool]:
def __sklearn_tags__(self) -> EncoderTags:
"""Set scikit transformer tags."""
return {'supervised_encoder': True}
sk_tags = super().__sklearn_tags__()
tags = EncoderTags.from_sk_tags(sk_tags)
tags.target_tags.required = True
return tags

def transform(self, X: X_type, y: y_type | None = None, override_return_df: bool = False):
"""Perform the transformation to new categorical data.
Expand Down Expand Up @@ -653,20 +673,3 @@ def transform(self, X: X_type, override_return_df: bool = False):

@abstractmethod
def _transform(self, X: pd.DataFrame) -> pd.DataFrame: ...


class TransformerWithTargetMixin:
    """Mixin for transformers whose transform step also consumes the target."""

    def _more_tags(self) -> dict[str, bool]:
        """Flag this transformer as a supervised encoder via sklearn's tag system."""
        tags = dict.fromkeys(['supervised_encoder'], True)
        return tags

    def fit_transform(self, X: X_type, y: y_type | None = None, **fit_params):
        """Fit and transform using target.

        This also uses the target for transforming, not only for training,
        so ``y`` is mandatory here.
        """
        if y is None:
            raise TypeError('fit_transform() missing argument: y')
        fitted = self.fit(X, y, **fit_params)
        return fitted.transform(X, y)
2 changes: 1 addition & 1 deletion category_encoders/woe.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
__author__ = 'Jan Motl'


class WOEEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
class WOEEncoder( util.SupervisedTransformerMixin,util.BaseEncoder):
"""Weight of Evidence coding for categorical features.

Supported targets: binomial. For polynomial target support, see PolynomialWrapper.
Expand Down
Loading
Loading