4 changes: 2 additions & 2 deletions category_encoders/base_contrast_encoder.py
@@ -100,7 +100,7 @@ def _transform(self, X) -> pd.DataFrame:
X = self.ordinal_encoder.transform(X)
if self.handle_unknown == 'error':
if X[self.cols].isin([-1]).any().any():
- raise ValueError('Columns to be encoded can not contain new values')
+ raise ValueError('Columns to be encoded cannot contain new values')

X = self.transform_contrast_coding(X, mapping=self.mapping)
return X
@@ -124,7 +124,7 @@ def fit_contrast_coding(
handle_missing: str
How to handle missing values.
handle_unknown: str
- How to hande unkown values.
+ How to handle unknown values.

Returns
-------
2 changes: 1 addition & 1 deletion category_encoders/basen.py
@@ -187,7 +187,7 @@ def _transform(self, X):

if self.handle_unknown == 'error':
if X_out[self.cols].isin([-1]).any().any():
- raise ValueError('Columns to be encoded can not contain new values')
+ raise ValueError('Columns to be encoded cannot contain new values')

X_out = self.basen_encode(X_out, cols=self.cols)
return X_out
2 changes: 1 addition & 1 deletion category_encoders/cat_boost.py
@@ -157,7 +157,7 @@ def _transform(self, X, y=None):
is_unknown_value = X[col].isin(unseen_values.dropna().astype(object))

if self.handle_unknown == 'error' and is_unknown_value.any():
- raise ValueError('Columns to be encoded can not contain new values')
+ raise ValueError('Columns to be encoded cannot contain new values')

if (
y is None
2 changes: 1 addition & 1 deletion category_encoders/glmm.py
@@ -28,7 +28,7 @@ class GLMMEncoder( util.SupervisedTransformerMixin ,util.BaseEncoder):
Mixed effects models are a mature branch of statistics.
2. No hyper-parameters to tune. The amount of shrinkage is automatically determined
through the estimation process. In short, the less observations a category has and/or
- the more the outcome varies for a category. Then the higher the regularization
+ the more that outcome varies for a category, the higher the regularization
towards "the prior" or "grand mean".
3. The technique is applicable for both continuous and binomial targets.
If the target is continuous, the encoder returns regularized difference of the
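The corrected GLMM docstring describes shrinkage that is determined automatically: categories with few or noisy observations are pulled towards the grand mean. A minimal usage sketch of that behaviour, with made-up data (not part of this PR):

```python
import pandas as pd
import category_encoders as ce

X = pd.DataFrame({'city': ['a'] * 4 + ['b']})
y = pd.Series([10.0, 12.0, 11.0, 9.0, 100.0])

# 'b' has a single, extreme observation, so its encoding is regularized
# towards the prior / grand mean rather than towards 100.
encoded = ce.GLMMEncoder().fit_transform(X, y)
print(encoded)
```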
8 changes: 4 additions & 4 deletions category_encoders/hashing.py
@@ -228,11 +228,11 @@ def hash_chunk(hash_method: str, np_df: np.ndarray, N: int) -> np.ndarray:
for val in row:
if val is not None:
hasher = hasher_constructor()
- # Computes an integer index from the hasher digest. The endian is
- # "big" as the code use to read:
+ # Computes an integer index from the hasher digest. The endianness is
+ # "big" as the code used to read:
# column_index = int(hasher.hexdigest(), 16) % N
- # which is implicitly considering the hexdigest to be big endian,
- # even if the system is little endian.
+ # which is implicitly considering the hexdigest to be big-endian,
+ # even if the system is little-endian.
# Building the index that way is about 30% faster than using the
# hexdigest.
hasher.update(bytes(str(val), 'utf-8'))
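The reworded comment explains that the column index is built from the raw digest interpreted as a big-endian integer, matching what `int(hasher.hexdigest(), 16) % N` used to compute. A small sketch of that equivalence, using only hashlib with hypothetical values:

```python
import hashlib

N = 8  # hypothetical number of hashing components
hasher = hashlib.md5()
hasher.update(bytes(str('some categorical value'), 'utf-8'))

# hexdigest() is the big-endian hex rendering of digest(), so both routes
# give the same column index; the bytes route skips the hex parsing step.
index_from_digest = int.from_bytes(hasher.digest(), byteorder='big') % N
index_from_hexdigest = int(hasher.hexdigest(), 16) % N
assert index_from_digest == index_from_hexdigest
```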
4 changes: 2 additions & 2 deletions category_encoders/leave_one_out.py
@@ -161,7 +161,7 @@ def _fit_column_map(series: pd.Series, y: pd.Series) -> pd.Series:
def transform_leave_one_out(self, X: pd.DataFrame, y: pd.Series | None, mapping=None):
"""Apply leave-one-out-encoding to a dataframe.

- If a target is given the lable-mean is calculated without the target (left out).
+ If a target is given the label-mean is calculated without the target (left out).
Otherwise, the label mean from the fit step is taken.
"""
random_state_ = check_random_state(self.random_state)
@@ -184,7 +184,7 @@ def transform_leave_one_out(self, X: pd.DataFrame, y: pd.Series | None, mapping=
X[col] = X[col].astype(index_dtype)

if self.handle_unknown == 'error' and is_unknown_value.any():
- raise ValueError('Columns to be encoded can not contain new values')
+ raise ValueError('Columns to be encoded cannot contain new values')

if (
y is None
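The fixed docstring states that, when a target is supplied, each row is encoded with the label mean of its category computed without that row. A toy pandas illustration of that rule (not the library's implementation):

```python
import pandas as pd

df = pd.DataFrame({'cat': ['a', 'a', 'a', 'b', 'b'],
                   'y':   [1.0, 2.0, 3.0, 10.0, 20.0]})

grp = df.groupby('cat')['y']
# leave-one-out mean: category sum minus the row's own target,
# divided by the remaining count
loo_mean = (grp.transform('sum') - df['y']) / (grp.transform('count') - 1)
print(loo_mean)  # first 'a' row -> (2 + 3) / 2 = 2.5, first 'b' row -> 20.0
```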
4 changes: 2 additions & 2 deletions category_encoders/one_hot.py
@@ -40,7 +40,7 @@ class OneHotEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
handle_missing: str
options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'.

- 'error' will raise a `ValueError` if missings are encountered.
+ 'error' will raise a `ValueError` if a missing value is encountered.
'return_nan' will encode a missing value as `np.nan` in every dummy column.
'value' will encode a missing value as 0 in every dummy column.
'indicator' will treat missingness as its own category, adding an additional dummy column
@@ -227,7 +227,7 @@ def _transform(self, X):

if self.handle_unknown == 'error':
if X[self.cols].isin([-1]).any().any():
- raise ValueError('Columns to be encoded can not contain new values')
+ raise ValueError('Columns to be encoded cannot contain new values')

X = self.get_dummies(X)
return X
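The clarified handle_missing documentation lists four behaviours. A short usage sketch contrasting two of them on made-up data, following the documented options above:

```python
import numpy as np
import pandas as pd
import category_encoders as ce

X = pd.DataFrame({'color': ['red', 'blue', np.nan]})

# 'value' (default): the missing row is encoded as 0 in every dummy column.
print(ce.OneHotEncoder(handle_missing='value').fit_transform(X))

# 'indicator': missingness gets its own additional dummy column.
print(ce.OneHotEncoder(handle_missing='indicator').fit_transform(X))
```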
8 changes: 4 additions & 4 deletions category_encoders/rankhot.py
@@ -32,9 +32,9 @@ class RankHotEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
handle_unknown: str
options are 'error', 'value', 'return_nan'.
The default is 'value'.
- 'value': If an unknown label occurrs, it is represented as 0 array.
- 'error': If an unknown label occurrs, error message is displayed.
- 'return_nan': If an unknown label occurrs, np.nan is returned in all columns.
+ 'value': If an unknown label occurs, it is represented as 0 array.
+ 'error': If an unknown label occurs, error message is displayed.
+ 'return_nan': If an unknown label occurs, np.nan is returned in all columns.
handle_missing: str
options are 'error', 'value' and 'return_nan'. The default is 'value'.
Missing value also considered as unknown value in the final data set.
@@ -146,7 +146,7 @@ def _transform(self, X_in: pd.DataFrame, override_return_df: bool = False) -> pd

if self.handle_unknown == 'error':
if X[self.cols].isin([-1]).any().any():
- raise ValueError('Columns to be encoded can not contain new values')
+ raise ValueError('Columns to be encoded cannot contain new values')

for switch, _ordinal_switch in zip(
self.mapping, self.ordinal_encoder.category_mapping, strict=False
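The corrected handle_unknown entries describe three behaviours for unseen labels. A brief sketch of the 'value' option on hypothetical data:

```python
import pandas as pd
import category_encoders as ce

train = pd.DataFrame({'grade': ['low', 'medium', 'high']})
test = pd.DataFrame({'grade': ['medium', 'unseen']})

encoder = ce.RankHotEncoder(handle_unknown='value')
encoder.fit(train)
# per the docstring above, the unseen label is represented as an all-zero row
print(encoder.transform(test))
```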
2 changes: 1 addition & 1 deletion category_encoders/target_encoder.py
@@ -301,7 +301,7 @@ def target_encode(self, X_in: pd.DataFrame) -> pd.DataFrame:
"""Apply target encoding via encoder mapping."""
X = X_in.copy(deep=True)

- # Was not mapping extra columns as self.featuer_names_in did not include new column
+ # Was not mapping extra columns as self.feature_names_in did not include new column
for col in self.cols:
X[col] = X[col].map(self.mapping[col])

6 changes: 3 additions & 3 deletions category_encoders/utils.py
@@ -469,7 +469,7 @@ def fit(self, X: X_type, y: y_type | None = None, **kwargs):

if self.handle_missing == 'error':
if X[self.cols].isna().any().any():
- raise ValueError('Columns to be encoded can not contain null')
+ raise ValueError('Columns to be encoded cannot contain null')

self._fit(X, y, **kwargs)

@@ -504,7 +504,7 @@ def _check_fit_inputs(self, X: X_type, y: y_type) -> None:
def _check_transform_inputs(self, df: pd.DataFrame) -> None:
if self.handle_missing == 'error':
if df[self.cols].isna().any().any():
- raise ValueError('Columns to be encoded can not contain null')
+ raise ValueError('Columns to be encoded cannot contain null')

if self._dim is None:
raise NotFittedError('Must train encoder before it can be used to transform data.')
@@ -600,7 +600,7 @@ def __sklearn_tags__(self) -> EncoderTags:
def transform(self, X: X_type, y: y_type | None = None, override_return_df: bool = False):
"""Perform the transformation to new categorical data.

- Some encoders behave differently on whether y is given or not.
+ Some encoders behave differently on whether or not y is given.
This is mainly due to regularisation in order to avoid overfitting.
On training data transform should be called with y, on test data without.

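The reworded transform docstring documents the convention that supervised encoders receive y when transforming training data and no y on test data. A usage sketch of that pattern with LeaveOneOutEncoder on hypothetical data:

```python
import pandas as pd
import category_encoders as ce

X_train = pd.DataFrame({'city': ['a', 'b', 'a', 'b']})
y_train = pd.Series([1, 0, 1, 1])
X_test = pd.DataFrame({'city': ['a', 'b']})

encoder = ce.LeaveOneOutEncoder(cols=['city'])
encoder.fit(X_train, y_train)

X_train_enc = encoder.transform(X_train, y_train)  # training data: pass y for regularisation
X_test_enc = encoder.transform(X_test)             # test data: no y, fitted means are used
```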
2 changes: 1 addition & 1 deletion examples/source_data/mushrooms/agaricus-lepiota.names
@@ -9,7 +9,7 @@

3. Past Usage:
1. Schlimmer,J.S. (1987). Concept Acquisition Through Representational
- Adjustment (Technical Report 87-19). Doctoral disseration, Department
+ Adjustment (Technical Report 87-19). Doctoral dissertation, Department
of Information and Computer Science, University of California, Irvine.
--- STAGGER: asymptoted to 95% classification accuracy after reviewing
1000 instances.
4 changes: 2 additions & 2 deletions joss/paper.md
@@ -3,7 +3,7 @@ title: 'Category Encoders: a scikit-learn-contrib package of transformers for en
tags:
- machine learning
- python
- - sckit-learn
+ - scikit-learn
authors:
- name: William D McGinnis
orcid: 0000-0002-3009-9465
@@ -45,7 +45,7 @@ Categorical: Georgia, Alabama, South Carolina, … , New York

The machine learning algorithms we will later use tend to want numbers, and not strings, as their inputs so we need some method of coding to convert them.

- Category_encoders includes a number of pre-existing encoders that are commonly used, notably Ordinal, Hashing and OneHot encoders [@idre][@carey][@hashing]. There are also some
+ Category_encoders includes a number of preexisting encoders that are commonly used, notably Ordinal, Hashing and OneHot encoders [@idre][@carey][@hashing]. There are also some
less frequently used encoders including Backward Difference, Helmert, Polynomial and Sum encoding [@idre][@carey]. Finally there are
experimental encoders: LeaveOneOut, Binary and BaseN [@zhang][@onehot][@basen].

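The paper's point that downstream algorithms want numeric inputs can be made concrete with one of the listed encoders; a small illustration using the states example (not taken from the paper):

```python
import pandas as pd
import category_encoders as ce

states = pd.DataFrame({'state': ['Georgia', 'Alabama', 'South Carolina', 'New York']})
encoded = ce.OrdinalEncoder(cols=['state']).fit_transform(states)
print(encoded)  # each state name is replaced by an integer code
```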
2 changes: 1 addition & 1 deletion tests/test_basen.py
@@ -111,7 +111,7 @@ def test_inverse_transform_have_nan_in_train(self):
def test_inverse_transform_not_supported_with_unknown_values(self):
"""Test that inverse_transform is not supported if a nan could be either missing or unknown.

- This happens if both handle_missing and handle_unkown are set to 'return_nan'.
+ This happens if both handle_missing and handle_unknown are set to 'return_nan'.
"""
train = pd.DataFrame({'city': ['chicago', np.nan]})
test = pd.DataFrame({'city': ['chicago', 'los angeles']})
2 changes: 1 addition & 1 deletion tests/test_binary.py
@@ -16,7 +16,7 @@ def test_binary_bin(self):
self.assertTrue(pd.DataFrame([[0, 1], [1, 0], [1, 0]], columns=['0_0', '0_1']).equals(out))

def test_binary_dist(self):
"""Test the BinaryEncoder with a all distinct values."""
"""Test the BinaryEncoder with all distinct values."""
data = np.array(['apple', 'orange', 'peach', 'lemon'])
encoder = encoders.BinaryEncoder()
encoder.fit(data)
4 changes: 2 additions & 2 deletions tests/test_count.py
@@ -136,7 +136,7 @@ def test_count_handle_unknown_string(self):
"""Test the handle_unknown string on 'none' and 'na_categorical'.

The 'handle_missing' must be set to 'return_nan' in order to test
- 'handle_unkown' correctly.
+ 'handle_unknown' correctly.
"""
enc = encoders.CountEncoder(
handle_missing='return_nan',
@@ -155,7 +155,7 @@ def test_count_handle_unknown_string(self):
self.assertTrue(out['na_categorical'].isna().sum() == 3)

def test_count_handle_unknown_dict(self):
"""Test the 'handle_unkown' dict with all non-default options."""
"""Test the 'handle_unknown' dict with all non-default options."""
enc = encoders.CountEncoder(
handle_missing='return_nan',
handle_unknown={'none': -1, 'na_categorical': 'return_nan'},
2 changes: 1 addition & 1 deletion tests/test_rankhot.py
@@ -39,7 +39,7 @@ def test_handle_pandas_categorical(self):
self.assertEqual(t_f.shape[1] - (X.shape[1] - 1), len(X.categorical.unique()))
self.assertTupleEqual(inv_tf.shape, X.shape)

- def test_na_catagorical(self):
+ def test_na_categorical(self):
"""Test also NAs on pandas categorical are handled correctly."""
enc = encoders.RankHotEncoder(handle_unknown='value', cols=['na_categorical'])
enc.fit(X)
2 changes: 1 addition & 1 deletion tests/test_utils.py
@@ -161,7 +161,7 @@ def transform(self, X, y=None, override_return_df=False):

self.encoder = DummyEncoder()

- @pytest.mark.skipif(Version(skl_version) < Version('1.2'), reason='requires sklean > 1.2')
+ @pytest.mark.skipif(Version(skl_version) < Version('1.2'), reason='requires sklearn > 1.2')
def test_sklearn_pandas_out_refit(self):
"""Test that the encoder can be refit with sklearn and pandas."""
# Thanks to Issue#437