diff --git a/category_encoders/base_contrast_encoder.py b/category_encoders/base_contrast_encoder.py index f3141b2e..7b1b72a8 100644 --- a/category_encoders/base_contrast_encoder.py +++ b/category_encoders/base_contrast_encoder.py @@ -100,7 +100,7 @@ def _transform(self, X) -> pd.DataFrame: X = self.ordinal_encoder.transform(X) if self.handle_unknown == 'error': if X[self.cols].isin([-1]).any().any(): - raise ValueError('Columns to be encoded can not contain new values') + raise ValueError('Columns to be encoded cannot contain new values') X = self.transform_contrast_coding(X, mapping=self.mapping) return X @@ -124,7 +124,7 @@ def fit_contrast_coding( handle_missing: str How to handle missing values. handle_unknown: str - How to hande unkown values. + How to handle unknown values. Returns ------- diff --git a/category_encoders/basen.py b/category_encoders/basen.py index d14ce4e8..ac925791 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -187,7 +187,7 @@ def _transform(self, X): if self.handle_unknown == 'error': if X_out[self.cols].isin([-1]).any().any(): - raise ValueError('Columns to be encoded can not contain new values') + raise ValueError('Columns to be encoded cannot contain new values') X_out = self.basen_encode(X_out, cols=self.cols) return X_out diff --git a/category_encoders/cat_boost.py b/category_encoders/cat_boost.py index 03ebcdc6..6ea932e8 100644 --- a/category_encoders/cat_boost.py +++ b/category_encoders/cat_boost.py @@ -157,7 +157,7 @@ def _transform(self, X, y=None): is_unknown_value = X[col].isin(unseen_values.dropna().astype(object)) if self.handle_unknown == 'error' and is_unknown_value.any(): - raise ValueError('Columns to be encoded can not contain new values') + raise ValueError('Columns to be encoded cannot contain new values') if ( y is None diff --git a/category_encoders/glmm.py b/category_encoders/glmm.py index 3a5b2738..a7323197 100644 --- a/category_encoders/glmm.py +++ b/category_encoders/glmm.py @@ -28,7 
+28,7 @@ class GLMMEncoder( util.SupervisedTransformerMixin ,util.BaseEncoder): Mixed effects models are a mature branch of statistics. 2. No hyper-parameters to tune. The amount of shrinkage is automatically determined through the estimation process. In short, the less observations a category has and/or - the more the outcome varies for a category. Then the higher the regularization + the more the outcome varies for a category, the higher the regularization towards "the prior" or "grand mean". 3. The technique is applicable for both continuous and binomial targets. If the target is continuous, the encoder returns regularized difference of the diff --git a/category_encoders/hashing.py b/category_encoders/hashing.py index d100a82a..e99d9553 100644 --- a/category_encoders/hashing.py +++ b/category_encoders/hashing.py @@ -228,11 +228,11 @@ def hash_chunk(hash_method: str, np_df: np.ndarray, N: int) -> np.ndarray: for val in row: if val is not None: hasher = hasher_constructor() - # Computes an integer index from the hasher digest. The endian is - # "big" as the code use to read: + # Computes an integer index from the hasher digest. The endianness is + # "big" as the code used to read: # column_index = int(hasher.hexdigest(), 16) % N - # which is implicitly considering the hexdigest to be big endian, - # even if the system is little endian. + # which is implicitly considering the hexdigest to be big-endian, + # even if the system is little-endian. # Building the index that way is about 30% faster than using the # hexdigest. 
hasher.update(bytes(str(val), 'utf-8')) diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py index 0f839d56..50d57ec4 100644 --- a/category_encoders/leave_one_out.py +++ b/category_encoders/leave_one_out.py @@ -161,7 +161,7 @@ def _fit_column_map(series: pd.Series, y: pd.Series) -> pd.Series: def transform_leave_one_out(self, X: pd.DataFrame, y: pd.Series | None, mapping=None): """Apply leave-one-out-encoding to a dataframe. - If a target is given the lable-mean is calculated without the target (left out). + If a target is given the label-mean is calculated without the target (left out). Otherwise, the label mean from the fit step is taken. """ random_state_ = check_random_state(self.random_state) @@ -184,7 +184,7 @@ def transform_leave_one_out(self, X: pd.DataFrame, y: pd.Series | None, mapping= X[col] = X[col].astype(index_dtype) if self.handle_unknown == 'error' and is_unknown_value.any(): - raise ValueError('Columns to be encoded can not contain new values') + raise ValueError('Columns to be encoded cannot contain new values') if ( y is None diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py index 9c98e123..f089806d 100644 --- a/category_encoders/one_hot.py +++ b/category_encoders/one_hot.py @@ -40,7 +40,7 @@ class OneHotEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder): handle_missing: str options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. - 'error' will raise a `ValueError` if missings are encountered. + 'error' will raise a `ValueError` if a missing value is encountered. 'return_nan' will encode a missing value as `np.nan` in every dummy column. 'value' will encode a missing value as 0 in every dummy column. 
'indicator' will treat missingness as its own category, adding an additional dummy column @@ -227,7 +227,7 @@ def _transform(self, X): if self.handle_unknown == 'error': if X[self.cols].isin([-1]).any().any(): - raise ValueError('Columns to be encoded can not contain new values') + raise ValueError('Columns to be encoded cannot contain new values') X = self.get_dummies(X) return X diff --git a/category_encoders/rankhot.py b/category_encoders/rankhot.py index 0cd0bee4..61e4822a 100644 --- a/category_encoders/rankhot.py +++ b/category_encoders/rankhot.py @@ -32,9 +32,9 @@ class RankHotEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder): handle_unknown: str options are 'error', 'value', 'return_nan'. The default is 'value'. - 'value': If an unknown label occurrs, it is represented as 0 array. - 'error': If an unknown label occurrs, error message is displayed. - 'return_nan': If an unknown label occurrs, np.nan is returned in all columns. + 'value': If an unknown label occurs, it is represented as 0 array. + 'error': If an unknown label occurs, error message is displayed. + 'return_nan': If an unknown label occurs, np.nan is returned in all columns. handle_missing: str options are 'error', 'value' and 'return_nan'. The default is 'value'. Missing value also considered as unknown value in the final data set. 
@@ -146,7 +146,7 @@ def _transform(self, X_in: pd.DataFrame, override_return_df: bool = False) -> pd if self.handle_unknown == 'error': if X[self.cols].isin([-1]).any().any(): - raise ValueError('Columns to be encoded can not contain new values') + raise ValueError('Columns to be encoded cannot contain new values') for switch, _ordinal_switch in zip( self.mapping, self.ordinal_encoder.category_mapping, strict=False diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py index b61464f3..80be84ba 100644 --- a/category_encoders/target_encoder.py +++ b/category_encoders/target_encoder.py @@ -301,7 +301,7 @@ def target_encode(self, X_in: pd.DataFrame) -> pd.DataFrame: """Apply target encoding via encoder mapping.""" X = X_in.copy(deep=True) - # Was not mapping extra columns as self.featuer_names_in did not include new column + # Was not mapping extra columns as self.feature_names_in did not include new column for col in self.cols: X[col] = X[col].map(self.mapping[col]) diff --git a/category_encoders/utils.py b/category_encoders/utils.py index a9adca74..9b884713 100644 --- a/category_encoders/utils.py +++ b/category_encoders/utils.py @@ -469,7 +469,7 @@ def fit(self, X: X_type, y: y_type | None = None, **kwargs): if self.handle_missing == 'error': if X[self.cols].isna().any().any(): - raise ValueError('Columns to be encoded can not contain null') + raise ValueError('Columns to be encoded cannot contain null') self._fit(X, y, **kwargs) @@ -504,7 +504,7 @@ def _check_fit_inputs(self, X: X_type, y: y_type) -> None: def _check_transform_inputs(self, df: pd.DataFrame) -> None: if self.handle_missing == 'error': if df[self.cols].isna().any().any(): - raise ValueError('Columns to be encoded can not contain null') + raise ValueError('Columns to be encoded cannot contain null') if self._dim is None: raise NotFittedError('Must train encoder before it can be used to transform data.') @@ -600,7 +600,7 @@ def __sklearn_tags__(self) -> EncoderTags: def 
transform(self, X: X_type, y: y_type | None = None, override_return_df: bool = False): """Perform the transformation to new categorical data. - Some encoders behave differently on whether y is given or not. + Some encoders behave differently depending on whether or not y is given. This is mainly due to regularisation in order to avoid overfitting. On training data transform should be called with y, on test data without. diff --git a/examples/source_data/mushrooms/agaricus-lepiota.names b/examples/source_data/mushrooms/agaricus-lepiota.names index 4f1f3b53..481457a9 100644 --- a/examples/source_data/mushrooms/agaricus-lepiota.names +++ b/examples/source_data/mushrooms/agaricus-lepiota.names @@ -9,7 +9,7 @@ 3. Past Usage: 1. Schlimmer,J.S. (1987). Concept Acquisition Through Representational - Adjustment (Technical Report 87-19). Doctoral disseration, Department + Adjustment (Technical Report 87-19). Doctoral dissertation, Department of Information and Computer Science, University of California, Irvine. --- STAGGER: asymptoted to 95% classification accuracy after reviewing 1000 instances. diff --git a/joss/paper.md b/joss/paper.md index 6e66deda..51fa5bec 100644 --- a/joss/paper.md +++ b/joss/paper.md @@ -3,7 +3,7 @@ title: 'Category Encoders: a scikit-learn-contrib package of transformers for en tags: - machine learning - python - - sckit-learn + - scikit-learn authors: - name: William D McGinnis orcid: 0000-0002-3009-9465 @@ -45,7 +45,7 @@ Categorical: Georgia, Alabama, South Carolina, … , New York The machine learning algorithms we will later use tend to want numbers, and not strings, as their inputs so we need some method of coding to convert them. -Category_encoders includes a number of pre-existing encoders that are commonly used, notably Ordinal, Hashing and OneHot encoders [@idre][@carey][@hashing]. 
There are also some +Category_encoders includes a number of preexisting encoders that are commonly used, notably Ordinal, Hashing and OneHot encoders [@idre][@carey][@hashing]. There are also some less frequently used encoders including Backward Difference, Helmert, Polynomial and Sum encoding [@idre][@carey]. Finally there are experimental encoders: LeaveOneOut, Binary and BaseN [@zhang][@onehot][@basen]. diff --git a/tests/test_basen.py b/tests/test_basen.py index f608e8bb..6941ea8d 100644 --- a/tests/test_basen.py +++ b/tests/test_basen.py @@ -111,7 +111,7 @@ def test_inverse_transform_have_nan_in_train(self): def test_inverse_transform_not_supported_with_unknown_values(self): """Test that inverse_transform is not supported if a nan could be either missing or unknown. - This happens if both handle_missing and handle_unkown are set to 'return_nan'. + This happens if both handle_missing and handle_unknown are set to 'return_nan'. """ train = pd.DataFrame({'city': ['chicago', np.nan]}) test = pd.DataFrame({'city': ['chicago', 'los angeles']}) diff --git a/tests/test_binary.py b/tests/test_binary.py index 1d9051bd..b4dbc505 100644 --- a/tests/test_binary.py +++ b/tests/test_binary.py @@ -16,7 +16,7 @@ def test_binary_bin(self): self.assertTrue(pd.DataFrame([[0, 1], [1, 0], [1, 0]], columns=['0_0', '0_1']).equals(out)) def test_binary_dist(self): - """Test the BinaryEncoder with a all distinct values.""" + """Test the BinaryEncoder with all distinct values.""" data = np.array(['apple', 'orange', 'peach', 'lemon']) encoder = encoders.BinaryEncoder() encoder.fit(data) diff --git a/tests/test_count.py b/tests/test_count.py index 746ca1ac..9aaf79ef 100644 --- a/tests/test_count.py +++ b/tests/test_count.py @@ -136,7 +136,7 @@ def test_count_handle_unknown_string(self): """Test the handle_unknown string on 'none' and 'na_categorical'. The 'handle_missing' must be set to 'return_nan' in order to test - 'handle_unkown' correctly. + 'handle_unknown' correctly. 
""" enc = encoders.CountEncoder( handle_missing='return_nan', @@ -155,7 +155,7 @@ def test_count_handle_unknown_string(self): self.assertTrue(out['na_categorical'].isna().sum() == 3) def test_count_handle_unknown_dict(self): - """Test the 'handle_unkown' dict with all non-default options.""" + """Test the 'handle_unknown' dict with all non-default options.""" enc = encoders.CountEncoder( handle_missing='return_nan', handle_unknown={'none': -1, 'na_categorical': 'return_nan'}, diff --git a/tests/test_rankhot.py b/tests/test_rankhot.py index c73343ae..dff1b32b 100644 --- a/tests/test_rankhot.py +++ b/tests/test_rankhot.py @@ -39,7 +39,7 @@ def test_handle_pandas_categorical(self): self.assertEqual(t_f.shape[1] - (X.shape[1] - 1), len(X.categorical.unique())) self.assertTupleEqual(inv_tf.shape, X.shape) - def test_na_catagorical(self): + def test_na_categorical(self): """Test also NAs on pandas categorical are handled correctly.""" enc = encoders.RankHotEncoder(handle_unknown='value', cols=['na_categorical']) enc.fit(X) diff --git a/tests/test_utils.py b/tests/test_utils.py index b6e52cbd..418d1059 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -161,7 +161,7 @@ def transform(self, X, y=None, override_return_df=False): self.encoder = DummyEncoder() - @pytest.mark.skipif(Version(skl_version) < Version('1.2'), reason='requires sklean > 1.2') + @pytest.mark.skipif(Version(skl_version) < Version('1.2'), reason='requires sklearn > 1.2') def test_sklearn_pandas_out_refit(self): """Test that the encoder can be refit with sklearn and pandas.""" # Thanks to Issue#437