Spelling (#464)

jsoref · web-flow · commit 242a409a3464 · 2025-11-14T09:37:51.000+01:00
* spelling: a missing value is

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: all

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: big-endian

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: cannot

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: categorical

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: dissertation

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: endianness

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: feature

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: handle

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: label

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: little-endian

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: occurs

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: preexisting

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: scikit

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: sklearn

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: that

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: unknown

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: used

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* spelling: whether or not

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

* fix sentence

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>

---------

Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com>
diff --git a/category_encoders/base_contrast_encoder.py b/category_encoders/base_contrast_encoder.py
@@ -100,7 +100,7 @@ def _transform(self, X) -> pd.DataFrame:
         X = self.ordinal_encoder.transform(X)
         if self.handle_unknown == 'error':
             if X[self.cols].isin([-1]).any().any():
-                raise ValueError('Columns to be encoded can not contain new values')
+                raise ValueError('Columns to be encoded cannot contain new values')
 
         X = self.transform_contrast_coding(X, mapping=self.mapping)
         return X
@@ -124,7 +124,7 @@ def fit_contrast_coding(
         handle_missing: str
             How to handle missing values.
         handle_unknown: str
-            How to hande unkown values.
+            How to handle unknown values.
 
         Returns
         -------
diff --git a/category_encoders/basen.py b/category_encoders/basen.py
@@ -187,7 +187,7 @@ def _transform(self, X):
 
         if self.handle_unknown == 'error':
             if X_out[self.cols].isin([-1]).any().any():
-                raise ValueError('Columns to be encoded can not contain new values')
+                raise ValueError('Columns to be encoded cannot contain new values')
 
         X_out = self.basen_encode(X_out, cols=self.cols)
         return X_out
diff --git a/category_encoders/cat_boost.py b/category_encoders/cat_boost.py
@@ -157,7 +157,7 @@ def _transform(self, X, y=None):
             is_unknown_value = X[col].isin(unseen_values.dropna().astype(object))
 
             if self.handle_unknown == 'error' and is_unknown_value.any():
-                raise ValueError('Columns to be encoded can not contain new values')
+                raise ValueError('Columns to be encoded cannot contain new values')
 
             if (
                 y is None
diff --git a/category_encoders/glmm.py b/category_encoders/glmm.py
@@ -28,7 +28,7 @@ class GLMMEncoder( util.SupervisedTransformerMixin ,util.BaseEncoder):
            Mixed effects models are a mature branch of statistics.
         2. No hyper-parameters to tune. The amount of shrinkage is automatically determined
            through the estimation process. In short, the less observations a category has and/or
-           the more the outcome varies for a category. Then the higher the regularization
+           the more that outcome varies for a category, the higher the regularization
            towards "the prior" or "grand mean".
         3. The technique is applicable for both continuous and binomial targets.
            If the target is continuous, the encoder returns regularized difference of the
diff --git a/category_encoders/hashing.py b/category_encoders/hashing.py
@@ -228,11 +228,11 @@ def hash_chunk(hash_method: str, np_df: np.ndarray, N: int) -> np.ndarray:
             for val in row:
                 if val is not None:
                     hasher = hasher_constructor()
-                    # Computes an integer index from the hasher digest. The endian is
-                    # "big" as the code use to read:
+                    # Computes an integer index from the hasher digest. The endianness is
+                    # "big" as the code used to read:
                     # column_index = int(hasher.hexdigest(), 16) % N
-                    # which is implicitly considering the hexdigest to be big endian,
-                    # even if the system is little endian.
+                    # which is implicitly considering the hexdigest to be big-endian,
+                    # even if the system is little-endian.
                     # Building the index that way is about 30% faster than using the
                     # hexdigest.
                     hasher.update(bytes(str(val), 'utf-8'))
diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py
@@ -161,7 +161,7 @@ def _fit_column_map(series: pd.Series, y: pd.Series) -> pd.Series:
     def transform_leave_one_out(self, X: pd.DataFrame, y: pd.Series | None, mapping=None):
         """Apply leave-one-out-encoding to a dataframe.
 
-        If a target is given the lable-mean is calculated without the target (left out).
+        If a target is given the label-mean is calculated without the target (left out).
         Otherwise, the label mean from the fit step is taken.
         """
         random_state_ = check_random_state(self.random_state)
@@ -184,7 +184,7 @@ def transform_leave_one_out(self, X: pd.DataFrame, y: pd.Series | None, mapping=
                 X[col] = X[col].astype(index_dtype)
 
             if self.handle_unknown == 'error' and is_unknown_value.any():
-                raise ValueError('Columns to be encoded can not contain new values')
+                raise ValueError('Columns to be encoded cannot contain new values')
 
             if (
                 y is None
diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py
@@ -40,7 +40,7 @@ class OneHotEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
     handle_missing: str
         options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'.
 
-        'error' will raise a `ValueError` if missings are encountered.
+        'error' will raise a `ValueError` if a missing value is encountered.
         'return_nan' will encode a missing value as `np.nan` in every dummy column.
         'value' will encode a missing value as 0 in every dummy column.
         'indicator' will treat missingness as its own category, adding an additional dummy column
@@ -227,7 +227,7 @@ def _transform(self, X):
 
         if self.handle_unknown == 'error':
             if X[self.cols].isin([-1]).any().any():
-                raise ValueError('Columns to be encoded can not contain new values')
+                raise ValueError('Columns to be encoded cannot contain new values')
 
         X = self.get_dummies(X)
         return X
diff --git a/category_encoders/rankhot.py b/category_encoders/rankhot.py
@@ -32,9 +32,9 @@ class RankHotEncoder( util.UnsupervisedTransformerMixin,util.BaseEncoder):
     handle_unknown: str
         options are 'error', 'value', 'return_nan'.
         The default is 'value'.
-        'value': If an unknown label occurrs, it is represented as 0 array.
-        'error': If an unknown label occurrs, error message is displayed.
-        'return_nan': If an unknown label occurrs, np.nan is returned in all columns.
+        'value': If an unknown label occurs, it is represented as a 0 array.
+        'error': If an unknown label occurs, a `ValueError` is raised.
+        'return_nan': If an unknown label occurs, np.nan is returned in all columns.
     handle_missing: str
         options are 'error', 'value' and 'return_nan'. The default is 'value'.
         Missing value also considered as unknown value in the final data set.
@@ -146,7 +146,7 @@ def _transform(self, X_in: pd.DataFrame, override_return_df: bool = False) -> pd
 
         if self.handle_unknown == 'error':
             if X[self.cols].isin([-1]).any().any():
-                raise ValueError('Columns to be encoded can not contain new values')
+                raise ValueError('Columns to be encoded cannot contain new values')
 
         for switch, _ordinal_switch in zip(
             self.mapping, self.ordinal_encoder.category_mapping, strict=False
diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py
@@ -301,7 +301,7 @@ def target_encode(self, X_in: pd.DataFrame) -> pd.DataFrame:
         """Apply target encoding via encoder mapping."""
         X = X_in.copy(deep=True)
 
-        # Was not mapping extra columns as self.featuer_names_in did not include new column
+        # Was not mapping extra columns as self.feature_names_in did not include new column
         for col in self.cols:
             X[col] = X[col].map(self.mapping[col])
 
diff --git a/category_encoders/utils.py b/category_encoders/utils.py
@@ -469,7 +469,7 @@ def fit(self, X: X_type, y: y_type | None = None, **kwargs):
 
         if self.handle_missing == 'error':
             if X[self.cols].isna().any().any():
-                raise ValueError('Columns to be encoded can not contain null')
+                raise ValueError('Columns to be encoded cannot contain null')
 
         self._fit(X, y, **kwargs)
 
@@ -504,7 +504,7 @@ def _check_fit_inputs(self, X: X_type, y: y_type) -> None:
     def _check_transform_inputs(self, df: pd.DataFrame) -> None:
         if self.handle_missing == 'error':
             if df[self.cols].isna().any().any():
-                raise ValueError('Columns to be encoded can not contain null')
+                raise ValueError('Columns to be encoded cannot contain null')
 
         if self._dim is None:
             raise NotFittedError('Must train encoder before it can be used to transform data.')
@@ -600,7 +600,7 @@ def __sklearn_tags__(self) -> EncoderTags:
     def transform(self, X: X_type, y: y_type | None = None, override_return_df: bool = False):
         """Perform the transformation to new categorical data.
 
-        Some encoders behave differently on whether y is given or not.
+        Some encoders behave differently depending on whether or not y is given.
         This is mainly due to regularisation in order to avoid overfitting.
         On training data transform should be called with y, on test data without.
 
diff --git a/examples/source_data/mushrooms/agaricus-lepiota.names b/examples/source_data/mushrooms/agaricus-lepiota.names
@@ -9,7 +9,7 @@
 
 3. Past Usage:
     1. Schlimmer,J.S. (1987). Concept Acquisition Through Representational
-       Adjustment (Technical Report 87-19).  Doctoral disseration, Department
+       Adjustment (Technical Report 87-19).  Doctoral dissertation, Department
        of Information and Computer Science, University of California, Irvine.
        --- STAGGER: asymptoted to 95% classification accuracy after reviewing
            1000 instances.
diff --git a/joss/paper.md b/joss/paper.md
@@ -3,7 +3,7 @@ title: 'Category Encoders: a scikit-learn-contrib package of transformers for en
 tags:
   - machine learning
   - python
-  - sckit-learn
+  - scikit-learn
 authors:
  - name: William D McGinnis
    orcid: 0000-0002-3009-9465
@@ -45,7 +45,7 @@ Categorical: Georgia, Alabama, South Carolina, … , New York
 
 The machine learning algorithms we will later use tend to want numbers, and not strings, as their inputs so we need some method of coding to convert them.
 
-Category_encoders includes a number of pre-existing encoders that are commonly used, notably Ordinal, Hashing and OneHot encoders [@idre][@carey][@hashing]. There are also some
+Category_encoders includes a number of preexisting encoders that are commonly used, notably Ordinal, Hashing and OneHot encoders [@idre][@carey][@hashing]. There are also some
 less frequently used encoders including Backward Difference, Helmert, Polynomial and Sum encoding [@idre][@carey]. Finally there are
 experimental encoders: LeaveOneOut, Binary and BaseN [@zhang][@onehot][@basen].
 
diff --git a/tests/test_basen.py b/tests/test_basen.py
@@ -111,7 +111,7 @@ def test_inverse_transform_have_nan_in_train(self):
     def test_inverse_transform_not_supported_with_unknown_values(self):
         """Test that inverse_transform is not supported if a nan could be either missing or unknown.
 
-        This happens if both handle_missing and handle_unkown are set to 'return_nan'.
+        This happens if both handle_missing and handle_unknown are set to 'return_nan'.
         """
         train = pd.DataFrame({'city': ['chicago', np.nan]})
         test = pd.DataFrame({'city': ['chicago', 'los angeles']})
diff --git a/tests/test_binary.py b/tests/test_binary.py
@@ -16,7 +16,7 @@ def test_binary_bin(self):
         self.assertTrue(pd.DataFrame([[0, 1], [1, 0], [1, 0]], columns=['0_0', '0_1']).equals(out))
 
     def test_binary_dist(self):
-        """Test the BinaryEncoder with a all distinct values."""
+        """Test the BinaryEncoder with all distinct values."""
         data = np.array(['apple', 'orange', 'peach', 'lemon'])
         encoder = encoders.BinaryEncoder()
         encoder.fit(data)
diff --git a/tests/test_count.py b/tests/test_count.py
@@ -136,7 +136,7 @@ def test_count_handle_unknown_string(self):
         """Test the handle_unknown string  on 'none' and 'na_categorical'.
 
         The 'handle_missing' must be set to 'return_nan' in order to test
-        'handle_unkown' correctly.
+        'handle_unknown' correctly.
         """
         enc = encoders.CountEncoder(
             handle_missing='return_nan',
@@ -155,7 +155,7 @@ def test_count_handle_unknown_string(self):
         self.assertTrue(out['na_categorical'].isna().sum() == 3)
 
     def test_count_handle_unknown_dict(self):
-        """Test the 'handle_unkown' dict with all non-default options."""
+        """Test the 'handle_unknown' dict with all non-default options."""
         enc = encoders.CountEncoder(
             handle_missing='return_nan',
             handle_unknown={'none': -1, 'na_categorical': 'return_nan'},
diff --git a/tests/test_rankhot.py b/tests/test_rankhot.py
@@ -39,7 +39,7 @@ def test_handle_pandas_categorical(self):
         self.assertEqual(t_f.shape[1] - (X.shape[1] - 1), len(X.categorical.unique()))
         self.assertTupleEqual(inv_tf.shape, X.shape)
 
-    def test_na_catagorical(self):
+    def test_na_categorical(self):
         """Test also NAs on pandas categorical are handled correctly."""
         enc = encoders.RankHotEncoder(handle_unknown='value', cols=['na_categorical'])
         enc.fit(X)
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -161,7 +161,7 @@ def transform(self, X, y=None, override_return_df=False):
 
         self.encoder = DummyEncoder()
 
-    @pytest.mark.skipif(Version(skl_version) < Version('1.2'), reason='requires sklean > 1.2')
+    @pytest.mark.skipif(Version(skl_version) < Version('1.2'), reason='requires sklearn > 1.2')
     def test_sklearn_pandas_out_refit(self):
         """Test that the encoder can be refit with sklearn and pandas."""
         # Thanks to Issue#437