From 543e23610d9143af4afea93045ffbc9e40ff0e11 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 21 May 2022 21:51:07 -0700 Subject: [PATCH 01/22] initial commit. create class skeleton. --- feature_engine/discretisation/chi_merge.py | 93 ++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 feature_engine/discretisation/chi_merge.py diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py new file mode 100644 index 000000000..866115407 --- /dev/null +++ b/feature_engine/discretisation/chi_merge.py @@ -0,0 +1,93 @@ + +from typing import List, Optional, Union + +import pandas as pd + +from feature_engine.discretisation.base_discretiser import BaseDiscretiser +from feature_engine._docstrings.methods import _fit_transform_docstring +from feature_engine._docstrings.fit_attributes import ( + _variables_attribute_docstring, + _feature_names_in_docstring, + _n_features_in_docstring, +) +from feature_engine._docstrings.class_inputs import _variables_numerical_docstring +from feature_engine._docstrings.substitute import Substitution +from feature_engine.variable_manipulation import _check_input_parameter_variables + + + +class ChiMergeDiscretiser(BaseDiscretiser): + """" + + + + + + + + """ + def __init__( + self, + variables: Union[None, int, str, List[Union[str, int]]] = None, + threshold: float = 0.9, + min_intervals: int = 2, + max_intervals: int = 10, + return_object: bool = False, + return_boundaries: bool = False, + ) -> None: + + if not isinstance(threshold, float) or threshold >= 1: + raise ValueError( + "threshold must be a float and less than one. " + f"Got {threshold} instead." + ) + + if not isinstance(min_intervals, int) or min_intervals < 2: + raise ValueError( + "min_intervals must be an integer that is greater than or " + f"equal to 2. Got {min_intervals} instead." + ) + + # TODO: Should we limit max_intervals? If so, how much? 
+ if not isinstance(max_intervals, int) or max_intervals > 15: + raise ValueError( + "max_intervals must be an integer that is less than or " + f"equal to 15. Got {max_intervals} instead." + ) + super().__init(return_object, return_boundaries) + + self.variables = _check_input_parameter_variables(variables) + self.threshold = threshold + self.min_intervals = min_intervals + self.max_intervals = max_intervals + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series]): + """ + Learn the limits of the intervals using the chi-square test. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training dataset. Can be the entire dataframe, not just the variables + to be transformed. + y: None + y is not needed in this encoder. You can pass y or None. + + """ + pass + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Sort the variable values into the intervals. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The data to transform. + + Returns + ------- + X_new: pandas dataframe of shape = [n_samples, n_features] + The transformed data with the discrete variables. 
+ """ + pass \ No newline at end of file From 54363c60cf07ee5d3c28c65f0724c138f2e225d9 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 21 May 2022 22:43:55 -0700 Subject: [PATCH 02/22] expand fit and transform --- feature_engine/discretisation/chi_merge.py | 31 ++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index 866115407..0f1be1f19 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -2,6 +2,7 @@ from typing import List, Optional, Union import pandas as pd +from sklearn.utils.validation import check_is_fitted from feature_engine.discretisation.base_discretiser import BaseDiscretiser from feature_engine._docstrings.methods import _fit_transform_docstring @@ -12,7 +13,16 @@ ) from feature_engine._docstrings.class_inputs import _variables_numerical_docstring from feature_engine._docstrings.substitute import Substitution -from feature_engine.variable_manipulation import _check_input_parameter_variables +from feature_engine.dataframe_checks import ( + _check_contains_inf, + _check_contains_na, + _check_X_matches_training_df, + check_X, +) +from feature_engine.variable_manipulation import ( + _check_input_parameter_variables, + _find_or_check_numerical_variables, +) @@ -74,7 +84,16 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series]): y is not needed in this encoder. You can pass y or None. """ - pass + # check input dataframe + X = check_X(X) + _check_contains_na(X) + _check_contains_inf(X) + + # find or check for numerical variables + self.variables = _find_or_check_numerical_variables(X, self.variables) + + + pass def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ @@ -90,4 +109,12 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X_new: pandas dataframe of shape = [n_samples, n_features] The transformed data with the discrete variables. 
""" + # check that fit method has been called + check_is_fitted(self) + + # check if X is a dataframe + X = check_X(X) + + + pass \ No newline at end of file From 54420714cd97324445783082f38488b98e068cdf Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sun, 22 May 2022 11:53:40 -0700 Subject: [PATCH 03/22] start _create_contingency_table. stuck. debating if neccessary --- feature_engine/discretisation/chi_merge.py | 60 ++++++++++++++++++++-- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index 0f1be1f19..fe40b6b7f 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -29,7 +29,9 @@ class ChiMergeDiscretiser(BaseDiscretiser): """" - + Chi-Squared test is a statistical hypothesis test that assumes (the null hypothesis) + that the observed frequencies for a categorical variable match the expected frequencies + for the categorical variable. @@ -71,7 +73,7 @@ def __init__( self.min_intervals = min_intervals self.max_intervals = max_intervals - def fit(self, X: pd.DataFrame, y: Optional[pd.Series]): + def fit(self, X: pd.DataFrame, y: pd.Series): """ Learn the limits of the intervals using the chi-square test. @@ -115,6 +117,56 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # check if X is a dataframe X = check_X(X) - - pass \ No newline at end of file + + pass + + def _create_contingency_table(self, feature: pd.Series, class_labels: pd.Series): + """ + Generates a frequency table in which the labels organized into bins. + + Parameters + ---------- + feature: pandas series = [n_samples, ] + The data to discretised. + + class_labels: pandas series = [n_samples, ] + The categorical data that will be arranged in the bins. 
+ + Returns + ------- + TBD + + + """ + + unique_values = sorted(set(feature), reverse=False) + unique_labels = sorted(set(class_labels)) + count_dict = {label: 0 for label in unique_labels} + zeros = [0 for i in range(len(unique_labels))] + frequency_table = {val: zeros for val in unique_values} + + for feature_val, label_val in zip(feature, class_labels): + print(feature_val) + for idx, interval_key in enumerate(frequency_table.keys()): + min_interval = list(frequency_table.keys())[idx] + max_interval = list(frequency_table.keys())[idx + 1] + table_col_index = unique_labels.index(label_val) + + print(idx, min_interval, max_interval) + if interval_key == max(unique_values): + frequency_table[interval_key][label_val] += 1 + print(feature_val, label_val, min_interval, max_interval, table_col_index) + print(frequency_table) + break + + if min_interval <= feature_val and feature_val < max_interval: + print(feature_val, label_val, min_interval, max_interval, table_col_index) + frequency_table[min_interval][label_val] += 1 + print(frequency_table) + break + + + def _calc_chi_sqaure(self): + + From 61752bdf0653845bf61ba77e4eaf0de7d608fdc4 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 4 Jun 2022 09:59:55 -0700 Subject: [PATCH 04/22] create _calc_chi_square() --- feature_engine/discretisation/chi_merge.py | 57 +++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index fe40b6b7f..8ba219f4d 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -1,6 +1,7 @@ from typing import List, Optional, Union +import numpy as np import pandas as pd from sklearn.utils.validation import check_is_fitted @@ -167,6 +168,60 @@ def _create_contingency_table(self, feature: pd.Series, class_labels: pd.Series) break - def _calc_chi_sqaure(self): + def _calc_chi_sqaure(self, array: np.array) -> float: + """ + Calculates 
chi-squared. Using the following equation: + + # TODO: Add chi2 formula docstring + + Parameters + ---------- + X: np.array = [2, n_features] + Two sequential rows from the contingency table. + + Returns + ------- + chi2: float + Determines whether two sets of measurements are related. + """ + + if not isinstance(array, np.array): + raise ValueError( + f"array must be a numpy array. Got {type(array)} instead." + ) + + if array.shape[0] != 2: + raise ValueError( + f"array must be comprised of 2 rows. Got " + f"{array.shape[0]} instead" + ) + + shape = array.shape + num_obs = float(array.sum()) + rows_sums = {} + cols_sums = {} + chi2 = 0 + + # calculate row-wise summations + for row_idx in range(shape[0]): + rows_sums[row_idx] = array[row_idx, :].sum() + + # calculate column-wise summations + for col_idx in range(shape[1]): + cols_sums[col_idx] = array[:, col_idx].sum() + + # iterate through all expect and actual value pairs. + for row_idx in range(shape[0]): + for col_idx in range(shape[1]): + expected_val = rows_sums[row_idx] * cols_sums[col_idx] / num_obs + actual_val = array[row_idx, col_idx] + + if expected_val == 0: + # prevents NaN error + chi2 += 0 + else: + chi2 += (actual_val - expected_val) ** 2 / float(expected_val) + + return chi2 From 2fdb15cb2c19b364d4872698055912f2586fb69b Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 4 Jun 2022 10:00:17 -0700 Subject: [PATCH 05/22] create _calc_chi_square() --- feature_engine/discretisation/chi_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index 8ba219f4d..0bbe2362c 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -168,7 +168,7 @@ def _create_contingency_table(self, feature: pd.Series, class_labels: pd.Series) break - def _calc_chi_sqaure(self, array: np.array) -> float: + def _calc_chi_square(self, array: np.array) -> float: """ 
Calculates chi-squared. Using the following equation: From dd22c1949714f6db20aee6a3d945f9bf1c579f0f Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 4 Jun 2022 16:51:05 -0700 Subject: [PATCH 06/22] revise _create_contingency_table(). simplify code --- feature_engine/discretisation/chi_merge.py | 52 +++++++++------------- 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index 0bbe2362c..6c49894d4 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -83,8 +83,9 @@ def fit(self, X: pd.DataFrame, y: pd.Series): X: pandas dataframe of shape = [n_samples, n_features] The training dataset. Can be the entire dataframe, not just the variables to be transformed. - y: None - y is not needed in this encoder. You can pass y or None. + + y: pd.Series + y is the predicted variables. """ # check input dataframe @@ -118,11 +119,10 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # check if X is a dataframe X = check_X(X) - - pass - def _create_contingency_table(self, feature: pd.Series, class_labels: pd.Series): + + def _create_contingency_table(self, X: pd.DataFrame, y: pd.Series, variable: str) -> dict: """ Generates a frequency table in which the labels organized into bins. @@ -131,41 +131,28 @@ def _create_contingency_table(self, feature: pd.Series, class_labels: pd.Series) feature: pandas series = [n_samples, ] The data to discretised. - class_labels: pandas series = [n_samples, ] + y: pandas series = [n_samples, ] The categorical data that will be arranged in the bins. + variable: str + The variable used to count the frequency of the class labels. + Returns ------- - TBD + contingency_table: dict + A frequency table of the tables for each unvariable feature value. 
""" - unique_values = sorted(set(feature), reverse=False) - unique_labels = sorted(set(class_labels)) - count_dict = {label: 0 for label in unique_labels} - zeros = [0 for i in range(len(unique_labels))] - frequency_table = {val: zeros for val in unique_values} - - for feature_val, label_val in zip(feature, class_labels): - print(feature_val) - for idx, interval_key in enumerate(frequency_table.keys()): - min_interval = list(frequency_table.keys())[idx] - max_interval = list(frequency_table.keys())[idx + 1] - table_col_index = unique_labels.index(label_val) - - print(idx, min_interval, max_interval) - if interval_key == max(unique_values): - frequency_table[interval_key][label_val] += 1 - print(feature_val, label_val, min_interval, max_interval, table_col_index) - print(frequency_table) - break - - if min_interval <= feature_val and feature_val < max_interval: - print(feature_val, label_val, min_interval, max_interval, table_col_index) - frequency_table[min_interval][label_val] += 1 - print(frequency_table) - break + unique_values = sorted(set(X[variable])) + unique_labels = list(set(y)) + contingency_table = {l: [0] * len(unique_labels) for l in unique_values} + + for value, label in zip(X[variable], y): + contingency_table[value][label] += 1 + + return contingency_table def _calc_chi_square(self, array: np.array) -> float: @@ -225,3 +212,4 @@ def _calc_chi_square(self, array: np.array) -> float: return chi2 + From d6317b3398dd712c84b7cb0cadba405fe312ef18 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Mon, 13 Jun 2022 16:37:42 -0700 Subject: [PATCH 07/22] create test_chi_merge_discretiser.py --- feature_engine/discretisation/__init__.py | 2 ++ .../test_chi_merge_discretiser.py | 13 +++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 tests/test_discretisation/test_chi_merge_discretiser.py diff --git a/feature_engine/discretisation/__init__.py b/feature_engine/discretisation/__init__.py index a305c8a93..19fd90c7e 100644 --- 
a/feature_engine/discretisation/__init__.py +++ b/feature_engine/discretisation/__init__.py @@ -7,10 +7,12 @@ from .decision_tree import DecisionTreeDiscretiser from .equal_frequency import EqualFrequencyDiscretiser from .equal_width import EqualWidthDiscretiser +from .chi_merge import ChiMergeDiscretiser __all__ = [ "DecisionTreeDiscretiser", "EqualFrequencyDiscretiser", "EqualWidthDiscretiser", "ArbitraryDiscretiser", + "ChiMergeDiscretiser", ] diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py new file mode 100644 index 000000000..f397fa018 --- /dev/null +++ b/tests/test_discretisation/test_chi_merge_discretiser.py @@ -0,0 +1,13 @@ +import pandas as pd +import pytest +from sklearn import datasets + +from feature_engine.discretisation import ChiMergeDiscretiser + +# TODO: Should we create the df here on in conftest? + +# create dataset for unit tests +col_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"] +iris_data = datasets.load_iris().data +iris = pd.DataFrame(iris_data, columns=col_names) +iris["flower"] = datasets.load_iris().target \ No newline at end of file From c41e09c1a872d4a75af86eefa9acc55be2588aa1 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Mon, 13 Jun 2022 17:04:31 -0700 Subject: [PATCH 08/22] create/pass test_create_contingency_table() and fix syntax errors --- feature_engine/discretisation/chi_merge.py | 8 +-- .../test_chi_merge_discretiser.py | 69 ++++++++++++++++++- 2 files changed, 70 insertions(+), 7 deletions(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index 6c49894d4..6b2c5a7f9 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -48,7 +48,7 @@ def __init__( return_object: bool = False, return_boundaries: bool = False, ) -> None: - + # TODO: Add threshold must be >= 0 if not isinstance(threshold, float) or threshold >= 1: raise 
ValueError( "threshold must be a float and less than one. " @@ -67,7 +67,7 @@ def __init__( "max_intervals must be an integer that is less than or " f"equal to 15. Got {max_intervals} instead." ) - super().__init(return_object, return_boundaries) + super().__init__(return_object, return_boundaries) self.variables = _check_input_parameter_variables(variables) self.threshold = threshold @@ -97,8 +97,6 @@ def fit(self, X: pd.DataFrame, y: pd.Series): self.variables = _find_or_check_numerical_variables(X, self.variables) - pass - def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ Sort the variable values into the intervals. @@ -119,8 +117,6 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # check if X is a dataframe X = check_X(X) - pass - def _create_contingency_table(self, X: pd.DataFrame, y: pd.Series, variable: str) -> dict: """ diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py index f397fa018..f51adc5b2 100644 --- a/tests/test_discretisation/test_chi_merge_discretiser.py +++ b/tests/test_discretisation/test_chi_merge_discretiser.py @@ -10,4 +10,71 @@ col_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"] iris_data = datasets.load_iris().data iris = pd.DataFrame(iris_data, columns=col_names) -iris["flower"] = datasets.load_iris().target \ No newline at end of file +iris["flower"] = datasets.load_iris().target + + +def test_create_contingency_table(): + transformer = ChiMergeDiscretiser( + variables=["sepal_length", "sepal_width"], + threshold=0.8, + min_intervals=2, + max_intervals=10, + return_object=False, + return_boundaries=False, + ) + + contingency_table = transformer._create_contingency_table( + X=iris[["sepal_length", "sepal_width", "petal_length"]], + y=iris["flower"], + variable="sepal_length" + ) + + # number of flowers included in contingency table + table_flower_count = 0 + for count_arr in contingency_table.values(): + table_flower_count += 
sum(count_arr) + + # expected results + expected_results = { + 4.3: [1, 0, 0], + 4.4: [3, 0, 0], + 4.5: [1, 0, 0], + 4.6: [4, 0, 0], + 4.7: [2, 0, 0], + 4.8: [5, 0, 0], + 4.9: [4, 1, 1], + 5.0: [8, 2, 0], + 5.1: [8, 1, 0], + 5.2: [3, 1, 0], + 5.3: [1, 0, 0], + 5.4: [5, 1, 0], + 5.5: [2, 5, 0], + 5.6: [0, 5, 1], + 5.7: [2, 5, 1], + 5.8: [1, 3, 3], + 5.9: [0, 2, 1], + 6.0: [0, 4, 2], + 6.1: [0, 4, 2], + 6.2: [0, 2, 2], + 6.3: [0, 3, 6], + 6.4: [0, 2, 5], + 6.5: [0, 1, 4], + 6.6: [0, 2, 0], + 6.7: [0, 3, 5], + 6.8: [0, 1, 2], + 6.9: [0, 1, 3], + 7.0: [0, 1, 0], + 7.1: [0, 0, 1], + 7.2: [0, 0, 3], + 7.3: [0, 0, 1], + 7.4: [0, 0, 1], + 7.6: [0, 0, 1], + 7.7: [0, 0, 4], + 7.9: [0, 0, 1] + } + num_flowers = iris.shape[0] + + # check results + assert contingency_table == expected_results + # check all flowers are included + assert table_flower_count == num_flowers \ No newline at end of file From 56eb8a4b7db09f8e94228e68d259b36a29ff66f5 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Mon, 13 Jun 2022 18:14:38 -0700 Subject: [PATCH 09/22] fix errors --- feature_engine/discretisation/chi_merge.py | 18 +++++------------- .../test_chi_merge_discretiser.py | 2 +- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index 6b2c5a7f9..8401db268 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -124,7 +124,7 @@ def _create_contingency_table(self, X: pd.DataFrame, y: pd.Series, variable: str Parameters ---------- - feature: pandas series = [n_samples, ] + X: pandas series = [n_samples, ] The data to discretised. 
y: pandas series = [n_samples, ] @@ -143,7 +143,10 @@ def _create_contingency_table(self, X: pd.DataFrame, y: pd.Series, variable: str unique_values = sorted(set(X[variable])) unique_labels = list(set(y)) - contingency_table = {l: [0] * len(unique_labels) for l in unique_values} + # stores frequency distribution for each unique value + contingency_table = { + val: np.array([0] * len(unique_labels)) for val in unique_values + } for value, label in zip(X[variable], y): contingency_table[value][label] += 1 @@ -168,17 +171,6 @@ def _calc_chi_square(self, array: np.array) -> float: Determines whether two sets of measurements are related. """ - if not isinstance(array, np.array): - raise ValueError( - f"array must be a numpy array. Got {type(array)} instead." - ) - - if array.shape[0] != 2: - raise ValueError( - f"array must be comprised of 2 rows. Got " - f"{array.shape[0]} instead" - ) - shape = array.shape num_obs = float(array.sum()) rows_sums = {} diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py index f51adc5b2..dd36ff065 100644 --- a/tests/test_discretisation/test_chi_merge_discretiser.py +++ b/tests/test_discretisation/test_chi_merge_discretiser.py @@ -76,5 +76,5 @@ def test_create_contingency_table(): # check results assert contingency_table == expected_results - # check all flowers are included + # confirm all flowers are included assert table_flower_count == num_flowers \ No newline at end of file From 4809988ccf2bfda3fd9f46b47e3a098c4532cea3 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Fri, 17 Jun 2022 17:55:09 -0700 Subject: [PATCH 10/22] chi_merge_disc --- feature_engine/discretisation/chi_merge.py | 33 ++++++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index 8401db268..dd72ccf9d 100644 --- a/feature_engine/discretisation/chi_merge.py +++ 
b/feature_engine/discretisation/chi_merge.py @@ -90,11 +90,14 @@ def fit(self, X: pd.DataFrame, y: pd.Series): """ # check input dataframe X = check_X(X) - _check_contains_na(X) - _check_contains_inf(X) + _check_contains_na(X, self.variables) + _check_contains_inf(X, self.variables) # find or check for numerical variables - self.variables = _find_or_check_numerical_variables(X, self.variables) + # self.variables = _find_or_check_numerical_variables(X, self.variables) + + self.contingency_table_ = self._create_contingency_table(X, y, self.variables) + self.chi2_scores_dict_ = self._create_chi_square_scores_dict() def transform(self, X: pd.DataFrame) -> pd.DataFrame: @@ -199,5 +202,29 @@ def _calc_chi_square(self, array: np.array) -> float: return chi2 + def _create_chi_square_scores_dict(self): + """ + Calculate all chi-square scores for each sequential distribution of + the contingency table. The dictionary keys correspond to the + lower-bound of the interval. + + Parameters + ---------- + None + + Returns + ------- + chi2_scores: dict + The chi-square score for each sequential distribution + + """ + chi2_scores = {} + unique_values = list(self.contingency_table_.keys()) + frequency_distributions = np.array(list(self.contingency_table_.values())) + + for idx in range(2, len(unique_values) + 1): + chi2 = self._calc_chi_square(frequency_distributions[idx - 2: idx]) + chi2_scores[unique_values[idx - 2]] = chi2 + return chi2_scores From 74faad65f9c7994a5b8ffbda4a7c84cf0853a13e Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Fri, 17 Jun 2022 19:29:31 -0700 Subject: [PATCH 11/22] change _create_contingency_table() to _create_frequency_matrix(). method now returns a 2-d numpy array and 1-d numpy array instead of a dictionary. 
--- feature_engine/discretisation/chi_merge.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index dd72ccf9d..718884c0c 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -204,7 +204,7 @@ def _calc_chi_square(self, array: np.array) -> float: def _create_chi_square_scores_dict(self): """ - Calculate all chi-square scores for each sequential distribution of + Calculate all chi-square scores for each adjacent distribution of the contingency table. The dictionary keys correspond to the lower-bound of the interval. @@ -215,8 +215,8 @@ def _create_chi_square_scores_dict(self): Returns ------- chi2_scores: dict - The chi-square score for each sequential distribution - + The chi-square scores for each adjacent distribution + """ chi2_scores = {} unique_values = list(self.contingency_table_.keys()) From 2c040fa234e1b5fc33653e0dee78d490f12028a9 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Fri, 17 Jun 2022 19:37:41 -0700 Subject: [PATCH 12/22] change _create_contingency_table() to _create_frequency_matrix(). method now returns a 2-d numpy array and 1-d numpy array instead of a dictionary. 
--- feature_engine/discretisation/chi_merge.py | 27 ++++++++++++---------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index 718884c0c..4ac87c120 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -2,6 +2,7 @@ from typing import List, Optional, Union import numpy as np +from numpy.typing import NDArray import pandas as pd from sklearn.utils.validation import check_is_fitted @@ -96,7 +97,9 @@ def fit(self, X: pd.DataFrame, y: pd.Series): # find or check for numerical variables # self.variables = _find_or_check_numerical_variables(X, self.variables) - self.contingency_table_ = self._create_contingency_table(X, y, self.variables) + self.frequency_matrix_intervals_, self.frequency_matrix_ = ( + self._create_frequency_matrix(X, y, self.variables) + ) self.chi2_scores_dict_ = self._create_chi_square_scores_dict() @@ -120,8 +123,8 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # check if X is a dataframe X = check_X(X) - - def _create_contingency_table(self, X: pd.DataFrame, y: pd.Series, variable: str) -> dict: + # TODO: How to type hint 2 numpy arrays + def _create_frequency_matrix(self, X: pd.DataFrame, y: pd.Series, variable: str) -> [NDArray, NDArray]: """ Generates a frequency table in which the labels organized into bins. 
@@ -143,18 +146,18 @@ def _create_contingency_table(self, X: pd.DataFrame, y: pd.Series, variable: str """ - - unique_values = sorted(set(X[variable])) - unique_labels = list(set(y)) - # stores frequency distribution for each unique value - contingency_table = { - val: np.array([0] * len(unique_labels)) for val in unique_values - } + frequency_matrix_intervals = np.sort(np.unique(X[variable])) + unique_class_values = np.sort(np.unique(y)) + frequency_matrix = np.zeros( + (len(frequency_matrix_intervals), len(unique_class_values)) + ) for value, label in zip(X[variable], y): - contingency_table[value][label] += 1 + row_idx = np.where(frequency_matrix_intervals == value)[0][0] + col_idx = np.where(unique_class_values == label)[0][0] + frequency_matrix[row_idx][col_idx] += 1 - return contingency_table + return frequency_matrix_intervals, frequency_matrix def _calc_chi_square(self, array: np.array) -> float: From 606f7096a017a0395da9cee81dac6035df762f0f Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 18 Jun 2022 13:10:21 -0700 Subject: [PATCH 13/22] delete _create_chi_square_scores_dict(). create _perform_chi_merge(). New method is incomplete. Issue with some of the chi-square calculations. 
It only happens w/ certain distributions --- feature_engine/discretisation/chi_merge.py | 46 +++++++++++++++------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index 4ac87c120..fcdada119 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -100,7 +100,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): self.frequency_matrix_intervals_, self.frequency_matrix_ = ( self._create_frequency_matrix(X, y, self.variables) ) - self.chi2_scores_dict_ = self._create_chi_square_scores_dict() + self._perform_chi_merge() def transform(self, X: pd.DataFrame) -> pd.DataFrame: @@ -205,11 +205,10 @@ def _calc_chi_square(self, array: np.array) -> float: return chi2 - def _create_chi_square_scores_dict(self): + def _perform_chi_merge(self) -> None: """ - Calculate all chi-square scores for each adjacent distribution of - the contingency table. The dictionary keys correspond to the - lower-bound of the interval. + Merge adjacent distributions until the the minimum chi-square is greater than + the threshold or the number of frequency-matrix intervals. 
Parameters ---------- @@ -217,17 +216,36 @@ def _create_chi_square_scores_dict(self): Returns ------- - chi2_scores: dict - The chi-square scores for each adjacent distribution + None """ - chi2_scores = {} - unique_values = list(self.contingency_table_.keys()) - frequency_distributions = np.array(list(self.contingency_table_.values())) - for idx in range(2, len(unique_values) + 1): - chi2 = self._calc_chi_square(frequency_distributions[idx - 2: idx]) - chi2_scores[unique_values[idx - 2]] = chi2 + chi_test = {} + + while self.frequency_matrix_.shape[0] > self.min_intervals: + + chi_test = {} + shape = self.frequency_matrix_.shape + + for row_idx in range(0, shape[0] - 1): + row_idx_2 = row_idx + 1 + chi2 = self._calc_chi_square( + self.frequency_matrix_[row_idx: row_idx_2 + 1] + ) + + if chi2 not in chi_test: + chi_test[chi2] = [] + + chi_test[chi2].append((row_idx, row_idx_2)) + smallest = min(chi_test.keys()) + biggest = max(chi_test.keys()) - return chi2_scores + if smallest < self.threshold: + for lower_bound, upper_bound in list(reversed(chi_test[smallest])): + for col_idx in range(shape[1]): + self.frequency_matrix_[lower_bound, col_idx] += self.frequency_matrix_[upper_bound, col_idx] + self.frequency_matrix_ = np.delete(self.frequency_matrix_, upper_bound, 0) + self.frequency_matrix_intervals_ = np.delete(self.frequency_matrix_intervals_, upper_bound, 0) + else: + break From 50b7409f334baab819f642377bce4733cd9ab224 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 18 Jun 2022 18:41:21 -0700 Subject: [PATCH 14/22] update init() --- feature_engine/discretisation/chi_merge.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index fcdada119..083e34609 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -43,16 +43,16 @@ class ChiMergeDiscretiser(BaseDiscretiser): def __init__( self, variables: 
Union[None, int, str, List[Union[str, int]]] = None, - threshold: float = 0.9, + threshold: Union[float, int] = 1.4, min_intervals: int = 2, max_intervals: int = 10, return_object: bool = False, return_boundaries: bool = False, ) -> None: - # TODO: Add threshold must be >= 0 - if not isinstance(threshold, float) or threshold >= 1: + + if not isinstance(threshold, (int, float)) or threshold < 0: raise ValueError( - "threshold must be a float and less than one. " + "threshold must be a positive integer or a float. " f"Got {threshold} instead." ) From dafb3694f4d74b301f2762400333295bc8e4e966 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 18 Jun 2022 18:42:29 -0700 Subject: [PATCH 15/22] update _perform_chi_merge() --- feature_engine/discretisation/chi_merge.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index 083e34609..10170d6da 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -100,7 +100,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series): self.frequency_matrix_intervals_, self.frequency_matrix_ = ( self._create_frequency_matrix(X, y, self.variables) ) - self._perform_chi_merge() + self.chi_test_ = self._perform_chi_merge() def transform(self, X: pd.DataFrame) -> pd.DataFrame: @@ -249,3 +249,4 @@ def _perform_chi_merge(self) -> None: else: break + return chi_test \ No newline at end of file From 9b5ba12d4cad0e1b5753afef6a2ee675a9218d99 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sat, 18 Jun 2022 18:52:45 -0700 Subject: [PATCH 16/22] create test_chi_merge(). need to fix error. know test does not pass. the first 2 and last 2 chi-square values do not match expected results. meanwhile, the other 9 chi-square values match. 
unsure what is the cause of the discrepancy --- .../test_chi_merge_discretiser.py | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py index dd36ff065..bf149aca5 100644 --- a/tests/test_discretisation/test_chi_merge_discretiser.py +++ b/tests/test_discretisation/test_chi_merge_discretiser.py @@ -15,8 +15,8 @@ def test_create_contingency_table(): transformer = ChiMergeDiscretiser( - variables=["sepal_length", "sepal_width"], - threshold=0.8, + variables="sepal_length", + threshold=1.4, min_intervals=2, max_intervals=10, return_object=False, @@ -77,4 +77,28 @@ def test_create_contingency_table(): # check results assert contingency_table == expected_results # confirm all flowers are included - assert table_flower_count == num_flowers \ No newline at end of file + assert table_flower_count == num_flowers + + +def test_chi_merge(): + + transformer = ChiMergeDiscretiser( + variables="sepal_length", + threshold=1.4, + min_intervals=2, + max_intervals=10, + return_object=False, + return_boundaries=False, + ) + + transformer.fit( + iris[["sepal_length", "sepal_width", "petal_length"]], iris["flower"] + ) + + chi_test = transformer._perform_chi_merge() + chi_scores_round = pd.Series(transformer.chi_test_.keys()).round(1) + expected_results = pd.Series( + [4.1, 2.4, 8.6, 2.9, 1.7, 1.8, 2.2, 4.8, 4.1, 3.2, 1.5, 3.6] + ) + + assert chi_scores_round == expected_results \ No newline at end of file From 71748e2579dc8f0cc42646adbed3ea34cfb4c8f0 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Sun, 19 Jun 2022 08:59:07 -0700 Subject: [PATCH 17/22] update _perform_chi_merge() --- feature_engine/discretisation/chi_merge.py | 9 +++++++-- tests/test_discretisation/test_chi_merge_discretiser.py | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/feature_engine/discretisation/chi_merge.py 
b/feature_engine/discretisation/chi_merge.py index 10170d6da..ca43681b2 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -237,13 +237,18 @@ def _perform_chi_merge(self) -> None: chi_test[chi2] = [] chi_test[chi2].append((row_idx, row_idx_2)) - smallest = min(chi_test.keys()) - biggest = max(chi_test.keys()) + smallest = min(chi_test.keys()) + biggest = max(chi_test.keys()) if smallest < self.threshold: + + # reversee list allows code to remove the upperbound as it is updating the frequency matrix for lower_bound, upper_bound in list(reversed(chi_test[smallest])): for col_idx in range(shape[1]): + # merge upperbound distribution into lowerbound distribution self.frequency_matrix_[lower_bound, col_idx] += self.frequency_matrix_[upper_bound, col_idx] + + # delete upperbound and its distribution from the frequeny matrix self.frequency_matrix_ = np.delete(self.frequency_matrix_, upper_bound, 0) self.frequency_matrix_intervals_ = np.delete(self.frequency_matrix_intervals_, upper_bound, 0) else: diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py index bf149aca5..8b1c0f965 100644 --- a/tests/test_discretisation/test_chi_merge_discretiser.py +++ b/tests/test_discretisation/test_chi_merge_discretiser.py @@ -96,9 +96,9 @@ def test_chi_merge(): ) chi_test = transformer._perform_chi_merge() - chi_scores_round = pd.Series(transformer.chi_test_.keys()).round(1) + chi_scores_round = pd.Series(chi_test.keys()).round(1) expected_results = pd.Series( [4.1, 2.4, 8.6, 2.9, 1.7, 1.8, 2.2, 4.8, 4.1, 3.2, 1.5, 3.6] ) - assert chi_scores_round == expected_results \ No newline at end of file + assert (chi_scores_round == expected_results).all() \ No newline at end of file From 64ba595842c9afb6f55344433c07e880fa8e405a Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Mon, 27 Jun 2022 17:33:53 -0700 Subject: [PATCH 18/22] add/revise comments --- 
feature_engine/discretisation/chi_merge.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index ca43681b2..f694e06ec 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -208,7 +208,8 @@ def _calc_chi_square(self, array: np.array) -> float: def _perform_chi_merge(self) -> None: """ Merge adjacent distributions until the the minimum chi-square is greater than - the threshold or the number of frequency-matrix intervals. + the threshold or the number of frequency-matrix intervals is equal to the + limit of the minimum number of intervals. Parameters ---------- @@ -220,8 +221,6 @@ def _perform_chi_merge(self) -> None: """ - chi_test = {} - while self.frequency_matrix_.shape[0] > self.min_intervals: chi_test = {} @@ -237,20 +236,24 @@ def _perform_chi_merge(self) -> None: chi_test[chi2] = [] chi_test[chi2].append((row_idx, row_idx_2)) - smallest = min(chi_test.keys()) - biggest = max(chi_test.keys()) - if smallest < self.threshold: + # use variable to merge the frequency-matrix intervals that + # have the lowest confidence that the frequency distributions are different + min_chi_score = min(chi_test.keys()) + + if min_chi_score < self.threshold: - # reversee list allows code to remove the upperbound as it is updating the frequency matrix - for lower_bound, upper_bound in list(reversed(chi_test[smallest])): + # reverse list allows code to remove the upperbound as it is updating the frequency matrix + for lower_bound, upper_bound in list(reversed(chi_test[min_chi_score])): for col_idx in range(shape[1]): - # merge upperbound distribution into lowerbound distribution + # merge upper-bound distribution into lower-bound distribution self.frequency_matrix_[lower_bound, col_idx] += self.frequency_matrix_[upper_bound, col_idx] # delete upperbound and its distribution from the frequeny matrix 
self.frequency_matrix_ = np.delete(self.frequency_matrix_, upper_bound, 0) self.frequency_matrix_intervals_ = np.delete(self.frequency_matrix_intervals_, upper_bound, 0) + + # stop merge when minimum chi-score is greater than or equal to the threshold else: break From 1645e8dfeda3cc4688cdce736984c0259d569592 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Mon, 27 Jun 2022 17:38:14 -0700 Subject: [PATCH 19/22] fix nomanclature --- tests/test_discretisation/test_chi_merge_discretiser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py index 8b1c0f965..a9df144cd 100644 --- a/tests/test_discretisation/test_chi_merge_discretiser.py +++ b/tests/test_discretisation/test_chi_merge_discretiser.py @@ -13,7 +13,7 @@ iris["flower"] = datasets.load_iris().target -def test_create_contingency_table(): +def test_create_frequency_matrix(): transformer = ChiMergeDiscretiser( variables="sepal_length", threshold=1.4, @@ -23,7 +23,7 @@ def test_create_contingency_table(): return_boundaries=False, ) - contingency_table = transformer._create_contingency_table( + frequency_matrix = transformer._create_frequency_matrix( X=iris[["sepal_length", "sepal_width", "petal_length"]], y=iris["flower"], variable="sepal_length" @@ -31,7 +31,7 @@ def test_create_contingency_table(): # number of flowers included in contingency table table_flower_count = 0 - for count_arr in contingency_table.values(): + for count_arr in frequency_matrix.values(): table_flower_count += sum(count_arr) # expected results @@ -75,7 +75,7 @@ def test_create_contingency_table(): num_flowers = iris.shape[0] # check results - assert contingency_table == expected_results + assert frequency_matrix == expected_results # confirm all flowers are included assert table_flower_count == num_flowers From 9a6c31d90affbeef9e118e6412f0ec0eab741e11 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Mon, 27 
Jun 2022 18:07:34 -0700 Subject: [PATCH 20/22] edit test_chi_merge() --- .../test_chi_merge_discretiser.py | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py index a9df144cd..7b8cb3fc6 100644 --- a/tests/test_discretisation/test_chi_merge_discretiser.py +++ b/tests/test_discretisation/test_chi_merge_discretiser.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd import pytest from sklearn import datasets @@ -28,14 +29,15 @@ def test_create_frequency_matrix(): y=iris["flower"], variable="sepal_length" ) + lengths = list(frequency_matrix[0]) + distributions = list(frequency_matrix[1]) + freq_matrix_dict = {l: list(d) for l, d in zip(lengths, distributions)} - # number of flowers included in contingency table - table_flower_count = 0 - for count_arr in frequency_matrix.values(): - table_flower_count += sum(count_arr) + # number of flowers accounted for in frequency matrix + num_flowers = np.sum(distributions) # expected results - expected_results = { + expected_frequency_matrix = { 4.3: [1, 0, 0], 4.4: [3, 0, 0], 4.5: [1, 0, 0], @@ -72,16 +74,16 @@ def test_create_frequency_matrix(): 7.7: [0, 0, 4], 7.9: [0, 0, 1] } - num_flowers = iris.shape[0] + expected_num_flowers = iris.shape[0] # check results - assert frequency_matrix == expected_results + assert freq_matrix_dict == expected_frequency_matrix # confirm all flowers are included - assert table_flower_count == num_flowers + assert num_flowers == expected_num_flowers def test_chi_merge(): - + # test 1 - threshold is 0.5 significance level transformer = ChiMergeDiscretiser( variables="sepal_length", threshold=1.4, @@ -95,10 +97,19 @@ def test_chi_merge(): iris[["sepal_length", "sepal_width", "petal_length"]], iris["flower"] ) - chi_test = transformer._perform_chi_merge() - chi_scores_round = pd.Series(chi_test.keys()).round(1) - expected_results = pd.Series( + 
chi_scores = transformer.chi_test_.keys() + chi_scores_round = pd.Series(chi_scores).round(1) + + frequency_matrix_intervals = list(transformer.frequency_matrix_intervals_) + + # expected results + expected_chi_scores = pd.Series( [4.1, 2.4, 8.6, 2.9, 1.7, 1.8, 2.2, 4.8, 4.1, 3.2, 1.5, 3.6] ) + expected_frequency_matrix_intervals = [ + 4.3, 4.9, 5.0, 5.5, 5.6, 5.7, 5.8, 5.9, 6.3, 6.6, 6.7, 7.0, 7.1 + ] - assert (chi_scores_round == expected_results).all() \ No newline at end of file + # tests - 0.5 significance level + assert frequency_matrix_intervals == expected_frequency_matrix_intervals + assert (chi_scores_round == expected_chi_scores).all() From b8c83bcbdd6157cbfef6cfbc9d5a7bcae2d2b891 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Tue, 28 Jun 2022 17:41:24 -0700 Subject: [PATCH 21/22] expand test_chi_merge(). test still fails. --- .../test_chi_merge_discretiser.py | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py index 7b8cb3fc6..6d5c4a1a2 100644 --- a/tests/test_discretisation/test_chi_merge_discretiser.py +++ b/tests/test_discretisation/test_chi_merge_discretiser.py @@ -83,7 +83,7 @@ def test_create_frequency_matrix(): def test_chi_merge(): - # test 1 - threshold is 0.5 significance level + # Test 1 - threshold is 0.5 significance level transformer = ChiMergeDiscretiser( variables="sepal_length", threshold=1.4, @@ -113,3 +113,30 @@ def test_chi_merge(): # tests - 0.5 significance level assert frequency_matrix_intervals == expected_frequency_matrix_intervals assert (chi_scores_round == expected_chi_scores).all() + + # Test 2 - threshold is 0.9 significance level + transformer = ChiMergeDiscretiser( + variables="sepal_length", + threshold=4.6, + min_intervals=2, + max_intervals=10, + return_object=False, + return_boundaries=False, + ) + + transformer.fit( + iris[["sepal_length", "sepal_width", 
"petal_length"]], iris["flower"] + ) + + chi_scores = transformer.chi_test_.keys() + chi_scores_round = pd.Series(chi_scores).round(1) + + frequency_matrix_intervals = list(transformer.frequency_matrix_intervals_) + + # expected results + expected_chi_scores = pd.Series([30.9, 6.7, 4.9, 5.9]) + expected_frequency_matrix_intervals = [4.3, 5.5, 5.8, 6.3, 7.1] + + # tests - 0.9 significance level + assert frequency_matrix_intervals == expected_frequency_matrix_intervals + assert (chi_scores_round == expected_chi_scores).all() \ No newline at end of file From fac0b324166b67caab156ce0e4bb6a33f1839d10 Mon Sep 17 00:00:00 2001 From: Morgan-Sell Date: Tue, 28 Jun 2022 18:07:35 -0700 Subject: [PATCH 22/22] ran isort and expand docstring --- feature_engine/discretisation/chi_merge.py | 65 +++++++++++++++++++--- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py index f694e06ec..6ef5b2d69 100644 --- a/feature_engine/discretisation/chi_merge.py +++ b/feature_engine/discretisation/chi_merge.py @@ -1,19 +1,21 @@ -from typing import List, Optional, Union +from typing import List, Union import numpy as np -from numpy.typing import NDArray import pandas as pd +from numpy.typing import NDArray from sklearn.utils.validation import check_is_fitted -from feature_engine.discretisation.base_discretiser import BaseDiscretiser -from feature_engine._docstrings.methods import _fit_transform_docstring +from feature_engine._docstrings.class_inputs import ( + _variables_numerical_docstring, + _drop_original_docstring, +) from feature_engine._docstrings.fit_attributes import ( - _variables_attribute_docstring, _feature_names_in_docstring, _n_features_in_docstring, + _variables_attribute_docstring, ) -from feature_engine._docstrings.class_inputs import _variables_numerical_docstring +from feature_engine._docstrings.methods import _fit_transform_docstring from feature_engine._docstrings.substitute 
import Substitution from feature_engine.dataframe_checks import ( _check_contains_inf, @@ -21,13 +23,22 @@ _check_X_matches_training_df, check_X, ) +from feature_engine.discretisation.base_discretiser import BaseDiscretiser from feature_engine.variable_manipulation import ( _check_input_parameter_variables, _find_or_check_numerical_variables, ) - - +@Substitution( + variables=_variables_numerical_docstring, + drop_original=_drop_original_docstring, + fit_transform=_fit_transform_docstring, + return_objects=BaseDiscretiser._return_object_docstring, + return_boundaries=BaseDiscretiser._return_boundaries_docstring, + binner_dict_=BaseDiscretiser._binner_dict_docstring, + fit=BaseDiscretiser._fit_docstring, + transform=BaseDiscretiser._transform_docstring +) class ChiMergeDiscretiser(BaseDiscretiser): """" @@ -36,6 +47,44 @@ class ChiMergeDiscretiser(BaseDiscretiser): for the categorical variable. + Parameters + ---------- + {variables} + + threshold: float or int, default=1.4 + The transformer will merge the frequency distributions until + all chi-scores are greater than or equal to the threshold. + + min_intervals: int, default=2 + An additional constraint for the transformer. The transformer + stops merging the distributions once the number of frequency matrix + intervals equals the min_intervals. + + max_intervals: int, default=10 + # TODO: Does not exist. Do we need this param? + + {drop_original} + + + Attributes + ---------- + frequency_matrix_intervals_: + The variable values that are used as the upper- and lower-bounds + of the frequency matrix. + + frequency_matrix_: + The frequency distributions for every interval. + + chi_test_: + The chi-scores for all adjacent frequency distributions. + + {binner_dict_} + + + + Methods + ------- + {fit}