From 543e23610d9143af4afea93045ffbc9e40ff0e11 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Sat, 21 May 2022 21:51:07 -0700
Subject: [PATCH 01/22] initial commit. create class skeleton.

---
 feature_engine/discretisation/chi_merge.py | 93 ++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 feature_engine/discretisation/chi_merge.py

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
new file mode 100644
index 000000000..866115407
--- /dev/null
+++ b/feature_engine/discretisation/chi_merge.py
@@ -0,0 +1,93 @@
+
+from typing import List, Optional, Union
+
+import pandas as pd
+
+from feature_engine.discretisation.base_discretiser import BaseDiscretiser
+from feature_engine._docstrings.methods import _fit_transform_docstring
+from feature_engine._docstrings.fit_attributes import (
+    _variables_attribute_docstring,
+    _feature_names_in_docstring,
+    _n_features_in_docstring,
+)
+from feature_engine._docstrings.class_inputs import _variables_numerical_docstring
+from feature_engine._docstrings.substitute import Substitution
+from feature_engine.variable_manipulation import _check_input_parameter_variables
+
+
+
+class ChiMergeDiscretiser(BaseDiscretiser):
+    """"
+
+
+
+
+
+
+
+    """
+    def __init__(
+        self,
+        variables: Union[None, int, str, List[Union[str, int]]] = None,
+        threshold: float = 0.9,
+        min_intervals: int = 2,
+        max_intervals: int = 10,
+        return_object: bool = False,
+        return_boundaries: bool = False,
+    ) -> None:
+
+        if not isinstance(threshold, float) or threshold >= 1:
+            raise ValueError(
+                "threshold must be a float and less than one. "
+                f"Got {threshold} instead."
+            )
+
+        if not isinstance(min_intervals, int) or min_intervals < 2:
+            raise ValueError(
+                "min_intervals must be an integer that is greater than or "
+                f"equal to 2. Got {min_intervals} instead."
+            )
+
+        # TODO: Should we limit max_intervals? If so, how much?
+        if not isinstance(max_intervals, int) or max_intervals > 15:
+            raise ValueError(
+                "max_intervals must be an integer that is less than or "
+                f"equal to 15. Got {max_intervals} instead."
+            )
+        super().__init(return_object, return_boundaries)
+
+        self.variables = _check_input_parameter_variables(variables)
+        self.threshold = threshold
+        self.min_intervals = min_intervals
+        self.max_intervals = max_intervals
+
+    def fit(self, X: pd.DataFrame, y: Optional[pd.Series]):
+        """
+        Learn the limits of the intervals using the chi-square test.
+
+        Parameters
+        ----------
+        X: pandas dataframe of shape = [n_samples, n_features]
+            The training dataset. Can be the entire dataframe, not just the variables
+            to be transformed.
+        y: None
+            y is not needed in this encoder. You can pass y or None.
+
+        """
+        pass
+
+    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+        """
+        Sort the variable values into the intervals.
+
+        Parameters
+        ----------
+        X: pandas dataframe of shape = [n_samples, n_features]
+            The data to transform.
+
+        Returns
+        -------
+        X_new: pandas dataframe of shape = [n_samples, n_features]
+            The transformed data with the discrete variables.
+        """
+        pass
\ No newline at end of file

From 54363c60cf07ee5d3c28c65f0724c138f2e225d9 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Sat, 21 May 2022 22:43:55 -0700
Subject: [PATCH 02/22] expand fit and transform

---
 feature_engine/discretisation/chi_merge.py | 31 ++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index 866115407..0f1be1f19 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -2,6 +2,7 @@
 from typing import List, Optional, Union
 
 import pandas as pd
+from sklearn.utils.validation import check_is_fitted
 
 from feature_engine.discretisation.base_discretiser import BaseDiscretiser
 from feature_engine._docstrings.methods import _fit_transform_docstring
@@ -12,7 +13,16 @@
 )
 from feature_engine._docstrings.class_inputs import _variables_numerical_docstring
 from feature_engine._docstrings.substitute import Substitution
-from feature_engine.variable_manipulation import _check_input_parameter_variables
+from feature_engine.dataframe_checks import (
+    _check_contains_inf,
+    _check_contains_na,
+    _check_X_matches_training_df,
+    check_X,
+)
+from feature_engine.variable_manipulation import (
+    _check_input_parameter_variables,
+    _find_or_check_numerical_variables,
+)
 
 
 
@@ -74,7 +84,16 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series]):
             y is not needed in this encoder. You can pass y or None.
 
         """
-        pass
+        # check input dataframe
+        X = check_X(X)
+        _check_contains_na(X)
+        _check_contains_inf(X)
+
+        # find or check for numerical variables
+        self.variables = _find_or_check_numerical_variables(X, self.variables)
+
+
+         pass
 
     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         """
@@ -90,4 +109,12 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         X_new: pandas dataframe of shape = [n_samples, n_features]
             The transformed data with the discrete variables.
         """
+        # check that fit method has been called
+        check_is_fitted(self)
+
+        # check if X is a dataframe
+        X = check_X(X)
+
+        
+
         pass
\ No newline at end of file

From 54420714cd97324445783082f38488b98e068cdf Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Sun, 22 May 2022 11:53:40 -0700
Subject: [PATCH 03/22] start _create_contingency_table. stuck. debating if
 necessary

---
 feature_engine/discretisation/chi_merge.py | 60 ++++++++++++++++++++--
 1 file changed, 56 insertions(+), 4 deletions(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index 0f1be1f19..fe40b6b7f 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -29,7 +29,9 @@
 class ChiMergeDiscretiser(BaseDiscretiser):
     """"
 
-
+    Chi-Squared test is a statistical hypothesis test that assumes (the null hypothesis)
+    that the observed frequencies for a categorical variable match the expected frequencies
+    for the categorical variable.
 
 
 
@@ -71,7 +73,7 @@ def __init__(
         self.min_intervals = min_intervals
         self.max_intervals = max_intervals
 
-    def fit(self, X: pd.DataFrame, y: Optional[pd.Series]):
+    def fit(self, X: pd.DataFrame, y: pd.Series):
         """
         Learn the limits of the intervals using the chi-square test.
 
@@ -115,6 +117,56 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         # check if X is a dataframe
         X = check_X(X)
 
-        
 
-        pass
\ No newline at end of file
+
+        pass
+
+    def _create_contingency_table(self, feature: pd.Series, class_labels: pd.Series):
+        """
+        Generates a frequency table in which the labels organized into bins.
+
+        Parameters
+        ----------
+        feature: pandas series = [n_samples, ]
+            The data to discretised.
+
+        class_labels: pandas series = [n_samples, ]
+            The categorical data that will be arranged in the bins.
+
+        Returns
+        -------
+        TBD
+
+
+        """
+
+        unique_values = sorted(set(feature), reverse=False)
+        unique_labels = sorted(set(class_labels))
+        count_dict = {label: 0 for label in unique_labels}
+        zeros = [0 for i in range(len(unique_labels))]
+        frequency_table = {val: zeros for val in unique_values}
+
+        for feature_val, label_val in zip(feature, class_labels):
+            print(feature_val)
+            for idx, interval_key in enumerate(frequency_table.keys()):
+                min_interval = list(frequency_table.keys())[idx]
+                max_interval = list(frequency_table.keys())[idx + 1]
+                table_col_index = unique_labels.index(label_val)
+
+                print(idx, min_interval, max_interval)
+                if interval_key == max(unique_values):
+                    frequency_table[interval_key][label_val] += 1
+                    print(feature_val, label_val, min_interval, max_interval, table_col_index)
+                    print(frequency_table)
+                    break
+
+                if min_interval <= feature_val and feature_val < max_interval:
+                    print(feature_val, label_val, min_interval, max_interval, table_col_index)
+                    frequency_table[min_interval][label_val] += 1
+                    print(frequency_table)
+                    break
+
+
+    def _calc_chi_sqaure(self):
+
+

From 61752bdf0653845bf61ba77e4eaf0de7d608fdc4 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Sat, 4 Jun 2022 09:59:55 -0700
Subject: [PATCH 04/22] create _calc_chi_square()

---
 feature_engine/discretisation/chi_merge.py | 57 +++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index fe40b6b7f..8ba219f4d 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -1,6 +1,7 @@
 
 from typing import List, Optional, Union
 
+import numpy as np
 import pandas as pd
 from sklearn.utils.validation import check_is_fitted
 
@@ -167,6 +168,60 @@ def _create_contingency_table(self, feature: pd.Series, class_labels: pd.Series)
                     break
 
 
-    def _calc_chi_sqaure(self):
+    def _calc_chi_sqaure(self, array: np.array) -> float:
+        """
+        Calculates chi-squared. Using the following equation:
+
+        # TODO: Add chi2 formula docstring
+
+        Parameters
+        ----------
+        X: np.array = [2, n_features]
+            Two sequential rows from the contingency table.
+
+        Returns
+        -------
+        chi2: float
+            Determines whether two sets of measurements are related.
+        """
+
+        if not isinstance(array, np.array):
+            raise ValueError(
+                f"array must be a numpy array. Got {type(array)} instead."
+            )
+
+        if array.shape[0] != 2:
+            raise ValueError(
+                f"array must be comprised of 2 rows. Got "
+                f"{array.shape[0]} instead"
+            )
+
+        shape = array.shape
+        num_obs = float(array.sum())
+        rows_sums = {}
+        cols_sums = {}
+        chi2 = 0
+
+        # calculate row-wise summations
+        for row_idx in range(shape[0]):
+            rows_sums[row_idx] = array[row_idx, :].sum()
+
+        # calculate column-wise summations
+        for col_idx in range(shape[1]):
+            cols_sums[col_idx] = array[:, col_idx].sum()
+
+        # iterate through all expect and actual value pairs.
+        for row_idx in range(shape[0]):
+            for col_idx in range(shape[1]):
+                expected_val = rows_sums[row_idx] * cols_sums[col_idx] / num_obs
+                actual_val = array[row_idx, col_idx]
+
+                if expected_val == 0:
+                    # prevents NaN error
+                    chi2 += 0
+                else:
+                    chi2 += (actual_val - expected_val) ** 2 / float(expected_val)
+
+        return chi2
 
 

From 2fdb15cb2c19b364d4872698055912f2586fb69b Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Sat, 4 Jun 2022 10:00:17 -0700
Subject: [PATCH 05/22] rename _calc_chi_sqaure() to _calc_chi_square()

---
 feature_engine/discretisation/chi_merge.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index 8ba219f4d..0bbe2362c 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -168,7 +168,7 @@ def _create_contingency_table(self, feature: pd.Series, class_labels: pd.Series)
                     break
 
 
-    def _calc_chi_sqaure(self, array: np.array) -> float:
+    def _calc_chi_square(self, array: np.array) -> float:
         """
         Calculates chi-squared. Using the following equation:
 

From dd22c1949714f6db20aee6a3d945f9bf1c579f0f Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Sat, 4 Jun 2022 16:51:05 -0700
Subject: [PATCH 06/22] revise _create_contingency_table(). simplify code

---
 feature_engine/discretisation/chi_merge.py | 52 +++++++++-------------
 1 file changed, 20 insertions(+), 32 deletions(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index 0bbe2362c..6c49894d4 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -83,8 +83,9 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
         X: pandas dataframe of shape = [n_samples, n_features]
             The training dataset. Can be the entire dataframe, not just the variables
             to be transformed.
-        y: None
-            y is not needed in this encoder. You can pass y or None.
+
+        y: pd.Series
+            y is the predicted variables.
 
         """
         # check input dataframe
@@ -118,11 +119,10 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         # check if X is a dataframe
         X = check_X(X)
 
-
-
         pass
 
-    def _create_contingency_table(self, feature: pd.Series, class_labels: pd.Series):
+
+    def _create_contingency_table(self, X: pd.DataFrame, y: pd.Series, variable: str) -> dict:
         """
         Generates a frequency table in which the labels organized into bins.
 
@@ -131,41 +131,28 @@ def _create_contingency_table(self, feature: pd.Series, class_labels: pd.Series)
         feature: pandas series = [n_samples, ]
             The data to discretised.
 
-        class_labels: pandas series = [n_samples, ]
+        y: pandas series = [n_samples, ]
             The categorical data that will be arranged in the bins.
 
+        variable: str
+            The variable used to count the frequency of the class labels.
+
         Returns
         -------
-        TBD
+        contingency_table: dict
+            A frequency table of the tables for each unvariable feature value.
 
 
         """
 
-        unique_values = sorted(set(feature), reverse=False)
-        unique_labels = sorted(set(class_labels))
-        count_dict = {label: 0 for label in unique_labels}
-        zeros = [0 for i in range(len(unique_labels))]
-        frequency_table = {val: zeros for val in unique_values}
-
-        for feature_val, label_val in zip(feature, class_labels):
-            print(feature_val)
-            for idx, interval_key in enumerate(frequency_table.keys()):
-                min_interval = list(frequency_table.keys())[idx]
-                max_interval = list(frequency_table.keys())[idx + 1]
-                table_col_index = unique_labels.index(label_val)
-
-                print(idx, min_interval, max_interval)
-                if interval_key == max(unique_values):
-                    frequency_table[interval_key][label_val] += 1
-                    print(feature_val, label_val, min_interval, max_interval, table_col_index)
-                    print(frequency_table)
-                    break
-
-                if min_interval <= feature_val and feature_val < max_interval:
-                    print(feature_val, label_val, min_interval, max_interval, table_col_index)
-                    frequency_table[min_interval][label_val] += 1
-                    print(frequency_table)
-                    break
+        unique_values = sorted(set(X[variable]))
+        unique_labels = list(set(y))
+        contingency_table = {l: [0] * len(unique_labels) for l in unique_values}
+
+        for value, label in zip(X[variable], y):
+            contingency_table[value][label] += 1
+
+        return contingency_table
 
 
     def _calc_chi_square(self, array: np.array) -> float:
@@ -225,3 +212,4 @@ def _calc_chi_square(self, array: np.array) -> float:
         return chi2
 
 
+

From d6317b3398dd712c84b7cb0cadba405fe312ef18 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Mon, 13 Jun 2022 16:37:42 -0700
Subject: [PATCH 07/22] create test_chi_merge_discretiser.py

---
 feature_engine/discretisation/__init__.py           |  2 ++
 .../test_chi_merge_discretiser.py                   | 13 +++++++++++++
 2 files changed, 15 insertions(+)
 create mode 100644 tests/test_discretisation/test_chi_merge_discretiser.py

diff --git a/feature_engine/discretisation/__init__.py b/feature_engine/discretisation/__init__.py
index a305c8a93..19fd90c7e 100644
--- a/feature_engine/discretisation/__init__.py
+++ b/feature_engine/discretisation/__init__.py
@@ -7,10 +7,12 @@
 from .decision_tree import DecisionTreeDiscretiser
 from .equal_frequency import EqualFrequencyDiscretiser
 from .equal_width import EqualWidthDiscretiser
+from .chi_merge import ChiMergeDiscretiser
 
 __all__ = [
     "DecisionTreeDiscretiser",
     "EqualFrequencyDiscretiser",
     "EqualWidthDiscretiser",
     "ArbitraryDiscretiser",
+    "ChiMergeDiscretiser",
 ]
diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py
new file mode 100644
index 000000000..f397fa018
--- /dev/null
+++ b/tests/test_discretisation/test_chi_merge_discretiser.py
@@ -0,0 +1,13 @@
+import pandas as pd
+import pytest
+from sklearn import datasets
+
+from feature_engine.discretisation import ChiMergeDiscretiser
+
+# TODO: Should we create the df here on in conftest?
+
+# create dataset for unit tests
+col_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
+iris_data = datasets.load_iris().data
+iris = pd.DataFrame(iris_data, columns=col_names)
+iris["flower"] = datasets.load_iris().target
\ No newline at end of file

From c41e09c1a872d4a75af86eefa9acc55be2588aa1 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Mon, 13 Jun 2022 17:04:31 -0700
Subject: [PATCH 08/22] create/pass test_create_contingency_table() and fix
 syntax errors

---
 feature_engine/discretisation/chi_merge.py    |  8 +--
 .../test_chi_merge_discretiser.py             | 69 ++++++++++++++++++-
 2 files changed, 70 insertions(+), 7 deletions(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index 6c49894d4..6b2c5a7f9 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -48,7 +48,7 @@ def __init__(
         return_object: bool = False,
         return_boundaries: bool = False,
     ) -> None:
-
+        # TODO: Add threshold must be >= 0
         if not isinstance(threshold, float) or threshold >= 1:
             raise ValueError(
                 "threshold must be a float and less than one. "
@@ -67,7 +67,7 @@ def __init__(
                 "max_intervals must be an integer that is less than or "
                 f"equal to 15. Got {max_intervals} instead."
             )
-        super().__init(return_object, return_boundaries)
+        super().__init__(return_object, return_boundaries)
 
         self.variables = _check_input_parameter_variables(variables)
         self.threshold = threshold
@@ -97,8 +97,6 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
         self.variables = _find_or_check_numerical_variables(X, self.variables)
 
 
-         pass
-
     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         """
         Sort the variable values into the intervals.
@@ -119,8 +117,6 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         # check if X is a dataframe
         X = check_X(X)
 
-        pass
-
 
     def _create_contingency_table(self, X: pd.DataFrame, y: pd.Series, variable: str) -> dict:
         """
diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py
index f397fa018..f51adc5b2 100644
--- a/tests/test_discretisation/test_chi_merge_discretiser.py
+++ b/tests/test_discretisation/test_chi_merge_discretiser.py
@@ -10,4 +10,71 @@
 col_names = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
 iris_data = datasets.load_iris().data
 iris = pd.DataFrame(iris_data, columns=col_names)
-iris["flower"] = datasets.load_iris().target
\ No newline at end of file
+iris["flower"] = datasets.load_iris().target
+
+
+def test_create_contingency_table():
+    transformer = ChiMergeDiscretiser(
+        variables=["sepal_length", "sepal_width"],
+        threshold=0.8,
+        min_intervals=2,
+        max_intervals=10,
+        return_object=False,
+        return_boundaries=False,
+    )
+
+    contingency_table = transformer._create_contingency_table(
+        X=iris[["sepal_length", "sepal_width", "petal_length"]],
+        y=iris["flower"],
+        variable="sepal_length"
+    )
+
+    # number of flowers included in contingency table
+    table_flower_count = 0
+    for count_arr in contingency_table.values():
+        table_flower_count += sum(count_arr)
+
+    # expected results
+    expected_results = {
+        4.3: [1, 0, 0],
+        4.4: [3, 0, 0],
+        4.5: [1, 0, 0],
+        4.6: [4, 0, 0],
+        4.7: [2, 0, 0],
+        4.8: [5, 0, 0],
+        4.9: [4, 1, 1],
+        5.0: [8, 2, 0],
+        5.1: [8, 1, 0],
+        5.2: [3, 1, 0],
+        5.3: [1, 0, 0],
+        5.4: [5, 1, 0],
+        5.5: [2, 5, 0],
+        5.6: [0, 5, 1],
+        5.7: [2, 5, 1],
+        5.8: [1, 3, 3],
+        5.9: [0, 2, 1],
+        6.0: [0, 4, 2],
+        6.1: [0, 4, 2],
+        6.2: [0, 2, 2],
+        6.3: [0, 3, 6],
+        6.4: [0, 2, 5],
+        6.5: [0, 1, 4],
+        6.6: [0, 2, 0],
+        6.7: [0, 3, 5],
+        6.8: [0, 1, 2],
+        6.9: [0, 1, 3],
+        7.0: [0, 1, 0],
+        7.1: [0, 0, 1],
+        7.2: [0, 0, 3],
+        7.3: [0, 0, 1],
+        7.4: [0, 0, 1],
+        7.6: [0, 0, 1],
+        7.7: [0, 0, 4],
+        7.9: [0, 0, 1]
+    }
+    num_flowers = iris.shape[0]
+
+    # check results
+    assert contingency_table == expected_results
+    # check all flowers are included
+    assert table_flower_count == num_flowers
\ No newline at end of file

From 56eb8a4b7db09f8e94228e68d259b36a29ff66f5 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Mon, 13 Jun 2022 18:14:38 -0700
Subject: [PATCH 09/22] fix errors

---
 feature_engine/discretisation/chi_merge.py     | 18 +++++-------------
 .../test_chi_merge_discretiser.py              |  2 +-
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index 6b2c5a7f9..8401db268 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -124,7 +124,7 @@ def _create_contingency_table(self, X: pd.DataFrame, y: pd.Series, variable: str
 
         Parameters
         ----------
-        feature: pandas series = [n_samples, ]
+        X: pandas series = [n_samples, ]
             The data to discretised.
 
         y: pandas series = [n_samples, ]
@@ -143,7 +143,10 @@ def _create_contingency_table(self, X: pd.DataFrame, y: pd.Series, variable: str
 
         unique_values = sorted(set(X[variable]))
         unique_labels = list(set(y))
-        contingency_table = {l: [0] * len(unique_labels) for l in unique_values}
+        # stores frequency distribution for each unique value
+        contingency_table = {
+            val: np.array([0] * len(unique_labels)) for val in unique_values
+        }
 
         for value, label in zip(X[variable], y):
             contingency_table[value][label] += 1
@@ -168,17 +171,6 @@ def _calc_chi_square(self, array: np.array) -> float:
             Determines whether two sets of measurements are related.
         """
 
-        if not isinstance(array, np.array):
-            raise ValueError(
-                f"array must be a numpy array. Got {type(array)} instead."
-            )
-
-        if array.shape[0] != 2:
-            raise ValueError(
-                f"array must be comprised of 2 rows. Got "
-                f"{array.shape[0]} instead"
-            )
-
         shape = array.shape
         num_obs = float(array.sum())
         rows_sums = {}
diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py
index f51adc5b2..dd36ff065 100644
--- a/tests/test_discretisation/test_chi_merge_discretiser.py
+++ b/tests/test_discretisation/test_chi_merge_discretiser.py
@@ -76,5 +76,5 @@ def test_create_contingency_table():
 
     # check results
     assert contingency_table == expected_results
-    # check all flowers are included
+    # confirm all flowers are included
     assert table_flower_count == num_flowers
\ No newline at end of file

From 4809988ccf2bfda3fd9f46b47e3a098c4532cea3 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Fri, 17 Jun 2022 17:55:09 -0700
Subject: [PATCH 10/22] chi_merge_disc

---
 feature_engine/discretisation/chi_merge.py | 33 ++++++++++++++++++++--
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index 8401db268..dd72ccf9d 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -90,11 +90,14 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
         """
         # check input dataframe
         X = check_X(X)
-        _check_contains_na(X)
-        _check_contains_inf(X)
+        _check_contains_na(X, self.variables)
+        _check_contains_inf(X, self.variables)
 
         # find or check for numerical variables
-        self.variables = _find_or_check_numerical_variables(X, self.variables)
+        # self.variables = _find_or_check_numerical_variables(X, self.variables)
+
+        self.contingency_table_ = self._create_contingency_table(X, y, self.variables)
+        self.chi2_scores_dict_ = self._create_chi_square_scores_dict()
 
 
     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
@@ -199,5 +202,29 @@ def _calc_chi_square(self, array: np.array) -> float:
 
         return chi2
 
+    def _create_chi_square_scores_dict(self):
+        """
+        Calculate all chi-square scores for each sequential distribution of
+        the contingency table. The dictionary keys correspond to the
+        lower-bound of the interval.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        chi2_scores: dict
+            The chi-square score for each sequential distribution
+            
+        """
+        chi2_scores = {}
+        unique_values = list(self.contingency_table_.keys())
+        frequency_distributions = np.array(list(self.contingency_table_.values()))
+
+        for idx in range(2, len(unique_values) + 1):
+            chi2 = self._calc_chi_square(frequency_distributions[idx - 2: idx])
+            chi2_scores[unique_values[idx - 2]] = chi2
 
+        return chi2_scores
 

From 74faad65f9c7994a5b8ffbda4a7c84cf0853a13e Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Fri, 17 Jun 2022 19:29:31 -0700
Subject: [PATCH 11/22] reword _create_chi_square_scores_dict() docstring:
 "sequential" -> "adjacent"; drop whitespace-only line

---
 feature_engine/discretisation/chi_merge.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index dd72ccf9d..718884c0c 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -204,7 +204,7 @@ def _calc_chi_square(self, array: np.array) -> float:
 
     def _create_chi_square_scores_dict(self):
         """
-        Calculate all chi-square scores for each sequential distribution of
+        Calculate all chi-square scores for each adjacent distribution of
         the contingency table. The dictionary keys correspond to the
         lower-bound of the interval.
 
@@ -215,8 +215,8 @@ def _create_chi_square_scores_dict(self):
         Returns
         -------
         chi2_scores: dict
-            The chi-square score for each sequential distribution
-            
+            The chi-square scores for each adjacent distribution
+
         """
         chi2_scores = {}
         unique_values = list(self.contingency_table_.keys())

From 2c040fa234e1b5fc33653e0dee78d490f12028a9 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Fri, 17 Jun 2022 19:37:41 -0700
Subject: [PATCH 12/22] change _create_contingency_table() to
 _create_frequency_matrix(). method now returns a 2-d numpy array and 1-d
 numpy array instead of a dictionary.

---
 feature_engine/discretisation/chi_merge.py | 27 ++++++++++++----------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index 718884c0c..4ac87c120 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -2,6 +2,7 @@
 from typing import List, Optional, Union
 
 import numpy as np
+from numpy.typing import NDArray
 import pandas as pd
 from sklearn.utils.validation import check_is_fitted
 
@@ -96,7 +97,9 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
         # find or check for numerical variables
         # self.variables = _find_or_check_numerical_variables(X, self.variables)
 
-        self.contingency_table_ = self._create_contingency_table(X, y, self.variables)
+        self.frequency_matrix_intervals_, self.frequency_matrix_ = (
+            self._create_frequency_matrix(X, y, self.variables)
+        )
         self.chi2_scores_dict_ = self._create_chi_square_scores_dict()
 
 
@@ -120,8 +123,8 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         # check if X is a dataframe
         X = check_X(X)
 
-
-    def _create_contingency_table(self, X: pd.DataFrame, y: pd.Series, variable: str) -> dict:
+    # TODO: How to type hint 2 numpy arrays
+    def _create_frequency_matrix(self, X: pd.DataFrame, y: pd.Series, variable: str) -> [NDArray, NDArray]:
         """
         Generates a frequency table in which the labels organized into bins.
 
@@ -143,18 +146,18 @@ def _create_contingency_table(self, X: pd.DataFrame, y: pd.Series, variable: str
 
 
         """
-
-        unique_values = sorted(set(X[variable]))
-        unique_labels = list(set(y))
-        # stores frequency distribution for each unique value
-        contingency_table = {
-            val: np.array([0] * len(unique_labels)) for val in unique_values
-        }
+        frequency_matrix_intervals = np.sort(np.unique(X[variable]))
+        unique_class_values = np.sort(np.unique(y))
+        frequency_matrix = np.zeros(
+            (len(frequency_matrix_intervals), len(unique_class_values))
+        )
 
         for value, label in zip(X[variable], y):
-            contingency_table[value][label] += 1
+            row_idx = np.where(frequency_matrix_intervals == value)[0][0]
+            col_idx = np.where(unique_class_values == label)[0][0]
+            frequency_matrix[row_idx][col_idx] += 1
 
-        return contingency_table
+        return frequency_matrix_intervals, frequency_matrix
 
 
     def _calc_chi_square(self, array: np.array) -> float:

From 606f7096a017a0395da9cee81dac6035df762f0f Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Sat, 18 Jun 2022 13:10:21 -0700
Subject: [PATCH 13/22] delete _create_chi_square_scores_dict(). create
 _perform_chi_merge(). New method is incomplete. Issue with some of the
 chi-square calculations. It only happens w/ certain distributions

---
 feature_engine/discretisation/chi_merge.py | 46 +++++++++++++++-------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index 4ac87c120..fcdada119 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -100,7 +100,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
         self.frequency_matrix_intervals_, self.frequency_matrix_ = (
             self._create_frequency_matrix(X, y, self.variables)
         )
-        self.chi2_scores_dict_ = self._create_chi_square_scores_dict()
+        self._perform_chi_merge()
 
 
     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
@@ -205,11 +205,10 @@ def _calc_chi_square(self, array: np.array) -> float:
 
         return chi2
 
-    def _create_chi_square_scores_dict(self):
+    def _perform_chi_merge(self) -> None:
         """
-        Calculate all chi-square scores for each adjacent distribution of
-        the contingency table. The dictionary keys correspond to the
-        lower-bound of the interval.
+        Merge adjacent distributions until the the minimum chi-square is greater than
+        the threshold or the number of frequency-matrix intervals.
 
         Parameters
         ----------
@@ -217,17 +216,36 @@ def _create_chi_square_scores_dict(self):
 
         Returns
         -------
-        chi2_scores: dict
-            The chi-square scores for each adjacent distribution
+        None
 
         """
-        chi2_scores = {}
-        unique_values = list(self.contingency_table_.keys())
-        frequency_distributions = np.array(list(self.contingency_table_.values()))
 
-        for idx in range(2, len(unique_values) + 1):
-            chi2 = self._calc_chi_square(frequency_distributions[idx - 2: idx])
-            chi2_scores[unique_values[idx - 2]] = chi2
+        chi_test = {}
+
+        while self.frequency_matrix_.shape[0] > self.min_intervals:
+
+            chi_test = {}
+            shape = self.frequency_matrix_.shape
+
+            for row_idx in range(0, shape[0] - 1):
+                row_idx_2 = row_idx + 1
+                chi2 = self._calc_chi_square(
+                    self.frequency_matrix_[row_idx: row_idx_2 + 1]
+                )
+
+                if chi2 not in chi_test:
+                    chi_test[chi2] = []
+
+                chi_test[chi2].append((row_idx, row_idx_2))
+                smallest = min(chi_test.keys())
+                biggest = max(chi_test.keys())
 
-        return chi2_scores
+            if smallest < self.threshold:
+                for lower_bound, upper_bound in list(reversed(chi_test[smallest])):
+                    for col_idx in range(shape[1]):
+                        self.frequency_matrix_[lower_bound, col_idx] += self.frequency_matrix_[upper_bound, col_idx]
+                    self.frequency_matrix_ = np.delete(self.frequency_matrix_, upper_bound, 0)
+                    self.frequency_matrix_intervals_ = np.delete(self.frequency_matrix_intervals_, upper_bound, 0)
+            else:
+                break
 

From 50b7409f334baab819f642377bce4733cd9ab224 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Sat, 18 Jun 2022 18:41:21 -0700
Subject: [PATCH 14/22] update init()

---
 feature_engine/discretisation/chi_merge.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index fcdada119..083e34609 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -43,16 +43,16 @@ class ChiMergeDiscretiser(BaseDiscretiser):
     def __init__(
         self,
         variables: Union[None, int, str, List[Union[str, int]]] = None,
-        threshold: float = 0.9,
+        threshold: Union[float, int] = 1.4,
         min_intervals: int = 2,
         max_intervals: int = 10,
         return_object: bool = False,
         return_boundaries: bool = False,
     ) -> None:
-        # TODO: Add threshold must be >= 0
-        if not isinstance(threshold, float) or threshold >= 1:
+
+        if not isinstance(threshold, (int, float)) or threshold < 0:
             raise ValueError(
-                "threshold must be a float and less than one. "
+                "threshold must be a positive integer or a float. "
                 f"Got {threshold} instead."
             )
 

From dafb3694f4d74b301f2762400333295bc8e4e966 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Sat, 18 Jun 2022 18:42:29 -0700
Subject: [PATCH 15/22] update _perform_chi_merge()

---
 feature_engine/discretisation/chi_merge.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index 083e34609..10170d6da 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -100,7 +100,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
         self.frequency_matrix_intervals_, self.frequency_matrix_ = (
             self._create_frequency_matrix(X, y, self.variables)
         )
-        self._perform_chi_merge()
+        self.chi_test_ = self._perform_chi_merge()
 
 
     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
@@ -249,3 +249,4 @@ def _perform_chi_merge(self) -> None:
             else:
                 break
 
+        return chi_test
\ No newline at end of file

From 9b5ba12d4cad0e1b5753afef6a2ee675a9218d99 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Sat, 18 Jun 2022 18:52:45 -0700
Subject: [PATCH 16/22] create test_chi_merge(). need to fix error. the test is
 known not to pass. the first 2 and last 2 chi-square values do not match
 expected results. meanwhile, the other 9 chi-square values match. the cause
 of the discrepancy is unknown

---
 .../test_chi_merge_discretiser.py             | 30 +++++++++++++++++--
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py
index dd36ff065..bf149aca5 100644
--- a/tests/test_discretisation/test_chi_merge_discretiser.py
+++ b/tests/test_discretisation/test_chi_merge_discretiser.py
@@ -15,8 +15,8 @@
 
 def test_create_contingency_table():
     transformer = ChiMergeDiscretiser(
-        variables=["sepal_length", "sepal_width"],
-        threshold=0.8,
+        variables="sepal_length",
+        threshold=1.4,
         min_intervals=2,
         max_intervals=10,
         return_object=False,
@@ -77,4 +77,28 @@ def test_create_contingency_table():
     # check results
     assert contingency_table == expected_results
     # confirm all flowers are included
-    assert table_flower_count == num_flowers
\ No newline at end of file
+    assert table_flower_count == num_flowers
+
+
+def test_chi_merge():
+
+    transformer = ChiMergeDiscretiser(
+        variables="sepal_length",
+        threshold=1.4,
+        min_intervals=2,
+        max_intervals=10,
+        return_object=False,
+        return_boundaries=False,
+    )
+
+    transformer.fit(
+        iris[["sepal_length", "sepal_width", "petal_length"]], iris["flower"]
+    )
+
+    chi_test = transformer._perform_chi_merge()
+    chi_scores_round = pd.Series(transformer.chi_test_.keys()).round(1)
+    expected_results = pd.Series(
+        [4.1, 2.4, 8.6, 2.9, 1.7, 1.8, 2.2, 4.8, 4.1, 3.2, 1.5, 3.6]
+    )
+
+    assert chi_scores_round == expected_results
\ No newline at end of file

From 71748e2579dc8f0cc42646adbed3ea34cfb4c8f0 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Sun, 19 Jun 2022 08:59:07 -0700
Subject: [PATCH 17/22] update _perform_chi_merge()

---
 feature_engine/discretisation/chi_merge.py              | 9 +++++++--
 tests/test_discretisation/test_chi_merge_discretiser.py | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index 10170d6da..ca43681b2 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -237,13 +237,18 @@ def _perform_chi_merge(self) -> None:
                     chi_test[chi2] = []
 
                 chi_test[chi2].append((row_idx, row_idx_2))
-                smallest = min(chi_test.keys())
-                biggest = max(chi_test.keys())
+            smallest = min(chi_test.keys())
+            biggest = max(chi_test.keys())
 
             if smallest < self.threshold:
+
+                # reversee list allows code to remove the upperbound as it is updating the frequency matrix
                 for lower_bound, upper_bound in list(reversed(chi_test[smallest])):
                     for col_idx in range(shape[1]):
+                        # merge upperbound distribution into lowerbound distribution
                         self.frequency_matrix_[lower_bound, col_idx] += self.frequency_matrix_[upper_bound, col_idx]
+
+                    # delete upperbound and its distribution from the frequeny matrix
                     self.frequency_matrix_ = np.delete(self.frequency_matrix_, upper_bound, 0)
                     self.frequency_matrix_intervals_ = np.delete(self.frequency_matrix_intervals_, upper_bound, 0)
             else:
diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py
index bf149aca5..8b1c0f965 100644
--- a/tests/test_discretisation/test_chi_merge_discretiser.py
+++ b/tests/test_discretisation/test_chi_merge_discretiser.py
@@ -96,9 +96,9 @@ def test_chi_merge():
     )
 
     chi_test = transformer._perform_chi_merge()
-    chi_scores_round = pd.Series(transformer.chi_test_.keys()).round(1)
+    chi_scores_round = pd.Series(chi_test.keys()).round(1)
     expected_results = pd.Series(
         [4.1, 2.4, 8.6, 2.9, 1.7, 1.8, 2.2, 4.8, 4.1, 3.2, 1.5, 3.6]
     )
 
-    assert chi_scores_round == expected_results
\ No newline at end of file
+    assert (chi_scores_round == expected_results).all()
\ No newline at end of file

From 64ba595842c9afb6f55344433c07e880fa8e405a Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Mon, 27 Jun 2022 17:33:53 -0700
Subject: [PATCH 18/22] add/revise comments

---
 feature_engine/discretisation/chi_merge.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index ca43681b2..f694e06ec 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -208,7 +208,8 @@ def _calc_chi_square(self, array: np.array) -> float:
     def _perform_chi_merge(self) -> None:
         """
         Merge adjacent distributions until the the minimum chi-square is greater than
-        the threshold or the number of frequency-matrix intervals.
+        the threshold or the number of frequency-matrix intervals is equal to the
+        limit of the minimum number of intervals.
 
         Parameters
         ----------
@@ -220,8 +221,6 @@ def _perform_chi_merge(self) -> None:
 
         """
 
-        chi_test = {}
-
         while self.frequency_matrix_.shape[0] > self.min_intervals:
 
             chi_test = {}
@@ -237,20 +236,24 @@ def _perform_chi_merge(self) -> None:
                     chi_test[chi2] = []
 
                 chi_test[chi2].append((row_idx, row_idx_2))
-            smallest = min(chi_test.keys())
-            biggest = max(chi_test.keys())
 
-            if smallest < self.threshold:
+            # use variable to merge the frequency-matrix intervals that
+            # have the lowest confidence that the frequency distributions are different
+            min_chi_score = min(chi_test.keys())
+
+            if min_chi_score < self.threshold:
 
-                # reversee list allows code to remove the upperbound as it is updating the frequency matrix
-                for lower_bound, upper_bound in list(reversed(chi_test[smallest])):
+                # iterate in reverse so that deleting an upper-bound row does not shift the indices of the pairs still to be merged
+                for lower_bound, upper_bound in list(reversed(chi_test[min_chi_score])):
                     for col_idx in range(shape[1]):
-                        # merge upperbound distribution into lowerbound distribution
+                        # merge upper-bound distribution into lower-bound distribution
                         self.frequency_matrix_[lower_bound, col_idx] += self.frequency_matrix_[upper_bound, col_idx]
 
                     # delete upperbound and its distribution from the frequeny matrix
                     self.frequency_matrix_ = np.delete(self.frequency_matrix_, upper_bound, 0)
                     self.frequency_matrix_intervals_ = np.delete(self.frequency_matrix_intervals_, upper_bound, 0)
+
+            # stop merge when minimum chi-score is greater than or equal to the threshold
             else:
                 break
 

From 1645e8dfeda3cc4688cdce736984c0259d569592 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Mon, 27 Jun 2022 17:38:14 -0700
Subject: [PATCH 19/22] fix nomenclature

---
 tests/test_discretisation/test_chi_merge_discretiser.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py
index 8b1c0f965..a9df144cd 100644
--- a/tests/test_discretisation/test_chi_merge_discretiser.py
+++ b/tests/test_discretisation/test_chi_merge_discretiser.py
@@ -13,7 +13,7 @@
 iris["flower"] = datasets.load_iris().target
 
 
-def test_create_contingency_table():
+def test_create_frequency_matrix():
     transformer = ChiMergeDiscretiser(
         variables="sepal_length",
         threshold=1.4,
@@ -23,7 +23,7 @@ def test_create_contingency_table():
         return_boundaries=False,
     )
 
-    contingency_table = transformer._create_contingency_table(
+    frequency_matrix = transformer._create_frequency_matrix(
         X=iris[["sepal_length", "sepal_width", "petal_length"]],
         y=iris["flower"],
         variable="sepal_length"
@@ -31,7 +31,7 @@ def test_create_contingency_table():
 
     # number of flowers included in contingency table
     table_flower_count = 0
-    for count_arr in contingency_table.values():
+    for count_arr in frequency_matrix.values():
         table_flower_count += sum(count_arr)
 
     # expected results
@@ -75,7 +75,7 @@ def test_create_contingency_table():
     num_flowers = iris.shape[0]
 
     # check results
-    assert contingency_table == expected_results
+    assert frequency_matrix == expected_results
     # confirm all flowers are included
     assert table_flower_count == num_flowers
 

From 9a6c31d90affbeef9e118e6412f0ec0eab741e11 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Mon, 27 Jun 2022 18:07:34 -0700
Subject: [PATCH 20/22] edit test_chi_merge()

---
 .../test_chi_merge_discretiser.py             | 37 ++++++++++++-------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py
index a9df144cd..7b8cb3fc6 100644
--- a/tests/test_discretisation/test_chi_merge_discretiser.py
+++ b/tests/test_discretisation/test_chi_merge_discretiser.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 import pytest
 from sklearn import datasets
@@ -28,14 +29,15 @@ def test_create_frequency_matrix():
         y=iris["flower"],
         variable="sepal_length"
     )
+    lengths = list(frequency_matrix[0])
+    distributions = list(frequency_matrix[1])
+    freq_matrix_dict = {l: list(d) for l, d in zip(lengths, distributions)}
 
-    # number of flowers included in contingency table
-    table_flower_count = 0
-    for count_arr in frequency_matrix.values():
-        table_flower_count += sum(count_arr)
+    # number of flowers accounted for in frequency matrix
+    num_flowers = np.sum(distributions)
 
     # expected results
-    expected_results = {
+    expected_frequency_matrix = {
         4.3: [1, 0, 0],
         4.4: [3, 0, 0],
         4.5: [1, 0, 0],
@@ -72,16 +74,16 @@ def test_create_frequency_matrix():
         7.7: [0, 0, 4],
         7.9: [0, 0, 1]
     }
-    num_flowers = iris.shape[0]
+    expected_num_flowers = iris.shape[0]
 
     # check results
-    assert frequency_matrix == expected_results
+    assert freq_matrix_dict == expected_frequency_matrix
     # confirm all flowers are included
-    assert table_flower_count == num_flowers
+    assert num_flowers == expected_num_flowers
 
 
 def test_chi_merge():
-
+    # test 1 - threshold is 0.5 significance level
     transformer = ChiMergeDiscretiser(
         variables="sepal_length",
         threshold=1.4,
@@ -95,10 +97,19 @@ def test_chi_merge():
         iris[["sepal_length", "sepal_width", "petal_length"]], iris["flower"]
     )
 
-    chi_test = transformer._perform_chi_merge()
-    chi_scores_round = pd.Series(chi_test.keys()).round(1)
-    expected_results = pd.Series(
+    chi_scores = transformer.chi_test_.keys()
+    chi_scores_round = pd.Series(chi_scores).round(1)
+
+    frequency_matrix_intervals = list(transformer.frequency_matrix_intervals_)
+
+    # expected results
+    expected_chi_scores = pd.Series(
         [4.1, 2.4, 8.6, 2.9, 1.7, 1.8, 2.2, 4.8, 4.1, 3.2, 1.5, 3.6]
     )
+    expected_frequency_matrix_intervals = [
+        4.3, 4.9, 5.0, 5.5, 5.6, 5.7, 5.8, 5.9, 6.3, 6.6, 6.7, 7.0, 7.1
+    ]
 
-    assert (chi_scores_round == expected_results).all()
\ No newline at end of file
+    # tests - 0.5 significance level
+    assert frequency_matrix_intervals == expected_frequency_matrix_intervals
+    assert (chi_scores_round == expected_chi_scores).all()

From b8c83bcbdd6157cbfef6cfbc9d5a7bcae2d2b891 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Tue, 28 Jun 2022 17:41:24 -0700
Subject: [PATCH 21/22] expand test_chi_merge(). test still fails.

---
 .../test_chi_merge_discretiser.py             | 29 ++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/tests/test_discretisation/test_chi_merge_discretiser.py b/tests/test_discretisation/test_chi_merge_discretiser.py
index 7b8cb3fc6..6d5c4a1a2 100644
--- a/tests/test_discretisation/test_chi_merge_discretiser.py
+++ b/tests/test_discretisation/test_chi_merge_discretiser.py
@@ -83,7 +83,7 @@ def test_create_frequency_matrix():
 
 
 def test_chi_merge():
-    # test 1 - threshold is 0.5 significance level
+    # Test 1 - threshold is 0.5 significance level
     transformer = ChiMergeDiscretiser(
         variables="sepal_length",
         threshold=1.4,
@@ -113,3 +113,30 @@ def test_chi_merge():
     # tests - 0.5 significance level
     assert frequency_matrix_intervals == expected_frequency_matrix_intervals
     assert (chi_scores_round == expected_chi_scores).all()
+
+    # Test 2 - threshold is 0.9 significance level
+    transformer = ChiMergeDiscretiser(
+        variables="sepal_length",
+        threshold=4.6,
+        min_intervals=2,
+        max_intervals=10,
+        return_object=False,
+        return_boundaries=False,
+    )
+
+    transformer.fit(
+        iris[["sepal_length", "sepal_width", "petal_length"]], iris["flower"]
+    )
+
+    chi_scores = transformer.chi_test_.keys()
+    chi_scores_round = pd.Series(chi_scores).round(1)
+
+    frequency_matrix_intervals = list(transformer.frequency_matrix_intervals_)
+
+    # expected results
+    expected_chi_scores = pd.Series([30.9, 6.7, 4.9, 5.9])
+    expected_frequency_matrix_intervals = [4.3, 5.5, 5.8, 6.3, 7.1]
+
+    # tests - 0.9 significance level
+    assert frequency_matrix_intervals == expected_frequency_matrix_intervals
+    assert (chi_scores_round == expected_chi_scores).all()
\ No newline at end of file

From fac0b324166b67caab156ce0e4bb6a33f1839d10 Mon Sep 17 00:00:00 2001
From: Morgan-Sell <morganpsell@gmail.com>
Date: Tue, 28 Jun 2022 18:07:35 -0700
Subject: [PATCH 22/22] ran isort and expand docstring

---
 feature_engine/discretisation/chi_merge.py | 65 +++++++++++++++++++---
 1 file changed, 57 insertions(+), 8 deletions(-)

diff --git a/feature_engine/discretisation/chi_merge.py b/feature_engine/discretisation/chi_merge.py
index f694e06ec..6ef5b2d69 100644
--- a/feature_engine/discretisation/chi_merge.py
+++ b/feature_engine/discretisation/chi_merge.py
@@ -1,19 +1,21 @@
 
-from typing import List, Optional, Union
+from typing import List, Union
 
 import numpy as np
-from numpy.typing import NDArray
 import pandas as pd
+from numpy.typing import NDArray
 from sklearn.utils.validation import check_is_fitted
 
-from feature_engine.discretisation.base_discretiser import BaseDiscretiser
-from feature_engine._docstrings.methods import _fit_transform_docstring
+from feature_engine._docstrings.class_inputs import (
+    _variables_numerical_docstring,
+    _drop_original_docstring,
+)
 from feature_engine._docstrings.fit_attributes import (
-    _variables_attribute_docstring,
     _feature_names_in_docstring,
     _n_features_in_docstring,
+    _variables_attribute_docstring,
 )
-from feature_engine._docstrings.class_inputs import _variables_numerical_docstring
+from feature_engine._docstrings.methods import _fit_transform_docstring
 from feature_engine._docstrings.substitute import Substitution
 from feature_engine.dataframe_checks import (
     _check_contains_inf,
@@ -21,13 +23,22 @@
     _check_X_matches_training_df,
     check_X,
 )
+from feature_engine.discretisation.base_discretiser import BaseDiscretiser
 from feature_engine.variable_manipulation import (
     _check_input_parameter_variables,
     _find_or_check_numerical_variables,
 )
 
-
-
+@Substitution(
+    variables=_variables_numerical_docstring,
+    drop_original=_drop_original_docstring,
+    fit_transform=_fit_transform_docstring,
+    return_objects=BaseDiscretiser._return_object_docstring,
+    return_boundaries=BaseDiscretiser._return_boundaries_docstring,
+    binner_dict_=BaseDiscretiser._binner_dict_docstring,
+    fit=BaseDiscretiser._fit_docstring,
+    transform=BaseDiscretiser._transform_docstring
+)
 class ChiMergeDiscretiser(BaseDiscretiser):
     """"
 
@@ -36,6 +47,44 @@ class ChiMergeDiscretiser(BaseDiscretiser):
     for the categorical variable.
 
 
+    Parameters
+    ----------
+    {variables}
+
+    threshold: float or int, default=1.4
+        The transformer will merge the frequency distributions until
+        all chi-scores are greater than the threshold.
+
+    min_intervals: int, default=2
+        An additional constraint for the transformer. The transformer
+        stops merging the distributions once the number of frequency matrix
+        intervals equals the min_intervals.
+
+    max_intervals: int, default=10
+        # TODO: Does not exist. Do we need this param?
+
+    {drop_original}
+
+
+    Attributes
+    ----------
+    frequency_matrix_intervals_:
+        The variable values that are used as the upper- and lower-bounds
+        of the frequency matrix.
+
+    frequency_matrix_:
+        The frequency distributions for every interval.
+
+    chi_test_:
+        The chi-scores for all adjacent frequency distributions.
+
+    {binner_dict_}
+
+    {
+
+    Methods
+    -------
+    {fit}