Commit 315e34a

Add new row synthesis single table metric (#226)

* Add synthetic uniqueness single table metric and tests
* Add warning for edge case
* Update metric name
* Update implementation
* Add input validation
* Fix unit test
* Fix edge cases in new row synthesis query
* Update query logic
* Update unit test

1 parent 1e39044 commit 315e34a

6 files changed: +276 -0 lines changed

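Before the per-file diffs, a quick usage sketch of the metric this commit introduces. The DataFrames, column names, and values below are invented for illustration; only the import path and the compute signature come from the diff:

import pandas as pd

from sdmetrics.single_table import NewRowSynthesis

# Invented example data: the first synthetic row copies a real row exactly.
real_data = pd.DataFrame({
    'age': [21, 35, 44],
    'city': ['Lyon', 'Oslo', 'Kyiv'],
})
synthetic_data = pd.DataFrame({
    'age': [21, 29, 50],
    'city': ['Lyon', 'Oslo', 'Rome'],
})
metadata = {
    'fields': {
        'age': {'type': 'numerical', 'subtype': 'int'},
        'city': {'type': 'categorical'},
    },
}

# 2 of the 3 synthetic rows match no real row, so the score is 2/3.
score = NewRowSynthesis.compute(real_data, synthetic_data, metadata)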

sdmetrics/multi_table/multi_single_table.py

Lines changed: 6 additions & 0 deletions
@@ -241,6 +241,12 @@ class BNLikelihood(MultiSingleTableMetric):
     single_table_metric = single_table.bayesian_network.BNLikelihood
 
 
+class NewRowSynthesis(MultiSingleTableMetric):
+    """MultiSingleTableMetric based on SingleTable NewRowSynthesis."""
+
+    single_table_metric = single_table.new_row_synthesis.NewRowSynthesis
+
+
 class BNLogLikelihood(MultiSingleTableMetric):
     """MultiSingleTableMetric based on SingleTable BNLogLikelihood."""
 
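
Since this wrapper only points MultiSingleTableMetric at the single-table class, usage mirrors the other multi-table metrics: the single-table score is computed per table and the results are aggregated. A sketch under that assumption, with table names and data invented; the metadata shape with a top-level 'tables' key follows the convention of the other multi-table metrics:

import pandas as pd

from sdmetrics.multi_table.multi_single_table import NewRowSynthesis

# Invented two-table datasets keyed by table name.
real_data = {
    'users': pd.DataFrame({'plan': ['free', 'pro'], 'age': [30, 41]}),
    'logins': pd.DataFrame({'count': [3, 7], 'device': ['ios', 'web']}),
}
synthetic_data = {
    'users': pd.DataFrame({'plan': ['pro', 'free'], 'age': [25, 52]}),
    'logins': pd.DataFrame({'count': [5, 2], 'device': ['web', 'ios']}),
}
metadata = {
    'tables': {
        'users': {'fields': {'plan': {'type': 'categorical'},
                             'age': {'type': 'numerical', 'subtype': 'int'}}},
        'logins': {'fields': {'count': {'type': 'numerical', 'subtype': 'int'},
                              'device': {'type': 'categorical'}}},
    },
}

# Aggregates the per-table single-table scores.
score = NewRowSynthesis.compute(real_data, synthetic_data, metadata)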

sdmetrics/reports/single_table/plot_utils.py

Lines changed: 5 additions & 0 deletions
@@ -85,10 +85,15 @@ def _get_similarity_correlation_matrix(score_breakdowns, columns):
     Args:
         score_breakdowns (dict):
             Mapping of metric to the score breakdown result.
+        columns (list[string] or set[string]):
+            A list or set of column names.
 
     Returns:
         pandas.DataFrame
     """
+    if isinstance(columns, set):
+        columns = list(columns)
+
     similarity_correlation = pd.DataFrame(
         index=columns,
         columns=columns,
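
The new set-to-list guard exists because pandas refuses an unordered set as an index; a minimal illustration (column names invented):

import pandas as pd

columns = {'age', 'salary'}

# pd.DataFrame(index=columns) raises "TypeError: Set type is unordered",
# so the guard converts the set to a list before building the matrix.
matrix = pd.DataFrame(index=list(columns), columns=list(columns))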

sdmetrics/single_table/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,7 @@
 from sdmetrics.single_table.multi_single_column import (
     BoundaryAdherence, CategoryCoverage, CSTest, KSComplement, MissingValueSimilarity,
     MultiSingleColumnMetric, RangeCoverage, StatisticSimilarity, TVComplement)
+from sdmetrics.single_table.new_row_synthesis import NewRowSynthesis
 from sdmetrics.single_table.privacy.base import CategoricalPrivacyMetric, NumericalPrivacyMetric
 from sdmetrics.single_table.privacy.cap import (
     CategoricalCAP, CategoricalGeneralizedCAP, CategoricalZeroCAP)
@@ -88,4 +89,5 @@
     'StatisticSimilarity',
     'TVComplement',
     'RangeCoverage',
+    'NewRowSynthesis',
 ]
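
With this export in place, both import paths resolve to the same class; a quick check (the Direct alias is ours, just for the comparison):

from sdmetrics.single_table import NewRowSynthesis
from sdmetrics.single_table.new_row_synthesis import NewRowSynthesis as Direct

assert NewRowSynthesis is Direct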

sdmetrics/single_table/base.py

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,6 @@
 """Base Single Table metric class."""
 
+import copy
 from operator import attrgetter
 
 import pandas as pd
@@ -103,6 +104,11 @@ def _validate_inputs(cls, real_data, synthetic_data, metadata=None):
             (pandas.DataFrame, pandas.DataFrame, dict):
                 The validated data and metadata.
         """
+        real_data = real_data.copy()
+        synthetic_data = synthetic_data.copy()
+        if metadata is not None:
+            metadata = copy.deepcopy(metadata)
+
         if set(real_data.columns) != set(synthetic_data.columns):
             raise ValueError('`real_data` and `synthetic_data` must have the same columns')
 
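
The defensive copies matter because NewRowSynthesis.compute mutates its inputs, overwriting datetime columns via pd.to_numeric; without the copy, that conversion would leak into the caller's DataFrame. A small sketch of the aliasing problem (data invented):

import pandas as pd

caller_df = pd.DataFrame({'signup': pd.to_datetime(['2020-01-02', '2021-01-04'])})

# Sharing the object: an in-place conversion changes the caller's column.
shared = caller_df
shared['signup'] = pd.to_numeric(shared['signup'])
assert caller_df['signup'].dtype == 'int64'  # the caller's data silently changed

# Copying first, as _validate_inputs now does, keeps the caller's frame intact.
caller_df = pd.DataFrame({'signup': pd.to_datetime(['2020-01-02', '2021-01-04'])})
isolated = caller_df.copy()
isolated['signup'] = pd.to_numeric(isolated['signup'])
assert caller_df['signup'].dtype == 'datetime64[ns]'  # unchanged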

sdmetrics/single_table/new_row_synthesis.py

Lines changed: 123 additions & 0 deletions

@@ -0,0 +1,123 @@
+"""New Row Synthesis metric for single table."""
+import warnings
+
+import pandas as pd
+
+from sdmetrics.goal import Goal
+from sdmetrics.single_table.base import SingleTableMetric
+
+
+class NewRowSynthesis(SingleTableMetric):
+    """NewRowSynthesis Single Table metric.
+
+    This metric measures whether each row in the synthetic data is new,
+    or whether it exactly matches a row in the real data.
+
+    Attributes:
+        name (str):
+            Name to use when reports about this metric are printed.
+        goal (sdmetrics.goal.Goal):
+            The goal of this metric.
+        min_value (Union[float, tuple[float]]):
+            Minimum value or values that this metric can take.
+        max_value (Union[float, tuple[float]]):
+            Maximum value or values that this metric can take.
+    """
+
+    name = 'NewRowSynthesis'
+    goal = Goal.MAXIMIZE
+    min_value = 0
+    max_value = 1
+
+    @classmethod
+    def compute(cls, real_data, synthetic_data, metadata=None, numerical_match_tolerance=0.01,
+                synthetic_sample_size=None):
+        """Compute this metric.
+
+        This metric looks for matches between the real and synthetic data for
+        the compatible columns. This metric also looks for matches in missing values.
+
+        Args:
+            real_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the real dataset.
+            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
+                The values from the synthetic dataset.
+            metadata (dict):
+                Table metadata dict.
+            numerical_match_tolerance (float):
+                A float larger than 0 representing how close two numerical values have to be
+                in order to be considered a match. Defaults to `0.01`.
+            synthetic_sample_size (int):
+                The number of synthetic rows to sample before computing this metric.
+                Use this to speed up the computation time if you have a large amount
+                of synthetic data. Note that the final score may not be as precise if
+                your sample size is low. Defaults to ``None``, which does not sample,
+                and uses all of the provided rows.
+
+        Returns:
+            float:
+                The new row synthesis score.
+        """
+        real_data, synthetic_data, metadata = cls._validate_inputs(
+            real_data, synthetic_data, metadata)
+
+        if synthetic_sample_size is not None:
+            if synthetic_sample_size > len(synthetic_data):
+                warnings.warn(f'The provided `synthetic_sample_size` of {synthetic_sample_size} '
+                              'is larger than the number of synthetic data rows '
+                              f'({len(synthetic_data)}). Proceeding without sampling.')
+            else:
+                synthetic_data = synthetic_data.sample(n=synthetic_sample_size)
+
+        numerical_fields = []
+        discrete_fields = []
+        for field, field_meta in metadata['fields'].items():
+            if field_meta['type'] == 'datetime':
+                real_data[field] = pd.to_numeric(real_data[field])
+                synthetic_data[field] = pd.to_numeric(synthetic_data[field])
+                numerical_fields.append(field)
+            elif field_meta['type'] == 'numerical':
+                numerical_fields.append(field)
+            else:
+                discrete_fields.append(field)
+
+        num_unique_rows = 0
+        for index, row in synthetic_data.iterrows():
+            row_filter = []
+            for field in real_data.columns:
+                if pd.isna(row[field]):
+                    field_filter = f'{field}.isnull()'
+                elif field in numerical_fields:
+                    field_filter = (
+                        f'abs({field} - {row[field]}) <= '
+                        f'{abs(numerical_match_tolerance * row[field])}'
+                    )
+                else:
+                    if real_data[field].dtype == 'O':
+                        field_filter = f"{field} == '{row[field]}'"
+                    else:
+                        field_filter = f'{field} == {row[field]}'
+
+                row_filter.append(field_filter)
+
+            matches = real_data.query(' and '.join(row_filter))
+            if matches is None or matches.empty:
+                num_unique_rows += 1
+
+        return num_unique_rows / len(synthetic_data)
+
+    @classmethod
+    def normalize(cls, raw_score):
+        """Normalize the raw score.
+
+        Notice that this simply delegates to the base ``SingleTableMetric``.
+
+        Args:
+            raw_score (float):
+                The value of the metric from `compute`.
+
+        Returns:
+            float:
+                The normalized value of the metric.
+        """
+        return super().normalize(raw_score)
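
To make the matching concrete: for each synthetic row, compute assembles one filter string per column and counts the row as new only when real_data.query finds no match. A worked sketch of the filter for an invented row, using the default numerical_match_tolerance of 0.01:

import pandas as pd

real_data = pd.DataFrame({'age': [35, 44], 'city': ['Oslo', 'Kyiv']})

# Filters compute would build for the synthetic row (age=35, city='Oslo'):
row_filter = [
    'abs(age - 35) <= 0.35',  # numerical column: within 1% of the synthetic value
    "city == 'Oslo'",         # object column: exact, quoted match
]
matches = real_data.query(' and '.join(row_filter))

assert not matches.empty  # this row matches a real row, so it is not counted as new
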
tests/unit/single_table/test_new_row_synthesis.py

Lines changed: 134 additions & 0 deletions

@@ -0,0 +1,134 @@
+from unittest.mock import patch
+
+import numpy as np
+import pandas as pd
+
+from sdmetrics.single_table import NewRowSynthesis
+
+
+class TestNewRowSynthesis:
+
+    def test_compute(self):
+        """Test the ``compute`` method and expect that the new row synthesis score is returned."""
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': [0, 1, 2, 3, 4],
+            'col2': [1, 2, 1, 3, 4],
+            'col3': ['a', 'b', 'c', 'd', 'b'],
+            'col4': [1.32, np.nan, 1.43, np.nan, 2.0],
+            'col5': [51, 52, 53, 54, 55],
+            'col6': ['2020-01-02', '2021-01-04', '2021-05-03', '2022-10-11', '2022-11-13'],
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': [0, 1, 2, 3, 4],
+            'col2': [1, 3, 4, 2, 2],
+            'col3': ['a', 'b', 'c', 'b', 'e'],
+            'col4': [1.32, 1.56, 1.21, np.nan, 1.90],
+            'col5': [51, 51, 54, 55, 53],
+            'col6': ['2020-01-02', '2022-11-24', '2022-06-01', '2021-04-12', '2020-12-11'],
+        })
+        metadata = {
+            'fields': {
+                'col1': {'type': 'id', 'subtype': 'int'},
+                'col2': {'type': 'numerical', 'subtype': 'int'},
+                'col3': {'type': 'categorical'},
+                'col4': {'type': 'numerical', 'subtype': 'float'},
+                'col5': {'type': 'categorical'},
+                'col6': {'type': 'datetime', 'format': '%Y-%m-%d'},
+            },
+        }
+        metric = NewRowSynthesis()
+
+        # Run
+        score = metric.compute(real_data, synthetic_data, metadata)
+
+        # Assert
+        assert score == 0.8
+
+    def test_compute_with_sample_size(self):
+        """Test the ``compute`` method with a sample size.
+
+        Expect that the new row synthesis score is returned.
+        """
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': [1, 2, 1, 3, 4],
+            'col2': ['a', 'b', 'c', 'd', 'b'],
+            'col3': [1.32, np.nan, 1.43, np.nan, 2.0],
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': [1, 3, 4, 2, 2],
+            'col2': ['a', 'b', 'c', 'd', 'e'],
+            'col3': [1.46, 1.56, 1.21, np.nan, 1.92],
+        })
+        metadata = {
+            'fields': {
+                'col1': {'type': 'numerical', 'subtype': 'int'},
+                'col2': {'type': 'categorical'},
+                'col3': {'type': 'numerical', 'subtype': 'float'},
+            },
+        }
+        sample_size = 2
+        metric = NewRowSynthesis()
+
+        # Run
+        score = metric.compute(
+            real_data, synthetic_data, metadata, synthetic_sample_size=sample_size)
+
+        # Assert
+        assert score == 1
+
+    @patch('sdmetrics.single_table.new_row_synthesis.warnings')
+    def test_compute_with_sample_size_too_large(self, warnings_mock):
+        """Test the ``compute`` method with a sample size larger than the number of rows.
+
+        Expect that the new row synthesis score is returned. Expect a warning to be raised.
+        """
+        # Setup
+        real_data = pd.DataFrame({
+            'col1': [1, 2, 1, 3, 4],
+            'col2': ['a', 'b', 'c', 'd', 'b'],
+            'col3': [1.32, np.nan, 1.43, np.nan, 2.0],
+        })
+        synthetic_data = pd.DataFrame({
+            'col1': [1, 3, 4, 2, 2],
+            'col2': ['a', 'b', 'c', 'd', 'e'],
+            'col3': [1.35, 1.56, 1.21, np.nan, 1.92],
+        })
+        metadata = {
+            'fields': {
+                'col1': {'type': 'numerical', 'subtype': 'int'},
+                'col2': {'type': 'categorical'},
+                'col3': {'type': 'numerical', 'subtype': 'float'},
+            },
+        }
+        sample_size = 15
+        metric = NewRowSynthesis()
+
+        # Run
+        score = metric.compute(
+            real_data, synthetic_data, metadata, synthetic_sample_size=sample_size)
+
+        # Assert
+        assert score == 1
+        warnings_mock.warn.assert_called_once_with(
+            'The provided `synthetic_sample_size` of 15 is larger than the number of '
+            'synthetic data rows (5). Proceeding without sampling.'
+        )
+
+    @patch('sdmetrics.single_table.new_row_synthesis.SingleTableMetric.normalize')
+    def test_normalize(self, normalize_mock):
+        """Test the ``normalize`` method.
+
+        Expect that the inherited ``normalize`` method is called.
+        """
+        # Setup
+        metric = NewRowSynthesis()
+        raw_score = 0.9
+
+        # Run
+        result = metric.normalize(raw_score)
+
+        # Assert
+        normalize_mock.assert_called_once_with(raw_score)
+        assert result == normalize_mock.return_value
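
For reference on the expected values: in test_compute, only the first synthetic row (0, 1, 'a', 1.32, 51, '2020-01-02') exactly reproduces a real row, so 4 of the 5 synthetic rows are new and the score is 4 / 5 = 0.8. In both sampling tests, no synthetic row falls within the 1% numerical tolerance of any real row, so every sampled row is new and the score is 1 regardless of which rows the sampler draws.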
