Merge pull request #36 from capitalone/null-dupes
Closes #35 by fixing dedupe bug
theianrobertson authored Jan 23, 2019
2 parents 46748b8 + 45884d0 commit 246aad8
Showing 4 changed files with 148 additions and 19 deletions.
2 changes: 1 addition & 1 deletion datacompy/_version.py
@@ -14,4 +14,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.5.1"
__version__ = "0.5.2"
44 changes: 34 additions & 10 deletions datacompy/core.py
@@ -24,6 +24,7 @@

import os
import logging
+from datetime import datetime
import pandas as pd
import numpy as np

@@ -219,16 +220,8 @@ def _dataframe_merge(self, ignore_spaces):

        # Create order column for uniqueness of match
        order_column = temp_column_name(self.df1, self.df2)
-        self.df1[order_column] = (
-            self.df1.sort_values(by=list(self.df1.columns))
-            .groupby(temp_join_columns)
-            .cumcount()
-        )
-        self.df2[order_column] = (
-            self.df2.sort_values(by=list(self.df2.columns))
-            .groupby(temp_join_columns)
-            .cumcount()
-        )
+        self.df1[order_column] = generate_id_within_group(self.df1, temp_join_columns)
+        self.df2[order_column] = generate_id_within_group(self.df2, temp_join_columns)
        temp_join_columns.append(order_column)

        params = {"on": temp_join_columns}
@@ -761,3 +754,34 @@ def calculate_max_diff(col_1, col_2):
        return (col_1.astype(float) - col_2.astype(float)).abs().max()
    except:
        return 0


def generate_id_within_group(dataframe, join_columns):
    """Generate an ID column that can be used to deduplicate identical rows.  The series generated
    is the order within a unique group, and it handles nulls.

    Parameters
    ----------
    dataframe : Pandas.DataFrame
        The dataframe to operate on
    join_columns : list
        List of strings which are the join columns

    Returns
    -------
    Pandas.Series
        The ID column that's unique in each group.
    """
    default_value = "DATACOMPY_NULL"
    if dataframe[join_columns].isnull().any().any():
        if (dataframe[join_columns] == default_value).any().any():
            raise ValueError("{} was found in your join columns".format(default_value))
        return (
            dataframe[join_columns]
            .astype(str)
            .fillna(default_value)
            .groupby(join_columns)
            .cumcount()
        )
    else:
        return dataframe[join_columns].groupby(join_columns).cumcount()
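
As an aside (not part of the commit), a quick illustration of why the string conversion matters: pandas groupby drops null keys by default, so rows with nulls in the join columns would get no usable ID, while string-converted nulls form a real group that cumcount can number. The column names here are made up.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"fld_1": [1.0, np.nan, np.nan], "fld_2": ["a", "b", "b"]})
    # NaN becomes the string "nan", so every row lands in a real group and
    # cumcount() numbers the duplicates within it.
    print(df.astype(str).groupby(["fld_1", "fld_2"]).cumcount().tolist())
    # -> [0, 0, 1]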
64 changes: 62 additions & 2 deletions sphinx/pandas_usage.rst
@@ -36,7 +36,7 @@ acct_id dollar_amt name float_fld
Set up like:

.. code-block:: python

    from io import StringIO
    import pandas as pd
    import datacompy
@@ -82,7 +82,7 @@ join column(s) or by index.
    compare = datacompy.Compare(df1, df2, join_columns=['acct_id', 'name'])
    # OR
    compare = datacompy.Compare(df1, df2, on_index=True)
@@ -196,6 +196,66 @@ There are a few convenience methods available after the comparison has been run:
    print(compare.df2_unq_columns())
    # set()
Duplicate rows
--------------

Datacompy will try to handle rows that are duplicated in the join columns. It does this behind the
scenes by generating a unique ID within each unique group of the join columns. For example, if you
have two dataframes you're trying to join on ``acct_id``:

=========== ================
acct_id name
=========== ================
1 George Maharis
1 Michael Bluth
2 George Bluth
=========== ================

=========== ================
acct_id name
=========== ================
1 George Maharis
1 Michael Bluth
1 Tony Wonder
2 George Bluth
=========== ================

Datacompy will generate a unique temporary ID for joining:

=========== ================ ========
acct_id name temp_id
=========== ================ ========
1 George Maharis 0
1 Michael Bluth 1
2 George Bluth 0
=========== ================ ========

=========== ================ ========
acct_id name temp_id
=========== ================ ========
1 George Maharis 0
1 Michael Bluth 1
1 Tony Wonder 2
2 George Bluth 0
=========== ================ ========

Datacompy then merges the two dataframes on the combination of the ``join_columns`` you specified and
the temporary ID, before dropping the ``temp_id`` again. So the first two rows in the first dataframe
will match the first two rows in the second dataframe, and the third row in the second dataframe will
be recognized as existing only in the second.
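
To make the mechanics concrete, here is a minimal sketch of how such a per-group ID can be built with
plain pandas (``temp_id`` is the hypothetical name used in the tables above; datacompy generates its
own temporary column name internally):

.. code-block:: python

    import pandas as pd

    df2 = pd.DataFrame({
        'acct_id': [1, 1, 1, 2],
        'name': ['George Maharis', 'Michael Bluth', 'Tony Wonder', 'George Bluth'],
    })
    # cumcount() numbers the rows within each acct_id group: 0, 1, 2, ...
    df2['temp_id'] = df2.groupby('acct_id').cumcount()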

Caveats
+++++++

- Duplicate matching is resilient to nulls in your join columns: it will convert the join
  columns to strings and fill null values with ``'DATACOMPY_NULL'`` before generating the temporary
  ID. If you already have ``'DATACOMPY_NULL'`` as a value in your join columns, the merge step will
  fail with a ``ValueError``. You can also fill null values with a value of your choice before
  initializing the ``Compare`` class, based on what you know about the data (see the sketch after
  this list).
- The duplicate matching is somewhat naïve when it comes to picking which rows to match when there
  are duplicates. Datacompy sorts by the other fields before generating the temporary ID, then
  matches directly on that ID. If there are a lot of duplicates you may need to join on more
  columns, or handle them separately.
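
A minimal sketch of pre-filling nulls yourself before the comparison (the sentinel ``-1`` is just an
example; choose a value that cannot collide with real data):

.. code-block:: python

    # Hypothetical pre-processing before building the Compare object
    df1['acct_id'] = df1['acct_id'].fillna(-1)
    df2['acct_id'] = df2['acct_id'].fillna(-1)
    compare = datacompy.Compare(df1, df2, join_columns=['acct_id'])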

Limitations
-----------
57 changes: 51 additions & 6 deletions tests/test_core.py
@@ -17,7 +17,7 @@
"""
Testing out the datacompy functionality
"""

+from datetime import datetime
from decimal import Decimal
import pytest
from pytest import raises
@@ -124,8 +124,9 @@ def test_string_columns_equal_with_ignore_spaces_and_case():
|something|False
||True"""
    df = pd.read_csv(six.StringIO(data), sep="|")
-    actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True,
-                                         ignore_case=True)
+    actual_out = datacompy.columns_equal(
+        df.a, df.b, rel_tol=0.2, ignore_spaces=True, ignore_case=True
+    )
    expect_out = df["expected"]
    assert_series_equal(expect_out, actual_out, check_names=False)

@@ -190,8 +191,9 @@ def test_date_columns_equal_with_ignore_spaces_and_case():
||True"""
    df = pd.read_csv(six.StringIO(data), sep="|")
    # First compare just the strings
-    actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True,
-                                         ignore_case=True)
+    actual_out = datacompy.columns_equal(
+        df.a, df.b, rel_tol=0.2, ignore_spaces=True, ignore_case=True
+    )
    expect_out = df["expected"]
    assert_series_equal(expect_out, actual_out, check_names=False)

@@ -735,7 +737,6 @@ def test_strings_with_joins_with_ignore_spaces():
    assert compare.intersect_rows_match()



def test_strings_with_joins_with_ignore_case():
    df1 = pd.DataFrame([{"a": "hi", "b": "a"}, {"a": "bye", "b": "A"}])
    df2 = pd.DataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "a"}])
@@ -851,3 +852,47 @@ def test_calculate_max_diff(column, expected):
    assert np.isclose(
        datacompy.calculate_max_diff(MAX_DIFF_DF["base"], MAX_DIFF_DF[column]), expected
    )


def test_dupes_with_nulls():
    df1 = pd.DataFrame(
        {
            "fld_1": [1, 2, 2, 3, 3, 4, 5, 5],
            "fld_2": ["A", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        }
    )
    df2 = pd.DataFrame({"fld_1": [1, 2, 3, 4, 5], "fld_2": ["A", np.nan, np.nan, np.nan, np.nan]})
    comp = datacompy.Compare(df1, df2, join_columns=["fld_1", "fld_2"])
    assert comp.subset()


@pytest.mark.parametrize(
    "dataframe,expected",
    [
        (pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}), pd.Series([0, 0, 0])),
        (pd.DataFrame({"a": ["a", "a", "DATACOMPY_NULL"], "b": [1, 1, 2]}), pd.Series([0, 1, 0])),
        (pd.DataFrame({"a": [-999, 2, 3], "b": [1, 2, 3]}), pd.Series([0, 0, 0])),
        (pd.DataFrame({"a": [1, np.nan, np.nan], "b": [1, 2, 2]}), pd.Series([0, 0, 1])),
        (pd.DataFrame({"a": ["1", np.nan, np.nan], "b": ["1", "2", "2"]}), pd.Series([0, 0, 1])),
        (
            pd.DataFrame({"a": [datetime(2018, 1, 1), np.nan, np.nan], "b": ["1", "2", "2"]}),
            pd.Series([0, 0, 1]),
        ),
    ],
)
def test_generate_id_within_group(dataframe, expected):
    assert (datacompy.core.generate_id_within_group(dataframe, ["a", "b"]) == expected).all()


@pytest.mark.parametrize(
    "dataframe, message",
    [
        (
            pd.DataFrame({"a": [1, np.nan, "DATACOMPY_NULL"], "b": [1, 2, 3]}),
            "DATACOMPY_NULL was found in your join columns",
        )
    ],
)
def test_generate_id_within_group_valueerror(dataframe, message):
    with raises(ValueError, message=message):
        datacompy.core.generate_id_within_group(dataframe, ["a", "b"])
