Merge pull request #36 from capitalone/null-dupes
Closes #35 by fixing dedupe bug
theianrobertson authored Jan 23, 2019
2 parents 46748b8 + 45884d0 commit 246aad8
Showing 4 changed files with 148 additions and 19 deletions.
2 changes: 1 addition & 1 deletion datacompy/_version.py
@@ -14,4 +14,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.5.1"
__version__ = "0.5.2"
44 changes: 34 additions & 10 deletions datacompy/core.py
@@ -24,6 +24,7 @@

import os
import logging
+from datetime import datetime
import pandas as pd
import numpy as np

@@ -219,16 +220,8 @@ def _dataframe_merge(self, ignore_spaces):

        # Create order column for uniqueness of match
        order_column = temp_column_name(self.df1, self.df2)
-        self.df1[order_column] = (
-            self.df1.sort_values(by=list(self.df1.columns))
-            .groupby(temp_join_columns)
-            .cumcount()
-        )
-        self.df2[order_column] = (
-            self.df2.sort_values(by=list(self.df2.columns))
-            .groupby(temp_join_columns)
-            .cumcount()
-        )
+        self.df1[order_column] = generate_id_within_group(self.df1, temp_join_columns)
+        self.df2[order_column] = generate_id_within_group(self.df2, temp_join_columns)
        temp_join_columns.append(order_column)

        params = {"on": temp_join_columns}
@@ -761,3 +754,34 @@ def calculate_max_diff(col_1, col_2):
        return (col_1.astype(float) - col_2.astype(float)).abs().max()
    except:
        return 0


def generate_id_within_group(dataframe, join_columns):
    """Generate an ID column that can be used to deduplicate identical rows.  The series generated
    is the order within a unique group, and it handles nulls.

    Parameters
    ----------
    dataframe : Pandas.DataFrame
        The dataframe to operate on
    join_columns : list
        List of strings which are the join columns

    Returns
    -------
    Pandas.Series
        The ID column that's unique in each group.
    """
    default_value = "DATACOMPY_NULL"
    if dataframe[join_columns].isnull().any().any():
        if (dataframe[join_columns] == default_value).any().any():
            raise ValueError("{} was found in your join columns".format(default_value))
        return (
            dataframe[join_columns]
            .astype(str)
            .fillna(default_value)
            .groupby(join_columns)
            .cumcount()
        )
    else:
        return dataframe[join_columns].groupby(join_columns).cumcount()
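
As an aside (not part of the commit), a quick illustration of why the string conversion matters: pandas groupby drops null keys by default, so rows with nulls in the join columns would get no usable ID, while string-converted nulls form a real group that cumcount can number. The column names here are made up.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"fld_1": [1.0, np.nan, np.nan], "fld_2": ["a", "b", "b"]})
    # NaN becomes the string "nan", so every row lands in a real group and
    # cumcount() numbers the duplicates within it.
    print(df.astype(str).groupby(["fld_1", "fld_2"]).cumcount().tolist())
    # -> [0, 0, 1]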
64 changes: 62 additions & 2 deletions sphinx/pandas_usage.rst
@@ -36,7 +36,7 @@ acct_id dollar_amt name float_fld
Set up like:

.. code-block:: python

    from io import StringIO
    import pandas as pd
    import datacompy
@@ -82,7 +82,7 @@ join column(s) or by index.
    compare = datacompy.Compare(df1, df2, join_columns=['acct_id', 'name'])
    # OR
    compare = datacompy.Compare(df1, df2, on_index=True)
@@ -196,6 +196,66 @@ There are a few convenience methods available after the comparison has been run:
    print(compare.df2_unq_columns())
    # set()
Duplicate rows
--------------

Datacompy will try to handle rows that are duplicated in the join columns. It does this behind the
scenes by generating a unique ID within each unique group of the join columns. For example, if you
have two dataframes you're trying to join on ``acct_id``:

=========== ================
acct_id name
=========== ================
1 George Maharis
1 Michael Bluth
2 George Bluth
=========== ================

=========== ================
acct_id name
=========== ================
1 George Maharis
1 Michael Bluth
1 Tony Wonder
2 George Bluth
=========== ================

Datacompy will generate a unique temporary ID for joining:

=========== ================ ========
acct_id name temp_id
=========== ================ ========
1 George Maharis 0
1 Michael Bluth 1
2 George Bluth 0
=========== ================ ========

=========== ================ ========
acct_id name temp_id
=========== ================ ========
1 George Maharis 0
1 Michael Bluth 1
1 Tony Wonder 2
2 George Bluth 0
=========== ================ ========

Datacompy then merges the two dataframes on the combination of the ``join_columns`` you specified and
the temporary ID, before dropping the ``temp_id`` again. So the first two rows in the first dataframe
will match the first two rows in the second dataframe, and the third row in the second dataframe will
be recognized as existing only in the second.
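
To make the mechanics concrete, here is a minimal sketch of how such a per-group ID can be built with
plain pandas (``temp_id`` is the hypothetical name used in the tables above; datacompy generates its
own temporary column name internally):

.. code-block:: python

    import pandas as pd

    df2 = pd.DataFrame({
        'acct_id': [1, 1, 1, 2],
        'name': ['George Maharis', 'Michael Bluth', 'Tony Wonder', 'George Bluth'],
    })
    # cumcount() numbers the rows within each acct_id group: 0, 1, 2, ...
    df2['temp_id'] = df2.groupby('acct_id').cumcount()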

Caveats
+++++++

- Duplicate matching is resilient to nulls in your join columns: it will convert the join
  columns to strings and fill null values with ``'DATACOMPY_NULL'`` before generating the temporary
  ID. If you already have ``'DATACOMPY_NULL'`` as a value in your join columns, the merge step will
  fail with a ``ValueError``. You can also fill null values with a value of your choice before
  initializing the ``Compare`` class, based on what you know about the data (see the sketch after
  this list).
- The duplicate matching is somewhat naïve when it comes to picking which rows to match when there
  are duplicates. Datacompy sorts by the other fields before generating the temporary ID, then
  matches directly on that ID. If there are a lot of duplicates you may need to join on more
  columns, or handle them separately.
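
A minimal sketch of pre-filling nulls yourself before the comparison (the sentinel ``-1`` is just an
example; choose a value that cannot collide with real data):

.. code-block:: python

    # Hypothetical pre-processing before building the Compare object
    df1['acct_id'] = df1['acct_id'].fillna(-1)
    df2['acct_id'] = df2['acct_id'].fillna(-1)
    compare = datacompy.Compare(df1, df2, join_columns=['acct_id'])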

Limitations
-----------
57 changes: 51 additions & 6 deletions tests/test_core.py
@@ -17,7 +17,7 @@
"""
Testing out the datacompy functionality
"""

+from datetime import datetime
from decimal import Decimal
import pytest
from pytest import raises
@@ -124,8 +124,9 @@ def test_string_columns_equal_with_ignore_spaces_and_case():
|something|False
||True"""
    df = pd.read_csv(six.StringIO(data), sep="|")
-    actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True,
-                                         ignore_case=True)
+    actual_out = datacompy.columns_equal(
+        df.a, df.b, rel_tol=0.2, ignore_spaces=True, ignore_case=True
+    )
    expect_out = df["expected"]
    assert_series_equal(expect_out, actual_out, check_names=False)

@@ -190,8 +191,9 @@ def test_date_columns_equal_with_ignore_spaces_and_case():
||True"""
    df = pd.read_csv(six.StringIO(data), sep="|")
    # First compare just the strings
-    actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True,
-                                         ignore_case=True)
+    actual_out = datacompy.columns_equal(
+        df.a, df.b, rel_tol=0.2, ignore_spaces=True, ignore_case=True
+    )
    expect_out = df["expected"]
    assert_series_equal(expect_out, actual_out, check_names=False)

@@ -735,7 +737,6 @@ def test_strings_with_joins_with_ignore_spaces():
    assert compare.intersect_rows_match()



def test_strings_with_joins_with_ignore_case():
    df1 = pd.DataFrame([{"a": "hi", "b": "a"}, {"a": "bye", "b": "A"}])
    df2 = pd.DataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "a"}])
@@ -851,3 +852,47 @@ def test_calculate_max_diff(column, expected):
    assert np.isclose(
        datacompy.calculate_max_diff(MAX_DIFF_DF["base"], MAX_DIFF_DF[column]), expected
    )


def test_dupes_with_nulls():
    df1 = pd.DataFrame(
        {
            "fld_1": [1, 2, 2, 3, 3, 4, 5, 5],
            "fld_2": ["A", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        }
    )
    df2 = pd.DataFrame({"fld_1": [1, 2, 3, 4, 5], "fld_2": ["A", np.nan, np.nan, np.nan, np.nan]})
    comp = datacompy.Compare(df1, df2, join_columns=["fld_1", "fld_2"])
    assert comp.subset()


@pytest.mark.parametrize(
    "dataframe,expected",
    [
        (pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}), pd.Series([0, 0, 0])),
        (pd.DataFrame({"a": ["a", "a", "DATACOMPY_NULL"], "b": [1, 1, 2]}), pd.Series([0, 1, 0])),
        (pd.DataFrame({"a": [-999, 2, 3], "b": [1, 2, 3]}), pd.Series([0, 0, 0])),
        (pd.DataFrame({"a": [1, np.nan, np.nan], "b": [1, 2, 2]}), pd.Series([0, 0, 1])),
        (pd.DataFrame({"a": ["1", np.nan, np.nan], "b": ["1", "2", "2"]}), pd.Series([0, 0, 1])),
        (
            pd.DataFrame({"a": [datetime(2018, 1, 1), np.nan, np.nan], "b": ["1", "2", "2"]}),
            pd.Series([0, 0, 1]),
        ),
    ],
)
def test_generate_id_within_group(dataframe, expected):
    assert (datacompy.core.generate_id_within_group(dataframe, ["a", "b"]) == expected).all()


@pytest.mark.parametrize(
    "dataframe, message",
    [
        (
            pd.DataFrame({"a": [1, np.nan, "DATACOMPY_NULL"], "b": [1, 2, 3]}),
            "DATACOMPY_NULL was found in your join columns",
        )
    ],
)
def test_generate_id_within_group_valueerror(dataframe, message):
    with raises(ValueError, message=message):
        datacompy.core.generate_id_within_group(dataframe, ["a", "b"])
