Merge pull request #337 from capitalone/develop

Release v0.14.0
capitalone · Oct 16, 2024 · b1dc886 · b1dc886
2 parents 6bd8f7a + 759efa2
commit b1dc886
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 24 deletions.
diff --git a/datacompy/__init__.py b/datacompy/__init__.py
@@ -18,7 +18,7 @@
 Then extended to carry that functionality over to Spark Dataframes.
 """
 
-__version__ = "0.13.3"
+__version__ = "0.14.0"
 
 import platform
 from warnings import warn

diff --git a/datacompy/core.py b/datacompy/core.py
@@ -770,6 +770,11 @@ def columns_equal(
     - Non-numeric values (i.e. where np.isclose can't be used) will just
       trigger True on two nulls or exact matches.
 
+    Notes
+    -----
+    As of version ``0.14.0``, if either column is of a mixed data type, the compare
+    will default to returning ``False`` for every row.
+
     Parameters
     ----------
     col_1 : Pandas.Series
@@ -792,6 +797,15 @@ def columns_equal(
         values don't match.
     """
     compare: pd.Series[bool]
+
+    # Short-circuit when either column is of a mixed type: we don't want to support this going forward, so every row is False.
+    if pd.api.types.infer_dtype(col_1).startswith("mixed") or pd.api.types.infer_dtype(
+        col_2
+    ).startswith("mixed"):
+        compare = pd.Series(False, index=col_1.index)
+        compare.index = col_1.index
+        return compare
+
     try:
         compare = pd.Series(
             np.isclose(col_1, col_2, rtol=rel_tol, atol=abs_tol, equal_nan=True)
@@ -810,15 +824,15 @@ def columns_equal(
         except (ValueError, TypeError):
             try:
                 if ignore_spaces:
-                    if col_1.dtype.kind == "O":
+                    if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
                         col_1 = col_1.str.strip()
-                    if col_2.dtype.kind == "O":
+                    if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
                         col_2 = col_2.str.strip()
 
                 if ignore_case:
-                    if col_1.dtype.kind == "O":
+                    if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1):
                         col_1 = col_1.str.upper()
-                    if col_2.dtype.kind == "O":
+                    if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2):
                         col_2 = col_2.str.upper()
 
                 if {col_1.dtype.kind, col_2.dtype.kind} == {"M", "O"}:

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -98,7 +98,7 @@ def test_string_columns_equal_with_ignore_spaces():
 something||False
 |something|False
 ||True"""
-    df = pd.read_csv(io.StringIO(data), sep="|")
+    df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False)
     actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True)
     expect_out = df["expected"]
     assert_series_equal(expect_out, actual_out, check_names=False)
@@ -119,7 +119,7 @@ def test_string_columns_equal_with_ignore_spaces_and_case():
 something||False
 |something|False
 ||True"""
-    df = pd.read_csv(io.StringIO(data), sep="|")
+    df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False)
     actual_out = datacompy.columns_equal(
         df.a, df.b, rel_tol=0.2, ignore_spaces=True, ignore_case=True
     )
@@ -160,7 +160,7 @@ def test_date_columns_equal_with_ignore_spaces():
 2017-01-01||False
 |2017-01-01|False
 ||True"""
-    df = pd.read_csv(io.StringIO(data), sep="|")
+    df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False)
     # First compare just the strings
     actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True)
     expect_out = df["expected"]
@@ -192,7 +192,7 @@ def test_date_columns_equal_with_ignore_spaces_and_case():
 2017-01-01||False
 |2017-01-01|False
 ||True"""
-    df = pd.read_csv(io.StringIO(data), sep="|")
+    df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False)
     # First compare just the strings
     actual_out = datacompy.columns_equal(
         df.a, df.b, rel_tol=0.2, ignore_spaces=True, ignore_case=True
@@ -364,10 +364,10 @@ def test_infinity_and_beyond():
 def test_mixed_column():
     df = pd.DataFrame(
         [
-            {"a": "hi", "b": "hi", "expected": True},
-            {"a": 1, "b": 1, "expected": True},
-            {"a": np.inf, "b": np.inf, "expected": True},
-            {"a": Decimal("1"), "b": Decimal("1"), "expected": True},
+            {"a": "hi", "b": "hi", "expected": False},
+            {"a": 1, "b": 1, "expected": False},
+            {"a": np.inf, "b": np.inf, "expected": False},
+            {"a": Decimal("1"), "b": Decimal("1"), "expected": False},
             {"a": 1, "b": "1", "expected": False},
             {"a": 1, "b": "yo", "expected": False},
         ]
@@ -380,10 +380,10 @@ def test_mixed_column():
 def test_mixed_column_with_ignore_spaces():
     df = pd.DataFrame(
         [
-            {"a": "hi", "b": "hi ", "expected": True},
-            {"a": 1, "b": 1, "expected": True},
-            {"a": np.inf, "b": np.inf, "expected": True},
-            {"a": Decimal("1"), "b": Decimal("1"), "expected": True},
+            {"a": "hi", "b": "hi ", "expected": False},
+            {"a": 1, "b": 1, "expected": False},
+            {"a": np.inf, "b": np.inf, "expected": False},
+            {"a": Decimal("1"), "b": Decimal("1"), "expected": False},
             {"a": 1, "b": "1 ", "expected": False},
             {"a": 1, "b": "yo ", "expected": False},
         ]
@@ -396,15 +396,15 @@ def test_mixed_column_with_ignore_spaces():
 def test_mixed_column_with_ignore_spaces_and_case():
     df = pd.DataFrame(
         [
-            {"a": "hi", "b": "hi ", "expected": True},
-            {"a": 1, "b": 1, "expected": True},
-            {"a": np.inf, "b": np.inf, "expected": True},
-            {"a": Decimal("1"), "b": Decimal("1"), "expected": True},
+            {"a": "hi", "b": "hi ", "expected": False},
+            {"a": 1, "b": 1, "expected": False},
+            {"a": np.inf, "b": np.inf, "expected": False},
+            {"a": Decimal("1"), "b": Decimal("1"), "expected": False},
             {"a": 1, "b": "1 ", "expected": False},
             {"a": 1, "b": "yo ", "expected": False},
-            {"a": "Hi", "b": "hI ", "expected": True},
-            {"a": "HI", "b": "HI ", "expected": True},
-            {"a": "hi", "b": "hi ", "expected": True},
+            {"a": "Hi", "b": "hI ", "expected": False},
+            {"a": "HI", "b": "HI ", "expected": False},
+            {"a": "hi", "b": "hi ", "expected": False},
         ]
     )
     actual_out = datacompy.columns_equal(