From 759efa2d0f4eb9373eb02ea6f3713d077622ad00 Mon Sep 17 00:00:00 2001 From: Faisal Date: Wed, 9 Oct 2024 15:06:52 -0300 Subject: [PATCH] check for is_string_dtype and unsupported mixed type (#335) --- datacompy/__init__.py | 2 +- datacompy/core.py | 22 ++++++++++++++++++---- tests/test_core.py | 38 +++++++++++++++++++------------------- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/datacompy/__init__.py b/datacompy/__init__.py index 8dfa816a..a1890224 100644 --- a/datacompy/__init__.py +++ b/datacompy/__init__.py @@ -18,7 +18,7 @@ Then extended to carry that functionality over to Spark Dataframes. """ -__version__ = "0.13.3" +__version__ = "0.14.0" import platform from warnings import warn diff --git a/datacompy/core.py b/datacompy/core.py index 0089dc38..f9a3a314 100644 --- a/datacompy/core.py +++ b/datacompy/core.py @@ -770,6 +770,11 @@ def columns_equal( - Non-numeric values (i.e. where np.isclose can't be used) will just trigger True on two nulls or exact matches. + Notes + ----- + As of version ``0.14.0``, if a column is of a mixed data type the compare will + default to returning ``False``. + Parameters ---------- col_1 : Pandas.Series @@ -792,6 +797,15 @@ def columns_equal( values don't match. """ compare: pd.Series[bool] + + # short circuit if comparing mixed type columns. We don't want to support this moving forward. 
+ if pd.api.types.infer_dtype(col_1).startswith("mixed") or pd.api.types.infer_dtype( + col_2 + ).startswith("mixed"): + compare = pd.Series(False, index=col_1.index) + compare.index = col_1.index + return compare + try: compare = pd.Series( np.isclose(col_1, col_2, rtol=rel_tol, atol=abs_tol, equal_nan=True) @@ -810,15 +824,15 @@ def columns_equal( except (ValueError, TypeError): try: if ignore_spaces: - if col_1.dtype.kind == "O": + if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1): col_1 = col_1.str.strip() - if col_2.dtype.kind == "O": + if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2): col_2 = col_2.str.strip() if ignore_case: - if col_1.dtype.kind == "O": + if col_1.dtype.kind == "O" and pd.api.types.is_string_dtype(col_1): col_1 = col_1.str.upper() - if col_2.dtype.kind == "O": + if col_2.dtype.kind == "O" and pd.api.types.is_string_dtype(col_2): col_2 = col_2.str.upper() if {col_1.dtype.kind, col_2.dtype.kind} == {"M", "O"}: diff --git a/tests/test_core.py b/tests/test_core.py index 14298e09..482a12f4 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -98,7 +98,7 @@ def test_string_columns_equal_with_ignore_spaces(): something||False |something|False ||True""" - df = pd.read_csv(io.StringIO(data), sep="|") + df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False) actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True) expect_out = df["expected"] assert_series_equal(expect_out, actual_out, check_names=False) @@ -119,7 +119,7 @@ def test_string_columns_equal_with_ignore_spaces_and_case(): something||False |something|False ||True""" - df = pd.read_csv(io.StringIO(data), sep="|") + df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False) actual_out = datacompy.columns_equal( df.a, df.b, rel_tol=0.2, ignore_spaces=True, ignore_case=True ) @@ -160,7 +160,7 @@ def test_date_columns_equal_with_ignore_spaces(): 2017-01-01||False |2017-01-01|False ||True""" - df = 
pd.read_csv(io.StringIO(data), sep="|") + df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False) # First compare just the strings actual_out = datacompy.columns_equal(df.a, df.b, rel_tol=0.2, ignore_spaces=True) expect_out = df["expected"] @@ -192,7 +192,7 @@ def test_date_columns_equal_with_ignore_spaces_and_case(): 2017-01-01||False |2017-01-01|False ||True""" - df = pd.read_csv(io.StringIO(data), sep="|") + df = pd.read_csv(io.StringIO(data), sep="|", keep_default_na=False) # First compare just the strings actual_out = datacompy.columns_equal( df.a, df.b, rel_tol=0.2, ignore_spaces=True, ignore_case=True @@ -364,10 +364,10 @@ def test_infinity_and_beyond(): def test_mixed_column(): df = pd.DataFrame( [ - {"a": "hi", "b": "hi", "expected": True}, - {"a": 1, "b": 1, "expected": True}, - {"a": np.inf, "b": np.inf, "expected": True}, - {"a": Decimal("1"), "b": Decimal("1"), "expected": True}, + {"a": "hi", "b": "hi", "expected": False}, + {"a": 1, "b": 1, "expected": False}, + {"a": np.inf, "b": np.inf, "expected": False}, + {"a": Decimal("1"), "b": Decimal("1"), "expected": False}, {"a": 1, "b": "1", "expected": False}, {"a": 1, "b": "yo", "expected": False}, ] @@ -380,10 +380,10 @@ def test_mixed_column(): def test_mixed_column_with_ignore_spaces(): df = pd.DataFrame( [ - {"a": "hi", "b": "hi ", "expected": True}, - {"a": 1, "b": 1, "expected": True}, - {"a": np.inf, "b": np.inf, "expected": True}, - {"a": Decimal("1"), "b": Decimal("1"), "expected": True}, + {"a": "hi", "b": "hi ", "expected": False}, + {"a": 1, "b": 1, "expected": False}, + {"a": np.inf, "b": np.inf, "expected": False}, + {"a": Decimal("1"), "b": Decimal("1"), "expected": False}, {"a": 1, "b": "1 ", "expected": False}, {"a": 1, "b": "yo ", "expected": False}, ] @@ -396,15 +396,15 @@ def test_mixed_column_with_ignore_spaces(): def test_mixed_column_with_ignore_spaces_and_case(): df = pd.DataFrame( [ - {"a": "hi", "b": "hi ", "expected": True}, - {"a": 1, "b": 1, "expected": True}, - 
{"a": np.inf, "b": np.inf, "expected": True}, - {"a": Decimal("1"), "b": Decimal("1"), "expected": True}, + {"a": "hi", "b": "hi ", "expected": False}, + {"a": 1, "b": 1, "expected": False}, + {"a": np.inf, "b": np.inf, "expected": False}, + {"a": Decimal("1"), "b": Decimal("1"), "expected": False}, {"a": 1, "b": "1 ", "expected": False}, {"a": 1, "b": "yo ", "expected": False}, - {"a": "Hi", "b": "hI ", "expected": True}, - {"a": "HI", "b": "HI ", "expected": True}, - {"a": "hi", "b": "hi ", "expected": True}, + {"a": "Hi", "b": "hI ", "expected": False}, + {"a": "HI", "b": "HI ", "expected": False}, + {"a": "hi", "b": "hi ", "expected": False}, ] ) actual_out = datacompy.columns_equal(