From f8235a951e182a271166813c025aa4a164d4b703 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 27 May 2024 09:53:24 -0400 Subject: [PATCH 1/8] [edgetest] automated change (#304) Co-authored-by: fdosani --- pyproject.toml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2636643b..7f51ac4f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,13 +11,7 @@ maintainers = [ { name="Faisal Dosani", email="faisal.dosani@capitalone.com" } ] license = {text = "Apache Software License"} -dependencies = [ - "pandas<=2.2.2,>=0.25.0", - "numpy<=1.26.4,>=1.22.0", - "ordered-set<=4.1.0,>=4.0.2", - "fugue<=0.8.7,>=0.8.7", - "polars<=0.20.27,>=0.20.4", -] +dependencies = ["pandas<=2.2.2,>=0.25.0", "numpy<=1.26.4,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "fugue<=0.9.0,>=0.8.7", "polars<=0.20.29,>=0.20.4"] requires-python = ">=3.9.0" classifiers = [ "Intended Audience :: Developers", From b133a2ff04fc5c3a4d2ace3079315dbfc6ec634f Mon Sep 17 00:00:00 2001 From: Faisal Date: Wed, 29 May 2024 12:21:09 -0300 Subject: [PATCH 2/8] spark clean up (#305) * spark clean up * fixing spark session weirdness with parameters --- pyproject.toml | 2 +- tests/test_spark.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7f51ac4f..73c34780 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,7 @@ ray = ["fugue[ray]"] docs = ["sphinx", "furo", "myst-parser"] tests = ["pytest", "pytest-cov"] -tests-spark = ["pytest", "pytest-cov", "pytest-spark", "spark"] +tests-spark = ["pytest", "pytest-cov", "pytest-spark"] qa = ["pre-commit", "black", "isort", "mypy", "pandas-stubs"] build = ["build", "twine", "wheel"] edgetest = ["edgetest", "edgetest-conda"] diff --git a/tests/test_spark.py b/tests/test_spark.py index 88acc9a0..8f789fa4 100644 --- a/tests/test_spark.py +++ b/tests/test_spark.py @@ -46,6 +46,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +ps.set_option("compute.ops_on_diff_frames", True) pandas_version = pytest.mark.skipif( pd.__version__ >= "2.0.0", reason="Pandas 2 is currently not supported" ) @@ -1206,9 +1207,8 @@ def test_dupes_with_nulls_ints(): @pandas_version -@pytest.mark.parametrize( - "dataframe,expected", - [ +def test_generate_id_within_group(): + matrix = [ (ps.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}), ps.Series([0, 0, 0])), ( ps.DataFrame({"a": ["a", "a", "DATACOMPY_NULL"], "b": [1, 1, 2]}), @@ -1229,10 +1229,11 @@ def test_dupes_with_nulls_ints(): ), ps.Series([0, 0, 1]), ), - ], -) -def test_generate_id_within_group(dataframe, expected): - assert (generate_id_within_group(dataframe, ["a", "b"]) == expected).all() + ] + for i in matrix: + dataframe = i[0] + expected = i[1] + assert (generate_id_within_group(dataframe, ["a", "b"]) == expected).all() @pandas_version From 8608830f8707804cc6f1e246d76929706b8b1b73 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 3 Jun 2024 09:28:14 -0400 Subject: [PATCH 3/8] [edgetest] automated change (#307) Co-authored-by: fdosani --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 73c34780..0d619ec3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ maintainers = [ { name="Faisal Dosani", email="faisal.dosani@capitalone.com" } ] license = {text = "Apache Software License"} -dependencies = 
["pandas<=2.2.2,>=0.25.0", "numpy<=1.26.4,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "fugue<=0.9.0,>=0.8.7", "polars<=0.20.29,>=0.20.4"] +dependencies = ["pandas<=2.2.2,>=0.25.0", "numpy<=1.26.4,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "fugue<=0.9.0,>=0.8.7", "polars<=0.20.30,>=0.20.4"] requires-python = ">=3.9.0" classifiers = [ "Intended Audience :: Developers", From 1fd8ac14b991afca142b9bdb1b4f7bbec69326b9 Mon Sep 17 00:00:00 2001 From: Faisal Date: Mon, 3 Jun 2024 13:51:21 -0300 Subject: [PATCH 4/8] bug fix and clean up of temp_column_name (#308) --- datacompy/base.py | 30 +++++++++++++++++++++++++++++ datacompy/core.py | 27 +------------------------- datacompy/polars.py | 45 ++++++++++++++------------------------------ datacompy/spark.py | 36 +++++++---------------------------- tests/test_core.py | 36 +++++++++++++++++------------------ tests/test_polars.py | 45 ++++++++++++++++++++++---------------------- tests/test_spark.py | 24 +++++++++++------------ 7 files changed, 105 insertions(+), 138 deletions(-) diff --git a/datacompy/base.py b/datacompy/base.py index 23a815fc..6ac54afe 100644 --- a/datacompy/base.py +++ b/datacompy/base.py @@ -139,3 +139,33 @@ def report( html_file: Optional[str] = None, ) -> str: pass + + +def temp_column_name(*dataframes) -> str: + """Gets a temp column name that isn't included in columns of any dataframes + + Parameters + ---------- + dataframes : list of DataFrames + The DataFrames to create a temporary column name for + + Returns + ------- + str + String column name that looks like '_temp_x' for some integer x + """ + i = 0 + columns = [] + for dataframe in dataframes: + columns = columns + list(dataframe.columns) + columns = set(columns) + + while True: + temp_column = f"_temp_{i}" + unique = True + + if temp_column in columns: + i += 1 + unique = False + if unique: + return temp_column diff --git a/datacompy/core.py b/datacompy/core.py index 042dffb4..d10967a8 100644 --- a/datacompy/core.py +++ b/datacompy/core.py @@ -29,7 +29,7 @@ import pandas as pd from ordered_set import OrderedSet -from datacompy.base import BaseCompare +from datacompy.base import BaseCompare, temp_column_name LOG = logging.getLogger(__name__) @@ -890,31 +890,6 @@ def get_merged_columns( return columns -def temp_column_name(*dataframes: pd.DataFrame) -> str: - """Gets a temp column name that isn't included in columns of any dataframes - - Parameters - ---------- - dataframes : list of Pandas.DataFrame - The DataFrames to create a temporary column name for - - Returns - ------- - str - String column name that looks like '_temp_x' for some integer x - """ - i = 0 - while True: - temp_column = f"_temp_{i}" - unique = True - for dataframe in dataframes: - if temp_column in dataframe.columns: - i += 1 - unique = False - if unique: - return temp_column - - def calculate_max_diff(col_1: "pd.Series[Any]", col_2: "pd.Series[Any]") -> float: """Get a maximum difference between two columns diff --git a/datacompy/polars.py b/datacompy/polars.py index aca96296..5b0d22cb 100644 --- a/datacompy/polars.py +++ b/datacompy/polars.py @@ -29,7 +29,7 @@ import numpy as np from ordered_set import OrderedSet -from datacompy.base import BaseCompare +from datacompy.base import BaseCompare, temp_column_name try: import polars as pl @@ -278,11 +278,17 @@ def _dataframe_merge(self, ignore_spaces: bool) -> None: # process merge indicator outer_join = outer_join.with_columns( - pl.when((pl.col("_merge_left") == True) & (pl.col("_merge_right") == True)) + pl.when( + (pl.col("_merge_left") == True) & 
(pl.col("_merge_right") == True) + ) # noqa: E712 .then(pl.lit("both")) - .when((pl.col("_merge_left") == True) & (pl.col("_merge_right").is_null())) + .when( + (pl.col("_merge_left") == True) & (pl.col("_merge_right").is_null()) + ) # noqa: E712 .then(pl.lit("left_only")) - .when((pl.col("_merge_left").is_null()) & (pl.col("_merge_right") == True)) + .when( + (pl.col("_merge_left").is_null()) & (pl.col("_merge_right") == True) + ) # noqa: E712 .then(pl.lit("right_only")) .alias("_merge") ) @@ -497,7 +503,9 @@ def sample_mismatch( col_match = self.intersect_rows[column + "_match"] match_cnt = col_match.sum() sample_count = min(sample_count, row_cnt - match_cnt) # type: ignore - sample = self.intersect_rows.filter(pl.col(column + "_match") != True).sample( + sample = self.intersect_rows.filter( + pl.col(column + "_match") != True + ).sample( # noqa: E712 sample_count ) return_cols = self.join_columns + [ @@ -558,7 +566,7 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> "pl.DataFrame": ) return ( self.intersect_rows.with_columns(__all=pl.all_horizontal(match_list)) - .filter(pl.col("__all") != True) + .filter(pl.col("__all") != True) # noqa: E712 .select(self.join_columns + return_list) ) @@ -899,31 +907,6 @@ def get_merged_columns( return columns -def temp_column_name(*dataframes: "pl.DataFrame") -> str: - """Gets a temp column name that isn't included in columns of any dataframes - - Parameters - ---------- - dataframes : list of Polars.DataFrame - The DataFrames to create a temporary column name for - - Returns - ------- - str - String column name that looks like '_temp_x' for some integer x - """ - i = 0 - while True: - temp_column = f"_temp_{i}" - unique = True - for dataframe in dataframes: - if temp_column in dataframe.columns: - i += 1 - unique = False - if unique: - return temp_column - - def calculate_max_diff(col_1: "pl.Series", col_2: "pl.Series") -> float: """Get a maximum difference between two columns diff --git a/datacompy/spark.py b/datacompy/spark.py index 070a58e5..aad9f11e 100644 --- a/datacompy/spark.py +++ b/datacompy/spark.py @@ -27,7 +27,7 @@ import pandas as pd from ordered_set import OrderedSet -from datacompy.base import BaseCompare +from datacompy.base import BaseCompare, temp_column_name try: import pyspark.pandas as ps @@ -301,15 +301,18 @@ def _dataframe_merge(self, ignore_spaces): # process merge indicator outer_join["_merge"] = outer_join._merge.mask( - (outer_join["_merge_left"] == True) & (outer_join["_merge_right"] == True), + (outer_join["_merge_left"] == True) + & (outer_join["_merge_right"] == True), # noqa: E712 "both", ) outer_join["_merge"] = outer_join._merge.mask( - (outer_join["_merge_left"] == True) & (outer_join["_merge_right"] != True), + (outer_join["_merge_left"] == True) + & (outer_join["_merge_right"] != True), # noqa: E712 "left_only", ) outer_join["_merge"] = outer_join._merge.mask( - (outer_join["_merge_left"] != True) & (outer_join["_merge_right"] == True), + (outer_join["_merge_left"] != True) + & (outer_join["_merge_right"] == True), # noqa: E712 "right_only", ) @@ -913,31 +916,6 @@ def get_merged_columns(original_df, merged_df, suffix): return columns -def temp_column_name(*dataframes): - """Gets a temp column name that isn't included in columns of any dataframes - - Parameters - ---------- - dataframes : list of pyspark.pandas.frame.DataFrame - The DataFrames to create a temporary column name for - - Returns - ------- - str - String column name that looks like '_temp_x' for some integer x - """ - i = 0 - while True: 
- temp_column = f"_temp_{i}" - unique = True - for dataframe in dataframes: - if temp_column in dataframe.columns: - i += 1 - unique = False - if unique: - return temp_column - - def calculate_max_diff(col_1, col_2): """Get a maximum difference between two columns diff --git a/tests/test_core.py b/tests/test_core.py index 12c204df..103b9f3b 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -417,11 +417,11 @@ def test_mixed_column_with_ignore_spaces_and_case(): def test_compare_df_setter_bad(): df = pd.DataFrame([{"a": 1, "A": 2}, {"a": 2, "A": 2}]) with raises(TypeError, match="df1 must be a pandas DataFrame"): - compare = datacompy.Compare("a", "a", ["a"]) + datacompy.Compare("a", "a", ["a"]) with raises(ValueError, match="df1 must have all columns from join_columns"): - compare = datacompy.Compare(df, df.copy(), ["b"]) + datacompy.Compare(df, df.copy(), ["b"]) with raises(ValueError, match="df1 must have unique column names"): - compare = datacompy.Compare(df, df.copy(), ["a"]) + datacompy.Compare(df, df.copy(), ["a"]) df_dupe = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 3}]) assert datacompy.Compare(df_dupe, df_dupe.copy(), ["a", "b"]).df1.equals(df_dupe) @@ -450,15 +450,15 @@ def test_compare_df_setter_different_cases(): def test_compare_df_setter_bad_index(): df = pd.DataFrame([{"a": 1, "A": 2}, {"a": 2, "A": 2}]) with raises(TypeError, match="df1 must be a pandas DataFrame"): - compare = datacompy.Compare("a", "a", on_index=True) + datacompy.Compare("a", "a", on_index=True) with raises(ValueError, match="df1 must have unique column names"): - compare = datacompy.Compare(df, df.copy(), on_index=True) + datacompy.Compare(df, df.copy(), on_index=True) def test_compare_on_index_and_join_columns(): df = pd.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}]) with raises(Exception, match="Only provide on_index or join_columns"): - compare = datacompy.Compare(df, df.copy(), on_index=True, join_columns=["a"]) + datacompy.Compare(df, df.copy(), on_index=True, join_columns=["a"]) def test_compare_df_setter_good_index(): @@ -647,7 +647,7 @@ def test_temp_column_name_one_has(): assert actual == "_temp_1" -def test_temp_column_name_both_have(): +def test_temp_column_name_both_have_temp_1(): df1 = pd.DataFrame([{"_temp_0": "hi", "b": 2}, {"_temp_0": "bye", "b": 2}]) df2 = pd.DataFrame( [ @@ -660,7 +660,7 @@ def test_temp_column_name_both_have(): assert actual == "_temp_1" -def test_temp_column_name_both_have(): +def test_temp_column_name_both_have_temp_2(): df1 = pd.DataFrame([{"_temp_0": "hi", "b": 2}, {"_temp_0": "bye", "b": 2}]) df2 = pd.DataFrame( [ @@ -693,7 +693,7 @@ def test_simple_dupes_one_field(): compare = datacompy.Compare(df1, df2, join_columns=["a"]) assert compare.matches() # Just render the report to make sure it renders. - t = compare.report() + compare.report() def test_simple_dupes_two_fields(): @@ -702,7 +702,7 @@ def test_simple_dupes_two_fields(): compare = datacompy.Compare(df1, df2, join_columns=["a", "b"]) assert compare.matches() # Just render the report to make sure it renders. - t = compare.report() + compare.report() def test_simple_dupes_index(): @@ -714,19 +714,19 @@ def test_simple_dupes_index(): compare = datacompy.Compare(df1, df2, on_index=True) assert compare.matches() # Just render the report to make sure it renders. 
- t = compare.report() + compare.report() -def test_simple_dupes_one_field_two_vals(): +def test_simple_dupes_one_field_two_vals_1(): df1 = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) df2 = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) compare = datacompy.Compare(df1, df2, join_columns=["a"]) assert compare.matches() # Just render the report to make sure it renders. - t = compare.report() + compare.report() -def test_simple_dupes_one_field_two_vals(): +def test_simple_dupes_one_field_two_vals_2(): df1 = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) df2 = pd.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 0}]) compare = datacompy.Compare(df1, df2, join_columns=["a"]) @@ -735,7 +735,7 @@ def test_simple_dupes_one_field_two_vals(): assert len(compare.df2_unq_rows) == 1 assert len(compare.intersect_rows) == 1 # Just render the report to make sure it renders. - t = compare.report() + compare.report() def test_simple_dupes_one_field_three_to_two_vals(): @@ -747,7 +747,7 @@ def test_simple_dupes_one_field_three_to_two_vals(): assert len(compare.df2_unq_rows) == 0 assert len(compare.intersect_rows) == 2 # Just render the report to make sure it renders. - t = compare.report() + compare.report() assert "(First 1 Columns)" in compare.report(column_count=1) assert "(First 2 Columns)" in compare.report(column_count=2) @@ -786,8 +786,8 @@ def test_dupes_from_real_data(): ) assert compare_unq.matches() # Just render the report to make sure it renders. - t = compare_acct.report() - r = compare_unq.report() + compare_acct.report() + compare_unq.report() def test_strings_with_joins_with_ignore_spaces(): diff --git a/tests/test_polars.py b/tests/test_polars.py index 679a9ab7..c878cbba 100644 --- a/tests/test_polars.py +++ b/tests/test_polars.py @@ -16,6 +16,7 @@ """ Testing out the datacompy functionality """ + import io import logging import sys @@ -29,12 +30,12 @@ pytest.importorskip("polars") -import polars as pl -from polars.exceptions import ComputeError, DuplicateError -from polars.testing import assert_series_equal +import polars as pl # noqa: E402 +from polars.exceptions import ComputeError, DuplicateError # noqa: E402 +from polars.testing import assert_series_equal # noqa: E402 -from datacompy import PolarsCompare -from datacompy.polars import ( +from datacompy import PolarsCompare # noqa: E402 +from datacompy.polars import ( # noqa: E402 calculate_max_diff, columns_equal, generate_id_within_group, @@ -383,11 +384,11 @@ def test_compare_df_setter_bad(): df_same_col_names = pl.DataFrame([{"a": 1, "A": 2}, {"a": 2, "A": 2}]) df_dupe = pl.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 3}]) with raises(TypeError, match="df1 must be a Polars DataFrame"): - compare = PolarsCompare("a", "a", ["a"]) + PolarsCompare("a", "a", ["a"]) with raises(ValueError, match="df1 must have all columns from join_columns"): - compare = PolarsCompare(df, df.clone(), ["b"]) + PolarsCompare(df, df.clone(), ["b"]) with raises(DuplicateError, match="duplicate column names found"): - compare = PolarsCompare(df_same_col_names, df_same_col_names.clone(), ["a"]) + PolarsCompare(df_same_col_names, df_same_col_names.clone(), ["a"]) assert ( PolarsCompare(df_dupe, df_dupe.clone(), ["a", "b"]) .df1.drop("_merge_left") @@ -419,9 +420,9 @@ def test_compare_df_setter_different_cases(): def test_compare_df_setter_bad_index(): df = pl.DataFrame([{"a": 1, "A": 2}, {"a": 2, "A": 2}]) with raises(TypeError, match="df1 must be a Polars DataFrame"): - compare = PolarsCompare("a", "a", join_columns="a") + PolarsCompare("a", "a", 
join_columns="a") with raises(DuplicateError, match="duplicate column names found"): - compare = PolarsCompare(df, df.clone(), join_columns="a") + PolarsCompare(df, df.clone(), join_columns="a") def test_compare_df_setter_good_index(): @@ -535,7 +536,7 @@ def test_float_and_string_with_joins(): df1 = pl.DataFrame([{"a": float("1"), "b": 2}, {"a": float("2"), "b": 2}]) df2 = pl.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}]) with raises(ComputeError): - compare = PolarsCompare(df1, df2, "a") + PolarsCompare(df1, df2, "a") def test_decimal_with_nulls(): @@ -576,7 +577,7 @@ def test_temp_column_name_one_has(): assert actual == "_temp_1" -def test_temp_column_name_both_have(): +def test_temp_column_name_both_have_temp_1(): df1 = pl.DataFrame([{"_temp_0": "hi", "b": 2}, {"_temp_0": "bye", "b": 2}]) df2 = pl.DataFrame( [ @@ -589,7 +590,7 @@ def test_temp_column_name_both_have(): assert actual == "_temp_1" -def test_temp_column_name_both_have(): +def test_temp_column_name_both_have_temp_2(): df1 = pl.DataFrame([{"_temp_0": "hi", "b": 2}, {"_temp_0": "bye", "b": 2}]) df2 = pl.DataFrame( [ @@ -622,7 +623,7 @@ def test_simple_dupes_one_field(): compare = PolarsCompare(df1, df2, join_columns=["a"]) assert compare.matches() # Just render the report to make sure it renders. - t = compare.report() + compare.report() def test_simple_dupes_two_fields(): @@ -631,19 +632,19 @@ def test_simple_dupes_two_fields(): compare = PolarsCompare(df1, df2, join_columns=["a", "b"]) assert compare.matches() # Just render the report to make sure it renders. - t = compare.report() + compare.report() -def test_simple_dupes_one_field_two_vals(): +def test_simple_dupes_one_field_two_vals_1(): df1 = pl.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) df2 = pl.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) compare = PolarsCompare(df1, df2, join_columns=["a"]) assert compare.matches() # Just render the report to make sure it renders. - t = compare.report() + compare.report() -def test_simple_dupes_one_field_two_vals(): +def test_simple_dupes_one_field_two_vals_2(): df1 = pl.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) df2 = pl.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 0}]) compare = PolarsCompare(df1, df2, join_columns=["a"]) @@ -652,7 +653,7 @@ def test_simple_dupes_one_field_two_vals(): assert len(compare.df2_unq_rows) == 1 assert len(compare.intersect_rows) == 1 # Just render the report to make sure it renders. - t = compare.report() + compare.report() def test_simple_dupes_one_field_three_to_two_vals(): @@ -664,7 +665,7 @@ def test_simple_dupes_one_field_three_to_two_vals(): assert len(compare.df2_unq_rows) == 0 assert len(compare.intersect_rows) == 2 # Just render the report to make sure it renders. - t = compare.report() + compare.report() assert "(First 1 Columns)" in compare.report(column_count=1) assert "(First 2 Columns)" in compare.report(column_count=2) @@ -703,8 +704,8 @@ def test_dupes_from_real_data(): ) assert compare_unq.matches() # Just render the report to make sure it renders. 
- t = compare_acct.report() - r = compare_unq.report() + compare_acct.report() + compare_unq.report() def test_strings_with_joins_with_ignore_spaces(): diff --git a/tests/test_spark.py b/tests/test_spark.py index 8f789fa4..937396ec 100644 --- a/tests/test_spark.py +++ b/tests/test_spark.py @@ -428,9 +428,9 @@ def test_infinity_and_beyond(): def test_compare_df_setter_bad(): df = ps.DataFrame([{"a": 1, "c": 2}, {"a": 2, "c": 2}]) with raises(TypeError, match="df1 must be a pyspark.pandas.frame.DataFrame"): - compare = SparkCompare("a", "a", ["a"]) + SparkCompare("a", "a", ["a"]) with raises(ValueError, match="df1 must have all columns from join_columns"): - compare = SparkCompare(df, df.copy(), ["b"]) + SparkCompare(df, df.copy(), ["b"]) df_dupe = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 3}]) assert ( SparkCompare(df_dupe, df_dupe.copy(), ["a", "b"]) @@ -624,7 +624,7 @@ def test_temp_column_name_one_has(): @pandas_version -def test_temp_column_name_both_have(): +def test_temp_column_name_both_have_temp_1(): df1 = ps.DataFrame([{"_temp_0": "hi", "b": 2}, {"_temp_0": "bye", "b": 2}]) df2 = ps.DataFrame( [ @@ -638,7 +638,7 @@ def test_temp_column_name_both_have(): @pandas_version -def test_temp_column_name_both_have(): +def test_temp_column_name_both_have_temp_2(): df1 = ps.DataFrame([{"_temp_0": "hi", "b": 2}, {"_temp_0": "bye", "b": 2}]) df2 = ps.DataFrame( [ @@ -673,7 +673,7 @@ def test_simple_dupes_one_field(): compare = SparkCompare(df1, df2, join_columns=["a"]) assert compare.matches() # Just render the report to make sure it renders. - t = compare.report() + compare.report() @pandas_version @@ -683,7 +683,7 @@ def test_simple_dupes_two_fields(): compare = SparkCompare(df1, df2, join_columns=["a", "b"]) assert compare.matches() # Just render the report to make sure it renders. - t = compare.report() + compare.report() @pandas_version @@ -693,7 +693,7 @@ def test_simple_dupes_one_field_two_vals_1(): compare = SparkCompare(df1, df2, join_columns=["a"]) assert compare.matches() # Just render the report to make sure it renders. - t = compare.report() + compare.report() @pandas_version @@ -706,7 +706,7 @@ def test_simple_dupes_one_field_two_vals_2(): assert len(compare.df2_unq_rows) == 1 assert len(compare.intersect_rows) == 1 # Just render the report to make sure it renders. - t = compare.report() + compare.report() @pandas_version @@ -719,7 +719,7 @@ def test_simple_dupes_one_field_three_to_two_vals(): assert len(compare.df2_unq_rows) == 0 assert len(compare.intersect_rows) == 2 # Just render the report to make sure it renders. - t = compare.report() + compare.report() assert "(First 1 Columns)" in compare.report(column_count=1) assert "(First 2 Columns)" in compare.report(column_count=2) @@ -759,8 +759,8 @@ def test_dupes_from_real_data(): ) assert compare_unq.matches() # Just render the report to make sure it renders. - t = compare_acct.report() - r = compare_unq.report() + compare_acct.report() + compare_unq.report() @pandas_version @@ -1321,4 +1321,4 @@ def test_unicode_columns(): compare = SparkCompare(df1, df2, join_columns=["例"]) assert compare.matches() # Just render the report to make sure it renders. 
- t = compare.report() + compare.report() From 5861594441270b276ce4b8ef83073b7ca205133f Mon Sep 17 00:00:00 2001 From: Faisal Date: Tue, 4 Jun 2024 12:35:53 -0300 Subject: [PATCH 5/8] adding in benchmark docs (#309) * adding in benchmark docs * Update docs/source/benchmark.rst Co-authored-by: Jacob Dawang --------- Co-authored-by: Jacob Dawang --- docs/source/benchmark.rst | 70 +++++++++++++++++++++++++++++++++ docs/source/img/benchmarks.png | Bin 0 -> 35703 bytes docs/source/index.rst | 1 + 3 files changed, 71 insertions(+) create mode 100644 docs/source/benchmark.rst create mode 100644 docs/source/img/benchmarks.png diff --git a/docs/source/benchmark.rst b/docs/source/benchmark.rst new file mode 100644 index 00000000..56365afa --- /dev/null +++ b/docs/source/benchmark.rst @@ -0,0 +1,70 @@ +Benchmarks +========== + +.. important:: + + - Minimal tuning was done + - Benchmarking is hard (and biased) + - Take with a grain of salt + + +We've tried to create some benchmarks to help users understand which DataFrames to use when. +Please take the results with a grain of salt! If you have ideas on how we can further improve +preformance please make an issue, we always welcome contributions. + + +Setup used +---------- + +Single Machine: + +- 16 CPUs +- 64GB RAM + +Distributed Spark: + +- 20 Executors +- 8 Cores +- 32GB RAM + +The Data +--------- + +The data (base, and compare) we generated was purely synthetic consisting of 10 columns: + +- 1 id (montonicly increasing) column used for joining +- 3 string columns +- 6 numeric columns + + +Table of mean benchmark times in seconds: + +=========== ======= ======= =============== =============== =============== +Number of pandas polars pandas on spark spark (fugue) spark (fugue) +rows (distributed) (single) (distributed) +=========== ======= ======= =============== =============== =============== +1000 0.025 0.025 15.2838 2.041 1.109 +100,000 0.196 0.120 11.1113 1.743 3.175 +10,000,000 18.804 11.330 20.6274 17.560 16.455 +50,000,000 96.494 62.827 57.5735 90.578 94.304 +100,000,000 DNR 127.194 96.3204 DNR 193.234 +500,000,000 DNR DNR 262.6094 DNR DNR +=========== ======= ======= =============== =============== =============== + +.. note:: DNR = Did not run + +.. image:: img/benchmarks.png + +TLDR +---- + +* Polars can handle a lot of data and is fast! + + * From our experiments we can see that on a 64GB machine it was able to process 100 Million records + +* The Pandas on Spark implementation will be slower for small to mediumish data. 
+ + * in the 100 Million + range it starts to shine, and due to its distributed + nature it can process vast amounts of data + +* The native Pandas version is best for small and medium data
diff --git a/docs/source/img/benchmarks.png b/docs/source/img/benchmarks.png new file mode 100644 index 0000000000000000000000000000000000000000..e5f80d6b8e7866a60746c7219deedd2ccad131b4 GIT binary patch literal 35703 [base85-encoded PNG payload omitted]
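For context, a minimal sketch of the kind of comparison these benchmarks time is shown below. It is illustrative only: the actual benchmark harness is not part of this patch, and the column names, sizes, and timing loop are assumptions based on the data description in ``benchmark.rst`` (1 monotonically increasing id, 3 string columns, 6 numeric columns).

```python
import time

import numpy as np
import pandas as pd

import datacompy


def build_frame(rows: int, seed: int) -> pd.DataFrame:
    """Synthetic frame: 1 monotonically increasing id, 3 string columns, 6 numeric columns."""
    rng = np.random.default_rng(seed)
    data = {"id": np.arange(rows)}  # join key
    for i in range(3):
        data[f"str_{i}"] = rng.choice(list("abcd"), size=rows)
    for i in range(6):
        data[f"num_{i}"] = rng.random(rows)
    return pd.DataFrame(data)


# illustrative sizes only; the published table goes up to 500 million rows
base = build_frame(100_000, seed=0)
compare = build_frame(100_000, seed=1)

start = time.perf_counter()
cmp = datacompy.Compare(base, compare, join_columns="id")
cmp.report()  # build the full comparison report
print(f"elapsed: {time.perf_counter() - start:.3f}s")
```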
zCZyQzU=hl?6&B6xuvem4Jr5z~{zxHkv5au(4HW9#nJQ%%QEmp?cJ=l)&xVfzVJn_X zEg(-kdE;6}RFtr{Een{O>8yNXzexvBVfYZh)@vZM2J|0LQjDr|6o4ccD^g{+bW7p1 zeQWvi)#d5$U!978$%KfEJb2)Hf9vX=O3~4Kaky$&t5s>zkE2fT6GuomRU$ARfFfNu zb@Lc`6FR+=BlVo{U00*nTU0c(7~2FUZR!l|g)^WA@_QQ_M)63Zpgg&hrky*W;F%Fn zKSokYs=dZ?+~kuIIgOsLtZc+hD*!CV?Q_#$!v~aHao+yfJz1w&ZT3$oZSfUr_`_6^ zI{%%@u%~;fSj6v%B2{!CzoCq?yyYm|+F2~3sRekYe6)MIJDsfu^*GusQZ3E`0L&a? zMJ|p8-#louOPB6&T+?qe4GvT>F)?e#mLMT2N=o7Dk4+DA*dwW|)L=aF923_qzqHq=$F#%gE$asOVx~|EaqHAjemupT%7p!J*{uoG=PQoklX<1`(Jb(YXbci$j%_hEu= zPPh6%54xuTMC~v7pO0IBYE)%Stfm8fmJUWI(z3eaWcg<+pWh6OeX9xO-5jA1AFSyG zE*k!fuBnxFbhJJ=*o~a}Ah;Se-WwTZwdCgkI--o%p*%7%p1LE3LR_W`%K!4^%Q8-} z@E6hOD4}Dxn^{Pfmq1ONHFw|{{svefYVp`wzish+#oEF5OT#@4JG;u_*R@yT1zjky ziOJOar(*yegKyfF0WNn8v|i~7vtw8YyN(RTB&I&tww$TT?XCQebFNd;M{T#~y68)# z_NrK|G&wa{tDypZ})$??kz)^$Q>qz@lcnZM+tD*foJy)f?v4xQa1rC^{ztYd>t1f zs| Polars Usage Fugue Usage + Benchmarks Developer Instructions .. toctree:: From 0292567cdc3b0ce2112752e806c0c58c57716bca Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 7 Jun 2024 11:47:12 -0600 Subject: [PATCH 6/8] [edgetest] automated change (#311) Co-authored-by: fdosani --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0d619ec3..f4d5866d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ maintainers = [ { name="Faisal Dosani", email="faisal.dosani@capitalone.com" } ] license = {text = "Apache Software License"} -dependencies = ["pandas<=2.2.2,>=0.25.0", "numpy<=1.26.4,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "fugue<=0.9.0,>=0.8.7", "polars<=0.20.30,>=0.20.4"] +dependencies = ["pandas<=2.2.2,>=0.25.0", "numpy<=1.26.4,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "fugue<=0.9.0,>=0.8.7", "polars<=0.20.31,>=0.20.4"] requires-python = ">=3.9.0" classifiers = [ "Intended Audience :: Developers", From afeed8b5aa7160b1ed4b3b6aeaa73988a1af2fe8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 11:45:06 -0600 Subject: [PATCH 7/8] [edgetest] automated change (#313) Co-authored-by: fdosani --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f4d5866d..1b4c1c23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ maintainers = [ { name="Faisal Dosani", email="faisal.dosani@capitalone.com" } ] license = {text = "Apache Software License"} -dependencies = ["pandas<=2.2.2,>=0.25.0", "numpy<=1.26.4,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "fugue<=0.9.0,>=0.8.7", "polars<=0.20.31,>=0.20.4"] +dependencies = ["pandas<=2.2.2,>=0.25.0", "numpy<=1.26.4,>=1.22.0", "ordered-set<=4.1.0,>=4.0.2", "fugue<=0.9.1,>=0.8.7", "polars<=0.20.31,>=0.20.4"] requires-python = ">=3.9.0" classifiers = [ "Intended Audience :: Developers", From 537cd7dd7b507989af84eac0209cf23506f19144 Mon Sep 17 00:00:00 2001 From: Faisal Date: Thu, 20 Jun 2024 15:29:45 -0300 Subject: [PATCH 8/8] SparkSQLCompare and spark submodule (#310) * [WIP] vanilla spark * [WIP] fixing tests and logic * [WIP] __index cleanup * updating pyspark.sql logic and fixing tests * restructuring spark logic into submodule and typing * remove pandas 2 restriction for spark sql * fix for sql call * updating docs * updating benchmarks with pyspark dataframe * relative imports and linting * relative imports and linting * feedback from review, switch to monotonic and simplify checks * allow 
pyspark.sql.connect.dataframe.DataFrame * checking version for spark connect * typo fix * adding import * adding connect extras * adding connect extras --- .github/workflows/test-package.yml | 2 +- .gitignore | 2 +- README.md | 34 +- datacompy/__init__.py | 11 +- datacompy/core.py | 2 +- datacompy/polars.py | 23 +- datacompy/spark/__init__.py | 0 datacompy/{ => spark}/legacy.py | 0 datacompy/{spark.py => spark/pandas.py} | 135 +- datacompy/spark/sql.py | 1204 +++++++++++++++ docs/source/benchmark.rst | 26 +- docs/source/conf.py | 3 +- docs/source/img/benchmarks.png | Bin 35703 -> 39529 bytes docs/source/index.rst | 2 +- docs/source/spark_usage.rst | 67 +- pyproject.toml | 2 +- tests/__init__.py | 0 tests/{ => test_spark}/test_legacy_spark.py | 2 +- .../test_pandas_spark.py} | 124 +- tests/test_spark/test_sql_spark.py | 1343 +++++++++++++++++ 20 files changed, 2800 insertions(+), 182 deletions(-) create mode 100644 datacompy/spark/__init__.py rename datacompy/{ => spark}/legacy.py (100%) rename datacompy/{spark.py => spark/pandas.py} (91%) create mode 100644 datacompy/spark/sql.py create mode 100644 tests/__init__.py rename tests/{ => test_spark}/test_legacy_spark.py (99%) rename tests/{test_spark.py => test_spark/test_pandas_spark.py} (92%) create mode 100644 tests/test_spark/test_sql_spark.py diff --git a/.github/workflows/test-package.yml b/.github/workflows/test-package.yml index 7b3e6ba6..b710d6e5 100644 --- a/.github/workflows/test-package.yml +++ b/.github/workflows/test-package.yml @@ -49,7 +49,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install pytest pytest-spark pypandoc - python -m pip install pyspark==${{ matrix.spark-version }} + python -m pip install pyspark[connect]==${{ matrix.spark-version }} python -m pip install pandas==${{ matrix.pandas-version }} python -m pip install .[dev] - name: Test with pytest diff --git a/.gitignore b/.gitignore index e94d2315..2e0f6e3a 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,4 @@ docs/source/api/ #edgetest .edgetest/ -tmp/ +.tmp/ diff --git a/README.md b/README.md index 23128f44..4de6c86f 100644 --- a/README.md +++ b/README.md @@ -39,14 +39,20 @@ pip install datacompy[ray] ### Legacy Spark Deprecation -#### Starting with version 0.12.0 +With version ``v0.12.0`` the original ``SparkCompare`` was replaced with a +Pandas on Spark implementation. The original ``SparkCompare`` implementation differs +from all the other native implementations. To align the API better, and keep behaviour +consistent we are deprecating the original ``SparkCompare`` into a new module ``LegacySparkCompare`` -The original ``SparkCompare`` implementation differs from all the other native implementations. To align the API better, and keep behaviour consistent we are deprecating ``SparkCompare`` into a new module ``LegacySparkCompare`` +Subsequently in ``v0.13.0`` a PySaprk DataFrame class has been introduced (``SparkSQLCompare``) +which accepts ``pyspark.sql.DataFrame`` and should provide better performance. With this version +the Pandas on Spark implementation has been renamed to ``SparkPandasCompare`` and all the spark +logic is now under the ``spark`` submodule. 
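As a rough illustration of the new class described above, usage might look like the sketch below. This is an assumption based on the README wording rather than a signature confirmed by this diff; in particular, passing the active ``SparkSession`` as the first argument is assumed.

```python
from pyspark.sql import SparkSession

from datacompy import SparkSQLCompare

spark = SparkSession.builder.getOrCreate()

df1 = spark.createDataFrame([(1, "a", 10.0), (2, "b", 20.0)], ["id", "name", "amount"])
df2 = spark.createDataFrame([(1, "a", 10.0), (2, "b", 21.0)], ["id", "name", "amount"])

# Assumed constructor: the session, the two pyspark.sql.DataFrame objects, and the join key(s).
comparison = SparkSQLCompare(spark, df1, df2, join_columns="id")
print(comparison.report())
```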
-If you wish to use the old SparkCompare moving forward you can +If you wish to use the old SparkCompare moving forward you can import it like so: ```python -from datacompy.legacy import LegacySparkCompare +from datacompy.spark.legacy import LegacySparkCompare ``` #### Supported versions and dependncies @@ -55,11 +61,6 @@ Different versions of Spark, Pandas, and Python interact differently. Below is a With the move to Pandas on Spark API and compatability issues with Pandas 2+ we will for the mean time note support Pandas 2 with the Pandas on Spark implementation. Spark plans to support Pandas 2 in [Spark 4](https://issues.apache.org/jira/browse/SPARK-44101) -With version ``0.12.0``: -- Not support Pandas ``2.0.0`` For the native Spark implemention -- Spark ``3.1`` support will be dropped -- Python ``3.8`` support is dropped - | | Spark 3.2.4 | Spark 3.3.4 | Spark 3.4.2 | Spark 3.5.1 | |-------------|-------------|-------------|-------------|-------------| @@ -69,11 +70,12 @@ With version ``0.12.0``: | Python 3.12 | ❌ | ❌ | ❌ | ❌ | -| | Pandas < 1.5.3 | Pandas >=2.0.0 | -|---------------|----------------|----------------| -| Native Pandas | ✅ | ✅ | -| Native Spark | ✅ | ❌ | -| Fugue | ✅ | ✅ | +| | Pandas < 1.5.3 | Pandas >=2.0.0 | +|------------------------|----------------|----------------| +| ``Compare`` | ✅ | ✅ | +| ``SparkPandasCompare`` | ✅ | ❌ | +| ``SparkSQLCompare`` | ✅ | ✅ | +| Fugue | ✅ | ✅ | @@ -85,8 +87,8 @@ With version ``0.12.0``: ## Supported backends - Pandas: ([See documentation](https://capitalone.github.io/datacompy/pandas_usage.html)) -- Spark (Pandas on Spark API): ([See documentation](https://capitalone.github.io/datacompy/spark_usage.html)) -- Polars (Experimental): ([See documentation](https://capitalone.github.io/datacompy/polars_usage.html)) +- Spark: ([See documentation](https://capitalone.github.io/datacompy/spark_usage.html)) +- Polars: ([See documentation](https://capitalone.github.io/datacompy/polars_usage.html)) - Fugue is a Python library that provides a unified interface for data processing on Pandas, DuckDB, Polars, Arrow, Spark, Dask, Ray, and many other backends. DataComPy integrates with Fugue to provide a simple way to compare data across these backends. Please note that Fugue will use the Pandas (Native) logic at its lowest level diff --git a/datacompy/__init__.py b/datacompy/__init__.py index 6af6e81c..25514adf 100644 --- a/datacompy/__init__.py +++ b/datacompy/__init__.py @@ -13,13 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.12.1" +__version__ = "0.13.0" import platform from warnings import warn -from datacompy.core import * -from datacompy.fugue import ( +from .core import * # noqa: F403 +from .fugue import ( # noqa: F401 all_columns_match, all_rows_overlap, count_matching_rows, @@ -28,8 +28,9 @@ report, unq_columns, ) -from datacompy.polars import PolarsCompare -from datacompy.spark import SparkCompare +from .polars import PolarsCompare # noqa: F401 +from .spark.pandas import SparkPandasCompare # noqa: F401 +from .spark.sql import SparkSQLCompare # noqa: F401 major = platform.python_version_tuple()[0] minor = platform.python_version_tuple()[1] diff --git a/datacompy/core.py b/datacompy/core.py index d10967a8..d07cac96 100644 --- a/datacompy/core.py +++ b/datacompy/core.py @@ -29,7 +29,7 @@ import pandas as pd from ordered_set import OrderedSet -from datacompy.base import BaseCompare, temp_column_name +from .base import BaseCompare, temp_column_name LOG = logging.getLogger(__name__) diff --git a/datacompy/polars.py b/datacompy/polars.py index 5b0d22cb..3dbf82ff 100644 --- a/datacompy/polars.py +++ b/datacompy/polars.py @@ -29,7 +29,7 @@ import numpy as np from ordered_set import OrderedSet -from datacompy.base import BaseCompare, temp_column_name +from .base import BaseCompare, temp_column_name try: import polars as pl @@ -279,16 +279,19 @@ def _dataframe_merge(self, ignore_spaces: bool) -> None: # process merge indicator outer_join = outer_join.with_columns( pl.when( - (pl.col("_merge_left") == True) & (pl.col("_merge_right") == True) - ) # noqa: E712 + (pl.col("_merge_left") == True) + & (pl.col("_merge_right") == True) # noqa: E712 + ) .then(pl.lit("both")) .when( - (pl.col("_merge_left") == True) & (pl.col("_merge_right").is_null()) - ) # noqa: E712 + (pl.col("_merge_left") == True) + & (pl.col("_merge_right").is_null()) # noqa: E712 + ) .then(pl.lit("left_only")) .when( - (pl.col("_merge_left").is_null()) & (pl.col("_merge_right") == True) - ) # noqa: E712 + (pl.col("_merge_left").is_null()) + & (pl.col("_merge_right") == True) # noqa: E712 + ) .then(pl.lit("right_only")) .alias("_merge") ) @@ -504,10 +507,8 @@ def sample_mismatch( match_cnt = col_match.sum() sample_count = min(sample_count, row_cnt - match_cnt) # type: ignore sample = self.intersect_rows.filter( - pl.col(column + "_match") != True - ).sample( # noqa: E712 - sample_count - ) + pl.col(column + "_match") != True # noqa: E712 + ).sample(sample_count) return_cols = self.join_columns + [ column + "_" + self.df1_name, column + "_" + self.df2_name, diff --git a/datacompy/spark/__init__.py b/datacompy/spark/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/datacompy/legacy.py b/datacompy/spark/legacy.py similarity index 100% rename from datacompy/legacy.py rename to datacompy/spark/legacy.py diff --git a/datacompy/spark.py b/datacompy/spark/pandas.py similarity index 91% rename from datacompy/spark.py rename to datacompy/spark/pandas.py index aad9f11e..09c983cf 100644 --- a/datacompy/spark.py +++ b/datacompy/spark/pandas.py @@ -23,11 +23,12 @@ import logging import os +from typing import List, Optional, Union import pandas as pd from ordered_set import OrderedSet -from datacompy.base import BaseCompare, temp_column_name +from ..base import BaseCompare, temp_column_name try: import pyspark.pandas as ps @@ -39,7 +40,7 @@ LOG = logging.getLogger(__name__) -class SparkCompare(BaseCompare): +class SparkPandasCompare(BaseCompare): """Comparison class to be used to compare whether two Pandas on Spark dataframes 
are equal. Both df1 and df2 should be dataframes containing all of the join_columns, @@ -83,17 +84,17 @@ class SparkCompare(BaseCompare): def __init__( self, - df1, - df2, - join_columns, - abs_tol=0, - rel_tol=0, - df1_name="df1", - df2_name="df2", - ignore_spaces=False, - ignore_case=False, - cast_column_names_lower=True, - ): + df1: "ps.DataFrame", + df2: "ps.DataFrame", + join_columns: Union[List[str], str], + abs_tol: float = 0, + rel_tol: float = 0, + df1_name: str = "df1", + df2_name: str = "df2", + ignore_spaces: bool = False, + ignore_case: bool = False, + cast_column_names_lower: bool = True, + ) -> None: if pd.__version__ >= "2.0.0": raise Exception( "It seems like you are running Pandas 2+. Please note that Pandas 2+ will only be supported in Spark 4+. See: https://issues.apache.org/jira/browse/SPARK-44101. If you need to use Spark DataFrame with Pandas 2+ then consider using Fugue otherwise downgrade to Pandas 1.5.3" @@ -115,7 +116,7 @@ def __init__( for col in join_columns ] - self._any_dupes = False + self._any_dupes: bool = False self.df1 = df1 self.df2 = df2 self.df1_name = df1_name @@ -125,15 +126,15 @@ def __init__( self.ignore_spaces = ignore_spaces self.ignore_case = ignore_case self.df1_unq_rows = self.df2_unq_rows = self.intersect_rows = None - self.column_stats = [] + self.column_stats: List = [] self._compare(ignore_spaces, ignore_case) @property - def df1(self): + def df1(self) -> "ps.DataFrame": return self._df1 @df1.setter - def df1(self, df1): + def df1(self, df1: "ps.DataFrame") -> None: """Check that it is a dataframe and has the join columns""" self._df1 = df1 self._validate_dataframe( @@ -141,18 +142,20 @@ def df1(self, df1): ) @property - def df2(self): + def df2(self) -> "ps.DataFrame": return self._df2 @df2.setter - def df2(self, df2): + def df2(self, df2: "ps.DataFrame") -> None: """Check that it is a dataframe and has the join columns""" self._df2 = df2 self._validate_dataframe( "df2", cast_column_names_lower=self.cast_column_names_lower ) - def _validate_dataframe(self, index, cast_column_names_lower=True): + def _validate_dataframe( + self, index: str, cast_column_names_lower: bool = True + ) -> None: """Check that it is a dataframe and has the join columns Parameters @@ -161,6 +164,10 @@ def _validate_dataframe(self, index, cast_column_names_lower=True): The "index" of the dataframe - df1 or df2. cast_column_names_lower: bool, optional Boolean indicator that controls of column names will be cast into lower case + + Return + ------ + None """ dataframe = getattr(self, index) if not isinstance(dataframe, (ps.DataFrame)): @@ -180,7 +187,7 @@ def _validate_dataframe(self, index, cast_column_names_lower=True): if len(dataframe.drop_duplicates(subset=self.join_columns)) < len(dataframe): self._any_dupes = True - def _compare(self, ignore_spaces, ignore_case): + def _compare(self, ignore_spaces: bool, ignore_case: bool) -> None: """Actually run the comparison. This tries to run df1.equals(df2) first so that if they're truly equal we can tell. 
@@ -216,19 +223,19 @@ def _compare(self, ignore_spaces, ignore_case): else: LOG.info("df1 does not match df2") - def df1_unq_columns(self): + def df1_unq_columns(self) -> OrderedSet[str]: """Get columns that are unique to df1""" return OrderedSet(self.df1.columns) - OrderedSet(self.df2.columns) - def df2_unq_columns(self): + def df2_unq_columns(self) -> OrderedSet[str]: """Get columns that are unique to df2""" return OrderedSet(self.df2.columns) - OrderedSet(self.df1.columns) - def intersect_columns(self): + def intersect_columns(self) -> OrderedSet[str]: """Get columns that are shared between the two dataframes""" return OrderedSet(self.df1.columns) & OrderedSet(self.df2.columns) - def _dataframe_merge(self, ignore_spaces): + def _dataframe_merge(self, ignore_spaces: bool) -> None: """Merge df1 to df2 on the join columns, to get df1 - df2, df2 - df1 and df1 & df2 """ @@ -301,17 +308,17 @@ def _dataframe_merge(self, ignore_spaces): # process merge indicator outer_join["_merge"] = outer_join._merge.mask( - (outer_join["_merge_left"] == True) + (outer_join["_merge_left"] == True) # noqa: E712 & (outer_join["_merge_right"] == True), # noqa: E712 "both", ) outer_join["_merge"] = outer_join._merge.mask( - (outer_join["_merge_left"] == True) + (outer_join["_merge_left"] == True) # noqa: E712 & (outer_join["_merge_right"] != True), # noqa: E712 "left_only", ) outer_join["_merge"] = outer_join._merge.mask( - (outer_join["_merge_left"] != True) + (outer_join["_merge_left"] != True) # noqa: E712 & (outer_join["_merge_right"] == True), # noqa: E712 "right_only", ) @@ -364,7 +371,7 @@ def _dataframe_merge(self, ignore_spaces): # cache self.intersect_rows.spark.cache() - def _intersect_compare(self, ignore_spaces, ignore_case): + def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None: """Run the comparison on the intersect dataframe This loops through all columns that are shared between df1 and df2, and @@ -372,6 +379,8 @@ def _intersect_compare(self, ignore_spaces, ignore_case): otherwise. """ LOG.debug("Comparing intersection") + max_diff: float + null_diff: int row_cnt = len(self.intersect_rows) for column in self.intersect_columns(): if column in self.join_columns: @@ -430,11 +439,11 @@ def _intersect_compare(self, ignore_spaces, ignore_case): } ) - def all_columns_match(self): + def all_columns_match(self) -> bool: """Whether the columns all match in the dataframes""" return self.df1_unq_columns() == self.df2_unq_columns() == set() - def all_rows_overlap(self): + def all_rows_overlap(self) -> bool: """Whether the rows are all present in both dataframes Returns @@ -445,7 +454,7 @@ def all_rows_overlap(self): """ return len(self.df1_unq_rows) == len(self.df2_unq_rows) == 0 - def count_matching_rows(self): + def count_matching_rows(self) -> bool: """Count the number of rows match (on overlapping fields) Returns @@ -469,12 +478,12 @@ def count_matching_rows(self): match_columns_count = 0 return match_columns_count - def intersect_rows_match(self): + def intersect_rows_match(self) -> bool: """Check whether the intersect rows all match""" actual_length = self.intersect_rows.shape[0] return self.count_matching_rows() == actual_length - def matches(self, ignore_extra_columns=False): + def matches(self, ignore_extra_columns: bool = False) -> bool: """Return True or False if the dataframes match. 
Parameters @@ -491,12 +500,17 @@ def matches(self, ignore_extra_columns=False): else: return True - def subset(self): + def subset(self) -> bool: """Return True if dataframe 2 is a subset of dataframe 1. Dataframe 2 is considered a subset if all of its columns are in dataframe 1, and all of its rows match rows in dataframe 1 for the shared columns. + + Returns + ------- + bool + True if dataframe 2 is a subset of dataframe 1. """ if not self.df2_unq_columns() == set(): return False @@ -507,7 +521,9 @@ def subset(self): else: return True - def sample_mismatch(self, column, sample_count=10, for_display=False): + def sample_mismatch( + self, column: str, sample_count: int = 10, for_display: bool = False + ) -> "ps.DataFrame": """Returns a sample sub-dataframe which contains the identifying columns, and df1 and df2 versions of the column. @@ -549,7 +565,7 @@ def sample_mismatch(self, column, sample_count=10, for_display=False): ] return to_return - def all_mismatch(self, ignore_matching_cols=False): + def all_mismatch(self, ignore_matching_cols: bool = False) -> "ps.DataFrame": """All rows with any columns that have a mismatch. Returns all df1 and df2 versions of the columns and join columns. @@ -603,7 +619,12 @@ def all_mismatch(self, ignore_matching_cols=False): return self.intersect_rows[~mm_bool][updated_join_columns + return_list] - def report(self, sample_count=10, column_count=10, html_file=None): + def report( + self, + sample_count: int = 10, + column_count: int = 10, + html_file: Optional[str] = None, + ) -> str: """Returns a string representation of a report. The representation can then be printed or saved to a file. @@ -755,7 +776,7 @@ def report(self, sample_count=10, column_count=10, html_file=None): return report -def render(filename, *fields): +def render(filename: str, *fields: Union[int, float, str]) -> str: """Renders out an individual template. This basically just reads in a template file, and applies ``.format()`` on the fields. @@ -773,13 +794,18 @@ def render(filename, *fields): The fully rendered out file. """ this_dir = os.path.dirname(os.path.realpath(__file__)) - with open(os.path.join(this_dir, "templates", filename)) as file_open: + with open(os.path.join(this_dir, "..", "templates", filename)) as file_open: return file_open.read().format(*fields) def columns_equal( - col_1, col_2, rel_tol=0, abs_tol=0, ignore_spaces=False, ignore_case=False -): + col_1: "ps.Series", + col_2: "ps.Series", + rel_tol: float = 0, + abs_tol: float = 0, + ignore_spaces: bool = False, + ignore_case: bool = False, +) -> "ps.Series": """Compares two columns from a dataframe, returning a True/False series, with the same index as column 1. @@ -856,7 +882,9 @@ def columns_equal( return compare -def compare_string_and_date_columns(col_1, col_2): +def compare_string_and_date_columns( + col_1: "ps.Series", col_2: "ps.Series" +) -> "ps.Series": """Compare a string column and date column, value-wise. This tries to convert a string column to a date column and compare that way. 
@@ -892,7 +920,11 @@ def compare_string_and_date_columns(col_1, col_2): return compare -def get_merged_columns(original_df, merged_df, suffix): +def get_merged_columns( + original_df: "ps.DataFrame", + merged_df: "ps.DataFrame", + suffix: str, +) -> List[str]: """Gets the columns from an original dataframe, in the new merged dataframe Parameters @@ -904,6 +936,11 @@ def get_merged_columns(original_df, merged_df, suffix): suffix : str What suffix was used to distinguish when the original dataframe was overlapping with the other merged dataframe. + + Returns + ------- + List[str] + Column list of the original dataframe pre suffix """ columns = [] for col in original_df.columns: @@ -916,7 +953,7 @@ def get_merged_columns(original_df, merged_df, suffix): return columns -def calculate_max_diff(col_1, col_2): +def calculate_max_diff(col_1: "ps.DataFrame", col_2: "ps.DataFrame") -> float: """Get a maximum difference between two columns Parameters @@ -928,8 +965,8 @@ def calculate_max_diff(col_1, col_2): Returns ------- - Numeric - Numeric field, or zero. + float + max diff """ try: return (col_1.astype(float) - col_2.astype(float)).abs().max() @@ -937,7 +974,9 @@ def calculate_max_diff(col_1, col_2): return 0 -def generate_id_within_group(dataframe, join_columns): +def generate_id_within_group( + dataframe: "ps.DataFrame", join_columns: List[str] +) -> "ps.Series": """Generate an ID column that can be used to deduplicate identical rows. The series generated is the order within a unique group, and it handles nulls. diff --git a/datacompy/spark/sql.py b/datacompy/spark/sql.py new file mode 100644 index 00000000..21dd286d --- /dev/null +++ b/datacompy/spark/sql.py @@ -0,0 +1,1204 @@ +# +# Copyright 2024 Capital One Services, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Compare two PySpark SQL DataFrames + +Originally this package was meant to provide similar functionality to +PROC COMPARE in SAS - i.e. human-readable reporting on the difference between +two dataframes. +""" + +import logging +import os +from copy import deepcopy +from typing import List, Optional, Tuple, Union + +import pandas as pd +from ordered_set import OrderedSet + +from ..base import BaseCompare, temp_column_name + +try: + import pyspark.sql + from pyspark.sql import Window + from pyspark.sql.functions import ( + abs, + array, + array_contains, + col, + isnan, + isnull, + lit, + monotonically_increasing_id, + row_number, + trim, + upper, + when, + ) + from pyspark.version import __version__ + +except ImportError: + pass # Let non-Spark people at least enjoy the loveliness of the spark sql datacompy functionality + + +LOG = logging.getLogger(__name__) + + +# Used for checking equality with decimal(X, Y) types. Otherwise treated as the string "decimal". 
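# Note (illustrative, not part of the original change): because DecimalComparator below
# overrides __eq__ to match on the "decimal" prefix only, a concrete dtype string such as
# "decimal(38,2)" compares equal to it, and therefore tests as a member of NUMERIC_SPARK_TYPES.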
+def decimal_comparator(): + class DecimalComparator(str): + def __eq__(self, other): + return len(other) >= 7 and other[0:7] == "decimal" + + return DecimalComparator("decimal") + + +NUMERIC_SPARK_TYPES = [ + "tinyint", + "smallint", + "int", + "bigint", + "float", + "double", + decimal_comparator(), +] + + +class SparkSQLCompare(BaseCompare): + """Comparison class to be used to compare whether two Spark SQL dataframes are equal. + + Both df1 and df2 should be dataframes containing all of the join_columns, + with unique column names. Differences between values are compared to + abs_tol + rel_tol * abs(df2['value']). + + Parameters + ---------- + spark_session : pyspark.sql.SparkSession + A ``SparkSession`` to be used to execute Spark commands in the comparison. + df1 : pyspark.sql.DataFrame + First dataframe to check + df2 : pyspark.sql.DataFrame + Second dataframe to check + join_columns : list or str, optional + Column(s) to join dataframes on. If a string is passed in, that one + column will be used. + abs_tol : float, optional + Absolute tolerance between two values. + rel_tol : float, optional + Relative tolerance between two values. + df1_name : str, optional + A string name for the first dataframe. This allows the reporting to + print out an actual name instead of "df1", and allows human users to + more easily track the dataframes. + df2_name : str, optional + A string name for the second dataframe + ignore_spaces : bool, optional + Flag to strip whitespace (including newlines) from string columns (including any join + columns) + ignore_case : bool, optional + Flag to ignore the case of string columns + cast_column_names_lower: bool, optional + Boolean indicator that controls of column names will be cast into lower case + + Attributes + ---------- + df1_unq_rows : pyspark.sql.DataFrame + All records that are only in df1 (based on a join on join_columns) + df2_unq_rows : pyspark.sql.DataFrame + All records that are only in df2 (based on a join on join_columns) + intersect_rows : pyspark.sql.DataFrame + All records that are in both df1 and df2 + """ + + def __init__( + self, + spark_session: "pyspark.sql.SparkSession", + df1: "pyspark.sql.DataFrame", + df2: "pyspark.sql.DataFrame", + join_columns: Union[List[str], str], + abs_tol: float = 0, + rel_tol: float = 0, + df1_name: str = "df1", + df2_name: str = "df2", + ignore_spaces: bool = False, + ignore_case: bool = False, + cast_column_names_lower: bool = True, + ) -> None: + self.cast_column_names_lower = cast_column_names_lower + if isinstance(join_columns, (str, int, float)): + self.join_columns = [ + ( + str(join_columns).lower() + if self.cast_column_names_lower + else str(join_columns) + ) + ] + else: + self.join_columns = [ + str(col).lower() if self.cast_column_names_lower else str(col) + for col in join_columns + ] + + self.spark_session = spark_session + self._any_dupes: bool = False + self.df1 = df1 + self.df2 = df2 + self.df1_name = df1_name + self.df2_name = df2_name + self.abs_tol = abs_tol + self.rel_tol = rel_tol + self.ignore_spaces = ignore_spaces + self.ignore_case = ignore_case + self.df1_unq_rows: "pyspark.sql.DataFrame" + self.df2_unq_rows: "pyspark.sql.DataFrame" + self.intersect_rows: "pyspark.sql.DataFrame" + self.column_stats: List = [] + self._compare(ignore_spaces=ignore_spaces, ignore_case=ignore_case) + + @property + def df1(self) -> "pyspark.sql.DataFrame": + return self._df1 + + @df1.setter + def df1(self, df1: "pyspark.sql.DataFrame") -> None: + """Check that it is a dataframe and has the join 
columns""" + self._df1 = df1 + self._validate_dataframe( + "df1", cast_column_names_lower=self.cast_column_names_lower + ) + + @property + def df2(self) -> "pyspark.sql.DataFrame": + return self._df2 + + @df2.setter + def df2(self, df2: "pyspark.sql.DataFrame") -> None: + """Check that it is a dataframe and has the join columns""" + self._df2 = df2 + self._validate_dataframe( + "df2", cast_column_names_lower=self.cast_column_names_lower + ) + + def _validate_dataframe( + self, index: str, cast_column_names_lower: bool = True + ) -> None: + """Check that it is a dataframe and has the join columns + + Parameters + ---------- + index : str + The "index" of the dataframe - df1 or df2. + cast_column_names_lower: bool, optional + Boolean indicator that controls of column names will be cast into lower case + + Return + ------ + None + """ + dataframe = getattr(self, index) + + if __version__ >= "3.4.0": + import pyspark.sql.connect.dataframe + + instances = (pyspark.sql.DataFrame, pyspark.sql.connect.dataframe.DataFrame) + else: + import pyspark.sql + + instances = pyspark.sql.DataFrame + + if not isinstance(dataframe, instances): + raise TypeError( + f"{index} must be a pyspark.sql.DataFrame or pyspark.sql.connect.dataframe.DataFrame (Spark 3.4.0 and above)" + ) + + if cast_column_names_lower: + if index == "df1": + self._df1 = dataframe.toDF( + *[str(col).lower() for col in dataframe.columns] + ) + if index == "df2": + self._df2 = dataframe.toDF( + *[str(col).lower() for col in dataframe.columns] + ) + + # Check if join_columns are present in the dataframe + dataframe = getattr(self, index) # refresh + if not set(self.join_columns).issubset(set(dataframe.columns)): + raise ValueError(f"{index} must have all columns from join_columns") + + if len(set(dataframe.columns)) < len(dataframe.columns): + raise ValueError(f"{index} must have unique column names") + + if ( + dataframe.drop_duplicates(subset=self.join_columns).count() + < dataframe.count() + ): + self._any_dupes = True + + def _compare(self, ignore_spaces: bool, ignore_case: bool) -> None: + """Actually run the comparison. This tries to run df1.equals(df2) + first so that if they're truly equal we can tell. + + This method will log out information about what is different between + the two dataframes, and will also return a boolean. 
+ """ + LOG.info(f"Number of columns in common: {len(self.intersect_columns())}") + LOG.debug("Checking column overlap") + for col in self.df1_unq_columns(): + LOG.info(f"Column in df1 and not in df2: {col}") + LOG.info( + f"Number of columns in df1 and not in df2: {len(self.df1_unq_columns())}" + ) + for col in self.df2_unq_columns(): + LOG.info(f"Column in df2 and not in df1: {col}") + LOG.info( + f"Number of columns in df2 and not in df1: {len(self.df2_unq_columns())}" + ) + + LOG.debug("Merging dataframes") + self._dataframe_merge(ignore_spaces) + self._intersect_compare(ignore_spaces, ignore_case) + + if self.matches(): + LOG.info("df1 matches df2") + else: + LOG.info("df1 does not match df2") + + def df1_unq_columns(self) -> OrderedSet[str]: + """Get columns that are unique to df1""" + return OrderedSet(self.df1.columns) - OrderedSet(self.df2.columns) + + def df2_unq_columns(self) -> OrderedSet[str]: + """Get columns that are unique to df2""" + return OrderedSet(self.df2.columns) - OrderedSet(self.df1.columns) + + def intersect_columns(self) -> OrderedSet[str]: + """Get columns that are shared between the two dataframes""" + return OrderedSet(self.df1.columns) & OrderedSet(self.df2.columns) + + def _dataframe_merge(self, ignore_spaces: bool) -> None: + """Merge df1 to df2 on the join columns, to get df1 - df2, df2 - df1 + and df1 & df2 + """ + LOG.debug("Outer joining") + + df1 = self.df1 + df2 = self.df2 + temp_join_columns = deepcopy(self.join_columns) + + if self._any_dupes: + LOG.debug("Duplicate rows found, deduping by order of remaining fields") + # setting internal index + LOG.info("Adding internal index to dataframes") + df1 = df1.withColumn("__index", monotonically_increasing_id()) + df2 = df2.withColumn("__index", monotonically_increasing_id()) + + # Create order column for uniqueness of match + order_column = temp_column_name(df1, df2) + df1 = df1.join( + _generate_id_within_group(df1, temp_join_columns, order_column), + on="__index", + how="inner", + ).drop("__index") + df2 = df2.join( + _generate_id_within_group(df2, temp_join_columns, order_column), + on="__index", + how="inner", + ).drop("__index") + temp_join_columns.append(order_column) + + # drop index + LOG.info("Dropping internal index") + df1 = df1.drop("__index") + df2 = df2.drop("__index") + + params = {"on": temp_join_columns} + + if ignore_spaces: + for column in self.join_columns: + if [dtype for name, dtype in df1.dtypes if name == column][ + 0 + ] == "string": + df1 = df1.withColumn(column, trim(col(column))) + if [dtype for name, dtype in df2.dtypes if name == column][ + 0 + ] == "string": + df2 = df2.withColumn(column, trim(col(column))) + + df1_non_join_columns = OrderedSet(df1.columns) - OrderedSet(temp_join_columns) + df2_non_join_columns = OrderedSet(df2.columns) - OrderedSet(temp_join_columns) + + for c in df1_non_join_columns: + df1 = df1.withColumnRenamed(c, c + "_" + self.df1_name) + for c in df2_non_join_columns: + df2 = df2.withColumnRenamed(c, c + "_" + self.df2_name) + + # generate merge indicator + df1 = df1.withColumn("_merge_left", lit(True)) + df2 = df2.withColumn("_merge_right", lit(True)) + + for c in temp_join_columns: + df1 = df1.withColumnRenamed(c, c + "_" + self.df1_name) + df2 = df2.withColumnRenamed(c, c + "_" + self.df2_name) + + # cache + df1.cache() + df2.cache() + + # NULL SAFE Outer join using ON + df1.createOrReplaceTempView("df1") + df2.createOrReplaceTempView("df2") + on = " and ".join( + [ + f"df1.`{c}_{self.df1_name}` <=> df2.`{c}_{self.df2_name}`" + for c in 
params["on"] + ] + ) + outer_join = self.spark_session.sql( + """ + SELECT * FROM + df1 FULL OUTER JOIN df2 + ON + """ + + on + ) + + outer_join = outer_join.withColumn("_merge", lit(None)) # initialize col + + # process merge indicator + outer_join = outer_join.withColumn( + "_merge", + when( + (outer_join["_merge_left"] == True) # noqa: E712 + & (isnull(outer_join["_merge_right"])), + "left_only", + ) + .when( + (isnull(outer_join["_merge_left"])) + & (outer_join["_merge_right"] == True), # noqa: E712 + "right_only", + ) + .otherwise("both"), + ) + + # Clean up temp columns for duplicate row matching + if self._any_dupes: + outer_join = outer_join.drop( + *[ + order_column + "_" + self.df1_name, + order_column + "_" + self.df2_name, + ], + ) + df1 = df1.drop( + *[ + order_column + "_" + self.df1_name, + order_column + "_" + self.df2_name, + ], + ) + df2 = df2.drop( + *[ + order_column + "_" + self.df1_name, + order_column + "_" + self.df2_name, + ], + ) + + df1_cols = get_merged_columns(df1, outer_join, self.df1_name) + df2_cols = get_merged_columns(df2, outer_join, self.df2_name) + + LOG.debug("Selecting df1 unique rows") + self.df1_unq_rows = outer_join[outer_join["_merge"] == "left_only"][df1_cols] + + LOG.debug("Selecting df2 unique rows") + self.df2_unq_rows = outer_join[outer_join["_merge"] == "right_only"][df2_cols] + + LOG.info(f"Number of rows in df1 and not in df2: {self.df1_unq_rows.count()}") + LOG.info(f"Number of rows in df2 and not in df1: {self.df2_unq_rows.count()}") + + LOG.debug("Selecting intersecting rows") + self.intersect_rows = outer_join[outer_join["_merge"] == "both"] + LOG.info( + "Number of rows in df1 and df2 (not necessarily equal): {len(self.intersect_rows)}" + ) + # cache + self.intersect_rows.cache() + + def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None: + """Run the comparison on the intersect dataframe + + This loops through all columns that are shared between df1 and df2, and + creates a column column_match which is True for matches, False + otherwise. 
+ """ + LOG.debug("Comparing intersection") + max_diff: float + null_diff: int + row_cnt = self.intersect_rows.count() + for column in self.intersect_columns(): + if column in self.join_columns: + match_cnt = row_cnt + col_match = "" + max_diff = 0 + null_diff = 0 + else: + col_1 = column + "_" + self.df1_name + col_2 = column + "_" + self.df2_name + col_match = column + "_match" + self.intersect_rows = columns_equal( + self.intersect_rows, + col_1, + col_2, + col_match, + self.rel_tol, + self.abs_tol, + ignore_spaces, + ignore_case, + ) + match_cnt = ( + self.intersect_rows.select(col_match) + .where(col(col_match) == True) # noqa: E712 + .count() + ) + max_diff = calculate_max_diff(self.intersect_rows, col_1, col_2) + null_diff = calculate_null_diff(self.intersect_rows, col_1, col_2) + + if row_cnt > 0: + match_rate = float(match_cnt) / row_cnt + else: + match_rate = 0 + LOG.info(f"{column}: {match_cnt} / {row_cnt} ({match_rate:.2%}) match") + + col1_dtype, _ = _get_column_dtypes(self.df1, column, column) + col2_dtype, _ = _get_column_dtypes(self.df2, column, column) + + self.column_stats.append( + { + "column": column, + "match_column": col_match, + "match_cnt": match_cnt, + "unequal_cnt": row_cnt - match_cnt, + "dtype1": str(col1_dtype), + "dtype2": str(col2_dtype), + "all_match": all( + ( + col1_dtype == col2_dtype, + row_cnt == match_cnt, + ) + ), + "max_diff": max_diff, + "null_diff": null_diff, + } + ) + + def all_columns_match(self) -> bool: + """Whether the columns all match in the dataframes. + + Returns + ------- + bool + True if all columns in df1 are in df2 and vice versa + """ + return self.df1_unq_columns() == self.df2_unq_columns() == set() + + def all_rows_overlap(self) -> bool: + """Whether the rows are all present in both dataframes. + + Returns + ------- + bool + True if all rows in df1 are in df2 and vice versa (based on + existence for join option) + """ + return self.df1_unq_rows.count() == self.df2_unq_rows.count() == 0 + + def count_matching_rows(self) -> int: + """Count the number of rows match (on overlapping fields) + + Returns + ------- + int + Number of matching rows + """ + conditions = [] + match_columns = [] + for column in self.intersect_columns(): + if column not in self.join_columns: + match_columns.append(column + "_match") + conditions.append(f"`{column}_match` == True") + if len(conditions) > 0: + match_columns_count = self.intersect_rows.filter( + " and ".join(conditions) + ).count() + else: + match_columns_count = 0 + return match_columns_count + + def intersect_rows_match(self) -> bool: + """Check whether the intersect rows all match""" + actual_length = self.intersect_rows.count() + return self.count_matching_rows() == actual_length + + def matches(self, ignore_extra_columns: bool = False) -> bool: + """Return True or False if the dataframes match. + + Parameters + ---------- + ignore_extra_columns : bool + Ignores any columns in one dataframe and not in the other. + """ + if not ignore_extra_columns and not self.all_columns_match(): + return False + elif not self.all_rows_overlap(): + return False + elif not self.intersect_rows_match(): + return False + else: + return True + + def subset(self) -> bool: + """Return True if dataframe 2 is a subset of dataframe 1. + + Dataframe 2 is considered a subset if all of its columns are in + dataframe 1, and all of its rows match rows in dataframe 1 for the + shared columns. + + Returns + ------- + bool + True if dataframe 2 is a subset of dataframe 1. 
+ """ + if not self.df2_unq_columns() == set(): + return False + elif not self.df2_unq_rows.count() == 0: + return False + elif not self.intersect_rows_match(): + return False + else: + return True + + def sample_mismatch( + self, column: str, sample_count: int = 10, for_display: bool = False + ) -> "pyspark.sql.DataFrame": + """Returns a sample sub-dataframe which contains the identifying + columns, and df1 and df2 versions of the column. + + Parameters + ---------- + column : str + The raw column name (i.e. without ``_df1`` appended) + sample_count : int, optional + The number of sample records to return. Defaults to 10. + for_display : bool, optional + Whether this is just going to be used for display (overwrite the + column names) + + Returns + ------- + pyspark.sql.DataFrame + A sample of the intersection dataframe, containing only the + "pertinent" columns, for rows that don't match on the provided + column. + """ + row_cnt = self.intersect_rows.count() + col_match = self.intersect_rows.select(column + "_match") + match_cnt = col_match.where( + col(column + "_match") == True # noqa: E712 + ).count() + sample_count = min(sample_count, row_cnt - match_cnt) + sample = ( + self.intersect_rows.where(col(column + "_match") == False) # noqa: E712 + .drop(column + "_match") + .limit(sample_count) + ) + + for c in self.join_columns: + sample = sample.withColumnRenamed(c + "_" + self.df1_name, c) + + return_cols = self.join_columns + [ + column + "_" + self.df1_name, + column + "_" + self.df2_name, + ] + to_return = sample.select(return_cols) + + if for_display: + return to_return.toDF( + *self.join_columns + + [ + column + " (" + self.df1_name + ")", + column + " (" + self.df2_name + ")", + ] + ) + return to_return + + def all_mismatch( + self, ignore_matching_cols: bool = False + ) -> "pyspark.sql.DataFrame": + """All rows with any columns that have a mismatch. Returns all df1 and df2 versions of the columns and join + columns. + + Parameters + ---------- + ignore_matching_cols : bool, optional + Whether showing the matching columns in the output or not. The default is False. + + Returns + ------- + pyspark.sql.DataFrame + All rows of the intersection dataframe, containing any columns, that don't match. + """ + match_list = [] + return_list = [] + for c in self.intersect_rows.columns: + if c.endswith("_match"): + orig_col_name = c[:-6] + + col_comparison = columns_equal( + self.intersect_rows, + orig_col_name + "_" + self.df1_name, + orig_col_name + "_" + self.df2_name, + c, + self.rel_tol, + self.abs_tol, + self.ignore_spaces, + self.ignore_case, + ) + + if not ignore_matching_cols or ( + ignore_matching_cols + and col_comparison.select(c) + .where(col(c) == False) # noqa: E712 + .count() + > 0 + ): + LOG.debug(f"Adding column {orig_col_name} to the result.") + match_list.append(c) + return_list.extend( + [ + orig_col_name + "_" + self.df1_name, + orig_col_name + "_" + self.df2_name, + ] + ) + elif ignore_matching_cols: + LOG.debug( + f"Column {orig_col_name} is equal in df1 and df2. It will not be added to the result." + ) + + mm_rows = self.intersect_rows.withColumn( + "match_array", array(match_list) + ).where(array_contains("match_array", False)) + + for c in self.join_columns: + mm_rows = mm_rows.withColumnRenamed(c + "_" + self.df1_name, c) + + return mm_rows.select(self.join_columns + return_list) + + def report( + self, + sample_count: int = 10, + column_count: int = 10, + html_file: Optional[str] = None, + ) -> str: + """Returns a string representation of a report. 
The representation can + then be printed or saved to a file. + + Parameters + ---------- + sample_count : int, optional + The number of sample records to return. Defaults to 10. + + column_count : int, optional + The number of columns to display in the sample records output. Defaults to 10. + + html_file : str, optional + HTML file name to save report output to. If ``None`` the file creation will be skipped. + + Returns + ------- + str + The report, formatted kinda nicely. + """ + # Header + report = render("header.txt") + df_header = pd.DataFrame( + { + "DataFrame": [self.df1_name, self.df2_name], + "Columns": [len(self.df1.columns), len(self.df2.columns)], + "Rows": [self.df1.count(), self.df2.count()], + } + ) + report += df_header[["DataFrame", "Columns", "Rows"]].to_string() + report += "\n\n" + + # Column Summary + report += render( + "column_summary.txt", + len(self.intersect_columns()), + len(self.df1_unq_columns()), + len(self.df2_unq_columns()), + self.df1_name, + self.df2_name, + ) + + # Row Summary + match_on = ", ".join(self.join_columns) + report += render( + "row_summary.txt", + match_on, + self.abs_tol, + self.rel_tol, + self.intersect_rows.count(), + self.df1_unq_rows.count(), + self.df2_unq_rows.count(), + self.intersect_rows.count() - self.count_matching_rows(), + self.count_matching_rows(), + self.df1_name, + self.df2_name, + "Yes" if self._any_dupes else "No", + ) + + # Column Matching + report += render( + "column_comparison.txt", + len([col for col in self.column_stats if col["unequal_cnt"] > 0]), + len([col for col in self.column_stats if col["unequal_cnt"] == 0]), + sum([col["unequal_cnt"] for col in self.column_stats]), + ) + + match_stats = [] + match_sample = [] + any_mismatch = False + for column in self.column_stats: + if not column["all_match"]: + any_mismatch = True + match_stats.append( + { + "Column": column["column"], + f"{self.df1_name} dtype": column["dtype1"], + f"{self.df2_name} dtype": column["dtype2"], + "# Unequal": column["unequal_cnt"], + "Max Diff": column["max_diff"], + "# Null Diff": column["null_diff"], + } + ) + if column["unequal_cnt"] > 0: + match_sample.append( + self.sample_mismatch( + column["column"], sample_count, for_display=True + ) + ) + + if any_mismatch: + report += "Columns with Unequal Values or Types\n" + report += "------------------------------------\n" + report += "\n" + df_match_stats = pd.DataFrame(match_stats) + df_match_stats.sort_values("Column", inplace=True) + # Have to specify again for sorting + report += df_match_stats[ + [ + "Column", + f"{self.df1_name} dtype", + f"{self.df2_name} dtype", + "# Unequal", + "Max Diff", + "# Null Diff", + ] + ].to_string() + report += "\n\n" + + if sample_count > 0: + report += "Sample Rows with Unequal Values\n" + report += "-------------------------------\n" + report += "\n" + for sample in match_sample: + report += sample.toPandas().to_string() + report += "\n\n" + + if min(sample_count, self.df1_unq_rows.count()) > 0: + report += ( + f"Sample Rows Only in {self.df1_name} (First {column_count} Columns)\n" + ) + report += ( + f"---------------------------------------{'-' * len(self.df1_name)}\n" + ) + report += "\n" + columns = self.df1_unq_rows.columns[:column_count] + unq_count = min(sample_count, self.df1_unq_rows.count()) + report += ( + self.df1_unq_rows.limit(unq_count) + .select(columns) + .toPandas() + .to_string() + ) + report += "\n\n" + + if min(sample_count, self.df2_unq_rows.count()) > 0: + report += ( + f"Sample Rows Only in {self.df2_name} (First {column_count} 
Columns)\n" + ) + report += ( + f"---------------------------------------{'-' * len(self.df2_name)}\n" + ) + report += "\n" + columns = self.df2_unq_rows.columns[:column_count] + unq_count = min(sample_count, self.df2_unq_rows.count()) + report += ( + self.df2_unq_rows.limit(unq_count) + .select(columns) + .toPandas() + .to_string() + ) + report += "\n\n" + + if html_file: + html_report = report.replace("\n", "
").replace(" ", " ") + html_report = f"
{html_report}
" + with open(html_file, "w") as f: + f.write(html_report) + + return report + + +def render(filename: str, *fields: Union[int, float, str]) -> str: + """Renders out an individual template. This basically just reads in a + template file, and applies ``.format()`` on the fields. + + Parameters + ---------- + filename : str + The file that contains the template. Will automagically prepend the + templates directory before opening + fields : list + Fields to be rendered out in the template + + Returns + ------- + str + The fully rendered out file. + """ + this_dir = os.path.dirname(os.path.realpath(__file__)) + with open(os.path.join(this_dir, "..", "templates", filename)) as file_open: + return file_open.read().format(*fields) + + +def columns_equal( + dataframe: "pyspark.sql.DataFrame", + col_1: str, + col_2: str, + col_match: str, + rel_tol: float = 0, + abs_tol: float = 0, + ignore_spaces: bool = False, + ignore_case: bool = False, +) -> "pyspark.sql.DataFrame": + """Compares two columns from a dataframe, returning a True/False series, + with the same index as column 1. + + - Two nulls (np.nan) will evaluate to True. + - A null and a non-null value will evaluate to False. + - Numeric values will use the relative and absolute tolerances. + - Decimal values (decimal.Decimal) will attempt to be converted to floats + before comparing + - Non-numeric values (i.e. where np.isclose can't be used) will just + trigger True on two nulls or exact matches. + + Parameters + ---------- + dataframe: pyspark.sql.DataFrame + DataFrame to do comparison on + col_1 : str + The first column to look at + col_2 : str + The second column + col_match : str + The matching column denoting if the compare was a match or not + rel_tol : float, optional + Relative tolerance + abs_tol : float, optional + Absolute tolerance + ignore_spaces : bool, optional + Flag to strip whitespace (including newlines) from string columns + ignore_case : bool, optional + Flag to ignore the case of string columns + + Returns + ------- + pyspark.sql.DataFrame + A column of boolean values are added. True == the values match, False == the + values don't match. 
+ """ + base_dtype, compare_dtype = _get_column_dtypes(dataframe, col_1, col_2) + if _is_comparable(base_dtype, compare_dtype): + if (base_dtype in NUMERIC_SPARK_TYPES) and ( + compare_dtype in NUMERIC_SPARK_TYPES + ): # numeric tolerance comparison + dataframe = dataframe.withColumn( + col_match, + when( + (col(col_1).eqNullSafe(col(col_2))) + | ( + abs(col(col_1) - col(col_2)) + <= lit(abs_tol) + (lit(rel_tol) * abs(col(col_2))) + ), + # corner case of col1 != NaN and col2 == Nan returns True incorrectly + when( + (isnan(col(col_1)) == False) # noqa: E712 + & (isnan(col(col_2)) == True), # noqa: E712 + lit(False), + ).otherwise(lit(True)), + ).otherwise(lit(False)), + ) + else: # non-numeric comparison + if ignore_case and not ignore_spaces: + when_clause = upper(col(col_1)).eqNullSafe(upper(col(col_2))) + elif not ignore_case and ignore_spaces: + when_clause = trim(col(col_1)).eqNullSafe(trim(col(col_2))) + elif ignore_case and ignore_spaces: + when_clause = upper(trim(col(col_1))).eqNullSafe( + upper(trim(col(col_2))) + ) + else: + when_clause = col(col_1).eqNullSafe(col(col_2)) + + dataframe = dataframe.withColumn( + col_match, + when(when_clause, lit(True)).otherwise(lit(False)), + ) + else: + LOG.debug( + "Skipping {}({}) and {}({}), columns are not comparable".format( + col_1, base_dtype, col_2, compare_dtype + ) + ) + dataframe = dataframe.withColumn(col_match, lit(False)) + return dataframe + + +def get_merged_columns( + original_df: "pyspark.sql.DataFrame", + merged_df: "pyspark.sql.DataFrame", + suffix: str, +) -> List[str]: + """Gets the columns from an original dataframe, in the new merged dataframe + + Parameters + ---------- + original_df : pyspark.sql.DataFrame + The original, pre-merge dataframe + merged_df : pyspark.sql.DataFrame + Post-merge with another dataframe, with suffixes added in. + suffix : str + What suffix was used to distinguish when the original dataframe was + overlapping with the other merged dataframe. 
+ + Returns + ------- + List[str] + Column list of the original dataframe pre suffix + """ + columns = [] + for col in original_df.columns: + if col in merged_df.columns: + columns.append(col) + elif col + "_" + suffix in merged_df.columns: + columns.append(col + "_" + suffix) + else: + raise ValueError("Column not found: %s", col) + return columns + + +def calculate_max_diff( + dataframe: "pyspark.sql.DataFrame", col_1: str, col_2: str +) -> float: + """Get a maximum difference between two columns + + Parameters + ---------- + dataframe: pyspark.sql.DataFrame + DataFrame to do comparison on + col_1 : str + The first column to look at + col_2 : str + The second column + + Returns + ------- + float + max diff + """ + diff = dataframe.select( + (col(col_1).astype("float") - col(col_2).astype("float")).alias("diff") + ) + abs_diff = diff.select(abs(col("diff")).alias("abs_diff")) + max_diff: float = ( + abs_diff.where(isnan(col("abs_diff")) == False) # noqa: E712 + .agg({"abs_diff": "max"}) + .collect()[0][0] + ) + + if pd.isna(max_diff) or pd.isnull(max_diff) or max_diff is None: + return 0 + else: + return max_diff + + +def calculate_null_diff( + dataframe: "pyspark.sql.DataFrame", col_1: str, col_2: str +) -> int: + """Get the null differences between two columns + + Parameters + ---------- + dataframe: pyspark.sql.DataFrame + DataFrame to do comparison on + col_1 : str + The first column to look at + col_2 : str + The second column + + Returns + ------- + int + null diff + """ + nulls_df = dataframe.withColumn( + "col_1_null", + when(col(col_1).isNull() == True, lit(True)).otherwise( # noqa: E712 + lit(False) + ), + ) + nulls_df = nulls_df.withColumn( + "col_2_null", + when(col(col_2).isNull() == True, lit(True)).otherwise( # noqa: E712 + lit(False) + ), + ).select(["col_1_null", "col_2_null"]) + + # (not a and b) or (a and not b) + null_diff = nulls_df.where( + ((col("col_1_null") == False) & (col("col_2_null") == True)) # noqa: E712 + | ((col("col_1_null") == True) & (col("col_2_null") == False)) # noqa: E712 + ).count() + + if pd.isna(null_diff) or pd.isnull(null_diff) or null_diff is None: + return 0 + else: + return null_diff + + +def _generate_id_within_group( + dataframe: "pyspark.sql.DataFrame", join_columns: List[str], order_column_name: str +) -> "pyspark.sql.DataFrame": + """Generate an ID column that can be used to deduplicate identical rows. The series generated + is the order within a unique group, and it handles nulls. Requires a ``__index`` column. 
+ + Parameters + ---------- + dataframe : pyspark.sql.DataFrame + The dataframe to operate on + join_columns : list + List of strings which are the join columns + order_column_name: str + The name of the ``row_number`` column name + + Returns + ------- + pyspark.sql.DataFrame + Original dataframe with the ID column that's unique in each group + """ + default_value = "DATACOMPY_NULL" + null_cols = [f"any(isnull({c}))" for c in join_columns] + default_cols = [f"any({c} == '{default_value}')" for c in join_columns] + + null_check = any(list(dataframe.selectExpr(null_cols).first())) + default_check = any(list(dataframe.selectExpr(default_cols).first())) + + if null_check: + if default_check: + raise ValueError(f"{default_value} was found in your join columns") + + return ( + dataframe.select( + *(col(c).cast("string").alias(c) for c in join_columns + ["__index"]) + ) + .fillna(default_value) + .withColumn( + order_column_name, + row_number().over(Window.orderBy("__index").partitionBy(join_columns)) + - 1, + ) + .select(["__index", order_column_name]) + ) + else: + return ( + dataframe.select(join_columns + ["__index"]) + .withColumn( + order_column_name, + row_number().over(Window.orderBy("__index").partitionBy(join_columns)) + - 1, + ) + .select(["__index", order_column_name]) + ) + + +def _get_column_dtypes( + dataframe: "pyspark.sql.DataFrame", col_1: "str", col_2: "str" +) -> Tuple[str, str]: + """Get the dtypes of two columns + + Parameters + ---------- + dataframe: pyspark.sql.DataFrame + DataFrame to do comparison on + col_1 : str + The first column to look at + col_2 : str + The second column + + Returns + ------- + Tuple(str, str) + Tuple of base and compare datatype + """ + base_dtype = [d[1] for d in dataframe.dtypes if d[0] == col_1][0] + compare_dtype = [d[1] for d in dataframe.dtypes if d[0] == col_2][0] + return base_dtype, compare_dtype + + +def _is_comparable(type1: str, type2: str) -> bool: + """Checks if two Spark data types can be safely compared. + + Two data types are considered comparable if any of the following apply: + 1. Both data types are the same + 2. 
Both data types are numeric + + Parameters + ---------- + type1 : str + A string representation of a Spark data type + type2 : str + A string representation of a Spark data type + + Returns + ------- + bool + True if both data types are comparable + """ + return ( + type1 == type2 + or (type1 in NUMERIC_SPARK_TYPES and type2 in NUMERIC_SPARK_TYPES) + or ({type1, type2} == {"string", "timestamp"}) + or ({type1, type2} == {"string", "date"}) + ) diff --git a/docs/source/benchmark.rst b/docs/source/benchmark.rst index 56365afa..db163c9d 100644 --- a/docs/source/benchmark.rst +++ b/docs/source/benchmark.rst @@ -39,17 +39,17 @@ The data (base, and compare) we generated was purely synthetic consisting of 10 Table of mean benchmark times in seconds: -=========== ======= ======= =============== =============== =============== -Number of pandas polars pandas on spark spark (fugue) spark (fugue) -rows (distributed) (single) (distributed) -=========== ======= ======= =============== =============== =============== -1000 0.025 0.025 15.2838 2.041 1.109 -100,000 0.196 0.120 11.1113 1.743 3.175 -10,000,000 18.804 11.330 20.6274 17.560 16.455 -50,000,000 96.494 62.827 57.5735 90.578 94.304 -100,000,000 DNR 127.194 96.3204 DNR 193.234 -500,000,000 DNR DNR 262.6094 DNR DNR -=========== ======= ======= =============== =============== =============== +=========== ======= ======= =============== =============== =============== =============== +Number of pandas polars spark sql pandas on spark spark (fugue) spark (fugue) +rows (distributed) (distributed) (single) (distributed) +=========== ======= ======= =============== =============== =============== =============== +1000 0.025 0.025 15.3112 15.2838 2.041 1.109 +100,000 0.196 0.120 15.0701 11.1113 1.743 3.175 +10,000,000 18.804 11.330 18.2763 20.6274 17.560 16.455 +50,000,000 96.494 62.827 31.1257 57.5735 90.578 94.304 +100,000,000 DNR 127.194 47.2185 96.3204 DNR 193.234 +500,000,000 DNR DNR 130.9814 262.6094 DNR DNR +=========== ======= ======= =============== =============== =============== =============== .. note:: DNR = Did not run @@ -67,4 +67,8 @@ TLDR * in the 100 Million + range is starts to shine, and due to its distributed nature it can process vast amounts of data +* The Spark SQL implementaion seems to be the most performant on very large datasets + + * It makes the Pandas on Spark implementation obsolete moving forward. 
+ * The native Pandas version is best for small and medium data diff --git a/docs/source/conf.py b/docs/source/conf.py index 33c9c03b..d927194d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,8 +24,7 @@ author = "Ian Robertson, Dan Coates, Faisal Dosani" # The full version, including alpha/beta/rc tags -version = datacompy.__version__ -release = datacompy.__version__ +release = version = datacompy.__version__ # -- General configuration --------------------------------------------------- diff --git a/docs/source/img/benchmarks.png b/docs/source/img/benchmarks.png index e5f80d6b8e7866a60746c7219deedd2ccad131b4..579b173c27c7a903ceff0a3ce5f286a773b63283 100644 GIT binary patch literal 39529 zcmdSBcT`hr_b!TMV+9loo30`tDxfGW5Q-qZ2!bdjsDN}(Itc;GR#XI}OA8=PYNSgE zs327c5J(_&DItW=5)w%W+!eR@edl-ox?|ii&N+<12qdhv<~!e6p83rA^6G|(F4sYU zgB%5(2<{TV*z#JUAmiBW3?^M~{ALHP7%%QKXX%Pfj9NQN!^sRP<#T@1=eL|u% zwuRtm-!d*MG!EH@MCwZn-F3Kl>C~QUyt}oJoUwM~*VZ`|tAFzEh_grZ51c;W^u(t} zna~icKv24Zt}&GcFO;MciNopX$z;~H--1hB5onOQIDoMnqj>KIp3Sj~=1n`y{$W4w zQxW#-6P?~Yz{{!L{ZRI=dj&LkfS1P_V)MIz*CQ@_0l_&qc1LFJ170*Tk5bvc?!Wqf z+Qrs7PZ2380>RWi-PteuM6Yg>u9j<^<&Bx`N*bx?hUH47I!>!qofr`2;BYaO}A?6Zt)8ebCB5;YCGROj;y!;MR>m4vtK4hhFcMhm#tZNd^^T+P%HpwB2a8 zm;ai>C-@{B*hR8F{uyFz&4-A|k5=9KIc;ucc5e&knjZZ<`|_yTirOBI$F#F#O>{#r z#mK>ZzBlUuwEgYr2i61?ZV^?NITW?uurJt4w(;lt4s>_7JSOXgUY=3%{WAL5HOS0H~hya zi0p+0w>TWqUj1fh%gdt=U}XYvp>$ahBrB>4b(@mzT5^T#it(qCN32>S1)XVVyVmr@ zSu*jV(nW{1sDyk3)Bq96a7|Y5OzA)t7RoJ`VY@5PYVOqzqTYoI-w-rVtMn^X}mj4uI3RMDz<#j~_q&PA8srP{}H1 zJuDr&k+S?y;2t!0#2ER26oNWmk|<;6mM5>*|5y*1xI9*`C}1mXU3spq81oX7x=vYJ zdrujRGgWh4n?q-Km;OT<3?^4Sq~W@U^Lp%SywauW-3HHN-uhSHOIoi8TMZ)Dpin`S zF;Xv?^za%P-W!a!xtCNiOi3p;ZL^m1aMxgsX>=Xf0Z8bE(-nJC|A9A^#bOET_e~{1 ztwD(3IsTiso+*rpjs=26{SdT9rP$#>uD^kw@GkKAwSA=d3UTf+TWPFsg0Y1qxWR4{CqOVztB}XQb@VHZxgM) zJ^CFN60R`g_c9vxG8Lj3_+r21ty?L<&=a;v{x9m3*C>?k5EOLjp#T*Bf!fSpng_n% z)${D{Q7$ui=bki)$PPyu+qSD~_`Dmhp!`YLc~!D&Z?H8wKsgnxj#W!YOw=KGk2ffT z>;)OL0P{PtG{5%_Z*-r@OQ)WjGBKFSuAvmu^?b^fnE?D5hC!%s5rHb%< zimsH-vTX=@MK_wLV3D4e4%?FKFT0BRpJs&_4Nsu>ZbI01aCkE;#O{^&M!LbartT%E{kuA}CKp<*UbvI44$qqSG8Ws>26op_$V zKZ=9$Q!s-0T`ku6OA&V19St6WrZK_1xvSG{x=D|g=zXWXpyD=ycR>x^W4!@O+4_Fs zcEeoYnrQW{p&8;^u`w%@cXROTGzq#wdeIv6#CUh@`Q>pPw)#*eijrt}c0H|psA~wb zIyds`yQ9ZD5?YpjtUFqF*|`r^`GQOEBzX{%Kc~!An#Y|x$?1c74ZfX3@Br&!i;4)W zv0|A9sv@+>ko5c(f!yY4X8lxz01@)k5LSBlxkrJvJmI}Y&TJ))^%eCpwp(5uvoqAE zO5LNu50$mJ&r1;yob+Q=0c89MyH94+IR~zAvl-q-2};#7yU|MCrWMnMQtfOOoV9QWL8IG z@aV>vDEVmr)I(^lwf8xkh*&x@`eZYUc&y)jO?hTi}0?+kA9v(TC<&PO-^` zEE2UTo+c>Q%q;%`=2ds>#Ve*6@pF0)Ch|=)yyl3kdjcyY_#K9zA;_CL-`D5UgE{D* zUWOQ--C_@n;LgbUfT>`*6?M!}_D)d-;X5O4GZ(c_8L_gN;<-3TN& zuP2UkOh3GI*oh_bOZNi{n$ffr7p)t}+arn=Bh&|O(nTG1*x-RG)=QUgnS36V-l=P# zQNyrGvy7DoJh>HuP%r3-ke@44d6x@EaGW0$Xb-->yO-FPbuA!R7@BX@oo!~FMY;9Gb8joR2sXfFn#q?k^+aheVLhM#2S^QVQ2%(HkH zVg>k-_w;@x9;LspJa~iJ4}ARi@acJ1PQojnvRLjwpY6Mn?DxK(0p3Fu*e=WV!$i`U zSE*ZvSa4DD1{+pKf1&=|mj zOkZ#dp9lOUPVX$QG^Iyn`BS1xvPEf^nU$5~vwb|(&Vn{IUh-|!E#`6;pMA*bzyA6w zAV#NBZLT}j8JpkrMo8Jb+H(Z^;f=^M6)u6RdcgjjwzRa2>}=un4)y&Bu9mi`{)u0T z5?1-HJgHXwWY50NaTH82A;Nz%PQp0$$B!R6vzssJ<3#4A;5EmM$!9#@8TlsZ_z$5K z2^mmYbE$(dV|*Uwm_*To|O0I-n~Zr{|x26quBrOPAowB@+?)u|EAKU!X0#=#~(@8Xwti$8wi zM6zrN5S=<-9u-@>Z&p=RC1Kx1u>X8_B_v^KChw zJxZw>uiT<8WQeqFv6yz^s10QW)^1@%FWXVZqlk4ikb?L4#pZCHXp=VUB!m|9xg0-a zIi1sO330)5|0luI3U=Yx49F{Cm4M=8LL;<@P>a2O=gysMCl6Zff`2Kgw$LifpC$0O zJ9q;g=HNLZ4Qb?5t1}7DAiMGh`pgZlJBh;d^lQC=`uC#eGQ zLB_!oZJg8>agy8fQysd^5iNPbJFIDQ&2Sh63#?4T)^qNS3hI(#sbobufkoHPT=(+w zx+;sk3>_@Upp1k@Pp6B8&MWwB3p0{&!K*>S?!Gj;;6YErD9@pf_wppWvIx=4A>MR* 
zCZyQzU=hl?6&B6xuvem4Jr5z~{zxHkv5au(4HW9#nJQ%%QEmp?cJ=l)&xVfzVJn_X zEg(-kdE;6}RFtr{Een{O>8yNXzexvBVfYZh)@vZM2J|0LQjDr|6o4ccD^g{+bW7p1 zeQWvi)#d5$U!978$%KfEJb2)Hf9vX=O3~4Kaky$&t5s>zkE2fT6GuomRU$ARfFfNu zb@Lc`6FR+=BlVo{U00*nTU0c(7~2FUZR!l|g)^WA@_QQ_M)63Zpgg&hrky*W;F%Fn zKSokYs=dZ?+~kuIIgOsLtZc+hD*!CV?Q_#$!v~aHao+yfJz1w&ZT3$oZSfUr_`_6^ zI{%%@u%~;fSj6v%B2{!CzoCq?yyYm|+F2~3sRekYe6)MIJDsfu^*GusQZ3E`0L&a? zMJ|p8-#louOPB6&T+?qe4GvT>F)?e#mLMT2N=o7Dk4+DA*dwW|)L=aF923_qzqHq=$F#%gE$asOVx~|EaqHAjemupT%7p!J*{uoG=PQoklX<1`(Jb(YXbci$j%_hEu= zPPh6%54xuTMC~v7pO0IBYE)%Stfm8fmJUWI(z3eaWcg<+pWh6OeX9xO-5jA1AFSyG zE*k!fuBnxFbhJJ=*o~a}Ah;Se-WwTZwdCgkI--o%p*%7%p1LE3LR_W`%K!4^%Q8-} z@E6hOD4}Dxn^{Pfmq1ONHFw|{{svefYVp`wzish+#oEF5OT#@4JG;u_*R@yT1zjky ziOJOar(*yegKyfF0WNn8v|i~7vtw8YyN(RTB&I&tww$TT?XCQebFNd;M{T#~y68)# z_NrK|G&wa{tDypZ})$??kz)^$Q>qz@lcnZM+tD*foJy)f?v4xQa1rC^{ztYd>t1f zs| Pandas Usage - Spark (Pandas on Spark) Usage + Spark Usage Polars Usage Fugue Usage Benchmarks diff --git a/docs/source/spark_usage.rst b/docs/source/spark_usage.rst index 5f35724a..e064f0fb 100644 --- a/docs/source/spark_usage.rst +++ b/docs/source/spark_usage.rst @@ -1,30 +1,32 @@ -Spark (Pandas on Spark) Usage -============================= +Spark Usage +=========== .. important:: - With version ``v0.12.0`` the original ``SparkCompare`` is now replaced with a - Pandas on Spark implementation. The original ``SparkCompare`` implementation - differs from all the other native implementations. To align the API better, + With version ``v0.12.0`` the original ``SparkCompare`` was replaced with a + Pandas on Spark implementation The original ``SparkCompare`` + implementation differs from all the other native implementations. To align the API better, and keep behaviour consistent we are deprecating the original ``SparkCompare`` into a new module ``LegacySparkCompare`` - If you wish to use the old SparkCompare moving forward you can + Subsequently in ``v0.13.0`` a PySaprk DataFrame class has been introduced (``SparkSQLCompare``) + which accepts ``pyspark.sql.DataFrame`` and should provide better performance. With this version + the Pandas on Spark implementation has been renamed to ``SparkPandasCompare`` and all the spark + logic is now under the ``spark`` submodule. + + If you wish to use the old SparkCompare moving forward you can import it like so: .. code-block:: python - from datacompy.legacy import LegacySparkCompare + from datacompy.spark.legacy import LegacySparkCompare +For both ``SparkSQLCompare`` and ``SparkPandasCompare`` -DataComPy's Pandas on Spark implementation ``SparkCompare`` (new in ``v0.12.0``) -is a very similar port of the Pandas version - -- ``on_index`` is NOT supported like in ``PandasCompare`` +- ``on_index`` is not supported. - Joining is done using ``<=>`` which is the equality test that is safe for null values. -- In the backend we are using the Pandas on Spark API. This might be less optimal than - native Spark code but allows for better maintainability and readability. - +- ``SparkPandasCompare`` compares ``pyspark.pandas.DataFrame``'s +- ``SparkSQLCompare`` compares ``pyspark.sql.DataFrame``'s Supported Version ------------------ @@ -35,22 +37,22 @@ Supported Version Until then we will not be supporting Pandas 2 for the Pandas on Spark API implementaion. -For Fugue and the Native Pandas implementation Pandas 2 is supported. 
If you need to use Spark DataFrame with -Pandas 2+ then consider using Fugue otherwise downgrade to Pandas 1.5.3 +For Fugue, the Native Pandas (`Compare`), and `SparkSQLCompare` implementations, Pandas 2 is supported. -SparkCompare Object Setup -------------------------- +SparkPandasCompare and SparkSQLCompare Object Setup +--------------------------------------------------- There is currently only one supported method for joining your dataframes - by join column(s). + .. code-block:: python from io import StringIO import pandas as pd import pyspark.pandas as ps - from datacompy import SparkCompare + from datacompy import SparkPandasCompare, SparkSQLCompare from pyspark.sql import SparkSession spark = SparkSession.builder.getOrCreate() @@ -71,10 +73,31 @@ join column(s). 10000001238,1.05,Loose Seal Bluth,111 """ + # SparkPandasCompare df1 = ps.from_pandas(pd.read_csv(StringIO(data1))) df2 = ps.from_pandas(pd.read_csv(StringIO(data2))) - compare = SparkCompare( + compare = SparkPandasCompare( + df1, + df2, + join_columns='acct_id', # You can also specify a list of columns + abs_tol=0, # Optional, defaults to 0 + rel_tol=0, # Optional, defaults to 0 + df1_name='Original', # Optional, defaults to 'df1' + df2_name='New' # Optional, defaults to 'df2' + ) + compare.matches(ignore_extra_columns=False) + # False + # This method prints out a human-readable report summarizing and sampling differences + print(compare.report()) + + + # SparkSQLCompare + df1 = spark.createDataFrame(pd.read_csv(StringIO(data1))) + df2 = spark.createDataFrame(pd.read_csv(StringIO(data2))) + + compare = SparkSQLCompare( + spark, df1, df2, join_columns='acct_id', # You can also specify a list of columns @@ -92,7 +115,7 @@ join column(s). Reports ------- -A report is generated by calling ``SparkCompare.report()``, which returns a string. +A report is generated by calling ``report()``, which returns a string. Here is a sample report generated by ``datacompy`` for the two tables above, joined on ``acct_id`` (Note: if you don't specify ``df1_name`` and/or ``df2_name``, then any instance of "original" or "new" in the report is replaced with "df1" @@ -175,7 +198,7 @@ and/or "df2".):: Convenience Methods ------------------- -There are a few convenience methods available after the comparison has been run: +There are a few convenience methods and attributes available after the comparison has been run: .. 
code-block:: python diff --git a/pyproject.toml b/pyproject.toml index 1b4c1c23..0c49f5f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ python-tag = "py3" [project.optional-dependencies] duckdb = ["fugue[duckdb]"] -spark = ["pyspark>=3.1.1; python_version < \"3.11\"", "pyspark>=3.4; python_version >= \"3.11\""] +spark = ["pyspark[connect]>=3.1.1; python_version < \"3.11\"", "pyspark[connect]>=3.4; python_version >= \"3.11\""] dask = ["fugue[dask]"] ray = ["fugue[ray]"] docs = ["sphinx", "furo", "myst-parser"] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_legacy_spark.py b/tests/test_spark/test_legacy_spark.py similarity index 99% rename from tests/test_legacy_spark.py rename to tests/test_spark/test_legacy_spark.py index 30ec1500..3a1cebe5 100644 --- a/tests/test_legacy_spark.py +++ b/tests/test_spark/test_legacy_spark.py @@ -34,7 +34,7 @@ StructType, ) -from datacompy.legacy import ( # noqa: E402 +from datacompy.spark.legacy import ( # noqa: E402 NUMERIC_SPARK_TYPES, LegacySparkCompare, _is_comparable, diff --git a/tests/test_spark.py b/tests/test_spark/test_pandas_spark.py similarity index 92% rename from tests/test_spark.py rename to tests/test_spark/test_pandas_spark.py index 937396ec..5517ff9a 100644 --- a/tests/test_spark.py +++ b/tests/test_spark/test_pandas_spark.py @@ -35,8 +35,8 @@ import pyspark.pandas as ps # noqa: E402 from pandas.testing import assert_series_equal # noqa: E402 -from datacompy.spark import ( # noqa: E402 - SparkCompare, +from datacompy.spark.pandas import ( # noqa: E402 + SparkPandasCompare, calculate_max_diff, columns_equal, generate_id_within_group, @@ -428,12 +428,12 @@ def test_infinity_and_beyond(): def test_compare_df_setter_bad(): df = ps.DataFrame([{"a": 1, "c": 2}, {"a": 2, "c": 2}]) with raises(TypeError, match="df1 must be a pyspark.pandas.frame.DataFrame"): - SparkCompare("a", "a", ["a"]) + SparkPandasCompare("a", "a", ["a"]) with raises(ValueError, match="df1 must have all columns from join_columns"): - SparkCompare(df, df.copy(), ["b"]) + SparkPandasCompare(df, df.copy(), ["b"]) df_dupe = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 3}]) assert ( - SparkCompare(df_dupe, df_dupe.copy(), ["a", "b"]) + SparkPandasCompare(df_dupe, df_dupe.copy(), ["a", "b"]) .df1.equals(df_dupe) .all() .all() @@ -444,11 +444,11 @@ def test_compare_df_setter_bad(): def test_compare_df_setter_good(): df1 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}]) df2 = ps.DataFrame([{"A": 1, "B": 2}, {"A": 2, "B": 3}]) - compare = SparkCompare(df1, df2, ["a"]) + compare = SparkPandasCompare(df1, df2, ["a"]) assert compare.df1.equals(df1).all().all() assert compare.df2.equals(df2).all().all() assert compare.join_columns == ["a"] - compare = SparkCompare(df1, df2, ["A", "b"]) + compare = SparkPandasCompare(df1, df2, ["A", "b"]) assert compare.df1.equals(df1).all().all() assert compare.df2.equals(df2).all().all() assert compare.join_columns == ["a", "b"] @@ -458,7 +458,7 @@ def test_compare_df_setter_good(): def test_compare_df_setter_different_cases(): df1 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}]) df2 = ps.DataFrame([{"A": 1, "b": 2}, {"A": 2, "b": 3}]) - compare = SparkCompare(df1, df2, ["a"]) + compare = SparkPandasCompare(df1, df2, ["a"]) assert compare.df1.equals(df1).all().all() assert compare.df2.equals(df2).all().all() @@ -467,7 +467,7 @@ def test_compare_df_setter_different_cases(): def test_columns_overlap(): df1 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 
2}]) df2 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 3}]) - compare = SparkCompare(df1, df2, ["a"]) + compare = SparkPandasCompare(df1, df2, ["a"]) assert compare.df1_unq_columns() == set() assert compare.df2_unq_columns() == set() assert compare.intersect_columns() == {"a", "b"} @@ -477,7 +477,7 @@ def test_columns_overlap(): def test_columns_no_overlap(): df1 = ps.DataFrame([{"a": 1, "b": 2, "c": "hi"}, {"a": 2, "b": 2, "c": "yo"}]) df2 = ps.DataFrame([{"a": 1, "b": 2, "d": "oh"}, {"a": 2, "b": 3, "d": "ya"}]) - compare = SparkCompare(df1, df2, ["a"]) + compare = SparkPandasCompare(df1, df2, ["a"]) assert compare.df1_unq_columns() == {"c"} assert compare.df2_unq_columns() == {"d"} assert compare.intersect_columns() == {"a", "b"} @@ -499,7 +499,7 @@ def test_columns_maintain_order_through_set_operations(): ], columns=["join", "e", "h", "b", "a", "g", "d"], ) - compare = SparkCompare(df1, df2, ["join"]) + compare = SparkPandasCompare(df1, df2, ["join"]) assert list(compare.df1_unq_columns()) == ["f", "c"] assert list(compare.df2_unq_columns()) == ["e", "d"] assert list(compare.intersect_columns()) == ["join", "g", "b", "h", "a"] @@ -512,7 +512,7 @@ def test_10k_rows(): df1.columns = ["a", "b", "c"] df2 = df1.copy() df2["b"] = df2["b"] + 0.1 - compare_tol = SparkCompare(df1, df2, ["a"], abs_tol=0.2) + compare_tol = SparkPandasCompare(df1, df2, ["a"], abs_tol=0.2) assert compare_tol.matches() assert len(compare_tol.df1_unq_rows) == 0 assert len(compare_tol.df2_unq_rows) == 0 @@ -521,7 +521,7 @@ def test_10k_rows(): assert compare_tol.all_rows_overlap() assert compare_tol.intersect_rows_match() - compare_no_tol = SparkCompare(df1, df2, ["a"]) + compare_no_tol = SparkPandasCompare(df1, df2, ["a"]) assert not compare_no_tol.matches() assert len(compare_no_tol.df1_unq_rows) == 0 assert len(compare_no_tol.df2_unq_rows) == 0 @@ -536,7 +536,7 @@ def test_subset(caplog): caplog.set_level(logging.DEBUG) df1 = ps.DataFrame([{"a": 1, "b": 2, "c": "hi"}, {"a": 2, "b": 2, "c": "yo"}]) df2 = ps.DataFrame([{"a": 1, "c": "hi"}]) - comp = SparkCompare(df1, df2, ["a"]) + comp = SparkPandasCompare(df1, df2, ["a"]) assert comp.subset() assert "Checking equality" in caplog.text @@ -546,7 +546,7 @@ def test_not_subset(caplog): caplog.set_level(logging.INFO) df1 = ps.DataFrame([{"a": 1, "b": 2, "c": "hi"}, {"a": 2, "b": 2, "c": "yo"}]) df2 = ps.DataFrame([{"a": 1, "b": 2, "c": "hi"}, {"a": 2, "b": 2, "c": "great"}]) - comp = SparkCompare(df1, df2, ["a"]) + comp = SparkPandasCompare(df1, df2, ["a"]) assert not comp.subset() assert "c: 1 / 2 (50.00%) match" in caplog.text @@ -557,7 +557,7 @@ def test_large_subset(): df1.reset_index(inplace=True) df1.columns = ["a", "b", "c"] df2 = df1[["a", "b"]].head(50).copy() - comp = SparkCompare(df1, df2, ["a"]) + comp = SparkPandasCompare(df1, df2, ["a"]) assert not comp.matches() assert comp.subset() @@ -566,7 +566,7 @@ def test_large_subset(): def test_string_joiner(): df1 = ps.DataFrame([{"ab": 1, "bc": 2}, {"ab": 2, "bc": 2}]) df2 = ps.DataFrame([{"ab": 1, "bc": 2}, {"ab": 2, "bc": 2}]) - compare = SparkCompare(df1, df2, "ab") + compare = SparkPandasCompare(df1, df2, "ab") assert compare.matches() @@ -574,7 +574,7 @@ def test_string_joiner(): def test_decimal_with_joins(): df1 = ps.DataFrame([{"a": Decimal("1"), "b": 2}, {"a": Decimal("2"), "b": 2}]) df2 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}]) - compare = SparkCompare(df1, df2, "a") + compare = SparkPandasCompare(df1, df2, "a") assert compare.matches() assert compare.all_columns_match() assert 
compare.all_rows_overlap() @@ -585,7 +585,7 @@ def test_decimal_with_joins(): def test_decimal_with_nulls(): df1 = ps.DataFrame([{"a": 1, "b": Decimal("2")}, {"a": 2, "b": Decimal("2")}]) df2 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}, {"a": 3, "b": 2}]) - compare = SparkCompare(df1, df2, "a") + compare = SparkPandasCompare(df1, df2, "a") assert not compare.matches() assert compare.all_columns_match() assert not compare.all_rows_overlap() @@ -596,7 +596,7 @@ def test_decimal_with_nulls(): def test_strings_with_joins(): df1 = ps.DataFrame([{"a": "hi", "b": 2}, {"a": "bye", "b": 2}]) df2 = ps.DataFrame([{"a": "hi", "b": 2}, {"a": "bye", "b": 2}]) - compare = SparkCompare(df1, df2, "a") + compare = SparkPandasCompare(df1, df2, "a") assert compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() @@ -670,7 +670,7 @@ def test_temp_column_name_one_already(): def test_simple_dupes_one_field(): df1 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}]) df2 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}]) - compare = SparkCompare(df1, df2, join_columns=["a"]) + compare = SparkPandasCompare(df1, df2, join_columns=["a"]) assert compare.matches() # Just render the report to make sure it renders. compare.report() @@ -680,7 +680,7 @@ def test_simple_dupes_one_field(): def test_simple_dupes_two_fields(): df1 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 2}]) df2 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 2}]) - compare = SparkCompare(df1, df2, join_columns=["a", "b"]) + compare = SparkPandasCompare(df1, df2, join_columns=["a", "b"]) assert compare.matches() # Just render the report to make sure it renders. compare.report() @@ -690,7 +690,7 @@ def test_simple_dupes_two_fields(): def test_simple_dupes_one_field_two_vals_1(): df1 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) df2 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) - compare = SparkCompare(df1, df2, join_columns=["a"]) + compare = SparkPandasCompare(df1, df2, join_columns=["a"]) assert compare.matches() # Just render the report to make sure it renders. 
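Editor's note: since every hunk in this test file swaps ``SparkCompare`` for ``SparkPandasCompare``, a minimal usage sketch of the renamed pyspark.pandas entry point may help while reading the hunks around it. Class and module names come from this patch; the sample frames are illustrative only, and pandas < 2 plus the ``compute.ops_on_diff_frames`` option are assumed, as in the test module itself.

.. code-block:: python

    import pyspark.pandas as ps
    from datacompy.spark.pandas import SparkPandasCompare

    # comparisons operate across two separate pandas-on-Spark frames
    ps.set_option("compute.ops_on_diff_frames", True)

    df1 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}])
    df2 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 3}])

    compare = SparkPandasCompare(df1, df2, join_columns=["a"])
    compare.matches()        # False: the a=2 rows disagree on column b
    print(compare.report())  # human-readable summary of the differences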
compare.report() @@ -700,7 +700,7 @@ def test_simple_dupes_one_field_two_vals_1(): def test_simple_dupes_one_field_two_vals_2(): df1 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) df2 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 0}]) - compare = SparkCompare(df1, df2, join_columns=["a"]) + compare = SparkPandasCompare(df1, df2, join_columns=["a"]) assert not compare.matches() assert len(compare.df1_unq_rows) == 1 assert len(compare.df2_unq_rows) == 1 @@ -713,7 +713,7 @@ def test_simple_dupes_one_field_two_vals_2(): def test_simple_dupes_one_field_three_to_two_vals(): df1 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}, {"a": 1, "b": 0}]) df2 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) - compare = SparkCompare(df1, df2, join_columns=["a"]) + compare = SparkPandasCompare(df1, df2, join_columns=["a"]) assert not compare.matches() assert len(compare.df1_unq_rows) == 1 assert len(compare.df2_unq_rows) == 0 @@ -750,9 +750,9 @@ def test_dupes_from_real_data(): 200,0,2017-07-01,1009393,2.01,2017-06-29,D,USA,5814,22102,,F,""" df1 = ps.from_pandas(pd.read_csv(StringIO(data), sep=",")) df2 = df1.copy() - compare_acct = SparkCompare(df1, df2, join_columns=["acct_id"]) + compare_acct = SparkPandasCompare(df1, df2, join_columns=["acct_id"]) assert compare_acct.matches() - compare_unq = SparkCompare( + compare_unq = SparkPandasCompare( df1, df2, join_columns=["acct_id", "acct_sfx_num", "trxn_post_dt", "trxn_post_seq_num"], @@ -767,13 +767,13 @@ def test_dupes_from_real_data(): def test_strings_with_joins_with_ignore_spaces(): df1 = ps.DataFrame([{"a": "hi", "b": " A"}, {"a": "bye", "b": "A"}]) df2 = ps.DataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "A "}]) - compare = SparkCompare(df1, df2, "a", ignore_spaces=False) + compare = SparkPandasCompare(df1, df2, "a", ignore_spaces=False) assert not compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() assert not compare.intersect_rows_match() - compare = SparkCompare(df1, df2, "a", ignore_spaces=True) + compare = SparkPandasCompare(df1, df2, "a", ignore_spaces=True) assert compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() @@ -784,13 +784,13 @@ def test_strings_with_joins_with_ignore_spaces(): def test_strings_with_joins_with_ignore_case(): df1 = ps.DataFrame([{"a": "hi", "b": "a"}, {"a": "bye", "b": "A"}]) df2 = ps.DataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "a"}]) - compare = SparkCompare(df1, df2, "a", ignore_case=False) + compare = SparkPandasCompare(df1, df2, "a", ignore_case=False) assert not compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() assert not compare.intersect_rows_match() - compare = SparkCompare(df1, df2, "a", ignore_case=True) + compare = SparkPandasCompare(df1, df2, "a", ignore_case=True) assert compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() @@ -801,13 +801,13 @@ def test_strings_with_joins_with_ignore_case(): def test_decimal_with_joins_with_ignore_spaces(): df1 = ps.DataFrame([{"a": 1, "b": " A"}, {"a": 2, "b": "A"}]) df2 = ps.DataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "A "}]) - compare = SparkCompare(df1, df2, "a", ignore_spaces=False) + compare = SparkPandasCompare(df1, df2, "a", ignore_spaces=False) assert not compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() assert not compare.intersect_rows_match() - compare = SparkCompare(df1, df2, "a", ignore_spaces=True) + compare = SparkPandasCompare(df1, df2, "a", 
ignore_spaces=True) assert compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() @@ -818,13 +818,13 @@ def test_decimal_with_joins_with_ignore_spaces(): def test_decimal_with_joins_with_ignore_case(): df1 = ps.DataFrame([{"a": 1, "b": "a"}, {"a": 2, "b": "A"}]) df2 = ps.DataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "a"}]) - compare = SparkCompare(df1, df2, "a", ignore_case=False) + compare = SparkPandasCompare(df1, df2, "a", ignore_case=False) assert not compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() assert not compare.intersect_rows_match() - compare = SparkCompare(df1, df2, "a", ignore_case=True) + compare = SparkPandasCompare(df1, df2, "a", ignore_case=True) assert compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() @@ -836,7 +836,7 @@ def test_joins_with_ignore_spaces(): df1 = ps.DataFrame([{"a": 1, "b": " A"}, {"a": 2, "b": "A"}]) df2 = ps.DataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "A "}]) - compare = SparkCompare(df1, df2, "a", ignore_spaces=True) + compare = SparkPandasCompare(df1, df2, "a", ignore_spaces=True) assert compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() @@ -848,7 +848,7 @@ def test_joins_with_ignore_case(): df1 = ps.DataFrame([{"a": 1, "b": "a"}, {"a": 2, "b": "A"}]) df2 = ps.DataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "a"}]) - compare = SparkCompare(df1, df2, "a", ignore_case=True) + compare = SparkPandasCompare(df1, df2, "a", ignore_case=True) assert compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() @@ -859,13 +859,13 @@ def test_joins_with_ignore_case(): def test_strings_with_ignore_spaces_and_join_columns(): df1 = ps.DataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "A"}]) df2 = ps.DataFrame([{"a": " hi ", "b": "A"}, {"a": " bye ", "b": "A"}]) - compare = SparkCompare(df1, df2, "a", ignore_spaces=False) + compare = SparkPandasCompare(df1, df2, "a", ignore_spaces=False) assert not compare.matches() assert compare.all_columns_match() assert not compare.all_rows_overlap() assert compare.count_matching_rows() == 0 - compare = SparkCompare(df1, df2, "a", ignore_spaces=True) + compare = SparkPandasCompare(df1, df2, "a", ignore_spaces=True) assert compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() @@ -877,14 +877,14 @@ def test_strings_with_ignore_spaces_and_join_columns(): def test_integers_with_ignore_spaces_and_join_columns(): df1 = ps.DataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "A"}]) df2 = ps.DataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "A"}]) - compare = SparkCompare(df1, df2, "a", ignore_spaces=False) + compare = SparkPandasCompare(df1, df2, "a", ignore_spaces=False) assert compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() assert compare.intersect_rows_match() assert compare.count_matching_rows() == 2 - compare = SparkCompare(df1, df2, "a", ignore_spaces=True) + compare = SparkPandasCompare(df1, df2, "a", ignore_spaces=True) assert compare.matches() assert compare.all_columns_match() assert compare.all_rows_overlap() @@ -915,7 +915,7 @@ def test_sample_mismatch(): df1 = ps.from_pandas(pd.read_csv(StringIO(data1), sep=",")) df2 = ps.from_pandas(pd.read_csv(StringIO(data2), sep=",")) - compare = SparkCompare(df1, df2, "acct_id") + compare = SparkPandasCompare(df1, df2, "acct_id") output = compare.sample_mismatch(column="name", sample_count=1) assert output.shape[0] == 1 @@ -951,7 +951,7 @@ def 
test_all_mismatch_not_ignore_matching_cols_no_cols_matching(): """ df1 = ps.from_pandas(pd.read_csv(StringIO(data1), sep=",")) df2 = ps.from_pandas(pd.read_csv(StringIO(data2), sep=",")) - compare = SparkCompare(df1, df2, "acct_id") + compare = SparkPandasCompare(df1, df2, "acct_id") output = compare.all_mismatch() assert output.shape[0] == 4 @@ -992,7 +992,7 @@ def test_all_mismatch_not_ignore_matching_cols_some_cols_matching(): """ df1 = ps.from_pandas(pd.read_csv(StringIO(data1), sep=",")) df2 = ps.from_pandas(pd.read_csv(StringIO(data2), sep=",")) - compare = SparkCompare(df1, df2, "acct_id") + compare = SparkPandasCompare(df1, df2, "acct_id") output = compare.all_mismatch() assert output.shape[0] == 4 @@ -1034,7 +1034,7 @@ def test_all_mismatch_ignore_matching_cols_some_cols_matching_diff_rows(): """ df1 = ps.from_pandas(pd.read_csv(StringIO(data1), sep=",")) df2 = ps.from_pandas(pd.read_csv(StringIO(data2), sep=",")) - compare = SparkCompare(df1, df2, "acct_id") + compare = SparkPandasCompare(df1, df2, "acct_id") output = compare.all_mismatch(ignore_matching_cols=True) @@ -1073,7 +1073,7 @@ def test_all_mismatch_ignore_matching_cols_some_calls_matching(): """ df1 = ps.from_pandas(pd.read_csv(StringIO(data1), sep=",")) df2 = ps.from_pandas(pd.read_csv(StringIO(data2), sep=",")) - compare = SparkCompare(df1, df2, "acct_id") + compare = SparkPandasCompare(df1, df2, "acct_id") output = compare.all_mismatch(ignore_matching_cols=True) @@ -1111,7 +1111,7 @@ def test_all_mismatch_ignore_matching_cols_no_cols_matching(): """ df1 = ps.from_pandas(pd.read_csv(StringIO(data1), sep=",")) df2 = ps.from_pandas(pd.read_csv(StringIO(data2), sep=",")) - compare = SparkCompare(df1, df2, "acct_id") + compare = SparkPandasCompare(df1, df2, "acct_id") output = compare.all_mismatch() assert output.shape[0] == 4 @@ -1182,7 +1182,7 @@ def test_dupes_with_nulls_strings(): "fld_3": [1, 2, 3, 4, 5], } ) - comp = SparkCompare(df1, df2, join_columns=["fld_1", "fld_2"]) + comp = SparkPandasCompare(df1, df2, join_columns=["fld_1", "fld_2"]) assert comp.subset() @@ -1202,7 +1202,7 @@ def test_dupes_with_nulls_ints(): "fld_3": [1, 2, 3, 4, 5], } ) - comp = SparkCompare(df1, df2, join_columns=["fld_1", "fld_2"]) + comp = SparkPandasCompare(df1, df2, join_columns=["fld_1", "fld_2"]) assert comp.subset() @@ -1242,26 +1242,28 @@ def test_lower(): # should match df1 = ps.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]}) df2 = ps.DataFrame({"a": [1, 2, 3], "B": [0, 1, 2]}) - compare = SparkCompare(df1, df2, join_columns=["a"]) + compare = SparkPandasCompare(df1, df2, join_columns=["a"]) assert compare.matches() # should not match df1 = ps.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]}) df2 = ps.DataFrame({"a": [1, 2, 3], "B": [0, 1, 2]}) - compare = SparkCompare(df1, df2, join_columns=["a"], cast_column_names_lower=False) + compare = SparkPandasCompare( + df1, df2, join_columns=["a"], cast_column_names_lower=False + ) assert not compare.matches() # test join column # should match df1 = ps.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]}) df2 = ps.DataFrame({"A": [1, 2, 3], "B": [0, 1, 2]}) - compare = SparkCompare(df1, df2, join_columns=["a"]) + compare = SparkPandasCompare(df1, df2, join_columns=["a"]) assert compare.matches() # should fail because "a" is not found in df2 df1 = ps.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]}) df2 = ps.DataFrame({"A": [1, 2, 3], "B": [0, 1, 2]}) expected_message = "df2 must have all columns from join_columns" with raises(ValueError, match=expected_message): - compare = SparkCompare( + compare = 
SparkPandasCompare( df1, df2, join_columns=["a"], cast_column_names_lower=False ) @@ -1271,19 +1273,19 @@ def test_integer_column_names(): """This function tests that integer column names would also work""" df1 = ps.DataFrame({1: [1, 2, 3], 2: [0, 1, 2]}) df2 = ps.DataFrame({1: [1, 2, 3], 2: [0, 1, 2]}) - compare = SparkCompare(df1, df2, join_columns=[1]) + compare = SparkPandasCompare(df1, df2, join_columns=[1]) assert compare.matches() @pandas_version -@mock.patch("datacompy.spark.render") +@mock.patch("datacompy.spark.pandas.render") def test_save_html(mock_render): df1 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}]) df2 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}]) - compare = SparkCompare(df1, df2, join_columns=["a"]) + compare = SparkPandasCompare(df1, df2, join_columns=["a"]) m = mock.mock_open() - with mock.patch("datacompy.spark.open", m, create=True): + with mock.patch("datacompy.spark.pandas.open", m, create=True): # assert without HTML call compare.report() assert mock_render.call_count == 4 @@ -1291,7 +1293,7 @@ def test_save_html(mock_render): mock_render.reset_mock() m = mock.mock_open() - with mock.patch("datacompy.spark.open", m, create=True): + with mock.patch("datacompy.spark.pandas.open", m, create=True): # assert with HTML call compare.report(html_file="test.html") assert mock_render.call_count == 4 @@ -1304,10 +1306,10 @@ def test_pandas_version(): df2 = ps.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}]) with mock.patch("pandas.__version__", "2.0.0"): with raises(Exception, match=re.escape(expected_message)): - SparkCompare(df1, df2, join_columns=["a"]) + SparkPandasCompare(df1, df2, join_columns=["a"]) with mock.patch("pandas.__version__", "1.5.3"): - SparkCompare(df1, df2, join_columns=["a"]) + SparkPandasCompare(df1, df2, join_columns=["a"]) @pandas_version @@ -1318,7 +1320,7 @@ def test_unicode_columns(): df2 = ps.DataFrame( [{"a": 1, "例": 2, "予測対象日": "test"}, {"a": 1, "例": 3, "予測対象日": "test"}] ) - compare = SparkCompare(df1, df2, join_columns=["例"]) + compare = SparkPandasCompare(df1, df2, join_columns=["例"]) assert compare.matches() # Just render the report to make sure it renders. compare.report() diff --git a/tests/test_spark/test_sql_spark.py b/tests/test_spark/test_sql_spark.py new file mode 100644 index 00000000..8fb4cfcf --- /dev/null +++ b/tests/test_spark/test_sql_spark.py @@ -0,0 +1,1343 @@ +# +# Copyright 2024 Capital One Services, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
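Editor's note: the new module below tests ``SparkSQLCompare``, which works on native pyspark.sql DataFrames and takes the ``SparkSession`` as its first argument. A hedged sketch of the call pattern these tests (and the updated usage docs earlier in this patch) exercise; the frames are illustrative only.

.. code-block:: python

    from pyspark.sql import SparkSession
    from datacompy import SparkSQLCompare

    spark = SparkSession.builder.getOrCreate()
    df1 = spark.createDataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}])
    df2 = spark.createDataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 3}])

    compare = SparkSQLCompare(spark, df1, df2, join_columns="a")
    compare.matches()        # False: the a=2 rows disagree on column b
    print(compare.report())  # text report, as with the other Compare classes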
+ +""" +Testing out the datacompy functionality +""" + +import io +import logging +import re +import sys +from datetime import datetime +from decimal import Decimal +from io import StringIO +from unittest import mock + +import numpy as np +import pandas as pd +import pytest +from pytest import raises + +pytest.importorskip("pyspark") + +from pandas.testing import assert_series_equal # noqa: E402 + +from datacompy.spark.sql import ( # noqa: E402 + SparkSQLCompare, + _generate_id_within_group, + calculate_max_diff, + columns_equal, + temp_column_name, +) + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + +pd.DataFrame.iteritems = pd.DataFrame.items # Pandas 2+ compatability +np.bool = np.bool_ # Numpy 1.24.3+ comptability + + +def test_numeric_columns_equal_abs(spark_session): + data = """a|b|expected +1|1|True +2|2.1|True +3|4|False +4|NULL|False +NULL|4|False +NULL|NULL|True""" + + df = spark_session.createDataFrame(pd.read_csv(StringIO(data), sep="|")) + actual_out = columns_equal(df, "a", "b", "actual", abs_tol=0.2).toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + + +def test_numeric_columns_equal_rel(spark_session): + data = """a|b|expected +1|1|True +2|2.1|True +3|4|False +4|NULL|False +NULL|4|False +NULL|NULL|True""" + df = spark_session.createDataFrame(pd.read_csv(StringIO(data), sep="|")) + actual_out = columns_equal(df, "a", "b", "actual", rel_tol=0.2).toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + + +def test_string_columns_equal(spark_session): + data = """a|b|expected +Hi|Hi|True +Yo|Yo|True +Hey|Hey |False +résumé|resume|False +résumé|résumé|True +💩|💩|True +💩|🤔|False + | |True + | |False +datacompy|DataComPy|False +something||False +|something|False +||True""" + df = spark_session.createDataFrame(pd.read_csv(StringIO(data), sep="|")) + actual_out = columns_equal(df, "a", "b", "actual", rel_tol=0.2).toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + + +def test_string_columns_equal_with_ignore_spaces(spark_session): + data = """a|b|expected +Hi|Hi|True +Yo|Yo|True +Hey|Hey |True +résumé|resume|False +résumé|résumé|True +💩|💩|True +💩|🤔|False + | |True + | |True +datacompy|DataComPy|False +something||False +|something|False +||True""" + df = spark_session.createDataFrame(pd.read_csv(StringIO(data), sep="|")) + actual_out = columns_equal( + df, "a", "b", "actual", rel_tol=0.2, ignore_spaces=True + ).toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + + +def test_string_columns_equal_with_ignore_spaces_and_case(spark_session): + data = """a|b|expected +Hi|Hi|True +Yo|Yo|True +Hey|Hey |True +résumé|resume|False +résumé|résumé|True +💩|💩|True +💩|🤔|False + | |True + | |True +datacompy|DataComPy|True +something||False +|something|False +||True""" + df = spark_session.createDataFrame(pd.read_csv(StringIO(data), sep="|")) + actual_out = columns_equal( + df, "a", "b", "actual", rel_tol=0.2, ignore_spaces=True, ignore_case=True + ).toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + + +def test_date_columns_equal(spark_session): + data = """a|b|expected +2017-01-01|2017-01-01|True 
+2017-01-02|2017-01-02|True +2017-10-01|2017-10-10|False +2017-01-01||False +|2017-01-01|False +||True""" + pdf = pd.read_csv(io.StringIO(data), sep="|") + df = spark_session.createDataFrame(pdf) + # First compare just the strings + actual_out = columns_equal(df, "a", "b", "actual", rel_tol=0.2).toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + + # Then compare converted to datetime objects + pdf["a"] = pd.to_datetime(pdf["a"]) + pdf["b"] = pd.to_datetime(pdf["b"]) + df = spark_session.createDataFrame(pdf) + actual_out = columns_equal(df, "a", "b", "actual", rel_tol=0.2).toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + # and reverse + actual_out_rev = columns_equal(df, "b", "a", "actual", rel_tol=0.2).toPandas()[ + "actual" + ] + assert_series_equal(expect_out, actual_out_rev, check_names=False) + + +def test_date_columns_equal_with_ignore_spaces(spark_session): + data = """a|b|expected +2017-01-01|2017-01-01 |True +2017-01-02 |2017-01-02|True +2017-10-01 |2017-10-10 |False +2017-01-01||False +|2017-01-01|False +||True""" + pdf = pd.read_csv(io.StringIO(data), sep="|") + df = spark_session.createDataFrame(pdf) + # First compare just the strings + actual_out = columns_equal( + df, "a", "b", "actual", rel_tol=0.2, ignore_spaces=True + ).toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + + # Then compare converted to datetime objects + try: # pandas 2 + pdf["a"] = pd.to_datetime(pdf["a"], format="mixed") + pdf["b"] = pd.to_datetime(pdf["b"], format="mixed") + except ValueError: # pandas 1.5 + pdf["a"] = pd.to_datetime(pdf["a"]) + pdf["b"] = pd.to_datetime(pdf["b"]) + df = spark_session.createDataFrame(pdf) + actual_out = columns_equal( + df, "a", "b", "actual", rel_tol=0.2, ignore_spaces=True + ).toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + # and reverse + actual_out_rev = columns_equal( + df, "b", "a", "actual", rel_tol=0.2, ignore_spaces=True + ).toPandas()["actual"] + assert_series_equal(expect_out, actual_out_rev, check_names=False) + + +def test_date_columns_equal_with_ignore_spaces_and_case(spark_session): + data = """a|b|expected +2017-01-01|2017-01-01 |True +2017-01-02 |2017-01-02|True +2017-10-01 |2017-10-10 |False +2017-01-01||False +|2017-01-01|False +||True""" + pdf = pd.read_csv(io.StringIO(data), sep="|") + df = spark_session.createDataFrame(pdf) + # First compare just the strings + actual_out = columns_equal( + df, "a", "b", "actual", rel_tol=0.2, ignore_spaces=True, ignore_case=True + ).toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + + # Then compare converted to datetime objects + try: # pandas 2 + pdf["a"] = pd.to_datetime(pdf["a"], format="mixed") + pdf["b"] = pd.to_datetime(pdf["b"], format="mixed") + except ValueError: # pandas 1.5 + pdf["a"] = pd.to_datetime(pdf["a"]) + pdf["b"] = pd.to_datetime(pdf["b"]) + df = spark_session.createDataFrame(pdf) + actual_out = columns_equal( + df, "a", "b", "actual", rel_tol=0.2, ignore_spaces=True, ignore_case=True + ).toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, 
check_names=False) + # and reverse + actual_out_rev = columns_equal( + df, "b", "a", "actual", rel_tol=0.2, ignore_spaces=True, ignore_case=True + ).toPandas()["actual"] + assert_series_equal(expect_out, actual_out_rev, check_names=False) + + +def test_date_columns_unequal(spark_session): + """I want datetime fields to match with dates stored as strings""" + data = [{"a": "2017-01-01", "b": "2017-01-02"}, {"a": "2017-01-01"}] + pdf = pd.DataFrame(data) + pdf["a_dt"] = pd.to_datetime(pdf["a"]) + pdf["b_dt"] = pd.to_datetime(pdf["b"]) + df = spark_session.createDataFrame(pdf) + assert columns_equal(df, "a", "a_dt", "actual").toPandas()["actual"].all() + assert columns_equal(df, "b", "b_dt", "actual").toPandas()["actual"].all() + assert columns_equal(df, "a_dt", "a", "actual").toPandas()["actual"].all() + assert columns_equal(df, "b_dt", "b", "actual").toPandas()["actual"].all() + assert not columns_equal(df, "b_dt", "a", "actual").toPandas()["actual"].any() + assert not columns_equal(df, "a_dt", "b", "actual").toPandas()["actual"].any() + assert not columns_equal(df, "a", "b_dt", "actual").toPandas()["actual"].any() + assert not columns_equal(df, "b", "a_dt", "actual").toPandas()["actual"].any() + + +def test_bad_date_columns(spark_session): + """If strings can't be coerced into dates then it should be false for the + whole column. + """ + data = [ + {"a": "2017-01-01", "b": "2017-01-01"}, + {"a": "2017-01-01", "b": "217-01-01"}, + ] + pdf = pd.DataFrame(data) + pdf["a_dt"] = pd.to_datetime(pdf["a"]) + df = spark_session.createDataFrame(pdf) + assert not columns_equal(df, "a_dt", "b", "actual").toPandas()["actual"].all() + assert columns_equal(df, "a_dt", "b", "actual").toPandas()["actual"].any() + + +def test_rounded_date_columns(spark_session): + """If strings can't be coerced into dates then it should be false for the + whole column. 
+ """ + data = [ + {"a": "2017-01-01", "b": "2017-01-01 00:00:00.000000", "exp": True}, + {"a": "2017-01-01", "b": "2017-01-01 00:00:00.123456", "exp": False}, + {"a": "2017-01-01", "b": "2017-01-01 00:00:01.000000", "exp": False}, + {"a": "2017-01-01", "b": "2017-01-01 00:00:00", "exp": True}, + ] + pdf = pd.DataFrame(data) + pdf["a_dt"] = pd.to_datetime(pdf["a"]) + df = spark_session.createDataFrame(pdf) + actual = columns_equal(df, "a_dt", "b", "actual").toPandas()["actual"] + expected = df.select("exp").toPandas()["exp"] + assert_series_equal(actual, expected, check_names=False) + + +def test_decimal_float_columns_equal(spark_session): + data = [ + {"a": Decimal("1"), "b": 1, "expected": True}, + {"a": Decimal("1.3"), "b": 1.3, "expected": True}, + {"a": Decimal("1.000003"), "b": 1.000003, "expected": True}, + {"a": Decimal("1.000000004"), "b": 1.000000003, "expected": False}, + {"a": Decimal("1.3"), "b": 1.2, "expected": False}, + {"a": np.nan, "b": np.nan, "expected": True}, + {"a": np.nan, "b": 1, "expected": False}, + {"a": Decimal("1"), "b": np.nan, "expected": False}, + ] + pdf = pd.DataFrame(data) + df = spark_session.createDataFrame(pdf) + actual_out = columns_equal(df, "a", "b", "actual").toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + + +def test_decimal_float_columns_equal_rel(spark_session): + data = [ + {"a": Decimal("1"), "b": 1, "expected": True}, + {"a": Decimal("1.3"), "b": 1.3, "expected": True}, + {"a": Decimal("1.000003"), "b": 1.000003, "expected": True}, + {"a": Decimal("1.000000004"), "b": 1.000000003, "expected": True}, + {"a": Decimal("1.3"), "b": 1.2, "expected": False}, + {"a": np.nan, "b": np.nan, "expected": True}, + {"a": np.nan, "b": 1, "expected": False}, + {"a": Decimal("1"), "b": np.nan, "expected": False}, + ] + pdf = pd.DataFrame(data) + df = spark_session.createDataFrame(pdf) + actual_out = columns_equal(df, "a", "b", "actual", abs_tol=0.001).toPandas()[ + "actual" + ] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + + +def test_decimal_columns_equal(spark_session): + data = [ + {"a": Decimal("1"), "b": Decimal("1"), "expected": True}, + {"a": Decimal("1.3"), "b": Decimal("1.3"), "expected": True}, + {"a": Decimal("1.000003"), "b": Decimal("1.000003"), "expected": True}, + { + "a": Decimal("1.000000004"), + "b": Decimal("1.000000003"), + "expected": False, + }, + {"a": Decimal("1.3"), "b": Decimal("1.2"), "expected": False}, + {"a": np.nan, "b": np.nan, "expected": True}, + {"a": np.nan, "b": Decimal("1"), "expected": False}, + {"a": Decimal("1"), "b": np.nan, "expected": False}, + ] + pdf = pd.DataFrame(data) + df = spark_session.createDataFrame(pdf) + actual_out = columns_equal(df, "a", "b", "actual").toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + + +def test_decimal_columns_equal_rel(spark_session): + data = [ + {"a": Decimal("1"), "b": Decimal("1"), "expected": True}, + {"a": Decimal("1.3"), "b": Decimal("1.3"), "expected": True}, + {"a": Decimal("1.000003"), "b": Decimal("1.000003"), "expected": True}, + { + "a": Decimal("1.000000004"), + "b": Decimal("1.000000003"), + "expected": True, + }, + {"a": Decimal("1.3"), "b": Decimal("1.2"), "expected": False}, + {"a": np.nan, "b": np.nan, "expected": True}, + {"a": np.nan, "b": Decimal("1"), "expected": False}, + {"a": 
Decimal("1"), "b": np.nan, "expected": False}, + ] + pdf = pd.DataFrame(data) + df = spark_session.createDataFrame(pdf) + actual_out = columns_equal(df, "a", "b", "actual", abs_tol=0.001).toPandas()[ + "actual" + ] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + + +def test_infinity_and_beyond(spark_session): + # https://spark.apache.org/docs/latest/sql-ref-datatypes.html#positivenegative-infinity-semantics + # Positive/negative infinity multiplied by 0 returns NaN. + # Positive infinity sorts lower than NaN and higher than any other values. + # Negative infinity sorts lower than any other values. + data = [ + {"a": np.inf, "b": np.inf, "expected": True}, + {"a": -np.inf, "b": -np.inf, "expected": True}, + {"a": -np.inf, "b": np.inf, "expected": True}, + {"a": np.inf, "b": -np.inf, "expected": True}, + {"a": 1, "b": 1, "expected": True}, + {"a": 1, "b": 0, "expected": False}, + ] + pdf = pd.DataFrame(data) + df = spark_session.createDataFrame(pdf) + actual_out = columns_equal(df, "a", "b", "actual").toPandas()["actual"] + expect_out = df.select("expected").toPandas()["expected"] + assert_series_equal(expect_out, actual_out, check_names=False) + + +def test_compare_df_setter_bad(spark_session): + pdf = pd.DataFrame([{"a": 1, "c": 2}, {"a": 2, "c": 2}]) + df = spark_session.createDataFrame(pdf) + with raises(TypeError, match="df1 must be a pyspark.sql.DataFrame"): + SparkSQLCompare(spark_session, "a", "a", ["a"]) + with raises(ValueError, match="df1 must have all columns from join_columns"): + SparkSQLCompare(spark_session, df, df.select("*"), ["b"]) + pdf = pd.DataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 3}]) + df_dupe = spark_session.createDataFrame(pdf) + assert ( + SparkSQLCompare(spark_session, df_dupe, df_dupe.select("*"), ["a", "b"]) + .df1.toPandas() + .equals(pdf) + ) + + +def test_compare_df_setter_good(spark_session): + df1 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}]) + df2 = spark_session.createDataFrame([{"A": 1, "B": 2}, {"A": 2, "B": 3}]) + compare = SparkSQLCompare(spark_session, df1, df2, ["a"]) + assert compare.df1.toPandas().equals(df1.toPandas()) + assert not compare.df2.toPandas().equals(df2.toPandas()) + assert compare.join_columns == ["a"] + compare = SparkSQLCompare(spark_session, df1, df2, ["A", "b"]) + assert compare.df1.toPandas().equals(df1.toPandas()) + assert not compare.df2.toPandas().equals(df2.toPandas()) + assert compare.join_columns == ["a", "b"] + + +def test_compare_df_setter_different_cases(spark_session): + df1 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}]) + df2 = spark_session.createDataFrame([{"A": 1, "b": 2}, {"A": 2, "b": 3}]) + compare = SparkSQLCompare(spark_session, df1, df2, ["a"]) + assert compare.df1.toPandas().equals(df1.toPandas()) + assert not compare.df2.toPandas().equals(df2.toPandas()) + + +def test_columns_overlap(spark_session): + df1 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}]) + df2 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 3}]) + compare = SparkSQLCompare(spark_session, df1, df2, ["a"]) + assert compare.df1_unq_columns() == set() + assert compare.df2_unq_columns() == set() + assert compare.intersect_columns() == {"a", "b"} + + +def test_columns_no_overlap(spark_session): + df1 = spark_session.createDataFrame( + [{"a": 1, "b": 2, "c": "hi"}, {"a": 2, "b": 2, "c": "yo"}] + ) + df2 = spark_session.createDataFrame( + [{"a": 1, "b": 2, "d": "oh"}, {"a": 2, "b": 3, 
"d": "ya"}] + ) + compare = SparkSQLCompare(spark_session, df1, df2, ["a"]) + assert compare.df1_unq_columns() == {"c"} + assert compare.df2_unq_columns() == {"d"} + assert compare.intersect_columns() == {"a", "b"} + + +def test_columns_maintain_order_through_set_operations(spark_session): + pdf1 = pd.DataFrame( + { + "join": ["A", "B"], + "f": [0, 0], + "g": [1, 2], + "b": [2, 2], + "h": [3, 3], + "a": [4, 4], + "c": [-2, -3], + } + ) + pdf2 = pd.DataFrame( + { + "join": ["A", "B"], + "e": [0, 1], + "h": [1, 2], + "b": [2, 3], + "a": [-1, -1], + "g": [4, 4], + "d": [-3, -2], + } + ) + df1 = spark_session.createDataFrame(pdf1) + df2 = spark_session.createDataFrame(pdf2) + compare = SparkSQLCompare(spark_session, df1, df2, ["join"]) + assert list(compare.df1_unq_columns()) == ["f", "c"] + assert list(compare.df2_unq_columns()) == ["e", "d"] + assert list(compare.intersect_columns()) == ["join", "g", "b", "h", "a"] + + +def test_10k_rows(spark_session): + pdf = pd.DataFrame(np.random.randint(0, 100, size=(10000, 2)), columns=["b", "c"]) + pdf.reset_index(inplace=True) + pdf.columns = ["a", "b", "c"] + pdf2 = pdf.copy() + pdf2["b"] = pdf2["b"] + 0.1 + df1 = spark_session.createDataFrame(pdf) + df2 = spark_session.createDataFrame(pdf2) + compare_tol = SparkSQLCompare(spark_session, df1, df2, ["a"], abs_tol=0.2) + assert compare_tol.matches() + assert compare_tol.df1_unq_rows.count() == 0 + assert compare_tol.df2_unq_rows.count() == 0 + assert compare_tol.intersect_columns() == {"a", "b", "c"} + assert compare_tol.all_columns_match() + assert compare_tol.all_rows_overlap() + assert compare_tol.intersect_rows_match() + + compare_no_tol = SparkSQLCompare(spark_session, df1, df2, ["a"]) + assert not compare_no_tol.matches() + assert compare_no_tol.df1_unq_rows.count() == 0 + assert compare_no_tol.df2_unq_rows.count() == 0 + assert compare_no_tol.intersect_columns() == {"a", "b", "c"} + assert compare_no_tol.all_columns_match() + assert compare_no_tol.all_rows_overlap() + assert not compare_no_tol.intersect_rows_match() + + +def test_subset(spark_session, caplog): + caplog.set_level(logging.DEBUG) + df1 = spark_session.createDataFrame( + [{"a": 1, "b": 2, "c": "hi"}, {"a": 2, "b": 2, "c": "yo"}] + ) + df2 = spark_session.createDataFrame([{"a": 1, "c": "hi"}]) + comp = SparkSQLCompare(spark_session, df1, df2, ["a"]) + assert comp.subset() + + +def test_not_subset(spark_session, caplog): + caplog.set_level(logging.INFO) + df1 = spark_session.createDataFrame( + [{"a": 1, "b": 2, "c": "hi"}, {"a": 2, "b": 2, "c": "yo"}] + ) + df2 = spark_session.createDataFrame( + [{"a": 1, "b": 2, "c": "hi"}, {"a": 2, "b": 2, "c": "great"}] + ) + comp = SparkSQLCompare(spark_session, df1, df2, ["a"]) + assert not comp.subset() + assert "c: 1 / 2 (50.00%) match" in caplog.text + + +def test_large_subset(spark_session): + pdf = pd.DataFrame(np.random.randint(0, 100, size=(10000, 2)), columns=["b", "c"]) + pdf.reset_index(inplace=True) + pdf.columns = ["a", "b", "c"] + pdf2 = pdf[["a", "b"]].head(50).copy() + df1 = spark_session.createDataFrame(pdf) + df2 = spark_session.createDataFrame(pdf2) + comp = SparkSQLCompare(spark_session, df1, df2, ["a"]) + assert not comp.matches() + assert comp.subset() + + +def test_string_joiner(spark_session): + df1 = spark_session.createDataFrame([{"ab": 1, "bc": 2}, {"ab": 2, "bc": 2}]) + df2 = spark_session.createDataFrame([{"ab": 1, "bc": 2}, {"ab": 2, "bc": 2}]) + compare = SparkSQLCompare(spark_session, df1, df2, "ab") + assert compare.matches() + + +def 
test_decimal_with_joins(spark_session): + df1 = spark_session.createDataFrame( + [{"a": Decimal("1"), "b": 2}, {"a": Decimal("2"), "b": 2}] + ) + df2 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 2}]) + compare = SparkSQLCompare(spark_session, df1, df2, "a") + assert compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert compare.intersect_rows_match() + + +def test_decimal_with_nulls(spark_session): + df1 = spark_session.createDataFrame( + [{"a": 1, "b": Decimal("2")}, {"a": 2, "b": Decimal("2")}] + ) + df2 = spark_session.createDataFrame( + [{"a": 1, "b": 2}, {"a": 2, "b": 2}, {"a": 3, "b": 2}] + ) + compare = SparkSQLCompare(spark_session, df1, df2, "a") + assert not compare.matches() + assert compare.all_columns_match() + assert not compare.all_rows_overlap() + assert compare.intersect_rows_match() + + +def test_strings_with_joins(spark_session): + df1 = spark_session.createDataFrame([{"a": "hi", "b": 2}, {"a": "bye", "b": 2}]) + df2 = spark_session.createDataFrame([{"a": "hi", "b": 2}, {"a": "bye", "b": 2}]) + compare = SparkSQLCompare(spark_session, df1, df2, "a") + assert compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert compare.intersect_rows_match() + + +def test_temp_column_name(spark_session): + df1 = spark_session.createDataFrame([{"a": "hi", "b": 2}, {"a": "bye", "b": 2}]) + df2 = spark_session.createDataFrame( + [{"a": "hi", "b": 2}, {"a": "bye", "b": 2}, {"a": "back fo mo", "b": 3}] + ) + actual = temp_column_name(df1, df2) + assert actual == "_temp_0" + + +def test_temp_column_name_one_has(spark_session): + df1 = spark_session.createDataFrame( + [{"_temp_0": "hi", "b": 2}, {"_temp_0": "bye", "b": 2}] + ) + df2 = spark_session.createDataFrame( + [{"a": "hi", "b": 2}, {"a": "bye", "b": 2}, {"a": "back fo mo", "b": 3}] + ) + actual = temp_column_name(df1, df2) + assert actual == "_temp_1" + + +def test_temp_column_name_both_have_temp_1(spark_session): + df1 = spark_session.createDataFrame( + [{"_temp_0": "hi", "b": 2}, {"_temp_0": "bye", "b": 2}] + ) + df2 = spark_session.createDataFrame( + [ + {"_temp_0": "hi", "b": 2}, + {"_temp_0": "bye", "b": 2}, + {"a": "back fo mo", "b": 3}, + ] + ) + actual = temp_column_name(df1, df2) + assert actual == "_temp_1" + + +def test_temp_column_name_both_have_temp_2(spark_session): + df1 = spark_session.createDataFrame( + [{"_temp_0": "hi", "b": 2}, {"_temp_0": "bye", "b": 2}] + ) + df2 = spark_session.createDataFrame( + [ + {"_temp_0": "hi", "b": 2}, + {"_temp_1": "bye", "b": 2}, + {"a": "back fo mo", "b": 3}, + ] + ) + actual = temp_column_name(df1, df2) + assert actual == "_temp_2" + + +def test_temp_column_name_one_already(spark_session): + df1 = spark_session.createDataFrame( + [{"_temp_1": "hi", "b": 2}, {"_temp_1": "bye", "b": 2}] + ) + df2 = spark_session.createDataFrame( + [ + {"_temp_1": "hi", "b": 2}, + {"_temp_1": "bye", "b": 2}, + {"a": "back fo mo", "b": 3}, + ] + ) + actual = temp_column_name(df1, df2) + assert actual == "_temp_0" + + +### Duplicate testing! + + +def test_simple_dupes_one_field(spark_session): + df1 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}]) + df2 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}]) + compare = SparkSQLCompare(spark_session, df1, df2, join_columns=["a"]) + assert compare.matches() + # Just render the report to make sure it renders. 
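Editor's note: the ``temp_column_name`` tests above pin down the helper's contract: it returns the first ``_temp_<i>`` name not already present in any of the supplied DataFrames, which the compare logic uses internally as a scratch column name. A small sketch under the same assumptions (Spark DataFrames, import path from this patch; data is illustrative).

.. code-block:: python

    from pyspark.sql import SparkSession
    from datacompy.spark.sql import temp_column_name

    spark = SparkSession.builder.getOrCreate()
    df1 = spark.createDataFrame([{"_temp_0": "hi", "b": 2}])
    df2 = spark.createDataFrame([{"a": "hi", "b": 2}])

    temp_column_name(df1, df2)  # "_temp_1" -- "_temp_0" is already taken in df1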
+ compare.report() + + +def test_simple_dupes_two_fields(spark_session): + df1 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 2}]) + df2 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 2}]) + compare = SparkSQLCompare(spark_session, df1, df2, join_columns=["a", "b"]) + assert compare.matches() + # Just render the report to make sure it renders. + compare.report() + + +def test_simple_dupes_one_field_two_vals_1(spark_session): + df1 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) + df2 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) + compare = SparkSQLCompare(spark_session, df1, df2, join_columns=["a"]) + assert compare.matches() + # Just render the report to make sure it renders. + compare.report() + + +def test_simple_dupes_one_field_two_vals_2(spark_session): + df1 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) + df2 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 2, "b": 0}]) + compare = SparkSQLCompare(spark_session, df1, df2, join_columns=["a"]) + assert not compare.matches() + assert compare.df1_unq_rows.count() == 1 + assert compare.df2_unq_rows.count() == 1 + assert compare.intersect_rows.count() == 1 + # Just render the report to make sure it renders. + compare.report() + + +def test_simple_dupes_one_field_three_to_two_vals(spark_session): + df1 = spark_session.createDataFrame( + [{"a": 1, "b": 2}, {"a": 1, "b": 0}, {"a": 1, "b": 0}] + ) + df2 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 0}]) + compare = SparkSQLCompare(spark_session, df1, df2, join_columns=["a"]) + assert not compare.matches() + assert compare.df1_unq_rows.count() == 1 + assert compare.df2_unq_rows.count() == 0 + assert compare.intersect_rows.count() == 2 + # Just render the report to make sure it renders. 
+ compare.report() + assert "(First 1 Columns)" in compare.report(column_count=1) + assert "(First 2 Columns)" in compare.report(column_count=2) + + +def test_dupes_from_real_data(spark_session): + data = """acct_id,acct_sfx_num,trxn_post_dt,trxn_post_seq_num,trxn_amt,trxn_dt,debit_cr_cd,cash_adv_trxn_comn_cntry_cd,mrch_catg_cd,mrch_pstl_cd,visa_mail_phn_cd,visa_rqstd_pmt_svc_cd,mc_pmt_facilitator_idn_num +100,0,2017-06-17,1537019,30.64,2017-06-15,D,CAN,5812,M2N5P5,,,0.0 +200,0,2017-06-24,1022477,485.32,2017-06-22,D,USA,4511,7114,7.0,1, +100,0,2017-06-17,1537039,2.73,2017-06-16,D,CAN,5812,M4J 1M9,,,0.0 +200,0,2017-06-29,1049223,22.41,2017-06-28,D,USA,4789,21211,,A, +100,0,2017-06-17,1537029,34.05,2017-06-16,D,CAN,5812,M4E 2C7,,,0.0 +200,0,2017-06-29,1049213,9.12,2017-06-28,D,CAN,5814,0,,, +100,0,2017-06-19,1646426,165.21,2017-06-17,D,CAN,5411,M4M 3H9,,,0.0 +200,0,2017-06-30,1233082,28.54,2017-06-29,D,USA,4121,94105,7.0,G, +100,0,2017-06-19,1646436,17.87,2017-06-18,D,CAN,5812,M4J 1M9,,,0.0 +200,0,2017-06-30,1233092,24.39,2017-06-29,D,USA,4121,94105,7.0,G, +100,0,2017-06-19,1646446,5.27,2017-06-17,D,CAN,5200,M4M 3G6,,,0.0 +200,0,2017-06-30,1233102,61.8,2017-06-30,D,CAN,4121,0,,, +100,0,2017-06-20,1607573,41.99,2017-06-19,D,CAN,5661,M4C1M9,,,0.0 +200,0,2017-07-01,1009403,2.31,2017-06-29,D,USA,5814,22102,,F, +100,0,2017-06-20,1607553,86.88,2017-06-19,D,CAN,4812,H2R3A8,,,0.0 +200,0,2017-07-01,1009423,5.5,2017-06-29,D,USA,5812,2903,,F, +100,0,2017-06-20,1607563,25.17,2017-06-19,D,CAN,5641,M4C 1M9,,,0.0 +200,0,2017-07-01,1009433,214.12,2017-06-29,D,USA,3640,20170,,A, +100,0,2017-06-20,1607593,1.67,2017-06-19,D,CAN,5814,M2N 6L7,,,0.0 +200,0,2017-07-01,1009393,2.01,2017-06-29,D,USA,5814,22102,,F,""" + df1 = spark_session.createDataFrame(pd.read_csv(StringIO(data), sep=",")) + df2 = df1.select("*") + compare_acct = SparkSQLCompare(spark_session, df1, df2, join_columns=["acct_id"]) + assert compare_acct.matches() + compare_unq = SparkSQLCompare( + spark_session, + df1, + df2, + join_columns=["acct_id", "acct_sfx_num", "trxn_post_dt", "trxn_post_seq_num"], + ) + assert compare_unq.matches() + # Just render the report to make sure it renders. 
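Editor's note: the ``column_count`` assertions above show the knob for trimming wide sample sections in the text report. A hedged sketch of the ``report()`` call paths this patch touches; the HTML path mirrors ``test_save_html`` in the pyspark.pandas tests earlier in the patch and is assumed to apply here as well, and the file name is illustrative.

.. code-block:: python

    from pyspark.sql import SparkSession
    from datacompy import SparkSQLCompare

    spark = SparkSession.builder.getOrCreate()
    df1 = spark.createDataFrame([{"acct_id": 1, "amt": 2.0}, {"acct_id": 2, "amt": 3.0}])
    df2 = spark.createDataFrame([{"acct_id": 1, "amt": 2.0}, {"acct_id": 2, "amt": 3.5}])
    compare = SparkSQLCompare(spark, df1, df2, join_columns="acct_id")

    full_text = compare.report()                     # plain-text report, returned as a string
    narrow = compare.report(column_count=1)          # sample sections note "(First 1 Columns)"
    compare.report(html_file="compare_report.html")  # also writes an HTML rendering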
+ compare_acct.report() + compare_unq.report() + + +def test_strings_with_joins_with_ignore_spaces(spark_session): + df1 = spark_session.createDataFrame( + [{"a": "hi", "b": " A"}, {"a": "bye", "b": "A"}] + ) + df2 = spark_session.createDataFrame( + [{"a": "hi", "b": "A"}, {"a": "bye", "b": "A "}] + ) + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_spaces=False) + assert not compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert not compare.intersect_rows_match() + + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_spaces=True) + assert compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert compare.intersect_rows_match() + + +def test_strings_with_joins_with_ignore_case(spark_session): + df1 = spark_session.createDataFrame([{"a": "hi", "b": "a"}, {"a": "bye", "b": "A"}]) + df2 = spark_session.createDataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "a"}]) + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_case=False) + assert not compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert not compare.intersect_rows_match() + + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_case=True) + assert compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert compare.intersect_rows_match() + + +def test_decimal_with_joins_with_ignore_spaces(spark_session): + df1 = spark_session.createDataFrame([{"a": 1, "b": " A"}, {"a": 2, "b": "A"}]) + df2 = spark_session.createDataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "A "}]) + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_spaces=False) + assert not compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert not compare.intersect_rows_match() + + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_spaces=True) + assert compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert compare.intersect_rows_match() + + +def test_decimal_with_joins_with_ignore_case(spark_session): + df1 = spark_session.createDataFrame([{"a": 1, "b": "a"}, {"a": 2, "b": "A"}]) + df2 = spark_session.createDataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "a"}]) + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_case=False) + assert not compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert not compare.intersect_rows_match() + + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_case=True) + assert compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert compare.intersect_rows_match() + + +def test_joins_with_ignore_spaces(spark_session): + df1 = spark_session.createDataFrame([{"a": 1, "b": " A"}, {"a": 2, "b": "A"}]) + df2 = spark_session.createDataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "A "}]) + + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_spaces=True) + assert compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert compare.intersect_rows_match() + + +def test_joins_with_ignore_case(spark_session): + df1 = spark_session.createDataFrame([{"a": 1, "b": "a"}, {"a": 2, "b": "A"}]) + df2 = spark_session.createDataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "a"}]) + + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_case=True) + assert compare.matches() + assert 
compare.all_columns_match() + assert compare.all_rows_overlap() + assert compare.intersect_rows_match() + + +def test_strings_with_ignore_spaces_and_join_columns(spark_session): + df1 = spark_session.createDataFrame([{"a": "hi", "b": "A"}, {"a": "bye", "b": "A"}]) + df2 = spark_session.createDataFrame( + [{"a": " hi ", "b": "A"}, {"a": " bye ", "b": "A"}] + ) + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_spaces=False) + assert not compare.matches() + assert compare.all_columns_match() + assert not compare.all_rows_overlap() + assert compare.count_matching_rows() == 0 + + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_spaces=True) + assert compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert compare.intersect_rows_match() + assert compare.count_matching_rows() == 2 + + +def test_integers_with_ignore_spaces_and_join_columns(spark_session): + df1 = spark_session.createDataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "A"}]) + df2 = spark_session.createDataFrame([{"a": 1, "b": "A"}, {"a": 2, "b": "A"}]) + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_spaces=False) + assert compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert compare.intersect_rows_match() + assert compare.count_matching_rows() == 2 + + compare = SparkSQLCompare(spark_session, df1, df2, "a", ignore_spaces=True) + assert compare.matches() + assert compare.all_columns_match() + assert compare.all_rows_overlap() + assert compare.intersect_rows_match() + assert compare.count_matching_rows() == 2 + + +def test_sample_mismatch(spark_session): + data1 = """acct_id,dollar_amt,name,float_fld,date_fld + 10000001234,123.45,George Maharis,14530.1555,2017-01-01 + 10000001235,0.45,Michael Bluth,1,2017-01-01 + 10000001236,1345,George Bluth,,2017-01-01 + 10000001237,123456,Bob Loblaw,345.12,2017-01-01 + 10000001239,1.05,Lucille Bluth,,2017-01-01 + 10000001240,123.45,George Maharis,14530.1555,2017-01-02 + """ + + data2 = """acct_id,dollar_amt,name,float_fld,date_fld + 10000001234,123.4,George Michael Bluth,14530.155, + 10000001235,0.45,Michael Bluth,, + 10000001236,1345,George Bluth,1, + 10000001237,123456,Robert Loblaw,345.12, + 10000001238,1.05,Loose Seal Bluth,111, + 10000001240,123.45,George Maharis,14530.1555,2017-01-02 + """ + + df1 = spark_session.createDataFrame(pd.read_csv(StringIO(data1), sep=",")) + df2 = spark_session.createDataFrame(pd.read_csv(StringIO(data2), sep=",")) + + compare = SparkSQLCompare(spark_session, df1, df2, "acct_id") + + output = compare.sample_mismatch(column="name", sample_count=1).toPandas() + assert output.shape[0] == 1 + assert (output.name_df1 != output.name_df2).all() + + output = compare.sample_mismatch(column="name", sample_count=2).toPandas() + assert output.shape[0] == 2 + assert (output.name_df1 != output.name_df2).all() + + output = compare.sample_mismatch(column="name", sample_count=3).toPandas() + assert output.shape[0] == 2 + assert (output.name_df1 != output.name_df2).all() + + +def test_all_mismatch_not_ignore_matching_cols_no_cols_matching(spark_session): + data1 = """acct_id,dollar_amt,name,float_fld,date_fld + 10000001234,123.45,George Maharis,14530.1555,2017-01-01 + 10000001235,0.45,Michael Bluth,1,2017-01-01 + 10000001236,1345,George Bluth,,2017-01-01 + 10000001237,123456,Bob Loblaw,345.12,2017-01-01 + 10000001239,1.05,Lucille Bluth,,2017-01-01 + 10000001240,123.45,George Maharis,14530.1555,2017-01-02 + """ + + data2 = 
"""acct_id,dollar_amt,name,float_fld,date_fld + 10000001234,123.4,George Michael Bluth,14530.155, + 10000001235,0.45,Michael Bluth,, + 10000001236,1345,George Bluth,1, + 10000001237,123456,Robert Loblaw,345.12, + 10000001238,1.05,Loose Seal Bluth,111, + 10000001240,123.45,George Maharis,14530.1555,2017-01-02 + """ + df1 = spark_session.createDataFrame(pd.read_csv(StringIO(data1), sep=",")) + df2 = spark_session.createDataFrame(pd.read_csv(StringIO(data2), sep=",")) + compare = SparkSQLCompare(spark_session, df1, df2, "acct_id") + + output = compare.all_mismatch().toPandas() + assert output.shape[0] == 4 + assert output.shape[1] == 9 + + assert (output.name_df1 != output.name_df2).values.sum() == 2 + assert (~(output.name_df1 != output.name_df2)).values.sum() == 2 + + assert (output.dollar_amt_df1 != output.dollar_amt_df2).values.sum() == 1 + assert (~(output.dollar_amt_df1 != output.dollar_amt_df2)).values.sum() == 3 + + assert (output.float_fld_df1 != output.float_fld_df2).values.sum() == 3 + assert (~(output.float_fld_df1 != output.float_fld_df2)).values.sum() == 1 + + assert (output.date_fld_df1 != output.date_fld_df2).values.sum() == 4 + assert (~(output.date_fld_df1 != output.date_fld_df2)).values.sum() == 0 + + +def test_all_mismatch_not_ignore_matching_cols_some_cols_matching(spark_session): + # Columns dollar_amt and name are matching + data1 = """acct_id,dollar_amt,name,float_fld,date_fld + 10000001234,123.45,George Maharis,14530.1555,2017-01-01 + 10000001235,0.45,Michael Bluth,1,2017-01-01 + 10000001236,1345,George Bluth,,2017-01-01 + 10000001237,123456,Bob Loblaw,345.12,2017-01-01 + 10000001239,1.05,Lucille Bluth,,2017-01-01 + 10000001240,123.45,George Maharis,14530.1555,2017-01-02 + """ + + data2 = """acct_id,dollar_amt,name,float_fld,date_fld + 10000001234,123.45,George Maharis,14530.155, + 10000001235,0.45,Michael Bluth,, + 10000001236,1345,George Bluth,1, + 10000001237,123456,Bob Loblaw,345.12, + 10000001238,1.05,Lucille Bluth,111, + 10000001240,123.45,George Maharis,14530.1555,2017-01-02 + """ + df1 = spark_session.createDataFrame(pd.read_csv(StringIO(data1), sep=",")) + df2 = spark_session.createDataFrame(pd.read_csv(StringIO(data2), sep=",")) + compare = SparkSQLCompare(spark_session, df1, df2, "acct_id") + + output = compare.all_mismatch().toPandas() + assert output.shape[0] == 4 + assert output.shape[1] == 9 + + assert (output.name_df1 != output.name_df2).values.sum() == 0 + assert (~(output.name_df1 != output.name_df2)).values.sum() == 4 + + assert (output.dollar_amt_df1 != output.dollar_amt_df2).values.sum() == 0 + assert (~(output.dollar_amt_df1 != output.dollar_amt_df2)).values.sum() == 4 + + assert (output.float_fld_df1 != output.float_fld_df2).values.sum() == 3 + assert (~(output.float_fld_df1 != output.float_fld_df2)).values.sum() == 1 + + assert (output.date_fld_df1 != output.date_fld_df2).values.sum() == 4 + assert (~(output.date_fld_df1 != output.date_fld_df2)).values.sum() == 0 + + +def test_all_mismatch_ignore_matching_cols_some_cols_matching_diff_rows(spark_session): + # Case where there are rows on either dataset which don't match up. 
+ # Columns dollar_amt and name are matching + data1 = """acct_id,dollar_amt,name,float_fld,date_fld + 10000001234,123.45,George Maharis,14530.1555,2017-01-01 + 10000001235,0.45,Michael Bluth,1,2017-01-01 + 10000001236,1345,George Bluth,,2017-01-01 + 10000001237,123456,Bob Loblaw,345.12,2017-01-01 + 10000001239,1.05,Lucille Bluth,,2017-01-01 + 10000001240,123.45,George Maharis,14530.1555,2017-01-02 + 10000001241,1111.05,Lucille Bluth, + """ + + data2 = """acct_id,dollar_amt,name,float_fld,date_fld + 10000001234,123.45,George Maharis,14530.155, + 10000001235,0.45,Michael Bluth,, + 10000001236,1345,George Bluth,1, + 10000001237,123456,Bob Loblaw,345.12, + 10000001238,1.05,Lucille Bluth,111, + """ + df1 = spark_session.createDataFrame(pd.read_csv(StringIO(data1), sep=",")) + df2 = spark_session.createDataFrame(pd.read_csv(StringIO(data2), sep=",")) + compare = SparkSQLCompare(spark_session, df1, df2, "acct_id") + + output = compare.all_mismatch(ignore_matching_cols=True).toPandas() + + assert output.shape[0] == 4 + assert output.shape[1] == 5 + + assert (output.float_fld_df1 != output.float_fld_df2).values.sum() == 3 + assert (~(output.float_fld_df1 != output.float_fld_df2)).values.sum() == 1 + + assert (output.date_fld_df1 != output.date_fld_df2).values.sum() == 4 + assert (~(output.date_fld_df1 != output.date_fld_df2)).values.sum() == 0 + + assert not ("name_df1" in output and "name_df2" in output) + assert not ("dollar_amt_df1" in output and "dollar_amt_df1" in output) + + +def test_all_mismatch_ignore_matching_cols_some_cols_matching(spark_session): + # Columns dollar_amt and name are matching + data1 = """acct_id,dollar_amt,name,float_fld,date_fld + 10000001234,123.45,George Maharis,14530.1555,2017-01-01 + 10000001235,0.45,Michael Bluth,1,2017-01-01 + 10000001236,1345,George Bluth,,2017-01-01 + 10000001237,123456,Bob Loblaw,345.12,2017-01-01 + 10000001239,1.05,Lucille Bluth,,2017-01-01 + 10000001240,123.45,George Maharis,14530.1555,2017-01-02 + """ + + data2 = """acct_id,dollar_amt,name,float_fld,date_fld + 10000001234,123.45,George Maharis,14530.155, + 10000001235,0.45,Michael Bluth,, + 10000001236,1345,George Bluth,1, + 10000001237,123456,Bob Loblaw,345.12, + 10000001238,1.05,Lucille Bluth,111, + 10000001240,123.45,George Maharis,14530.1555,2017-01-02 + """ + df1 = spark_session.createDataFrame(pd.read_csv(StringIO(data1), sep=",")) + df2 = spark_session.createDataFrame(pd.read_csv(StringIO(data2), sep=",")) + compare = SparkSQLCompare(spark_session, df1, df2, "acct_id") + + output = compare.all_mismatch(ignore_matching_cols=True).toPandas() + + assert output.shape[0] == 4 + assert output.shape[1] == 5 + + assert (output.float_fld_df1 != output.float_fld_df2).values.sum() == 3 + assert (~(output.float_fld_df1 != output.float_fld_df2)).values.sum() == 1 + + assert (output.date_fld_df1 != output.date_fld_df2).values.sum() == 4 + assert (~(output.date_fld_df1 != output.date_fld_df2)).values.sum() == 0 + + assert not ("name_df1" in output and "name_df2" in output) + assert not ("dollar_amt_df1" in output and "dollar_amt_df1" in output) + + +def test_all_mismatch_ignore_matching_cols_no_cols_matching(spark_session): + data1 = """acct_id,dollar_amt,name,float_fld,date_fld + 10000001234,123.45,George Maharis,14530.1555,2017-01-01 + 10000001235,0.45,Michael Bluth,1,2017-01-01 + 10000001236,1345,George Bluth,,2017-01-01 + 10000001237,123456,Bob Loblaw,345.12,2017-01-01 + 10000001239,1.05,Lucille Bluth,,2017-01-01 + 10000001240,123.45,George Maharis,14530.1555,2017-01-02 + """ + + data2 = 
"""acct_id,dollar_amt,name,float_fld,date_fld + 10000001234,123.4,George Michael Bluth,14530.155, + 10000001235,0.45,Michael Bluth,, + 10000001236,1345,George Bluth,1, + 10000001237,123456,Robert Loblaw,345.12, + 10000001238,1.05,Loose Seal Bluth,111, + 10000001240,123.45,George Maharis,14530.1555,2017-01-02 + """ + df1 = spark_session.createDataFrame(pd.read_csv(StringIO(data1), sep=",")) + df2 = spark_session.createDataFrame(pd.read_csv(StringIO(data2), sep=",")) + compare = SparkSQLCompare(spark_session, df1, df2, "acct_id") + + output = compare.all_mismatch().toPandas() + assert output.shape[0] == 4 + assert output.shape[1] == 9 + + assert (output.name_df1 != output.name_df2).values.sum() == 2 + assert (~(output.name_df1 != output.name_df2)).values.sum() == 2 + + assert (output.dollar_amt_df1 != output.dollar_amt_df2).values.sum() == 1 + assert (~(output.dollar_amt_df1 != output.dollar_amt_df2)).values.sum() == 3 + + assert (output.float_fld_df1 != output.float_fld_df2).values.sum() == 3 + assert (~(output.float_fld_df1 != output.float_fld_df2)).values.sum() == 1 + + assert (output.date_fld_df1 != output.date_fld_df2).values.sum() == 4 + assert (~(output.date_fld_df1 != output.date_fld_df2)).values.sum() == 0 + + +@pytest.mark.parametrize( + "column,expected", + [ + ("base", 0), + ("floats", 0.2), + ("decimals", 0.1), + ("null_floats", 0.1), + ("strings", 0.1), + ("mixed_strings", 1), + ("infinity", np.inf), + ], +) +def test_calculate_max_diff(spark_session, column, expected): + pdf = pd.DataFrame( + { + "base": [1, 1, 1, 1, 1], + "floats": [1.1, 1.1, 1.1, 1.2, 0.9], + "decimals": [ + Decimal("1.1"), + Decimal("1.1"), + Decimal("1.1"), + Decimal("1.1"), + Decimal("1.1"), + ], + "null_floats": [np.nan, 1.1, 1, 1, 1], + "strings": ["1", "1", "1", "1.1", "1"], + "mixed_strings": ["1", "1", "1", "2", "some string"], + "infinity": [1, 1, 1, 1, np.inf], + } + ) + MAX_DIFF_DF = spark_session.createDataFrame(pdf) + assert np.isclose(calculate_max_diff(MAX_DIFF_DF, "base", column), expected) + + +def test_dupes_with_nulls_strings(spark_session): + pdf1 = pd.DataFrame( + { + "fld_1": [1, 2, 2, 3, 3, 4, 5, 5], + "fld_2": ["A", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + "fld_3": [1, 2, 2, 3, 3, 4, 5, 5], + } + ) + pdf2 = pd.DataFrame( + { + "fld_1": [1, 2, 3, 4, 5], + "fld_2": ["A", np.nan, np.nan, np.nan, np.nan], + "fld_3": [1, 2, 3, 4, 5], + } + ) + df1 = spark_session.createDataFrame(pdf1) + df2 = spark_session.createDataFrame(pdf2) + comp = SparkSQLCompare(spark_session, df1, df2, join_columns=["fld_1", "fld_2"]) + assert comp.subset() + + +def test_dupes_with_nulls_ints(spark_session): + pdf1 = pd.DataFrame( + { + "fld_1": [1, 2, 2, 3, 3, 4, 5, 5], + "fld_2": [1, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + "fld_3": [1, 2, 2, 3, 3, 4, 5, 5], + } + ) + pdf2 = pd.DataFrame( + { + "fld_1": [1, 2, 3, 4, 5], + "fld_2": [1, np.nan, np.nan, np.nan, np.nan], + "fld_3": [1, 2, 3, 4, 5], + } + ) + df1 = spark_session.createDataFrame(pdf1) + df2 = spark_session.createDataFrame(pdf2) + comp = SparkSQLCompare(spark_session, df1, df2, join_columns=["fld_1", "fld_2"]) + assert comp.subset() + + +def test_generate_id_within_group(spark_session): + matrix = [ + ( + pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "__index": [1, 2, 3]}), + pd.Series([0, 0, 0]), + ), + ( + pd.DataFrame( + { + "a": ["a", "a", "DATACOMPY_NULL"], + "b": [1, 1, 2], + "__index": [1, 2, 3], + } + ), + pd.Series([0, 1, 0]), + ), + ( + pd.DataFrame({"a": [-999, 2, 3], "b": [1, 2, 3], "__index": [1, 2, 3]}), 
+ pd.Series([0, 0, 0]), + ), + ( + pd.DataFrame( + {"a": [1, np.nan, np.nan], "b": [1, 2, 2], "__index": [1, 2, 3]} + ), + pd.Series([0, 0, 1]), + ), + ( + pd.DataFrame( + {"a": ["1", np.nan, np.nan], "b": ["1", "2", "2"], "__index": [1, 2, 3]} + ), + pd.Series([0, 0, 1]), + ), + ( + pd.DataFrame( + { + "a": [datetime(2018, 1, 1), np.nan, np.nan], + "b": ["1", "2", "2"], + "__index": [1, 2, 3], + } + ), + pd.Series([0, 0, 1]), + ), + ] + for i in matrix: + dataframe = i[0] + expected = i[1] + actual = ( + _generate_id_within_group( + spark_session.createDataFrame(dataframe), ["a", "b"], "_temp_0" + ) + .orderBy("__index") + .select("_temp_0") + .toPandas() + ) + assert (actual["_temp_0"] == expected).all() + + +def test_generate_id_within_group_single_join(spark_session): + dataframe = spark_session.createDataFrame( + [{"a": 1, "b": 2, "__index": 1}, {"a": 1, "b": 2, "__index": 2}] + ) + expected = pd.Series([0, 1]) + actual = ( + _generate_id_within_group(dataframe, ["a"], "_temp_0") + .orderBy("__index") + .select("_temp_0") + ).toPandas() + assert (actual["_temp_0"] == expected).all() + + +def test_lower(spark_session): + """This function tests the toggle to use lower case for column names or not""" + # should match + df1 = spark_session.createDataFrame(pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})) + df2 = spark_session.createDataFrame(pd.DataFrame({"a": [1, 2, 3], "B": [0, 1, 2]})) + compare = SparkSQLCompare(spark_session, df1, df2, join_columns=["a"]) + assert compare.matches() + # should not match + df1 = spark_session.createDataFrame(pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})) + df2 = spark_session.createDataFrame(pd.DataFrame({"a": [1, 2, 3], "B": [0, 1, 2]})) + compare = SparkSQLCompare( + spark_session, df1, df2, join_columns=["a"], cast_column_names_lower=False + ) + assert not compare.matches() + + # test join column + # should match + df1 = spark_session.createDataFrame(pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})) + df2 = spark_session.createDataFrame(pd.DataFrame({"A": [1, 2, 3], "B": [0, 1, 2]})) + compare = SparkSQLCompare(spark_session, df1, df2, join_columns=["a"]) + assert compare.matches() + # should fail because "a" is not found in df2 + df1 = spark_session.createDataFrame(pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})) + df2 = spark_session.createDataFrame(pd.DataFrame({"A": [1, 2, 3], "B": [0, 1, 2]})) + expected_message = "df2 must have all columns from join_columns" + with raises(ValueError, match=expected_message): + compare = SparkSQLCompare( + spark_session, df1, df2, join_columns=["a"], cast_column_names_lower=False + ) + + +def test_integer_column_names(spark_session): + """This function tests that integer column names would also work""" + df1 = spark_session.createDataFrame(pd.DataFrame({1: [1, 2, 3], 2: [0, 1, 2]})) + df2 = spark_session.createDataFrame(pd.DataFrame({1: [1, 2, 3], 2: [0, 1, 2]})) + compare = SparkSQLCompare(spark_session, df1, df2, join_columns=[1]) + assert compare.matches() + + +@mock.patch("datacompy.spark.sql.render") +def test_save_html(mock_render, spark_session): + df1 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}]) + df2 = spark_session.createDataFrame([{"a": 1, "b": 2}, {"a": 1, "b": 2}]) + compare = SparkSQLCompare(spark_session, df1, df2, join_columns=["a"]) + + m = mock.mock_open() + with mock.patch("datacompy.spark.sql.open", m, create=True): + # assert without HTML call + compare.report() + assert mock_render.call_count == 4 + m.assert_not_called() + + mock_render.reset_mock() + m = 
mock.mock_open() + with mock.patch("datacompy.spark.sql.open", m, create=True): + # assert with HTML call + compare.report(html_file="test.html") + assert mock_render.call_count == 4 + m.assert_called_with("test.html", "w") + + +def test_unicode_columns(spark_session): + df1 = spark_session.createDataFrame( + [ + {"a": 1, "例": 2, "予測対象日": "test"}, + {"a": 1, "例": 3, "予測対象日": "test"}, + ] + ) + df2 = spark_session.createDataFrame( + [ + {"a": 1, "例": 2, "予測対象日": "test"}, + {"a": 1, "例": 3, "予測対象日": "test"}, + ] + ) + compare = SparkSQLCompare(spark_session, df1, df2, join_columns=["例"]) + assert compare.matches() + # Just render the report to make sure it renders. + compare.report()