Release v0.13.0 #315

Merged · 9 commits · Jun 21, 2024
2 changes: 1 addition & 1 deletion .github/workflows/test-package.yml
@@ -49,7 +49,7 @@ jobs:
run: |
python -m pip install --upgrade pip
python -m pip install pytest pytest-spark pypandoc
python -m pip install pyspark==${{ matrix.spark-version }}
python -m pip install pyspark[connect]==${{ matrix.spark-version }}
python -m pip install pandas==${{ matrix.pandas-version }}
python -m pip install .[dev]
- name: Test with pytest
2 changes: 1 addition & 1 deletion .gitignore
@@ -22,4 +22,4 @@ docs/source/api/

#edgetest
.edgetest/
tmp/
.tmp/
34 changes: 18 additions & 16 deletions README.md
@@ -39,14 +39,20 @@ pip install datacompy[ray]

### Legacy Spark Deprecation

#### Starting with version 0.12.0
With version ``v0.12.0`` the original ``SparkCompare`` was replaced with a
Pandas on Spark implementation. The original ``SparkCompare`` implementation differs
from all the other native implementations. To align the API better and keep behaviour
consistent, we are deprecating the original ``SparkCompare`` into a new module, ``LegacySparkCompare``.

The original ``SparkCompare`` implementation differs from all the other native implementations. To align the API better, and keep behaviour consistent we are deprecating ``SparkCompare`` into a new module ``LegacySparkCompare``
Subsequently in ``v0.13.0`` a PySpark DataFrame class has been introduced (``SparkSQLCompare``)
which accepts ``pyspark.sql.DataFrame`` and should provide better performance. With this version
the Pandas on Spark implementation has been renamed to ``SparkPandasCompare``, and all the Spark
logic now lives under the ``spark`` submodule.

If you wish to use the old SparkCompare moving forward you can
If you wish to use the old SparkCompare moving forward you can import it like so:

```python
from datacompy.legacy import LegacySparkCompare
from datacompy.spark.legacy import LegacySparkCompare
```
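
For the new ``SparkSQLCompare`` described above, here is a minimal usage sketch. The class is exported at the package level, but the constructor arguments shown (a ``SparkSession`` followed by the two ``pyspark.sql.DataFrame`` objects and ``join_columns``) are assumptions modelled on the Pandas ``Compare`` API rather than taken verbatim from this release:

```python
# Minimal sketch of the new SparkSQLCompare -- argument names are assumptions
# modelled on datacompy.Compare, not taken verbatim from this release.
from pyspark.sql import SparkSession

import datacompy

spark = SparkSession.builder.getOrCreate()

df1 = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
df2 = spark.createDataFrame([(1, "a"), (2, "c")], ["id", "value"])

compare = datacompy.SparkSQLCompare(
    spark_session=spark,  # assumed first argument
    df1=df1,
    df2=df2,
    join_columns="id",  # column(s) to join the two DataFrames on
)
print(compare.report())  # human-readable comparison report
```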

#### Supported versions and dependencies
@@ -55,11 +61,6 @@ Different versions of Spark, Pandas, and Python interact differently. Below is a
With the move to the Pandas on Spark API and compatibility issues with Pandas 2+, we will for the time being not support Pandas 2
with the Pandas on Spark implementation. Spark plans to support Pandas 2 in [Spark 4](https://issues.apache.org/jira/browse/SPARK-44101)

With version ``0.12.0``:
- Pandas ``2.0.0`` is not supported for the native Spark implementation
- Spark ``3.1`` support will be dropped
- Python ``3.8`` support is dropped


| | Spark 3.2.4 | Spark 3.3.4 | Spark 3.4.2 | Spark 3.5.1 |
|-------------|-------------|-------------|-------------|-------------|
@@ -69,11 +70,12 @@ With version ``0.12.0``:
| Python 3.12 | ❌ | ❌ | ❌ | ❌ |


| | Pandas < 1.5.3 | Pandas >=2.0.0 |
|---------------|----------------|----------------|
| Native Pandas | ✅ | ✅ |
| Native Spark | ✅ | ❌ |
| Fugue | ✅ | ✅ |
| | Pandas < 1.5.3 | Pandas >=2.0.0 |
|------------------------|----------------|----------------|
| ``Compare`` | ✅ | ✅ |
| ``SparkPandasCompare`` | ✅ | ❌ |
| ``SparkSQLCompare`` | ✅ | ✅ |
| Fugue | ✅ | ✅ |



@@ -85,8 +87,8 @@ With version ``0.12.0``:
## Supported backends

- Pandas: ([See documentation](https://capitalone.github.io/datacompy/pandas_usage.html))
- Spark (Pandas on Spark API): ([See documentation](https://capitalone.github.io/datacompy/spark_usage.html))
- Polars (Experimental): ([See documentation](https://capitalone.github.io/datacompy/polars_usage.html))
- Spark: ([See documentation](https://capitalone.github.io/datacompy/spark_usage.html))
- Polars: ([See documentation](https://capitalone.github.io/datacompy/polars_usage.html))
- Fugue is a Python library that provides a unified interface for data processing on Pandas, DuckDB, Polars, Arrow,
Spark, Dask, Ray, and many other backends. DataComPy integrates with Fugue to provide a simple way to compare data
across these backends. Please note that Fugue will use the Pandas (Native) logic at its lowest level
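
For example, here is a small sketch of the Fugue-backed helpers that ``datacompy`` re-exports at the package level (shown with plain Pandas DataFrames; the ``join_columns`` keyword is an assumption mirroring the Pandas ``Compare`` API):

```python
# Sketch of the Fugue-backed helpers re-exported by datacompy;
# the join_columns keyword is an assumption mirroring datacompy.Compare.
import pandas as pd

import datacompy

df1 = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})
df2 = pd.DataFrame({"id": [1, 2, 4], "value": ["a", "x", "d"]})

print(datacompy.all_columns_match(df1, df2))          # True: both frames share the same columns
print(datacompy.unq_columns(df1, df2))                 # columns present only in df1
print(datacompy.report(df1, df2, join_columns="id"))   # full comparison report as a string
```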
11 changes: 6 additions & 5 deletions datacompy/__init__.py
@@ -13,13 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.12.1"
__version__ = "0.13.0"

import platform
from warnings import warn

from datacompy.core import *
from datacompy.fugue import (
from .core import * # noqa: F403
from .fugue import ( # noqa: F401
all_columns_match,
all_rows_overlap,
count_matching_rows,
@@ -28,8 +28,9 @@
report,
unq_columns,
)
from datacompy.polars import PolarsCompare
from datacompy.spark import SparkCompare
from .polars import PolarsCompare # noqa: F401
from .spark.pandas import SparkPandasCompare # noqa: F401
from .spark.sql import SparkSQLCompare # noqa: F401

major = platform.python_version_tuple()[0]
minor = platform.python_version_tuple()[1]
30 changes: 30 additions & 0 deletions datacompy/base.py
@@ -139,3 +139,33 @@ def report(
html_file: Optional[str] = None,
) -> str:
pass


def temp_column_name(*dataframes) -> str:
"""Gets a temp column name that isn't included in columns of any dataframes

Parameters
----------
dataframes : list of DataFrames
The DataFrames to create a temporary column name for

Returns
-------
str
String column name that looks like '_temp_x' for some integer x
"""
i = 0
columns = []
for dataframe in dataframes:
columns = columns + list(dataframe.columns)
columns = set(columns)

while True:
temp_column = f"_temp_{i}"
unique = True

if temp_column in columns:
i += 1
unique = False
if unique:
return temp_column
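
As a quick illustration of the shared helper added above (any object exposing a ``.columns`` attribute should work, e.g. pandas or Polars DataFrames; the sample frames here are hypothetical):

```python
# Illustration of the shared temp_column_name helper; the sample frames are hypothetical.
import pandas as pd

from datacompy.base import temp_column_name

df1 = pd.DataFrame({"id": [1], "_temp_0": ["taken"]})
df2 = pd.DataFrame({"id": [2]})

# '_temp_0' is already a column in df1, so the next free name is returned
print(temp_column_name(df1, df2))  # -> '_temp_1'
```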
27 changes: 1 addition & 26 deletions datacompy/core.py
@@ -29,7 +29,7 @@
import pandas as pd
from ordered_set import OrderedSet

from datacompy.base import BaseCompare
from .base import BaseCompare, temp_column_name

LOG = logging.getLogger(__name__)

@@ -890,31 +890,6 @@ def get_merged_columns(
return columns


def temp_column_name(*dataframes: pd.DataFrame) -> str:
"""Gets a temp column name that isn't included in columns of any dataframes

Parameters
----------
dataframes : list of Pandas.DataFrame
The DataFrames to create a temporary column name for

Returns
-------
str
String column name that looks like '_temp_x' for some integer x
"""
i = 0
while True:
temp_column = f"_temp_{i}"
unique = True
for dataframe in dataframes:
if temp_column in dataframe.columns:
i += 1
unique = False
if unique:
return temp_column


def calculate_max_diff(col_1: "pd.Series[Any]", col_2: "pd.Series[Any]") -> float:
"""Get a maximum difference between two columns

50 changes: 17 additions & 33 deletions datacompy/polars.py
@@ -29,7 +29,7 @@
import numpy as np
from ordered_set import OrderedSet

from datacompy.base import BaseCompare
from .base import BaseCompare, temp_column_name

try:
import polars as pl
@@ -278,11 +278,20 @@ def _dataframe_merge(self, ignore_spaces: bool) -> None:

# process merge indicator
outer_join = outer_join.with_columns(
pl.when((pl.col("_merge_left") == True) & (pl.col("_merge_right") == True))
pl.when(
(pl.col("_merge_left") == True)
& (pl.col("_merge_right") == True) # noqa: E712
)
.then(pl.lit("both"))
.when((pl.col("_merge_left") == True) & (pl.col("_merge_right").is_null()))
.when(
(pl.col("_merge_left") == True)
& (pl.col("_merge_right").is_null()) # noqa: E712
)
.then(pl.lit("left_only"))
.when((pl.col("_merge_left").is_null()) & (pl.col("_merge_right") == True))
.when(
(pl.col("_merge_left").is_null())
& (pl.col("_merge_right") == True) # noqa: E712
)
.then(pl.lit("right_only"))
.alias("_merge")
)
@@ -497,9 +506,9 @@ def sample_mismatch(
col_match = self.intersect_rows[column + "_match"]
match_cnt = col_match.sum()
sample_count = min(sample_count, row_cnt - match_cnt) # type: ignore
sample = self.intersect_rows.filter(pl.col(column + "_match") != True).sample(
sample_count
)
sample = self.intersect_rows.filter(
pl.col(column + "_match") != True # noqa: E712
).sample(sample_count)
return_cols = self.join_columns + [
column + "_" + self.df1_name,
column + "_" + self.df2_name,
@@ -558,7 +567,7 @@ def all_mismatch(self, ignore_matching_cols: bool = False) -> "pl.DataFrame":
)
return (
self.intersect_rows.with_columns(__all=pl.all_horizontal(match_list))
.filter(pl.col("__all") != True)
.filter(pl.col("__all") != True) # noqa: E712
.select(self.join_columns + return_list)
)

@@ -899,31 +908,6 @@ def get_merged_columns(
return columns


def temp_column_name(*dataframes: "pl.DataFrame") -> str:
"""Gets a temp column name that isn't included in columns of any dataframes

Parameters
----------
dataframes : list of Polars.DataFrame
The DataFrames to create a temporary column name for

Returns
-------
str
String column name that looks like '_temp_x' for some integer x
"""
i = 0
while True:
temp_column = f"_temp_{i}"
unique = True
for dataframe in dataframes:
if temp_column in dataframe.columns:
i += 1
unique = False
if unique:
return temp_column


def calculate_max_diff(col_1: "pl.Series", col_2: "pl.Series") -> float:
"""Get a maximum difference between two columns

Empty file added datacompy/spark/__init__.py
Empty file.
File renamed without changes.