
Merge pull request #202 from capitalone/develop
Release v0.9.0
fdosani authored May 11, 2023
2 parents f9e8694 + d436ea5 commit 46d4ba3
Showing 17 changed files with 165 additions and 175 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/publish-docs.yml
@@ -19,7 +19,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: '3.8'
+          python-version: '3.9'
       - name: Install dependencies
         run: python -m pip install .[dev]
       - name: Build
2 changes: 1 addition & 1 deletion .github/workflows/publish-package.yml
@@ -18,7 +18,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: '3.8'
+          python-version: '3.9'
       - name: Install dependencies
         run: python -m pip install -r requirements.txt .[dev]
       - name: Build and publish
29 changes: 5 additions & 24 deletions .github/workflows/test-package.yml
@@ -16,13 +16,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, '3.10']
-        spark-version: [3.0.3, 3.1.2, 3.2.0]
-        hadoop: [3.2]
-        include:
-          - python-version: 3.7
-            spark-version: 2.4.8
-            hadoop: 2.7
+        python-version: [3.8, 3.9, '3.10']
+        spark-version: [3.0.3, 3.1.3, 3.2.3, 3.3.1]
     env:
       PYTHON_VERSION: ${{ matrix.python-version }}
       SPARK_VERSION: ${{ matrix.spark-version }}
@@ -36,26 +31,12 @@ jobs:
           python-version: ${{ matrix.python-version }}

       - name: Setup Java JDK
-        uses: actions/setup-java@v1.4.3
+        uses: actions/setup-java@v3
         with:
-          java-version: 1.8
+          java-version: '8'
+          distribution: 'adopt'

-      - name: Install Spark
-        run: |
-          wget -q -O spark.tgz https://archive.apache.org/dist/spark/spark-${{ matrix.spark-version }}/spark-${{ matrix.spark-version }}-bin-hadoop${{ matrix.hadoop }}.tgz
-          tar xzf spark.tgz
-          rm spark.tgz
-          echo "SPARK_HOME=${{ runner.workspace }}/datacompy/spark-${{ matrix.spark-version }}-bin-hadoop${{ matrix.hadoop }}" >> $GITHUB_ENV
-          echo "${{ runner.workspace }}/datacompy/spark-${{ matrix.spark-version }}-bin-hadoop${{ matrix.hadoop }}/bin" >> $GITHUB_PATH
-      - name: Install dependencies, Spark 2.4.8, Hadoop 2.7
-        if: matrix.spark-version == '2.4.8' && matrix.hadoop == '2.7'
-        run: |
-          python -m pip install --upgrade pip
-          python -m pip install pytest pytest-spark pypandoc==1.7.5
-          python -m pip install pyspark==${{ matrix.spark-version }}
-          python -m pip install .[dev,spark]
-      - name: Install dependencies, everything else
-        if: matrix.spark-version != '2.4.8' && matrix.hadoop != '2.7'
+      - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           python -m pip install pytest pytest-spark pypandoc
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -1,12 +1,12 @@
 repos:
 - repo: https://github.com/psf/black
-  rev: 22.6.0
+  rev: 23.3.0
   hooks:
   - id: black
     types: [file, python]
-    language_version: python3.9
+    language_version: python3.10
 - repo: https://github.com/pycqa/isort
-  rev: 5.10.1
+  rev: 5.12.0
   hooks:
   - id: isort
     name: isort (python)
3 changes: 2 additions & 1 deletion CONTRIBUTORS
@@ -2,4 +2,5 @@
 - Dan Coates
 - Usman Azhar
 - Mark Zhou
-- Ian Whitestone
+- Ian Whitestone
+- Faisal Dosani
6 changes: 6 additions & 0 deletions README.rst
@@ -119,6 +119,12 @@ Things that are happening behind the scenes
 Spark Detail
 ============

+.. important::
+
+    With version ``v0.9.0`` SparkCompare now uses Null Safe (``<=>``) comparisons
+
+..
+
 DataComPy's ``SparkCompare`` class will join two dataframes either on a list of join
 columns. It has the capability to map column names that may be different in each
 dataframe, including in the join columns. You are responsible for creating the
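To illustrate the behavior change flagged in that note (a sketch, not code from this commit): Spark's null-safe operator ``<=>`` treats two NULLs as equal, while plain ``=`` evaluates to NULL and drops such rows from a join. PySpark exposes the operator as ``Column.eqNullSafe``:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df1 = spark.createDataFrame([(1, "a"), (None, "b")], ["id", "val"])
    df2 = spark.createDataFrame([(1, "x"), (None, "y")], ["id", "val"])

    # Plain equality: NULL = NULL is NULL, so the None ids never match
    df1.join(df2, df1["id"] == df2["id"]).count()           # 1
    # Null-safe equality (<=>): NULL <=> NULL is True, so they do match
    df1.join(df2, df1["id"].eqNullSafe(df2["id"])).count()  # 2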
2 changes: 1 addition & 1 deletion datacompy/__init__.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "0.8.4"
+__version__ = "0.9.0"

 from datacompy.core import *
 from datacompy.sparkcompare import NUMERIC_SPARK_TYPES, SparkCompare
66 changes: 24 additions & 42 deletions datacompy/core.py
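Nearly every change in this file is the same mechanical refactor: replacing ``str.format()`` calls with equivalent f-strings. In isolation the pattern looks like this (an illustrative sketch, not a line from the diff):

    index = "df1"
    # Before: positional str.format()
    msg_old = "{} must be a pandas DataFrame".format(index)
    # After: the equivalent f-string, with the expression evaluated inline
    msg_new = f"{index} must be a pandas DataFrame"
    assert msg_old == msg_new == "df1 must be a pandas DataFrame"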
@@ -161,18 +161,18 @@ def _validate_dataframe(self, index, cast_column_names_lower=True):
         """
         dataframe = getattr(self, index)
         if not isinstance(dataframe, pd.DataFrame):
-            raise TypeError("{} must be a pandas DataFrame".format(index))
+            raise TypeError(f"{index} must be a pandas DataFrame")

         if cast_column_names_lower:
             dataframe.columns = [str(col).lower() for col in dataframe.columns]
         else:
             dataframe.columns = [str(col) for col in dataframe.columns]
         # Check if join_columns are present in the dataframe
         if not set(self.join_columns).issubset(set(dataframe.columns)):
-            raise ValueError("{} must have all columns from join_columns".format(index))
+            raise ValueError(f"{index} must have all columns from join_columns")

         if len(set(dataframe.columns)) < len(dataframe.columns):
-            raise ValueError("{} must have unique column names".format(index))
+            raise ValueError(f"{index} must have unique column names")

         if self.on_index:
             if dataframe.index.duplicated().sum() > 0:
@@ -196,22 +196,18 @@ def _compare(self, ignore_spaces, ignore_case):
         else:
             LOG.info("df1 does not Pandas.DataFrame.equals df2")
         LOG.info(
-            "Number of columns in common: {}".format(len(self.intersect_columns()))
+            f"Number of columns in common: {len(self.intersect_columns())}"
         )
         LOG.debug("Checking column overlap")
         for col in self.df1_unq_columns():
-            LOG.info("Column in df1 and not in df2: {}".format(col))
+            LOG.info(f"Column in df1 and not in df2: {col}")
         LOG.info(
-            "Number of columns in df1 and not in df2: {}".format(
-                len(self.df1_unq_columns())
-            )
+            f"Number of columns in df1 and not in df2: {len(self.df1_unq_columns())}"
         )
         for col in self.df2_unq_columns():
-            LOG.info("Column in df2 and not in df1: {}".format(col))
+            LOG.info(f"Column in df2 and not in df1: {col}")
         LOG.info(
-            "Number of columns in df2 and not in df1: {}".format(
-                len(self.df2_unq_columns())
-            )
+            f"Number of columns in df2 and not in df1: {len(self.df2_unq_columns())}"
         )
         LOG.debug("Merging dataframes")
         self._dataframe_merge(ignore_spaces)
@@ -306,18 +302,16 @@ def _dataframe_merge(self, ignore_spaces):
         ].copy()
         self.df2_unq_rows.columns = self.df2.columns
         LOG.info(
-            "Number of rows in df1 and not in df2: {}".format(len(self.df1_unq_rows))
+            f"Number of rows in df1 and not in df2: {len(self.df1_unq_rows)}"
         )
         LOG.info(
-            "Number of rows in df2 and not in df1: {}".format(len(self.df2_unq_rows))
+            f"Number of rows in df2 and not in df1: {len(self.df2_unq_rows)}"
         )

         LOG.debug("Selecting intersecting rows")
         self.intersect_rows = outer_join[outer_join["_merge"] == "both"].copy()
         LOG.info(
-            "Number of rows in df1 and df2 (not necessarily equal): {}".format(
-                len(self.intersect_rows)
-            )
+            f"Number of rows in df1 and df2 (not necessarily equal): {len(self.intersect_rows)}"
         )

     def _intersect_compare(self, ignore_spaces, ignore_case):
@@ -361,9 +355,7 @@ def _intersect_compare(self, ignore_spaces, ignore_case):
             else:
                 match_rate = 0
             LOG.info(
-                "{}: {} / {} ({:.2%}) match".format(
-                    column, match_cnt, row_cnt, match_rate
-                )
+                f"{column}: {match_cnt} / {row_cnt} ({match_rate:.2%}) match"
             )

             self.column_stats.append(
@@ -519,14 +511,12 @@ def all_mismatch(self, ignore_matching_cols=False):
             if not ignore_matching_cols or (
                 ignore_matching_cols and not col_comparison.all()
             ):
-                LOG.debug("Adding column {} to the result.".format(orig_col_name))
+                LOG.debug(f"Adding column {orig_col_name} to the result.")
                 match_list.append(col)
                 return_list.extend([orig_col_name + "_df1", orig_col_name + "_df2"])
             elif ignore_matching_cols:
                 LOG.debug(
-                    "Column {} is equal in df1 and df2. It will not be added to the result.".format(
-                        orig_col_name
-                    )
+                    f"Column {orig_col_name} is equal in df1 and df2. It will not be added to the result."
                 )

         mm_bool = self.intersect_rows[match_list].all(axis="columns")
@@ -612,8 +602,8 @@ def report(self, sample_count=10, column_count=10, html_file=None):
             match_stats.append(
                 {
                     "Column": column["column"],
-                    "{} dtype".format(self.df1_name): column["dtype1"],
-                    "{} dtype".format(self.df2_name): column["dtype2"],
+                    f"{self.df1_name} dtype": column["dtype1"],
+                    f"{self.df2_name} dtype": column["dtype2"],
                     "# Unequal": column["unequal_cnt"],
                     "Max Diff": column["max_diff"],
                     "# Null Diff": column["null_diff"],
@@ -636,8 +626,8 @@ def report(self, sample_count=10, column_count=10, html_file=None):
         report += df_match_stats[
             [
                 "Column",
-                "{} dtype".format(self.df1_name),
-                "{} dtype".format(self.df2_name),
+                f"{self.df1_name} dtype",
+                f"{self.df2_name} dtype",
                 "# Unequal",
                 "Max Diff",
                 "# Null Diff",
@@ -654,25 +644,17 @@ def report(self, sample_count=10, column_count=10, html_file=None):
             report += "\n\n"

         if min(sample_count, self.df1_unq_rows.shape[0]) > 0:
-            report += "Sample Rows Only in {} (First {} Columns)\n".format(
-                self.df1_name, column_count
-            )
-            report += "---------------------------------------{}\n".format(
-                "-" * len(self.df1_name)
-            )
+            report += f"Sample Rows Only in {self.df1_name} (First {column_count} Columns)\n"
+            report += f"---------------------------------------{'-' * len(self.df1_name)}\n"
             report += "\n"
             columns = self.df1_unq_rows.columns[:column_count]
             unq_count = min(sample_count, self.df1_unq_rows.shape[0])
             report += self.df1_unq_rows.sample(unq_count)[columns].to_string()
             report += "\n\n"

         if min(sample_count, self.df2_unq_rows.shape[0]) > 0:
-            report += "Sample Rows Only in {} (First {} Columns)\n".format(
-                self.df2_name, column_count
-            )
-            report += "---------------------------------------{}\n".format(
-                "-" * len(self.df2_name)
-            )
+            report += f"Sample Rows Only in {self.df2_name} (First {column_count} Columns)\n"
+            report += f"---------------------------------------{'-' * len(self.df2_name)}\n"
             report += "\n"
             columns = self.df2_unq_rows.columns[:column_count]
             unq_count = min(sample_count, self.df2_unq_rows.shape[0])
@@ -859,7 +841,7 @@ def temp_column_name(*dataframes):
     """
     i = 0
     while True:
-        temp_column = "_temp_{}".format(i)
+        temp_column = f"_temp_{i}"
         unique = True
         for dataframe in dataframes:
             if temp_column in dataframe.columns:
@@ -909,7 +891,7 @@ def generate_id_within_group(dataframe, join_columns):
     default_value = "DATACOMPY_NULL"
     if dataframe[join_columns].isnull().any().any():
         if (dataframe[join_columns] == default_value).any().any():
-            raise ValueError("{} was found in your join columns".format(default_value))
+            raise ValueError(f"{default_value} was found in your join columns")
     return (
         dataframe[join_columns]
         .astype(str)
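The methods touched above (``_validate_dataframe``, ``_compare``, ``report``, ``all_mismatch``) are all reached through ``datacompy.Compare``. A minimal usage sketch (the column and frame names here are invented for illustration):

    import pandas as pd
    import datacompy

    df1 = pd.DataFrame({"id": [1, 2, 3], "amount": [10.0, 20.0, 30.0]})
    df2 = pd.DataFrame({"id": [1, 2, 4], "amount": [10.0, 21.0, 40.0]})

    # join_columns feeds the validation and merge logic shown in the diff
    compare = datacompy.Compare(
        df1, df2, join_columns="id", df1_name="original", df2_name="new"
    )
    print(compare.report())        # human-readable summary, built in report()
    print(compare.all_mismatch())  # intersecting rows with any unequal column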