
Merge pull request #202 from capitalone/develop
Release v0.9.0
fdosani authored May 11, 2023
2 parents f9e8694 + d436ea5 commit 46d4ba3
Showing 17 changed files with 165 additions and 175 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/publish-docs.yml
@@ -19,7 +19,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: '3.8'
+          python-version: '3.9'
       - name: Install dependencies
         run: python -m pip install .[dev]
       - name: Build
2 changes: 1 addition & 1 deletion .github/workflows/publish-package.yml
@@ -18,7 +18,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: '3.8'
+          python-version: '3.9'
       - name: Install dependencies
         run: python -m pip install -r requirements.txt .[dev]
       - name: Build and publish
29 changes: 5 additions & 24 deletions .github/workflows/test-package.yml
@@ -16,13 +16,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, '3.10']
-        spark-version: [3.0.3, 3.1.2, 3.2.0]
-        hadoop: [3.2]
-        include:
-          - python-version: 3.7
-            spark-version: 2.4.8
-            hadoop: 2.7
+        python-version: [3.8, 3.9, '3.10']
+        spark-version: [3.0.3, 3.1.3, 3.2.3, 3.3.1]
     env:
       PYTHON_VERSION: ${{ matrix.python-version }}
       SPARK_VERSION: ${{ matrix.spark-version }}
@@ -36,26 +31,12 @@ jobs:
           python-version: ${{ matrix.python-version }}

       - name: Setup Java JDK
-        uses: actions/setup-java@v1.4.3
+        uses: actions/setup-java@v3
         with:
-          java-version: 1.8
+          java-version: '8'
+          distribution: 'adopt'

-      - name: Install Spark
-        run: |
-          wget -q -O spark.tgz https://archive.apache.org/dist/spark/spark-${{ matrix.spark-version }}/spark-${{ matrix.spark-version }}-bin-hadoop${{ matrix.hadoop }}.tgz
-          tar xzf spark.tgz
-          rm spark.tgz
-          echo "SPARK_HOME=${{ runner.workspace }}/datacompy/spark-${{ matrix.spark-version }}-bin-hadoop${{ matrix.hadoop }}" >> $GITHUB_ENV
-          echo "${{ runner.workspace }}/datacompy/spark-${{ matrix.spark-version }}-bin-hadoop${{ matrix.hadoop }}/bin" >> $GITHUB_PATH
-      - name: Install dependencies, Spark 2.4.8, Hadoop 2.7
-        if: matrix.spark-version == '2.4.8' && matrix.hadoop == '2.7'
-        run: |
-          python -m pip install --upgrade pip
-          python -m pip install pytest pytest-spark pypandoc==1.7.5
-          python -m pip install pyspark==${{ matrix.spark-version }}
-          python -m pip install .[dev,spark]
-      - name: Install dependencies, everything else
-        if: matrix.spark-version != '2.4.8' && matrix.hadoop != '2.7'
+      - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           python -m pip install pytest pytest-spark pypandoc
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -1,12 +1,12 @@
 repos:
 - repo: https://github.com/psf/black
-  rev: 22.6.0
+  rev: 23.3.0
   hooks:
   - id: black
     types: [file, python]
-    language_version: python3.9
+    language_version: python3.10
 - repo: https://github.com/pycqa/isort
-  rev: 5.10.1
+  rev: 5.12.0
   hooks:
   - id: isort
     name: isort (python)
3 changes: 2 additions & 1 deletion CONTRIBUTORS
@@ -2,4 +2,5 @@
 - Dan Coates
 - Usman Azhar
 - Mark Zhou
-- Ian Whitestone
+- Ian Whitestone
+- Faisal Dosani
6 changes: 6 additions & 0 deletions README.rst
@@ -119,6 +119,12 @@ Things that are happening behind the scenes
 Spark Detail
 ============

+.. important::
+
+    With version ``v0.9.0`` SparkCompare now uses Null Safe (``<=>``) comparisons
+
+..
+
 DataComPy's ``SparkCompare`` class will join two dataframes either on a list of join
 columns. It has the capability to map column names that may be different in each
 dataframe, including in the join columns. You are responsible for creating the
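To illustrate the behavior change flagged in that note (a sketch, not code from this commit): Spark's null-safe operator ``<=>`` treats two NULLs as equal, while plain ``=`` evaluates to NULL and drops such rows from a join. PySpark exposes the operator as ``Column.eqNullSafe``:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df1 = spark.createDataFrame([(1, "a"), (None, "b")], ["id", "val"])
    df2 = spark.createDataFrame([(1, "x"), (None, "y")], ["id", "val"])

    # Plain equality: NULL = NULL is NULL, so the None ids never match
    df1.join(df2, df1["id"] == df2["id"]).count()           # 1
    # Null-safe equality (<=>): NULL <=> NULL is True, so they do match
    df1.join(df2, df1["id"].eqNullSafe(df2["id"])).count()  # 2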
2 changes: 1 addition & 1 deletion datacompy/__init__.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "0.8.4"
+__version__ = "0.9.0"

 from datacompy.core import *
 from datacompy.sparkcompare import NUMERIC_SPARK_TYPES, SparkCompare
66 changes: 24 additions & 42 deletions datacompy/core.py
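Nearly every change in this file is the same mechanical refactor: replacing ``str.format()`` calls with equivalent f-strings. In isolation the pattern looks like this (an illustrative sketch, not a line from the diff):

    index = "df1"
    # Before: positional str.format()
    msg_old = "{} must be a pandas DataFrame".format(index)
    # After: the equivalent f-string, with the expression evaluated inline
    msg_new = f"{index} must be a pandas DataFrame"
    assert msg_old == msg_new == "df1 must be a pandas DataFrame"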
@@ -161,18 +161,18 @@ def _validate_dataframe(self, index, cast_column_names_lower=True):
         """
         dataframe = getattr(self, index)
         if not isinstance(dataframe, pd.DataFrame):
-            raise TypeError("{} must be a pandas DataFrame".format(index))
+            raise TypeError(f"{index} must be a pandas DataFrame")

         if cast_column_names_lower:
             dataframe.columns = [str(col).lower() for col in dataframe.columns]
         else:
             dataframe.columns = [str(col) for col in dataframe.columns]
         # Check if join_columns are present in the dataframe
         if not set(self.join_columns).issubset(set(dataframe.columns)):
-            raise ValueError("{} must have all columns from join_columns".format(index))
+            raise ValueError(f"{index} must have all columns from join_columns")

         if len(set(dataframe.columns)) < len(dataframe.columns):
-            raise ValueError("{} must have unique column names".format(index))
+            raise ValueError(f"{index} must have unique column names")

         if self.on_index:
             if dataframe.index.duplicated().sum() > 0:
@@ -196,22 +196,18 @@ def _compare(self, ignore_spaces, ignore_case):
         else:
             LOG.info("df1 does not Pandas.DataFrame.equals df2")
         LOG.info(
-            "Number of columns in common: {}".format(len(self.intersect_columns()))
+            f"Number of columns in common: {len(self.intersect_columns())}"
         )
         LOG.debug("Checking column overlap")
         for col in self.df1_unq_columns():
-            LOG.info("Column in df1 and not in df2: {}".format(col))
+            LOG.info(f"Column in df1 and not in df2: {col}")
         LOG.info(
-            "Number of columns in df1 and not in df2: {}".format(
-                len(self.df1_unq_columns())
-            )
+            f"Number of columns in df1 and not in df2: {len(self.df1_unq_columns())}"
         )
         for col in self.df2_unq_columns():
-            LOG.info("Column in df2 and not in df1: {}".format(col))
+            LOG.info(f"Column in df2 and not in df1: {col}")
         LOG.info(
-            "Number of columns in df2 and not in df1: {}".format(
-                len(self.df2_unq_columns())
-            )
+            f"Number of columns in df2 and not in df1: {len(self.df2_unq_columns())}"
         )
         LOG.debug("Merging dataframes")
         self._dataframe_merge(ignore_spaces)
@@ -306,18 +302,16 @@ def _dataframe_merge(self, ignore_spaces):
         ].copy()
         self.df2_unq_rows.columns = self.df2.columns
         LOG.info(
-            "Number of rows in df1 and not in df2: {}".format(len(self.df1_unq_rows))
+            f"Number of rows in df1 and not in df2: {len(self.df1_unq_rows)}"
         )
         LOG.info(
-            "Number of rows in df2 and not in df1: {}".format(len(self.df2_unq_rows))
+            f"Number of rows in df2 and not in df1: {len(self.df2_unq_rows)}"
         )

         LOG.debug("Selecting intersecting rows")
         self.intersect_rows = outer_join[outer_join["_merge"] == "both"].copy()
         LOG.info(
-            "Number of rows in df1 and df2 (not necessarily equal): {}".format(
-                len(self.intersect_rows)
-            )
+            f"Number of rows in df1 and df2 (not necessarily equal): {len(self.intersect_rows)}"
         )

     def _intersect_compare(self, ignore_spaces, ignore_case):
@@ -361,9 +355,7 @@ def _intersect_compare(self, ignore_spaces, ignore_case):
             else:
                 match_rate = 0
             LOG.info(
-                "{}: {} / {} ({:.2%}) match".format(
-                    column, match_cnt, row_cnt, match_rate
-                )
+                f"{column}: {match_cnt} / {row_cnt} ({match_rate:.2%}) match"
             )

             self.column_stats.append(
@@ -519,14 +511,12 @@ def all_mismatch(self, ignore_matching_cols=False):
             if not ignore_matching_cols or (
                 ignore_matching_cols and not col_comparison.all()
             ):
-                LOG.debug("Adding column {} to the result.".format(orig_col_name))
+                LOG.debug(f"Adding column {orig_col_name} to the result.")
                 match_list.append(col)
                 return_list.extend([orig_col_name + "_df1", orig_col_name + "_df2"])
             elif ignore_matching_cols:
                 LOG.debug(
-                    "Column {} is equal in df1 and df2. It will not be added to the result.".format(
-                        orig_col_name
-                    )
+                    f"Column {orig_col_name} is equal in df1 and df2. It will not be added to the result."
                 )

         mm_bool = self.intersect_rows[match_list].all(axis="columns")
@@ -612,8 +602,8 @@ def report(self, sample_count=10, column_count=10, html_file=None):
             match_stats.append(
                 {
                     "Column": column["column"],
-                    "{} dtype".format(self.df1_name): column["dtype1"],
-                    "{} dtype".format(self.df2_name): column["dtype2"],
+                    f"{self.df1_name} dtype": column["dtype1"],
+                    f"{self.df2_name} dtype": column["dtype2"],
                     "# Unequal": column["unequal_cnt"],
                     "Max Diff": column["max_diff"],
                     "# Null Diff": column["null_diff"],
@@ -636,8 +626,8 @@ def report(self, sample_count=10, column_count=10, html_file=None):
         report += df_match_stats[
             [
                 "Column",
-                "{} dtype".format(self.df1_name),
-                "{} dtype".format(self.df2_name),
+                f"{self.df1_name} dtype",
+                f"{self.df2_name} dtype",
                 "# Unequal",
                 "Max Diff",
                 "# Null Diff",
@@ -654,25 +644,17 @@ def report(self, sample_count=10, column_count=10, html_file=None):
             report += "\n\n"

         if min(sample_count, self.df1_unq_rows.shape[0]) > 0:
-            report += "Sample Rows Only in {} (First {} Columns)\n".format(
-                self.df1_name, column_count
-            )
-            report += "---------------------------------------{}\n".format(
-                "-" * len(self.df1_name)
-            )
+            report += f"Sample Rows Only in {self.df1_name} (First {column_count} Columns)\n"
+            report += f"---------------------------------------{'-' * len(self.df1_name)}\n"
             report += "\n"
             columns = self.df1_unq_rows.columns[:column_count]
             unq_count = min(sample_count, self.df1_unq_rows.shape[0])
             report += self.df1_unq_rows.sample(unq_count)[columns].to_string()
             report += "\n\n"

         if min(sample_count, self.df2_unq_rows.shape[0]) > 0:
-            report += "Sample Rows Only in {} (First {} Columns)\n".format(
-                self.df2_name, column_count
-            )
-            report += "---------------------------------------{}\n".format(
-                "-" * len(self.df2_name)
-            )
+            report += f"Sample Rows Only in {self.df2_name} (First {column_count} Columns)\n"
+            report += f"---------------------------------------{'-' * len(self.df2_name)}\n"
             report += "\n"
             columns = self.df2_unq_rows.columns[:column_count]
             unq_count = min(sample_count, self.df2_unq_rows.shape[0])
@@ -859,7 +841,7 @@ def temp_column_name(*dataframes):
     """
     i = 0
     while True:
-        temp_column = "_temp_{}".format(i)
+        temp_column = f"_temp_{i}"
         unique = True
         for dataframe in dataframes:
             if temp_column in dataframe.columns:
@@ -909,7 +891,7 @@ def generate_id_within_group(dataframe, join_columns):
     default_value = "DATACOMPY_NULL"
     if dataframe[join_columns].isnull().any().any():
         if (dataframe[join_columns] == default_value).any().any():
-            raise ValueError("{} was found in your join columns".format(default_value))
+            raise ValueError(f"{default_value} was found in your join columns")
     return (
         dataframe[join_columns]
         .astype(str)
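The methods touched above (``_validate_dataframe``, ``_compare``, ``report``, ``all_mismatch``) are all reached through ``datacompy.Compare``. A minimal usage sketch (the column and frame names here are invented for illustration):

    import pandas as pd
    import datacompy

    df1 = pd.DataFrame({"id": [1, 2, 3], "amount": [10.0, 20.0, 30.0]})
    df2 = pd.DataFrame({"id": [1, 2, 4], "amount": [10.0, 21.0, 40.0]})

    # join_columns feeds the validation and merge logic shown in the diff
    compare = datacompy.Compare(
        df1, df2, join_columns="id", df1_name="original", df2_name="new"
    )
    print(compare.report())        # human-readable summary, built in report()
    print(compare.all_mismatch())  # intersecting rows with any unequal column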