Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release v0.13.3 #332

Merged
merged 4 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/workflows/test-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,20 @@ permissions:
contents: read

jobs:
lint-and-format:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.9
uses: actions/setup-python@v5
with:
python-version: "3.9"
- name: Install dependencies
run: python -m pip install .[qa]
- name: Linting by ruff
run: ruff check
- name: Formatting by ruff
run: ruff format --check
test-dev-install:

runs-on: ubuntu-latest
Expand Down
31 changes: 22 additions & 9 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,25 @@
repos:
- repo: https://github.com/psf/black
rev: 23.3.0
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.5.7
hooks:
- id: black
types: [file, python]
language_version: python3.10
- repo: https://github.com/pycqa/isort
rev: 5.12.0
# Run the linter.
- id: ruff
args: [ --fix ]
# Run the formatter.
- id: ruff-format
types_or: [ python, jupyter ]
# # Mypy: Optional static type checking
# # https://github.com/pre-commit/mirrors-mypy
# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: v1.11.1
# hooks:
# - id: mypy
# exclude: ^(docs|tests)\/
# language_version: python3.9
# args: [--namespace-packages, --explicit-package-bases, --ignore-missing-imports, --non-interactive, --install-types]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
- id: isort
name: isort (python)
- id: trailing-whitespace
- id: debug-statements
- id: end-of-file-fixer
48 changes: 42 additions & 6 deletions datacompy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,28 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DataComPy is a package to compare two Pandas DataFrames.

__version__ = "0.13.2"
Originally started to be something of a replacement for SAS's PROC COMPARE for Pandas DataFrames with some more functionality than just Pandas.DataFrame.equals(Pandas.DataFrame) (in that it prints out some stats, and lets you tweak how accurate matches have to be).
Then extended to carry that functionality over to Spark Dataframes.
"""

__version__ = "0.13.3"

import platform
from warnings import warn

from .core import * # noqa: F403
from .fugue import ( # noqa: F401
from datacompy.base import BaseCompare, temp_column_name
from datacompy.core import (
Compare,
calculate_max_diff,
columns_equal,
compare_string_and_date_columns,
generate_id_within_group,
get_merged_columns,
render,
)
from datacompy.fugue import (
all_columns_match,
all_rows_overlap,
count_matching_rows,
Expand All @@ -28,9 +42,31 @@
report,
unq_columns,
)
from .polars import PolarsCompare # noqa: F401
from .spark.pandas import SparkPandasCompare # noqa: F401
from .spark.sql import SparkSQLCompare # noqa: F401
from datacompy.polars import PolarsCompare
from datacompy.spark.pandas import SparkPandasCompare
from datacompy.spark.sql import SparkSQLCompare

__all__ = [
"BaseCompare",
"Compare",
"PolarsCompare",
"SparkPandasCompare",
"SparkSQLCompare",
"all_columns_match",
"all_rows_overlap",
"calculate_max_diff",
"columns_equal",
"compare_string_and_date_columns",
"count_matching_rows",
"generate_id_within_group",
"get_merged_columns",
"intersect_columns",
"is_match",
"render",
"report",
"temp_column_name",
"unq_columns",
]

major = platform.python_version_tuple()[0]
minor = platform.python_version_tuple()[1]
Expand Down
40 changes: 29 additions & 11 deletions datacompy/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# limitations under the License.

"""
Compare two Pandas DataFrames
Compare two Pandas DataFrames.

Originally this package was meant to provide similar functionality to
PROC COMPARE in SAS - i.e. human-readable reporting on the difference between
Expand All @@ -31,36 +31,42 @@


class BaseCompare(ABC):
"""Base comparison class."""

@property
def df1(self) -> Any:
"""Get the first dataframe."""
return self._df1 # type: ignore

@df1.setter
@abstractmethod
def df1(self, df1: Any) -> None:
"""Check that it is a dataframe and has the join columns"""
"""Check that it is a dataframe and has the join columns."""
pass

@property
def df2(self) -> Any:
"""Get the second dataframe."""
return self._df2 # type: ignore

@df2.setter
@abstractmethod
def df2(self, df2: Any) -> None:
"""Check that it is a dataframe and has the join columns"""
"""Check that it is a dataframe and has the join columns."""
pass

@abstractmethod
def _validate_dataframe(
self, index: str, cast_column_names_lower: bool = True
) -> None:
"""Check that it is a dataframe and has the join columns"""
"""Check that it is a dataframe and has the join columns."""
pass

@abstractmethod
def _compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
"""Actually run the comparison. This tries to run df1.equals(df2)
"""Run the comparison.

This tries to run df1.equals(df2)
first so that if they're truly equal we can tell.

This method will log out information about what is different between
Expand All @@ -70,23 +76,25 @@ def _compare(self, ignore_spaces: bool, ignore_case: bool) -> None:

@abstractmethod
def df1_unq_columns(self) -> OrderedSet[str]:
"""Get columns that are unique to df1"""
"""Get columns that are unique to df1."""
pass

@abstractmethod
def df2_unq_columns(self) -> OrderedSet[str]:
"""Get columns that are unique to df2"""
"""Get columns that are unique to df2."""
pass

@abstractmethod
def intersect_columns(self) -> OrderedSet[str]:
"""Get columns that are shared between the two dataframes"""
"""Get columns that are shared between the two dataframes."""
pass

@abstractmethod
def _dataframe_merge(self, ignore_spaces: bool) -> None:
"""Merge df1 to df2 on the join columns, to get df1 - df2, df2 - df1
and df1 & df2
"""Merge df1 to df2 on the join columns.

To get df1 - df2, df2 - df1
and df1 & df2.

If ``on_index`` is True, this will join on index values, otherwise it
will join on the ``join_columns``.
Expand All @@ -95,40 +103,49 @@ def _dataframe_merge(self, ignore_spaces: bool) -> None:

@abstractmethod
def _intersect_compare(self, ignore_spaces: bool, ignore_case: bool) -> None:
"""Compare the intersection of the two dataframes."""
pass

@abstractmethod
def all_columns_match(self) -> bool:
"""Check if all columns match."""
pass

@abstractmethod
def all_rows_overlap(self) -> bool:
"""Check if all rows overlap."""
pass

@abstractmethod
def count_matching_rows(self) -> int:
"""Count the number of matching rows."""
pass

@abstractmethod
def intersect_rows_match(self) -> bool:
"""Check if the intersection of rows match."""
pass

@abstractmethod
def matches(self, ignore_extra_columns: bool = False) -> bool:
"""Check if the dataframes match."""
pass

@abstractmethod
def subset(self) -> bool:
"""Check if one dataframe is a subset of the other."""
pass

@abstractmethod
def sample_mismatch(
self, column: str, sample_count: int = 10, for_display: bool = False
) -> Any:
"""Get a sample of rows that mismatch."""
pass

@abstractmethod
def all_mismatch(self, ignore_matching_cols: bool = False) -> Any:
"""Get all rows that mismatch."""
pass

@abstractmethod
Expand All @@ -138,11 +155,12 @@ def report(
column_count: int = 10,
html_file: Optional[str] = None,
) -> str:
"""Return a string representation of a report."""
pass


def temp_column_name(*dataframes) -> str:
"""Gets a temp column name that isn't included in columns of any dataframes
"""Get a temp column name that isn't included in columns of any dataframes.

Parameters
----------
Expand Down
Loading
Loading