From a2aca030986a0f3257442da4fd448be1dd26cace Mon Sep 17 00:00:00 2001 From: Doug Davis Date: Wed, 18 Oct 2023 12:56:05 -0500 Subject: [PATCH 01/19] starting on datetime support --- .gitignore | 3 ++ src/awkward_pandas/accessor.py | 5 +++ src/awkward_pandas/datetimes.py | 75 +++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+) create mode 100644 src/awkward_pandas/datetimes.py diff --git a/.gitignore b/.gitignore index 15de219..e211849 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# emacs +.dir-locals.el + # setuptools_scm src/awkward_pandas/version.py diff --git a/src/awkward_pandas/accessor.py b/src/awkward_pandas/accessor.py index 2b068ea..ed8943b 100644 --- a/src/awkward_pandas/accessor.py +++ b/src/awkward_pandas/accessor.py @@ -7,6 +7,7 @@ import pandas as pd from awkward_pandas.array import AwkwardExtensionArray +from awkward_pandas.datetimes import DatetimeAccessor from awkward_pandas.dtype import AwkwardDtype from awkward_pandas.strings import StringAccessor @@ -143,6 +144,10 @@ def _validate(obj): def str(self) -> StringAccessor: return StringAccessor(self) + @property + def dt(self) -> DatetimeAccessor: + return DatetimeAccessor(self) + def __getattr__(self, item): """Call awkward namespace function on a series""" # replace with concrete implementations of all top-level ak functions diff --git a/src/awkward_pandas/datetimes.py b/src/awkward_pandas/datetimes.py new file mode 100644 index 0000000..99e43ed --- /dev/null +++ b/src/awkward_pandas/datetimes.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import functools +from typing import TYPE_CHECKING, Any + +import awkward as ak +import pandas as pd +import pyarrow.compute as pc + +from awkward_pandas.array import AwkwardExtensionArray + +if TYPE_CHECKING: + from awkward_pandas.accessor import AwkwardAccessor + + +component_extraction = [ + "day", + "day_of_week", + "day_of_year", + "hour", + "iso_week", + "iso_year", + "iso_calendar", + "is_leap_year", + "microsecond", + "millisecond", + "minute", + "month", + "nanosecond", + "quarter", + "second", + "subsecond", + "us_week", + "us_year", + "week", + "year", + "year_month_day", +] + + +differences = [ + "day_time_interval_between", + "days_between", + "hours_between", + "microseconds_between", + "milliseconds_between", + "minutes_between", + "month_day_nano_interval_between", + "month_interval_between", + "nanoseconds_between", + "quarters_between", + "seconds_between", + "weeks_between", + "years_between", +] + + +class DatetimeAccessor: + def __init__(self, accessor: AwkwardAccessor) -> None: + self.accessor = accessor + + def __getattr__(self, attr: str) -> Any: + if not (attr in component_extraction or attr in differences): + raise ValueError + + fn = getattr(pc, attr) + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + arrow_array = ak.to_arrow(self.accessor.array, extensionarray=False) + result = ak.from_arrow(fn(arrow_array)) + idx = self.accessor._obj.index + return pd.Series(AwkwardExtensionArray(result), index=idx) + + return wrapper From ab6b514bf7bbfd2aa0accee9555c4f7d53bb9d22 Mon Sep 17 00:00:00 2001 From: Doug Davis Date: Wed, 18 Oct 2023 12:56:49 -0500 Subject: [PATCH 02/19] pre-commit --- .pre-commit-config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e9ef277..cb28c14 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: check-added-large-files - id: check-case-conflict @@ -13,7 +13,7 @@ repos: - id: trailing-whitespace - repo: https://github.com/psf/black - rev: 23.7.0 + rev: 23.10.0 hooks: - id: black language_version: python3 @@ -21,7 +21,7 @@ repos: - --target-version=py38 - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.282 + rev: v0.1.0 hooks: - id: ruff @@ -32,7 +32,7 @@ repos: language_version: python3 - repo: https://github.com/asottile/pyupgrade - rev: v3.10.1 + rev: v3.15.0 hooks: - id: pyupgrade args: @@ -49,7 +49,7 @@ repos: - id: yesqa - repo: https://github.com/adamchainz/blacken-docs - rev: 1.15.0 + rev: 1.16.0 hooks: - id: blacken-docs additional_dependencies: From 7c542fd97ee36b479d0e7bf0e411ccd3c5b2f52f Mon Sep 17 00:00:00 2001 From: Doug Davis Date: Wed, 18 Oct 2023 13:02:58 -0500 Subject: [PATCH 03/19] tests --- src/awkward_pandas/lib.py | 2 +- tests/test_upstream_extension_tests.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/awkward_pandas/lib.py b/src/awkward_pandas/lib.py index 85ef2ae..43a9535 100644 --- a/src/awkward_pandas/lib.py +++ b/src/awkward_pandas/lib.py @@ -29,7 +29,7 @@ def merge(dataframe: pd.DataFrame, name: str | None = None) -> pd.Series: if dataframe[c].dtype == "awkward": out[c] = dataframe[c].values._data elif dataframe[c].dtype == "string[pyarrow]": - out[c] = ak.from_arrow(dataframe[c].values._data) + out[c] = ak.from_arrow(dataframe[c].values._pa_array) elif dataframe[c].dtype == np.dtype("O"): out[c] = ak.from_iter(dataframe[c]) else: diff --git a/tests/test_upstream_extension_tests.py b/tests/test_upstream_extension_tests.py index 2d40747..9d15b5c 100644 --- a/tests/test_upstream_extension_tests.py +++ b/tests/test_upstream_extension_tests.py @@ -1,6 +1,7 @@ from __future__ import annotations import pandas as pd +import pandas._testing as tm from pandas.tests.extension.base import BaseConstructorsTests, BaseDtypeTests from pandas.tests.extension.base.casting import BaseCastingTests # noqa from pandas.tests.extension.base.dim2 import Dim2CompatTests # noqa @@ -46,15 +47,15 @@ def test_from_dtype(self, data): expected = pd.Series(data) result = pd.Series(list(data), dtype=dtype) - self.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = pd.Series(list(data), dtype=str(dtype)) - self.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # this is the test that breaks the upstream version # expected = pd.DataFrame(data).astype(dtype) # result = pd.DataFrame(list(data), dtype=dtype) - # self.assert_frame_equal(result, expected) + # tm.assert_frame_equal(result, expected) # class TestAwkwardBaseCastingTests(BaseCastingTests): From 7b73a099d1091919127929ebc9cb7b1894866140 Mon Sep 17 00:00:00 2001 From: Doug Davis Date: Wed, 18 Oct 2023 13:05:09 -0500 Subject: [PATCH 04/19] drop 3.8 --- .github/workflows/pypi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 0e1103d..a923c49 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: platform: [ubuntu-latest, macos-latest, windows-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11"] runs-on: ${{matrix.platform}} steps: - name: Checkout From 3435d8508ae65a2ae3f541ba1287561d041f6fcd Mon Sep 17 00:00:00 2001 From: Doug Davis Date: Wed, 18 Oct 2023 13:18:06 -0500 Subject: [PATCH 05/19] support args and kwargs --- src/awkward_pandas/datetimes.py | 35 +++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/src/awkward_pandas/datetimes.py b/src/awkward_pandas/datetimes.py index 99e43ed..3ad268a 100644 --- a/src/awkward_pandas/datetimes.py +++ b/src/awkward_pandas/datetimes.py @@ -13,7 +13,17 @@ from awkward_pandas.accessor import AwkwardAccessor -component_extraction = [ +PYARROW_FUNCTIONS = [ + # CONVERSIONS + "cast", + "ceil_temporal", + "floor_temporal", + "round_temporal", + "run_end_decode", + "run_end_encode", + "strftime", + "strptime ", + # COMPONENT_EXTRACTION "day", "day_of_week", "day_of_year", @@ -35,10 +45,7 @@ "week", "year", "year_month_day", -] - - -differences = [ + # DIFFERENCES "day_time_interval_between", "days_between", "hours_between", @@ -59,8 +66,11 @@ class DatetimeAccessor: def __init__(self, accessor: AwkwardAccessor) -> None: self.accessor = accessor + def __dir__(self) -> list[str]: + return sorted(PYARROW_FUNCTIONS) + def __getattr__(self, attr: str) -> Any: - if not (attr in component_extraction or attr in differences): + if attr not in PYARROW_FUNCTIONS: raise ValueError fn = getattr(pc, attr) @@ -68,8 +78,17 @@ def __getattr__(self, attr: str) -> Any: @functools.wraps(fn) def wrapper(*args, **kwargs): arrow_array = ak.to_arrow(self.accessor.array, extensionarray=False) - result = ak.from_arrow(fn(arrow_array)) + + arrow_args, arrow_kwargs = [], {} + for arg in args: + if isinstance(arg, pd.Series) and arg.dtype == "awkward": + arrow_args.append(ak.to_arrow(arg.ak.array, extensionarray=False)) + for k, v in kwargs.items(): + if isinstance(v, pd.Series) and v.dtype == "awkward": + arrow_kwargs[k] = ak.to_arrow(v.ak.array, extensionarray=False) + + result = fn(arrow_array, *arrow_args, **arrow_kwargs) idx = self.accessor._obj.index - return pd.Series(AwkwardExtensionArray(result), index=idx) + return pd.Series(AwkwardExtensionArray(ak.from_arrow(result)), index=idx) return wrapper From 4ab40213b9779fc73e3614c18528458094fd2cd0 Mon Sep 17 00:00:00 2001 From: Doug Davis Date: Wed, 18 Oct 2023 13:22:20 -0500 Subject: [PATCH 06/19] dir --- src/awkward_pandas/datetimes.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/awkward_pandas/datetimes.py b/src/awkward_pandas/datetimes.py index 3ad268a..8ea8d73 100644 --- a/src/awkward_pandas/datetimes.py +++ b/src/awkward_pandas/datetimes.py @@ -67,7 +67,11 @@ def __init__(self, accessor: AwkwardAccessor) -> None: self.accessor = accessor def __dir__(self) -> list[str]: - return sorted(PYARROW_FUNCTIONS) + return sorted( + [x for x in dir(type(self)) if not x.startswith("_")] + + dir(super()) + + PYARROW_FUNCTIONS + ) def __getattr__(self, attr: str) -> Any: if attr not in PYARROW_FUNCTIONS: From da62b84558470130ddefa938cb1283f212479371 Mon Sep 17 00:00:00 2001 From: Doug Davis Date: Wed, 18 Oct 2023 13:22:56 -0500 Subject: [PATCH 07/19] rm spaces --- src/awkward_pandas/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/awkward_pandas/datetimes.py b/src/awkward_pandas/datetimes.py index 8ea8d73..eb27b11 100644 --- a/src/awkward_pandas/datetimes.py +++ b/src/awkward_pandas/datetimes.py @@ -22,7 +22,7 @@ "run_end_decode", "run_end_encode", "strftime", - "strptime ", + "strptime", # COMPONENT_EXTRACTION "day", "day_of_week", From ed1622b22ea0003e9acfe71ad2fb48f6fc152168 Mon Sep 17 00:00:00 2001 From: Doug Davis Date: Wed, 18 Oct 2023 13:26:34 -0500 Subject: [PATCH 08/19] don't drop non-arrow-array args and kwargs --- src/awkward_pandas/datetimes.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/awkward_pandas/datetimes.py b/src/awkward_pandas/datetimes.py index eb27b11..f422ef6 100644 --- a/src/awkward_pandas/datetimes.py +++ b/src/awkward_pandas/datetimes.py @@ -83,15 +83,19 @@ def __getattr__(self, attr: str) -> Any: def wrapper(*args, **kwargs): arrow_array = ak.to_arrow(self.accessor.array, extensionarray=False) - arrow_args, arrow_kwargs = [], {} + new_args, new_kwargs = [], {} for arg in args: if isinstance(arg, pd.Series) and arg.dtype == "awkward": - arrow_args.append(ak.to_arrow(arg.ak.array, extensionarray=False)) + new_args.append(ak.to_arrow(arg.ak.array, extensionarray=False)) + else: + new_args.append(arg) for k, v in kwargs.items(): if isinstance(v, pd.Series) and v.dtype == "awkward": - arrow_kwargs[k] = ak.to_arrow(v.ak.array, extensionarray=False) + new_kwargs[k] = ak.to_arrow(v.ak.array, extensionarray=False) + else: + new_kwargs[k] = v - result = fn(arrow_array, *arrow_args, **arrow_kwargs) + result = fn(arrow_array, *new_args, **new_kwargs) idx = self.accessor._obj.index return pd.Series(AwkwardExtensionArray(ak.from_arrow(result)), index=idx) From 53089339fa2c86b7953d47dfef2f3000c91b4ce3 Mon Sep 17 00:00:00 2001 From: Doug Davis Date: Wed, 18 Oct 2023 13:28:51 -0500 Subject: [PATCH 09/19] organize --- src/awkward_pandas/datetimes.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/awkward_pandas/datetimes.py b/src/awkward_pandas/datetimes.py index f422ef6..7e93979 100644 --- a/src/awkward_pandas/datetimes.py +++ b/src/awkward_pandas/datetimes.py @@ -83,12 +83,18 @@ def __getattr__(self, attr: str) -> Any: def wrapper(*args, **kwargs): arrow_array = ak.to_arrow(self.accessor.array, extensionarray=False) - new_args, new_kwargs = [], {} + # parse args so that other series backed by awkward are + # converted to arrow array objects. + new_args = [] for arg in args: if isinstance(arg, pd.Series) and arg.dtype == "awkward": new_args.append(ak.to_arrow(arg.ak.array, extensionarray=False)) else: new_args.append(arg) + + # parse kwargs so that other series backed by awkward are + # converted to arrow array objects. + new_kwargs = {} for k, v in kwargs.items(): if isinstance(v, pd.Series) and v.dtype == "awkward": new_kwargs[k] = ak.to_arrow(v.ak.array, extensionarray=False) From 6b51214d0cf6f687e393e2eb8c3da0bd78f859a1 Mon Sep 17 00:00:00 2001 From: Doug Davis Date: Thu, 19 Oct 2023 10:30:25 -0500 Subject: [PATCH 10/19] explicitly implement interface (not getattr); add conversion helpers --- src/awkward_pandas/accessor.py | 13 +- src/awkward_pandas/datetimes.py | 432 +++++++++++++++++++++++++------- 2 files changed, 349 insertions(+), 96 deletions(-) diff --git a/src/awkward_pandas/accessor.py b/src/awkward_pandas/accessor.py index ed8943b..57be74e 100644 --- a/src/awkward_pandas/accessor.py +++ b/src/awkward_pandas/accessor.py @@ -189,8 +189,11 @@ def apply(self, fn): return result def __dir__(self) -> list[str]: - return [ - _ - for _ in (dir(ak)) - if not _.startswith(("_", "ak_")) and not _[0].isupper() - ] + ["to_column"] + return sorted( + [ + _ + for _ in (dir(ak)) + if not _.startswith(("_", "ak_")) and not _[0].isupper() + ] + + ["to_column", "dt"] + ) diff --git a/src/awkward_pandas/datetimes.py b/src/awkward_pandas/datetimes.py index 7e93979..fe4807d 100644 --- a/src/awkward_pandas/datetimes.py +++ b/src/awkward_pandas/datetimes.py @@ -1,10 +1,10 @@ from __future__ import annotations -import functools -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING import awkward as ak import pandas as pd +import pyarrow as pa import pyarrow.compute as pc from awkward_pandas.array import AwkwardExtensionArray @@ -13,96 +13,346 @@ from awkward_pandas.accessor import AwkwardAccessor -PYARROW_FUNCTIONS = [ - # CONVERSIONS - "cast", - "ceil_temporal", - "floor_temporal", - "round_temporal", - "run_end_decode", - "run_end_encode", - "strftime", - "strptime", - # COMPONENT_EXTRACTION - "day", - "day_of_week", - "day_of_year", - "hour", - "iso_week", - "iso_year", - "iso_calendar", - "is_leap_year", - "microsecond", - "millisecond", - "minute", - "month", - "nanosecond", - "quarter", - "second", - "subsecond", - "us_week", - "us_year", - "week", - "year", - "year_month_day", - # DIFFERENCES - "day_time_interval_between", - "days_between", - "hours_between", - "microseconds_between", - "milliseconds_between", - "minutes_between", - "month_day_nano_interval_between", - "month_interval_between", - "nanoseconds_between", - "quarters_between", - "seconds_between", - "weeks_between", - "years_between", -] - - class DatetimeAccessor: def __init__(self, accessor: AwkwardAccessor) -> None: self.accessor = accessor - def __dir__(self) -> list[str]: - return sorted( - [x for x in dir(type(self)) if not x.startswith("_")] - + dir(super()) - + PYARROW_FUNCTIONS - ) - - def __getattr__(self, attr: str) -> Any: - if attr not in PYARROW_FUNCTIONS: - raise ValueError - - fn = getattr(pc, attr) - - @functools.wraps(fn) - def wrapper(*args, **kwargs): - arrow_array = ak.to_arrow(self.accessor.array, extensionarray=False) - - # parse args so that other series backed by awkward are - # converted to arrow array objects. - new_args = [] - for arg in args: - if isinstance(arg, pd.Series) and arg.dtype == "awkward": - new_args.append(ak.to_arrow(arg.ak.array, extensionarray=False)) - else: - new_args.append(arg) - - # parse kwargs so that other series backed by awkward are - # converted to arrow array objects. - new_kwargs = {} - for k, v in kwargs.items(): - if isinstance(v, pd.Series) and v.dtype == "awkward": - new_kwargs[k] = ak.to_arrow(v.ak.array, extensionarray=False) - else: - new_kwargs[k] = v - - result = fn(arrow_array, *new_args, **new_kwargs) - idx = self.accessor._obj.index - return pd.Series(AwkwardExtensionArray(ak.from_arrow(result)), index=idx) - - return wrapper + def cast(self, target_type=None, safe=None, options=None, memory_pool=None): + raise NotImplementedError("TODO") + + def ceil_temporal( + self, + /, + multiple=1, + unit="day", + *, + week_starts_monday=True, + ceil_is_strictly_greater=False, + calendar_based_origin=False, + options=None, + memory_pool=None, + ): + raise NotImplementedError("TODO") + + def floor_temporal( + self, + /, + multiple=1, + unit="day", + *, + week_starts_monday=True, + ceil_is_strictly_greater=False, + calendar_based_origin=False, + options=None, + memory_pool=None, + ): + raise NotImplementedError("TODO") + + def round_temporal( + self, + /, + multiple=1, + unit="day", + *, + week_starts_monday=True, + ceil_is_strictly_greater=False, + calendar_based_origin=False, + options=None, + memory_pool=None, + ): + raise NotImplementedError("TODO") + + def run_end_decode(self, array, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def run_end_encode( + self, + /, + run_end_type=pa.int32(), + *, + options=None, + memory_pool=None, + ): + raise NotImplementedError("TODO") + + def strftime( + self, + /, + format="%Y-%m-%dT%H:%M:%S", + locale="C", + *, + options=None, + memory_pool=None, + ): + raise NotImplementedError("TODO") + + def strptime( + self, + /, + format, + unit, + error_is_null=False, + *, + options=None, + memory_pool=None, + ): + raise NotImplementedError("TODO") + + def day(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def day_of_week( + self, + /, + *, + count_from_zero=True, + week_start=1, + options=None, + memory_pool=None, + ): + raise NotImplementedError("TODO") + + def day_of_year(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def hour(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def iso_week(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def iso_year(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def iso_calendar(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def is_leap_year(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def microsecond(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def millisecond(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def minute(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def month(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def nanosecond(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def quarter(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def second(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def subsecond(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def us_week(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def us_year(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def week( + self, + /, + *, + week_starts_monday=True, + count_from_zero=False, + first_week_is_fully_in_year=False, + options=None, + memory_pool=None, + ): + raise NotImplementedError("TODO") + + def year(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def year_month_day(self, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def day_time_interval_between(self, end, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def days_between(self, end, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def hours_between(self, end, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def microseconds_between(self, end, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def milliseconds_between(self, end, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def minutes_between(self, end, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def month_day_nano_interval_between(self, end, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def month_interval_between(self, end, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def nanoseconds_between(self, end, /, *, memory_pool=None): + arr, args, kwargs = _arrowize(self, end, memory_pool=memory_pool) + return _as_series(pc.nanoseconds_between(arr, *args, **kwargs)) + + def quarters_between(self, end, /, *, memory_pool=None): + raise NotImplementedError("TODO") + + def seconds_between(self, end, /, *, memory_pool=None): + arr, args, kwargs = _arrowize(self, end, memory_pool=memory_pool) + return _as_series(pc.seconds_between(arr, *args, **kwargs)) + + def weeks_between( + self, + end, + /, + *, + count_from_zero=True, + week_start=1, + options=None, + memory_pool=None, + ): + raise NotImplementedError("TODO") + + def years_between(self, end, /, *, memory_pool=None): + arr, args, kwargs = _arrowize(self, end, memory_pool=memory_pool) + return _as_series(pc.years_between(arr, *args, **kwargs)) + + # def __getattr__(self, attr: str) -> Callable: + # if attr not in dir(self): + # raise AttributeError + + # fn = getattr(pc, attr, None) + + # if fn: + + # @functools.wraps(fn) + # def wrapper(*args, **kwargs): + # try: + # arrow_array = ak.to_arrow(self.accessor.array, extensionarray=False) + # except ArrowNotImplementedError("TODO") as err: + # msg = ( + # "Could not convert data to arrow\n" + # "Arrow requires datetime with units: " + # "seconds, milliseconds, microseconds, nanoseconds" + # ) + # raise ArrowNotImplementedError(msg) + + # # parse args so that other series backed by awkward are + # # converted to arrow array objects. + # new_args = [] + # for arg in args: + # if isinstance(arg, pd.Series) and arg.dtype == "awkward": + # new_args.append(ak.to_arrow(arg.ak.array, extensionarray=False)) + # else: + # new_args.append(arg) + + # # parse kwargs so that other series backed by awkward are + # # converted to arrow array objects. + # new_kwargs = {} + # for k, v in kwargs.items(): + # if isinstance(v, pd.Series) and v.dtype == "awkward": + # new_kwargs[k] = ak.to_arrow(v.ak.array, extensionarray=False) + # else: + # new_kwargs[k] = v + + # result = fn(arrow_array, *new_args, **new_kwargs) + # idx = self.accessor._obj.index + # return pd.Series( + # AwkwardExtensionArray(ak.from_arrow(result)), index=idx + # ) + + # else: + # raise AttributeError + + # return wrapper + + +def _to_arrow(array): + array = _make_unit_compatible(array) + return ak.to_arrow(array, extensionarray=False) + + +def _make_unit_compatible(array): + # TODO, actually convert units if not compatible + return array + + +def _arrowize(dta, *args, **kwargs): + """Convert objects to arrow arrays. + + Parameters + ---------- + dta : DatetimeAccessor + The DatetimeAccessor with information about the main Series + object that is of dtype :obj:`~awkward_pandas.AwkwardDtype`. + *args : Any + Arguments that should be converted to arrow if necessary. Any + arguments that are Series backed by the + :obj:`~awkward_pandas.AwkwardDtype` will have the underlying + awkward array converted to an arrow array. + **kwargs : Any + Keyword arguments that should be converted to arrow if + necessary. Any values that are Series backed by the + :obj:`~awkward_pandas.AwkwardDtype` will have the underlying + awkward array converted to an arrow array. + + Returns + ------- + Array + Primary awkward Series converted to arrow. + tuple + New arguments with necessary conversions. + dict + New keyword arguments with necessary conversions. + + """ + primary_as_arrow = _to_arrow(dta.accessor.array) + + # parse args so that other series backed by awkward are + # converted to arrow array objects. + new_args = [] + for arg in args: + if isinstance(arg, pd.Series) and arg.dtype == "awkward": + new_args.append(_to_arrow(arg.ak.array)) + else: + new_args.append(arg) + + # parse kwargs so that other series backed by awkward are + # converted to arrow array objects. + new_kwargs = {} + for k, v in kwargs.items(): + if isinstance(v, pd.Series) and v.dtype == "awkward": + new_kwargs[k] = _to_arrow(v.ak.array) + else: + new_kwargs[k] = v + + return primary_as_arrow, new_args, new_kwargs + + +def _as_series(pyarrow_result): + """Convert pyarrow Array back in to awkward Series. + + Parameters + ---------- + pyarrow_result : pyarrow Array + PyArray array that was the result of a pyarrow.compute call. + + Examples + -------- + pd.Series + Series of type :obj:`~awkward_pandas.AwkwardDtype`. + + """ + return pd.Series(AwkwardExtensionArray(ak.from_arrow(pyarrow_result))) From fa544beed997dc147a583671e68f5fd59595a266 Mon Sep 17 00:00:00 2001 From: Doug Davis Date: Thu, 19 Oct 2023 16:02:57 -0500 Subject: [PATCH 11/19] cleanup --- src/awkward_pandas/datetimes.py | 156 +++++++++++--------------------- 1 file changed, 53 insertions(+), 103 deletions(-) diff --git a/src/awkward_pandas/datetimes.py b/src/awkward_pandas/datetimes.py index fe4807d..4528423 100644 --- a/src/awkward_pandas/datetimes.py +++ b/src/awkward_pandas/datetimes.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import awkward as ak import pandas as pd @@ -14,10 +14,11 @@ class DatetimeAccessor: - def __init__(self, accessor: AwkwardAccessor) -> None: - self.accessor = accessor + def __init__(self, ak_accessor: AwkwardAccessor) -> None: + self.ak_accessor = ak_accessor + self.index = ak_accessor._obj.index - def cast(self, target_type=None, safe=None, options=None, memory_pool=None): + def cast(self, target_type=None, safe=None, options=None): raise NotImplementedError("TODO") def ceil_temporal( @@ -30,7 +31,6 @@ def ceil_temporal( ceil_is_strictly_greater=False, calendar_based_origin=False, options=None, - memory_pool=None, ): raise NotImplementedError("TODO") @@ -44,7 +44,6 @@ def floor_temporal( ceil_is_strictly_greater=False, calendar_based_origin=False, options=None, - memory_pool=None, ): raise NotImplementedError("TODO") @@ -58,11 +57,10 @@ def round_temporal( ceil_is_strictly_greater=False, calendar_based_origin=False, options=None, - memory_pool=None, ): raise NotImplementedError("TODO") - def run_end_decode(self, array, /, *, memory_pool=None): + def run_end_decode(self, array): raise NotImplementedError("TODO") def run_end_encode( @@ -71,7 +69,6 @@ def run_end_encode( run_end_type=pa.int32(), *, options=None, - memory_pool=None, ): raise NotImplementedError("TODO") @@ -82,7 +79,6 @@ def strftime( locale="C", *, options=None, - memory_pool=None, ): raise NotImplementedError("TODO") @@ -94,11 +90,10 @@ def strptime( error_is_null=False, *, options=None, - memory_pool=None, ): raise NotImplementedError("TODO") - def day(self, /, *, memory_pool=None): + def day(self): raise NotImplementedError("TODO") def day_of_week( @@ -108,56 +103,55 @@ def day_of_week( count_from_zero=True, week_start=1, options=None, - memory_pool=None, ): raise NotImplementedError("TODO") - def day_of_year(self, /, *, memory_pool=None): + def day_of_year(self): raise NotImplementedError("TODO") - def hour(self, /, *, memory_pool=None): + def hour(self): raise NotImplementedError("TODO") - def iso_week(self, /, *, memory_pool=None): + def iso_week(self): raise NotImplementedError("TODO") - def iso_year(self, /, *, memory_pool=None): + def iso_year(self): raise NotImplementedError("TODO") - def iso_calendar(self, /, *, memory_pool=None): + def iso_calendar(self): raise NotImplementedError("TODO") - def is_leap_year(self, /, *, memory_pool=None): + def is_leap_year(self): raise NotImplementedError("TODO") - def microsecond(self, /, *, memory_pool=None): + def microsecond(self): raise NotImplementedError("TODO") - def millisecond(self, /, *, memory_pool=None): + def millisecond(self): raise NotImplementedError("TODO") - def minute(self, /, *, memory_pool=None): + def minute(self): raise NotImplementedError("TODO") - def month(self, /, *, memory_pool=None): + def month(self): raise NotImplementedError("TODO") - def nanosecond(self, /, *, memory_pool=None): + def nanosecond(self): raise NotImplementedError("TODO") - def quarter(self, /, *, memory_pool=None): + def quarter(self): raise NotImplementedError("TODO") - def second(self, /, *, memory_pool=None): + def second(self): raise NotImplementedError("TODO") - def subsecond(self, /, *, memory_pool=None): + def subsecond(self): raise NotImplementedError("TODO") - def us_week(self, /, *, memory_pool=None): + def us_week(self): raise NotImplementedError("TODO") - def us_year(self, /, *, memory_pool=None): + def us_year(self): raise NotImplementedError("TODO") def week( @@ -168,49 +162,51 @@ def week( count_from_zero=False, first_week_is_fully_in_year=False, options=None, - memory_pool=None, ): raise NotImplementedError("TODO") - def year(self, /, *, memory_pool=None): + def year(self): raise NotImplementedError("TODO") - def year_month_day(self, /, *, memory_pool=None): + def year_month_day(self): raise NotImplementedError("TODO") - def day_time_interval_between(self, end, /, *, memory_pool=None): + def day_time_interval_between(self, end): raise NotImplementedError("TODO") - def days_between(self, end, /, *, memory_pool=None): + def days_between(self, end): raise NotImplementedError("TODO") - def hours_between(self, end, /, *, memory_pool=None): + def hours_between(self, end): raise NotImplementedError("TODO") - def microseconds_between(self, end, /, *, memory_pool=None): + def microseconds_between(self, end): raise NotImplementedError("TODO") - def milliseconds_between(self, end, /, *, memory_pool=None): + def milliseconds_between(self, end): raise NotImplementedError("TODO") - def minutes_between(self, end, /, *, memory_pool=None): + def minutes_between(self, end): raise NotImplementedError("TODO") - def month_day_nano_interval_between(self, end, /, *, memory_pool=None): + def month_day_nano_interval_between(self, end): raise NotImplementedError("TODO") - def month_interval_between(self, end, /, *, memory_pool=None): + def month_interval_between(self, end): raise NotImplementedError("TODO") - def nanoseconds_between(self, end, /, *, memory_pool=None): - arr, args, kwargs = _arrowize(self, end, memory_pool=memory_pool) - return _as_series(pc.nanoseconds_between(arr, *args, **kwargs)) + def nanoseconds_between(self, end): + arr, args, kwargs = _arrowize(self, end) + return _as_series( + pc.nanoseconds_between(arr, *args, **kwargs), + index=self.index, + ) - def quarters_between(self, end, /, *, memory_pool=None): + def quarters_between(self, end): raise NotImplementedError("TODO") - def seconds_between(self, end, /, *, memory_pool=None): - arr, args, kwargs = _arrowize(self, end, memory_pool=memory_pool) + def seconds_between(self, end): + arr, args, kwargs = _arrowize(self, end) return _as_series(pc.seconds_between(arr, *args, **kwargs)) def weeks_between( @@ -221,63 +217,13 @@ def weeks_between( count_from_zero=True, week_start=1, options=None, - memory_pool=None, ): raise NotImplementedError("TODO") - def years_between(self, end, /, *, memory_pool=None): - arr, args, kwargs = _arrowize(self, end, memory_pool=memory_pool) + def years_between(self, end): + arr, args, kwargs = _arrowize(self, end) return _as_series(pc.years_between(arr, *args, **kwargs)) - # def __getattr__(self, attr: str) -> Callable: - # if attr not in dir(self): - # raise AttributeError - - # fn = getattr(pc, attr, None) - - # if fn: - - # @functools.wraps(fn) - # def wrapper(*args, **kwargs): - # try: - # arrow_array = ak.to_arrow(self.accessor.array, extensionarray=False) - # except ArrowNotImplementedError("TODO") as err: - # msg = ( - # "Could not convert data to arrow\n" - # "Arrow requires datetime with units: " - # "seconds, milliseconds, microseconds, nanoseconds" - # ) - # raise ArrowNotImplementedError(msg) - - # # parse args so that other series backed by awkward are - # # converted to arrow array objects. - # new_args = [] - # for arg in args: - # if isinstance(arg, pd.Series) and arg.dtype == "awkward": - # new_args.append(ak.to_arrow(arg.ak.array, extensionarray=False)) - # else: - # new_args.append(arg) - - # # parse kwargs so that other series backed by awkward are - # # converted to arrow array objects. - # new_kwargs = {} - # for k, v in kwargs.items(): - # if isinstance(v, pd.Series) and v.dtype == "awkward": - # new_kwargs[k] = ak.to_arrow(v.ak.array, extensionarray=False) - # else: - # new_kwargs[k] = v - - # result = fn(arrow_array, *new_args, **new_kwargs) - # idx = self.accessor._obj.index - # return pd.Series( - # AwkwardExtensionArray(ak.from_arrow(result)), index=idx - # ) - - # else: - # raise AttributeError - - # return wrapper - def _to_arrow(array): array = _make_unit_compatible(array) @@ -289,7 +235,11 @@ def _make_unit_compatible(array): return array -def _arrowize(dta, *args, **kwargs): +def _arrowize( + dta: DatetimeAccessor, + *args: Any, + **kwargs: Any, +) -> tuple[Any, tuple[Any, ...], dict[str, Any]]: """Convert objects to arrow arrays. Parameters @@ -318,7 +268,7 @@ def _arrowize(dta, *args, **kwargs): New keyword arguments with necessary conversions. """ - primary_as_arrow = _to_arrow(dta.accessor.array) + primary_as_arrow = _to_arrow(dta.ak_accessor.array) # parse args so that other series backed by awkward are # converted to arrow array objects. @@ -338,10 +288,10 @@ def _arrowize(dta, *args, **kwargs): else: new_kwargs[k] = v - return primary_as_arrow, new_args, new_kwargs + return primary_as_arrow, tuple(new_args), new_kwargs -def _as_series(pyarrow_result): +def _as_series(pyarrow_result, index): """Convert pyarrow Array back in to awkward Series. Parameters @@ -355,4 +305,4 @@ def _as_series(pyarrow_result): Series of type :obj:`~awkward_pandas.AwkwardDtype`. """ - return pd.Series(AwkwardExtensionArray(ak.from_arrow(pyarrow_result))) + return pd.Series(AwkwardExtensionArray(ak.from_arrow(pyarrow_result)), index=index) From 6aefd3794836a7a43b4402e787dc675120beeacc Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 25 Apr 2024 14:51:50 -0400 Subject: [PATCH 12/19] Start --- src/awkward_pandas/polars.py | 12 +++++++++--- tests/test_polars.py | 3 +-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/awkward_pandas/polars.py b/src/awkward_pandas/polars.py index 9a67d3d..d7871b8 100644 --- a/src/awkward_pandas/polars.py +++ b/src/awkward_pandas/polars.py @@ -10,8 +10,8 @@ @pl.api.register_series_namespace("ak") @pl.api.register_dataframe_namespace("ak") class AwkwardOperations(ArithmeticMixin): - def __init__(self, df: pl.DataFrame): - self._df = df + def __init__(self, df: pl.DataFrame | pl.Series): + self._obj = df def __array_function__(self, *args, **kwargs): return self.array.__array_function__(*args, **kwargs) @@ -41,7 +41,13 @@ def __getitem__(self, item): @property def array(self): - return ak.from_arrow(self._df.to_arrow()) + return ak.from_arrow(self._obj.to_arrow()) + + @property + def str(self): + from awkward_pandas.strings import StringAccessor + + return StringAccessor(self) def __getattr__(self, item): if item not in dir(self): diff --git a/tests/test_polars.py b/tests/test_polars.py index ff5778a..3ecf817 100644 --- a/tests/test_polars.py +++ b/tests/test_polars.py @@ -1,9 +1,8 @@ import numpy as np import pytest -import awkward_pandas.polars # noqa: F401 - pl = pytest.importorskip("polars") +pytest.importorskip("awkward_pandas.polars") def test_simple(): From 8575603a4e8cc6cd535520f08e77831826c77029 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 25 Apr 2024 16:07:09 -0400 Subject: [PATCH 13/19] polars refactor --- src/awkward_pandas/mixin.py | 133 +++++++++++++++++++++++++++-------- src/awkward_pandas/polars.py | 75 ++++++++------------ 2 files changed, 133 insertions(+), 75 deletions(-) diff --git a/src/awkward_pandas/mixin.py b/src/awkward_pandas/mixin.py index a2168e2..cbe37e6 100644 --- a/src/awkward_pandas/mixin.py +++ b/src/awkward_pandas/mixin.py @@ -1,4 +1,14 @@ import operator +from typing import Callable, Iterable + +import awkward as ak + +methods = [ + _ for _ in (dir(ak)) if not _.startswith(("_", "ak_")) and not _[0].isupper() +] + ["apply", "array", "explode"] + +df_methods = sorted(methods + ["merge"]) +series_methods = sorted(methods + ["unmerge"]) def radd(left, right): @@ -62,56 +72,117 @@ class AbstractMethodError(NotImplementedError): class ArithmeticMixin: @classmethod - def _create_arithmetic_method(cls, op): + def _create_op(cls, op): raise AbstractMethodError(cls) @classmethod - def _create_comparison_method(cls, op): + def _create_op(cls, op): raise AbstractMethodError(cls) @classmethod - def _create_logical_method(cls, op): + def _create_op(cls, op): raise AbstractMethodError(cls) @classmethod def _add_arithmetic_ops(cls) -> None: - setattr(cls, "__add__", cls._create_arithmetic_method(operator.add)) - setattr(cls, "__radd__", cls._create_arithmetic_method(radd)) - setattr(cls, "__sub__", cls._create_arithmetic_method(operator.sub)) - setattr(cls, "__rsub__", cls._create_arithmetic_method(rsub)) - setattr(cls, "__mul__", cls._create_arithmetic_method(operator.mul)) - setattr(cls, "__rmul__", cls._create_arithmetic_method(rmul)) - setattr(cls, "__pow__", cls._create_arithmetic_method(operator.pow)) - setattr(cls, "__rpow__", cls._create_arithmetic_method(rpow)) - setattr(cls, "__mod__", cls._create_arithmetic_method(operator.mod)) - setattr(cls, "__rmod__", cls._create_arithmetic_method(rmod)) - setattr(cls, "__floordiv__", cls._create_arithmetic_method(operator.floordiv)) - setattr(cls, "__rfloordiv__", cls._create_arithmetic_method(rfloordiv)) - setattr(cls, "__truediv__", cls._create_arithmetic_method(operator.truediv)) - setattr(cls, "__rtruediv__", cls._create_arithmetic_method(rtruediv)) - setattr(cls, "__divmod__", cls._create_arithmetic_method(divmod)) - setattr(cls, "__rdivmod__", cls._create_arithmetic_method(rdivmod)) + setattr(cls, "__add__", cls._create_op(operator.add)) + setattr(cls, "__radd__", cls._create_op(radd)) + setattr(cls, "__sub__", cls._create_op(operator.sub)) + setattr(cls, "__rsub__", cls._create_op(rsub)) + setattr(cls, "__mul__", cls._create_op(operator.mul)) + setattr(cls, "__rmul__", cls._create_op(rmul)) + setattr(cls, "__pow__", cls._create_op(operator.pow)) + setattr(cls, "__rpow__", cls._create_op(rpow)) + setattr(cls, "__mod__", cls._create_op(operator.mod)) + setattr(cls, "__rmod__", cls._create_op(rmod)) + setattr(cls, "__floordiv__", cls._create_op(operator.floordiv)) + setattr(cls, "__rfloordiv__", cls._create_op(rfloordiv)) + setattr(cls, "__truediv__", cls._create_op(operator.truediv)) + setattr(cls, "__rtruediv__", cls._create_op(rtruediv)) + setattr(cls, "__divmod__", cls._create_op(divmod)) + setattr(cls, "__rdivmod__", cls._create_op(rdivmod)) @classmethod def _add_comparison_ops(cls) -> None: - setattr(cls, "__eq__", cls._create_comparison_method(operator.eq)) - setattr(cls, "__ne__", cls._create_comparison_method(operator.ne)) - setattr(cls, "__lt__", cls._create_comparison_method(operator.lt)) - setattr(cls, "__gt__", cls._create_comparison_method(operator.gt)) - setattr(cls, "__le__", cls._create_comparison_method(operator.le)) - setattr(cls, "__ge__", cls._create_comparison_method(operator.ge)) + setattr(cls, "__eq__", cls._create_op(operator.eq)) + setattr(cls, "__ne__", cls._create_op(operator.ne)) + setattr(cls, "__lt__", cls._create_op(operator.lt)) + setattr(cls, "__gt__", cls._create_op(operator.gt)) + setattr(cls, "__le__", cls._create_op(operator.le)) + setattr(cls, "__ge__", cls._create_op(operator.ge)) @classmethod def _add_logical_ops(cls) -> None: - setattr(cls, "__and__", cls._create_logical_method(operator.and_)) - setattr(cls, "__rand__", cls._create_logical_method(rand_)) - setattr(cls, "__or__", cls._create_logical_method(operator.or_)) - setattr(cls, "__ror__", cls._create_logical_method(ror_)) - setattr(cls, "__xor__", cls._create_logical_method(operator.xor)) - setattr(cls, "__rxor__", cls._create_logical_method(rxor)) + setattr(cls, "__and__", cls._create_op(operator.and_)) + setattr(cls, "__rand__", cls._create_op(rand_)) + setattr(cls, "__or__", cls._create_op(operator.or_)) + setattr(cls, "__ror__", cls._create_op(ror_)) + setattr(cls, "__xor__", cls._create_op(operator.xor)) + setattr(cls, "__rxor__", cls._create_op(rxor)) @classmethod def _add_all(cls): cls._add_logical_ops() cls._add_arithmetic_ops() cls._add_comparison_ops() + + +class Accessor(ArithmeticMixin): + def __init__(self, obj): + self._obj = obj + + @property + def series_type(self): + raise NotImplementedError + + @property + def dataframe_type(self): + raise NotImplementedError + + def is_series(self, data): + return isinstance(data, self.series_type) + + def is_dataframe(self, data): + return isinstance(data, self.dataframe_type) + + def to_output(self, data): + raise NotImplementedError + + def apply(self, fn: Callable): + """Perform arbitrary function on all the values of the series""" + return self.to_output(fn(self.array)) + + def __getitem__(self, item): + out = self.array.__getitem__(item) + return self.to_output(out) + + def __dir__(self) -> Iterable[str]: + return series_methods if self.is_series(self._obj) else df_methods + + def __array_function__(self, *args, **kwargs): + return self.array.__array_function__(*args, **kwargs) + + def __array_ufunc__(self, *args, **kwargs): + if args[1] == "__call__": + return args[0](self.array, *args[3:], **kwargs) + raise NotImplementedError + + def __getattr__(self, item): + raise NotImplementedError + + @property + def array(self) -> ak.Array: + """Data as an awkward array""" + raise NotImplementedError + + def merge(self, *args): + """Create single complex series from a dataframe""" + raise NotImplementedError + + def unmerge(self): + """Create dict of series from a series with record type""" + raise NotImplementedError + + @classmethod + def _create_op(cls, op) -> Callable: + raise NotImplementedError diff --git a/src/awkward_pandas/polars.py b/src/awkward_pandas/polars.py index d7871b8..bd51ea9 100644 --- a/src/awkward_pandas/polars.py +++ b/src/awkward_pandas/polars.py @@ -1,50 +1,20 @@ import functools -from typing import Callable, Iterable, Union import awkward as ak import polars as pl -from awkward_pandas.mixin import ArithmeticMixin +from awkward_pandas.mixin import Accessor @pl.api.register_series_namespace("ak") @pl.api.register_dataframe_namespace("ak") -class AwkwardOperations(ArithmeticMixin): - def __init__(self, df: pl.DataFrame | pl.Series): - self._obj = df - - def __array_function__(self, *args, **kwargs): - return self.array.__array_function__(*args, **kwargs) - - def __array_ufunc__(self, *args, **kwargs): - if args[1] == "__call__": - return args[0](self.array, *args[3:], **kwargs) - raise NotImplementedError - - def __dir__(self) -> Iterable[str]: - return [ - _ - for _ in (dir(ak)) - if not _.startswith(("_", "ak_")) and not _[0].isupper() - ] + ["apply", "array"] - - def apply(self, fn: Callable) -> pl.DataFrame: - """Perform function on all the values of the series""" - out = fn(self.array) - return ak_to_polars(out) - - def __getitem__(self, item): - # scalars? - out = self.array.__getitem__(item) - result = ak_to_polars(out) - return result - - @property - def array(self): - return ak.from_arrow(self._obj.to_arrow()) +class AwkwardOperations(Accessor): + series_type = pl.Series + dataframe_type = pl.DataFrame @property def str(self): + """String operations""" from awkward_pandas.strings import StringAccessor return StringAccessor(self) @@ -71,27 +41,44 @@ def f(*others, **kwargs): ak_arr = func(self.array, *others, **kwargs) if isinstance(ak_arr, ak.Array): - return ak_to_polars(ak_arr) + return self.to_output(ak_arr) return ak_arr else: raise AttributeError(item) return f + def to_output(self, arr: ak.Array) -> pl.DataFrame | pl.Series: + # Series Vs DataFrame? + return pl.from_arrow(ak.to_arrow(arr, extensionarray=False)) + + @property + def array(self) -> ak.Array: + return ak.from_arrow(self._obj.to_arrow()) + @classmethod def _create_op(cls, op): def run(self, *args, **kwargs): - return ak_to_polars(op(self.array, *args, **kwargs)) + return self.to_output(op(self.array, *args, **kwargs)) return run - _create_arithmetic_method = _create_op - _create_comparison_method = _create_op - _create_logical_method = _create_op + def merge(self): + # TODO: this is almost totally generic + if not self.is_dataframe(self._obj): + raise ValueError("Can only merge on a dataframe") + out = {} + for k in self._obj.columns: + out[k] = ak.from_arrow(self._obj[k].to_arrow()) + arr = ak.Array(out) + return self.to_output(arr) + def unmerge(self): + arr = self.array + if not arr.fields: + raise ValueError + out = {k: self.to_output(arr[k]) for k in arr.fields} + return self.dataframe_type(out) -AwkwardOperations._add_all() - -def ak_to_polars(arr: ak.Array) -> Union[pl.DataFrame, pl.Series]: - return pl.from_arrow(ak.to_arrow(arr, extensionarray=False)) +AwkwardOperations._add_all() From 734d1efde74ff71248a587a89ca5900383489c0b Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 25 Apr 2024 21:46:35 -0400 Subject: [PATCH 14/19] more --- src/awkward_pandas/mixin.py | 51 +++++++++++++++++++++++++++++------ src/awkward_pandas/polars.py | 52 ------------------------------------ 2 files changed, 43 insertions(+), 60 deletions(-) diff --git a/src/awkward_pandas/mixin.py b/src/awkward_pandas/mixin.py index cbe37e6..6fbb9c3 100644 --- a/src/awkward_pandas/mixin.py +++ b/src/awkward_pandas/mixin.py @@ -1,3 +1,4 @@ +import functools import operator from typing import Callable, Iterable @@ -167,22 +168,56 @@ def __array_ufunc__(self, *args, **kwargs): return args[0](self.array, *args[3:], **kwargs) raise NotImplementedError - def __getattr__(self, item): - raise NotImplementedError - @property def array(self) -> ak.Array: """Data as an awkward array""" raise NotImplementedError def merge(self, *args): - """Create single complex series from a dataframe""" + """Create single record-type nested series from a dataframe""" raise NotImplementedError def unmerge(self): - """Create dict of series from a series with record type""" - raise NotImplementedError + arr = self.array + if not arr.fields: + raise ValueError + out = {k: self.to_output(arr[k]) for k in arr.fields} + return self.dataframe_type(out) @classmethod - def _create_op(cls, op) -> Callable: - raise NotImplementedError + def _create_op(cls, op): + def run(self, *args, **kwargs): + return self.to_output(op(self.array, *args, **kwargs)) + + return run + + def __getattr__(self, item): + if item not in dir(self): + raise AttributeError + func = getattr(ak, item, None) + + if func: + + @functools.wraps(func) + def f(*others, **kwargs): + others = [ + other.ak.array + if isinstance(other, (self.series_type, self.dataframe_type)) + else other + for other in others + ] + kwargs = { + k: v.ak.array + if isinstance(v, (self.series_type, self.dataframe_type)) + else v + for k, v in kwargs.items() + } + + ak_arr = func(self.array, *others, **kwargs) + if isinstance(ak_arr, ak.Array): + return self.to_output(ak_arr) + return ak_arr + + else: + raise AttributeError(item) + return f diff --git a/src/awkward_pandas/polars.py b/src/awkward_pandas/polars.py index bd51ea9..efe7ed5 100644 --- a/src/awkward_pandas/polars.py +++ b/src/awkward_pandas/polars.py @@ -1,5 +1,3 @@ -import functools - import awkward as ak import polars as pl @@ -12,42 +10,6 @@ class AwkwardOperations(Accessor): series_type = pl.Series dataframe_type = pl.DataFrame - @property - def str(self): - """String operations""" - from awkward_pandas.strings import StringAccessor - - return StringAccessor(self) - - def __getattr__(self, item): - if item not in dir(self): - raise AttributeError - func = getattr(ak, item, None) - - if func: - - @functools.wraps(func) - def f(*others, **kwargs): - others = [ - other.ak.array - if isinstance(other, (pl.DataFrame, pl.Series)) - else other - for other in others - ] - kwargs = { - k: v.ak.array if isinstance(v, (pl.DataFrame, pl.Series)) else v - for k, v in kwargs.items() - } - - ak_arr = func(self.array, *others, **kwargs) - if isinstance(ak_arr, ak.Array): - return self.to_output(ak_arr) - return ak_arr - - else: - raise AttributeError(item) - return f - def to_output(self, arr: ak.Array) -> pl.DataFrame | pl.Series: # Series Vs DataFrame? return pl.from_arrow(ak.to_arrow(arr, extensionarray=False)) @@ -56,13 +18,6 @@ def to_output(self, arr: ak.Array) -> pl.DataFrame | pl.Series: def array(self) -> ak.Array: return ak.from_arrow(self._obj.to_arrow()) - @classmethod - def _create_op(cls, op): - def run(self, *args, **kwargs): - return self.to_output(op(self.array, *args, **kwargs)) - - return run - def merge(self): # TODO: this is almost totally generic if not self.is_dataframe(self._obj): @@ -73,12 +28,5 @@ def merge(self): arr = ak.Array(out) return self.to_output(arr) - def unmerge(self): - arr = self.array - if not arr.fields: - raise ValueError - out = {k: self.to_output(arr[k]) for k in arr.fields} - return self.dataframe_type(out) - AwkwardOperations._add_all() From badcab3b0c4f6a3763ff48bfe40b7793017c10c8 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 26 Apr 2024 10:19:17 -0400 Subject: [PATCH 15/19] more --- src/awkward_pandas/mixin.py | 35 +++++++++++++++++++---------------- src/awkward_pandas/polars.py | 13 ++----------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/src/awkward_pandas/mixin.py b/src/awkward_pandas/mixin.py index 6fbb9c3..a27917a 100644 --- a/src/awkward_pandas/mixin.py +++ b/src/awkward_pandas/mixin.py @@ -132,21 +132,19 @@ class Accessor(ArithmeticMixin): def __init__(self, obj): self._obj = obj - @property - def series_type(self): - raise NotImplementedError - - @property - def dataframe_type(self): - raise NotImplementedError + series_type = () + dataframe_type = () - def is_series(self, data): - return isinstance(data, self.series_type) + @classmethod + def is_series(cls, data): + return isinstance(data, cls.series_type) - def is_dataframe(self, data): - return isinstance(data, self.dataframe_type) + @classmethod + def is_dataframe(cls, data): + return isinstance(data, cls.dataframe_type) - def to_output(self, data): + @classmethod + def to_output(cls, data): raise NotImplementedError def apply(self, fn: Callable): @@ -173,14 +171,19 @@ def array(self) -> ak.Array: """Data as an awkward array""" raise NotImplementedError - def merge(self, *args): - """Create single record-type nested series from a dataframe""" - raise NotImplementedError + def merge(self): + if not self.is_dataframe(self._obj): + raise ValueError("Can only merge on a dataframe") + out = {} + for k in self._obj.columns: + out[k] = self._obj[k].ak.array + arr = ak.Array(out) + return self.to_output(arr) def unmerge(self): arr = self.array if not arr.fields: - raise ValueError + raise ValueError("Not array-of-records") out = {k: self.to_output(arr[k]) for k in arr.fields} return self.dataframe_type(out) diff --git a/src/awkward_pandas/polars.py b/src/awkward_pandas/polars.py index efe7ed5..acb448a 100644 --- a/src/awkward_pandas/polars.py +++ b/src/awkward_pandas/polars.py @@ -10,7 +10,8 @@ class AwkwardOperations(Accessor): series_type = pl.Series dataframe_type = pl.DataFrame - def to_output(self, arr: ak.Array) -> pl.DataFrame | pl.Series: + @classmethod + def to_output(cls, arr: ak.Array) -> pl.DataFrame | pl.Series: # Series Vs DataFrame? return pl.from_arrow(ak.to_arrow(arr, extensionarray=False)) @@ -18,15 +19,5 @@ def to_output(self, arr: ak.Array) -> pl.DataFrame | pl.Series: def array(self) -> ak.Array: return ak.from_arrow(self._obj.to_arrow()) - def merge(self): - # TODO: this is almost totally generic - if not self.is_dataframe(self._obj): - raise ValueError("Can only merge on a dataframe") - out = {} - for k in self._obj.columns: - out[k] = ak.from_arrow(self._obj[k].to_arrow()) - arr = ak.Array(out) - return self.to_output(arr) - AwkwardOperations._add_all() From 7d661c563e44d704bcd5581aefad092cd5bfe571 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 26 Apr 2024 16:18:55 -0400 Subject: [PATCH 16/19] BIG refactor --- src/awkward_pandas/__init__.py | 13 +- src/awkward_pandas/accessor.py | 199 ------------------------- src/awkward_pandas/array.py | 186 ----------------------- src/awkward_pandas/dask.py | 41 +++++ src/awkward_pandas/dask_connect.py | 53 ------- src/awkward_pandas/datetimes.py | 102 ++----------- src/awkward_pandas/dtype.py | 77 ---------- src/awkward_pandas/io.py | 41 +---- src/awkward_pandas/lib.py | 37 ----- src/awkward_pandas/mixin.py | 37 ++++- src/awkward_pandas/pandas.py | 23 +++ src/awkward_pandas/polars.py | 12 +- src/awkward_pandas/strings.py | 30 ++-- tests/conftest.py | 10 -- tests/test_accessor.py | 65 ++------ tests/test_array.py | 59 -------- tests/test_base.py | 29 ---- tests/test_dask.py | 23 +-- tests/test_str.py | 14 +- tests/test_upstream_extension_tests.py | 153 ------------------- 20 files changed, 157 insertions(+), 1047 deletions(-) delete mode 100644 src/awkward_pandas/accessor.py delete mode 100644 src/awkward_pandas/array.py create mode 100644 src/awkward_pandas/dask.py delete mode 100644 src/awkward_pandas/dask_connect.py delete mode 100644 src/awkward_pandas/dtype.py delete mode 100644 src/awkward_pandas/lib.py create mode 100644 src/awkward_pandas/pandas.py delete mode 100644 tests/test_array.py delete mode 100644 tests/test_base.py delete mode 100644 tests/test_upstream_extension_tests.py diff --git a/src/awkward_pandas/__init__.py b/src/awkward_pandas/__init__.py index 137c934..8613a25 100644 --- a/src/awkward_pandas/__init__.py +++ b/src/awkward_pandas/__init__.py @@ -1,18 +1,11 @@ from __future__ import annotations -import awkward_pandas.accessor -import awkward_pandas.dask_connect # noqa -from awkward_pandas.array import AwkwardExtensionArray -from awkward_pandas.dtype import AwkwardDtype -from awkward_pandas.io import from_awkward, read_json, read_parquet -from awkward_pandas.lib import merge +import awkward_pandas.dask +import awkward_pandas.pandas # noqa +from awkward_pandas.io import read_json, read_parquet from awkward_pandas.version import version as __version__ # noqa __all__ = ( - "AwkwardDtype", - "AwkwardExtensionArray", - "from_awkward", - "merge", "read_parquet", "read_json", ) diff --git a/src/awkward_pandas/accessor.py b/src/awkward_pandas/accessor.py deleted file mode 100644 index 57be74e..0000000 --- a/src/awkward_pandas/accessor.py +++ /dev/null @@ -1,199 +0,0 @@ -from __future__ import annotations - -import functools -import inspect - -import awkward as ak -import pandas as pd - -from awkward_pandas.array import AwkwardExtensionArray -from awkward_pandas.datetimes import DatetimeAccessor -from awkward_pandas.dtype import AwkwardDtype -from awkward_pandas.strings import StringAccessor - -funcs = [n for n in dir(ak) if inspect.isfunction(getattr(ak, n))] - - -@pd.api.extensions.register_series_accessor("ak") -class AwkwardAccessor: - def __init__(self, pandas_obj): - if not self._validate(pandas_obj): - raise AttributeError("ak accessor called on incompatible data") - self._obj = pandas_obj - self._arr = None - - @property - def extarray(self): - if self._arr is None: - if isinstance(self._obj, AwkwardExtensionArray): - self._arr = self._obj - elif isinstance(self._obj.dtype, AwkwardDtype) and isinstance( - self._obj, pd.Series - ): - # this is a pandas Series that contains an Awkward - self._arr = self._obj.values - elif isinstance(self._obj.dtype, AwkwardDtype): - # a dask series - figure out what to do here - raise NotImplementedError - else: - # this recreates series, possibly by iteration - self._arr = AwkwardExtensionArray(self._obj) - return self._arr - - @property - def array(self) -> ak.Array: - """Get underlying awkward array""" - return self.extarray._data - - def __getitem__(self, items): - """Extract components using awkward indexing""" - ds = self.array.__getitem__(items) - index = None - if items[0]: - if not isinstance(items[0], str) and not ( - isinstance(items[0], list) and isinstance(items[0][0], str) - ): - index = self._obj.index[items[0]] - return pd.Series(AwkwardExtensionArray(ds), index=index) - - def to_column(self) -> pd.Series: - """Convert awkward series to regular pandas type - - Will convert to numpy or string[pyarrow] if appropriate. - May fail if the conversion cannot be done. - """ - data = self.array - if data.ndim > 1: - raise ValueError - # TODO: if all_strings(data) - accept ?str - if data.layout.parameter("__array__") == "string": - from pandas.core.arrays.string_arrow import ArrowStringArray - - new_ak_array = ak.to_arrow( - data, - string_to32=True, - extensionarray=False, - ) - return pd.Series(ArrowStringArray(new_ak_array)) - else: - return pd.Series(ak.to_numpy(data)) - - def to_columns( - self, - cull: bool = True, - extract_all: bool = False, - awkward_name: str = "awkward-data", - ) -> pd.DataFrame: - """Extract columns from an awkward series - - Where the series is a record type, each field may become a regular - pandas column. - - Parameters - ---------- - cull: bool - For those columns that we convert into regular ones, remove them - from the original awkward series if True - extract_all: bool - If False (default), only extract columns that can turn into normal - pandas columns. If True, all columns will be extracted, but those - that cannot be converted retain "awkward" type - awkward_name: str - If there are leftover columns in the original series, in the - resultant dataframe, these leftovers will get this column name - - Returns - ------- - pd.DataFrame - """ - s = self._obj - fields = self.array.fields - out = {} - for field in fields: - try: - out[field] = s.ak[field].ak.to_column() - except Exception: - if extract_all: - out[field] = s.ak[field] - if cull and extract_all: - pass - elif cull: - n = s.name or awkward_name - outfields = [_ for _ in fields if _ not in out] - if outfields: - out[n] = s.ak[outfields] - else: - out[s.name] = s - return pd.DataFrame(out) - - @staticmethod - def _validate(obj): - return isinstance( - obj, (AwkwardExtensionArray, ak.Array, ak.Record) - ) or isinstance(obj.values, AwkwardExtensionArray) - - # def to_arrow(self): - # return self.array.to_arrow() - - # def cartesian(self, other, **kwargs): - # if isinstance(other, AwkwardExtensionArray): - # other = other._data - # return AwkwardExtensionArray(ak.cartesian([self.array, other], **kwargs)) - - @property - def str(self) -> StringAccessor: - return StringAccessor(self) - - @property - def dt(self) -> DatetimeAccessor: - return DatetimeAccessor(self) - - def __getattr__(self, item): - """Call awkward namespace function on a series""" - # replace with concrete implementations of all top-level ak functions - if item not in dir(self): - raise AttributeError - func = getattr(ak, item, None) - - if func: - - @functools.wraps(func) - def f(*others, **kwargs): - others = [ - other._data - if isinstance(getattr(other, "_data", None), ak.Array) - else other - for other in others - ] - ak_arr = func(self.array, *others, **kwargs) - # TODO: special case to carry over index and name information where output - # is similar to input, e.g., has same length - if isinstance(ak_arr, ak.Array): - # TODO: perhaps special case here if the output can be represented - # as a regular num/cupy array - return pd.Series( - AwkwardExtensionArray(ak_arr), index=self._obj.index - ) - return ak_arr - - else: - raise AttributeError - - return f - - def apply(self, fn): - """Perform function on all the values of the series""" - result = fn(self.array) - if isinstance(result, ak.Array): - return pd.Series(AwkwardExtensionArray(result)) - return result - - def __dir__(self) -> list[str]: - return sorted( - [ - _ - for _ in (dir(ak)) - if not _.startswith(("_", "ak_")) and not _[0].isupper() - ] - + ["to_column", "dt"] - ) diff --git a/src/awkward_pandas/array.py b/src/awkward_pandas/array.py deleted file mode 100644 index 08008d0..0000000 --- a/src/awkward_pandas/array.py +++ /dev/null @@ -1,186 +0,0 @@ -from __future__ import annotations - -import operator -from collections.abc import Iterable -from typing import TYPE_CHECKING, Any, Literal - -import awkward as ak -import numpy as np -import pandas as pd -from pandas.core.arrays.base import ( - ExtensionArray, - ExtensionScalarOpsMixin, - set_function_name, -) -from pandas.core.dtypes.dtypes import ArrowDtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndex, ABCSeries - -from awkward_pandas.dtype import AwkwardDtype - -if TYPE_CHECKING: - from numpy.typing import DTypeLike, NDArray - - -class AwkwardExtensionArray(ExtensionArray, ExtensionScalarOpsMixin): - _dtype: AwkwardDtype - _data: ak.Array - - def __init__(self, data: Any) -> None: - """ - - Parameters - ---------- - data : awkward array, dict, JSON string, iterable - Construct extension array from this data. If an iterable or dict, - pass to awkward to generate the internal array. If a JSON string, - parse it using awkward. - """ - self._dtype = AwkwardDtype() - if isinstance(data, type(self)): - self._data = data._data - elif isinstance(data, ak.Array): - self._data = data - elif hasattr(data, "dtype") and isinstance(data.dtype, ArrowDtype): - self._data = ak.from_arrow(data._pa_array) - elif isinstance(data, dict): - self._data = ak.Array(data) - elif isinstance(data, str): - self._data = ak.from_json(data) - elif isinstance(data, Iterable): - self._data = ak.from_iter(None if a is pd.NA else a for a in data) - elif data is None: - self._data = ak.Array([]) - else: - raise ValueError - - @classmethod - def _from_sequence(cls, scalars, *, dtype=None, copy=False): - return cls(scalars) - - @classmethod - def _empty(cls, shape, dtype): - if isinstance(shape, tuple) and len(shape) != 1: - raise ValueError - if isinstance(shape, tuple): - return cls([None] * shape[0]) - return cls([None] * shape) - - @classmethod - def _from_factorized(cls, values, original): - return cls(values) - - def __getitem__(self, item): - if isinstance(item, int): - return operator.getitem(self._data, item) - elif isinstance(item, (slice, np.ndarray, ak.Array)): - new = operator.getitem(self._data, item) - return type(self)(new) - else: - raise ValueError(f"bad item passed to getitem: {type(item)}") - - def __setitem__(self, key, value): - raise NotImplementedError - - def __len__(self) -> int: - return len(self._data) - - def __iter__(self): - for i in range(len(self)): - yield self._data[i] - - @classmethod - def _create_method(cls, op, coerce_to_dtype=True, result_dtype=None): - def _binop(self, other): - if isinstance(other, (ABCSeries, ABCIndex, ABCDataFrame)): - # rely on pandas to unbox and dispatch to us - return NotImplemented - - lvalues = self - if isinstance(other, list) or ( - isinstance(other, pd.Series) and other.dtype == "O" - ): - rvalues = cls(other) - return cls(op(lvalues._data, rvalues._data)) - else: - return cls(op(lvalues._data, other)) - - op_name = f"__{op.__name__}__" - return set_function_name(_binop, op_name, cls) - - def _reduce(self, name: str, *, skipna: bool = True, axis=None, **kwargs): - return getattr(ak, name)(self._data, **kwargs) - - def _explode(self): - nums = ak.num(self._data, axis=1) - nums_filled = ak.fill_none(nums, 0) - data = ak.where(nums_filled == 0, [[None]], self._data) - flat = ak.flatten(data) - arr = type(self)(flat) - return arr, ak.num(data, axis=1) - - @property - def dtype(self) -> AwkwardDtype: - return self._dtype - - @property - def nbytes(self) -> int: - return self._data.layout.nbytes - - def isna(self): - return np.array(ak.is_none(self._data)) - - def take(self, indices, *, allow_fill=False, fill_value=None): - return self[indices] - - def copy(self): - return type(self)(ak.copy(self._data)) - - @classmethod - def _concat_same_type(cls, to_concat): - return cls(ak.concatenate([a._data for a in to_concat])) - - @property - def ndim(self) -> Literal[1]: - return 1 - - @property - def shape(self) -> tuple[int]: - return (len(self._data),) - - def __array__(self, dtype: DTypeLike | None = None) -> NDArray: - dtype = np.dtype(object) if dtype is None else np.dtype(dtype) - - if dtype == np.dtype("O"): - return np.asarray(self._data.tolist(), dtype=dtype) - - return np.asarray(self._data, dtype=dtype) - - def __arrow_array__(self, type=None): - import pyarrow as pa - - return pa.chunked_array(ak.to_arrow(self._data), type=type) - - def tolist(self) -> list: - return self._data.tolist() - - def __array_ufunc__(self, *inputs, **kwargs): - return type(self)(self._data.__array_ufunc__(*inputs, **kwargs)) - - def max(self, **kwargs): - return ak.max(self._data, **kwargs) - - def min(self, **kwargs): - return ak.min(self._data, **kwargs) - - def mean(self, **kwargs): - return ak.mean(self._data, **kwargs) - - def std(self, **kwargs): - return ak.std(self._data, **kwargs) - - def sum(self, axis=None, **kwargs): - return ak.sum(self._data, axis=axis, **kwargs) - - -AwkwardExtensionArray._add_arithmetic_ops() -AwkwardExtensionArray._add_comparison_ops() diff --git a/src/awkward_pandas/dask.py b/src/awkward_pandas/dask.py new file mode 100644 index 0000000..7883108 --- /dev/null +++ b/src/awkward_pandas/dask.py @@ -0,0 +1,41 @@ +import functools + +import awkward as ak +import dask.dataframe as dd +from dask.dataframe.extensions import ( + register_dataframe_accessor, + register_series_accessor, +) + +from awkward_pandas.mixin import Accessor as AkAccessor + + +class DaskAwkwardAccessor(AkAccessor): + series_type = dd.Series + dataframe_type = dd.DataFrame + aggregations = ( + False # you need dask-awkward for that, which we could optionally do here + ) + + def __getattr__(self, item): + if item not in dir(self): + raise AttributeError + func = getattr(ak, item, None) + + if func: + orig = self._obj.head() + + @functools.wraps(func) + def f(*others, **kwargs): + def func2(data): + return getattr(data.ak, item)(*others, **kwargs) + + return self._obj.map_partitions(func2, meta=func(orig)) + + else: + raise AttributeError(item) + return f + + +register_series_accessor("ak")(DaskAwkwardAccessor) +register_dataframe_accessor("ak")(DaskAwkwardAccessor) diff --git a/src/awkward_pandas/dask_connect.py b/src/awkward_pandas/dask_connect.py deleted file mode 100644 index 9fc561d..0000000 --- a/src/awkward_pandas/dask_connect.py +++ /dev/null @@ -1,53 +0,0 @@ -import pandas as pd - -from awkward_pandas.dtype import AwkwardDtype - -s = pd.Series(["hello"], dtype="awkward") - - -def register_dtype(): - from dask.dataframe.extensions import make_array_nonempty - - data = [[1], [1, 2, None]] - - @make_array_nonempty.register(AwkwardDtype) - def _(x): - return pd.Series(data, dtype="awkward") - - -try: - from dask_expr._accessor import Accessor -except (ImportError, ModuleNotFoundError): - try: - from dask.dataframe.accessor import Accessor - except (ImportError, ModuleNotFoundError): - Accessor = object - - -class DaskAwkwardAccessor(Accessor): - _accessor_name = "ak" - - _accessor_methods = dir(s.ak) - - _accessor_properties = () - - # TODO: dask-awkward could take over here - for method in _accessor_methods: - - def _(self, *args, method=method, **kwargs): - def __(s, method=method): - return s.ak.__getattr__(method)(*args, **kwargs) - - return self._series.map_partitions(__) - - locals()[method] = _ - - -try: - import dask - from dask.dataframe.extensions import register_series_accessor - - register_dtype() - register_series_accessor("ak")(DaskAwkwardAccessor) -except (ImportError, ModuleNotFoundError): - dask = False diff --git a/src/awkward_pandas/datetimes.py b/src/awkward_pandas/datetimes.py index 4528423..ed6b382 100644 --- a/src/awkward_pandas/datetimes.py +++ b/src/awkward_pandas/datetimes.py @@ -1,22 +1,13 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any - import awkward as ak -import pandas as pd import pyarrow as pa import pyarrow.compute as pc -from awkward_pandas.array import AwkwardExtensionArray - -if TYPE_CHECKING: - from awkward_pandas.accessor import AwkwardAccessor - class DatetimeAccessor: - def __init__(self, ak_accessor: AwkwardAccessor) -> None: - self.ak_accessor = ak_accessor - self.index = ak_accessor._obj.index + def __init__(self, accessor) -> None: + self.accessor = accessor def cast(self, target_type=None, safe=None, options=None): raise NotImplementedError("TODO") @@ -196,18 +187,17 @@ def month_interval_between(self, end): raise NotImplementedError("TODO") def nanoseconds_between(self, end): - arr, args, kwargs = _arrowize(self, end) - return _as_series( - pc.nanoseconds_between(arr, *args, **kwargs), - index=self.index, + return self.accessor.to_output( + pc.nanoseconds_between(self.accessor.arrow, end.ak.arrow), ) def quarters_between(self, end): raise NotImplementedError("TODO") def seconds_between(self, end): - arr, args, kwargs = _arrowize(self, end) - return _as_series(pc.seconds_between(arr, *args, **kwargs)) + return self.accessor.to_output( + pc.seconds_between(self.accessor.arrow, end.ak.arrow) + ) def weeks_between( self, @@ -221,8 +211,9 @@ def weeks_between( raise NotImplementedError("TODO") def years_between(self, end): - arr, args, kwargs = _arrowize(self, end) - return _as_series(pc.years_between(arr, *args, **kwargs)) + return self.accessor.to_output( + pc.years_between(self.accessor.arrow, end.ak.arrow) + ) def _to_arrow(array): @@ -233,76 +224,3 @@ def _to_arrow(array): def _make_unit_compatible(array): # TODO, actually convert units if not compatible return array - - -def _arrowize( - dta: DatetimeAccessor, - *args: Any, - **kwargs: Any, -) -> tuple[Any, tuple[Any, ...], dict[str, Any]]: - """Convert objects to arrow arrays. - - Parameters - ---------- - dta : DatetimeAccessor - The DatetimeAccessor with information about the main Series - object that is of dtype :obj:`~awkward_pandas.AwkwardDtype`. - *args : Any - Arguments that should be converted to arrow if necessary. Any - arguments that are Series backed by the - :obj:`~awkward_pandas.AwkwardDtype` will have the underlying - awkward array converted to an arrow array. - **kwargs : Any - Keyword arguments that should be converted to arrow if - necessary. Any values that are Series backed by the - :obj:`~awkward_pandas.AwkwardDtype` will have the underlying - awkward array converted to an arrow array. - - Returns - ------- - Array - Primary awkward Series converted to arrow. - tuple - New arguments with necessary conversions. - dict - New keyword arguments with necessary conversions. - - """ - primary_as_arrow = _to_arrow(dta.ak_accessor.array) - - # parse args so that other series backed by awkward are - # converted to arrow array objects. - new_args = [] - for arg in args: - if isinstance(arg, pd.Series) and arg.dtype == "awkward": - new_args.append(_to_arrow(arg.ak.array)) - else: - new_args.append(arg) - - # parse kwargs so that other series backed by awkward are - # converted to arrow array objects. - new_kwargs = {} - for k, v in kwargs.items(): - if isinstance(v, pd.Series) and v.dtype == "awkward": - new_kwargs[k] = _to_arrow(v.ak.array) - else: - new_kwargs[k] = v - - return primary_as_arrow, tuple(new_args), new_kwargs - - -def _as_series(pyarrow_result, index): - """Convert pyarrow Array back in to awkward Series. - - Parameters - ---------- - pyarrow_result : pyarrow Array - PyArray array that was the result of a pyarrow.compute call. - - Examples - -------- - pd.Series - Series of type :obj:`~awkward_pandas.AwkwardDtype`. - - """ - return pd.Series(AwkwardExtensionArray(ak.from_arrow(pyarrow_result)), index=index) diff --git a/src/awkward_pandas/dtype.py b/src/awkward_pandas/dtype.py deleted file mode 100644 index 5b97a04..0000000 --- a/src/awkward_pandas/dtype.py +++ /dev/null @@ -1,77 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -import awkward as ak -import numpy as np -from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype - -if TYPE_CHECKING: - from awkward_pandas.array import AwkwardExtensionArray - - -@register_extension_dtype -class AwkwardDtype(ExtensionDtype): - @property - def name(self) -> str: - return "awkward" - - @property - def type(self) -> type[ak.Array]: - return ak.Array - - @property - def kind(self) -> str: - return "O" - - @property - def na_value(self) -> object: - return np.nan - - @property - def _is_numeric(self) -> bool: - return True - - @property - def _is_boolean(self) -> bool: - return True - - @classmethod - def construct_from_string(cls, string: str) -> AwkwardDtype: - """Construct an instance from a string. - - Parameters - ---------- - string : str - Should be "awkward". - - Returns - ------- - AwkwardDtype - Instance of the dtype. - - """ - - if not isinstance(string, str): - raise TypeError( - f"'construct_from_string' expects a string, got {type(string)}" - ) - - if string == cls().name: - return cls() - else: - raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") - - @classmethod - def construct_array_type(cls) -> type[AwkwardExtensionArray]: # type: ignore[valid-type] - from awkward_pandas.array import AwkwardExtensionArray - - return AwkwardExtensionArray - - def __from_arrow__(self, data: Any) -> AwkwardExtensionArray: - from awkward_pandas.array import AwkwardExtensionArray - - return AwkwardExtensionArray(ak.from_arrow(data)) - - def __repr__(self) -> str: - return "" diff --git a/src/awkward_pandas/io.py b/src/awkward_pandas/io.py index 1a4cca3..0a36977 100644 --- a/src/awkward_pandas/io.py +++ b/src/awkward_pandas/io.py @@ -1,39 +1,8 @@ from __future__ import annotations import awkward as ak -import pandas as pd -from awkward_pandas.array import AwkwardExtensionArray - - -def from_awkward(array: ak.Array, name: str | None = None) -> pd.Series: - """Wrap an awkward Array in a pandas Series. - - Parameters - ---------- - array : ak.Array - Awkward array to wrap. - name : str, optional - Name for the series. - - Returns - ------- - pandas.Series - Resulting Series with dtype AwkwardDtype - - Examples - -------- - >>> import awkward as ak - >>> import awkward_pandas as akpd - >>> a = ak.from_iter([[1, 2, 3], [4, 5], [6]]) - >>> s = akpd.from_awkward(a, name="my-array") - 0 [1, 2, 3] - 1 [4, 5] - 2 [6] - Name: my-array, dtype: awkward - - """ - return pd.Series(AwkwardExtensionArray(array), name=name) +import awkward_pandas.pandas def read_parquet( @@ -45,9 +14,9 @@ def read_parquet( ): """Read a Parquet dataset with nested data into a Series or DataFrame.""" ds = ak.from_parquet(url, **kwargs) - s = from_awkward(ds, name=root_name) + s = awkward_pandas.pandas.Accessor.to_output(None, ds) if extract: - return s.ak.to_columns(cull=True, extract_all=extract_all) + return s.ak.unmerge() return s @@ -64,7 +33,7 @@ def read_json( line_delimited=True, **kwargs, ) - s = from_awkward(ds, name=root_name) + s = awkward_pandas.pandas.Accessor.to_output(None, ds) if extract: - return s.ak.to_columns(cull=True, extract_all=extract_all) + return s.ak.unmerge() return s diff --git a/src/awkward_pandas/lib.py b/src/awkward_pandas/lib.py deleted file mode 100644 index 43a9535..0000000 --- a/src/awkward_pandas/lib.py +++ /dev/null @@ -1,37 +0,0 @@ -from __future__ import annotations - -import awkward as ak -import numpy as np -import pandas as pd - -from awkward_pandas.io import from_awkward - - -def merge(dataframe: pd.DataFrame, name: str | None = None) -> pd.Series: - """Create a single awkward series by merging the columns of a dataframe. - - Parameters - ---------- - dataframe: pd.DataFrame - Containing columns of simple numpy type, object type (e.g., - srtings, lists or dicts) or existing awkward columns. - name: str or None - Name of the output series. - - Returns - ------- - pd.Series - Resuling Series with dtype AwkwardDtype - - """ - out = {} - for c in dataframe.columns: - if dataframe[c].dtype == "awkward": - out[c] = dataframe[c].values._data - elif dataframe[c].dtype == "string[pyarrow]": - out[c] = ak.from_arrow(dataframe[c].values._pa_array) - elif dataframe[c].dtype == np.dtype("O"): - out[c] = ak.from_iter(dataframe[c]) - else: - out[c] = dataframe[c].values - return from_awkward(ak.Array(out), name=name) diff --git a/src/awkward_pandas/mixin.py b/src/awkward_pandas/mixin.py index a27917a..6c1f46a 100644 --- a/src/awkward_pandas/mixin.py +++ b/src/awkward_pandas/mixin.py @@ -6,7 +6,7 @@ methods = [ _ for _ in (dir(ak)) if not _.startswith(("_", "ak_")) and not _[0].isupper() -] + ["apply", "array", "explode"] +] + ["apply", "array", "explode", "dt", "str"] df_methods = sorted(methods + ["merge"]) series_methods = sorted(methods + ["unmerge"]) @@ -129,12 +129,15 @@ def _add_all(cls): class Accessor(ArithmeticMixin): - def __init__(self, obj): - self._obj = obj + """Bring the awkward API to dataframes and series""" + aggregations = True # False means data is partitioned series_type = () dataframe_type = () + def __init__(self, obj): + self._obj = obj + @classmethod def is_series(cls, data): return isinstance(data, cls.series_type) @@ -143,8 +146,10 @@ def is_series(cls, data): def is_dataframe(cls, data): return isinstance(data, cls.dataframe_type) - @classmethod - def to_output(cls, data): + def to_output(self, data): + # this is not a classmethod, so that pandas and cudf can apply index + # to output + # rename from_awkward? raise NotImplementedError def apply(self, fn: Callable): @@ -166,10 +171,26 @@ def __array_ufunc__(self, *args, **kwargs): return args[0](self.array, *args[3:], **kwargs) raise NotImplementedError + @property + def arrow(self): + raise NotImplementedError + @property def array(self) -> ak.Array: """Data as an awkward array""" - raise NotImplementedError + return ak.from_arrow(self.arrow) + + @property + def str(self): + from awkward_pandas.strings import StringAccessor + + return StringAccessor(self) + + @property + def dt(self): + from awkward_pandas.datetimes import DatetimeAccessor + + return DatetimeAccessor(self) def merge(self): if not self.is_dataframe(self._obj): @@ -224,3 +245,7 @@ def f(*others, **kwargs): else: raise AttributeError(item) return f + + def __init_subclass__(cls, **kwargs): + # auto add methods to all derivative classes + cls._add_all() diff --git a/src/awkward_pandas/pandas.py b/src/awkward_pandas/pandas.py new file mode 100644 index 0000000..daa245c --- /dev/null +++ b/src/awkward_pandas/pandas.py @@ -0,0 +1,23 @@ +import awkward as ak +import pandas as pd +import pyarrow as pa + +from awkward_pandas.mixin import Accessor + + +@pd.api.extensions.register_series_accessor("ak") +@pd.api.extensions.register_dataframe_accessor("ak") +class AwkwardAccessor(Accessor): + @property + def arrow(self): + return pa.array(self._obj) + + def to_output(self, data): + return pd.Series( + pd.arrays.ArrowExtensionArray(ak.to_arrow(data, extensionarray=False)) + ) + + @staticmethod + def _validate(_): + # required by pandas + return True diff --git a/src/awkward_pandas/polars.py b/src/awkward_pandas/polars.py index acb448a..74d555c 100644 --- a/src/awkward_pandas/polars.py +++ b/src/awkward_pandas/polars.py @@ -6,18 +6,14 @@ @pl.api.register_series_namespace("ak") @pl.api.register_dataframe_namespace("ak") -class AwkwardOperations(Accessor): +class PolarsAwkwardAccessor(Accessor): series_type = pl.Series dataframe_type = pl.DataFrame - @classmethod - def to_output(cls, arr: ak.Array) -> pl.DataFrame | pl.Series: + def to_output(self, arr: ak.Array) -> pl.DataFrame | pl.Series: # Series Vs DataFrame? return pl.from_arrow(ak.to_arrow(arr, extensionarray=False)) @property - def array(self) -> ak.Array: - return ak.from_arrow(self._obj.to_arrow()) - - -AwkwardOperations._add_all() + def arrow(self) -> ak.Array: + return self._obj.to_arrow() diff --git a/src/awkward_pandas/strings.py b/src/awkward_pandas/strings.py index 918cb30..e24cf8c 100644 --- a/src/awkward_pandas/strings.py +++ b/src/awkward_pandas/strings.py @@ -4,9 +4,6 @@ from collections.abc import Callable import awkward as ak -import pandas as pd - -from awkward_pandas.array import AwkwardExtensionArray def _encode(layout): @@ -60,19 +57,24 @@ def decode(arr: ak.Array, encoding: str = "utf-8") -> ak.Array: "isupper": "is_upper", "startswith": "starts_with", } +methods = [ + aname + for aname in (dir(ak.str)) + if not aname.startswith(("_", "akstr_")) and not aname[0].isupper() +] class StringAccessor: def __init__(self, accessor): self.accessor = accessor - def encode(self, encoding: str = "utf-8") -> pd.Series: - """Encode Series of strings to Series of bytes.""" - return pd.Series(AwkwardExtensionArray(encode(self.accessor.array))) + def encode(self, encoding: str = "utf-8"): + """Encode Series of strings to Series of bytes. Leaves non-strings alone.""" + return self.accessor.to_output(encode(self.accessor.array, encoding=encoding)) - def decode(self, encoding: str = "utf-8") -> pd.Series: - """Decode Series of bytes to Series of strings.""" - return pd.Series(AwkwardExtensionArray(decode(self.accessor.array))) + def decode(self, encoding: str = "utf-8"): + """Decode Series of bytes to Series of strings. Leaves non-bytestrings alone.""" + return self.accessor.to_output(decode(self.accessor.array, encoding=encoding)) @staticmethod def method_name(attr: str) -> str: @@ -85,16 +87,12 @@ def __getattr__(self, attr: str) -> Callable: @functools.wraps(fn) def f(*args, **kwargs): arr = fn(self.accessor.array, *args, **kwargs) - idx = self.accessor._obj.index + # idx = self.accessor._obj.index if isinstance(arr, ak.Array): - return pd.Series(AwkwardExtensionArray(arr), index=idx) + return self.accessor.to_output(arr) return arr return f def __dir__(self) -> list[str]: - return [ - aname - for aname in (dir(ak.str)) - if not aname.startswith(("_", "akstr_")) and not aname[0].isupper() - ] + return sorted(methods) diff --git a/tests/conftest.py b/tests/conftest.py index 518b1b3..d3eb6de 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,18 +3,8 @@ import numpy as np import pandas as pd import pytest - -# from pandas.conftest import * from pandas.tests.extension.conftest import * # noqa -from awkward_pandas import AwkwardDtype - - -@pytest.fixture -def dtype(): - """Fixture overriding function in pandas/tests/extension/conftest.py""" - return AwkwardDtype() - @pytest.fixture def data(dtype): diff --git a/tests/test_accessor.py b/tests/test_accessor.py index 191b949..3222620 100644 --- a/tests/test_accessor.py +++ b/tests/test_accessor.py @@ -1,77 +1,29 @@ import awkward as ak import pandas as pd -import pytest - -import awkward_pandas def test_len(): - s = pd.Series(awkward_pandas.AwkwardExtensionArray([[6, 2, 3], [4, 5]])) + s = pd.Series([[6, 2, 3], [4, 5]]) assert s.ak.count() == 5 s2 = s.ak.count(axis=1) assert s2.tolist() == [3, 2] -def test_no_access(): - s = pd.Series([1, 2]) - with pytest.raises(Exception): - s.ak.count() - - def test_getitem(): - s = pd.Series(awkward_pandas.AwkwardExtensionArray([[6, 2, 3], [4, 5]])) + s = pd.Series([[6, 2, 3], [4, 5]]) s2 = s.ak[:, :1] assert isinstance(s2, pd.Series) - assert s2.dtype == "awkward" assert s2.tolist() == [[6], [4]] -def test_to_column_ints(): - s = pd.Series(awkward_pandas.AwkwardExtensionArray([6, 2, 3]), name="test") - s2 = s.ak.to_column() - assert s2.dtype == "int64" - - -def test_to_column_strings(): - pytest.importorskip("pyarrow") - s = pd.Series(awkward_pandas.AwkwardExtensionArray(["6", "2", "3"]), name="test") - s2 = s.ak.to_column() - assert s2.dtype == "string[pyarrow]" - - s = pd.Series(awkward_pandas.AwkwardExtensionArray([["6", "2", "3"]]), name="test") - with pytest.raises(ValueError): - s.ak.to_column() - - -def test_to_columns(): - pytest.importorskip("pyarrow") - s = pd.Series( - awkward_pandas.AwkwardExtensionArray( - {"num": [6, 2, 3], "deep": [[0], [], None], "text": ["hi", "ho", "hum"]} - ), - name="test", - ) - df = s.ak.to_columns() - assert df.columns.tolist() == ["num", "text", "test"] - assert df.num.tolist() == [6, 2, 3] - assert df.test.tolist() == [{"deep": [0]}, {"deep": []}, {"deep": None}] - assert df.text.tolist() == ["hi", "ho", "hum"] - - df = s.ak.to_columns(cull=False) - assert df.columns.tolist() == ["num", "text", "test"] - assert df.num.tolist() == [6, 2, 3] - assert df.test[0].tolist() == {"num": 6, "deep": [0], "text": "hi"} - assert df.text.tolist() == ["hi", "ho", "hum"] - - def test_apply(): - s = pd.Series(awkward_pandas.AwkwardExtensionArray([[6, 2, 3], [4]]), name="test") + s = pd.Series([[6, 2, 3], [4]], name="test") applied = s.ak.apply(lambda x: ak.num(x)) - assert applied.values._data.tolist() == ak.num(s.values._data).tolist() + assert applied.tolist() == ak.num(s.ak.array).tolist() def test_dir(): - s = pd.Series(awkward_pandas.AwkwardExtensionArray([6, 2, 3]), name="test") + s = pd.Series([6, 2, 3], name="test") assert "sum" in dir(s.ak) assert "Array" not in dir(s.ak) assert "ak_num" not in dir(s.ak) @@ -79,7 +31,8 @@ def test_dir(): def test_array_property(): - a = ak.from_iter([[1, 2, 3], [4, 5], [6]]) - s = pd.Series(awkward_pandas.AwkwardExtensionArray(a)) + a = [[1, 2, 3], [4, 5], [6]] + s = pd.Series(a) # ensure that the array associated with the accessor is the same as the original - assert s.ak.array is a + assert isinstance(s.ak.array, ak.Array) + assert a == s.ak.array.tolist() diff --git a/tests/test_array.py b/tests/test_array.py deleted file mode 100644 index 087a184..0000000 --- a/tests/test_array.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import annotations - -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal - -from awkward_pandas import AwkwardExtensionArray, merge - - -def test_merge_no_ak(): - pytest.importorskip("pyarrow") - df = pd.DataFrame( - { - "a": [1, 2, 3], - "b": ["hay", "ho", "hi"], - "c": pd.Series(["hay", "ho", "hi"], dtype="string[pyarrow]"), - "d": [[1, 2, 3], None, []], - } - ) - s = merge(df, name="test") - assert s.name == "test" - assert s.dtype == "awkward" - assert len(s) == 3 - arr = s.values._data - assert arr.fields == ["a", "b", "c", "d"] - assert arr["a"].tolist() == [1, 2, 3] - assert arr["b"].tolist() == ["hay", "ho", "hi"] - assert arr["c"].tolist() == ["hay", "ho", "hi"] - assert arr["d"].tolist() == [[1, 2, 3], None, []] - - -def test_merge_one_ak(): - df = pd.DataFrame({"a": [1, 2, 3]}) - df["b"] = pd.Series(AwkwardExtensionArray([[1, 2, 3], [5], [6, 7]])) - s = merge(df, name="test") - assert s.name == "test" - assert s.dtype == "awkward" - assert len(s) == 3 - arr = s.values._data - assert arr.fields == ["a", "b"] - assert arr["b"].tolist() == [[1, 2, 3], [5], [6, 7]] - - -def test_parquet_roundtrip(tmp_path): - pytest.importorskip("pyarrow") - df = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": pd.Series(AwkwardExtensionArray([[1, 2, 3], [5], [6, 7], [], None])), - } - ) - - assert df["b"].dtype == "awkward" - - path = tmp_path / "output.parquet" - df.to_parquet(path, engine="pyarrow") - result = pd.read_parquet(path) - - assert_frame_equal(df, result) diff --git a/tests/test_base.py b/tests/test_base.py deleted file mode 100644 index 6a3240c..0000000 --- a/tests/test_base.py +++ /dev/null @@ -1,29 +0,0 @@ -from __future__ import annotations - -import awkward as ak -import pandas as pd -import pytest - -import awkward_pandas - - -def test_select(): - s = pd.Series(awkward_pandas.AwkwardExtensionArray([[6, 2, 3], [4, 5]])) - s2 = s[0] - assert isinstance(s2, ak.Array) - assert s2.tolist() == [6, 2, 3] - - s2 = s[0:1] - assert s2.dtype == "awkward" - assert isinstance(s2.values, awkward_pandas.AwkwardExtensionArray) - assert s2.tolist() == [[6, 2, 3]] - - -@pytest.mark.xfail(reason='numpy dtype("O") comparison giving issues') -def test_astype_to_ak(): - s = pd.Series([[6, 2, 3], [4, 5]], dtype=object) - s2 = s.astype("awkward") - assert s2.dtype == "awkward" - assert s2.tolist() == [[6, 2, 3], [4, 5]] - assert (s2 == s).tolist() == [[True, True, True], [True, True]] - assert (s2 == s).all() diff --git a/tests/test_dask.py b/tests/test_dask.py index ee36a96..d42ed44 100644 --- a/tests/test_dask.py +++ b/tests/test_dask.py @@ -1,5 +1,5 @@ -import awkward as ak import pandas as pd +import pyarrow as pa import pytest import awkward_pandas # noqa @@ -8,22 +8,23 @@ def test_simple_map(): - data = [[0], [0, 1]] * 2 - s = pd.Series(data, dtype="awkward") + data = pd.arrays.ArrowExtensionArray(pa.array([[0], [0, 1]] * 2)) + s = pd.Series(data) df = pd.DataFrame({"s": s}) ddf = dd.from_pandas(df, 2) - out = ddf.s.map(ak.count) + out = ddf.s.ak.count(axis=0) assert out.dtype == "int64" - assert out.compute().tolist() == [1, 2, 1, 2] + result = out.compute() + assert result.loc[0].tolist() == [2, 2] + assert result.loc[1].tolist() == [1, 1] - out = ddf + 1 - assert out.s.dtype == "awkward" + out = ddf.ak + 1 assert out.compute().s.tolist() == [[1], [1, 2]] * 2 def test_accessor(): - data = [[0], [0, 1]] * 2 - s = pd.Series(data, dtype="awkward") + data = pd.arrays.ArrowExtensionArray(pa.array([[0], [0, 1]] * 2)) + s = pd.Series(data) df = pd.DataFrame({"s": s}) ddf = dd.from_pandas(df, 2) out = ddf.s.ak.count() @@ -36,8 +37,8 @@ def test_accessor(): def test_distributed(): distributed = pytest.importorskip("distributed") with distributed.Client(n_workers=1, threads_per_worker=1): - data = [[0], [0, 1]] * 2 - s = pd.Series(data, dtype="awkward") + data = pd.arrays.ArrowExtensionArray(pa.array([[0], [0, 1]] * 2)) + s = pd.Series(data) df = pd.DataFrame({"s": s}) ddf = dd.from_pandas(df, 2) out = ddf.s.ak.count() diff --git a/tests/test_str.py b/tests/test_str.py index 9ce6ab4..9d50f28 100644 --- a/tests/test_str.py +++ b/tests/test_str.py @@ -1,15 +1,11 @@ -from __future__ import annotations - import pandas as pd import pytest -pytest.importorskip("pyarrow") - @pytest.mark.parametrize("binary", [True, False]) @pytest.mark.parametrize("method", ["upper", "capitalize", "isalpha"]) def test_unary_methods(method, binary): - s = pd.Series(["hello world", "oi"], dtype="awkward") + s = pd.Series(["hello world", "oi"]) if binary: s = s.ak.str.encode() out = getattr(s.ak.str, method)() @@ -18,7 +14,7 @@ def test_unary_methods(method, binary): def test_with_argument(): - s = pd.Series(["hello world", "oi"], dtype="awkward") + s = pd.Series(["hello world", "oi"]) out1 = s.ak.str.starts_with("hello") out2 = s.ak.str.startswith("hello") expected = [_.startswith("hello") for _ in s.tolist()] @@ -27,7 +23,7 @@ def test_with_argument(): def test_encode_decode(): - s = pd.Series(["hello world", "oi"], dtype="awkward") + s = pd.Series(["hello world", "oi"]) s2 = s.ak.str.encode() assert s2.tolist() == [_.encode() for _ in s.tolist()] s3 = s2.ak.str.decode() @@ -35,12 +31,12 @@ def test_encode_decode(): def test_split(): - s = pd.Series(["hello world", "oio", ""], dtype="awkward") + s = pd.Series(["hello world", "oio", ""]) s2 = s.ak.str.split_whitespace() assert s2.tolist() == [["hello", "world"], ["oio"], [""]] s2 = s.ak.str.split_pattern("i") assert s2.tolist() == [["hello world"], ["o", "o"], [""]] - s = pd.Series([b"hello world", b"oio", b""], dtype="awkward") + s = pd.Series([b"hello world", b"oio", b""]) s2 = s.ak.str.split_whitespace() assert s2.tolist() == [[b"hello", b"world"], [b"oio"], [b""]] diff --git a/tests/test_upstream_extension_tests.py b/tests/test_upstream_extension_tests.py deleted file mode 100644 index 9d15b5c..0000000 --- a/tests/test_upstream_extension_tests.py +++ /dev/null @@ -1,153 +0,0 @@ -from __future__ import annotations - -import pandas as pd -import pandas._testing as tm -from pandas.tests.extension.base import BaseConstructorsTests, BaseDtypeTests -from pandas.tests.extension.base.casting import BaseCastingTests # noqa -from pandas.tests.extension.base.dim2 import Dim2CompatTests # noqa -from pandas.tests.extension.base.dim2 import NDArrayBacked2DTests # noqa -from pandas.tests.extension.base.getitem import BaseGetitemTests # noqa -from pandas.tests.extension.base.groupby import BaseGroupbyTests # noqa -from pandas.tests.extension.base.index import BaseIndexTests # noqa -from pandas.tests.extension.base.interface import BaseInterfaceTests # noqa -from pandas.tests.extension.base.io import BaseParsingTests # noqa -from pandas.tests.extension.base.methods import BaseMethodsTests # noqa -from pandas.tests.extension.base.missing import BaseMissingTests # noqa -from pandas.tests.extension.base.ops import BaseArithmeticOpsTests # noqa -from pandas.tests.extension.base.ops import BaseComparisonOpsTests # noqa -from pandas.tests.extension.base.ops import BaseOpsUtil # noqa -from pandas.tests.extension.base.ops import BaseUnaryOpsTests # noqa -from pandas.tests.extension.base.printing import BasePrintingTests # noqa -from pandas.tests.extension.base.reduce import BaseBooleanReduceTests # noqa -from pandas.tests.extension.base.reduce import BaseNoReduceTests # noqa -from pandas.tests.extension.base.reduce import BaseNumericReduceTests # noqa -from pandas.tests.extension.base.reshaping import BaseReshapingTests # noqa -from pandas.tests.extension.base.setitem import BaseSetitemTests # noqa - -import awkward_pandas - - -def test_version(): - assert awkward_pandas.__version__ - - -class TestAwkwardDtype(BaseDtypeTests): - pass - - -class TestAwkwardConstructors(BaseConstructorsTests): - def test_series_constructor_scalar_with_index(self, data, dtype): - assert True - - # Overridden because pd.DataFrame(list(AwkwardExtensionArray)) - # won't work. - def test_from_dtype(self, data): - # construct from our dtype & string dtype - dtype = data.dtype - - expected = pd.Series(data) - result = pd.Series(list(data), dtype=dtype) - tm.assert_series_equal(result, expected) - - result = pd.Series(list(data), dtype=str(dtype)) - tm.assert_series_equal(result, expected) - - # this is the test that breaks the upstream version - # expected = pd.DataFrame(data).astype(dtype) - # result = pd.DataFrame(list(data), dtype=dtype) - # tm.assert_frame_equal(result, expected) - - -# class TestAwkwardBaseCastingTests(BaseCastingTests): - -# # Overridden because list(AwkwardExtensionArray) will contain -# # ak.Array as elements, not python objects. -# def test_tolist(self, data): -# result = pd.Series(data).tolist() -# expected = data.tolist() -# assert result == expected - -# result = list(pd.Series(data)) -# expected = list(data) -# for res, exp in zip(result, expected): -# assert ak.all(res == exp) - - -# class TestAwkwardBaseGetitemTests(BaseGetitemTests): -# pass - - -# class TestAwkwardBaseGroupbyTests(BaseGroupbyTests): -# pass - - -# class TestAwkwardBaseIndexTests(BaseIndexTests): -# pass - - -# class TestAwkwardBaseInterfaceTests(BaseInterfaceTests): -# pass - - -# class TestAwkwardDim2CompatTests(Dim2CompatTests): -# pass - - -# # Not compatible with awkward array -# # class TestAwkwardNDArrayBacked2DTests(NDArrayBacked2DTests): -# # pass - - -# class TestAwkwardBaseParsingTests(BaseParsingTests): -# pass - - -# class TestAwkwardBaseMethodsTests(BaseMethodsTests): -# pass - - -# class TestAwkwardBaseMissingTests(BaseMissingTests): -# pass - - -# class TestAwkwardBaseArithmeticOpsTests(BaseArithmeticOpsTests): -# pass - - -# class TestAwkwardBaseComparisonOpsTests(BaseComparisonOpsTests): -# pass - - -# class TestAwkwardBaseOpsUtil(BaseOpsUtil): -# pass - - -# class TestAwkwardBaseUnaryOpsTests(BaseUnaryOpsTests): -# pass - - -# class TestAwkwardBasePrintingTests(BasePrintingTests): -# pass - - -# class TestAwkwardBaseBooleanReduceTests(BaseBooleanReduceTests): -# pass - - -# class TestAwkwardBaseNoReduceTests(BaseNoReduceTests): -# pass - - -# class TestAwkwardBaseNumericReduceTests(BaseNumericReduceTests): -# pass - - -# class TestAwkwardBaseReshapingTests(BaseReshapingTests): -# def test_ravel(self, data): -# result = data.ravel() -# assert type(result) == type(data) -# result._data is data._data - - -# # class TestAwkwardBaseSetitemTests(BaseSetitemTests): -# # pass From 04fffc1364b6ff074bef52f47ec1a4f8ac89a045 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 29 Apr 2024 11:18:21 -0400 Subject: [PATCH 17/19] fix --- src/awkward_pandas/dask.py | 26 +++++++++++++++++++--- src/awkward_pandas/mixin.py | 19 +++++++++++----- src/awkward_pandas/pandas.py | 24 +++++++++++++++----- src/awkward_pandas/polars.py | 10 ++++----- tests/test_dask.py | 10 ++++----- tests/{test_accessor.py => test_pandas.py} | 12 ++++++++++ 6 files changed, 77 insertions(+), 24 deletions(-) rename tests/{test_accessor.py => test_pandas.py} (75%) diff --git a/src/awkward_pandas/dask.py b/src/awkward_pandas/dask.py index 7883108..23ad843 100644 --- a/src/awkward_pandas/dask.py +++ b/src/awkward_pandas/dask.py @@ -8,14 +8,31 @@ ) from awkward_pandas.mixin import Accessor as AkAccessor +from awkward_pandas.pandas import PandasAwkwardAccessor class DaskAwkwardAccessor(AkAccessor): series_type = dd.Series dataframe_type = dd.DataFrame - aggregations = ( - False # you need dask-awkward for that, which we could optionally do here - ) + aggregations = False # you need dask-awkward for that + + @classmethod + def _create_op(cls, op): + def run(self, *args, **kwargs): + orig = self._obj.head() + ar = (ar.head() if hasattr(ar, "ak") else ar for ar in args) + meta = PandasAwkwardAccessor._to_output(op(orig.ak.array, *ar, **kwargs)) + + def inner(data): + import awkward_pandas.pandas # noqa: F401 + + ar2 = (ar.ak.array if hasattr(ar, "ak") else ar for ar in args) + out = op(data.ak.array, *ar2, **kwargs) + return PandasAwkwardAccessor._to_output(out) + + return self._obj.map_partitions(inner, meta=meta) + + return run def __getattr__(self, item): if item not in dir(self): @@ -28,6 +45,9 @@ def __getattr__(self, item): @functools.wraps(func) def f(*others, **kwargs): def func2(data): + import awkward_pandas.pandas # noqa: F401 + + # data and others are pandas objects here return getattr(data.ak, item)(*others, **kwargs) return self._obj.map_partitions(func2, meta=func(orig)) diff --git a/src/awkward_pandas/mixin.py b/src/awkward_pandas/mixin.py index 6c1f46a..5c64074 100644 --- a/src/awkward_pandas/mixin.py +++ b/src/awkward_pandas/mixin.py @@ -146,12 +146,13 @@ def is_series(cls, data): def is_dataframe(cls, data): return isinstance(data, cls.dataframe_type) - def to_output(self, data): - # this is not a classmethod, so that pandas and cudf can apply index - # to output - # rename from_awkward? + @classmethod + def _to_output(cls, data): raise NotImplementedError + def to_output(self, data): + return self._to_output(data) + def apply(self, fn: Callable): """Perform arbitrary function on all the values of the series""" return self.to_output(fn(self.array)) @@ -172,7 +173,11 @@ def __array_ufunc__(self, *args, **kwargs): raise NotImplementedError @property - def arrow(self): + def arrow(self) -> ak.Array: + return self.to_arrow(self._obj) + + @classmethod + def to_arrow(cls, data): raise NotImplementedError @property @@ -211,7 +216,9 @@ def unmerge(self): @classmethod def _create_op(cls, op): def run(self, *args, **kwargs): - return self.to_output(op(self.array, *args, **kwargs)) + ar2 = (ar.ak.array if hasattr(ar, "ak") else ar for ar in args) + ar3 = (ar.array if isinstance(ar, cls) else ar for ar in ar2) + return self.to_output(op(self.array, *ar3, **kwargs)) return run diff --git a/src/awkward_pandas/pandas.py b/src/awkward_pandas/pandas.py index daa245c..5965679 100644 --- a/src/awkward_pandas/pandas.py +++ b/src/awkward_pandas/pandas.py @@ -7,16 +7,30 @@ @pd.api.extensions.register_series_accessor("ak") @pd.api.extensions.register_dataframe_accessor("ak") -class AwkwardAccessor(Accessor): - @property - def arrow(self): - return pa.array(self._obj) +class PandasAwkwardAccessor(Accessor): + series_type = pd.Series + dataframe_type = pd.DataFrame - def to_output(self, data): + @classmethod + def to_arrow(cls, data): + if cls.is_series(data): + return pa.array(data) + return pa.table(data) + + @classmethod + def _to_output(cls, data): return pd.Series( pd.arrays.ArrowExtensionArray(ak.to_arrow(data, extensionarray=False)) ) + def to_output(self, data): + # override to apply index + arr = pd.arrays.ArrowExtensionArray(ak.to_arrow(data, extensionarray=False)) + if self._obj is not None and len(arr) == len(self._obj.index): + return pd.Series(arr, index=self._obj.index) + else: + return arr + @staticmethod def _validate(_): # required by pandas diff --git a/src/awkward_pandas/polars.py b/src/awkward_pandas/polars.py index 74d555c..249a8e3 100644 --- a/src/awkward_pandas/polars.py +++ b/src/awkward_pandas/polars.py @@ -10,10 +10,10 @@ class PolarsAwkwardAccessor(Accessor): series_type = pl.Series dataframe_type = pl.DataFrame - def to_output(self, arr: ak.Array) -> pl.DataFrame | pl.Series: - # Series Vs DataFrame? + @classmethod + def _to_output(cls, arr): return pl.from_arrow(ak.to_arrow(arr, extensionarray=False)) - @property - def arrow(self) -> ak.Array: - return self._obj.to_arrow() + @classmethod + def to_arrow(cls, data): + return data.to_arrow() diff --git a/tests/test_dask.py b/tests/test_dask.py index d42ed44..6e87c29 100644 --- a/tests/test_dask.py +++ b/tests/test_dask.py @@ -14,12 +14,11 @@ def test_simple_map(): ddf = dd.from_pandas(df, 2) out = ddf.s.ak.count(axis=0) assert out.dtype == "int64" - result = out.compute() - assert result.loc[0].tolist() == [2, 2] - assert result.loc[1].tolist() == [1, 1] + result = out.compute(scheduler="sync") + assert set(result) == {1, 2} - out = ddf.ak + 1 - assert out.compute().s.tolist() == [[1], [1, 2]] * 2 + out = ddf.s.ak + 1 + assert out.compute(scheduler="sync").ak.to_list() == [[1], [1, 2]] * 2 def test_accessor(): @@ -36,6 +35,7 @@ def test_accessor(): def test_distributed(): distributed = pytest.importorskip("distributed") + with distributed.Client(n_workers=1, threads_per_worker=1): data = pd.arrays.ArrowExtensionArray(pa.array([[0], [0, 1]] * 2)) s = pd.Series(data) diff --git a/tests/test_accessor.py b/tests/test_pandas.py similarity index 75% rename from tests/test_accessor.py rename to tests/test_pandas.py index 3222620..f2ae855 100644 --- a/tests/test_accessor.py +++ b/tests/test_pandas.py @@ -1,5 +1,8 @@ import awkward as ak import pandas as pd +import pytest + +pytest.importorskip("awkward_pandas.pandas") def test_len(): @@ -36,3 +39,12 @@ def test_array_property(): # ensure that the array associated with the accessor is the same as the original assert isinstance(s.ak.array, ak.Array) assert a == s.ak.array.tolist() + + +def test_ufunc(): + a = [[1, 2, 3], [4, 5], [6]] + s = pd.Series(a) + assert (s.ak + 1).tolist() == [[2, 3, 4], [5, 6], [7]] + + assert (s.ak + s.ak).tolist() == [[2, 4, 6], [8, 10], [12]] + assert (s.ak + s).tolist() == [[2, 4, 6], [8, 10], [12]] From 10aa053217a3dfc6f9fe667728c513dff3678ce3 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 29 Apr 2024 14:33:50 -0400 Subject: [PATCH 18/19] docstrings --- src/awkward_pandas/datetimes.py | 20 +++++++++++++- src/awkward_pandas/io.py | 48 ++++++++++++++++++++++----------- src/awkward_pandas/mixin.py | 10 +++++++ 3 files changed, 61 insertions(+), 17 deletions(-) diff --git a/src/awkward_pandas/datetimes.py b/src/awkward_pandas/datetimes.py index ed6b382..44a03de 100644 --- a/src/awkward_pandas/datetimes.py +++ b/src/awkward_pandas/datetimes.py @@ -10,7 +10,25 @@ def __init__(self, accessor) -> None: self.accessor = accessor def cast(self, target_type=None, safe=None, options=None): - raise NotImplementedError("TODO") + """Cast values to given type + + This may be the easiest way to make time types from scratch + + Examples + -------- + + >>> import pandas as pd + >>> import awkward_pandas.pandas + >>> s = pd.Series([0, 1, 2]) + >>> s.ak.dt.cast("timestamp[s]") + 0 1970-01-01 00:00:00 + 1 1970-01-01 00:00:01 + 2 1970-01-01 00:00:02 + dtype: timestamp[s][pyarrow] + """ + return self.accessor.to_output( + pc.cast(self.accessor.arrow, target_type, safe, options) + ) def ceil_temporal( self, diff --git a/src/awkward_pandas/io.py b/src/awkward_pandas/io.py index 0a36977..305e0da 100644 --- a/src/awkward_pandas/io.py +++ b/src/awkward_pandas/io.py @@ -1,20 +1,28 @@ from __future__ import annotations import awkward as ak +import fsspec import awkward_pandas.pandas def read_parquet( - url, - extract=True, - root_name="awkward", - extract_all=False, + url: str, + storage_options: dict | None = None, + extract: bool = True, **kwargs, ): - """Read a Parquet dataset with nested data into a Series or DataFrame.""" - ds = ak.from_parquet(url, **kwargs) - s = awkward_pandas.pandas.Accessor.to_output(None, ds) + """Read a Parquet dataset with nested data into a Series or DataFrame. + + Parameters + ---------- + url: data location + storage_options: any arguments for an fsspec backend + extract: whether to turn top-level records into a dataframe. If False, + will return a series. + """ + ds = ak.from_parquet(url, storage_options=storage_options, **kwargs) + s = awkward_pandas.pandas.PandasAwkwardAccessor._to_output(ds) if extract: return s.ak.unmerge() return s @@ -22,18 +30,26 @@ def read_parquet( def read_json( url, + storage_options: dict | None = None, extract=True, - root_name="awkward", - extract_all=False, **kwargs, ): - """Read a JSON dataset with nested data into a Series or DataFrame.""" - ds = ak.from_json( - url, - line_delimited=True, - **kwargs, - ) - s = awkward_pandas.pandas.Accessor.to_output(None, ds) + """Read a JSON dataset with nested data into a Series or DataFrame. + + Parameters + ---------- + url: data location + storage_options: any arguments for an fsspec backend + extract: whether to turn top-level records into a dataframe. If False, + will return a series. + """ + with fsspec.open(url, **storage_options) as f: + ds = ak.from_json( + f, + line_delimited=True, + **kwargs, + ) + s = awkward_pandas.pandas.PandasAwkwardAccessor._to_output(ds) if extract: return s.ak.unmerge() return s diff --git a/src/awkward_pandas/mixin.py b/src/awkward_pandas/mixin.py index 5c64074..d84434f 100644 --- a/src/awkward_pandas/mixin.py +++ b/src/awkward_pandas/mixin.py @@ -174,10 +174,12 @@ def __array_ufunc__(self, *args, **kwargs): @property def arrow(self) -> ak.Array: + """Data as an arrow array""" return self.to_arrow(self._obj) @classmethod def to_arrow(cls, data): + """Data as an arrow array""" raise NotImplementedError @property @@ -187,34 +189,42 @@ def array(self) -> ak.Array: @property def str(self): + """Nested string operations""" from awkward_pandas.strings import StringAccessor return StringAccessor(self) @property def dt(self): + """Nested datetime operations""" from awkward_pandas.datetimes import DatetimeAccessor return DatetimeAccessor(self) def merge(self): + """Make a single complex series out of the columns of a dataframe""" if not self.is_dataframe(self._obj): raise ValueError("Can only merge on a dataframe") out = {} for k in self._obj.columns: + # TODO: partial merge when column names are like "record.field" out[k] = self._obj[k].ak.array arr = ak.Array(out) return self.to_output(arr) def unmerge(self): + """Make dataframe out of a series of record type""" arr = self.array if not arr.fields: raise ValueError("Not array-of-records") + # TODO: partial unmerge when (some) fields are records out = {k: self.to_output(arr[k]) for k in arr.fields} return self.dataframe_type(out) @classmethod def _create_op(cls, op): + """Make functions to perform all the arithmetic, logical and comparison ops""" + def run(self, *args, **kwargs): ar2 = (ar.ak.array if hasattr(ar, "ak") else ar for ar in args) ar3 = (ar.array if isinstance(ar, cls) else ar for ar in ar2) From 17f05cad973429aaa68f2147bb89b40ed70b4855 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 29 Apr 2024 16:26:52 -0400 Subject: [PATCH 19/19] Add dt unary POC --- src/awkward_pandas/datetimes.py | 33 +++++++++++++++++++++++++++------ src/awkward_pandas/strings.py | 5 ++++- tests/test_dt.py | 18 ++++++++++++++++++ 3 files changed, 49 insertions(+), 7 deletions(-) create mode 100644 tests/test_dt.py diff --git a/src/awkward_pandas/datetimes.py b/src/awkward_pandas/datetimes.py index 44a03de..9f09b2d 100644 --- a/src/awkward_pandas/datetimes.py +++ b/src/awkward_pandas/datetimes.py @@ -5,6 +5,21 @@ import pyarrow.compute as pc +def _run_unary(layout, op, kind=None, **kw): + if layout.is_record: + [_run_unary(_, op, kind=kind, **kw) for _ in layout._contents] + elif layout.is_leaf and (kind is None or layout.dtype.kind == kind): + layout._data = ak.str._apply_through_arrow(op, layout, **kw).data + elif layout.is_option or layout.is_list: + _run_unary(layout.content, op, kind=kind, **kw) + + +def run_unary(arr: ak.Array, op, kind=None, **kw) -> ak.Array: + arr2 = ak.copy(arr) + _run_unary(arr2.layout, op, kind=kind, **kw) + return ak.Array(arr2) + + class DatetimeAccessor: def __init__(self, accessor) -> None: self.accessor = accessor @@ -19,15 +34,21 @@ def cast(self, target_type=None, safe=None, options=None): >>> import pandas as pd >>> import awkward_pandas.pandas - >>> s = pd.Series([0, 1, 2]) + >>> s = pd.Series([[0, 1], [1, 0], [2]]) >>> s.ak.dt.cast("timestamp[s]") - 0 1970-01-01 00:00:00 - 1 1970-01-01 00:00:01 - 2 1970-01-01 00:00:02 - dtype: timestamp[s][pyarrow] + 0 ['1970-01-01T00:00:00' '1970-01-01T00:00:01'] + 1 ['1970-01-01T00:00:01' '1970-01-01T00:00:00'] + 2 ['1970-01-01T00:00:02'] + dtype: list[pyarrow] """ return self.accessor.to_output( - pc.cast(self.accessor.arrow, target_type, safe, options) + run_unary( + self.accessor.array, + pc.cast, + target_type=target_type, + safe=safe, + options=options, + ) ) def ceil_temporal( diff --git a/src/awkward_pandas/strings.py b/src/awkward_pandas/strings.py index e24cf8c..6a224c7 100644 --- a/src/awkward_pandas/strings.py +++ b/src/awkward_pandas/strings.py @@ -73,7 +73,10 @@ def encode(self, encoding: str = "utf-8"): return self.accessor.to_output(encode(self.accessor.array, encoding=encoding)) def decode(self, encoding: str = "utf-8"): - """Decode Series of bytes to Series of strings. Leaves non-bytestrings alone.""" + """Decode Series of bytes to Series of strings. Leaves non-bytestrings alone. + + Validity of UTF8 is *not* checked. + """ return self.accessor.to_output(decode(self.accessor.array, encoding=encoding)) @staticmethod diff --git a/tests/test_dt.py b/tests/test_dt.py new file mode 100644 index 0000000..807da4d --- /dev/null +++ b/tests/test_dt.py @@ -0,0 +1,18 @@ +import datetime + +import pytest + +import awkward_pandas.pandas # noqa + +pd = pytest.importorskip("pandas") + + +def test_cast(): + s = pd.Series([[0, 1], [1, 0], [2]]) + out = s.ak.dt.cast("timestamp[s]") + assert str(out.dtype) == "list[pyarrow]" + assert out.to_list() == [ + [datetime.datetime(1970, 1, 1, 0, 0), datetime.datetime(1970, 1, 1, 0, 0, 1)], + [datetime.datetime(1970, 1, 1, 0, 0, 1), datetime.datetime(1970, 1, 1, 0, 0)], + [datetime.datetime(1970, 1, 1, 0, 0, 2)], + ]