Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add all unary dt methods #53

Merged
merged 2 commits into from
May 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
239 changes: 56 additions & 183 deletions src/awkward_pandas/datetimes.py
Original file line number Diff line number Diff line change
@@ -1,206 +1,79 @@
from __future__ import annotations

import functools
import inspect

import awkward as ak
import pyarrow as pa
import pyarrow.compute as pc


def _run_unary(layout, op, kind=None, **kw):
if layout.is_record:
[_run_unary(_, op, kind=kind, **kw) for _ in layout._contents]
elif layout.is_leaf and (kind is None or layout.dtype.kind == kind):
layout._data = ak.str._apply_through_arrow(op, layout, **kw).data
elif layout.is_option or layout.is_list:
_run_unary(layout.content, op, kind=kind, **kw)
if layout.is_leaf and (kind is None or layout.dtype.kind == kind):
return ak.str._apply_through_arrow(op, layout, **kw)
if layout.is_list and layout.parameter("__array__") in ["bytestring", "string"]:
return ak.str._apply_through_arrow(op, layout, **kw)


def run_unary(arr: ak.Array, op, kind=None, **kw) -> ak.Array:
arr2 = ak.copy(arr)
_run_unary(arr2.layout, op, kind=kind, **kw)
return ak.Array(arr2)
def func(x, **kwargs):
return _run_unary(x, op, kind=kind, **kw)

return ak.transform(func, arr)

class DatetimeAccessor:
def __init__(self, accessor) -> None:
self.accessor = accessor

def cast(self, target_type=None, safe=None, options=None):
"""Cast values to given type
def dec(func, mode="unary"):
# TODO: require kind= on functions that need timestamps

This may be the easiest way to make time types from scratch
if mode == "unary":
# TODO: modify __doc__?
@functools.wraps(func)
def f(self, *args, **kwargs):
if args:
sig = list(inspect.signature(func).parameters)[1:]
kwargs.update({k: arg for k, arg in zip(sig, args)})

Examples
--------

>>> import pandas as pd
>>> import awkward_pandas.pandas
>>> s = pd.Series([[0, 1], [1, 0], [2]])
>>> s.ak.dt.cast("timestamp[s]")
0 ['1970-01-01T00:00:00' '1970-01-01T00:00:01']
1 ['1970-01-01T00:00:01' '1970-01-01T00:00:00']
2 ['1970-01-01T00:00:02']
dtype: list<item: timestamp[s]>[pyarrow]
"""
return self.accessor.to_output(
run_unary(
self.accessor.array,
pc.cast,
target_type=target_type,
safe=safe,
options=options,
return self.accessor.to_output(
run_unary(self.accessor.array, func, **kwargs)
)
)

def ceil_temporal(
self,
/,
multiple=1,
unit="day",
*,
week_starts_monday=True,
ceil_is_strictly_greater=False,
calendar_based_origin=False,
options=None,
):
raise NotImplementedError("TODO")
else:
raise NotImplementedError
return f

def floor_temporal(
self,
/,
multiple=1,
unit="day",
*,
week_starts_monday=True,
ceil_is_strictly_greater=False,
calendar_based_origin=False,
options=None,
):
raise NotImplementedError("TODO")

def round_temporal(
self,
/,
multiple=1,
unit="day",
*,
week_starts_monday=True,
ceil_is_strictly_greater=False,
calendar_based_origin=False,
options=None,
):
raise NotImplementedError("TODO")

def run_end_decode(self, array):
raise NotImplementedError("TODO")

def run_end_encode(
self,
/,
run_end_type=pa.int32(),
*,
options=None,
):
raise NotImplementedError("TODO")

def strftime(
self,
/,
format="%Y-%m-%dT%H:%M:%S",
locale="C",
*,
options=None,
):
raise NotImplementedError("TODO")

def strptime(
self,
/,
format,
unit,
error_is_null=False,
*,
options=None,
):
raise NotImplementedError("TODO")

def day(self):
raise NotImplementedError("TODO")

def day_of_week(
self,
/,
*,
count_from_zero=True,
week_start=1,
options=None,
):
raise NotImplementedError("TODO")

def day_of_year(self):
raise NotImplementedError("TODO")

def hour(self):
raise NotImplementedError("TODO")

def iso_week(self):
raise NotImplementedError("TODO")

def iso_year(self):
raise NotImplementedError("TODO")

def iso_calendar(self):
raise NotImplementedError("TODO")

def is_leap_year(self):
raise NotImplementedError("TODO")

def microsecond(self):
raise NotImplementedError("TODO")

def millisecond(self):
raise NotImplementedError("TODO")

def minute(self):
raise NotImplementedError("TODO")

def month(self):
raise NotImplementedError("TODO")

def nanosecond(self):
raise NotImplementedError("TODO")

def quarter(self):
raise NotImplementedError("TODO")

def second(self):
raise NotImplementedError("TODO")

def subsecond(self):
raise NotImplementedError("TODO")

def us_week(self):
raise NotImplementedError("TODO")

def us_year(self):
raise NotImplementedError("TODO")

def week(
self,
/,
*,
week_starts_monday=True,
count_from_zero=False,
first_week_is_fully_in_year=False,
options=None,
):
raise NotImplementedError("TODO")

def year(self):
raise NotImplementedError("TODO")

def year_month_day(self):
raise NotImplementedError("TODO")
class DatetimeAccessor:
def __init__(self, accessor) -> None:
self.accessor = accessor

# Unary temporal operations: each attribute is the same-named
# ``pyarrow.compute`` function wrapped by ``dec`` so that it is applied to the
# accessor's array and the result is passed through ``accessor.to_output``.
cast = dec(pc.cast)
ceil_temporal = dec(pc.ceil_temporal)
floor_temporal = dec(pc.floor_temporal)
round_temporal = dec(pc.round_temporal)
# Misspelt name that was exposed previously; kept as an alias so existing
# callers of ``reound_temporal`` continue to work.
reound_temporal = round_temporal
strftime = dec(pc.strftime)
strptime = dec(pc.strptime)
day = dec(pc.day)
day_of_week = dec(pc.day_of_week)
day_of_year = dec(pc.day_of_year)
hour = dec(pc.hour)
iso_week = dec(pc.iso_week)
iso_year = dec(pc.iso_year)
iso_calendar = dec(pc.iso_calendar)
is_leap_year = dec(pc.is_leap_year)
microsecond = dec(pc.microsecond)
millisecond = dec(pc.millisecond)
minute = dec(pc.minute)
month = dec(pc.month)
nanosecond = dec(pc.nanosecond)
quarter = dec(pc.quarter)
second = dec(pc.second)
subsecond = dec(pc.subsecond)
us_week = dec(pc.us_week)
us_year = dec(pc.us_year)
week = dec(pc.week)
year = dec(pc.year)
year_month_day = dec(pc.year_month_day)

# the rest are binary
def day_time_interval_between(self, end):
    """Placeholder for a binary operation taking a second operand ``end``.

    Raises
    ------
    NotImplementedError
        Always; binary operations are not implemented yet.
    """
    raise NotImplementedError("TODO")

Expand Down
13 changes: 11 additions & 2 deletions src/awkward_pandas/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,13 @@ def read_parquet(
extract: bool = True,
**kwargs,
):
"""Read a Parquet dataset with nested data into a Series or DataFrame.
"""Read a Parquet dataset with nested data into a pandas Series or DataFrame.

This may cope with some deeply nested structures that pandas refuses
to read by itself.

You can pass a selection of columns to read (list of strings), and
other columns will not be parsed into memory.

Parameters
----------
Expand All @@ -34,7 +40,10 @@ def read_json(
extract=True,
**kwargs,
):
"""Read a JSON dataset with nested data into a Series or DataFrame.
"""Read a JSON dataset with nested data into a pandas Series or DataFrame.

You can pass a selection of columns to read (list or jsonschema format), and
other columns will not be parsed into memory.

Parameters
----------
Expand Down
14 changes: 14 additions & 0 deletions tests/test_dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,17 @@ def test_cast():
[datetime.datetime(1970, 1, 1, 0, 0, 1), datetime.datetime(1970, 1, 1, 0, 0)],
[datetime.datetime(1970, 1, 1, 0, 0, 2)],
]


def test_unary_unit():
    """Round-trip: integers cast to ``timestamp[s]`` yield themselves as seconds."""
    original = pd.Series([[0, 1], [1, 0], [2]])
    as_timestamps = original.ak.dt.cast("timestamp[s]")
    seconds = as_timestamps.ak.dt.second()
    # All inputs are below 60, so the seconds component equals the input value.
    assert original.to_list() == seconds.to_list()


def test_bad_type():
    """Extracting seconds from plain integers (no cast to a timestamp) fails."""
    # NOTE: the error currently comes from arrow; a more specific exception
    # raised before reaching arrow may be preferable.
    integers = pd.Series([[0, 1], [1, 0], [2]])
    with pytest.raises(NotImplementedError):
        integers.ak.dt.second()
Loading