Skip to content

Commit

Permalink
Merge pull request #82 from martindurant/ufunc_map
Browse files Browse the repository at this point in the history
map ufuncs and dunder ops
  • Loading branch information
martindurant authored Nov 11, 2024
2 parents 5c12191 + f43f1ac commit 5f5c752
Show file tree
Hide file tree
Showing 6 changed files with 181 additions and 15 deletions.
31 changes: 28 additions & 3 deletions src/akimbo/apply_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,22 @@
import pyarrow as pa


def match_any(*layout, **_):
    """Matcher that accepts every node.

    All positional layouts and keyword options are ignored; the result is
    always ``True``.
    """
    return True


def leaf(*layout, **_):
    """Matcher that is true for the lowest nodes of an awkward layout tree.

    Only the first layout is inspected; any further layouts and all keyword
    options are ignored.
    """
    first = layout[0]
    return first.is_leaf


def numeric(*layout, **_):
    """Matcher for leaf nodes that are not marked as string/char data.

    Only the first layout is inspected. A node matches when it is a leaf and
    its ``"__array__"`` parameter is neither ``"string"`` nor ``"char"``
    (a missing parameter counts as a match).
    """
    node = layout[0]
    text_kinds = {"string", "char"}
    # parameters are only consulted for leaves, thanks to short-circuiting
    return node.is_leaf and node.parameters.get("__array__") not in text_kinds


def run_with_transform(
arr: ak.Array,
op,
Expand All @@ -24,6 +35,8 @@ def run_with_transform(
**kw,
) -> ak.Array:
def func(layout, **kwargs):
from akimbo.utils import match_string

if not isinstance(layout, tuple):
layout = (layout,)
if all(match(lay, **(match_kwargs or {})) for lay in layout):
Expand All @@ -34,11 +47,23 @@ def func(layout, **kwargs):
elif inmode == "numpy":
# works on numpy/cupy contents
out = op(*(lay.data for lay in layout), **kw, **(match_kwargs or {}))
else:
elif inmode == "ak":
out = op(*layout, **kw, **(match_kwargs or {}))
return outtype(out) if callable(outtype) else out
else:
out = op(
*(ak.Array(lay) for lay in layout), **kw, **(match_kwargs or {})
)
if callable(outtype):
return outtype(out)
elif isinstance(out, ak.Array):
return out.layout
else:
return out
if match_string(*layout):
# non-string op may fail to descend into string
return layout[0]

return ak.transform(func, arr, *others)
return ak.transform(func, arr, *others, allow_records=True)


def dec(
Expand Down
73 changes: 67 additions & 6 deletions src/akimbo/mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import awkward as ak
import pyarrow.compute as pc

from akimbo.apply_tree import dec
from akimbo.apply_tree import dec, match_any, numeric, run_with_transform
from akimbo.utils import to_ak_layout

methods = [
_ for _ in (dir(ak)) if not _.startswith(("_", "ak_")) and not _[0].isupper()
Expand Down Expand Up @@ -179,6 +180,9 @@ def apply(self, fn: Callable, where=None, **kwargs):
The function should take an ak array as input and produce an
ak array or scalar.
Unlike ``transform``, the function takes and returns ak.Array instances
and acts on a whole schema tree.
"""
if where:
bits = tuple(where.split("."))
Expand All @@ -190,6 +194,44 @@ def apply(self, fn: Callable, where=None, **kwargs):
final = fn(self.array)
return self.to_output(final)

def transform(
    self, fn: Callable, *others, where=None, match=match_any, inmode="ak", **kwargs
):
    """Apply an arbitrary function to selected parts of the data tree

    This process walks through the data's schema tree, and applies the given
    function only on the matching nodes.

    Parameters
    ----------
    fn: the operation you want to perform. Typically unary or binary, and may take
        extra kwargs
    others: extra arguments, perhaps other akimbo series
    where: path in the schema tree to apply this; nested fields are separated
        by "." (e.g. ``"a.b"``)
    match: when walking the schema, this determines if a node should be processed;
        it will be a function taking one or more ak.contents classes.
        akimbo.apply_tree contains convenience matchers match_any, leaf and
        numeric, and more matchers can be found in the string and datetime modules
    inmode: data should be passed to the given function as:
        "arrow" | "numpy" (includes cupy) | "ak" layout | "array" high-level ak.Array
    kwargs: passed to the operation, except those that are taken by
        ``run_with_transform``.

    Returns
    -------
    The transformed data, wrapped by ``self.to_output``.
    """
    if where:
        # "a.b" addresses field ``b`` nested inside field ``a``
        bits = tuple(where.split("."))
        arr = self.array
        part = arr[bits]
        # TODO: apply ``where`` to any arrays in others
        out = run_with_transform(
            part, fn, match=match, others=others, inmode=inmode, **kwargs
        )
        # Hand the path over as a sequence: a plain dotted string would be
        # taken as one literal field name and added at the top level instead
        # of replacing the nested field that was read above.
        final = ak.with_field(arr, out, where=list(bits))
    else:
        final = run_with_transform(
            self.array, fn, match=match, others=others, inmode=inmode, **kwargs
        )
    return self.to_output(final)

def __getitem__(self, item):
out = self.array.__getitem__(item)
return self.to_output(out)
Expand Down Expand Up @@ -331,12 +373,31 @@ def join(
def _create_op(cls, op):
"""Make functions to perform all the arithmetic, logical and comparison ops"""

def run(self, *args, **kwargs):
ar2 = (ar.ak.array if hasattr(ar, "ak") else ar for ar in args)
ar3 = (ar.array if isinstance(ar, cls) else ar for ar in ar2)
return self.to_output(op(self.array, *ar3, **kwargs))
def op2(*args, extra=None, **kw):
args = list(args) + list(extra or [])
return op(*args, **kw)

def f(self, *args, **kw):
# TODO: test here is for literals, but really we want "don't know how to
# array that" condition
extra = (_ for _ in args if isinstance(_, (str, int, float)))
args = (
to_ak_layout(_) for _ in args if not isinstance(_, (str, int, float))
)
out = self.transform(
op2,
*args,
match=numeric,
inmode="numpy",
extra=extra,
outtype=ak.contents.NumpyArray,
**kw,
)
if isinstance(self._obj, self.dataframe_type):
return out.ak.unmerge()
return out

return run
return f

def __getattr__(self, item):
arr = self.array
Expand Down
16 changes: 10 additions & 6 deletions src/akimbo/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,7 @@

from akimbo.apply_tree import dec
from akimbo.mixin import Accessor


def match_string(*layout):
return layout[0].is_list and layout[0].parameter("__array__") == "string"
from akimbo.utils import match_string


def _encode(layout):
Expand Down Expand Up @@ -53,14 +50,21 @@ def _decode(layout):

# make sensible defaults for strptime
strptime = functools.wraps(pc.strptime)(
lambda *args, format="%FT%T", unit="s", error_is_null=True, **kw:
pc.strptime(*args, format=format, unit=unit, error_is_null=error_is_null)
lambda *args, format="%FT%T", unit="s", error_is_null=True, **kw: pc.strptime(
*args, format=format, unit=unit, error_is_null=error_is_null
)
)


class StringAccessor:
"""String operations on nested/var-length data"""

# TODO: implement dunder add (concat strings) and mul (repeat strings)
# - s.ak.str + "suffix" (and arguments swapped)
# - s.ak.str + s2.ak.str (with matching schemas)
# - s.ak.str * N (and arguments swapped)
# - s.ak.str * s (where each string maps to integers for variable repeats)

def __init__(self, accessor):
self.accessor = accessor

Expand Down
20 changes: 20 additions & 0 deletions src/akimbo/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
from __future__ import annotations

import awkward as ak


class NoAttributes:
"""Allows importing akimbo.cudf even if cudf isn't installed
Expand All @@ -20,3 +25,18 @@ def __call__(self, *args, **kwargs):
__name__ = "DummyAttributesObject"
__doc__ = None
__annotations__ = None


def to_ak_layout(ar):
    """Coerce ``ar`` into awkward form.

    The checks are tried in this order:

    1. objects with an ``ak`` attribute yield ``ar.ak.array``
       (presumably akimbo-enabled series/frames - confirm against callers);
    2. objects with an ``array`` attribute yield ``ar.array``;
    3. ``ak.Array`` instances are returned unchanged;
    4. anything else is passed through ``ak.to_layout`` and wrapped in
       ``ak.Array``.
    """
    if hasattr(ar, "ak"):
        return ar.ak.array
    if hasattr(ar, "array"):
        return ar.array
    if isinstance(ar, ak.Array):
        return ar
    return ak.Array(ak.to_layout(ar))


def match_string(*layout):
    """Matcher for string nodes.

    Only the first layout is inspected: it must be list-like and have its
    ``"__array__"`` parameter equal to ``"string"``.
    """
    node = layout[0]
    # the parameter lookup only happens for list-type nodes
    return node.is_list and node.parameter("__array__") == "string"
52 changes: 52 additions & 0 deletions tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,58 @@ def test_ufunc():
assert (s.ak + s.ak).tolist() == [[2, 4, 6], [8, 10], [12]]
assert (s.ak + s).tolist() == [[2, 4, 6], [8, 10], [12]]

s = pd.DataFrame({"a": s})
assert (s.ak + 1).a.tolist() == [[2, 3, 4], [5, 6], [7]]

assert (s.ak + s.ak).a.tolist() == [[2, 4, 6], [8, 10], [12]]
assert (s.ak + s).a.tolist() == [[2, 4, 6], [8, 10], [12]]


def test_manual_ufunc():
    """``transform`` with the numeric matcher adds one to numbers only."""
    from akimbo.apply_tree import numeric

    strings = [["hey", "hi", "ho"], [None], ["blah"]]
    numbers = [[1, 2, 3], [4, 5], [6]]
    df = pd.DataFrame({"a": strings, "b": numbers})

    def add_one(x):
        return x + 1

    result = df.ak.transform(
        add_one, match=numeric, inmode="numpy", outtype=ak.contents.NumpyArray
    )

    # string column is untouched, every number is incremented
    expected = [
        {"a": text, "b": [n + 1 for n in nums]}
        for text, nums in zip(strings, numbers)
    ]
    assert result.tolist() == expected


def test_mixed_ufunc():
    """Dunder ops on a mixed frame act on numbers and leave strings alone."""
    # ufuncs are numeric only by default, doesn't touch strings
    strings = [["hey", "hi", "ho"], [None], ["blah"]]
    numbers = [[1, 2, 3], [4, 5], [6]]
    df = pd.DataFrame({"a": strings, "b": numbers})

    def rows(op):
        # expected records: strings unchanged, ``op`` applied to each number
        return [
            {"a": text, "b": [op(n) for n in nums]}
            for text, nums in zip(strings, numbers)
        ]

    plus_one = df.ak + 1
    assert plus_one.ak.tolist() == rows(lambda n: n + 1)

    doubled = rows(lambda n: n * 2)
    times_two = df.ak * 2
    assert times_two.ak.tolist() == doubled
    # reflected operand order must give the same answer
    two_times = 2 * df.ak
    assert two_times.ak.tolist() == doubled

    same = df.ak == df.ak
    assert same["b"].tolist() == [[True, True, True], [True, True], [True]]
    assert same["a"].tolist() == df["a"].tolist()


def test_to_autoarrow():
a = [[1, 2, 3], [4, 5], [6]]
Expand Down
4 changes: 4 additions & 0 deletions tests/test_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,7 @@ def test_ufunc():

s2 = np.add(s.ak, 1)
assert s2.to_list() == [[2, 3, 4], [], [5, 6]]

df = pl.DataFrame({"a": s})
df2 = df.ak + 1
assert df2["a"].to_list() == [[2, 3, 4], [], [5, 6]]

0 comments on commit 5f5c752

Please sign in to comment.