From dcbba9a45ae2a190b31badec530ea54a58437606 Mon Sep 17 00:00:00 2001 From: Shoumik Palkar Date: Sat, 4 Apr 2020 13:50:54 -0700 Subject: [PATCH] DataFrame foundations (#510) --- .gitignore | 1 + weld-python/tests/grizzly/core/test_frame.py | 100 ++++ weld-python/tests/grizzly/core/test_series.py | 37 +- .../tests/grizzly/core/test_strings.py | 8 + weld-python/tests/weld/core/test_lazy.py | 2 +- weld-python/tests/weld/encoders/test_numpy.py | 5 + .../tests/weld/encoders/test_primitives.py | 2 +- weld-python/weld/compile.py | 8 +- weld-python/weld/encoders/__init__.py | 1 - weld-python/weld/encoders/numpy.py | 37 +- weld-python/weld/encoders/primitives.py | 45 +- weld-python/weld/encoders/struct.py | 78 +++ weld-python/weld/grizzly/__init__.py | 1 + weld-python/weld/grizzly/core/forwarding.py | 71 +++ weld-python/weld/grizzly/core/frame.py | 349 ++++++++++++ weld-python/weld/grizzly/core/generic.py | 109 ++++ .../weld/grizzly/core/indexes/__init__.py | 2 + weld-python/weld/grizzly/core/indexes/base.py | 9 + .../weld/grizzly/core/indexes/column.py | 153 +++++ weld-python/weld/grizzly/core/series.py | 524 +++++++++++------- weld-python/weld/grizzly/weld/ops.py | 16 + weld-python/weld/types.py | 3 + 22 files changed, 1273 insertions(+), 288 deletions(-) create mode 100644 weld-python/tests/grizzly/core/test_frame.py create mode 100644 weld-python/weld/encoders/struct.py create mode 100644 weld-python/weld/grizzly/core/forwarding.py create mode 100644 weld-python/weld/grizzly/core/frame.py create mode 100644 weld-python/weld/grizzly/core/generic.py create mode 100644 weld-python/weld/grizzly/core/indexes/__init__.py create mode 100644 weld-python/weld/grizzly/core/indexes/base.py create mode 100644 weld-python/weld/grizzly/core/indexes/column.py diff --git a/.gitignore b/.gitignore index 2f64d6bdc..75a41b30d 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ Cargo.lock .#* *~ *.swp +*.swo *.bc *.pyc *.o diff --git a/weld-python/tests/grizzly/core/test_frame.py b/weld-python/tests/grizzly/core/test_frame.py new file mode 100644 index 000000000..519b501fb --- /dev/null +++ b/weld-python/tests/grizzly/core/test_frame.py @@ -0,0 +1,100 @@ +""" +Test basic DataFrame functionality. + +""" + +import pandas as pd +import pytest +import weld.grizzly as gr + +def get_frames(cls, strings): + """ + Returns two DataFrames for testing binary operators. + + The DataFrames have columns of overlapping/different names, types, etc. + + """ + df1 = pd.DataFrame({ + 'name': ['Bob', 'Sally', 'Kunal', 'Deepak', 'James', 'Pratiksha'], + 'lastName': ['Kahn', 'Lopez', 'Smith', 'Narayanan', 'Thomas', 'Thaker'], + 'age': [20, 30, 35, 20, 50, 35], + 'score': [20.0, 30.0, 35.0, 50.0, 35.0, 25.0] + }) + df2 = pd.DataFrame({ + 'firstName': ['Bob', 'Sally', 'Kunal', 'Deepak', 'James', 'Pratiksha'], + 'lastName': ['Kahn', 'Lopez', 'smith', 'narayanan', 'Thomas', 'thaker'], + 'age': [25, 30, 45, 20, 60, 35], + 'scores': [20.0, 30.0, 35.0, 50.0, 35.0, 25.0] + }) + if not strings: + df1 = df1.drop(['name', 'lastName'], axis=1) + df2 = df2.drop(['firstName', 'lastName'], axis=1) + return (cls(df1), cls(df2)) + +def _test_binop(pd_op, gr_op, strings=True): + """ + Test a binary operator. + + Binary operators align on column name. For columns that don't exist in both + DataFrames, the column is filled with NaN (for non-comparison operations) and + or False (for comparison operations). + + If the RHS is a Series, the Series should be added to all columns. + + """ + df1, df2 = get_frames(pd.DataFrame, strings) + gdf1, gdf2 = get_frames(gr.GrizzlyDataFrame, strings) + + expect = pd_op(df1, df2) + result = gr_op(gdf1, gdf2).to_pandas() + assert expect.equals(result) + +def test_evaluation(): + # Test to make sure that evaluating a DataFrame once caches the result/ + # doesn't cause another evaluation. + df1 = gr.GrizzlyDataFrame({ + 'age': [20, 30, 35, 20, 50, 35], + 'score': [20.0, 30.0, 35.0, 50.0, 35.0, 25.0] + }) + df2 = gr.GrizzlyDataFrame({ + 'age': [20, 30, 35, 20, 50, 35], + 'scores': [20.0, 30.0, 35.0, 50.0, 35.0, 25.0] + }) + df3 = (df1 + df2) * df2 + df1 / df2 + assert not df3.is_value + df3.evaluate() + assert df3.is_value + weld_value = df3.weld_value + df3.evaluate() + # The same weld_value should be returned. + assert weld_value is df3.weld_value + +def test_add(): + _test_binop(pd.DataFrame.add, gr.GrizzlyDataFrame.add, strings=False) + +def test_sub(): + _test_binop(pd.DataFrame.sub, gr.GrizzlyDataFrame.sub, strings=False) + +def test_mul(): + _test_binop(pd.DataFrame.mul, gr.GrizzlyDataFrame.mul, strings=False) + +def test_div(): + _test_binop(pd.DataFrame.div, gr.GrizzlyDataFrame.div, strings=False) + +def test_eq(): + _test_binop(pd.DataFrame.eq, gr.GrizzlyDataFrame.eq, strings=True) + +def test_ne(): + _test_binop(pd.DataFrame.ne, gr.GrizzlyDataFrame.ne, strings=True) + +def test_le(): + _test_binop(pd.DataFrame.le, gr.GrizzlyDataFrame.le, strings=False) + +def test_lt(): + _test_binop(pd.DataFrame.lt, gr.GrizzlyDataFrame.lt, strings=False) + +def test_ge(): + _test_binop(pd.DataFrame.ge, gr.GrizzlyDataFrame.ge, strings=False) + +def test_gt(): + _test_binop(pd.DataFrame.gt, gr.GrizzlyDataFrame.gt, strings=False) diff --git a/weld-python/tests/grizzly/core/test_series.py b/weld-python/tests/grizzly/core/test_series.py index eaf603cf5..fe569ef58 100644 --- a/weld-python/tests/grizzly/core/test_series.py +++ b/weld-python/tests/grizzly/core/test_series.py @@ -85,25 +85,6 @@ def eval_expression(cls): yield a + b + c * d - e _compare_vs_pandas(eval_expression) -def test_basic_fallback(): - # Tests basic unsupported functionality. - # NOTE: This test will need to change as more features are added... - def eval_expression(cls): - a = cls([1, 2, 3]) - b = cls([-4, 5, -6]) - # Test 1: abs() - c = a + b - yield (c.abs() + a) - # Test 2: argmin() - c = a + b - yield cls(c.argmin()) - # Test 3: reindex() - c = a + b - res = c.reindex(index=[2, 0, 1]) - # Falls back to Pandas, since we don't support indices. - assert isinstance(res, pd.Series) - _compare_vs_pandas(eval_expression) - def test_scalar(): types = ['int8', 'uint8', 'int16', 'uint16', 'int32',\ 'uint32', 'int64', 'uint64', 'float32', 'float64'] @@ -129,6 +110,19 @@ def test_indexing(): assert np.array_equal(x[x == 2].evaluate().values, np.array([2], dtype='int64')) assert np.array_equal(x[x < 0].evaluate().values, np.array([], dtype='int64')) +def test_name(): + # Test that names propagate after operations. + x = gr.GrizzlySeries([1,2,3], name="testname") + y = x + x + assert y.evaluate().name == "testname" + y = x.agg(['sum', 'count']) + assert y.evaluate().name == "testname" + y = x[:2] + assert y.evaluate().name == "testname" + y = x[x == 1] + assert y.evaluate().name == "testname" + + def test_unsupported_binop_error(): # Test unsupported from weld.grizzly.core.error import GrizzlyError @@ -136,3 +130,8 @@ def test_unsupported_binop_error(): a = gr.GrizzlySeries([1,2,3]) b = pd.Series([1,2,3]) a.add(b) + + with pytest.raises(TypeError): + a = gr.GrizzlySeries(["hello", "world"]) + b = gr.GrizzlySeries(["hello", "world"]) + a.divide(b) diff --git a/weld-python/tests/grizzly/core/test_strings.py b/weld-python/tests/grizzly/core/test_strings.py index e760c1cd0..c807bae3b 100644 --- a/weld-python/tests/grizzly/core/test_strings.py +++ b/weld-python/tests/grizzly/core/test_strings.py @@ -64,6 +64,14 @@ def test_get(): pandas_result = pd.Series(expect) assert pandas_result.equals(grizzly_result) +def test_eq(): + left = ["hello", "world", "strings", "morestrings"] + right = ["hel", "world", "string", "morestrings"] + x = gr.GrizzlySeries(left) + y = gr.GrizzlySeries(right) + assert list(x.eq(y).evaluate().values) == [False, True, False, True] + assert list(x.ne(y).evaluate().values) == [True, False, True, False] + def test_strip(): compare_vs_pandas('strip', ["", " hi ", diff --git a/weld-python/tests/weld/core/test_lazy.py b/weld-python/tests/weld/core/test_lazy.py index 61e3b5164..cac6da22c 100644 --- a/weld-python/tests/weld/core/test_lazy.py +++ b/weld-python/tests/weld/core/test_lazy.py @@ -2,7 +2,7 @@ Tests for constructing and evaluating lazy operations. """ -from weld.encoders import PrimitiveWeldEncoder, PrimitiveWeldDecoder +from weld.encoders.primitives import PrimitiveWeldEncoder, PrimitiveWeldDecoder from weld.types import * from weld.lazy import * diff --git a/weld-python/tests/weld/encoders/test_numpy.py b/weld-python/tests/weld/encoders/test_numpy.py index c353453b4..2e1f91ef9 100644 --- a/weld-python/tests/weld/encoders/test_numpy.py +++ b/weld-python/tests/weld/encoders/test_numpy.py @@ -99,6 +99,11 @@ def test_float32_vec(): def test_float64_vec(): encdec(array('float64'), WeldVec(F64())) +def test_struct_of_vecs(): + arrays = (array('float32'), array('uint16'), array('uint32')) + ty = WeldStruct([WeldVec(F32()), WeldVec(U16()), WeldVec(U32())]) + encdec(arrays, ty) + def test_type_conversions(): types = ['bool', 'int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64', 'float32', 'float64'] diff --git a/weld-python/tests/weld/encoders/test_primitives.py b/weld-python/tests/weld/encoders/test_primitives.py index 09e551075..8c1a9f1ee 100644 --- a/weld-python/tests/weld/encoders/test_primitives.py +++ b/weld-python/tests/weld/encoders/test_primitives.py @@ -5,7 +5,7 @@ import ctypes from .helpers import encdec_factory -from weld.encoders import PrimitiveWeldEncoder, PrimitiveWeldDecoder +from weld.encoders.primitives import PrimitiveWeldEncoder, PrimitiveWeldDecoder from weld.types import * encdec = encdec_factory(PrimitiveWeldEncoder, PrimitiveWeldDecoder) diff --git a/weld-python/weld/compile.py b/weld-python/weld/compile.py index 8ef0586d0..d626e8d63 100644 --- a/weld-python/weld/compile.py +++ b/weld-python/weld/compile.py @@ -4,10 +4,10 @@ """ -from .core import * -from .encoders import WeldEncoder, WeldDecoder, PrimitiveWeldEncoder,\ - PrimitiveWeldDecoder -from .types import WeldType +from weld.core import * +from weld.encoders import WeldEncoder, WeldDecoder +from weld.encoders.primitives import PrimitiveWeldEncoder, PrimitiveWeldDecoder +from weld.types import WeldType import ctypes import logging diff --git a/weld-python/weld/encoders/__init__.py b/weld-python/weld/encoders/__init__.py index d1ced7685..79979cb27 100644 --- a/weld-python/weld/encoders/__init__.py +++ b/weld-python/weld/encoders/__init__.py @@ -1,3 +1,2 @@ from .encoder_base import * -from .primitives import PrimitiveWeldEncoder, PrimitiveWeldDecoder diff --git a/weld-python/weld/encoders/numpy.py b/weld-python/weld/encoders/numpy.py index 23852f946..e798df67e 100644 --- a/weld-python/weld/encoders/numpy.py +++ b/weld-python/weld/encoders/numpy.py @@ -17,7 +17,7 @@ import ctypes import numpy as np -from .encoder_base import * +from weld.encoders.struct import StructWeldEncoder, StructWeldDecoder from weld.types import * # We just need this for the path. @@ -118,15 +118,15 @@ def binop_output_type(left_ty, right_ty, truediv=False): Examples -------- >>> binop_output_type(Bool(), Bool()) - + bool >>> binop_output_type(I8(), U16()) - + i32 >>> binop_output_type(U8(), U16()) - + u16 >>> binop_output_type(F32(), U16()) - + f32 >>> binop_output_type(I8(), U64()) - + f64 """ if not truediv and left_ty == right_ty: return left_ty @@ -238,13 +238,13 @@ def dtype_to_weld_type(ty): Examples -------- >>> dtype_to_weld_type('int32') - + i32 >>> dtype_to_weld_type('float') - + f64 >>> dtype_to_weld_type('i8') - + i64 >>> dtype_to_weld_type(np.int16) - + i16 Parameters ---------- @@ -295,7 +295,11 @@ def weld_string_array_to_numpy(arr): return result -class NumPyWeldEncoder(WeldEncoder): +class NumPyWeldEncoder(StructWeldEncoder): + """ + Encodes NumPy arrays as Weld arrays. + + """ @staticmethod def _convert_1d_array(array, check_type=None): @@ -353,7 +357,7 @@ def _is_string_array(obj): return False return True - def encode(self, obj, ty): + def encode_element(self, obj, ty): if NumPyWeldEncoder._is_string_array(obj): assert ty == WeldVec(WeldVec(I8())) return StringConversionFuncs.numpy_string_array_to_weld(obj) @@ -365,9 +369,12 @@ def encode(self, obj, ty): else: raise TypeError("Unexpected type {} in NumPy encoder".format(type(obj))) -class NumPyWeldDecoder(WeldDecoder): - """ Decodes an encoded Weld array into a NumPy array. +class NumPyWeldDecoder(StructWeldDecoder): + """ + Decodes an encoded Weld array into a NumPy array. + Examples + -------- >>> arr = np.array([1,2,3], dtype='int32') >>> encoded = NumPyWeldEncoder().encode(arr, WeldVec(I32())) >>> NumPyWeldDecoder().decode(ctypes.pointer(encoded), WeldVec(I32())) @@ -457,7 +464,7 @@ def _is_string_array(restype): return True return False - def decode(self, obj, restype, context=None): + def decode_element(self, obj, restype, context=None): # A 1D NumPy array obj = obj.contents if NumPyWeldDecoder._is_string_array(restype): diff --git a/weld-python/weld/encoders/primitives.py b/weld-python/weld/encoders/primitives.py index 8c7a6f36b..fa0055158 100644 --- a/weld-python/weld/encoders/primitives.py +++ b/weld-python/weld/encoders/primitives.py @@ -6,15 +6,14 @@ """ -from .encoder_base import * -from ..types import * +from weld.encoders.struct import StructWeldEncoder, StructWeldDecoder +from weld.types import * -class PrimitiveWeldEncoder(WeldEncoder): - """ - A primitive encoder for booleans, integers and floats. +import ctypes - Eventually, this will also support encoding for tuples (structs) of other - primitive types. +class PrimitiveWeldEncoder(StructWeldEncoder): + """ + A primitive encoder for booleans, integers, floats, and tuples thereof. Examples -------- @@ -33,24 +32,13 @@ class PrimitiveWeldEncoder(WeldEncoder): >>> s._1 1 """ - def encode(self, obj, target_type): + def encode_element(self, obj, target_type): encoder = target_type.ctype_class - if isinstance(target_type, WeldStruct): - struct = encoder() - for (i, (field, weld_ty)) in enumerate(zip(\ - obj, target_type.field_types)): - encoded = self.encode(field, weld_ty) - setattr(struct, "_" + str(i), encoded) - return struct - else: - return encoder(obj) + return encoder(obj) -class PrimitiveWeldDecoder(WeldDecoder): +class PrimitiveWeldDecoder(StructWeldDecoder): """ - A primitive encoder for booleans, integers, and floats. - - Eventually, this will also support decoding for structs (tuples) of other - primitive types. + A primitive encoder for booleans, integers, floats, and tuples thereof. Examples -------- @@ -65,18 +53,9 @@ class PrimitiveWeldDecoder(WeldDecoder): >>> decoder.decode(ctypes.pointer(x), struct_type) (1, 1.0) """ - def decode(self, obj, restype, context=None): + + def decode_element(self, obj, restype, context=None): if isinstance(restype, Bool): return bool(obj.contents.value) - elif isinstance(restype, WeldStruct): - struct = obj.contents - ctype_class = restype.ctype_class - result = [] - for (i, (weld_ty, (cfield, cty))) in enumerate(zip(\ - restype.field_types, ctype_class._fields_)): - ofs = getattr(ctype_class, cfield).offset - p = ctypes.pointer(cty.from_buffer(struct, ofs)) - result.append(self.decode(p, weld_ty)) - return tuple(result) else: return obj.contents.value diff --git a/weld-python/weld/encoders/struct.py b/weld-python/weld/encoders/struct.py new file mode 100644 index 000000000..047a813a1 --- /dev/null +++ b/weld-python/weld/encoders/struct.py @@ -0,0 +1,78 @@ +""" +Implements an encoder mixin for tuples of values. + +Tuples are encoded as Weld Structs. Weld structs are decoded back into +Python tuples. This encoder takes another encoder to encode or decode +individual elements. + +""" + +from abc import abstractmethod +from weld.encoders import WeldEncoder, WeldDecoder +from weld.types import WeldStruct + +import ctypes + +class StructWeldEncoder(WeldEncoder): + """ + An encoder mixin for structs, to be subclassed by another encoders. + + Subclasses should encode their own types in 'encode_element()' instead + of 'encode()'. + + """ + + @abstractmethod + def encode_element(self, obj, target_type): + """ + Encodes a single non-struct element. + + Subclasses' 'encode()' implementation should go here. + + """ + pass + + def encode(self, obj, target_type): + if isinstance(target_type, WeldStruct): + encoder = target_type.ctype_class + struct = encoder() + for (i, (field, weld_ty)) in enumerate(zip(\ + obj, target_type.field_types)): + encoded = self.encode(field, weld_ty) + setattr(struct, "_" + str(i), encoded) + return struct + else: + return self.encode_element(obj, target_type) + +class StructWeldDecoder(WeldDecoder): + """ + A decoder abstract class for structs. + + Subclasses should encode their own types in 'decode_element()' instead + of 'decode()'. + + """ + + @abstractmethod + def decode_element(self, obj, restype, context=None): + """ + Decodes a single non-struct element. + + Subclasses' 'decode()' implementation should go here. + + """ + pass + + def decode(self, obj, restype, context=None): + if isinstance(restype, WeldStruct): + struct = obj.contents + ctype_class = restype.ctype_class + result = [] + for (i, (weld_ty, (cfield, cty))) in enumerate(zip(\ + restype.field_types, ctype_class._fields_)): + ofs = getattr(ctype_class, cfield).offset + p = ctypes.pointer(cty.from_buffer(struct, ofs)) + result.append(self.decode(p, weld_ty)) + return tuple(result) + else: + return self.decode_element(obj, restype, context) diff --git a/weld-python/weld/grizzly/__init__.py b/weld-python/weld/grizzly/__init__.py index 963902a18..ab72d7cd0 100644 --- a/weld-python/weld/grizzly/__init__.py +++ b/weld-python/weld/grizzly/__init__.py @@ -1,2 +1,3 @@ +from weld.grizzly.core.frame import GrizzlyDataFrame from weld.grizzly.core.series import GrizzlySeries diff --git a/weld-python/weld/grizzly/core/forwarding.py b/weld-python/weld/grizzly/core/forwarding.py new file mode 100644 index 000000000..1c2a83e87 --- /dev/null +++ b/weld-python/weld/grizzly/core/forwarding.py @@ -0,0 +1,71 @@ +""" +Implements method forwarding from one class to another. + +The `Forwarding` class can be used as a mixin. + +""" + +import inspect +import warnings + +class Forwarding(object): + @classmethod + def _get_class_that_defined_method(cls, meth): + """ + Returns the class that defines the requested method. + + For methods that are defined outside of a particular set of + Grizzly-defined classes, Grizzly will first evaluate lazy results + before forwarding the data to the requested class. + + """ + if inspect.ismethod(meth): + for cls in inspect.getmro(meth.__self__.__class__): + if cls.__dict__.get(meth.__name__) is meth: + return cls + if inspect.isfunction(meth): + return getattr(inspect.getmodule(meth), + meth.__qualname__.split('.', 1)[0].rsplit('.', 1)[0]) + + @classmethod + def _requires_forwarding(cls, meth): + defined_in = cls._get_class_that_defined_method(meth) + if defined_in is not None and defined_in is not cls: + return True + else: + return False + + @classmethod + def _forward(cls, to_cls): + from functools import wraps + def forward_decorator(func): + @wraps(func) + def forwarding_wrapper(self, *args, **kwargs): + self.evaluate() + result = func(self, *args, **kwargs) + # Unsupported functions will return Series -- try to + # switch back to GrizzlySeries. + if not isinstance(result, cls) and isinstance(result, to_cls): + try_convert = cls(data=result.values, index=result.index) + if not isinstance(try_convert, cls): + warnings.warn("Unsupported operation '{}' produced unsupported Series: falling back to Pandas".format( + func.__name__)) + return try_convert + return result + return forwarding_wrapper + return forward_decorator + + @classmethod + def add_forwarding_methods(cls, to_cls): + """ + Add forwarding methods from this class to `to_cls`. + + """ + methods = dir(cls) + for meth in methods: + if meth.startswith("_"): + # We only want to do this for API methods. + continue + attr = getattr(cls, meth) + if cls._requires_forwarding(attr): + setattr(cls, meth, cls._forward(to_cls)(attr)) diff --git a/weld-python/weld/grizzly/core/frame.py b/weld-python/weld/grizzly/core/frame.py new file mode 100644 index 000000000..0ff1fd6d1 --- /dev/null +++ b/weld-python/weld/grizzly/core/frame.py @@ -0,0 +1,349 @@ +""" +A Weld wrapper for pandas.DataFrame. + +""" + +import numpy as np +import pandas as pd + +import weld.encoders.numpy as wenp +import weld.grizzly.weld.ops as weldops + +from weld.lazy import WeldLazy, identity +from weld.grizzly.core.forwarding import Forwarding +from weld.grizzly.core.generic import GrizzlyBase +from weld.grizzly.core.indexes import ColumnIndex +from weld.grizzly.core.series import GrizzlySeries +from weld.types import * + +class GrizzlyDataFrame(Forwarding, GrizzlyBase): + """ + An API-compatible DataFrame backed by Weld. + + DataFrames are dictionary-like containers of Series of the same length. + Each Series can be a different data type. Operations on DataFrames align on + column name. Unlike pandas, DataFrame operations do not align on row + indexes. + + Examples + -------- + >>> df = GrizzlyDataFrame({'name': ['mike', 'sam', 'sally'], 'age': [20, 22, 56]}) + >>> df + name age + 0 mike 20 + 1 sam 22 + 2 sally 56 + >>> df2 = GrizzlyDataFrame({'nom': ['jacques', 'kelly', 'marie'], 'age': [50, 60, 70]}) + >>> df.add(df2).to_pandas() + age name nom + 0 70 NaN NaN + 1 82 NaN NaN + 2 126 NaN NaN + + """ + + # Indicates that the length of a DataFrame is not known. Certain operations on DataFrames + # with this length are disallowed, since Grizzly (like Pandas) assumes each column in a DataFrame + # is of the same length. + UNKNOWN_LENGTH = -1 + + _encoder = wenp.NumPyWeldEncoder() + _decoder = wenp.NumPyWeldDecoder() + + # ------------------- GrizzlyBase methods ---------------------- + + @property + def weld_value(self): + if hasattr(self, "weld_value_"): + return self.weld_value_ + + if len(self.data) == 0: + raise GrizzlyError("weld_value cannot be accessed on DataFrame with no data") + output_type = WeldStruct([col.weld_value.output_type for col in self.data]) + self.weld_value_ = weldops.make_struct(*[col.weld_value for col in self.data])(output_type, GrizzlyDataFrame._decoder) + return self.weld_value_ + + @property + def is_value(self): + """ + Returns whether this DataFrame is a physical value. + + If this is True, evaluate() is guaranteed to be a no-op. + + Examples + -------- + >>> df = GrizzlyDataFrame({'name': ['sam', 'sally'], 'age': [25, 50]}) + >>> df.is_value + True + >>> df = df.eq(df) + >>> df.is_value + False + + """ + return self.pandas_df is not None or\ + all([child.is_identity for child in self.children]) + + def evaluate(self): + """ + Evaluates this `GrizzlyDataFrame` and returns itself. + + Evaluation reduces the currently stored computation to a physical value + by compiling and running a Weld program. If this `GrizzlyDataFrame` refers + to a physical value and no computation, no program is compiled, and this + method returns `self` unmodified. + + Returns + ------- + GrizzlyDataFrame + + """ + if not self.is_value: + if len(self.data) == 0: + # We're an empty DataFrame + self.pandas_df = pd.DataFrame() + return + # Collect each vector into a struct rather than evaluating each Series individually: + # this is more efficient so computations shared among vectors can be optimized. + result = self.weld_value.evaluate() + columns = result[0] + new_data = [] + length = None + for column in columns: + data = column.copy2numpy() + column_length = len(data) + if length is not None: + assert column_length == length, "invalid DataFrame produced after evaluation" + else: + length = column_length + series = GrizzlySeries(data) + new_data.append(series) + + # Columns is unchanged + self.pandas_df = None + self.data = new_data + self.length = length + # Reset the weld representation. + delattr(self, "weld_value_") + assert self.is_value + return self + + def to_pandas(self, copy=False): + """ + Evaluate and convert this GrizzlyDataFrame to a pandas DataFrame. + + Parameters + ---------- + copy : bool + whether to copy the data into the new DataFrame. This only guarantees + that a copy _will_ occur; it does not guarantee that a copy will not. + + Returns + ------- + pandas.DataFrame + + """ + self.evaluate() + if self.pandas_df is not None: + return self.pandas_df + col_to_data = dict() + for col in self.columns: + col_to_data[col] = self._col(col).values + self.pandas_df = pd.DataFrame(data=col_to_data, copy=copy) + return self.pandas_df + + # ------------------- Initialization ---------------------- + + def __init__(self, data, columns=None, _length=UNKNOWN_LENGTH, _fastpath=False): + if _fastpath: + assert all([isinstance(d, GrizzlySeries) for d in data]) + assert isinstance(columns, ColumnIndex) + self.data = data + self.columns = columns + self.length = _length + self.pandas_df = None + return + + self.data = [] + column_index = [] + # Keep a reference to the Pandas DataFrame. This should not consume any extra memory, since GrizzlySeries + # just keep references ot the same values. The only exception is for string data, where a copy will be created + # to force use of the 'S' dtype. + # + # TODO(shoumik): Convert string data to dtype 'S' here so it doesn't get copied when wrapping the Series as + # GrizzlySeries. + if isinstance(data, pd.DataFrame): + self.pandas_df = data + else: + self.pandas_df = pd.DataFrame(data, columns=columns) + self.length = len(self.pandas_df) + for (i, col) in enumerate(self.pandas_df): + grizzly_series = GrizzlySeries(self.pandas_df[col], name=self.pandas_df[col].name) + if not isinstance(grizzly_series, GrizzlySeries): + raise TypeError("Unsupported Series in DataFrame: {}".format(self.pandas_df[col])) + self.data.append(grizzly_series) + column_index.append(col) + + self.columns = ColumnIndex(column_index) + + def _col(self, col): + """ + Returns the column associated with the column name 'col'. + + """ + return self.data[self.columns[col]] + + # ------------------- Getting and Setting Items ---------------------- + + # TODO! + + # ------------------- Ops ---------------------- + + def explain(self): + """ + Prints a string that describes the operations to compute each column. + + If this DataFrame is a value, prints the data. + + """ + if self.pandas_df is not None: + print(self.pandas_df) + else: + for col in self.columns: + code = self._col(col).code + code = code.replace("\n", "\n\t") + print("{}: {}".format(col, code)) + + + def _arithmetic_binop_impl(self, other, series_op, fill=np.nan): + """ + Apply the binary operation between this DataFrame and other. + + Parameters + ---------- + other : DataFrame, scalar, GrizzlySeries + If other is a DataFrame, aligns on column name. If the binary + operator is not supported on any column, raises a TypeError. + series_op : func + The binary operation to apply. + compare : bool + whether this is a comparison operation. + + Returns + ------- + GrizzlyDataFrame + + """ + new_data = [] + if not isinstance(other, GrizzlyDataFrame): + for data in self.data: + new_data.append(series_op(data, other)) + return GrizzlyDataFrame(new_data, + columns=copy.deepcopy(self.columns), + _length=self.length, + _fastpath=True) + + new_cols = [] + for (col, left_slot, right_slot) in self.columns.zip(other.columns): + new_cols.append(col) + if left_slot is None or right_slot is None: + # TODO(shoumik): Handle this case by making a lazy computation. + assert self.length != GrizzlyDataFrame.UNKNOWN_LENGTH + dtype = np.array([fill]).dtype + vals = np.empty(self.length, dtype=dtype) + vals[:] = fill + new_data.append(GrizzlySeries(vals)) + else: + new_data.append(series_op(self.data[left_slot], other.data[right_slot])) + return GrizzlyDataFrame(new_data, + columns=ColumnIndex(new_cols), + _length=self.length, + _fastpath=True) + + def add(self, other): + return self._arithmetic_binop_impl(other, GrizzlySeries.add) + + def sub(self, other): + return self._arithmetic_binop_impl(other, GrizzlySeries.sub) + + def mod(self, other): + return self._arithmetic_binop_impl(other, GrizzlySeries.mod) + + def mul(self, other): + return self._arithmetic_binop_impl(other, GrizzlySeries.mul) + + def truediv(self, other): + return self._arithmetic_binop_impl(other, GrizzlySeries.truediv) + + def divide(self, other): + return self.truediv(other) + + def div(self, other): + return self.truediv(other) + + def eq(self, other): + return self._arithmetic_binop_impl(other, GrizzlySeries.eq, fill=False) + + def ne(self, other): + # Fill with True on this one. + return self._arithmetic_binop_impl(other, GrizzlySeries.ne, fill=True) + + def ge(self, other): + return self._arithmetic_binop_impl(other, GrizzlySeries.ge, fill=False) + + def gt(self, other): + return self._arithmetic_binop_impl(other, GrizzlySeries.gt, fill=False) + + def le(self, other): + return self._arithmetic_binop_impl(other, GrizzlySeries.le, fill=False) + + def lt(self, other): + return self._arithmetic_binop_impl(other, GrizzlySeries.lt, fill=False) + + def __add__(self, other): + return self.add(other) + + def __sub__(self, other): + return self.sub(other) + + def __mul__(self, other): + return self.mul(other) + + def __truediv__(self, other): + return self.truediv(other) + + def __divmod__(self, other): + return self.divmod(other) + + def __mod__(self, other): + return self.mod(other) + + def __eq__(self, other): + return self.eq(other) + + def __ne__(self, other): + return self.ne(other) + + def __ge__(self, other): + return self.ge(other) + + def __gt__(self, other): + return self.gt(other) + + def __le__(self, other): + return self.le(other) + + def __lt__(self, other): + return self.lt(other) + + def __str__(self): + if self.pandas_df is not None: + return str(self.pandas_df) + else: + return repr(self) + + def __repr__(self): + if self.pandas_df is not None: + return repr(self.pandas_df) + else: + return "GrizzlyDataFrame(lazy, {})".format([name for name in self.columns.columns]) + diff --git a/weld-python/weld/grizzly/core/generic.py b/weld-python/weld/grizzly/core/generic.py new file mode 100644 index 000000000..0ccaf7be1 --- /dev/null +++ b/weld-python/weld/grizzly/core/generic.py @@ -0,0 +1,109 @@ +""" +Base class for Grizzly collection types. + +""" + +from abc import ABC, abstractmethod + +class GrizzlyBase(ABC): + + @property + @abstractmethod + def weld_value(self): + """ + Returns the WeldLazy represention of this object. + + """ + pass + + @property + @abstractmethod + def is_value(self): + """ + Returns whether this collection wraps a physical value rather than + a computation. + """ + pass + + @abstractmethod + def evaluate(self): + """ + Evaluates this collection and returns itself. + + Evaluation reduces the currently stored computation to a physical value + by compiling and running a Weld program. If this collection refers + to a physical value and no computation, no program is compiled, and this + method returns `self` unmodified. + + """ + pass + + @abstractmethod + def to_pandas(self, copy=False): + """ + Evaluates this computation and coerces it into a pandas object. + + This is guaranteed to be 0-copy if `self.is_value == True`. Otherwise, + some allocation may occur. Regardless, `self` and the returned value + will always have the same underlying data unless `copy = True`. + + Parameters + ---------- + copy : boolean, optional + Specifies whether the new collection should copy data from self + + """ + pass + + @property + def children(self): + """ + Returns the Weld children of this value. + """ + return self.weld_value.children + + @property + def output_type(self): + """ + Returns the Weld output type of this collection. + + The output type is always a `WeldVec` of some type. + """ + return self.weld_value.output_type + + @property + def code(self): + """ + Returns the Weld code for this computation. + """ + return self.weld_value.code + + @classmethod + def scalar_ty(cls, value, cast_ty): + """ + Returns the scalar Weld type of a scalar Python value. If the value is not a scalar, + returns None. For primitive 'int' values, returns 'cast_ty'. + + This returns 'None' if the value type is not supported. + + Parameters + ---------- + value : any + value whose dtype to obtain. + + Returns + ------- + WeldType + + """ + if hasattr(value, 'dtype') and hasattr(value, 'shape') and value.shape == (): + return wenp.dtype_to_weld_type(value.dtype) + if isinstance(value, int): + return cast_ty + if isinstance(value, float): + return F64() + if isinstance(value, bool): + return Bool() + + + diff --git a/weld-python/weld/grizzly/core/indexes/__init__.py b/weld-python/weld/grizzly/core/indexes/__init__.py new file mode 100644 index 000000000..1be09ae96 --- /dev/null +++ b/weld-python/weld/grizzly/core/indexes/__init__.py @@ -0,0 +1,2 @@ + +from weld.grizzly.core.indexes.column import ColumnIndex diff --git a/weld-python/weld/grizzly/core/indexes/base.py b/weld-python/weld/grizzly/core/indexes/base.py new file mode 100644 index 000000000..f9c085c8c --- /dev/null +++ b/weld-python/weld/grizzly/core/indexes/base.py @@ -0,0 +1,9 @@ + +from abc import ABC + +class Index(ABC): + """ + Base class for an index in Grizzly. + + """ + pass diff --git a/weld-python/weld/grizzly/core/indexes/column.py b/weld-python/weld/grizzly/core/indexes/column.py new file mode 100644 index 000000000..66801670d --- /dev/null +++ b/weld-python/weld/grizzly/core/indexes/column.py @@ -0,0 +1,153 @@ +""" +Index used for access columns in Grizzly. + +""" + +from weld.grizzly.core.indexes.base import Index + +class ColumnIndex(Index): + """ + An index used for columns in a Grizzly DataFrame. + + Each index value is a Python object. For operations between two DataFrames + with the same ColumnIndex, the result will also have the same index. For + operations between two DataFrames with different ColumnIndex, the output + will have a join of the two ColumnIndex, sorted by the index values. + + Two ColumnIndex are equal if their index names are equal and have the same + order. + + Parameters + ---------- + columns : iterable + column names. + slots : iterable of int or None + slots associated with each column. If provided, the length must be + len(columns). This is used for underlying data access only; index + equality depends only on the column names and ordering. + + + Examples + -------- + >>> ColumnIndex(["name", "age"]) + ColumnIndex(['name', 'age'], [0, 1]) + >>> ColumnIndex(["name", "age"], slots=[1, 0]) + ColumnIndex(['name', 'age'], [1, 0]) + >>> ColumnIndex(["name", "age"], slots=[1, 2]) + Traceback (most recent call last): + ... + ValueError: slots must be contiguous starting at 0 + + """ + + def __init__(self, columns, slots=None): + if not isinstance(columns, list): + columns = list(columns) + if slots is not None: + assert len(columns) == len(slots) + sorted_slots = sorted(slots) + # Make sure each slot is occupied/there are no "holes". + if not sorted_slots == list(range(len(slots))): + raise ValueError("slots must be contiguous starting at 0") + else: + slots = range(len(columns)) + + # The original column order. + self.columns = columns + # The mapping from columns to slots. + self.index = dict(zip(columns, slots)) + + def __iter__(self): + """ + Iterates over columns in the order in which they appear in a DataFrame. + + Examples + -------- + >>> x = ColumnIndex(["name", "age"], slots=[1, 0]) + >>> [name for name in x] + ['name', 'age'] + + """ + for col in self.columns: + yield col + + def zip(self, other): + """ + Zips this index with 'other', returning an iterator of `(name, + slot_in_self, slot_in_other)`. The slot may be `None` if the name does + not appear in either column. + + The result columns are ordered in a way consistent with how DataFrame + columns should be be ordered (i.e., same order `self` if `self == + other`, and sorted by the union of columns from `self` and `other` + otherwise). + + Examples + -------- + >>> a = ColumnIndex(["name", "age"]) + >>> b = ColumnIndex(["name", "age"]) + >>> list(a.zip(b)) + [('name', 0, 0), ('age', 1, 1)] + >>> b = ColumnIndex(["income", "age", "name"]) + >>> list(a.zip(b)) + [('age', 1, 1), ('income', None, 0), ('name', 0, 2)] + + """ + if self == other: + for name in self.columns: + yield (name, self.index[name], other.index[name]) + else: + columns = sorted(list(set(self.columns).union(other.columns))) + for name in columns: + yield (name, self.index.get(name), other.index.get(name)) + + def __getitem__(self, key): + """ + Get the slot for a paritcular column name. + + Examples + -------- + >>> a = ColumnIndex(["name", "age"]) + >>> a["age"] + 1 + + """ + return self.index[key] + + def append(self, key): + """ + Add a new column to the index. The slot is set to be `len(columns) - 1`. + + Examples + -------- + >>> a = ColumnIndex(["name", "age"]) + >>> a.append("income") + >>> a["income"] + 2 + + """ + self.index[key] = len(self.columns) + self.columns.append(key) + + def __eq__(self, other): + """ + Compare equality depending on column names. + + Examples + -------- + >>> a = ColumnIndex(["name", "age"]) + >>> a == ColumnIndex(["name", "age"]) + True + >>> a == ColumnIndex(["age", "name"]) + False + >>> a == ColumnIndex(["name", "age", "income"]) + False + + """ + return isinstance(other, ColumnIndex) and self.columns == other.columns + + def __str__(self): + return repr(self) + + def __repr__(self): + return "ColumnIndex({}, {})".format(self.columns, [self.index[col] for col in self.columns]) diff --git a/weld-python/weld/grizzly/core/series.py b/weld-python/weld/grizzly/core/series.py index 81a3e576d..a9abd2356 100644 --- a/weld-python/weld/grizzly/core/series.py +++ b/weld-python/weld/grizzly/core/series.py @@ -1,11 +1,10 @@ """ A Weld wrapper for pandas.Series. + """ -import inspect import numpy as np import pandas as pd -import warnings import weld.encoders.numpy as wenp import weld.grizzly.weld.agg as weldagg @@ -13,20 +12,12 @@ from weld.lazy import PhysicalValue, WeldLazy, WeldNode, identity from weld.grizzly.weld.ops import * from weld.grizzly.core.error import GrizzlyError +from weld.grizzly.core.forwarding import Forwarding +from weld.grizzly.core.generic import GrizzlyBase from weld.grizzly.core.strings import StringMethods from weld.types import * -def _grizzlyseries_constructor_with_fallback(data=None, **kwargs): - """ - A flexible constructor for Series._constructor, which needs to be able - to fall back to a Series (if a certain operation cannot produce GrizzlySeries). - """ - try: - return GrizzlySeries(data=data, **kwargs) - except TypeError: - return pd.Series(data=data, **kwargs) - -class GrizzlySeries(pd.Series): +class GrizzlySeries(Forwarding, GrizzlyBase): """ A lazy `Series` object backed by a Weld computation. @@ -36,6 +27,31 @@ class GrizzlySeries(pd.Series): regular pandas behavior. Similarly, `GrizzlySeries` that are initialized with features that Grizzly does not understand are initialized as `pd.Series`. + Implementation Notes + -------------------- + + A Series in Grizzly is a 1D array of a single data type. Transformations on + Series can result in scalar values, which are directly evaluated and + returned as scalars, as other Series that encapsulate lazy Weld + computations, or as DataFrames (currently unsupported). + + A Series always has a known type: unlike pandas Series, which are wrappers + around Python object, Series with heterogeneous types (i.e., Series with + `dtype=object`) are disallowed. Series that are not evaluated/store a lazy + compuations also always have a known type. + + Internally, most Series are backed by NumPy arrays, which themselves are + stored as contiguous C arrays. The main exception is string data. Grizzly + operates exclusively over UTF-8 encoded data, and most string operations + will create a copy of the input data upon evaluation. Strings are currently + encoded as NumPy arrays with the 'S' dtype: this may change in the future + (the 'S' dtype forces memory usage per string to match the length of the + largest string in the array, which is clearly suboptimal). + + Unlike pandas Series, GrizzlySeries do not support alignment of index + values for performance reasons. To align the indices of two Series, consider + using a DataFrame with a join operator. + Parameters ---------- @@ -43,65 +59,57 @@ class GrizzlySeries(pd.Series): Contains data stored in the series. dtype : numpy.dtype or str Data type of the values. + name : str + A name to give the series. Examples -------- - >>> x = GrizzlySeries([1,2,3]) + >>> x = GrizzlySeries([1,2,3], name="numbers") >>> x 0 1 1 2 2 3 - dtype: int64 + Name: numbers, dtype: int64 """ - # TODO(shoumik): Does this need to be in _metadata instead? - _internal_names = pd.Series._internal_names + ['weld_value_'] - _internal_names_set = set(_internal_names) - # The encoders and decoders are stateless, so no need to instantiate them # for each computation. _encoder = wenp.NumPyWeldEncoder() _decoder = wenp.NumPyWeldDecoder() - # ---------------------- WeldNode abstract methods --------------------- - @property - def children(self): - """ - Returns the Weld children of this `GrizzlySeries. - """ - return self.weld_value_.children + def weld_value(self): + return self.weld_value_ + + @weld_value.setter + def weld_value(self, value): + self.weld_value_ = value @property - def output_type(self): + def is_value(self): """ - Returns the Weld output type of this `GrizzlySeries`. - - The output type is always a `WeldVec` of some type. + Returns whether this collection wraps a physical value rather than + a computation. """ - return self.weld_value_.output_type + return self.weld_value.is_identity or hasattr(self, "evaluating_") @property def elem_type(self): """ Returns the element type of this `GrizzlySeries`. """ - return self.weld_value_.output_type.elem_type + return self.output_type.elem_type @property - def is_value(self): - """ - Returns whether this `GrizzlySeries` wraps a physical value rather than - a computation. + def dtype(self): """ - return self.weld_value_.is_identity or hasattr(self, "evaluating_") + Returns the NumPy dtype of this Series. - @property - def code(self): - """ - Returns the Weld code for this computation. """ - return self.weld_value_.code + elem_type = self.elem_type + if elem_type == WeldVec(I8()): + return np.dtype('S') + return wenp.weld_type_to_dtype(elem_type) @property def values(self): @@ -126,7 +134,7 @@ def values(self): """ if not self.is_value: raise GrizzlyError("GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first.") - return super(GrizzlySeries, self).values + return self.values_ def evaluate(self): """ @@ -139,16 +147,16 @@ def evaluate(self): """ if not self.is_value: - result = self.weld_value_.evaluate() + result = self.weld_value.evaluate() # TODO(shoumik): it's unfortunate that this copy is needed, but # things are breaking without it (even if we hold a reference to # the WeldContext). DEBUG ME! if isinstance(result[0], wenp.weldbasearray): - super(GrizzlySeries, self).__init__(result[0].copy2numpy()) + self.__init__(result[0].copy2numpy(), name=self.name) else: - super(GrizzlySeries, self).__init__(result[0]) + self.__init__(result[0], name=self.name) setattr(self, "evaluating_", 0) - self.weld_value_ = identity(PhysicalValue(self.values,\ + self.weld_value = identity(PhysicalValue(self.values,\ self.output_type, GrizzlySeries._encoder), GrizzlySeries._decoder) delattr(self, "evaluating_") @@ -190,33 +198,27 @@ def to_pandas(self, copy=False): """ self.evaluate() assert self.is_value - return pd.Series(data=self.array, - index=self.index, - copy=copy, - fastpath=True) - - # ------------- pd.Series methods required for subclassing ---------- - - @property - def _constructor(self): - return _grizzlyseries_constructor_with_fallback - - @property - def _constructor_expanddim(self): - return NotImplemented + return pd.Series(data=self.values_, name=self.name, copy=copy) # ---------------------- Class methods ------------------------------ @classmethod - def _supports_grizzly(cls, data): + def supported(cls, data): """ - Returns the Weld type if data supports Grizzly operation, or returns - `None` otherwise. + Returns whether the given ndarray supports Grizzly operation. Parameters ---------- data : np.array + Examples + ------- + >>> GrizzlySeries.supported(np.array([1,2,3], dtype='int32')) + vec[i32] + >>> GrizzlySeries.supported(np.array(["foo","bar"], dtype='object')) + >>> GrizzlySeries.supported(np.array(["foo","bar"], dtype='S')) + vec[vec[i8]] + """ if not isinstance(data, np.ndarray) or data.ndim != 1: return None @@ -228,13 +230,9 @@ def _supports_grizzly(cls, data): # ---------------------- Initialization ------------------------------ - def __init__(self, data, dtype=None, index=None, **kwargs): - # Everything important is done in __new__. - pass - - def __new__(cls, data, dtype=None, index=None, **kwargs): + def __init__(self, data, dtype=None, name=None): """ - Internal initialization. Tests below are for internal visibility only. + Initialize a new GrizzlySeries. >>> x = GrizzlySeries([1,2,3]) >>> x @@ -242,64 +240,46 @@ def __new__(cls, data, dtype=None, index=None, **kwargs): 1 2 2 3 dtype: int64 - >>> x.__class__ - - >>> x = GrizzlySeries(np.ones(5)) - >>> x.__class__ - - >>> y = GrizzlySeries(['hi', 'bye']) - >>> y.__class__ - - >>> y = GrizzlySeries([1, 2, 3], index=[1, 0, 2]) # Unsupported - >>> y.__class__ - + """ + s = None if isinstance(data, WeldLazy): - self = super(GrizzlySeries, cls).__new__(cls) - super(GrizzlySeries, self).__init__(np.array([], dtype=dtype), **kwargs) - self.weld_value_ = data - return self - - if index is not None and not isinstance(index, pd.RangeIndex): - # TODO(shoumik): This is probably incomplete, since we could have a - # RangeIndex that does not capture the full span of the data, has a - # non-zero step, etc. - return pd.Series(data, dtype=dtype, index=index, **kwargs) - - if len(kwargs) != 0: - # Unsupported arguments present: bail for now. - return pd.Series(data, dtype=dtype, index=index, **kwargs) + self.name = name + self.values_ = None + self.weld_value = data + return + self.name = name if isinstance(data, list) and len(data) > 0 and isinstance(data[0], str): # Try to convert a list of strings into a supported Numpy array. - data = np.array(data, dtype='S') - - if isinstance(data, pd.Series): - data = data.values + self.values_ = np.array(data, dtype='S') + elif isinstance(data, pd.Series): + if self.name is None: + self.name = data.name + if data.values.dtype == 'object' and len(data) > 0 and isinstance(data[0], str): + self.values_ = np.array(data, dtype='S') + else: + self.values_ = data.values elif not isinstance(data, np.ndarray): - # First, convert the input into a Series backed by an ndarray. - s = pd.Series(data, dtype=dtype, index=index, **kwargs) - data = s.values + # First, convert the input into a Numpy array. + self.values_ = np.array(data, dtype=dtype) + else: + self.values_ = data # Try to create a Weld type for the input. - weld_type = GrizzlySeries._supports_grizzly(data) - if weld_type is not None: - self = super(GrizzlySeries, cls).__new__(cls) - super(GrizzlySeries, self).__init__(data, dtype=dtype, **kwargs) - self.weld_value_ = identity( - PhysicalValue(data, weld_type, GrizzlySeries._encoder), + weld_type = GrizzlySeries.supported(self.values_) + if weld_type: + self.weld_value = identity( + PhysicalValue(self.values_, weld_type, GrizzlySeries._encoder), GrizzlySeries._decoder) - return self - - # Don't re-convert values if we did it once already -- it's expensive. - return s if s is not None else pd.Series(data, dtype=dtype, index=index, **kwargs) + else: + raise GrizzlyError("unsupported data type '{}'".format(self.values_.dtype)) # ---------------------- StringMethods ------------------------------ @property def str(self): - # TODO(shoumik.palkar): Use pandas.core.accessor.CachedAccessor? return StringMethods(self) # ---------------------- Aggregation ------------------------------ @@ -337,9 +317,9 @@ def agg(self, funcs): # Result is a primitive, so we can use the default primitive decoder. decoder = None - result_weld_value = weldagg.agg(self.weld_value_, self.elem_type, funcs)(output_type, decoder) + result_weld_value = weldagg.agg(self.weld_value, self.elem_type, funcs)(output_type, decoder) if decoder is not None: - return GrizzlySeries(result_weld_value, dtype=wenp.weld_type_to_dtype(output_elem_type)) + return GrizzlySeries(result_weld_value, dtype=wenp.weld_type_to_dtype(output_elem_type), name=self.name) else: # TODO(shoumik.palkar): Do we want to evaluate here? For now, we will, but eventually it may # be advantageous to have a lazy scalar value that can be used elsewhere. @@ -536,7 +516,7 @@ def __getitem__(self, key): """ # If the key is a scalar - scalar_key = GrizzlySeries._scalar_ty(key, I64()) + scalar_key = GrizzlySeries.scalar_ty(key, I64()) if isinstance(scalar_key, I64): self.evaluate() return self.values[key] @@ -552,7 +532,7 @@ def normalize_slice_arg(arg, default=None): arg = default else: raise GrizzlyError("slice got 'None' when value where expected") - arg_ty = GrizzlySeries._scalar_ty(arg, I64()) + arg_ty = GrizzlySeries.scalar_ty(arg, I64()) if not isinstance(arg_ty, I64): raise GrizzlyError("slices in __getitem__() must be integers") return int(arg) @@ -565,8 +545,8 @@ def normalize_slice_arg(arg, default=None): start = normalize_slice_arg(key.start, default=0) stop = normalize_slice_arg(key.stop, default=None) # Weld's slice operator takes a size instead of a stopping index. - lazy = slice_expr(self.weld_value_, start, stop - start)(self.output_type, GrizzlySeries._decoder) - return GrizzlySeries(lazy, dtype=self.dtype) + lazy = slice_expr(self.weld_value, start, stop - start)(self.output_type, GrizzlySeries._decoder) + return GrizzlySeries(lazy, dtype=self.dtype, name=self.name) if not isinstance(key, GrizzlySeries): raise GrizzlyError("array-like key in __getitem__ must be a GrizzlySeries.") @@ -599,47 +579,37 @@ def mask(self, bool_mask, other=None): """ assert other is None if not isinstance(bool_mask, GrizzlySeries) or\ - not isinstance(bool_mask.output_type.elem_type, Bool): + not isinstance(bool_mask.elem_type, Bool): raise GrizzlyError("bool_mask must be a GrizzlySeries with dtype=bool") - lazy = mask(self.weld_value_, self.output_type.elem_type, bool_mask.weld_value_)(self.output_type, GrizzlySeries._decoder) - return GrizzlySeries(lazy, dtype=self.dtype) + lazy = mask(self.weld_value, self.elem_type, bool_mask.weld_value)(self.output_type, GrizzlySeries._decoder) + return GrizzlySeries(lazy, dtype=self.dtype, name=self.name) # ---------------------- Operators ------------------------------ - @classmethod - def _scalar_ty(cls, value, cast_ty): + def _arithmetic_binop_impl(self, other, op, truediv=False, weld_elem_type=None): """ - Returns the scalar Weld type of a scalar Python value. If the value is not a scalar, - returns None. For primitive 'int' values, returns 'cast_ty'. - - This returns 'None' if the value type is not supported. + Performs the operation on two `Series` elementwise. Parameters ---------- - value : any - value whose dtype to obtain. + other: scalar or GrizzlySeries + the RHS operand + op: str + the operator to apply. + truediv: bool, optional + is this a truediv? + weld_elem_type: WeldType or None + the element type produced by this operation. None + means it is inferred based on Numpy's type conversion rules. Returns ------- - np.dtype + GrizzlySeries """ - if hasattr(value, 'dtype') and hasattr(value, 'shape') and value.shape == (): - return wenp.dtype_to_weld_type(value.dtype) - if isinstance(value, int): - return cast_ty - if isinstance(value, float): - return F64() - if isinstance(value, bool): - return Bool() - - def _arithmetic_binop_impl(self, other, op, truediv=False, weld_elem_type=None): - """ - Performs the operation on two `Series` elementwise. - """ - left_ty = self.output_type.elem_type - scalar_ty = GrizzlySeries._scalar_ty(other, left_ty) + left_ty = self.elem_type + scalar_ty = GrizzlySeries.scalar_ty(other, left_ty) if scalar_ty is not None: # Inline scalars directly instead of adding them # as dependencies. @@ -650,19 +620,28 @@ def _arithmetic_binop_impl(self, other, op, truediv=False, weld_elem_type=None): # GrizzlySeries. if not isinstance(other, GrizzlySeries): raise GrizzlyError("RHS of binary operator must be a GrizzlySeries") - right_ty = other.output_type.elem_type - rightval = other.weld_value_ + right_ty = other.elem_type + rightval = other.weld_value + + is_string = isinstance(left_ty, WeldVec) + if op != "==" and op != "!=" and is_string: + raise TypeError("Unsupported operand type(s) for '{}': {} and {}".format(op, left_ty, right_ty)) + + # Don't cast if we're dealing with strings. + if is_string: + cast_type = "" + else: + cast_type = wenp.binop_output_type(left_ty, right_ty, truediv) - cast_type = wenp.binop_output_type(left_ty, right_ty, truediv) output_type = cast_type if weld_elem_type is None else weld_elem_type lazy = binary_map(op, left_type=str(left_ty), right_type=str(right_ty), - leftval=self.weld_value_, + leftval=self.weld_value, rightval=rightval, scalararg=scalar_ty is not None, cast_type=cast_type)(WeldVec(output_type), GrizzlySeries._decoder) - return GrizzlySeries(lazy, dtype=wenp.weld_type_to_dtype(output_type)) + return GrizzlySeries(lazy, dtype=wenp.weld_type_to_dtype(output_type), name=self.name) def _compare_binop_impl(self, other, op): """ @@ -671,39 +650,211 @@ def _compare_binop_impl(self, other, op): return self._arithmetic_binop_impl(other, op, weld_elem_type=Bool()) def add(self, other): + """ + Performs an element-wise addition. + + Examples + -------- + >>> s = GrizzlySeries([10, 20, 30]) + >>> s.add(10).evaluate() + 0 20 + 1 30 + 2 40 + dtype: int64 + + """ return self._arithmetic_binop_impl(other, '+') def sub(self, other): + """ + Performs an element-wise subtraction. + + Examples + -------- + >>> s = GrizzlySeries([10, 20, 30]) + >>> s.sub(10).evaluate() + 0 0 + 1 10 + 2 20 + dtype: int64 + + """ return self._arithmetic_binop_impl(other, '-') def mod(self, other): + """ + Performs an element-wise modulo. + + Examples + -------- + >>> s = GrizzlySeries([10, 25, 31]) + >>> s.mod(10).evaluate() + 0 0 + 1 5 + 2 1 + dtype: int64 + + """ return self._arithmetic_binop_impl(other, '%') def mul(self, other): + """ + Performs an element-wise multiplication. + + Examples + -------- + >>> s = GrizzlySeries([10, 25, 31]) + >>> s.mul(10).evaluate() + 0 100 + 1 250 + 2 310 + dtype: int64 + + """ return self._arithmetic_binop_impl(other, '*') def truediv(self, other): + """ + Performs an element-wise "true" division. + + Examples + -------- + >>> s = GrizzlySeries([10, 20, 30]) + >>> s.truediv(10).evaluate() + 0 1.0 + 1 2.0 + 2 3.0 + dtype: float64 + + """ return self._arithmetic_binop_impl(other, '/', truediv=True) def divide(self, other): + """ + Performs an element-wise "true" division. + + Examples + -------- + >>> s = GrizzlySeries([10, 20, 30]) + >>> s.divide(10).evaluate() + 0 1.0 + 1 2.0 + 2 3.0 + dtype: float64 + + """ return self.truediv(other) def div(self, other): + """ + Performs an element-wise "true" division. + + Examples + -------- + >>> s = GrizzlySeries([10, 20, 30]) + >>> s.div(10).evaluate() + 0 1.0 + 1 2.0 + 2 3.0 + dtype: float64 + + """ return self.truediv(other) def eq(self, other): + """ + Performs an element-wise equality comparison. + + Examples + -------- + >>> s = GrizzlySeries([10, 20, 30]) + >>> s.eq(10).evaluate() + 0 True + 1 False + 2 False + dtype: bool + + """ return self._compare_binop_impl(other, '==') + def ne(self, other): + """ + Performs an element-wise not-equal comparison. + + Examples + -------- + >>> s = GrizzlySeries([10, 20, 30]) + >>> s.ne(10).evaluate() + 0 False + 1 True + 2 True + dtype: bool + + """ + return self._compare_binop_impl(other, '!=') + def ge(self, other): + """ + Performs an element-wise '>=' comparison. + + Examples + -------- + >>> s = GrizzlySeries([10, 20, 30]) + >>> s.ge(20).evaluate() + 0 False + 1 True + 2 True + dtype: bool + + """ return self._compare_binop_impl(other, '>=') def gt(self, other): + """ + Performs an element-wise '>' comparison. + + Examples + -------- + >>> s = GrizzlySeries([10, 20, 30]) + >>> s.gt(20).evaluate() + 0 False + 1 False + 2 True + dtype: bool + + """ return self._compare_binop_impl(other, '>') def le(self, other): + """ + Performs an element-wise '<=' comparison. + + Examples + -------- + >>> s = GrizzlySeries([10, 20, 30]) + >>> s.le(20).evaluate() + 0 True + 1 True + 2 False + dtype: bool + + """ return self._compare_binop_impl(other, '<=') def lt(self, other): + """ + Performs an element-wise '<' comparison. + + Examples + -------- + >>> s = GrizzlySeries([10, 20, 30]) + >>> s.lt(20).evaluate() + 0 True + 1 False + 2 False + dtype: bool + + """ return self._compare_binop_impl(other, '<') def __add__(self, other): @@ -727,6 +878,9 @@ def __mod__(self, other): def __eq__(self, other): return self.eq(other) + def __ne__(self, other): + return self.ne(other) + def __ge__(self, other): return self.ge(other) @@ -742,70 +896,12 @@ def __lt__(self, other): def __str__(self): if self.is_value: # This is guaranteed to be 0-copy. - return str(self.to_pandas()) - return "GrizzlySeries({}, dtype: {}, deps: [{}])".format( - self.weld_value_.expression, - str(wenp.weld_type_to_dtype(self.output_type.elem_type)), + return str(self.to_pandas(copy=False)) + return "GrizzlySeries({}, name: {}, dtype: {}, deps: [{}])".format( + self.weld_value.expression, + self.name, + str(self.dtype), ", ".join([str(child.id) for child in self.children])) def __repr__(self): return str(self) - - # ---------------------- Method forwarding setup ------------------------------ - - @classmethod - def _get_class_that_defined_method(cls, meth): - """ - Returns the class that defines the requested method. For methods that are - defined outside of a particular set of Grizzly-defined classes, Grizzly will - first evaluate lazy results before forwarding the data to `pandas.Series`. - """ - if inspect.ismethod(meth): - for cls in inspect.getmro(meth.__self__.__class__): - if cls.__dict__.get(meth.__name__) is meth: - return cls - if inspect.isfunction(meth): - return getattr(inspect.getmodule(meth), - meth.__qualname__.split('.', 1)[0].rsplit('.', 1)[0]) - - @classmethod - def _requires_forwarding(cls, meth): - defined_in = cls._get_class_that_defined_method(meth) - if defined_in is not None and defined_in is not cls: - return True - else: - return False - - @classmethod - def forward(cls): - from functools import wraps - def forward_decorator(func): - @wraps(func) - def forwarding_wrapper(self, *args, **kwargs): - self.evaluate() - result = func(self, *args, **kwargs) - # Unsupported functions will return Series -- try to - # switch back to GrizzlySeries. - if not isinstance(result, GrizzlySeries) and isinstance(result, pd.Series): - try_convert = GrizzlySeries(data=result.values, index=result.index) - if not isinstance(try_convert, GrizzlySeries): - warnings.warn("Unsupported operation '{}' produced unsupported Series: falling back to Pandas".format( - func.__name__)) - return try_convert - return result - return forwarding_wrapper - return forward_decorator - - @classmethod - def _add_forwarding_methods(cls): - methods = dir(cls) - for meth in methods: - if meth.startswith("_"): - # We only want to do this for API methods. - continue - attr = getattr(cls, meth) - if cls._requires_forwarding(attr): - setattr(cls, meth, cls.forward()(attr)) - -# Wrap public API methods for forwarding -GrizzlySeries._add_forwarding_methods() diff --git a/weld-python/weld/grizzly/weld/ops.py b/weld-python/weld/grizzly/weld/ops.py index 70e3b0bec..ae31686fd 100644 --- a/weld-python/weld/grizzly/weld/ops.py +++ b/weld-python/weld/grizzly/weld/ops.py @@ -35,6 +35,22 @@ def _binary_apply(op, leftval, rightval, cast_type, infix=True): return "{op}({cast_type}({leftval}), {cast_type}({rightval}))".format( op=op, leftval=leftval, rightval=rightval, cast_type=cast_type) +@weld.lazy.weldfunc +def make_struct(*args): + """ + Constructs a struct with the provided args. + + Examples + -------- + >>> make_struct("weldlazy1", "2", "3").code + '{weldlazy1, 2, 3}' + >>> make_struct("weldlazy1").code + '{weldlazy1}' + + """ + assert len(args) > 0 + return "{" + ", ".join(args) + "}" + @weld.lazy.weldfunc def unary_map(op, ty, value): """ diff --git a/weld-python/weld/types.py b/weld-python/weld/types.py index 1bd29dc18..5ae6ca691 100644 --- a/weld-python/weld/types.py +++ b/weld-python/weld/types.py @@ -22,6 +22,9 @@ def __str__(self): """ pass + def __repr__(self): + return str(self) + def __eq__(self, other): return str(self) == str(other)