DataFrame foundations (#510)

weld-project · Apr 4, 2020 · dcbba9a · dcbba9a
1 parent f5f9586
commit dcbba9a
Show file tree

Hide file tree

Showing 22 changed files with 1,273 additions and 288 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,7 @@ Cargo.lock
 .#*
 *~
 *.swp
+*.swo
 *.bc
 *.pyc
 *.o

diff --git a/weld-python/tests/grizzly/core/test_frame.py b/weld-python/tests/grizzly/core/test_frame.py
@@ -0,0 +1,100 @@
+"""
+Test basic DataFrame functionality.
+
+"""
+
+import pandas as pd
+import pytest
+import weld.grizzly as gr
+
+def get_frames(cls, strings):
+    """
+    Returns two DataFrames for testing binary operators.
+
+    The DataFrames have columns of overlapping/different names, types, etc.
+
+    """
+    df1 = pd.DataFrame({
+        'name': ['Bob', 'Sally', 'Kunal', 'Deepak', 'James', 'Pratiksha'],
+        'lastName': ['Kahn', 'Lopez', 'Smith', 'Narayanan', 'Thomas', 'Thaker'],
+        'age': [20, 30, 35, 20, 50, 35],
+        'score': [20.0, 30.0, 35.0, 50.0, 35.0, 25.0]
+        })
+    df2 = pd.DataFrame({
+        'firstName': ['Bob', 'Sally', 'Kunal', 'Deepak', 'James', 'Pratiksha'],
+        'lastName': ['Kahn', 'Lopez', 'smith', 'narayanan', 'Thomas', 'thaker'],
+        'age': [25, 30, 45, 20, 60, 35],
+        'scores': [20.0, 30.0, 35.0, 50.0, 35.0, 25.0]
+        })
+    if not strings:
+        df1 = df1.drop(['name', 'lastName'], axis=1)
+        df2 = df2.drop(['firstName', 'lastName'], axis=1)
+    return (cls(df1), cls(df2))
+
+def _test_binop(pd_op, gr_op, strings=True):
+    """
+    Test a binary operator.
+
+    Binary operators align on column name. For columns that don't exist in both
+    DataFrames, the column is filled with NaN (for non-comparison operations)
+    or False (for comparison operations).
+
+    If the RHS is a Series, the Series should be added to all columns.
+
+    """
+    df1, df2 = get_frames(pd.DataFrame, strings)
+    gdf1, gdf2 = get_frames(gr.GrizzlyDataFrame, strings)
+
+    expect = pd_op(df1, df2)
+    result = gr_op(gdf1, gdf2).to_pandas()
+    assert expect.equals(result)
+
+def test_evaluation():
+    # Test to make sure that evaluating a DataFrame once caches the result,
+    # so that evaluating it again doesn't cause another evaluation.
+    df1 = gr.GrizzlyDataFrame({
+        'age': [20, 30, 35, 20, 50, 35],
+        'score': [20.0, 30.0, 35.0, 50.0, 35.0, 25.0]
+        })
+    df2 = gr.GrizzlyDataFrame({
+        'age': [20, 30, 35, 20, 50, 35],
+        'scores': [20.0, 30.0, 35.0, 50.0, 35.0, 25.0]
+        })
+    df3 = (df1 + df2) * df2 + df1 / df2
+    assert not df3.is_value
+    df3.evaluate()
+    assert df3.is_value
+    weld_value = df3.weld_value
+    df3.evaluate()
+    # The same weld_value should be returned.
+    assert weld_value is df3.weld_value
+
+def test_add():
+    _test_binop(pd.DataFrame.add, gr.GrizzlyDataFrame.add, strings=False)
+
+def test_sub():
+    _test_binop(pd.DataFrame.sub, gr.GrizzlyDataFrame.sub, strings=False)
+
+def test_mul():
+    _test_binop(pd.DataFrame.mul, gr.GrizzlyDataFrame.mul, strings=False)
+
+def test_div():
+    _test_binop(pd.DataFrame.div, gr.GrizzlyDataFrame.div, strings=False)
+
+def test_eq():
+    _test_binop(pd.DataFrame.eq, gr.GrizzlyDataFrame.eq, strings=True)
+
+def test_ne():
+    _test_binop(pd.DataFrame.ne, gr.GrizzlyDataFrame.ne, strings=True)
+
+def test_le():
+    _test_binop(pd.DataFrame.le, gr.GrizzlyDataFrame.le, strings=False)
+
+def test_lt():
+    _test_binop(pd.DataFrame.lt, gr.GrizzlyDataFrame.lt, strings=False)
+
+def test_ge():
+    _test_binop(pd.DataFrame.ge, gr.GrizzlyDataFrame.ge, strings=False)
+
+def test_gt():
+    _test_binop(pd.DataFrame.gt, gr.GrizzlyDataFrame.gt, strings=False)
diff --git a/weld-python/tests/grizzly/core/test_series.py b/weld-python/tests/grizzly/core/test_series.py
@@ -85,25 +85,6 @@ def eval_expression(cls):
         yield a + b + c * d - e
     _compare_vs_pandas(eval_expression)
 
-def test_basic_fallback():
-    # Tests basic unsupported functionality.
-    # NOTE: This test will need to change as more features are added...
-    def eval_expression(cls):
-        a = cls([1, 2, 3])
-        b = cls([-4, 5, -6])
-        # Test 1: abs()
-        c = a + b
-        yield (c.abs() + a)
-        # Test 2: argmin()
-        c = a + b
-        yield cls(c.argmin())
-        # Test 3: reindex()
-        c = a + b
-        res = c.reindex(index=[2, 0, 1])
-        # Falls back to Pandas, since we don't support indices.
-        assert isinstance(res, pd.Series)
-    _compare_vs_pandas(eval_expression)
-
 def test_scalar():
     types = ['int8', 'uint8', 'int16', 'uint16', 'int32',\
             'uint32', 'int64', 'uint64', 'float32', 'float64']
@@ -129,10 +110,28 @@ def test_indexing():
     assert np.array_equal(x[x == 2].evaluate().values, np.array([2], dtype='int64'))
     assert np.array_equal(x[x < 0].evaluate().values, np.array([], dtype='int64'))
 
+def test_name():
+    # Test that names propagate after operations.
+    x = gr.GrizzlySeries([1,2,3], name="testname")
+    y = x + x
+    assert y.evaluate().name == "testname"
+    y = x.agg(['sum', 'count'])
+    assert y.evaluate().name == "testname"
+    y = x[:2]
+    assert y.evaluate().name == "testname"
+    y = x[x == 1]
+    assert y.evaluate().name == "testname"
+
+
 def test_unsupported_binop_error():
     # Test unsupported
     from weld.grizzly.core.error import GrizzlyError
     with pytest.raises(GrizzlyError):
         a = gr.GrizzlySeries([1,2,3])
         b = pd.Series([1,2,3])
         a.add(b)
+
+    with pytest.raises(TypeError):
+        a = gr.GrizzlySeries(["hello", "world"])
+        b = gr.GrizzlySeries(["hello", "world"])
+        a.divide(b)
diff --git a/weld-python/tests/grizzly/core/test_strings.py b/weld-python/tests/grizzly/core/test_strings.py
@@ -64,6 +64,14 @@ def test_get():
     pandas_result = pd.Series(expect)
     assert pandas_result.equals(grizzly_result)
 
+def test_eq():
+    left = ["hello", "world", "strings", "morestrings"]
+    right = ["hel", "world", "string", "morestrings"]
+    x = gr.GrizzlySeries(left)
+    y = gr.GrizzlySeries(right)
+    assert list(x.eq(y).evaluate().values) == [False, True, False, True]
+    assert list(x.ne(y).evaluate().values) == [True, False, True, False]
+
 def test_strip():
     compare_vs_pandas('strip', ["",
     "   hi   ",

diff --git a/weld-python/tests/weld/core/test_lazy.py b/weld-python/tests/weld/core/test_lazy.py
@@ -2,7 +2,7 @@
 Tests for constructing and evaluating lazy operations.
 """
 
-from weld.encoders import PrimitiveWeldEncoder, PrimitiveWeldDecoder
+from weld.encoders.primitives import PrimitiveWeldEncoder, PrimitiveWeldDecoder
 from weld.types import *
 from weld.lazy import *
 

diff --git a/weld-python/tests/weld/encoders/test_numpy.py b/weld-python/tests/weld/encoders/test_numpy.py
@@ -99,6 +99,11 @@ def test_float32_vec():
 def test_float64_vec():
     encdec(array('float64'), WeldVec(F64()))
 
+def test_struct_of_vecs():
+    arrays = (array('float32'), array('uint16'), array('uint32'))
+    ty = WeldStruct([WeldVec(F32()), WeldVec(U16()), WeldVec(U32())])
+    encdec(arrays, ty)
+
 def test_type_conversions():
     types = ['bool', 'int8', 'uint8', 'int16', 'uint16',
             'int32', 'uint32', 'int64', 'uint64', 'float32', 'float64']

diff --git a/weld-python/tests/weld/encoders/test_primitives.py b/weld-python/tests/weld/encoders/test_primitives.py
@@ -5,7 +5,7 @@
 import ctypes
 
 from .helpers import encdec_factory
-from weld.encoders import PrimitiveWeldEncoder, PrimitiveWeldDecoder
+from weld.encoders.primitives import PrimitiveWeldEncoder, PrimitiveWeldDecoder
 from weld.types import *
 
 encdec = encdec_factory(PrimitiveWeldEncoder, PrimitiveWeldDecoder)

diff --git a/weld-python/weld/compile.py b/weld-python/weld/compile.py
@@ -4,10 +4,10 @@
 
 """
 
-from .core import *
-from .encoders import WeldEncoder, WeldDecoder, PrimitiveWeldEncoder,\
-        PrimitiveWeldDecoder
-from .types import WeldType
+from weld.core import *
+from weld.encoders import WeldEncoder, WeldDecoder
+from weld.encoders.primitives import PrimitiveWeldEncoder, PrimitiveWeldDecoder
+from weld.types import WeldType
 
 import ctypes
 import logging

diff --git a/weld-python/weld/encoders/__init__.py b/weld-python/weld/encoders/__init__.py
@@ -1,3 +1,2 @@
 
 from .encoder_base import *
-from .primitives import PrimitiveWeldEncoder, PrimitiveWeldDecoder
diff --git a/weld-python/weld/encoders/numpy.py b/weld-python/weld/encoders/numpy.py
@@ -17,7 +17,7 @@
 import ctypes
 import numpy as np
 
-from .encoder_base import *
+from weld.encoders.struct import StructWeldEncoder, StructWeldDecoder
 from weld.types import *
 
 # We just need this for the path.
@@ -118,15 +118,15 @@ def binop_output_type(left_ty, right_ty, truediv=False):
     Examples
     --------
     >>> binop_output_type(Bool(), Bool())
-    <weld.types.Bool object at ...>
+    bool
     >>> binop_output_type(I8(), U16())
-    <weld.types.I32 object at ...>
+    i32
     >>> binop_output_type(U8(), U16())
-    <weld.types.U16 object at ...>
+    u16
     >>> binop_output_type(F32(), U16())
-    <weld.types.F32 object at ...>
+    f32
     >>> binop_output_type(I8(), U64())
-    <weld.types.F64 object at ...>
+    f64
     """
     if not truediv and left_ty == right_ty:
         return left_ty
@@ -238,13 +238,13 @@ def dtype_to_weld_type(ty):
     Examples
     --------
     >>> dtype_to_weld_type('int32')
-    <weld.types.I32 object at 0x...>
+    i32
     >>> dtype_to_weld_type('float')
-    <weld.types.F64 object at 0x...>
+    f64
     >>> dtype_to_weld_type('i8')
-    <weld.types.I64 object at 0x...>
+    i64
     >>> dtype_to_weld_type(np.int16)
-    <weld.types.I16 object at 0x...>
+    i16
 
     Parameters
     ----------
@@ -295,7 +295,11 @@ def weld_string_array_to_numpy(arr):
         return result
 
 
-class NumPyWeldEncoder(WeldEncoder):
+class NumPyWeldEncoder(StructWeldEncoder):
+    """
+    Encodes NumPy arrays as Weld arrays.
+
+    """
 
     @staticmethod
     def _convert_1d_array(array, check_type=None):
@@ -353,7 +357,7 @@ def _is_string_array(obj):
             return False
         return True
 
-    def encode(self, obj, ty):
+    def encode_element(self, obj, ty):
         if NumPyWeldEncoder._is_string_array(obj):
             assert ty == WeldVec(WeldVec(I8()))
             return StringConversionFuncs.numpy_string_array_to_weld(obj)
@@ -365,9 +369,12 @@ def encode(self, obj, ty):
         else:
             raise TypeError("Unexpected type {} in NumPy encoder".format(type(obj)))
 
-class NumPyWeldDecoder(WeldDecoder):
-    """ Decodes an encoded Weld array into a NumPy array.
+class NumPyWeldDecoder(StructWeldDecoder):
+    """
+    Decodes an encoded Weld array into a NumPy array.
 
+    Examples
+    --------
     >>> arr = np.array([1,2,3], dtype='int32')
     >>> encoded = NumPyWeldEncoder().encode(arr, WeldVec(I32()))
     >>> NumPyWeldDecoder().decode(ctypes.pointer(encoded), WeldVec(I32()))
@@ -457,7 +464,7 @@ def _is_string_array(restype):
                     return True
         return False
 
-    def decode(self, obj, restype, context=None):
+    def decode_element(self, obj, restype, context=None):
         # A 1D NumPy array
         obj = obj.contents
         if NumPyWeldDecoder._is_string_array(restype):

diff --git a/weld-python/weld/encoders/primitives.py b/weld-python/weld/encoders/primitives.py
@@ -6,15 +6,14 @@
 
 """
 
-from .encoder_base import *
-from ..types import *
+from weld.encoders.struct import StructWeldEncoder, StructWeldDecoder
+from weld.types import *
 
-class PrimitiveWeldEncoder(WeldEncoder):
-    """
-    A primitive encoder for booleans, integers and floats.
+import ctypes
 
-    Eventually, this will also support encoding for tuples (structs) of other
-    primitive types.
+class PrimitiveWeldEncoder(StructWeldEncoder):
+    """
+    A primitive encoder for booleans, integers, floats, and tuples thereof.
 
     Examples
     --------
@@ -33,24 +32,13 @@ class PrimitiveWeldEncoder(WeldEncoder):
     >>> s._1
     1
     """
-    def encode(self, obj, target_type):
+    def encode_element(self, obj, target_type):
         encoder = target_type.ctype_class
-        if isinstance(target_type, WeldStruct):
-            struct = encoder()
-            for (i, (field, weld_ty)) in enumerate(zip(\
-                    obj, target_type.field_types)):
-                encoded = self.encode(field, weld_ty)
-                setattr(struct, "_" + str(i), encoded)
-            return struct
-        else:
-            return encoder(obj)
+        return encoder(obj)
 
-class PrimitiveWeldDecoder(WeldDecoder):
+class PrimitiveWeldDecoder(StructWeldDecoder):
     """
-    A primitive encoder for booleans, integers, and floats.
-
-    Eventually, this will also support decoding for structs (tuples) of other
-    primitive types.
+    A primitive decoder for booleans, integers, floats, and tuples thereof.
 
     Examples
     --------
@@ -65,18 +53,9 @@ class PrimitiveWeldDecoder(WeldDecoder):
     >>> decoder.decode(ctypes.pointer(x), struct_type)
     (1, 1.0)
     """
-    def decode(self, obj, restype, context=None):
+
+    def decode_element(self, obj, restype, context=None):
         if isinstance(restype, Bool):
             return bool(obj.contents.value)
-        elif isinstance(restype, WeldStruct):
-            struct = obj.contents
-            ctype_class = restype.ctype_class
-            result = []
-            for (i, (weld_ty, (cfield, cty))) in enumerate(zip(\
-                    restype.field_types, ctype_class._fields_)):
-                ofs = getattr(ctype_class, cfield).offset
-                p = ctypes.pointer(cty.from_buffer(struct, ofs))
-                result.append(self.decode(p, weld_ty))
-            return tuple(result)
         else:
             return obj.contents.value
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,6 +5,7 @@ Cargo.lock @@
     .#*
     *~
     *.swp
+    *.swo
     *.bc
     *.pyc
     *.o
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
		@@ -1,3 +1,2 @@

		from .encoder_base import *
		-from .primitives import PrimitiveWeldEncoder, PrimitiveWeldDecoder