Skip to content

Commit

Permalink
DataFrame foundations (#510)
Browse files Browse the repository at this point in the history
  • Loading branch information
sppalkia authored Apr 4, 2020
1 parent f5f9586 commit dcbba9a
Show file tree
Hide file tree
Showing 22 changed files with 1,273 additions and 288 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Cargo.lock
.#*
*~
*.swp
*.swo
*.bc
*.pyc
*.o
Expand Down
100 changes: 100 additions & 0 deletions weld-python/tests/grizzly/core/test_frame.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""
Test basic DataFrame functionality.
"""

import pandas as pd
import pytest
import weld.grizzly as gr

def get_frames(cls, strings):
    """
    Returns two DataFrames for testing binary operators.

    The DataFrames have columns of overlapping/different names, types, etc.

    Parameters
    ----------
    cls : callable
        Constructor applied to each underlying pandas DataFrame (the tests
        in this file pass `pd.DataFrame` or `gr.GrizzlyDataFrame`).
    strings : bool
        If falsy, the string-valued columns are dropped from both frames so
        that only the numeric columns remain.

    Returns
    -------
    tuple
        `(cls(df1), cls(df2))`
    """
    df1 = pd.DataFrame({
        'name': ['Bob', 'Sally', 'Kunal', 'Deepak', 'James', 'Pratiksha'],
        'lastName': ['Kahn', 'Lopez', 'Smith', 'Narayanan', 'Thomas', 'Thaker'],
        'age': [20, 30, 35, 20, 50, 35],
        'score': [20.0, 30.0, 35.0, 50.0, 35.0, 25.0]
    })
    df2 = pd.DataFrame({
        'firstName': ['Bob', 'Sally', 'Kunal', 'Deepak', 'James', 'Pratiksha'],
        'lastName': ['Kahn', 'Lopez', 'smith', 'narayanan', 'Thomas', 'thaker'],
        'age': [25, 30, 45, 20, 60, 35],
        'scores': [20.0, 30.0, 35.0, 50.0, 35.0, 25.0]
    })
    if not strings:
        # Arithmetic and ordering tests only use the numeric columns.
        df1 = df1.drop(['name', 'lastName'], axis=1)
        df2 = df2.drop(['firstName', 'lastName'], axis=1)
    return (cls(df1), cls(df2))

def _test_binop(pd_op, gr_op, strings=True):
    """
    Test a binary operator.

    Binary operators align on column name. For columns that don't exist in both
    DataFrames, the column is filled with NaN (for non-comparison operations)
    or False (for comparison operations).

    If the RHS is a Series, the Series should be added to all columns (this
    helper only passes DataFrames on both sides).

    Parameters
    ----------
    pd_op : callable
        The pandas operator, called as `pd_op(df1, df2)`.
    gr_op : callable
        The Grizzly operator, called as `gr_op(gdf1, gdf2)`; its result must
        provide `to_pandas()`.
    strings : bool
        Forwarded to `get_frames`; when falsy the string columns are dropped.
    """
    df1, df2 = get_frames(pd.DataFrame, strings)
    gdf1, gdf2 = get_frames(gr.GrizzlyDataFrame, strings)

    expect = pd_op(df1, df2)
    result = gr_op(gdf1, gdf2).to_pandas()
    # Include both frames in the failure message; a bare `equals` assertion
    # only reports False.
    assert expect.equals(result), \
        "pandas and Grizzly results differ:\n{}\nvs.\n{}".format(expect, result)

def test_evaluation():
    """
    Evaluating a DataFrame once should cache the result, so that a second
    evaluation does not produce a new value.
    """
    left = gr.GrizzlyDataFrame({
        'age': [20, 30, 35, 20, 50, 35],
        'score': [20.0, 30.0, 35.0, 50.0, 35.0, 25.0]
    })
    right = gr.GrizzlyDataFrame({
        'age': [20, 30, 35, 20, 50, 35],
        'scores': [20.0, 30.0, 35.0, 50.0, 35.0, 25.0]
    })
    lazy = (left + right) * right + left / right

    # Nothing has been computed before the first evaluate() call.
    assert not lazy.is_value
    lazy.evaluate()
    assert lazy.is_value

    cached = lazy.weld_value
    lazy.evaluate()
    # The same weld_value should be returned.
    assert cached is lazy.weld_value

def test_add():
    """Compare DataFrame `add` against pandas on the numeric-only frames."""
    _test_binop(pd.DataFrame.add, gr.GrizzlyDataFrame.add, strings=False)

def test_sub():
    """Compare DataFrame `sub` against pandas on the numeric-only frames."""
    _test_binop(pd.DataFrame.sub, gr.GrizzlyDataFrame.sub, strings=False)

def test_mul():
    """Compare DataFrame `mul` against pandas on the numeric-only frames."""
    _test_binop(pd.DataFrame.mul, gr.GrizzlyDataFrame.mul, strings=False)

def test_div():
    """Compare DataFrame `div` against pandas on the numeric-only frames."""
    _test_binop(pd.DataFrame.div, gr.GrizzlyDataFrame.div, strings=False)

def test_eq():
    """Compare DataFrame `eq` against pandas, string columns included."""
    _test_binop(pd.DataFrame.eq, gr.GrizzlyDataFrame.eq, strings=True)

def test_ne():
    """Compare DataFrame `ne` against pandas, string columns included."""
    _test_binop(pd.DataFrame.ne, gr.GrizzlyDataFrame.ne, strings=True)

def test_le():
    """Compare DataFrame `le` against pandas on the numeric-only frames."""
    _test_binop(pd.DataFrame.le, gr.GrizzlyDataFrame.le, strings=False)

def test_lt():
    """Compare DataFrame `lt` against pandas on the numeric-only frames."""
    _test_binop(pd.DataFrame.lt, gr.GrizzlyDataFrame.lt, strings=False)

def test_ge():
    """Compare DataFrame `ge` against pandas on the numeric-only frames."""
    _test_binop(pd.DataFrame.ge, gr.GrizzlyDataFrame.ge, strings=False)

def test_gt():
    """Compare DataFrame `gt` against pandas on the numeric-only frames."""
    _test_binop(pd.DataFrame.gt, gr.GrizzlyDataFrame.gt, strings=False)
37 changes: 18 additions & 19 deletions weld-python/tests/grizzly/core/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,25 +85,6 @@ def eval_expression(cls):
yield a + b + c * d - e
_compare_vs_pandas(eval_expression)

def test_basic_fallback():
    """
    Tests basic unsupported functionality.

    NOTE: This test will need to change as more features are added...
    """
    def eval_expression(cls):
        lhs = cls([1, 2, 3])
        rhs = cls([-4, 5, -6])

        # Test 1: abs()
        total = lhs + rhs
        yield (total.abs() + lhs)

        # Test 2: argmin()
        total = lhs + rhs
        yield cls(total.argmin())

        # Test 3: reindex()
        total = lhs + rhs
        reindexed = total.reindex(index=[2, 0, 1])
        # Falls back to Pandas, since we don't support indices.
        assert isinstance(reindexed, pd.Series)

    _compare_vs_pandas(eval_expression)
def test_scalar():
types = ['int8', 'uint8', 'int16', 'uint16', 'int32',\
'uint32', 'int64', 'uint64', 'float32', 'float64']
Expand All @@ -129,10 +110,28 @@ def test_indexing():
assert np.array_equal(x[x == 2].evaluate().values, np.array([2], dtype='int64'))
assert np.array_equal(x[x < 0].evaluate().values, np.array([], dtype='int64'))

def test_name():
    """Test that names propagate after operations."""
    expected = "testname"
    series = gr.GrizzlySeries([1,2,3], name="testname")

    # Binary operator.
    derived = series + series
    assert derived.evaluate().name == expected

    # Aggregation.
    derived = series.agg(['sum', 'count'])
    assert derived.evaluate().name == expected

    # Slice.
    derived = series[:2]
    assert derived.evaluate().name == expected

    # Boolean mask.
    derived = series[series == 1]
    assert derived.evaluate().name == expected


def test_unsupported_binop_error():
    """
    Test that binary operators raise on unsupported operands.

    Operands are built outside the `pytest.raises` blocks so that only the
    operator call itself can satisfy the expected exception; an error raised
    while constructing an operand fails the test instead of passing it.
    """
    from weld.grizzly.core.error import GrizzlyError

    # A pandas Series on the right-hand side is expected to raise GrizzlyError.
    a = gr.GrizzlySeries([1,2,3])
    b = pd.Series([1,2,3])
    with pytest.raises(GrizzlyError):
        a.add(b)

    # Dividing string series is expected to raise TypeError.
    a = gr.GrizzlySeries(["hello", "world"])
    b = gr.GrizzlySeries(["hello", "world"])
    with pytest.raises(TypeError):
        a.divide(b)
8 changes: 8 additions & 0 deletions weld-python/tests/grizzly/core/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ def test_get():
pandas_result = pd.Series(expect)
assert pandas_result.equals(grizzly_result)

def test_eq():
    """Check element-wise `eq` and `ne` between two string series."""
    x = gr.GrizzlySeries(["hello", "world", "strings", "morestrings"])
    y = gr.GrizzlySeries(["hel", "world", "string", "morestrings"])

    equal = list(x.eq(y).evaluate().values)
    assert equal == [False, True, False, True]

    not_equal = list(x.ne(y).evaluate().values)
    assert not_equal == [True, False, True, False]

def test_strip():
compare_vs_pandas('strip', ["",
" hi ",
Expand Down
2 changes: 1 addition & 1 deletion weld-python/tests/weld/core/test_lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Tests for constructing and evaluating lazy operations.
"""

from weld.encoders import PrimitiveWeldEncoder, PrimitiveWeldDecoder
from weld.encoders.primitives import PrimitiveWeldEncoder, PrimitiveWeldDecoder
from weld.types import *
from weld.lazy import *

Expand Down
5 changes: 5 additions & 0 deletions weld-python/tests/weld/encoders/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ def test_float32_vec():
def test_float64_vec():
encdec(array('float64'), WeldVec(F64()))

def test_struct_of_vecs():
    """Run the `encdec` helper on a tuple of three arrays typed as a struct of vectors."""
    arrays = (array('float32'), array('uint16'), array('uint32'))
    field_types = [WeldVec(F32()), WeldVec(U16()), WeldVec(U32())]
    encdec(arrays, WeldStruct(field_types))

def test_type_conversions():
types = ['bool', 'int8', 'uint8', 'int16', 'uint16',
'int32', 'uint32', 'int64', 'uint64', 'float32', 'float64']
Expand Down
2 changes: 1 addition & 1 deletion weld-python/tests/weld/encoders/test_primitives.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import ctypes

from .helpers import encdec_factory
from weld.encoders import PrimitiveWeldEncoder, PrimitiveWeldDecoder
from weld.encoders.primitives import PrimitiveWeldEncoder, PrimitiveWeldDecoder
from weld.types import *

encdec = encdec_factory(PrimitiveWeldEncoder, PrimitiveWeldDecoder)
Expand Down
8 changes: 4 additions & 4 deletions weld-python/weld/compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
"""

from .core import *
from .encoders import WeldEncoder, WeldDecoder, PrimitiveWeldEncoder,\
PrimitiveWeldDecoder
from .types import WeldType
from weld.core import *
from weld.encoders import WeldEncoder, WeldDecoder
from weld.encoders.primitives import PrimitiveWeldEncoder, PrimitiveWeldDecoder
from weld.types import WeldType

import ctypes
import logging
Expand Down
1 change: 0 additions & 1 deletion weld-python/weld/encoders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@

from .encoder_base import *
from .primitives import PrimitiveWeldEncoder, PrimitiveWeldDecoder
37 changes: 22 additions & 15 deletions weld-python/weld/encoders/numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import ctypes
import numpy as np

from .encoder_base import *
from weld.encoders.struct import StructWeldEncoder, StructWeldDecoder
from weld.types import *

# We just need this for the path.
Expand Down Expand Up @@ -118,15 +118,15 @@ def binop_output_type(left_ty, right_ty, truediv=False):
Examples
--------
>>> binop_output_type(Bool(), Bool())
<weld.types.Bool object at ...>
bool
>>> binop_output_type(I8(), U16())
<weld.types.I32 object at ...>
i32
>>> binop_output_type(U8(), U16())
<weld.types.U16 object at ...>
u16
>>> binop_output_type(F32(), U16())
<weld.types.F32 object at ...>
f32
>>> binop_output_type(I8(), U64())
<weld.types.F64 object at ...>
f64
"""
if not truediv and left_ty == right_ty:
return left_ty
Expand Down Expand Up @@ -238,13 +238,13 @@ def dtype_to_weld_type(ty):
Examples
--------
>>> dtype_to_weld_type('int32')
<weld.types.I32 object at 0x...>
i32
>>> dtype_to_weld_type('float')
<weld.types.F64 object at 0x...>
f64
>>> dtype_to_weld_type('i8')
<weld.types.I64 object at 0x...>
i64
>>> dtype_to_weld_type(np.int16)
<weld.types.I16 object at 0x...>
i16
Parameters
----------
Expand Down Expand Up @@ -295,7 +295,11 @@ def weld_string_array_to_numpy(arr):
return result


class NumPyWeldEncoder(WeldEncoder):
class NumPyWeldEncoder(StructWeldEncoder):
"""
Encodes NumPy arrays as Weld arrays.
"""

@staticmethod
def _convert_1d_array(array, check_type=None):
Expand Down Expand Up @@ -353,7 +357,7 @@ def _is_string_array(obj):
return False
return True

def encode(self, obj, ty):
def encode_element(self, obj, ty):
if NumPyWeldEncoder._is_string_array(obj):
assert ty == WeldVec(WeldVec(I8()))
return StringConversionFuncs.numpy_string_array_to_weld(obj)
Expand All @@ -365,9 +369,12 @@ def encode(self, obj, ty):
else:
raise TypeError("Unexpected type {} in NumPy encoder".format(type(obj)))

class NumPyWeldDecoder(WeldDecoder):
""" Decodes an encoded Weld array into a NumPy array.
class NumPyWeldDecoder(StructWeldDecoder):
"""
Decodes an encoded Weld array into a NumPy array.
Examples
--------
>>> arr = np.array([1,2,3], dtype='int32')
>>> encoded = NumPyWeldEncoder().encode(arr, WeldVec(I32()))
>>> NumPyWeldDecoder().decode(ctypes.pointer(encoded), WeldVec(I32()))
Expand Down Expand Up @@ -457,7 +464,7 @@ def _is_string_array(restype):
return True
return False

def decode(self, obj, restype, context=None):
def decode_element(self, obj, restype, context=None):
# A 1D NumPy array
obj = obj.contents
if NumPyWeldDecoder._is_string_array(restype):
Expand Down
45 changes: 12 additions & 33 deletions weld-python/weld/encoders/primitives.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,14 @@
"""

from .encoder_base import *
from ..types import *
from weld.encoders.struct import StructWeldEncoder, StructWeldDecoder
from weld.types import *

class PrimitiveWeldEncoder(WeldEncoder):
"""
A primitive encoder for booleans, integers and floats.
import ctypes

Eventually, this will also support encoding for tuples (structs) of other
primitive types.
class PrimitiveWeldEncoder(StructWeldEncoder):
"""
A primitive encoder for booleans, integers, floats, and tuples thereof.
Examples
--------
Expand All @@ -33,24 +32,13 @@ class PrimitiveWeldEncoder(WeldEncoder):
>>> s._1
1
"""
def encode(self, obj, target_type):
def encode_element(self, obj, target_type):
encoder = target_type.ctype_class
if isinstance(target_type, WeldStruct):
struct = encoder()
for (i, (field, weld_ty)) in enumerate(zip(\
obj, target_type.field_types)):
encoded = self.encode(field, weld_ty)
setattr(struct, "_" + str(i), encoded)
return struct
else:
return encoder(obj)
return encoder(obj)

class PrimitiveWeldDecoder(WeldDecoder):
class PrimitiveWeldDecoder(StructWeldDecoder):
"""
A primitive encoder for booleans, integers, and floats.
Eventually, this will also support decoding for structs (tuples) of other
primitive types.
A primitive decoder for booleans, integers, floats, and tuples thereof.
Examples
--------
Expand All @@ -65,18 +53,9 @@ class PrimitiveWeldDecoder(WeldDecoder):
>>> decoder.decode(ctypes.pointer(x), struct_type)
(1, 1.0)
"""
def decode(self, obj, restype, context=None):

def decode_element(self, obj, restype, context=None):
if isinstance(restype, Bool):
return bool(obj.contents.value)
elif isinstance(restype, WeldStruct):
struct = obj.contents
ctype_class = restype.ctype_class
result = []
for (i, (weld_ty, (cfield, cty))) in enumerate(zip(\
restype.field_types, ctype_class._fields_)):
ofs = getattr(ctype_class, cfield).offset
p = ctypes.pointer(cty.from_buffer(struct, ofs))
result.append(self.decode(p, weld_ty))
return tuple(result)
else:
return obj.contents.value
Loading

0 comments on commit dcbba9a

Please sign in to comment.