Skip to content

Commit

Permalink
Use arrow's string array to place string columns in vineyard (#1611)
Browse files Browse the repository at this point in the history
The change is backwards-compatible and fixes the issue in the Rust SDK.

Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
  • Loading branch information
sighingnow authored Nov 9, 2023
1 parent 735abe2 commit 18f0f5e
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 18 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/build-test-graph.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,12 @@ jobs:
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt update
sudo apt install -y libarrow-dev=14.0.0-1 \
libarrow-dataset-dev=14.0.0-1 \
libarrow-acero-dev=14.0.0-1 \
libarrow-flight-dev=14.0.0-1 \
libgandiva-dev=14.0.0-1 \
libparquet-dev=14.0.0-1
sudo apt install -y libarrow-dev=14.0.1-1 \
libarrow-dataset-dev=14.0.1-1 \
libarrow-acero-dev=14.0.1-1 \
libarrow-flight-dev=14.0.1-1 \
libgandiva-dev=14.0.1-1 \
libparquet-dev=14.0.1-1
# install clang-format
sudo curl -L https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/master-1d7ec53d/clang-format-11_linux-amd64 --output /usr/bin/clang-format
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/build-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,12 @@ jobs:
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt update
sudo apt install -y libarrow-dev=14.0.0-1 \
libarrow-dataset-dev=14.0.0-1 \
libarrow-acero-dev=14.0.0-1 \
libarrow-flight-dev=14.0.0-1 \
libgandiva-dev=14.0.0-1 \
libparquet-dev=14.0.0-1
sudo apt install -y libarrow-dev=14.0.1-1 \
libarrow-dataset-dev=14.0.1-1 \
libarrow-acero-dev=14.0.1-1 \
libarrow-flight-dev=14.0.1-1 \
libgandiva-dev=14.0.1-1 \
libparquet-dev=14.0.1-1
# install deps for java
sudo apt install -y default-jdk-headless maven
Expand Down
34 changes: 30 additions & 4 deletions python/vineyard/data/tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import pickle

import numpy as np
import pyarrow as pa

from vineyard._C import Object
from vineyard._C import ObjectID
Expand Down Expand Up @@ -54,14 +55,39 @@ class ndarray(np.ndarray):

def numpy_ndarray_builder(client, value, **kw):
meta = ObjectMeta()
meta['typename'] = 'vineyard::Tensor<%s>' % normalize_cpptype(value.dtype)
meta['value_type_'] = value.dtype.name
meta['value_type_meta_'] = value.dtype.str
meta['shape_'] = to_json(value.shape)
meta['partition_index_'] = to_json(kw.get('partition_index', []))
meta['nbytes'] = value.nbytes
meta['order_'] = to_json(('C' if value.flags['C_CONTIGUOUS'] else 'F'))
meta.add_member('buffer_', build_numpy_buffer(client, value))

if value.dtype.name == 'object' or value.dtype.name.startswith('str'):
# check if it can be used as a string array
try:
from vineyard.core.builder import get_current_builders
from vineyard.data.arrow import string_array_builder

# numpy string tensors such as np.array(['a', 'b']) may fail to convert to
# pa.large_string() directly; in that case fall back to pa.string().
try:
array = pa.array(value, type=pa.large_string())
except: # noqa: E722, pylint: disable=bare-except
array = pa.array(value, type=pa.string())
meta['typename'] = 'vineyard::Tensor<std::string>'
meta['value_type_'] = 'string'
meta['value_type_meta_'] = 'str'
meta.add_member(
'buffer_', string_array_builder(client, array, get_current_builders())
)
except: # noqa: E722, pylint: disable=bare-except
meta['typename'] = 'vineyard::Tensor<%s>' % normalize_cpptype(value.dtype)
meta['value_type_'] = value.dtype.name
meta['value_type_meta_'] = value.dtype.str
meta.add_member('buffer_', build_numpy_buffer(client, value))
else:
meta['typename'] = 'vineyard::Tensor<%s>' % normalize_cpptype(value.dtype)
meta['value_type_'] = value.dtype.name
meta['value_type_meta_'] = value.dtype.str
meta.add_member('buffer_', build_numpy_buffer(client, value))
return client.create_metadata(meta)


Expand Down
7 changes: 7 additions & 0 deletions python/vineyard/data/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ def test_pandas_dataframe(vineyard_client):
pd.testing.assert_frame_equal(df, vineyard_client.get(object_id))


def test_pandas_dataframe_string(vineyard_client):
    """Round-trip a DataFrame whose columns contain only strings.

    The frame is stored with ``vineyard_client.put`` and read back with
    ``vineyard_client.get``; the result must compare equal to the input.
    """
    # see gh#533
    frame = pd.DataFrame({'a': ['1', '2', '3', '4'], 'b': ['5', '6', '7', '8']})
    stored_id = vineyard_client.put(frame)
    restored = vineyard_client.get(stored_id)
    pd.testing.assert_frame_equal(frame, restored)


def test_pandas_dataframe_complex_columns(vineyard_client):
# see gh#533
df = pd.DataFrame([1, 2, 3, 4], columns=[['x']])
Expand Down
1 change: 0 additions & 1 deletion rust/vineyard-integration-testing/src/ds/numpy_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ mod tests {
return Ok(());
}

#[ignore = "ndarray with string type in python side needs to be fixed"]
#[test]
fn test_numpy_string() -> Result<()> {
use arrow_array::array::Array;
Expand Down
1 change: 0 additions & 1 deletion rust/vineyard-integration-testing/src/ds/pandas_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ mod tests {
return Ok(());
}

#[ignore = "ndarray with string type in python side needs to be fixed"]
#[test]
fn test_pandas_string() -> Result<()> {
let ctx = Context::new();
Expand Down

0 comments on commit 18f0f5e

Please sign in to comment.