Add location read operation implementation (#8)
* Add location read operation implementation

* Unit test refactoring
Zhou Fang authored Dec 24, 2023
1 parent a558715 commit b1f2201
Showing 18 changed files with 388 additions and 74 deletions.
9 changes: 8 additions & 1 deletion python/pyproject.toml
@@ -41,7 +41,14 @@ pythonpath = ["src"]
[tool.pylint.format]
max-line-length = 80
indent-string = "  "
-disable = ["fixme", "no-else-return"]
+disable = [
+  "duplicate-code",
+  "fixme",
+  "no-else-return",
+  "too-few-public-methods",
+  "too-many-instance-attributes",
+  "too-many-locals"
+]

[tool.pylint.MAIN]
ignore = "space/core/proto"
33 changes: 33 additions & 0 deletions python/src/space/core/fs/array_record.py
@@ -0,0 +1,33 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""ArrayRecord file utilities."""

from typing import List

from space.core.utils.lazy_imports_utils import array_record_module as ar


def read_record_file(file_path: str, positions: List[int]) -> List[bytes]:
  """Read records of an ArrayRecord file.

  Args:
    file_path: full file path.
    positions: the positions inside the file of the records to read.
  """
  record_reader = ar.ArrayRecordReader(file_path)
  records = record_reader.read(positions)
  record_reader.close()
  return records
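
For orientation, a hypothetical usage sketch of the new helper (the file path and positions below are made up; they are not part of the commit):

from space.core.fs.array_record import read_record_file

# Hypothetical: read the records stored at positions 0 and 2 of an
# existing ArrayRecord file; each record comes back as raw bytes.
records = read_record_file("/data/chunk0.array_record", [0, 2])
assert all(isinstance(r, bytes) for r in records)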
python/src/space/core/{utils → fs}/parquet.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Parquet utilities."""
"""Parquet file utilities."""

from typing import List

10 changes: 3 additions & 7 deletions python/src/space/core/manifests/index.py
@@ -22,7 +22,7 @@
import pyarrow.compute as pc
import pyarrow.parquet as pq

-from space.core.utils.parquet import write_parquet_file
+from space.core.fs.parquet import write_parquet_file
import space.core.proto.metadata_pb2 as meta
import space.core.proto.runtime_pb2 as runtime
from space.core.schema import constants
@@ -106,7 +106,6 @@ def to_arrow(self) -> pa.Array:
fields=self._fields)


-# pylint: disable=too-many-instance-attributes
class IndexManifestWriter:
  """Writer of index manifest files."""

@@ -228,11 +227,8 @@ def read_index_manifests(
  Returns:
    A file set of data files in the manifest file.
  """
-  if filter_ is None:
-    table = pq.read_table(manifest_path)
-  else:
-    table = pq.read_table(manifest_path,
-                          filters=filter_)  # type: ignore[arg-type]
+  table = pq.read_table(manifest_path,
+                        filters=filter_)  # type: ignore[arg-type]

  manifests = _index_manifests(table)

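The deleted branch above was redundant: pyarrow's read_table treats filters=None as "no filtering", so passing the optional filter through unconditionally reads the same data. A minimal check of this equivalence (hypothetical file path, not from the commit):

import pyarrow.parquet as pq

# filters=None applies no row filtering, so both calls read the full file.
assert pq.read_table("manifest.parquet").equals(
    pq.read_table("manifest.parquet", filters=None))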
2 changes: 1 addition & 1 deletion python/src/space/core/manifests/record.py
@@ -18,7 +18,7 @@

import pyarrow as pa

-from space.core.utils.parquet import write_parquet_file
+from space.core.fs.parquet import write_parquet_file
import space.core.proto.metadata_pb2 as meta
from space.core.utils import paths
from space.core.schema import constants
1 change: 1 addition & 0 deletions python/src/space/core/ops/__init__.py
@@ -15,3 +15,4 @@
"""Space local data operations."""

from space.core.ops.append import LocalAppendOp
+from space.core.ops.read import FileSetReadOp
3 changes: 1 addition & 2 deletions python/src/space/core/ops/append.py
@@ -42,7 +42,7 @@


class BaseAppendOp(BaseOp):
-  """Abstract base Append operation class."""
+  """Abstract base append operation class."""

  @abstractmethod
  def write(self, data: InputData) -> None:
@@ -71,7 +71,6 @@ class _RecordWriterInfo:
default_factory=meta.StorageStatistics)


-# pylint: disable=too-many-instance-attributes
class LocalAppendOp(BaseAppendOp, StoragePaths):
  """Append operation running locally.
1 change: 0 additions & 1 deletion python/src/space/core/ops/base.py
@@ -25,6 +25,5 @@
InputData: TypeAlias = Union[Dict[str, Any], pa.Table]


-# pylint: disable=too-few-public-methods
class BaseOp(ABC):
  """Abstract base operation class."""
147 changes: 147 additions & 0 deletions python/src/space/core/ops/read.py
@@ -0,0 +1,147 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Local read operation implementation."""

from __future__ import annotations
from abc import abstractmethod
from typing import Iterator, Dict, List, Tuple, Optional

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc

from space.core.fs.array_record import read_record_file
from space.core.ops.base import BaseOp
from space.core.proto import metadata_pb2 as meta
from space.core.proto import runtime_pb2 as runtime
from space.core.schema import arrow
from space.core.schema.constants import FILE_PATH_FIELD, ROW_ID_FIELD
from space.core.schema import utils as schema_utils
from space.core.utils.paths import StoragePaths

_RECORD_KEY_FIELD = "__RECORD_KEY"


class BaseReadOp(BaseOp):
  """Abstract base read operation class."""

  @abstractmethod
  def __iter__(self) -> Iterator[pa.Table]:
    """Iterator of read data."""


class FileSetReadOp(BaseReadOp, StoragePaths):
  """Read data from a dataset."""

  def __init__(self,
               location: str,
               metadata: meta.StorageMetadata,
               file_set: runtime.FileSet,
               filter_: Optional[pc.Expression] = None):
    StoragePaths.__init__(self, location)

    # TODO: to validate that filter_ does not contain record fields.

    self._metadata = metadata
    self._file_set = file_set

    record_fields = set(self._metadata.schema.record_fields)
    self._physical_schema = arrow.arrow_schema(self._metadata.schema.fields,
                                               record_fields,
                                               physical=True)
    self._index_fields, self._record_fields = arrow.classify_fields(
        self._physical_schema, record_fields, selected_fields=None)

    self._index_field_ids = set(schema_utils.field_ids(self._index_fields))

    self._record_fields_dict: Dict[int, schema_utils.Field] = {}
    for f in self._record_fields:
      self._record_fields_dict[f.field_id] = f

    self._filter = filter_

  def __iter__(self) -> Iterator[pa.Table]:
    for file in self._file_set.index_files:
      yield self._read_index_and_record(file.path)

  def _read_index_and_record(self, index_path: str) -> pa.Table:
    index_table = pq.read_table(self.full_path(index_path),
                                filters=self._filter)  # type: ignore[arg-type]

    index_column_ids: List[int] = []
    record_columns: List[Tuple[int, pa.Field]] = []
    for column_id, field in enumerate(index_table.schema):
      field_id = arrow.field_id(field)
      if field_id in self._index_field_ids:
        index_column_ids.append(column_id)
      else:
        record_columns.append(
            (column_id,
             arrow.binary_field(self._record_fields_dict[field_id])))

    result_table = index_table.select(
        index_column_ids)  # type: ignore[arg-type]

    # Read record fields from addresses.
    for column_id, field in record_columns:
      result_table = result_table.append_column(
          field,
          self._read_record_column(
              index_table.select([column_id]),  # type: ignore[list-item]
              field.name))

    return result_table

  def _read_record_column(self, record_address: pa.Table,
                          field: str) -> pa.BinaryArray:
    """Read selected rows in multiple ArrayRecord files."""
    num_rows = record_address.num_rows
    # _RECORD_KEY_FIELD is the row index of record_address, used for
    # retrieving rows after group by. It is not in the read result.
    record_address = record_address.flatten().append_column(
        _RECORD_KEY_FIELD, [np.arange(num_rows)])  # type: ignore[arg-type]

    # TODO: should detect whether the data file uses file path or ID.
    file_path_field = f"{field}.{FILE_PATH_FIELD}"
    row_id_field = f"{field}.{ROW_ID_FIELD}"

    # Row IDs and record keys are co-grouped by file path, so each file is
    # processed once to minimize file reads.
    grouped_records = record_address.group_by(
        file_path_field).aggregate(  # type: ignore[arg-type]
            [(row_id_field, "list"), (_RECORD_KEY_FIELD, "list")])

    file_path_column = grouped_records.column(file_path_field).combine_chunks()
    row_ids_column = grouped_records.column(
        f"{row_id_field}_list").combine_chunks()
    record_keys_column = grouped_records.column(
        f"{_RECORD_KEY_FIELD}_list").combine_chunks()

    # TODO: to parallelize ArrayRecord file reads.
    record_values: List[List[bytes]] = []
    for file_path, row_ids in zip(
        file_path_column, row_ids_column):  # type: ignore[call-overload]
      record_values.append(
          read_record_file(self.full_path(file_path.as_py()), row_ids.as_py()))

    # Sort records by record keys so the records match the index rows.
    sorted_values: List[bytes] = [None] * num_rows  # type: ignore[list-item]
    for values, keys in zip(record_values,
                            record_keys_column):  # type: ignore[call-overload]
      for value, key in zip(values, keys):
        sorted_values[key.as_py()] = value

    return pa.array(sorted_values, pa.binary())  # type: ignore[return-value]
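
To show how FileSetReadOp is meant to be driven, here is a hypothetical sketch. Obtaining the StorageMetadata and FileSet values is outside this commit, so their construction below is assumed context, not the library's actual API surface:

import pyarrow.compute as pc
from space.core.ops import FileSetReadOp

# Assumed to be provided by the storage layer (not shown in this commit):
#   metadata: meta.StorageMetadata describing the dataset schema.
#   file_set: runtime.FileSet listing the index files to read.
read_op = FileSetReadOp("/path/to/my_dataset", metadata, file_set,
                        filter_=pc.field("id") > 100)

# One pa.Table per index file; record fields are materialized from their
# ArrayRecord addresses by _read_record_column.
for table in read_op:
  print(table.num_rows)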
9 changes: 9 additions & 0 deletions python/src/space/core/schema/arrow.py
@@ -189,3 +189,12 @@ def record_address_types() -> List[Tuple[str, pa.DataType]]:
"""Returns Arrow fields of record addresses."""
return [(constants.FILE_PATH_FIELD, pa.string()),
(constants.ROW_ID_FIELD, pa.int32())]


def binary_field(field: utils.Field) -> pa.Field:
"""Return a binary Arrow field for the given field."""
return _set_field_type(field, pa.binary())


def _set_field_type(field: utils.Field, type_: pa.DataType) -> pa.Field:
return pa.field(field.name, type_, metadata=field_metadata(field.field_id))
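
A small illustration of the new helper, assuming utils.Field is constructed with the name and field_id attributes used elsewhere in this commit:

from space.core.schema import utils
from space.core.schema.arrow import binary_field

# Hypothetical field; Field is assumed to accept these attributes.
image_field = utils.Field(name="image", field_id=3)
arrow_field = binary_field(image_field)
# arrow_field has name "image" and type pa.binary(), with metadata that
# encodes field ID 3 via field_metadata().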
1 change: 0 additions & 1 deletion python/src/space/core/schema/field_ids.py
@@ -23,7 +23,6 @@
_START_FIELD_ID = 0


-# pylint: disable=too-few-public-methods
class FieldIdManager:
  """Assign field IDs to schema fields using Depth First Search.
5 changes: 5 additions & 0 deletions python/src/space/core/schema/utils.py
@@ -32,6 +32,11 @@ def field_names(fields: List[Field]) -> List[str]:
  return list(map(lambda f: f.name, fields))


+def field_ids(fields: List[Field]) -> List[int]:
+  """Extract field IDs from a list of fields."""
+  return list(map(lambda f: f.field_id, fields))


def stats_field_name(field_id_: int) -> str:
"""Column stats struct field name.
4 changes: 2 additions & 2 deletions python/tests/core/manifests/test_falsifiable_filters.py
@@ -28,7 +28,7 @@
((pc.field("_STATS_f1", "_MIN") > 1) |
(pc.field("_STATS_f1", "_MAX") < 1)))])
def test_build_manifest_filter(filter_, falsifiable_filter):
-  arrow_schema = pa.schema([("a", pa.int64()), ("b", pa.float64())])  # pylint: disable=too-few-public-methods
+  arrow_schema = pa.schema([("a", pa.int64()), ("b", pa.float64())])
  field_name_ids = {"a": 0, "b": 1}

  falsifiable_filter = ff.build_manifest_filter(arrow_schema, field_name_ids,
@@ -39,7 +39,7 @@ def test_build_manifest_filter(filter_, falsifiable_filter):
@pytest.mark.parametrize("filter_", [(pc.field("a") != 10),
(~(pc.field("a") > 10))])
def test_build_manifest_filter_not_supported_return_none(filter_):
-  arrow_schema = pa.schema([("a", pa.int64()), ("b", pa.float64())])  # pylint: disable=too-few-public-methods
+  arrow_schema = pa.schema([("a", pa.int64()), ("b", pa.float64())])
  field_name_ids = {"a": 0, "b": 1}

  assert ff.build_manifest_filter(arrow_schema, field_name_ids,
2 changes: 1 addition & 1 deletion python/tests/core/manifests/test_index.py
@@ -18,10 +18,10 @@

from space.core.manifests import IndexManifestWriter
from space.core.manifests.index import read_index_manifests
+from space.core.fs.parquet import write_parquet_file
import space.core.proto.metadata_pb2 as meta
import space.core.proto.runtime_pb2 as runtime
from space.core.schema.arrow import field_metadata
-from space.core.utils.parquet import write_parquet_file

_SCHEMA = pa.schema([
pa.field("int64", pa.int64(), metadata=field_metadata(0)),
