Add location read operation implementation (#8)
* Add location read operation implementation

* Unit test refactoring
Zhou Fang authored Dec 24, 2023
1 parent a558715 commit b1f2201
Showing 18 changed files with 388 additions and 74 deletions.
9 changes: 8 additions & 1 deletion python/pyproject.toml
@@ -41,7 +41,14 @@ pythonpath = ["src"]
[tool.pylint.format]
max-line-length = 80
indent-string = "  "
-disable = ["fixme", "no-else-return"]
+disable = [
+  "duplicate-code",
+  "fixme",
+  "no-else-return",
+  "too-few-public-methods",
+  "too-many-instance-attributes",
+  "too-many-locals"
+]

[tool.pylint.MAIN]
ignore = "space/core/proto"
33 changes: 33 additions & 0 deletions python/src/space/core/fs/array_record.py
@@ -0,0 +1,33 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""ArrayRecord file utilities."""

from typing import List

from space.core.utils.lazy_imports_utils import array_record_module as ar


def read_record_file(file_path: str, positions: List[int]) -> List[bytes]:
  """Read records of an ArrayRecord file.

  Args:
    file_path: full file path.
    positions: the positions inside the file of the records to read.
  """
  record_reader = ar.ArrayRecordReader(file_path)
  records = record_reader.read(positions)
  record_reader.close()
  return records
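
For orientation, a hypothetical usage sketch of the new helper (the file path and positions below are made up; they are not part of the commit):

from space.core.fs.array_record import read_record_file

# Hypothetical: read the records stored at positions 0 and 2 of an
# existing ArrayRecord file; each record comes back as raw bytes.
records = read_record_file("/data/chunk0.array_record", [0, 2])
assert all(isinstance(r, bytes) for r in records)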
python/src/space/core/{utils → fs}/parquet.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Parquet utilities."""
"""Parquet file utilities."""

from typing import List

10 changes: 3 additions & 7 deletions python/src/space/core/manifests/index.py
@@ -22,7 +22,7 @@
import pyarrow.compute as pc
import pyarrow.parquet as pq

-from space.core.utils.parquet import write_parquet_file
+from space.core.fs.parquet import write_parquet_file
import space.core.proto.metadata_pb2 as meta
import space.core.proto.runtime_pb2 as runtime
from space.core.schema import constants
@@ -106,7 +106,6 @@ def to_arrow(self) -> pa.Array:
fields=self._fields)


-# pylint: disable=too-many-instance-attributes
class IndexManifestWriter:
  """Writer of index manifest files."""

@@ -228,11 +227,8 @@ def read_index_manifests(
  Returns:
    A file set of data files in the manifest file.
  """
-  if filter_ is None:
-    table = pq.read_table(manifest_path)
-  else:
-    table = pq.read_table(manifest_path,
-                          filters=filter_)  # type: ignore[arg-type]
+  table = pq.read_table(manifest_path,
+                        filters=filter_)  # type: ignore[arg-type]

  manifests = _index_manifests(table)

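The deleted branch above was redundant: pyarrow's read_table treats filters=None as "no filtering", so passing the optional filter through unconditionally reads the same data. A minimal check of this equivalence (hypothetical file path, not from the commit):

import pyarrow.parquet as pq

# filters=None applies no row filtering, so both calls read the full file.
assert pq.read_table("manifest.parquet").equals(
    pq.read_table("manifest.parquet", filters=None))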
2 changes: 1 addition & 1 deletion python/src/space/core/manifests/record.py
@@ -18,7 +18,7 @@

import pyarrow as pa

-from space.core.utils.parquet import write_parquet_file
+from space.core.fs.parquet import write_parquet_file
import space.core.proto.metadata_pb2 as meta
from space.core.utils import paths
from space.core.schema import constants
1 change: 1 addition & 0 deletions python/src/space/core/ops/__init__.py
@@ -15,3 +15,4 @@
"""Space local data operations."""

from space.core.ops.append import LocalAppendOp
+from space.core.ops.read import FileSetReadOp
3 changes: 1 addition & 2 deletions python/src/space/core/ops/append.py
@@ -42,7 +42,7 @@


class BaseAppendOp(BaseOp):
-  """Abstract base Append operation class."""
+  """Abstract base append operation class."""

  @abstractmethod
  def write(self, data: InputData) -> None:
@@ -71,7 +71,6 @@ class _RecordWriterInfo:
default_factory=meta.StorageStatistics)


-# pylint: disable=too-many-instance-attributes
class LocalAppendOp(BaseAppendOp, StoragePaths):
  """Append operation running locally.
1 change: 0 additions & 1 deletion python/src/space/core/ops/base.py
@@ -25,6 +25,5 @@
InputData: TypeAlias = Union[Dict[str, Any], pa.Table]


-# pylint: disable=too-few-public-methods
class BaseOp(ABC):
  """Abstract base operation class."""
147 changes: 147 additions & 0 deletions python/src/space/core/ops/read.py
@@ -0,0 +1,147 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Local read operation implementation."""

from __future__ import annotations
from abc import abstractmethod
from typing import Iterator, Dict, List, Tuple, Optional

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc

from space.core.fs.array_record import read_record_file
from space.core.ops.base import BaseOp
from space.core.proto import metadata_pb2 as meta
from space.core.proto import runtime_pb2 as runtime
from space.core.schema import arrow
from space.core.schema.constants import FILE_PATH_FIELD, ROW_ID_FIELD
from space.core.schema import utils as schema_utils
from space.core.utils.paths import StoragePaths

_RECORD_KEY_FIELD = "__RECORD_KEY"


class BaseReadOp(BaseOp):
  """Abstract base read operation class."""

  @abstractmethod
  def __iter__(self) -> Iterator[pa.Table]:
    """Iterator of read data."""


class FileSetReadOp(BaseReadOp, StoragePaths):
  """Read data from a dataset."""

  def __init__(self,
               location: str,
               metadata: meta.StorageMetadata,
               file_set: runtime.FileSet,
               filter_: Optional[pc.Expression] = None):
    StoragePaths.__init__(self, location)

    # TODO: to validate that filter_ does not contain record fields.

    self._metadata = metadata
    self._file_set = file_set

    record_fields = set(self._metadata.schema.record_fields)
    self._physical_schema = arrow.arrow_schema(self._metadata.schema.fields,
                                               record_fields,
                                               physical=True)
    self._index_fields, self._record_fields = arrow.classify_fields(
        self._physical_schema, record_fields, selected_fields=None)

    self._index_field_ids = set(schema_utils.field_ids(self._index_fields))

    self._record_fields_dict: Dict[int, schema_utils.Field] = {}
    for f in self._record_fields:
      self._record_fields_dict[f.field_id] = f

    self._filter = filter_

  def __iter__(self) -> Iterator[pa.Table]:
    for file in self._file_set.index_files:
      yield self._read_index_and_record(file.path)

  def _read_index_and_record(self, index_path: str) -> pa.Table:
    index_table = pq.read_table(self.full_path(index_path),
                                filters=self._filter)  # type: ignore[arg-type]

    index_column_ids: List[int] = []
    record_columns: List[Tuple[int, pa.Field]] = []
    for column_id, field in enumerate(index_table.schema):
      field_id = arrow.field_id(field)
      if field_id in self._index_field_ids:
        index_column_ids.append(column_id)
      else:
        record_columns.append(
            (column_id,
             arrow.binary_field(self._record_fields_dict[field_id])))

    result_table = index_table.select(
        index_column_ids)  # type: ignore[arg-type]

    # Read record fields from addresses.
    for column_id, field in record_columns:
      result_table = result_table.append_column(
          field,
          self._read_record_column(
              index_table.select([column_id]),  # type: ignore[list-item]
              field.name))

    return result_table

  def _read_record_column(self, record_address: pa.Table,
                          field: str) -> pa.BinaryArray:
    """Read selected rows in multiple ArrayRecord files."""
    num_rows = record_address.num_rows
    # _RECORD_KEY_FIELD is the row index of record_address, used for
    # retrieving rows after group by. It is not in the read result.
    record_address = record_address.flatten().append_column(
        _RECORD_KEY_FIELD, [np.arange(num_rows)])  # type: ignore[arg-type]

    # TODO: should detect whether the data file uses file path or ID.
    file_path_field = f"{field}.{FILE_PATH_FIELD}"
    row_id_field = f"{field}.{ROW_ID_FIELD}"

    # Row IDs and record keys are co-grouped by file path, so each file is
    # processed once to minimize file reads.
    grouped_records = record_address.group_by(
        file_path_field).aggregate(  # type: ignore[arg-type]
            [(row_id_field, "list"), (_RECORD_KEY_FIELD, "list")])

    file_path_column = grouped_records.column(file_path_field).combine_chunks()
    row_ids_column = grouped_records.column(
        f"{row_id_field}_list").combine_chunks()
    record_keys_column = grouped_records.column(
        f"{_RECORD_KEY_FIELD}_list").combine_chunks()

    # TODO: to parallelize ArrayRecord file reads.
    record_values: List[List[bytes]] = []
    for file_path, row_ids in zip(
        file_path_column, row_ids_column):  # type: ignore[call-overload]
      record_values.append(
          read_record_file(self.full_path(file_path.as_py()), row_ids.as_py()))

    # Sort records by record keys so the records match the index rows.
    sorted_values: List[bytes] = [None] * num_rows  # type: ignore[list-item]
    for values, keys in zip(record_values,
                            record_keys_column):  # type: ignore[call-overload]
      for value, key in zip(values, keys):
        sorted_values[key.as_py()] = value

    return pa.array(sorted_values, pa.binary())  # type: ignore[return-value]
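
To show how FileSetReadOp is meant to be driven, here is a hypothetical sketch. Obtaining the StorageMetadata and FileSet values is outside this commit, so their construction below is assumed context, not the library's actual API surface:

import pyarrow.compute as pc
from space.core.ops import FileSetReadOp

# Assumed to be provided by the storage layer (not shown in this commit):
#   metadata: meta.StorageMetadata describing the dataset schema.
#   file_set: runtime.FileSet listing the index files to read.
read_op = FileSetReadOp("/path/to/my_dataset", metadata, file_set,
                        filter_=pc.field("id") > 100)

# One pa.Table per index file; record fields are materialized from their
# ArrayRecord addresses by _read_record_column.
for table in read_op:
  print(table.num_rows)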
9 changes: 9 additions & 0 deletions python/src/space/core/schema/arrow.py
@@ -189,3 +189,12 @@ def record_address_types() -> List[Tuple[str, pa.DataType]]:
"""Returns Arrow fields of record addresses."""
return [(constants.FILE_PATH_FIELD, pa.string()),
(constants.ROW_ID_FIELD, pa.int32())]


def binary_field(field: utils.Field) -> pa.Field:
"""Return a binary Arrow field for the given field."""
return _set_field_type(field, pa.binary())


def _set_field_type(field: utils.Field, type_: pa.DataType) -> pa.Field:
return pa.field(field.name, type_, metadata=field_metadata(field.field_id))
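
A small illustration of the new helper, assuming utils.Field is constructed with the name and field_id attributes used elsewhere in this commit:

from space.core.schema import utils
from space.core.schema.arrow import binary_field

# Hypothetical field; Field is assumed to accept these attributes.
image_field = utils.Field(name="image", field_id=3)
arrow_field = binary_field(image_field)
# arrow_field has name "image" and type pa.binary(), with metadata that
# encodes field ID 3 via field_metadata().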
1 change: 0 additions & 1 deletion python/src/space/core/schema/field_ids.py
@@ -23,7 +23,6 @@
_START_FIELD_ID = 0


-# pylint: disable=too-few-public-methods
class FieldIdManager:
  """Assign field IDs to schema fields using Depth First Search.
5 changes: 5 additions & 0 deletions python/src/space/core/schema/utils.py
@@ -32,6 +32,11 @@ def field_names(fields: List[Field]) -> List[str]:
  return list(map(lambda f: f.name, fields))


+def field_ids(fields: List[Field]) -> List[int]:
+  """Extract field IDs from a list of fields."""
+  return list(map(lambda f: f.field_id, fields))


def stats_field_name(field_id_: int) -> str:
"""Column stats struct field name.
4 changes: 2 additions & 2 deletions python/tests/core/manifests/test_falsifiable_filters.py
@@ -28,7 +28,7 @@
((pc.field("_STATS_f1", "_MIN") > 1) |
(pc.field("_STATS_f1", "_MAX") < 1)))])
def test_build_manifest_filter(filter_, falsifiable_filter):
-  arrow_schema = pa.schema([("a", pa.int64()), ("b", pa.float64())])  # pylint: disable=too-few-public-methods
+  arrow_schema = pa.schema([("a", pa.int64()), ("b", pa.float64())])
  field_name_ids = {"a": 0, "b": 1}

  falsifiable_filter = ff.build_manifest_filter(arrow_schema, field_name_ids,
@@ -39,7 +39,7 @@ def test_build_manifest_filter(filter_, falsifiable_filter):
@pytest.mark.parametrize("filter_", [(pc.field("a") != 10),
(~(pc.field("a") > 10))])
def test_build_manifest_filter_not_supported_return_none(filter_):
-  arrow_schema = pa.schema([("a", pa.int64()), ("b", pa.float64())])  # pylint: disable=too-few-public-methods
+  arrow_schema = pa.schema([("a", pa.int64()), ("b", pa.float64())])
  field_name_ids = {"a": 0, "b": 1}

  assert ff.build_manifest_filter(arrow_schema, field_name_ids,
2 changes: 1 addition & 1 deletion python/tests/core/manifests/test_index.py
@@ -18,10 +18,10 @@

from space.core.manifests import IndexManifestWriter
from space.core.manifests.index import read_index_manifests
+from space.core.fs.parquet import write_parquet_file
import space.core.proto.metadata_pb2 as meta
import space.core.proto.runtime_pb2 as runtime
from space.core.schema.arrow import field_metadata
-from space.core.utils.parquet import write_parquet_file

_SCHEMA = pa.schema([
pa.field("int64", pa.int64(), metadata=field_metadata(0)),
