-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add location read operation implementation (#8)
* Add location read operation implementation * unit test refactoring
- Loading branch information
Zhou Fang
authored
Dec 24, 2023
1 parent
a558715
commit b1f2201
Showing
18 changed files
with
388 additions
and
74 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
"""ArrayRecord file utilities.""" | ||
|
||
from typing import List | ||
|
||
from space.core.utils.lazy_imports_utils import array_record_module as ar | ||
|
||
|
||
def read_record_file(file_path: str, positions: List[int]) -> List[bytes]:
  """Read records at the given positions from an ArrayRecord file.

  Args:
    file_path: full file path.
    positions: the positions inside the file of the records to read.

  Returns:
    The record payloads, in the same order as `positions`.
  """
  record_reader = ar.ArrayRecordReader(file_path)
  try:
    return record_reader.read(positions)
  finally:
    # Always release the reader, even if the read fails; the original code
    # leaked the reader on exception.
    record_reader.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
"""Local read operation implementation.""" | ||
|
||
from __future__ import annotations | ||
from abc import abstractmethod | ||
from typing import Iterator, Dict, List, Tuple, Optional | ||
|
||
import numpy as np | ||
import pyarrow as pa | ||
import pyarrow.parquet as pq | ||
import pyarrow.compute as pc | ||
|
||
from space.core.fs.array_record import read_record_file | ||
from space.core.ops.base import BaseOp | ||
from space.core.proto import metadata_pb2 as meta | ||
from space.core.proto import runtime_pb2 as runtime | ||
from space.core.schema import arrow | ||
from space.core.schema.constants import FILE_PATH_FIELD, ROW_ID_FIELD | ||
from space.core.schema import utils as schema_utils | ||
from space.core.utils.paths import StoragePaths | ||
|
||
_RECORD_KEY_FIELD = "__RECORD_KEY" | ||
|
||
|
||
class BaseReadOp(BaseOp):
  """Abstract base class for read operations.

  Concrete subclasses stream the data being read as Arrow tables.
  """

  @abstractmethod
  def __iter__(self) -> Iterator[pa.Table]:
    """Yield the read data, one `pa.Table` at a time."""
||
|
||
class FileSetReadOp(BaseReadOp, StoragePaths):
  """Read data from a dataset.

  Reads index (Parquet) files from a file set and, for record fields,
  fetches the actual payloads from ArrayRecord files using the addresses
  (file path + row ID) stored in the index columns.
  """

  def __init__(self,
               location: str,
               metadata: meta.StorageMetadata,
               file_set: runtime.FileSet,
               filter_: Optional[pc.Expression] = None):
    """Initialize the read operation.

    Args:
      location: the storage root location.
      metadata: the storage metadata containing the logical schema.
      filter_: an optional filter applied when reading index files;
        must only reference index fields (not yet validated, see TODO).
      file_set: the set of index files to read.
    """
    StoragePaths.__init__(self, location)

    # TODO: to validate that filter_ does not contain record files.

    self._metadata = metadata
    self._file_set = file_set

    # Build the physical Arrow schema; record fields are stored physically
    # as addresses rather than payloads.
    record_fields = set(self._metadata.schema.record_fields)
    self._physical_schema = arrow.arrow_schema(self._metadata.schema.fields,
                                               record_fields,
                                               physical=True)
    self._index_fields, self._record_fields = arrow.classify_fields(
        self._physical_schema, record_fields, selected_fields=None)

    self._index_field_ids = set(schema_utils.field_ids(self._index_fields))

    # Map field ID -> record field, for resolving record columns by the
    # field ID attached to index file columns.
    self._record_fields_dict: Dict[int, schema_utils.Field] = {}
    for f in self._record_fields:
      self._record_fields_dict[f.field_id] = f

    self._filter = filter_

  def __iter__(self) -> Iterator[pa.Table]:
    """Yield one result table per index file in the file set."""
    for file in self._file_set.index_files:
      yield self._read_index_and_record(file.path)

  def _read_index_and_record(self, index_path: str) -> pa.Table:
    """Read one index file and materialize its record fields.

    Args:
      index_path: relative path of the index (Parquet) file.

    Returns:
      A table with index columns plus one binary column per record field.
    """
    index_table = pq.read_table(self.full_path(index_path),
                                filters=self._filter)  # type: ignore[arg-type]

    # Partition columns into index columns (kept as-is) and record-address
    # columns (to be replaced by the payload bytes).
    index_column_ids: List[int] = []
    record_columns: List[Tuple[int, pa.Field]] = []
    for column_id, field in enumerate(index_table.schema):
      field_id = arrow.field_id(field)
      if field_id in self._index_field_ids:
        index_column_ids.append(column_id)
      else:
        record_columns.append(
            (column_id,
             arrow.binary_field(self._record_fields_dict[field_id])))

    result_table = index_table.select(
        index_column_ids)  # type: ignore[arg-type]

    # Read record field payloads from the addresses stored in the index file.
    for column_id, field in record_columns:
      result_table = result_table.append_column(
          field,
          self._read_record_column(
              index_table.select([column_id]),  # type: ignore[list-item]
              field.name))

    return result_table

  def _read_record_column(self, record_address: pa.Table,
                          field: str) -> pa.BinaryArray:
    """Read selective rows in multiple ArrayRecord files."""
    num_rows = record_address.num_rows
    # _RECORD_KEY_FIELD is the row index of record_address_table used for
    # retrieving rows after group by. It is not in the read result.
    record_address = record_address.flatten().append_column(
        _RECORD_KEY_FIELD, [np.arange(num_rows)])  # type: ignore[arg-type]

    # TODO: should detect whether data file use file path or ID.
    file_path_field = f"{field}.{FILE_PATH_FIELD}"
    row_id_field = f"{field}.{ROW_ID_FIELD}"

    # Record row IDs and records key co-grouped by file path, for processing
    # one file at a time to minimize file reads.
    grouped_records = record_address.group_by(
        file_path_field).aggregate(  # type: ignore[arg-type]
            [(row_id_field, "list"), (_RECORD_KEY_FIELD, "list")])

    file_path_column = grouped_records.column(file_path_field).combine_chunks()
    row_ids_column = grouped_records.column(
        f"{row_id_field}_list").combine_chunks()
    record_keys_column = grouped_records.column(
        f"{_RECORD_KEY_FIELD}_list").combine_chunks()

    # TODO: to parallelize ArrayRecord file reads.
    # One read per distinct file; each call returns the payloads for the
    # row IDs grouped under that file.
    record_values: List[List[bytes]] = []
    for file_path, row_ids in zip(
        file_path_column, row_ids_column):  # type: ignore[call-overload]
      record_values.append(
          read_record_file(self.full_path(file_path.as_py()), row_ids.as_py()))

    # Sort records by record_keys so the records can match indexes.
    # record_keys are the original row indexes captured before group_by,
    # so placing each value at its key restores the index-file row order.
    sorted_values: List[bytes] = [None] * num_rows  # type: ignore[list-item]
    for values, keys in zip(record_values,
                            record_keys_column):  # type: ignore[call-overload]
      for value, key in zip(values, keys):
        sorted_values[key.as_py()] = value

    return pa.array(sorted_values, pa.binary())  # type: ignore[return-value]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.