[FSTORE-1580] OnlineFS Observability #435

Open · wants to merge 18 commits into base: main
32 changes: 25 additions & 7 deletions python/hsfs/core/kafka_engine.py
@@ -30,7 +30,7 @@
     avro_not_installed_message,
 )
 from hopsworks_common.decorators import uses_confluent_kafka
-from hsfs.core import storage_connector_api
+from hsfs.core import online_ingestion, online_ingestion_api, storage_connector_api
 from tqdm import tqdm
 
 
@@ -71,7 +71,7 @@ def init_kafka_consumer(
 def init_kafka_resources(
     feature_group: Union[FeatureGroup, ExternalFeatureGroup],
     offline_write_options: Dict[str, Any],
-    project_id: int,
+    num_entries: int,
Contributor:
Does it make sense to provide a default value here?

Contributor Author:
No, I don't think so, since the number of entries depends directly on the dataframe size.

 ) -> Tuple[
     Producer, Dict[str, bytes], Dict[str, Callable[..., bytes]], Callable[..., bytes]
 ]:
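For context on the review exchange above, a hedged call-site sketch (variable names are assumptions, not from this diff) shows why a default adds little: num_entries is always derived from the dataframe being inserted.

# hypothetical call site: the row count of the dataframe being written
num_entries = len(dataframe)
producer, headers, feature_writers, writer = init_kafka_resources(
    feature_group, offline_write_options, num_entries
)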
@@ -84,7 +84,7 @@ def init_kafka_resources(
             feature_group._writer,
         )
     producer, headers, feature_writers, writer = _init_kafka_resources(
-        feature_group, offline_write_options, project_id
+        feature_group, offline_write_options, num_entries
     )
     if feature_group._multi_part_insert:
         feature_group._kafka_producer = producer
@@ -97,7 +97,7 @@ def _init_kafka_resources(
 def _init_kafka_resources(
     feature_group: Union[FeatureGroup, ExternalFeatureGroup],
     offline_write_options: Dict[str, Any],
-    project_id: int,
+    num_entries: int,
 ) -> Tuple[
     Producer, Dict[str, bytes], Dict[str, Callable[..., bytes]], Callable[..., bytes]
 ]:
@@ -113,13 +113,31 @@ def _init_kafka_resources(
     # setup row writer function
     writer = get_encoder_func(feature_group._get_encoded_avro_schema())
 
+    return producer, get_headers(feature_group, num_entries), feature_writers, writer
+
+
+def get_headers(
+    feature_group: Union[FeatureGroup, ExternalFeatureGroup],
+    num_entries: int,
+) -> Dict[str, bytes]:
+    # setup online ingestion
+    online_ingestion_instance = online_ingestion.OnlineIngestion(
+        num_entries=num_entries
+    )
+
+    online_ingestion_instance = (
+        online_ingestion_api.OnlineIngestionApi().create_online_ingestion(
+            feature_group, online_ingestion_instance
+        )
+    )
+
     # custom headers for hopsworks onlineFS
-    headers = {
-        "projectId": str(project_id).encode("utf8"),
+    return {
+        "projectId": str(feature_group.feature_store.project_id).encode("utf8"),
         "featureGroupId": str(feature_group._id).encode("utf8"),
         "subjectId": str(feature_group.subject["id"]).encode("utf8"),
+        "onlineIngestionId": str(online_ingestion_instance.id).encode("utf8"),
     }
-    return producer, headers, feature_writers, writer
 
 
 @uses_confluent_kafka
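To make the role of these headers concrete, here is a minimal sketch of how per-message headers are attached with confluent-kafka (broker address, topic name, payload, and header values are hypothetical; only the header keys come from the diff above):

from confluent_kafka import Producer

producer = Producer({"bootstrap.servers": "localhost:9092"})  # assumed broker
headers = {
    "projectId": b"119",            # example values only; the real ones are
    "featureGroupId": b"13",        # derived from the feature group metadata above
    "subjectId": b"42",
    "onlineIngestionId": b"7",
}
producer.produce(
    "fg_topic",                     # hypothetical feature group topic
    value=b"avro-encoded-row",      # rows are Avro-encoded by the writer set up above
    headers=list(headers.items()),  # confluent-kafka accepts (key, bytes) pairs
)
producer.flush()

onlineFS can then read onlineIngestionId off each record and report progress against the ingestion object created in get_headers.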
181 changes: 181 additions & 0 deletions python/hsfs/core/online_ingestion.py
@@ -0,0 +1,181 @@
#
# Copyright 2024 Hopsworks AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations

import json
import time
import warnings
from datetime import datetime, timedelta
from typing import (
    Any,
    Dict,
    List,
    Optional,
    Union,
)

import humps
from hopsworks_common import util
from hsfs import feature_group as fg_mod
from hsfs.core import online_ingestion_batch_result
from tqdm.auto import tqdm


class OnlineIngestion:
    """
    Metadata object used to provide Online Ingestion information for a feature group.
    """

    def __init__(
        self,
        id: Optional[int] = None,
        num_entries: Optional[int] = None,
        current_offsets: Optional[str] = None,
        processed_entries: Optional[int] = None,
        inserted_entries: Optional[int] = None,
        aborted_entries: Optional[int] = None,
        batch_results: Optional[
            Union[
                List[online_ingestion_batch_result.OnlineIngestionBatchResult],
                List[Dict[str, Any]],
            ]
        ] = None,
        feature_group: Optional[fg_mod.FeatureGroup] = None,
        **kwargs,
    ):
        self._id = id
        self._num_entries = num_entries  # specified when inserting
        self._current_offsets = current_offsets
        self._processed_entries = processed_entries
        self._inserted_entries = inserted_entries
        self._aborted_entries = aborted_entries
        # batch inserts performed by onlinefs
        self._batch_results = (
            [
                (
                    online_ingestion_batch_result.OnlineIngestionBatchResult.from_response_json(
                        batch_result
                    )
                    if isinstance(batch_result, dict)
                    else batch_result
                )
                for batch_result in batch_results
            ]
            if batch_results
            else []
        )
        self._feature_group = feature_group

    @classmethod
    def from_response_json(
        cls,
        json_dict: Dict[str, Any],
        feature_group: Optional[fg_mod.FeatureGroup] = None,
    ) -> Optional[Union[OnlineIngestion, List[OnlineIngestion]]]:
        if json_dict is None:
            return None

        json_decamelized: dict = humps.decamelize(json_dict)

        if "count" not in json_decamelized:
            return cls(**json_decamelized, feature_group=feature_group)
        elif json_decamelized["count"] == 1:
            return cls(**json_decamelized["items"][0], feature_group=feature_group)
        elif json_decamelized["count"] > 1:
            return [
                cls(**item, feature_group=feature_group)
                for item in json_decamelized["items"]
            ]
        else:
            return None

    def refresh(self):
        from hsfs.core.online_ingestion_api import OnlineIngestionApi

        online_ingestion = OnlineIngestionApi().get_online_ingestion(
            self.feature_group, query_params={"filter_by": f"ID:{self.id}"}
        )
        self.__dict__.update(online_ingestion.__dict__)

    def to_dict(self):
        return {"id": self._id, "numEntries": self._num_entries}

    def json(self):
        return json.dumps(self, cls=util.Encoder)

    @property
    def id(self) -> Optional[int]:
        return self._id

    @property
    def num_entries(self) -> int:
        return self._num_entries

    @num_entries.setter
    def num_entries(self, num_entries: int) -> None:
        self._num_entries = num_entries

    @property
    def current_offsets(self) -> Optional[str]:
        return self._current_offsets

    @property
    def processed_entries(self) -> int:
        return 0 if self._processed_entries is None else self._processed_entries

    @property
    def inserted_entries(self) -> int:
        return 0 if self._inserted_entries is None else self._inserted_entries

    @property
    def aborted_entries(self) -> int:
        return 0 if self._aborted_entries is None else self._aborted_entries

    @property
    def batch_results(
        self,
    ) -> List[online_ingestion_batch_result.OnlineIngestionBatchResult]:
        return self._batch_results

    @property
    def feature_group(self) -> fg_mod.FeatureGroup:
        return self._feature_group

    def wait_for_completion(self, options: Optional[Dict[str, Any]] = None):
        if options is None:
            options = {}

        # Set timeout time
        timeout_delta = timedelta(seconds=options.get("timeout", 60))
        timeout_time = datetime.now() + timeout_delta

        with tqdm(
            total=self.num_entries,
            bar_format="{desc}: {percentage:.2f}% |{bar}| Rows {n_fmt}/{total_fmt}",
            desc="Online data ingestion progress",
            mininterval=1,
        ) as progress_bar:
            while True:
                if self.aborted_entries:
                    progress_bar.colour = "RED"

                progress_bar.n = self.processed_entries
                progress_bar.refresh()

                if self.processed_entries >= self.num_entries:
                    break

                if datetime.now() >= timeout_time:
                    warnings.warn(
                        f"Timeout of {timeout_delta} was exceeded while waiting for online ingestion completion.",
                        stacklevel=1,
                    )
                    break

                time.sleep(options.get("period", 1))

                self.refresh()
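A hedged usage sketch for this class (fg and df are assumed handles for an existing feature group and an inserted dataframe; only OnlineIngestion and OnlineIngestionApi come from this PR):

from hsfs.core import online_ingestion, online_ingestion_api

# Create the server-side ingestion record sized to the dataframe being inserted
ingestion = online_ingestion_api.OnlineIngestionApi().create_online_ingestion(
    fg, online_ingestion.OnlineIngestion(num_entries=len(df))
)
# Poll with a progress bar for up to 120 s, refreshing every 2 s
ingestion.wait_for_completion(options={"timeout": 120, "period": 2})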
72 changes: 72 additions & 0 deletions python/hsfs/core/online_ingestion_api.py
@@ -0,0 +1,72 @@
#
# Copyright 2024 Hopsworks AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations

from typing import Dict, Optional

from hopsworks_common import client
from hsfs import feature_group as fg_mod
from hsfs.core import online_ingestion


class OnlineIngestionApi:
    def create_online_ingestion(
        self,
        feature_group_instance: fg_mod.FeatureGroup,
        online_ingestion_instance: online_ingestion.OnlineIngestion,
    ) -> online_ingestion.OnlineIngestion:
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            feature_group_instance.feature_store_id,
            "featuregroups",
            feature_group_instance.id,
            "online_ingestion",
        ]

        headers = {"content-type": "application/json"}
        return online_ingestion.OnlineIngestion.from_response_json(
            _client._send_request(
                "POST",
                path_params,
                headers=headers,
                data=online_ingestion_instance.json(),
            ),
            feature_group=feature_group_instance,
        )

    def get_online_ingestion(
        self,
        feature_group_instance: fg_mod.FeatureGroup,
        query_params: Optional[Dict[str, str]] = None,
    ) -> online_ingestion.OnlineIngestion:
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            feature_group_instance.feature_store_id,
            "featuregroups",
            feature_group_instance.id,
            "online_ingestion",
        ]

        return online_ingestion.OnlineIngestion.from_response_json(
            _client._send_request("GET", path_params, query_params),
            feature_group=feature_group_instance,
        )
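And a sketch of querying ingestion status directly, mirroring the filter that refresh() uses internally (fg is an assumed feature group handle, and the id is hypothetical):

from hsfs.core.online_ingestion_api import OnlineIngestionApi

status = OnlineIngestionApi().get_online_ingestion(
    fg, query_params={"filter_by": "ID:42"}  # hypothetical ingestion id
)
print(status.processed_entries, status.inserted_entries, status.aborted_entries)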