[FSTORE-1580] OnlineFS Observability #435

Open · wants to merge 18 commits into base: main
32 changes: 25 additions & 7 deletions python/hsfs/core/kafka_engine.py
@@ -30,7 +30,7 @@
     avro_not_installed_message,
 )
 from hopsworks_common.decorators import uses_confluent_kafka
-from hsfs.core import storage_connector_api
+from hsfs.core import online_ingestion, online_ingestion_api, storage_connector_api
 from tqdm import tqdm
 
 
@@ -71,7 +71,7 @@ def init_kafka_consumer(
 def init_kafka_resources(
     feature_group: Union[FeatureGroup, ExternalFeatureGroup],
     offline_write_options: Dict[str, Any],
-    project_id: int,
+    num_entries: int,
Contributor:
Does it make sense to provide a default value here?

Contributor Author:
No, I don't think so, since the number of entries depends directly on the dataframe size.

 ) -> Tuple[
     Producer, Dict[str, bytes], Dict[str, Callable[..., bytes]], Callable[..., bytes]
 ]:
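For context on the review exchange above, a hedged call-site sketch (variable names are assumptions, not from this diff) shows why a default adds little: num_entries is always derived from the dataframe being inserted.

# hypothetical call site: the row count of the dataframe being written
num_entries = len(dataframe)
producer, headers, feature_writers, writer = init_kafka_resources(
    feature_group, offline_write_options, num_entries
)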
@@ -84,7 +84,7 @@ def init_kafka_resources(
             feature_group._writer,
         )
     producer, headers, feature_writers, writer = _init_kafka_resources(
-        feature_group, offline_write_options, project_id
+        feature_group, offline_write_options, num_entries
     )
     if feature_group._multi_part_insert:
         feature_group._kafka_producer = producer
@@ -97,7 +97,7 @@ def _init_kafka_resources(
 def _init_kafka_resources(
     feature_group: Union[FeatureGroup, ExternalFeatureGroup],
     offline_write_options: Dict[str, Any],
-    project_id: int,
+    num_entries: int,
 ) -> Tuple[
     Producer, Dict[str, bytes], Dict[str, Callable[..., bytes]], Callable[..., bytes]
 ]:
@@ -113,13 +113,31 @@ def _init_kafka_resources(
     # setup row writer function
     writer = get_encoder_func(feature_group._get_encoded_avro_schema())
 
+    return producer, get_headers(feature_group, num_entries), feature_writers, writer
+
+
+def get_headers(
+    feature_group: Union[FeatureGroup, ExternalFeatureGroup],
+    num_entries: int,
+) -> Dict[str, bytes]:
+    # setup online ingestion
+    online_ingestion_instance = online_ingestion.OnlineIngestion(
+        num_entries=num_entries
+    )
+
+    online_ingestion_instance = (
+        online_ingestion_api.OnlineIngestionApi().create_online_ingestion(
+            feature_group, online_ingestion_instance
+        )
+    )
+
     # custom headers for hopsworks onlineFS
-    headers = {
-        "projectId": str(project_id).encode("utf8"),
+    return {
+        "projectId": str(feature_group.feature_store.project_id).encode("utf8"),
         "featureGroupId": str(feature_group._id).encode("utf8"),
         "subjectId": str(feature_group.subject["id"]).encode("utf8"),
+        "onlineIngestionId": str(online_ingestion_instance.id).encode("utf8"),
     }
-    return producer, headers, feature_writers, writer
 
 
 @uses_confluent_kafka
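To make the role of these headers concrete, here is a minimal sketch of how per-message headers are attached with confluent-kafka (broker address, topic name, payload, and header values are hypothetical; only the header keys come from the diff above):

from confluent_kafka import Producer

producer = Producer({"bootstrap.servers": "localhost:9092"})  # assumed broker
headers = {
    "projectId": b"119",            # example values only; the real ones are
    "featureGroupId": b"13",        # derived from the feature group metadata above
    "subjectId": b"42",
    "onlineIngestionId": b"7",
}
producer.produce(
    "fg_topic",                     # hypothetical feature group topic
    value=b"avro-encoded-row",      # rows are Avro-encoded by the writer set up above
    headers=list(headers.items()),  # confluent-kafka accepts (key, bytes) pairs
)
producer.flush()

onlineFS can then read onlineIngestionId off each record and report progress against the ingestion object created in get_headers.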
181 changes: 181 additions & 0 deletions python/hsfs/core/online_ingestion.py
@@ -0,0 +1,181 @@
#
# Copyright 2024 Hopsworks AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations

import json
import time
import warnings
from datetime import datetime, timedelta
from typing import (
    Any,
    Dict,
    List,
    Optional,
    Union,
)

import humps
from hopsworks_common import util
from hsfs import feature_group as fg_mod
from hsfs.core import online_ingestion_batch_result
from tqdm.auto import tqdm


class OnlineIngestion:
    """
    Metadata object used to provide Online Ingestion information for a feature group.
    """

    def __init__(
        self,
        id: Optional[int] = None,
        num_entries: Optional[int] = None,
        current_offsets: Optional[str] = None,
        processed_entries: Optional[int] = None,
        inserted_entries: Optional[int] = None,
        aborted_entries: Optional[int] = None,
        batch_results: Optional[
            Union[
                List[online_ingestion_batch_result.OnlineIngestionBatchResult],
                List[Dict[str, Any]],
            ]
        ] = None,
        feature_group: Optional[fg_mod.FeatureGroup] = None,
        **kwargs,
    ):
        self._id = id
        self._num_entries = num_entries  # specified when inserting
        self._current_offsets = current_offsets
        self._processed_entries = processed_entries
        self._inserted_entries = inserted_entries
        self._aborted_entries = aborted_entries
        # batch inserts performed by onlinefs
        self._batch_results = (
            [
                (
                    online_ingestion_batch_result.OnlineIngestionBatchResult.from_response_json(
                        batch_result
                    )
                    if isinstance(batch_result, dict)
                    else batch_result
                )
                for batch_result in batch_results
            ]
            if batch_results
            else []
        )
        self._feature_group = feature_group

    @classmethod
    def from_response_json(
        cls,
        json_dict: Dict[str, Any],
        feature_group: Optional[fg_mod.FeatureGroup] = None,
    ) -> Optional[Union[OnlineIngestion, List[OnlineIngestion]]]:
        if json_dict is None:
            return None

        json_decamelized: dict = humps.decamelize(json_dict)

        if "count" not in json_decamelized:
            return cls(**json_decamelized, feature_group=feature_group)
        elif json_decamelized["count"] == 1:
            return cls(**json_decamelized["items"][0], feature_group=feature_group)
        elif json_decamelized["count"] > 1:
            return [
                cls(**item, feature_group=feature_group)
                for item in json_decamelized["items"]
            ]
        else:
            return None

    def refresh(self):
        from hsfs.core.online_ingestion_api import OnlineIngestionApi

        online_ingestion = OnlineIngestionApi().get_online_ingestion(
            self.feature_group, query_params={"filter_by": f"ID:{self.id}"}
        )
        self.__dict__.update(online_ingestion.__dict__)

    def to_dict(self):
        return {"id": self._id, "numEntries": self._num_entries}

    def json(self):
        return json.dumps(self, cls=util.Encoder)

    @property
    def id(self) -> Optional[int]:
        return self._id

    @property
    def num_entries(self) -> int:
        return self._num_entries

    @num_entries.setter
    def num_entries(self, num_entries: int) -> None:
        self._num_entries = num_entries

    @property
    def current_offsets(self) -> Optional[str]:
        return self._current_offsets

    @property
    def processed_entries(self) -> int:
        return 0 if self._processed_entries is None else self._processed_entries

    @property
    def inserted_entries(self) -> int:
        return 0 if self._inserted_entries is None else self._inserted_entries

    @property
    def aborted_entries(self) -> int:
        return 0 if self._aborted_entries is None else self._aborted_entries

    @property
    def batch_results(
        self,
    ) -> List[online_ingestion_batch_result.OnlineIngestionBatchResult]:
        return self._batch_results

    @property
    def feature_group(self) -> fg_mod.FeatureGroup:
        return self._feature_group

    def wait_for_completion(self, options: Optional[Dict[str, Any]] = None):
        if options is None:
            options = {}

        # Set timeout time
        timeout_delta = timedelta(seconds=options.get("timeout", 60))
        timeout_time = datetime.now() + timeout_delta

        with tqdm(
            total=self.num_entries,
            bar_format="{desc}: {percentage:.2f}% |{bar}| Rows {n_fmt}/{total_fmt}",
            desc="Online data ingestion progress",
            mininterval=1,
        ) as progress_bar:
            while True:
                if self.aborted_entries:
                    progress_bar.colour = "RED"

                progress_bar.n = self.processed_entries
                progress_bar.refresh()

                if self.processed_entries >= self.num_entries:
                    break

                if datetime.now() >= timeout_time:
                    warnings.warn(
                        f"Timeout of {timeout_delta} was exceeded while waiting for online ingestion completion.",
                        stacklevel=1,
                    )
                    break

                time.sleep(options.get("period", 1))

                self.refresh()
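A hedged usage sketch for this class (fg and df are assumed handles for an existing feature group and an inserted dataframe; only OnlineIngestion and OnlineIngestionApi come from this PR):

from hsfs.core import online_ingestion, online_ingestion_api

# Create the server-side ingestion record sized to the dataframe being inserted
ingestion = online_ingestion_api.OnlineIngestionApi().create_online_ingestion(
    fg, online_ingestion.OnlineIngestion(num_entries=len(df))
)
# Poll with a progress bar for up to 120 s, refreshing every 2 s
ingestion.wait_for_completion(options={"timeout": 120, "period": 2})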
72 changes: 72 additions & 0 deletions python/hsfs/core/online_ingestion_api.py
@@ -0,0 +1,72 @@
#
# Copyright 2024 Hopsworks AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations

from typing import Dict, Optional

from hopsworks_common import client
from hsfs import feature_group as fg_mod
from hsfs.core import online_ingestion


class OnlineIngestionApi:
    def create_online_ingestion(
        self,
        feature_group_instance: fg_mod.FeatureGroup,
        online_ingestion_instance: online_ingestion.OnlineIngestion,
    ) -> online_ingestion.OnlineIngestion:
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            feature_group_instance.feature_store_id,
            "featuregroups",
            feature_group_instance.id,
            "online_ingestion",
        ]

        headers = {"content-type": "application/json"}
        return online_ingestion.OnlineIngestion.from_response_json(
            _client._send_request(
                "POST",
                path_params,
                headers=headers,
                data=online_ingestion_instance.json(),
            ),
            feature_group=feature_group_instance,
        )

    def get_online_ingestion(
        self,
        feature_group_instance: fg_mod.FeatureGroup,
        query_params: Optional[Dict[str, str]] = None,
    ) -> online_ingestion.OnlineIngestion:
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            feature_group_instance.feature_store_id,
            "featuregroups",
            feature_group_instance.id,
            "online_ingestion",
        ]

        return online_ingestion.OnlineIngestion.from_response_json(
            _client._send_request("GET", path_params, query_params),
            feature_group=feature_group_instance,
        )
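And a sketch of querying ingestion status directly, mirroring the filter that refresh() uses internally (fg is an assumed feature group handle, and the id is hypothetical):

from hsfs.core.online_ingestion_api import OnlineIngestionApi

status = OnlineIngestionApi().get_online_ingestion(
    fg, query_params={"filter_by": "ID:42"}  # hypothetical ingestion id
)
print(status.processed_entries, status.inserted_entries, status.aborted_entries)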