Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FSTORE-1633] Fix engine choice in case of connection to serverless #424

Merged
merged 1 commit into from
Dec 5, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 18 additions & 18 deletions python/hopsworks_common/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import weakref
from typing import Any, Optional

from hopsworks_common import client, usage, util, version
from hopsworks_common import client, constants, usage, util, version
from hopsworks_common.core import (
hosts_api,
project_api,
Expand Down Expand Up @@ -99,8 +99,8 @@ class Connection:
defaults to the project from where the client is run from.
Defaults to `None`.
engine: Specifies the engine to use. Possible options are "spark", "python", "training", "spark-no-metastore", or "spark-delta". The default value is None, which automatically selects the engine based on the environment:
"spark": Used if Spark is available, such as in Hopsworks or Databricks environments.
"python": Used in local Python environments or AWS SageMaker when Spark is not available.
"spark": Used if Spark is available (such as in Hopsworks or Databricks environments) and the connection is not to serverless Hopsworks.
"python": Used in local Python environments or AWS SageMaker when Spark is not available, or when connecting to serverless Hopsworks.
"training": Used when only feature store metadata is needed, such as for obtaining training dataset locations and label information during Hopsworks training experiments.
"spark-no-metastore": Functions like "spark" but does not rely on the Hive metastore.
"spark-delta": Minimizes dependencies further by avoiding both Hive metastore and HopsFS.
Expand Down Expand Up @@ -337,26 +337,26 @@ def connect(self) -> None:
self._connected = True
finalizer = weakref.finalize(self, self.close)
try:
external = client.base.Client.REST_ENDPOINT not in os.environ
serverless = self._host == constants.HOSTS.APP_HOST
# determine engine, needed to init client
if (self._engine is not None and self._engine.lower() == "spark") or (
self._engine is None and importlib.util.find_spec("pyspark")
if (
self._engine is None
and importlib.util.find_spec("pyspark")
and (not external or not serverless)
):
self._engine = "spark"
elif (self._engine is not None and self._engine.lower() == "python") or (
self._engine is None and not importlib.util.find_spec("pyspark")
):
elif self._engine is None:
self._engine = "python"
elif self._engine.lower() == "spark":
self._engine = "spark"
elif self._engine.lower() == "python":
self._engine = "python"
elif self._engine is not None and self._engine.lower() == "training":
elif self._engine.lower() == "training":
self._engine = "training"
elif (
self._engine is not None
and self._engine.lower() == "spark-no-metastore"
):
elif self._engine.lower() == "spark-no-metastore":
self._engine = "spark-no-metastore"
elif (
self._engine is not None
and self._engine.lower() == "spark-delta"
):
elif self._engine.lower() == "spark-delta":
self._engine = "spark-delta"
else:
raise ConnectionError(
Expand All @@ -365,7 +365,7 @@ def connect(self) -> None:
)

# init client
if client.base.Client.REST_ENDPOINT not in os.environ:
if external:
client.init(
"external",
self._host,
Expand Down
Loading