From 4e827bc1262dee95d49615c23065e5458e640a34 Mon Sep 17 00:00:00 2001 From: Aleksey Veresov Date: Wed, 4 Dec 2024 21:33:03 +0100 Subject: [PATCH] Fix engine choice in case of connection to serverless --- python/hopsworks_common/connection.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/python/hopsworks_common/connection.py b/python/hopsworks_common/connection.py index ea954b79a..86111eabb 100644 --- a/python/hopsworks_common/connection.py +++ b/python/hopsworks_common/connection.py @@ -24,7 +24,7 @@ import weakref from typing import Any, Optional -from hopsworks_common import client, usage, util, version +from hopsworks_common import client, constants, usage, util, version from hopsworks_common.core import ( hosts_api, project_api, @@ -99,8 +99,8 @@ class Connection: defaults to the project from where the client is run from. Defaults to `None`. engine: Specifies the engine to use. Possible options are "spark", "python", "training", "spark-no-metastore", or "spark-delta". The default value is None, which automatically selects the engine based on the environment: - "spark": Used if Spark is available, such as in Hopsworks or Databricks environments. - "python": Used in local Python environments or AWS SageMaker when Spark is not available. + "spark": Used if Spark is available and the connection is not to serverless Hopsworks, such as in Hopsworks or Databricks environments. + "python": Used in local Python environments or AWS SageMaker when Spark is not available or when connecting to serverless Hopsworks. "training": Used when only feature store metadata is needed, such as for obtaining training dataset locations and label information during Hopsworks training experiments. "spark-no-metastore": Functions like "spark" but does not rely on the Hive metastore. "spark-delta": Minimizes dependencies further by avoiding both Hive metastore and HopsFS. 
@@ -339,12 +339,15 @@ def connect(self) -> None: try: # determine engine, needed to init client if (self._engine is not None and self._engine.lower() == "spark") or ( - self._engine is None and importlib.util.find_spec("pyspark") + self._engine is None + and importlib.util.find_spec("pyspark") + and ( + client.base.Client.REST_ENDPOINT in os.environ + or self._host != constants.HOSTS.APP_HOST + ) ): self._engine = "spark" - elif (self._engine is not None and self._engine.lower() == "python") or ( - self._engine is None and not importlib.util.find_spec("pyspark") - ): + elif self._engine is None or self._engine.lower() == "python": self._engine = "python" elif self._engine is not None and self._engine.lower() == "training": self._engine = "training" @@ -353,10 +356,7 @@ def connect(self) -> None: and self._engine.lower() == "spark-no-metastore" ): self._engine = "spark-no-metastore" - elif ( - self._engine is not None - and self._engine.lower() == "spark-delta" - ): + elif self._engine is not None and self._engine.lower() == "spark-delta": self._engine = "spark-delta" else: raise ConnectionError(