replace - from output column names with _
manu-sj committed Jun 17, 2024
1 parent 64f34cd commit 9891900
Showing 4 changed files with 14 additions and 14 deletions.
2 changes: 1 addition & 1 deletion python/hsfs/hopsworks_udf.py
@@ -366,7 +366,7 @@ def _get_output_column_names(self) -> str:
`List[str]`: List of feature names for the transformed columns
"""
_BASE_COLUMN_NAME = (
f'{self.function_name}_{"-".join(self.transformation_features)}_'
f'{self.function_name}_{"_".join(self.transformation_features)}_'
)
if len(self.return_types) > 1:
return [f"{_BASE_COLUMN_NAME}{i}" for i in range(len(self.return_types))]
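For context, the naming rule this commit changes can be sketched in isolation (a minimal sketch with illustrative values, mirroring the logic of _get_output_column_names above and the test expectations below; it is not the Hopsworks module itself):

# Standalone sketch of the output-column naming rule after this commit.
function_name = "plus_two"
transformation_features = ["col1", "col2"]  # names of the input features
return_types = [int, int]  # one output column per declared return type

# Input feature names are now joined with "_" instead of "-".
base_column_name = f'{function_name}_{"_".join(transformation_features)}_'

# Multi-output UDFs get a numeric suffix per output; a single-output UDF keeps
# the base name, trailing underscore included (see test_hopswork_udf.py below).
if len(return_types) > 1:
    output_columns = [f"{base_column_name}{i}" for i in range(len(return_types))]
else:
    output_columns = [base_column_name]

print(output_columns)  # ['plus_two_col1_col2_0', 'plus_two_col1_col2_1']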
12 changes: 6 additions & 6 deletions python/tests/engine/test_python.py
@@ -3354,12 +3354,12 @@ def plus_two(col1, col2):
)

# Assert
assert all(result.columns == ["plus_two_col1-col2_0", "plus_two_col1-col2_1"])
assert all(result.columns == ["plus_two_col1_col2_0", "plus_two_col1_col2_1"])
assert len(result) == 2
assert result["plus_two_col1-col2_0"][0] == 2
assert result["plus_two_col1-col2_0"][1] == 3
assert result["plus_two_col1-col2_1"][0] == 12
assert result["plus_two_col1-col2_1"][1] == 13
assert result["plus_two_col1_col2_0"][0] == 2
assert result["plus_two_col1_col2_0"][1] == 3
assert result["plus_two_col1_col2_1"][0] == 12
assert result["plus_two_col1_col2_1"][1] == 13

def test_apply_transformation_function_polars(self, mocker):
# Arrange
@@ -3854,7 +3854,7 @@ def test_materialization_kafka_first_job_execution(self, mocker):
args="defaults tests_offsets",
await_termination=False,
)

def test_materialization_kafka_skip_offsets(self, mocker):
# Arrange
mocker.patch("hsfs.engine.python.Engine._get_kafka_config", return_value={})
4 changes: 2 additions & 2 deletions python/tests/engine/test_spark.py
@@ -5,7 +5,7 @@
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
@@ -937,7 +937,7 @@ def test_save_stream_dataframe(self, mocker, backend_fixtures):
mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[
0
][1]
== f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint"
== f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}_checkpoint"
)
assert (
mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[
@@ -1053,7 +1053,7 @@ def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures):
mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[
0
][1]
== "/Projects/test_project_name/Resources/test_query_name-checkpoint"
== "/Projects/test_project_name/Resources/test_query_name_checkpoint"
)
assert (
mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[
@@ -1293,7 +1293,7 @@ def test_save_stream_dataframe_await_termination(self, mocker, backend_fixtures)
mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[
0
][1]
== f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint"
== f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}_checkpoint"
)
assert (
mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[
@@ -2456,7 +2456,7 @@ def test_time_series_split_date(self, mocker):
d = {
"col_0": [1, 2],
"col_1": ["test_1", "test_2"],
"event_time": ["2017-03-04", "2017-03-05"],
"event_time": ["2017_03_04", "2017_03_05"],
}
df = pd.DataFrame(data=d)

@@ -2516,7 +2516,7 @@ def test_time_series_split_timestamp(self, mocker):
d = {
"col_0": [1, 2],
"col_1": ["test_1", "test_2"],
"event_time": ["2017-03-04", "2017-03-05"],
"event_time": ["2017_03_04", "2017_03_05"],
}
df = pd.DataFrame(data=d)

@@ -3809,7 +3809,7 @@ def __init__(self, label, index):
"double": ["1"],
"timestamp": [1641340800000],
"boolean": ["False"],
"date": ["2022-01-27"],
"date": ["2022_01_27"],
"binary": ["1"],
"array<string>": [["123"]],
"struc": [LabelIndex("0", "1")],
@@ -4212,11 +4212,11 @@ def test_setup_s3_hadoop_conf(self, mocker):
"fs.s3a.secret.key", s3_connector.secret_key
)
mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call(
"fs.s3a.server-side-encryption-algorithm",
"fs.s3a.server_side_encryption_algorithm",
s3_connector.server_encryption_algorithm,
)
mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call(
"fs.s3a.server-side-encryption-key", s3_connector.server_encryption_key
"fs.s3a.server_side_encryption_key", s3_connector.server_encryption_key
)
mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call(
"fs.s3a.aws.credentials.provider",
@@ -4487,8 +4487,8 @@ def test(col1, col2):
expected_df = pd.DataFrame(
data={
"col_1": ["test_1", "test_2"],
"test_col_0-col_2_0": [2, 3],
"test_col_0-col_2_1": [12, 13],
"test_col_0_col_2_0": [2, 3],
"test_col_0_col_2_1": [12, 13],
}
)  # TODO: why doesn't it return int?

@@ -4514,7 +4514,7 @@ def test_setup_gcp_hadoop_conf(self, mocker):

content = (
'{"type": "service_account", "project_id": "test", "private_key_id": "123456", '
'"private_key": "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", '
'"private_key": "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", '
'"client_email": "test@project.iam.gserviceaccount.com"}'
)
credentialsFile = "keyFile.json"
@@ -4563,7 +4563,7 @@ def test_setup_gcp_hadoop_conf(self, mocker):
)
mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call(
"fs.gs.auth.service.account.private.key",
"-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----",
"_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____",
)
mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.unset.assert_any_call(
"fs.gs.encryption.algorithm"
@@ -4586,7 +4586,7 @@ def test_setup_gcp_hadoop_conf_algorithm(self, mocker):

content = (
'{"type": "service_account", "project_id": "test", "private_key_id": "123456", '
'"private_key": "-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----", '
'"private_key": "_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____", '
'"client_email": "test@project.iam.gserviceaccount.com"}'
)
credentialsFile = "keyFile.json"
@@ -4650,7 +4650,7 @@ def test_setup_gcp_hadoop_conf_algorithm(self, mocker):
)
mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call(
"fs.gs.auth.service.account.private.key",
"-----BEGIN PRIVATE KEY-----test-----END PRIVATE KEY-----",
"_____BEGIN PRIVATE KEY_____test_____END PRIVATE KEY_____",
)

def test_get_unique_values(self):
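A likely practical effect of the rename, illustrated below (an assumption about the motivation, not stated in the commit; the snippet assumes a local pyspark installation and is not part of the diff): underscore-only column names are valid unquoted identifiers in Spark SQL, while the old hyphenated names had to be backtick-quoted because an unquoted hyphen parses as subtraction.

# Illustration only: querying the new-style column names in Spark SQL.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(2, 12)], ["plus_two_col1_col2_0", "plus_two_col1_col2_1"])
df.createOrReplaceTempView("transformed")

# Works as-is with underscore-only names:
spark.sql("SELECT plus_two_col1_col2_0 FROM transformed").show()

# The old name would have needed backticks:
#   SELECT `plus_two_col1-col2_0` FROM transformed
# since plus_two_col1-col2_0 unquoted parses as "plus_two_col1 minus col2_0".
spark.stop()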
10 changes: 5 additions & 5 deletions python/tests/test_hopswork_udf.py
@@ -337,7 +337,7 @@ def test_generate_output_column_names_multiple_argument_one_output_type(self):
def test_func(col1, col2, col3):
return col1 + 1

assert test_func._get_output_column_names() == ["test_func_col1-col2-col3_"]
assert test_func._get_output_column_names() == ["test_func_col1_col2_col3_"]

def test_generate_output_column_names_single_argument_multiple_output_type(self):
@udf([int, float, int])
@@ -360,9 +360,9 @@ def test_func(col1, col2, col3):
)

assert test_func._get_output_column_names() == [
"test_func_col1-col2-col3_0",
"test_func_col1-col2-col3_1",
"test_func_col1-col2-col3_2",
"test_func_col1_col2_col3_0",
"test_func_col1_col2_col3_1",
"test_func_col1_col2_col3_2",
]

def test_create_pandas_udf_return_schema_from_list_one_output_type(self):
@@ -422,7 +422,7 @@ def test_func(col1, col2):
test_dataframe["column1"], test_dataframe["column2"]
)

assert all(result.columns == ["test_func_col1-col2_0", "test_func_col1-col2_1"])
assert all(result.columns == ["test_func_col1_col2_0", "test_func_col1_col2_1"])
assert result.values.tolist() == [[2, 12], [3, 22], [4, 32], [5, 42]]

def test_HopsworkUDf_call_one_argument(self):
