
Commit

Merge branch 'main' into feature/prompt-engineering-resource
kkiani authored Oct 31, 2024
2 parents 659b707 + 2259cfa commit 72ac6b0
Showing 12 changed files with 477 additions and 74 deletions.
20 changes: 20 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,23 @@
## v0.7.0 (2024-10-14)

### Feat

- add Java for running spark related applications
- add Job schedule and checkpoints bucket
- work on Glue component

### Fix

- Pulumi args
- application example

### Refactor

- force creating required buckets
- apply PR reviews
- Fix comments and types
- fix format

## v0.6.0 (2024-10-10)

### Feat
4 changes: 4 additions & 0 deletions devenv.nix
@@ -55,6 +55,10 @@
};
};

# Java is required for PySpark
languages.java.enable = true;
languages.java.jdk.package = pkgs.jdk8; # Java version running on AWS Glue

enterShell = ''
hello
pdm install
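The devenv.nix change pins JDK 8 so the local toolchain matches the Java runtime on AWS Glue. As a quick sanity check (a sketch, not part of this commit; it assumes pyspark is installed, e.g. via the `pdm install` in enterShell), you can ask a local Spark session which Java it is running on:

    # Report the Java version backing the local Spark JVM; with pkgs.jdk8
    # active in the devenv shell, this should print a 1.8.x version.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("jdk-check").getOrCreate()
    # _jvm is py4j's gateway into the driver JVM.
    java_version = spark.sparkContext._jvm.java.lang.System.getProperty("java.version")
    print(f"Spark is running on Java {java_version}")
    spark.stop()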
12 changes: 3 additions & 9 deletions examples/sparkle/Pulumi.yaml
@@ -1,11 +1,5 @@
name: object_storage
name: simple-spark-application
runtime:
name: python
options:
toolchain: pip
virtualenv: venv
description: A minimal Azure Native Python Pulumi program
config:
pulumi:tags:
value:
pulumi:template: azure-python
description: A minimal spark application that uses Sparkle
region: eu-west-1
8 changes: 3 additions & 5 deletions examples/sparkle/__main__.py
@@ -1,4 +1,3 @@
import os
from damavand.cloud.provider import AwsProvider
from damavand.factories import SparkControllerFactory

@@ -10,7 +9,7 @@ def main() -> None:
spark_factory = SparkControllerFactory(
provider=AwsProvider(
app_name="my-app",
region="us-west-2",
region="eu-west-1",
),
tags={"env": "dev"},
)
@@ -22,10 +21,9 @@
CustomerOrders(),
],
)
# app_name = os.getenv("APP_NAME", "products-app") # Get app name on runtime

app_name = os.getenv("APP_NAME", "default_app") # Get app name on runtime

spark_controller.run_application(app_name)
# spark_controller.run_application(app_name)
spark_controller.provision()


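Taken together, the __main__.py hunks switch the example from running an application to provisioning one. A condensed sketch of the resulting flow (assembled from the hunks above; the `spark_factory.new(...)` call and the `applications` imports are elided in the diff and assumed from the repository's example):

    from damavand.cloud.provider import AwsProvider
    from damavand.factories import SparkControllerFactory

    from applications.orders import CustomerOrders
    from applications.products import Products


    def main() -> None:
        # Target AWS in eu-west-1, tagging everything as dev.
        spark_factory = SparkControllerFactory(
            provider=AwsProvider(app_name="my-app", region="eu-west-1"),
            tags={"env": "dev"},
        )

        # Register both example applications with the controller (the exact
        # factory method is an assumption based on the surrounding context).
        spark_controller = spark_factory.new(
            name="my-spark",
            applications=[Products(), CustomerOrders()],
        )

        # Provision cloud resources; run_application stays commented out upstream.
        spark_controller.provision()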
15 changes: 14 additions & 1 deletion examples/sparkle/applications/orders.py
@@ -1,4 +1,5 @@
from sparkle.config import Config
from sparkle.config import Config, IcebergConfig, KafkaReaderConfig
from sparkle.config.kafka_config import KafkaConfig, Credentials
from sparkle.writer.iceberg_writer import IcebergWriter
from sparkle.application import Sparkle
from sparkle.reader.kafka_reader import KafkaReader
@@ -15,6 +16,18 @@ def __init__(self):
version="0.0.1",
database_bucket="s3://test-bucket",
checkpoints_bucket="s3://test-checkpoints",
iceberg_output=IcebergConfig(
database_name="all_products",
database_path="",
table_name="orders_v1",
),
kafka_input=KafkaReaderConfig(
KafkaConfig(
bootstrap_servers="localhost:9119",
credentials=Credentials("test", "test"),
),
kafka_topic="src_orders_v1",
),
),
readers={"orders": KafkaReader},
writers=[IcebergWriter],
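The interesting part of orders.py is the new Config wiring: a Kafka topic in, an Iceberg table out. A tiny smoke test makes that visible (hypothetical; it assumes Config exposes these fields as attributes, which the `app.config.app_id` access in spark.py below suggests):

    from applications.orders import CustomerOrders

    orders = CustomerOrders()

    # Source: local test broker on port 9119, topic src_orders_v1.
    print(orders.config.kafka_input)

    # Sink: Iceberg table orders_v1 in the all_products database.
    print(orders.config.iceberg_output)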
14 changes: 11 additions & 3 deletions examples/sparkle/applications/products.py
@@ -1,7 +1,6 @@
from sparkle.application import Sparkle
from sparkle.config import Config
from sparkle.config import Config, IcebergConfig, TableConfig
from sparkle.writer.iceberg_writer import IcebergWriter
from sparkle.writer.kafka_writer import KafkaStreamPublisher
from sparkle.reader.table_reader import TableReader

from pyspark.sql import DataFrame
@@ -16,11 +15,20 @@ def __init__(self):
version="0.0.1",
database_bucket="s3://test-bucket",
checkpoints_bucket="s3://test-checkpoints",
iceberg_output=IcebergConfig(
database_name="all_products",
database_path="",
table_name="products_v1",
),
hive_table_input=TableConfig(
database="source_database",
table="products_v1",
bucket="",
),
),
readers={"products": TableReader},
writers=[
IcebergWriter,
KafkaStreamPublisher,
],
)

19 changes: 19 additions & 0 deletions pdm.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion src/damavand/__init__.py
@@ -1 +1 @@
__version__ = "0.6.0"
__version__ = "0.7.0"
1 change: 0 additions & 1 deletion src/damavand/base/controllers/spark.py
@@ -54,7 +54,6 @@ def application_with_id(self, app_id: str) -> Sparkle:
Returns:
Sparkle: The Spark application.
"""

for app in self.applications:
if app.config.app_id == app_id:
return app
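For reference, application_with_id is a linear scan over the registered applications keyed on each app's config.app_id. A self-contained sketch of the pattern (what the real method does when nothing matches lies past the visible hunk, so the raise below is an assumption):

    class SparkController:
        """Sketch of the lookup shown above, reduced to the relevant parts."""

        def __init__(self, applications):
            self.applications = applications

        def application_with_id(self, app_id: str):
            # Return the registered Spark application matching app_id.
            for app in self.applications:
                if app.config.app_id == app_id:
                    return app
            # Miss behavior is outside the visible diff; raising is one option.
            raise ValueError(f"No Spark application with id {app_id!r}")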
(The remaining changed files are not shown.)
