changed to data-set, added test cases

treeverse · Oct 26, 2023 · 76a4187 · 76a4187
1 parent 5d68471
commit 76a4187
Showing 1 changed file with 18 additions and 4 deletions.
diff --git a/test/test_iceberg.py b/test/test_iceberg.py
@@ -11,14 +11,28 @@
 from pyspark.sql import SparkSession
 from pyspark.conf import SparkConf
 
+
+def get_data(spark):
+    data_set = [(1, "James", "Smith", 32, "M"),
+                (2, "Michael","Rose", 35 ,"M"),
+                (3, "Robert", "Williams", 41, "M"),
+                (4, "Maria", "Jones", 36, "F"),
+                (5, "Jen","Brown", 44, "F"),
+                (6, "Monika","Geller", 31, "F")]
+    df = spark.createDataFrame(data_set).coalesce(1)
+    return df
+
 def test_diff_two_same_branches(spark, lfs_client, lakefs_repo):
     print("repo name ", lakefs_repo)
-    df = spark.read.option("inferSchema","true").option("multiline","true").csv("./test/data-sets/film_permits.csv")
-    df.write.saveAsTable("lakefs.main.nyc.permits")
+    df = get_data(spark)
+    df.write.saveAsTable("lakefs.main.company.workers")
 
     #Commit, create a new branch, check that the tables are the same
     lfs_client.commits.commit(lakefs_repo, "main", CommitCreation(message="Initial data load"))
     lfs_client.branches.create_branch(lakefs_repo, BranchCreation(name="dev", source="main"))
-    df_main = spark.read.table("lakefs.main.nyc.permits")
-    df_dev = spark.read.table("lakefs.dev.nyc.permits")
+    df_main = spark.read.table("lakefs.main.company.workers")
+    df_dev = spark.read.table("lakefs.dev.company.workers")
     assert (df_main.schema == df_dev.schema) and (df_main.collect() == df_dev.collect()), "main and dev tables should be equal"
+
+# def delete_on_dev_and_merge(spark, lfs_client, lakefs_repo):
+