From 2c9b7a067ae49d937676c8a081529ec79d4a4887 Mon Sep 17 00:00:00 2001 From: "Ariel Shaqed (Scolnicov)" Date: Mon, 21 Aug 2023 10:10:06 +0300 Subject: [PATCH 1/7] Add "sbt publish" --- build.sbt | 20 ++++++++++++++++++++ project/build.properties | 1 + project/pgp-sign.sbt | 6 ++++++ project/plugins.sbt | 2 ++ 4 files changed, 29 insertions(+) create mode 100644 project/build.properties create mode 100644 project/pgp-sign.sbt create mode 100644 project/plugins.sbt diff --git a/build.sbt b/build.sbt index 0432905..076a8f0 100644 --- a/build.sbt +++ b/build.sbt @@ -1,4 +1,9 @@ +lazy val projectVersion = "0.0.1" + lazy val project = Project("lakefs-spark-extension", file("./src/")) + .settings( + version := projectVersion, + ) semanticdbEnabled := true semanticdbVersion := scalafixSemanticdb.revision @@ -14,6 +19,21 @@ libraryDependencies ++= Seq( "org.scalatestplus" %% "scalacheck-1-17" % "3.2.16.0" % "test", ) +val nexus = "https://s01.oss.sonatype.org/" +lazy val publishSettings = Seq( + publishTo := { + if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots") + else Some("releases" at nexus + "service/local/staging/deploy/maven2") + }, + // Remove all additional repository other than Maven Central from POM + pomIncludeRepository := { _ => false }, + credentials ++= Seq( + Credentials(Path.userHome / ".sbt" / "credentials"), + Credentials(Path.userHome / ".sbt" / "sonatype_credentials") + ) +) + +ThisBuild / isSnapshot := false ThisBuild / versionScheme := Some("early-semver") ThisBuild / organization := "io.lakefs" ThisBuild / organizationName := "Treeverse Labs" diff --git a/project/build.properties b/project/build.properties new file mode 100644 index 0000000..52413ab --- /dev/null +++ b/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.9.3 diff --git a/project/pgp-sign.sbt b/project/pgp-sign.sbt new file mode 100644 index 0000000..678cac5 --- /dev/null +++ b/project/pgp-sign.sbt @@ -0,0 +1,6 @@ +credentials += Credentials( + "GnuPG Key ID", + "gpg", + "F638279A7281EE8EAE58C4B98D3CFE7E7D0262DA", // key identifier + "ignored" // this field is ignored; passwords are supplied by pinentry +) diff --git a/project/plugins.sbt b/project/plugins.sbt new file mode 100644 index 0000000..82afe94 --- /dev/null +++ b/project/plugins.sbt @@ -0,0 +1,2 @@ +addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.11.0") +addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") From 6888772edb3903d25310d45662260073d8a7a2ad Mon Sep 17 00:00:00 2001 From: "Ariel Shaqed (Scolnicov)" Date: Mon, 21 Aug 2023 09:53:47 +0300 Subject: [PATCH 2/7] Add unit test --- build.sbt | 2 ++ .../lakefs/iceberg/extension/Extension.scala | 1 + .../iceberg/extension/ExtensionSpec.scala | 36 +++++++++++++++++++ .../iceberg/extension/SparkSessionSetup.scala | 34 ++++++++++++++++++ 4 files changed, 73 insertions(+) create mode 100644 src/test/scala/io/lakefs/iceberg/extension/ExtensionSpec.scala create mode 100644 src/test/scala/io/lakefs/iceberg/extension/SparkSessionSetup.scala diff --git a/build.sbt b/build.sbt index 076a8f0..3425d61 100644 --- a/build.sbt +++ b/build.sbt @@ -12,6 +12,8 @@ scalacOptions += "-Ywarn-unused-import" libraryDependencies ++= Seq( "io.lakefs" % "api-client" % "0.91.0", "org.apache.spark" %% "spark-sql" % "3.2.4" % "provided", + "org.apache.spark" %% "spark-hive" % "3.2.4" % "test", + "joda-time" % "joda-time" % "2.12.5" % "test", // TODO(ariels): Wrap api-client calls in: "dev.failsafe" % "failsafe" % "3.2.4", diff --git a/src/main/scala/io/lakefs/iceberg/extension/Extension.scala b/src/main/scala/io/lakefs/iceberg/extension/Extension.scala index 993b52a..f04ae97 100644 --- a/src/main/scala/io/lakefs/iceberg/extension/Extension.scala +++ b/src/main/scala/io/lakefs/iceberg/extension/Extension.scala @@ -50,6 +50,7 @@ object SchemaDiff { } val Seq(tablePrefix, fromSchema, toSchema, tableSuffix) = e.map(computeString) val sqlString = sql(tablePrefix, fromSchema, toSchema, tableSuffix) + print(s"[DEBUG] $sqlString") spark.sql(sqlString).queryExecution.logical } diff --git a/src/test/scala/io/lakefs/iceberg/extension/ExtensionSpec.scala b/src/test/scala/io/lakefs/iceberg/extension/ExtensionSpec.scala new file mode 100644 index 0000000..7864f7e --- /dev/null +++ b/src/test/scala/io/lakefs/iceberg/extension/ExtensionSpec.scala @@ -0,0 +1,36 @@ +package io.lakefs.iceberg.extension + +import org.scalatest.funspec.AnyFunSpec +import org.scalatest.matchers.should + +import org.apache.spark.sql.Row + +class ExtensionSpec extends AnyFunSpec + with SparkSessionWithExtensionSetup[LakeFSSparkSessionExtensions] + with should.Matchers { + + val _ = new LakeFSSparkSessionExtensions() + + describe("Extension") { + it("should SELECT") { + withSparkSession(spark => { + import spark.implicits._ + + spark.sql("CREATE DATABASE first") + val df = Seq(("a", 1), ("b", 2), ("c", 3)).toDF + df.writeTo("spark_catalog.first.table").create() + + spark.sql("CREATE DATABASE second") + val df2 = Seq(("a", 1), ("xyzzy", 2), ("c", 3), ("d", 4)).toDF + df2.writeTo("spark_catalog.second.table").create() + + val diff = spark.sql("SELECT * FROM schema_diff('spark_catalog', 'first', 'second', 'table')") + .collect() + .toSet + diff should equal(Set(Row("-", "b", 2), Row("+", "xyzzy", 2), Row("+", "d", 4))) + }) + } + + // TODO(ariels): Test SQL identifier quotation. + } +} diff --git a/src/test/scala/io/lakefs/iceberg/extension/SparkSessionSetup.scala b/src/test/scala/io/lakefs/iceberg/extension/SparkSessionSetup.scala new file mode 100644 index 0000000..8f9dce1 --- /dev/null +++ b/src/test/scala/io/lakefs/iceberg/extension/SparkSessionSetup.scala @@ -0,0 +1,34 @@ +package io.lakefs.iceberg.extension + +import java.nio.file.Files +import org.apache.commons.io.FileUtils +import scala.reflect.ClassTag + +import org.apache.spark.sql.SparkSession +import org.apache.spark.SparkConf + +trait SparkSessionWithExtensionSetup[E] { + def withSparkSession(testMethod: (SparkSession) => Any)(implicit tag: ClassTag[E]) { + val tmpDir = Files.createTempDirectory("sparktest").toString + val conf = new SparkConf() + .setMaster("local") + .setAppName("Spark test") + .set("spark.sql.extensions", tag.runtimeClass.getCanonicalName) + .set("spark.sql.warehouse.dir", tmpDir) + val spark = new SparkSession.Builder() + .appName("extension-test") + .config(conf) + .enableHiveSupport() + .getOrCreate + try { + testMethod(spark) + } finally { + // Clean up catalog dir + FileUtils.deleteDirectory(new java.io.File(tmpDir)) + // local metastore_db always created in current working directory, and + // current working directory cannot be changed in Java. + FileUtils.deleteDirectory(new java.io.File("./metastore_db")) + spark.close() + } + } +} From 3f4d3701031032751339bce2d5621896533a120f Mon Sep 17 00:00:00 2001 From: "Ariel Shaqed (Scolnicov)" Date: Mon, 21 Aug 2023 10:14:07 +0300 Subject: [PATCH 3/7] Add Scala CI workflow Based on GitHub's default. --- .github/workflows/ci-scala.yaml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .github/workflows/ci-scala.yaml diff --git a/.github/workflows/ci-scala.yaml b/.github/workflows/ci-scala.yaml new file mode 100644 index 0000000..50b6973 --- /dev/null +++ b/.github/workflows/ci-scala.yaml @@ -0,0 +1,30 @@ +name: Scala CI + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + workflow_dispatch: {} + +permissions: + contents: read + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK 18 + uses: actions/setup-java@v3 + with: + java-version: '18' + distribution: 'temurin' + cache: 'sbt' + - name: Run tests + run: sbt test + # Optional: This step uploads information to the GitHub dependency graph and unblocking Dependabot alerts for the repository + - name: Upload dependency graph + uses: scalacenter/sbt-dependency-submission@ab086b50c947c9774b70f39fc7f6e20ca2706c91 From a5d463c6ad9fb055887dd5d146edd650da093112 Mon Sep 17 00:00:00 2001 From: "Ariel Shaqed (Scolnicov)" Date: Mon, 21 Aug 2023 10:29:12 +0300 Subject: [PATCH 4/7] Use correct Java version Spark doesn't like a newer JRE. --- .github/workflows/ci-scala.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-scala.yaml b/.github/workflows/ci-scala.yaml index 50b6973..d7f16c1 100644 --- a/.github/workflows/ci-scala.yaml +++ b/.github/workflows/ci-scala.yaml @@ -17,10 +17,10 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Set up JDK 18 + - name: Set up JDK 11 uses: actions/setup-java@v3 with: - java-version: '18' + java-version: '11' distribution: 'temurin' cache: 'sbt' - name: Run tests From 63a9c2fc20a54500ae91cb80553bde3141632758 Mon Sep 17 00:00:00 2001 From: "Ariel Shaqed (Scolnicov)" Date: Mon, 21 Aug 2023 10:43:10 +0300 Subject: [PATCH 5/7] Run CI on any PR not just to main CI is currently so cheap, it's better to run it always. --- .github/workflows/ci-scala.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci-scala.yaml b/.github/workflows/ci-scala.yaml index d7f16c1..1ed4c8e 100644 --- a/.github/workflows/ci-scala.yaml +++ b/.github/workflows/ci-scala.yaml @@ -4,7 +4,6 @@ on: push: branches: [ "main" ] pull_request: - branches: [ "main" ] workflow_dispatch: {} permissions: From 675617deec60e4d26ebac8680c45aeb023c52175 Mon Sep 17 00:00:00 2001 From: "Ariel Shaqed (Scolnicov)" Date: Mon, 21 Aug 2023 10:48:49 +0300 Subject: [PATCH 6/7] [bug] Remove dependency graph plugin It fails reporting, and actions suggested in https://github.com/marketplace/actions/sbt-dependency-submission#unexpected-status-403 don't work and are also not particularly safe. --- .github/workflows/ci-scala.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/ci-scala.yaml b/.github/workflows/ci-scala.yaml index 1ed4c8e..ec021d3 100644 --- a/.github/workflows/ci-scala.yaml +++ b/.github/workflows/ci-scala.yaml @@ -3,7 +3,7 @@ name: Scala CI on: push: branches: [ "main" ] - pull_request: + pull_request: {} workflow_dispatch: {} permissions: @@ -24,6 +24,3 @@ jobs: cache: 'sbt' - name: Run tests run: sbt test - # Optional: This step uploads information to the GitHub dependency graph and unblocking Dependabot alerts for the repository - - name: Upload dependency graph - uses: scalacenter/sbt-dependency-submission@ab086b50c947c9774b70f39fc7f6e20ca2706c91 From a2ccb7d23e66c9e8214ad544d0f1f1a59efcea69 Mon Sep 17 00:00:00 2001 From: "Ariel Shaqed (Scolnicov)" Date: Mon, 21 Aug 2023 11:59:37 +0300 Subject: [PATCH 7/7] [CR] Bug: Remove debug print --- src/main/scala/io/lakefs/iceberg/extension/Extension.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/scala/io/lakefs/iceberg/extension/Extension.scala b/src/main/scala/io/lakefs/iceberg/extension/Extension.scala index f04ae97..993b52a 100644 --- a/src/main/scala/io/lakefs/iceberg/extension/Extension.scala +++ b/src/main/scala/io/lakefs/iceberg/extension/Extension.scala @@ -50,7 +50,6 @@ object SchemaDiff { } val Seq(tablePrefix, fromSchema, toSchema, tableSuffix) = e.map(computeString) val sqlString = sql(tablePrefix, fromSchema, toSchema, tableSuffix) - print(s"[DEBUG] $sqlString") spark.sql(sqlString).queryExecution.logical }