
Commit

0.1.13 release (#44)
* Change version to 0.1.13-SNAPSHOT

* Update project dependencies

Updated multiple dependencies across the project, including the sbt version, plugin versions, and library versions. These changes keep the project up to date and allow it to benefit from fixes in the newer releases of these dependencies.

* Update Spark versions

This change adds Spark 3.5.1 as a supported version.

* Update the Apache Commons Compress library to resolve a CVE warning

* Add a new option to disable evaluation of formulas when reading the Excel source (a usage sketch follows the commit details below)

* Update dependencies and refactor actions workflow (#41)

* Update Apache POI to 5.2.5 and Spark to 3.5.1

* Update scalaTestVersion

* Refactor actions workflow

* Roll back the Scala version update due to incompatibilities with the coverage dependency

* Revert package upload path in actions

* Add the latest 3.4.x release (3.4.3)

* Dazfuller/no data fix (#43)

* Implement a fix to handle files which contain no data (only headers)

* Add new resource files for tests

* Add tests to cover the new cases

* Change version to 0.1.13

---------

Co-authored-by: Jose Soto <josecsmorales@gmail.com>
dazfuller and josecsotomorales authored Jun 29, 2024
1 parent 01ee129 commit 5dc22b4
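
The new evaluateFormulae option added in this release is exposed through the standard DataFrame reader options. A minimal usage sketch (the format name and option key are taken from this change; the SparkSession setup and file path are illustrative only):

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().appName("excel-formulae-example").getOrCreate()

    // Read an Excel file without evaluating formula cells; evaluateFormulae defaults to "true"
    val df = spark.read
      .format("com.elastacloud.spark.excel") // the short name "excel" is also registered
      .option("evaluateFormulae", "false")
      .load("/data/report.xlsx")             // hypothetical path
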
Showing 15 changed files with 273 additions and 90 deletions.
22 changes: 15 additions & 7 deletions .github/workflows/spark.yml
@@ -7,40 +7,48 @@ on:
types: [ opened, reopened ]
workflow_dispatch:


jobs:
build:
strategy:
matrix:
sparkVersion: [ 3.0.1, 3.0.2, 3.0.3, 3.1.2, 3.2.1, 3.2.4, 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.4.0, 3.4.1, 3.5.0 ]
sparkVersion: [ 3.0.1, 3.0.2, 3.0.3, 3.1.2, 3.2.1, 3.2.4, 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.4.0, 3.4.1, 3.4.3, 3.5.0, 3.5.1 ]

runs-on: ubuntu-latest

steps:

- name: Checkout with LFS
uses: actions/checkout@v3.5.2
uses: actions/checkout@v4.1.7
with:
lfs: true

- name: Set up JDK 8
uses: actions/setup-java@v3.11.0
uses: actions/setup-java@v4.2.1
with:
java-version: '8'
distribution: 'adopt'

- name: Cache SBT dependencies
uses: actions/cache@v4.0.2
with:
path: |
~/.ivy2/cache
~/.sbt
~/.coursier
key: ${{ runner.os }}-sbt-${{ hashFiles('**/*.sbt') }}-${{ matrix.sparkVersion }}
restore-keys: |
${{ runner.os }}-sbt-${{ matrix.sparkVersion }}
- name: Run tests and produce coverage
run: sbt -DsparkVersion="${{matrix.sparkVersion}}" clean coverageOn test coverageReport

- name: Upload coverage to CodeCov
uses: codecov/codecov-action@v3.1.4
uses: codecov/codecov-action@v4.5.0
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: ./target/spark-${{ matrix.sparkVersion }}/scala-2.12/coverage-report/cobertura.xml
env_vars: ${{ matrix.sparkVersion }}
fail_ci_if_error: true
name: spark-excel
# path_to_write_report: ./target/spark-${{ matrix.sparkVersion }}/scala-2.12/coverage-report/codecov_report.txt
verbose: true

- name: Create assembly
2 changes: 1 addition & 1 deletion build.ps1
@@ -14,7 +14,7 @@
https://www.elastacloud.com
#>

$versions = @("3.0.1", "3.0.2", "3.1.2", "3.2.1", "3.2.4", "3.3.0", "3.3.1", "3.3.2", "3.4.0", "3.4.1")
$versions = @("3.0.1", "3.0.2", "3.1.2", "3.2.1", "3.2.4", "3.3.0", "3.3.1", "3.3.2", "3.4.0", "3.4.1", "3.5.0", "3.5.1")
$jarPath = "./target/jars"
$covPath = "./target/coverage"

14 changes: 7 additions & 7 deletions build.sbt
@@ -60,9 +60,9 @@ libraryDependencies ++= Seq(
"org.apache.poi" % "poi" % poiVersion.value % Compile,
"org.apache.poi" % "poi-ooxml" % poiVersion.value % Compile,
"org.apache.poi" % "poi-ooxml-lite" % poiVersion.value % Compile,
"org.apache.commons" % "commons-compress" % "1.21" % Compile,
"org.apache.commons" % "commons-compress" % "1.26.1" % Compile,
"org.apache.commons" % "commons-collections4" % "4.4" % Compile,
"commons-io" % "commons-io" % "2.11.0" % Compile,
"commons-io" % "commons-io" % "2.16.1" % Compile,
"org.apache.logging.log4j" % "log4j-core" % log4JVersion.value % Compile,
"org.apache.logging.log4j" % "log4j-api" % log4JVersion.value % Compile
)
@@ -120,8 +120,8 @@ addArtifact(Compile / assembly / artifact, assembly)

// Define common settings for the library
val commonSettings = Seq(
sparkVersion := System.getProperty("sparkVersion", "3.5.0"),
sparkExcelVersion := "0.1.12",
sparkVersion := System.getProperty("sparkVersion", "3.5.1"),
sparkExcelVersion := "0.1.13",
version := s"${sparkVersion.value}_${sparkExcelVersion.value}",
scalaVersion := {
if (sparkVersion.value < "3.2.0") {
@@ -132,8 +132,8 @@ val commonSettings = Seq(
"2.12.15"
}
},
scalaTestVersion := "3.2.16",
poiVersion := "5.2.3",
log4JVersion := "2.20.0",
scalaTestVersion := "3.2.18",
poiVersion := "5.2.5",
log4JVersion := "2.23.1",
crossVersion := CrossVersion.disabled
)
2 changes: 1 addition & 1 deletion project/assembly.sbt
@@ -1 +1 @@
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.2.0")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.5")
2 changes: 1 addition & 1 deletion project/build.properties
@@ -1 +1 @@
sbt.version = 1.5.3
sbt.version = 1.9.8
2 changes: 1 addition & 1 deletion project/plugins.sbt
@@ -1 +1 @@
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.3")
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.9")
ExcelParserOptions.scala
@@ -62,6 +62,7 @@ private[excel] class ExcelParserOptions(
val includeSheetName: Boolean = parameters.getOrElse("includeSheetName", "false").toBoolean
val nulLValue: Option[String] = parameters.get("nullValue")
val thresholdBytesForTempFiles: Int = parameters.getOrElse("thresholdBytesForTempFiles", parameters.getOrElse("maxBytesForTempFiles", "100000000")).toInt
val evaluateFormulae: Boolean = parameters.getOrElse("evaluateFormulae", "true").toBoolean

val schemaMatchColumnName: String = parameters.getOrElse("schemaMatchColumnName", null)
if (schemaMatchColumnName != null && schemaMatchColumnName.trim.isEmpty) {
@@ -87,7 +88,8 @@ private[excel] object ExcelParserOptions {
encoder.encode("nullValue") -> "nullValue",
encoder.encode("maxBytesForTempFiles") -> "maxBytesForTempFiles",
encoder.encode("thresholdBytesForTempFiles") -> "thresholdBytesForTempFiles",
encoder.encode("schemaMatchColumnName") -> "schemaMatchColumnName"
encoder.encode("schemaMatchColumnName") -> "schemaMatchColumnName",
encoder.encode("evaluateFormulae") -> "evaluateFormulae"
)

/**
147 changes: 106 additions & 41 deletions src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
@@ -66,7 +66,11 @@ private[excel] class ExcelParser(inputStream: InputStream, options: ExcelParserOptions
/**
* An instance of the formula evaluate for the current workbook
*/
private val formulaEvaluator = workBook.getCreationHelper.createFormulaEvaluator()
private val formulaEvaluator = if (options.evaluateFormulae) {
Some(workBook.getCreationHelper.createFormulaEvaluator())
} else {
None
}

/**
* The indexes of the worksheets which match the sheet name regular expression pattern
@@ -265,42 +269,89 @@ private[excel] class ExcelParser(inputStream: InputStream, options: ExcelParserOptions
return (null, targetNullable)
}

val currentCellValue = formulaEvaluator.evaluate(currentCell)
currentCellValue.getCellType match {
val evaluatedFormulaCell = formulaEvaluator match {
case Some(evaluator) => Some(evaluator.evaluate(currentCell))
case None => None
}

val cellType = evaluatedFormulaCell match {
case Some(evaluatedCell) => evaluatedCell.getCellType
case None => currentCell.getCellType
}

cellType match {
case CellType._NONE | CellType.BLANK | CellType.ERROR => (null, targetNullable)
case CellType.BOOLEAN => targetType match {
case _: StringType => (UTF8String.fromString(currentCellValue.getBooleanValue.toString), true)
case _: BooleanType => (currentCellValue.getBooleanValue, true)
case _: StringType =>
evaluatedFormulaCell match {
case Some(evaluatedCell) => (UTF8String.fromString(evaluatedCell.getBooleanValue.toString), true)
case None => (UTF8String.fromString(currentCell.getBooleanCellValue.toString), true)
}
case _: BooleanType =>
evaluatedFormulaCell match {
case Some(evaluatedCell) => (evaluatedCell.getBooleanValue, true)
case None => (currentCell.getBooleanCellValue, true)
}
case _ => (null, false)
}
case CellType.NUMERIC => targetType match {
case _: StringType => if (DateUtil.isCellDateFormatted(currentCell)) {
(UTF8String.fromString(DateUtil.getLocalDateTime(currentCellValue.getNumberValue).format(DateTimeFormatter.ISO_DATE_TIME)), true)
evaluatedFormulaCell match {
case Some(evaluatedCell) => (UTF8String.fromString(DateUtil.getLocalDateTime(evaluatedCell.getNumberValue).format(DateTimeFormatter.ISO_DATE_TIME)), true)
case None => (UTF8String.fromString(DateUtil.getLocalDateTime(currentCell.getNumericCellValue).format(DateTimeFormatter.ISO_DATE_TIME)), true)
}
} else {
(UTF8String.fromString(currentCellValue.getNumberValue.toString), true)
evaluatedFormulaCell match {
case Some(evaluatedCell) => (UTF8String.fromString(evaluatedCell.getNumberValue.toString), true)
case None => (UTF8String.fromString(currentCell.getNumericCellValue.toString), true)
}
}
case _: TimestampType if DateUtil.isCellDateFormatted(currentCell) =>
val ts = Timestamp.valueOf(DateUtil.getLocalDateTime(currentCellValue.getNumberValue))
val ts = evaluatedFormulaCell match {
case Some(evaluatedCell) => Timestamp.valueOf(DateUtil.getLocalDateTime(evaluatedCell.getNumberValue))
case None => Timestamp.valueOf(DateUtil.getLocalDateTime(currentCell.getNumericCellValue))
}
(DateTimeUtils.fromJavaTimestamp(ts), true)
case _: DateType if DateUtil.isCellDateFormatted(currentCell) =>
val ts = Timestamp.valueOf(DateUtil.getLocalDateTime(currentCellValue.getNumberValue))
val ts = evaluatedFormulaCell match {
case Some(evaluatedCell) => Timestamp.valueOf(DateUtil.getLocalDateTime(evaluatedCell.getNumberValue))
case None => Timestamp.valueOf(DateUtil.getLocalDateTime(currentCell.getNumericCellValue))
}
(DateTimeUtils.fromJavaDate(Date.valueOf(ts.toLocalDateTime.toLocalDate)), true)
case _: IntegerType => (currentCellValue.getNumberValue.toInt, true)
case _: LongType => (currentCellValue.getNumberValue.toLong, true)
case _: FloatType => (currentCellValue.getNumberValue.toFloat, true)
case _: DoubleType => (currentCellValue.getNumberValue, true)
case _: IntegerType => evaluatedFormulaCell match {
case Some(evaluatedCell) => (evaluatedCell.getNumberValue.toInt, true)
case None => (currentCell.getNumericCellValue.toInt, true)
}
case _: LongType => evaluatedFormulaCell match {
case Some(evaluatedCell) => (evaluatedCell.getNumberValue.toLong, true)
case None => (currentCell.getNumericCellValue.toLong, true)
}
case _: FloatType => evaluatedFormulaCell match {
case Some(evaluatedCell) => (evaluatedCell.getNumberValue.toFloat, true)
case None => (currentCell.getNumericCellValue.toFloat, true)
}
case _: DoubleType => evaluatedFormulaCell match {
case Some(evaluatedCell) => (evaluatedCell.getNumberValue, true)
case None => (currentCell.getNumericCellValue, true)
}
case _ => (null, false)
}
case CellType.STRING => targetType match {
case _: StringType =>
val cellStringValue = UTF8String.fromString(currentCellValue.getStringValue)
val cellStringValue = evaluatedFormulaCell match {
case Some(evaluatedCell) => UTF8String.fromString(evaluatedCell.getStringValue)
case None => UTF8String.fromString(currentCell.getStringCellValue)
}
options.nulLValue match {
case Some(nullValue) if cellStringValue.toString.equalsIgnoreCase(nullValue) => (null, true)
case _ => (cellStringValue, true)
}
case _ => (null, false)
}
case _ => (UTF8String.fromString(currentCellValue.toString), true)
case _ => evaluatedFormulaCell match {
case Some(evaluatedCell) => (UTF8String.fromString(evaluatedCell.toString), true)
case None => (UTF8String.fromString(currentCell.toString), true)
}
}
}
}
@@ -375,34 +426,48 @@ private[excel] class ExcelParser(inputStream: InputStream, options: ExcelParserOptions
firstColumnIndex.to(lastColumnIndex).zipWithIndex.map { case (_, i) => s"col_$i" }
}

// Determine the last data row, this is either the last row of data, or the maximum number of rows defined by the user
val lastRowNum = options.maxRowCount match {
case rowNum if rowNum != 0 && rowNum + firstDataRow.getRowNum <= sheet.getLastRowNum => rowNum + firstDataRow.getRowNum
case _ => sheet.getLastRowNum
}
var fields = if (firstRow.getRowNum == sheet.getLastRowNum) {
// If there is no data in the file (other than the header) then return a default schema
firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (_, i) =>
StructField(fieldNames(i), StringType, nullable = true)
}
} else {
// Determine the last data row, this is either the last row of data, or the maximum number of rows defined by the user
val lastRowNum = options.maxRowCount match {
case rowNum if rowNum != 0 && rowNum + firstDataRow.getRowNum <= sheet.getLastRowNum => rowNum + firstDataRow.getRowNum
case _ => sheet.getLastRowNum
}

// Get the field structure for data in the workbook
var fields = firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (colIndex, i) =>
// Get the collection of types for the current column across the rows used for inferring the schema
val colTypes = firstDataRow.getRowNum.until(lastRowNum).flatMap(rowIndex => {
// Get the current cell (or cell containing data for part of a merged region), then determine the Spark DataType
// for the cell
val currentCell = sheet.getRow(rowIndex).getCell(colIndex, Row.MissingCellPolicy.RETURN_NULL_AND_BLANK)
val fieldType: Option[DataType] = if (currentCell == null || currentCell.getCellType == CellType.BLANK) None else formulaEvaluator.evaluate(currentCell).getCellType match {
case CellType._NONE | CellType.BLANK | CellType.ERROR => None
case CellType.BOOLEAN => Some(BooleanType)
case CellType.NUMERIC => if (DateUtil.isCellDateFormatted(currentCell)) Some(TimestampType) else Some(DoubleType)
case _ => Some(StringType)
}
fieldType
})
// Get the field structure for data in the workbook
firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (colIndex, i) =>
// Get the collection of types for the current column across the rows used for inferring the schema
val colTypes = firstDataRow.getRowNum.until(lastRowNum).flatMap(rowIndex => {
// Get the current cell (or cell containing data for part of a merged region), then determine the Spark DataType
// for the cell
val currentCell = sheet.getRow(rowIndex).getCell(colIndex, Row.MissingCellPolicy.RETURN_NULL_AND_BLANK)
val fieldType: Option[DataType] = if (currentCell == null || currentCell.getCellType == CellType.BLANK) None else {
val cellType = formulaEvaluator match {
case Some(evaluator) => evaluator.evaluate(currentCell).getCellType
case None => currentCell.getCellType
}

// If all of the cells in the inference set are of the same type, then use this as the schema type, otherwise
// default to data as a string
if (colTypes.distinct.length == 1) {
StructField(fieldNames(i), colTypes.head, nullable = true)
} else {
StructField(fieldNames(i), StringType, nullable = true)
cellType match {
case CellType._NONE | CellType.BLANK | CellType.ERROR => None
case CellType.BOOLEAN => Some(BooleanType)
case CellType.NUMERIC => if (DateUtil.isCellDateFormatted(currentCell)) Some(TimestampType) else Some(DoubleType)
case _ => Some(StringType)
}
}
fieldType
})

// If all of the cells in the inference set are of the same type, then use this as the schema type, otherwise
// default to data as a string
if (colTypes.distinct.length == 1) {
StructField(fieldNames(i), colTypes.head, nullable = true)
} else {
StructField(fieldNames(i), StringType, nullable = true)
}
}
}

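The evaluate-or-fall-back pattern above is repeated for each target Spark type. A condensed sketch of the underlying idea, assuming Apache POI's Cell and FormulaEvaluator API (the helper name is illustrative and not part of the library):

    import org.apache.poi.ss.usermodel.{Cell, FormulaEvaluator}

    // When an evaluator is configured the formula result is used; otherwise the
    // cached value stored in the cell is read directly.
    def numericValue(cell: Cell, evaluator: Option[FormulaEvaluator]): Double =
      evaluator match {
        case Some(e) => e.evaluate(cell).getNumberValue
        case None    => cell.getNumericCellValue
      }
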
3 changes: 3 additions & 0 deletions src/test/resources/Parser/Empty.xlsx
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/test/resources/Parser/NoData.xlsx
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/Parser/VaryingTypes.xlsx
Git LFS file not shown
DefaultSourceTests.scala
@@ -16,6 +16,7 @@

package com.elastacloud.spark.excel

import com.elastacloud.spark.excel.parser.ExcelParserException
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.scalatest.BeforeAndAfterAll
@@ -180,6 +181,38 @@ class DefaultSourceTests extends AnyFlatSpec with Matchers with BeforeAndAfterAll
df.count() should be(3)
}

"Reading an empty workbook" should "throw an exception" in {
val inputPath = testFilePath("/Parser/Empty.xlsx")

val error = intercept[ExcelParserException] {
spark.read
.format("excel")
.load(inputPath.replace("%20", " "))
.count()
}

error.getMessage should be("No data found on first row")
}

it should "return a single empty record if only headers exist" in {
val inputPath = testFilePath("/Parser/NoData.xlsx")

val dataSchema = StructType(Array(
StructField("Col1", StringType, nullable = true),
StructField("Col2", StringType, nullable = true),
StructField("Col3", StringType, nullable = true),
StructField("Col4", StringType, nullable = true)
))

val df = spark.read
.format("com.elastacloud.spark.excel")
.schema(dataSchema)
.load(inputPath)

df.count() should be(1)
df.schema should equal(dataSchema)
}

"Attempting to write to Excel" should "raise an error" in {
import spark.implicits._

