
Commit

0.1.13 release (#44)
* Change version to 0.1.13-SNAPSHOT

* Update project dependencies

Updated multiple dependencies across the project, including the sbt version, plugin versions, and library versions. These changes keep the project up to date and allow it to benefit from fixes in the newer releases of these dependencies.

* Update Spark versions

This change adds Spark 3.5.1 as a supported version.

* Update the Apache Commons Compress library to resolve a CVE warning

* Add a new option to disable evaluation of formulas when reading the Excel source (a usage sketch follows the commit details below)

* Update dependencies and refactor actions workflow (#41)

* Update Apache POI to 5.2.5 and Spark to 3.5.1

* Update scalaTestVersion

* Refactor actions workflow

* Roll back the Scala version update due to incompatibilities with the coverage dependency

* Revert package upload path in actions

* Add the latest 3.4.x release (3.4.3)

* Dazfuller/no data fix (#43)

* Implement a fix to handle files which contain no data (only headers)

* Add new resource files for tests

* Add tests to cover the new cases

* Change version to 0.1.13

---------

Co-authored-by: Jose Soto <josecsmorales@gmail.com>
dazfuller and josecsotomorales authored Jun 29, 2024
1 parent 01ee129 commit 5dc22b4
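
The new evaluateFormulae option added in this release is exposed through the standard DataFrame reader options. A minimal usage sketch (the format name and option key are taken from this change; the SparkSession setup and file path are illustrative only):

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().appName("excel-formulae-example").getOrCreate()

    // Read an Excel file without evaluating formula cells; evaluateFormulae defaults to "true"
    val df = spark.read
      .format("com.elastacloud.spark.excel") // the short name "excel" is also registered
      .option("evaluateFormulae", "false")
      .load("/data/report.xlsx")             // hypothetical path
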
Showing 15 changed files with 273 additions and 90 deletions.
22 changes: 15 additions & 7 deletions .github/workflows/spark.yml
@@ -7,40 +7,48 @@ on:
types: [ opened, reopened ]
workflow_dispatch:


jobs:
build:
strategy:
matrix:
sparkVersion: [ 3.0.1, 3.0.2, 3.0.3, 3.1.2, 3.2.1, 3.2.4, 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.4.0, 3.4.1, 3.5.0 ]
sparkVersion: [ 3.0.1, 3.0.2, 3.0.3, 3.1.2, 3.2.1, 3.2.4, 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.4.0, 3.4.1, 3.4.3, 3.5.0, 3.5.1 ]

runs-on: ubuntu-latest

steps:

- name: Checkout with LFS
uses: actions/checkout@v3.5.2
uses: actions/checkout@v4.1.7
with:
lfs: true

- name: Set up JDK 8
uses: actions/setup-java@v3.11.0
uses: actions/setup-java@v4.2.1
with:
java-version: '8'
distribution: 'adopt'

- name: Cache SBT dependencies
uses: actions/cache@v4.0.2
with:
path: |
~/.ivy2/cache
~/.sbt
~/.coursier
key: ${{ runner.os }}-sbt-${{ hashFiles('**/*.sbt') }}-${{ matrix.sparkVersion }}
restore-keys: |
${{ runner.os }}-sbt-${{ matrix.sparkVersion }}
- name: Run tests and produce coverage
run: sbt -DsparkVersion="${{matrix.sparkVersion}}" clean coverageOn test coverageReport

- name: Upload coverage to CodeCov
uses: codecov/codecov-action@v3.1.4
uses: codecov/codecov-action@v4.5.0
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: ./target/spark-${{ matrix.sparkVersion }}/scala-2.12/coverage-report/cobertura.xml
env_vars: ${{ matrix.sparkVersion }}
fail_ci_if_error: true
name: spark-excel
# path_to_write_report: ./target/spark-${{ matrix.sparkVersion }}/scala-2.12/coverage-report/codecov_report.txt
verbose: true

- name: Create assembly
2 changes: 1 addition & 1 deletion build.ps1
@@ -14,7 +14,7 @@
https://www.elastacloud.com
#>

$versions = @("3.0.1", "3.0.2", "3.1.2", "3.2.1", "3.2.4", "3.3.0", "3.3.1", "3.3.2", "3.4.0", "3.4.1")
$versions = @("3.0.1", "3.0.2", "3.1.2", "3.2.1", "3.2.4", "3.3.0", "3.3.1", "3.3.2", "3.4.0", "3.4.1", "3.5.0", "3.5.1")
$jarPath = "./target/jars"
$covPath = "./target/coverage"

14 changes: 7 additions & 7 deletions build.sbt
@@ -60,9 +60,9 @@ libraryDependencies ++= Seq(
"org.apache.poi" % "poi" % poiVersion.value % Compile,
"org.apache.poi" % "poi-ooxml" % poiVersion.value % Compile,
"org.apache.poi" % "poi-ooxml-lite" % poiVersion.value % Compile,
"org.apache.commons" % "commons-compress" % "1.21" % Compile,
"org.apache.commons" % "commons-compress" % "1.26.1" % Compile,
"org.apache.commons" % "commons-collections4" % "4.4" % Compile,
"commons-io" % "commons-io" % "2.11.0" % Compile,
"commons-io" % "commons-io" % "2.16.1" % Compile,
"org.apache.logging.log4j" % "log4j-core" % log4JVersion.value % Compile,
"org.apache.logging.log4j" % "log4j-api" % log4JVersion.value % Compile
)
@@ -120,8 +120,8 @@ addArtifact(Compile / assembly / artifact, assembly)

// Define common settings for the library
val commonSettings = Seq(
sparkVersion := System.getProperty("sparkVersion", "3.5.0"),
sparkExcelVersion := "0.1.12",
sparkVersion := System.getProperty("sparkVersion", "3.5.1"),
sparkExcelVersion := "0.1.13",
version := s"${sparkVersion.value}_${sparkExcelVersion.value}",
scalaVersion := {
if (sparkVersion.value < "3.2.0") {
@@ -132,8 +132,8 @@ val commonSettings = Seq(
"2.12.15"
}
},
scalaTestVersion := "3.2.16",
poiVersion := "5.2.3",
log4JVersion := "2.20.0",
scalaTestVersion := "3.2.18",
poiVersion := "5.2.5",
log4JVersion := "2.23.1",
crossVersion := CrossVersion.disabled
)
2 changes: 1 addition & 1 deletion project/assembly.sbt
@@ -1 +1 @@
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.2.0")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.5")
2 changes: 1 addition & 1 deletion project/build.properties
@@ -1 +1 @@
sbt.version = 1.5.3
sbt.version = 1.9.8
2 changes: 1 addition & 1 deletion project/plugins.sbt
@@ -1 +1 @@
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.3")
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.0.9")
ExcelParserOptions.scala
@@ -62,6 +62,7 @@ private[excel] class ExcelParserOptions(
val includeSheetName: Boolean = parameters.getOrElse("includeSheetName", "false").toBoolean
val nulLValue: Option[String] = parameters.get("nullValue")
val thresholdBytesForTempFiles: Int = parameters.getOrElse("thresholdBytesForTempFiles", parameters.getOrElse("maxBytesForTempFiles", "100000000")).toInt
val evaluateFormulae: Boolean = parameters.getOrElse("evaluateFormulae", "true").toBoolean

val schemaMatchColumnName: String = parameters.getOrElse("schemaMatchColumnName", null)
if (schemaMatchColumnName != null && schemaMatchColumnName.trim.isEmpty) {
@@ -87,7 +88,8 @@ private[excel] object ExcelParserOptions {
encoder.encode("nullValue") -> "nullValue",
encoder.encode("maxBytesForTempFiles") -> "maxBytesForTempFiles",
encoder.encode("thresholdBytesForTempFiles") -> "thresholdBytesForTempFiles",
encoder.encode("schemaMatchColumnName") -> "schemaMatchColumnName"
encoder.encode("schemaMatchColumnName") -> "schemaMatchColumnName",
encoder.encode("evaluateFormulae") -> "evaluateFormulae"
)

/**
147 changes: 106 additions & 41 deletions src/main/scala/com/elastacloud/spark/excel/parser/ExcelParser.scala
@@ -66,7 +66,11 @@ private[excel] class ExcelParser(inputStream: InputStream, options: ExcelParserOptions
/**
* An instance of the formula evaluate for the current workbook
*/
private val formulaEvaluator = workBook.getCreationHelper.createFormulaEvaluator()
private val formulaEvaluator = if (options.evaluateFormulae) {
Some(workBook.getCreationHelper.createFormulaEvaluator())
} else {
None
}

/**
* The indexes of the worksheets which match the sheet name regular expression pattern
@@ -265,42 +269,89 @@ private[excel] class ExcelParser(inputStream: InputStream, options: ExcelParserOptions
return (null, targetNullable)
}

val currentCellValue = formulaEvaluator.evaluate(currentCell)
currentCellValue.getCellType match {
val evaluatedFormulaCell = formulaEvaluator match {
case Some(evaluator) => Some(evaluator.evaluate(currentCell))
case None => None
}

val cellType = evaluatedFormulaCell match {
case Some(evaluatedCell) => evaluatedCell.getCellType
case None => currentCell.getCellType
}

cellType match {
case CellType._NONE | CellType.BLANK | CellType.ERROR => (null, targetNullable)
case CellType.BOOLEAN => targetType match {
case _: StringType => (UTF8String.fromString(currentCellValue.getBooleanValue.toString), true)
case _: BooleanType => (currentCellValue.getBooleanValue, true)
case _: StringType =>
evaluatedFormulaCell match {
case Some(evaluatedCell) => (UTF8String.fromString(evaluatedCell.getBooleanValue.toString), true)
case None => (UTF8String.fromString(currentCell.getBooleanCellValue.toString), true)
}
case _: BooleanType =>
evaluatedFormulaCell match {
case Some(evaluatedCell) => (evaluatedCell.getBooleanValue, true)
case None => (currentCell.getBooleanCellValue, true)
}
case _ => (null, false)
}
case CellType.NUMERIC => targetType match {
case _: StringType => if (DateUtil.isCellDateFormatted(currentCell)) {
(UTF8String.fromString(DateUtil.getLocalDateTime(currentCellValue.getNumberValue).format(DateTimeFormatter.ISO_DATE_TIME)), true)
evaluatedFormulaCell match {
case Some(evaluatedCell) => (UTF8String.fromString(DateUtil.getLocalDateTime(evaluatedCell.getNumberValue).format(DateTimeFormatter.ISO_DATE_TIME)), true)
case None => (UTF8String.fromString(DateUtil.getLocalDateTime(currentCell.getNumericCellValue).format(DateTimeFormatter.ISO_DATE_TIME)), true)
}
} else {
(UTF8String.fromString(currentCellValue.getNumberValue.toString), true)
evaluatedFormulaCell match {
case Some(evaluatedCell) => (UTF8String.fromString(evaluatedCell.getNumberValue.toString), true)
case None => (UTF8String.fromString(currentCell.getNumericCellValue.toString), true)
}
}
case _: TimestampType if DateUtil.isCellDateFormatted(currentCell) =>
val ts = Timestamp.valueOf(DateUtil.getLocalDateTime(currentCellValue.getNumberValue))
val ts = evaluatedFormulaCell match {
case Some(evaluatedCell) => Timestamp.valueOf(DateUtil.getLocalDateTime(evaluatedCell.getNumberValue))
case None => Timestamp.valueOf(DateUtil.getLocalDateTime(currentCell.getNumericCellValue))
}
(DateTimeUtils.fromJavaTimestamp(ts), true)
case _: DateType if DateUtil.isCellDateFormatted(currentCell) =>
val ts = Timestamp.valueOf(DateUtil.getLocalDateTime(currentCellValue.getNumberValue))
val ts = evaluatedFormulaCell match {
case Some(evaluatedCell) => Timestamp.valueOf(DateUtil.getLocalDateTime(evaluatedCell.getNumberValue))
case None => Timestamp.valueOf(DateUtil.getLocalDateTime(currentCell.getNumericCellValue))
}
(DateTimeUtils.fromJavaDate(Date.valueOf(ts.toLocalDateTime.toLocalDate)), true)
case _: IntegerType => (currentCellValue.getNumberValue.toInt, true)
case _: LongType => (currentCellValue.getNumberValue.toLong, true)
case _: FloatType => (currentCellValue.getNumberValue.toFloat, true)
case _: DoubleType => (currentCellValue.getNumberValue, true)
case _: IntegerType => evaluatedFormulaCell match {
case Some(evaluatedCell) => (evaluatedCell.getNumberValue.toInt, true)
case None => (currentCell.getNumericCellValue.toInt, true)
}
case _: LongType => evaluatedFormulaCell match {
case Some(evaluatedCell) => (evaluatedCell.getNumberValue.toLong, true)
case None => (currentCell.getNumericCellValue.toLong, true)
}
case _: FloatType => evaluatedFormulaCell match {
case Some(evaluatedCell) => (evaluatedCell.getNumberValue.toFloat, true)
case None => (currentCell.getNumericCellValue.toFloat, true)
}
case _: DoubleType => evaluatedFormulaCell match {
case Some(evaluatedCell) => (evaluatedCell.getNumberValue, true)
case None => (currentCell.getNumericCellValue, true)
}
case _ => (null, false)
}
case CellType.STRING => targetType match {
case _: StringType =>
val cellStringValue = UTF8String.fromString(currentCellValue.getStringValue)
val cellStringValue = evaluatedFormulaCell match {
case Some(evaluatedCell) => UTF8String.fromString(evaluatedCell.getStringValue)
case None => UTF8String.fromString(currentCell.getStringCellValue)
}
options.nulLValue match {
case Some(nullValue) if cellStringValue.toString.equalsIgnoreCase(nullValue) => (null, true)
case _ => (cellStringValue, true)
}
case _ => (null, false)
}
case _ => (UTF8String.fromString(currentCellValue.toString), true)
case _ => evaluatedFormulaCell match {
case Some(evaluatedCell) => (UTF8String.fromString(evaluatedCell.toString), true)
case None => (UTF8String.fromString(currentCell.toString), true)
}
}
}
}
@@ -375,34 +426,48 @@ private[excel] class ExcelParser(inputStream: InputStream, options: ExcelParserOptions
firstColumnIndex.to(lastColumnIndex).zipWithIndex.map { case (_, i) => s"col_$i" }
}

// Determine the last data row, this is either the last row of data, or the maximum number of rows defined by the user
val lastRowNum = options.maxRowCount match {
case rowNum if rowNum != 0 && rowNum + firstDataRow.getRowNum <= sheet.getLastRowNum => rowNum + firstDataRow.getRowNum
case _ => sheet.getLastRowNum
}
var fields = if (firstRow.getRowNum == sheet.getLastRowNum) {
// If there is no data in the file (other than the header) then return a default schema
firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (_, i) =>
StructField(fieldNames(i), StringType, nullable = true)
}
} else {
// Determine the last data row, this is either the last row of data, or the maximum number of rows defined by the user
val lastRowNum = options.maxRowCount match {
case rowNum if rowNum != 0 && rowNum + firstDataRow.getRowNum <= sheet.getLastRowNum => rowNum + firstDataRow.getRowNum
case _ => sheet.getLastRowNum
}

// Get the field structure for data in the workbook
var fields = firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (colIndex, i) =>
// Get the collection of types for the current column across the rows used for inferring the schema
val colTypes = firstDataRow.getRowNum.until(lastRowNum).flatMap(rowIndex => {
// Get the current cell (or cell containing data for part of a merged region), then determine the Spark DataType
// for the cell
val currentCell = sheet.getRow(rowIndex).getCell(colIndex, Row.MissingCellPolicy.RETURN_NULL_AND_BLANK)
val fieldType: Option[DataType] = if (currentCell == null || currentCell.getCellType == CellType.BLANK) None else formulaEvaluator.evaluate(currentCell).getCellType match {
case CellType._NONE | CellType.BLANK | CellType.ERROR => None
case CellType.BOOLEAN => Some(BooleanType)
case CellType.NUMERIC => if (DateUtil.isCellDateFormatted(currentCell)) Some(TimestampType) else Some(DoubleType)
case _ => Some(StringType)
}
fieldType
})
// Get the field structure for data in the workbook
firstColumnIndex.until(lastColumnIndex).zipWithIndex.map { case (colIndex, i) =>
// Get the collection of types for the current column across the rows used for inferring the schema
val colTypes = firstDataRow.getRowNum.until(lastRowNum).flatMap(rowIndex => {
// Get the current cell (or cell containing data for part of a merged region), then determine the Spark DataType
// for the cell
val currentCell = sheet.getRow(rowIndex).getCell(colIndex, Row.MissingCellPolicy.RETURN_NULL_AND_BLANK)
val fieldType: Option[DataType] = if (currentCell == null || currentCell.getCellType == CellType.BLANK) None else {
val cellType = formulaEvaluator match {
case Some(evaluator) => evaluator.evaluate(currentCell).getCellType
case None => currentCell.getCellType
}

// If all of the cells in the inference set are of the same type, then use this as the schema type, otherwise
// default to data as a string
if (colTypes.distinct.length == 1) {
StructField(fieldNames(i), colTypes.head, nullable = true)
} else {
StructField(fieldNames(i), StringType, nullable = true)
cellType match {
case CellType._NONE | CellType.BLANK | CellType.ERROR => None
case CellType.BOOLEAN => Some(BooleanType)
case CellType.NUMERIC => if (DateUtil.isCellDateFormatted(currentCell)) Some(TimestampType) else Some(DoubleType)
case _ => Some(StringType)
}
}
fieldType
})

// If all of the cells in the inference set are of the same type, then use this as the schema type, otherwise
// default to data as a string
if (colTypes.distinct.length == 1) {
StructField(fieldNames(i), colTypes.head, nullable = true)
} else {
StructField(fieldNames(i), StringType, nullable = true)
}
}
}

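The evaluate-or-fall-back pattern above is repeated for each target Spark type. A condensed sketch of the underlying idea, assuming Apache POI's Cell and FormulaEvaluator API (the helper name is illustrative and not part of the library):

    import org.apache.poi.ss.usermodel.{Cell, FormulaEvaluator}

    // When an evaluator is configured the formula result is used; otherwise the
    // cached value stored in the cell is read directly.
    def numericValue(cell: Cell, evaluator: Option[FormulaEvaluator]): Double =
      evaluator match {
        case Some(e) => e.evaluate(cell).getNumberValue
        case None    => cell.getNumericCellValue
      }
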
3 changes: 3 additions & 0 deletions src/test/resources/Parser/Empty.xlsx
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/test/resources/Parser/NoData.xlsx
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/test/resources/Parser/VaryingTypes.xlsx
Git LFS file not shown
DefaultSourceTests.scala
@@ -16,6 +16,7 @@

package com.elastacloud.spark.excel

import com.elastacloud.spark.excel.parser.ExcelParserException
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.scalatest.BeforeAndAfterAll
@@ -180,6 +181,38 @@ class DefaultSourceTests extends AnyFlatSpec with Matchers with BeforeAndAfterAll
df.count() should be(3)
}

"Reading an empty workbook" should "throw an exception" in {
val inputPath = testFilePath("/Parser/Empty.xlsx")

val error = intercept[ExcelParserException] {
spark.read
.format("excel")
.load(inputPath.replace("%20", " "))
.count()
}

error.getMessage should be("No data found on first row")
}

it should "return a single empty record if only headers exist" in {
val inputPath = testFilePath("/Parser/NoData.xlsx")

val dataSchema = StructType(Array(
StructField("Col1", StringType, nullable = true),
StructField("Col2", StringType, nullable = true),
StructField("Col3", StringType, nullable = true),
StructField("Col4", StringType, nullable = true)
))

val df = spark.read
.format("com.elastacloud.spark.excel")
.schema(dataSchema)
.load(inputPath)

df.count() should be(1)
df.schema should equal(dataSchema)
}

"Attempting to write to Excel" should "raise an error" in {
import spark.implicits._

