Skip to content

Commit

Permalink
Non-recursive implementation
Browse files — browse the repository at this point in the history
  • Loading branch information
george-zubrienko committed Dec 10, 2024
1 parent fe04ea2 commit 923ee48
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import scala.language.implicitConversions
import scala.util.matching.Regex

object CSVParser:
def parseCsvLine(line: String, delimiter: Char = ','): Seq[Option[String]] = {
def parseCsvLine(line: String, delimiter: Char = ',', headerCount: Int): Seq[Option[String]] = {
def isQuote(position: Int): Boolean = line(position) == '"'
def isDelimiter(position: Int): Boolean = line(position) == delimiter
def isEol(position: Int): Boolean = position == line.length - 1
Expand All @@ -27,25 +27,27 @@ object CSVParser:
case (true, false) if isQuote(currentPosition - 1) => Some(line.slice(from = fromIndex, until = currentPosition - 1))
case _ => Some(line.slice(from = fromIndex, until = currentPosition))

line.zipWithIndex.foldLeft((IndexedSeq[Option[String]](), 0, 0)) { (agg, element) =>
val parsed = line.zipWithIndex.foldLeft((IndexedSeq[Option[String]](), 0, 0)) { (agg, element) =>
val (character, charIndex) = element
val (result, quoteSum, prevCharIndex) = agg

def handleEol: (IndexedSeq[Option[String]], Int, Int) =
if isQuote(prevCharIndex) then
(result :+ extractValue(prevCharIndex + 1, charIndex), quoteSum, prevCharIndex)
else
(result :+ extractValue(prevCharIndex, charIndex), quoteSum, prevCharIndex)

character match
// opening quote of a quoted field - increment the open-quote counter and move on
case '"' if charIndex < line.length && quoteSum == 0 =>
case '"' if !isEol(charIndex) && quoteSum == 0 =>
(result, quoteSum + 1, prevCharIndex)

// closing quote before the end of the line - decrement the open-quote counter and move on
case '"' if charIndex < line.length - 1 =>
(result, quoteSum - 1, prevCharIndex)

// EOL on quote
case '"' if isEol(charIndex) =>
if isQuote(prevCharIndex) then
(result :+ extractValue(prevCharIndex + 1, charIndex), quoteSum, prevCharIndex)
else
(result :+ extractValue(prevCharIndex, charIndex), quoteSum, prevCharIndex)
case '"' if isEol(charIndex) => handleEol

// hit a delimiter, not end of string - emit value and continue
case _ if (quoteSum == 0) && isDelimiter(charIndex) && !isEol(charIndex) =>
Expand All @@ -61,18 +63,19 @@ object CSVParser:
(result :+ extractValue(prevCharIndex, charIndex) :+ None, quoteSum, prevCharIndex)

// regular case - end of line - return last segment and exit
case _ if (quoteSum == 0) && isEol(charIndex) =>
if isQuote(prevCharIndex) then
(result :+ extractValue(prevCharIndex + 1, charIndex), quoteSum, prevCharIndex)
else
(result :+ extractValue(prevCharIndex, charIndex), quoteSum, prevCharIndex)
case _ if (quoteSum == 0) && isEol(charIndex) => handleEol

// mismatched quotes
case _ if (quoteSum != 0) && isEol(charIndex) && !isQuote(charIndex) =>
throw new IllegalStateException(s"CSV line $line with delimiter $delimiter has mismatching field quotes")
case _ =>
(result, quoteSum, prevCharIndex)
}._1

if parsed.size != headerCount then
throw new IllegalStateException(s"CSV line $line with delimiter $delimiter cannot be parsed into desired $headerCount")

parsed
}

def isComplete(csvLine: String): Boolean = {
Expand All @@ -88,10 +91,10 @@ object CSVParser:
given Conversion[(String, ArcaneSchema), DataRow] with
override def apply(schemaBoundCsvLine: (String, ArcaneSchema)): DataRow = schemaBoundCsvLine match
case (csvLine, schema) =>
val parsed = CSVParser.parseCsvLine(csvLine)

require(parsed.size == schema.size, s"Mismatched field count: ${parsed.size} in the CSV, ${schema.size} in the schema")

val parsed = CSVParser.parseCsvLine(
line = csvLine,
headerCount = schema.size)
val mergeKeyValue = parsed(schema.zipWithIndex.find(v => v._1.name == "Id").get._2)

parsed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,19 @@ class CdmParserTests extends AnyFlatSpec with Matchers {

private val invalidCsvlines = Table(
("line", "result"),
("\"q\",\",\"1321\"", Seq())
("\"q\",\",\"1321\"", Seq(Some("q"), None, Some("1321")))
)

it should "handle valid quoted CSV lines correctly" in {
forAll (validCsvLines) { (line, result) =>
CSVParser.parseCsvLine(line) should equal(result)
CSVParser.parseCsvLine(line = line, headerCount = result.size) should equal(result)
}
}

it should "handle invalid quoted CSV lines correctly" in {
forAll (invalidCsvlines) { (line, _) =>
forAll (invalidCsvlines) { (line, result) =>
intercept[IllegalStateException] {
CSVParser.parseCsvLine(line)
CSVParser.parseCsvLine(line, headerCount = result.size)
}
}
}
Expand Down

0 comments on commit 923ee48

Please sign in to comment.