diff --git a/.github/workflows/build-with-bal-test-graalvm.yml b/.github/workflows/build-with-bal-test-graalvm.yml index 291054d..77a583e 100644 --- a/.github/workflows/build-with-bal-test-graalvm.yml +++ b/.github/workflows/build-with-bal-test-graalvm.yml @@ -35,3 +35,5 @@ jobs: lang_tag: ${{ inputs.lang_tag }} lang_version: ${{ inputs.lang_version }} native_image_options: '-J-Xmx7G ${{ inputs.native_image_options }}' + # TODO: Remove '-x test' to re-enable tests on Windows once https://github.com/ballerina-platform/ballerina-lang/issues/38882 is fixed + additional_windows_build_flags: '-x test' diff --git a/ballerina-tests/build.gradle b/ballerina-tests/build.gradle index 524cca8..07f7576 100644 --- a/ballerina-tests/build.gradle +++ b/ballerina-tests/build.gradle @@ -32,9 +32,8 @@ def testCommonTomlFilePlaceHolder = new File("${project.rootDir}/build-config/re def ballerinaDist = "${project.rootDir}/target/ballerina-runtime" def distributionBinPath = "${ballerinaDist}/bin" def testCoverageParam = "--code-coverage --coverage-format=xml --includes=io.ballerina.stdlib.data.*:ballerina.*" -def testPackages = ["constraint-validation-tests", "parse-list-types-tests", "parse-record-types-tests", - "parse-string-array-types-tests", "parse-string-record-types-tests", "type-compatible-tests", - "union-type-tests", "user-config-tests"] +def testPackages = ["user-config-tests", "type-compatible-tests", "unicode-tests", "constraint-validation-tests", "parse-list-types-tests", "parse-record-types-tests", + "parse-string-array-types-tests", "parse-string-record-types-tests", "union-type-tests"] def testCommonPackage = "csv-commons" def stripBallerinaExtensionVersion(String extVersion) { diff --git a/ballerina-tests/type-compatible-tests/tests/csv_content.txt b/ballerina-tests/type-compatible-tests/tests/csv_content.txt new file mode 100644 index 0000000..2917073 --- /dev/null +++ b/ballerina-tests/type-compatible-tests/tests/csv_content.txt @@ -0,0 +1,4 @@ +a, b, c d, e +"Hello World", \"Hello World\", 
Hello World, 2 +"Hello World", \"Hello World\", Hello World, 2 +"Hello World", \"Hello World\", Hello World, 2 \ No newline at end of file diff --git a/ballerina-tests/type-compatible-tests/tests/parse_string_compatibality_test.bal b/ballerina-tests/type-compatible-tests/tests/parse_string_compatibality_test.bal index c36c4c8..8cb60fd 100644 --- a/ballerina-tests/type-compatible-tests/tests/parse_string_compatibality_test.bal +++ b/ballerina-tests/type-compatible-tests/tests/parse_string_compatibality_test.bal @@ -1,7 +1,10 @@ import ballerina/csv_commons as common; import ballerina/data.csv as csv; +import ballerina/io; import ballerina/test; +const string filepath = "tests/csv_content.txt"; + @test:Config function testFromCsvStringWithTypeCompatibility() { string value = string `i1,i2,s1,s2, b1,b2,n1,n2,f1,f2, d1,d2,j1,a1,j2,a2 @@ -180,4 +183,52 @@ function testFromCsvStringWithTypeCompatibility() { 1.2, abc, true,1.0`); test:assertTrue(m3rra is csv:Error); test:assertEquals((m3rra).message(), common:generateErrorMessageForInvalidCast("1.0", "int")); -} \ No newline at end of file +} + +@test:Config +function testSpaceBetweendData() { + string csv = string `a b, b d e, f + "Hello world", " Hi I am ", \" Hi I am \"`; + + record{|string...;|}[]|csv:Error rec = csv:parseStringToRecord(csv); + test:assertEquals(rec, [ + {"a b":"Hello world","b d e":" Hi I am ","f":"\"Hi I am \""}]); +} + +@test:Config +function testParseBytes() returns error? 
{ + byte[] csvBytes = check io:fileReadBytes(filepath); + + record{}[]|csv:Error rec = csv:parseBytesToRecord(csvBytes, {}); + test:assertEquals(rec, [ + {"a":"Hello World","b":"\"Hello World\"","c d":"Hello World","e":2}, + {"a":"Hello World","b":"\"Hello World\"","c d":"Hello World","e":2}, + {"a":"Hello World","b":"\"Hello World\"","c d":"Hello World","e":2}] + ); + + string[][]|csv:Error rec2 = csv:parseBytesToList(csvBytes, {}); + test:assertEquals(rec2, [ + ["Hello World", "\"Hello World\"", "Hello World", "2"], + ["Hello World", "\"Hello World\"", "Hello World", "2"], + ["Hello World", "\"Hello World\"", "Hello World", "2"] + ]); +} + +@test:Config +function testParseStream() returns error? { + stream csvByteStream = check io:fileReadBlocksAsStream(filepath); + record{}[]|csv:Error rec = csv:parseStreamToRecord(csvByteStream, {}); + test:assertEquals(rec, [ + {"a":"Hello World","b":"\"Hello World\"","c d":"Hello World","e":2}, + {"a":"Hello World","b":"\"Hello World\"","c d":"Hello World","e":2}, + {"a":"Hello World","b":"\"Hello World\"","c d":"Hello World","e":2}] + ); + + csvByteStream = check io:fileReadBlocksAsStream(filepath); + string[][]|csv:Error rec2 = csv:parseStreamToList(csvByteStream, {}); + test:assertEquals(rec2, [ + ["Hello World", "\"Hello World\"", "Hello World", "2"], + ["Hello World", "\"Hello World\"", "Hello World", "2"], + ["Hello World", "\"Hello World\"", "Hello World", "2"] + ]); +} diff --git a/ballerina-tests/type-compatible-tests/tests/parse_type_compatibility_test.bal b/ballerina-tests/type-compatible-tests/tests/parse_type_compatibility_test.bal index d8ac07f..ed41a81 100644 --- a/ballerina-tests/type-compatible-tests/tests/parse_type_compatibility_test.bal +++ b/ballerina-tests/type-compatible-tests/tests/parse_type_compatibility_test.bal @@ -5,8 +5,6 @@ import ballerina/test; @test:Config function testFromCsvWithTypeFunctionWithTypeCompatibility() { var value = {i1, i2, s1, s2, b1, b2, n1, n2, f1, f2, d1, d2, j1: b1, a1: d1, 
j2: b2, a2: d2}; - var value2 = {i1, s1, b1, n1, f1, d1, j1: b1, a1: d1, s2, s3, j2: b2, a2: d2}; - var value3 = {i1, s1, b1, n1, f1, d1, j1: b1, a1: d1, s2, s3}; CustomRecord27Array|csv:Error vcr27a = csv:parseRecordAsRecordType([value, value, value], {}, CustomRecord27Array); test:assertEquals(vcr27a , [ diff --git a/ballerina-tests/unicode-tests/.gitignore b/ballerina-tests/unicode-tests/.gitignore new file mode 100644 index 0000000..d5fc29a --- /dev/null +++ b/ballerina-tests/unicode-tests/.gitignore @@ -0,0 +1,11 @@ +# Ballerina generates this directory during the compilation of a package. +# It contains compiler-generated artifacts and the final executable if this is an application package. +target/ + +# Ballerina maintains the compiler-generated source code here. +# Remove this if you want to commit generated sources. +generated/ + +# Contains configuration values used during development time. +# See https://ballerina.io/learn/provide-values-to-configurable-variables/ for more details. +Config.toml diff --git a/ballerina-tests/unicode-tests/Ballerina.toml b/ballerina-tests/unicode-tests/Ballerina.toml new file mode 100644 index 0000000..b2ec347 --- /dev/null +++ b/ballerina-tests/unicode-tests/Ballerina.toml @@ -0,0 +1,13 @@ +[package] +org = "ballerina" +name = "unicode_tests" +version = "0.1.0" + +[[dependency]] +org = "ballerina" +name = "csv_commons" +repository = "local" +version = "0.1.0" + +[platform.java17] +graalvmCompatible = true diff --git a/ballerina-tests/unicode-tests/tests/escape_character_test.bal b/ballerina-tests/unicode-tests/tests/escape_character_test.bal new file mode 100644 index 0000000..cee018f --- /dev/null +++ b/ballerina-tests/unicode-tests/tests/escape_character_test.bal @@ -0,0 +1,55 @@ +import ballerina/data.csv as csv; +import ballerina/test; + +@test:Config +function testEscapedCharactres() returns error? 
{ + string csvString = string `a, b + quote\"\"quoted\"quote, 1 + backslash\\backslash, 2 + newline\nnewline, 3 + tab\ttab, 5 + unicode\u0061unicode, 6 + slash\/slash, 9 + quoted string \\'abc\\', 10`; + + record{string a; int b;}[]|csv:Error rec = csv:parseStringToRecord(csvString); + test:assertEquals(rec, [ + {a: string `quote""quoted"quote`, b: 1}, + {a: string `backslash${"\\"}backslash`, b: 2}, + {a: string `newline${"\n"}newline`, b: 3}, + {a: string `tab${"\t"}tab`, b: 5}, + {a: string `unicodeaunicode`, b: 6}, + {a: string `slash/slash`, b: 9}, + {a: string `quoted string \'abc\'`, b: 10} + ]); +} + +@test:Config +function testEscapedCharactres2() returns error? { + string csvString = string `a, b + backspace\bbackspace, 7`; + + record{string a; int b;}[]|csv:Error rec = csv:parseStringToRecord(csvString); + test:assertTrue(rec is record{string a; int b;}[]); +} + +@test:Config +function testEscapedCharactres3() returns error? { + string csvString = string ` a c, b + carriage return\r carriage return, 4`; + + record{}[]|csv:Error rec = csv:parseStringToRecord(csvString); + test:assertEquals(rec, [ + {"a c": string `carriage return${"\r"} carriage return`, b: 4} + ]); +} + +@test:Config +function testEscapedCharactres4() returns error? { + string csvString = string `a, b + form feed\f form feed, 8`; + + record{string a; int b;}[]|csv:Error rec = csv:parseStringToRecord(csvString); + test:assertTrue(rec is record {string a; int b;}[]); + // TODO: Add tests once Ballerina supports \f +} diff --git a/ballerina-tests/union-type-tests/tests/test_with_singleton_test.bal b/ballerina-tests/union-type-tests/tests/test_with_singleton_test.bal index 56bd093..495180a 100644 --- a/ballerina-tests/union-type-tests/tests/test_with_singleton_test.bal +++ b/ballerina-tests/union-type-tests/tests/test_with_singleton_test.bal @@ -122,12 +122,10 @@ type SubtypeTuple3 [SubType...]; function testSubtypeExpectedTypes() returns error? 
{ var value1 = [{a: 1, c: 1, d: 1, e: 1, f: "a", g: 1, h: 1, i: 1}, {a: 1, c: 1, d: 1, e: 1, f: "a", g: 1, h: 1, i: 1}]; - var value2 = [{a: 1, c: int:MAX_VALUE, d: 1, e: 1, f: "a", g: 1, h: 1, i: 1}, - {a: 1, c: 1, d: 1, e: 1, f: "a", g: 1, h: 1, i: 1}]; + var value2 = [["1", "1", "1", "1", "a", "1", "1", "1"], + ["1", "1", "1", "1", "a", "1", "1", "1"]]; var value3 = [[1, 1, 1, 1, "a", 1, 1, 1], [1, 1, 1, 1, "a", 1, 1, 1]]; - var value4 = [["1", "1", "1", "1", "a", "1", "1", "1"], - ["1", "1", "1", "1", "a", "1", "1", "1"]]; SubtypeRecord[]|csv:Error a = csv:parseStringToRecord(string `a, c, d, e, f, g, h, i 1, 1, 1, 1, a, 1, 1, 1 @@ -182,24 +180,24 @@ function testSubtypeExpectedTypes() returns error? { ["a", "c", "d", "e", "f", "g", "h", "i"], {}); test:assertEquals(a12, value3); - SubtypeRecord[]|csv:Error a13 = csv:parseListAsRecordType(value4, + SubtypeRecord[]|csv:Error a13 = csv:parseListAsRecordType(value2, ["a", "c", "d", "e", "f", "g", "h", "i"], {}); test:assertEquals(a13, value1); - SubtypeRecord2[]|csv:Error a14 = csv:parseListAsRecordType(value4, + SubtypeRecord2[]|csv:Error a14 = csv:parseListAsRecordType(value2, ["a", "c", "d", "e", "f", "g", "h", "i"], {}); test:assertEquals(a14, [{a: 1, c: 1}, {a: 1, c: 1}]); - SubtypeRecord3[]|csv:Error a15 = csv:parseListAsRecordType(value4, + SubtypeRecord3[]|csv:Error a15 = csv:parseListAsRecordType(value2, ["a", "c", "d", "e", "f", "g", "h", "i"], {}); test:assertEquals(a15, value1); - SubtypeTuple[]|csv:Error a16 = csv:parseListAsListType(value4, {}); + SubtypeTuple[]|csv:Error a16 = csv:parseListAsListType(value2, {}); test:assertEquals(a16, value3); - SubtypeTuple2[]|csv:Error a17 = csv:parseListAsListType(value4, {}); + SubtypeTuple2[]|csv:Error a17 = csv:parseListAsListType(value2, {}); test:assertEquals(a17, [[1, 1], [1, 1]]); - SubtypeTuple3[]|csv:Error a18 = csv:parseListAsListType(value4, {}); + SubtypeTuple3[]|csv:Error a18 = csv:parseListAsListType(value2, {}); test:assertEquals(a18, value3); } 
diff --git a/ballerina-tests/user-config-tests/tests/user_config_with_parser_options_test.bal b/ballerina-tests/user-config-tests/tests/user_config_with_parser_options_test.bal index 5649d3d..250fcc4 100644 --- a/ballerina-tests/user-config-tests/tests/user_config_with_parser_options_test.bal +++ b/ballerina-tests/user-config-tests/tests/user_config_with_parser_options_test.bal @@ -116,7 +116,7 @@ function testFromCsvStringWithHeaderLessParserOptions() { record {}[]|csv:Error csv2op6_2 = csv:parseStringToRecord(csvStringData2, {header: false, skipLines: [5, 7]}); test:assertEquals(csv2op6_2, [ {'1: "hello", '2: "hello", '3: (), '4: 12, '5: true, '6: 12.34}, - {'1: "//comment"}, + {'1: "// comment"}, {'1: "a", '2: "b", '3: "c", '4: "d", '5: "e", '6: "f"}, {'1: 1, '2: "string1", '3: true, '4: 2.234, '5: 2.234, '6: ()}, {'1: 3, '2: "string3", '3: false, '4: 1.23, '5: 1.23, '6: ()}, @@ -136,7 +136,8 @@ function testHeaderOption() { ]); record {}[]|csv:Error csv2cop2 = csv:parseStringToRecord(csvStringData2, {header: 100}); - test:assertEquals(csv2cop2, []); + test:assertTrue(csv2cop2 is csv:Error); + test:assertEquals(( csv2cop2).message(), "The provided header row is empty"); record {}[]|csv:Error csv2cop3 = csv:parseStringToRecord(csvStringData2, {header: 11}); test:assertEquals(csv2cop3, []); diff --git a/native/src/main/java/io/ballerina/stdlib/data/csvdata/csv/CsvParser.java b/native/src/main/java/io/ballerina/stdlib/data/csvdata/csv/CsvParser.java index 3f026c5..d583314 100644 --- a/native/src/main/java/io/ballerina/stdlib/data/csvdata/csv/CsvParser.java +++ b/native/src/main/java/io/ballerina/stdlib/data/csvdata/csv/CsvParser.java @@ -137,6 +137,8 @@ static class StateMachine { boolean isQuoteClosed = false; boolean isIntersectionElementType = false; private StringBuilder hexBuilder = new StringBuilder(4); + boolean isValueStart = false; + State prevState; StateMachine() { reset(); } @@ -168,6 +170,7 @@ public void reset() { hexBuilder = new StringBuilder(4); 
isQuoteClosed = false; isIntersectionElementType = false; + prevState = null; } private static boolean isWhitespace(char ch, Object lineTerminator) { @@ -345,6 +348,10 @@ public State transition(StateMachine sm, char[] buff, int i, int count) throws C for (; i < count; i++) { ch = buff[i]; sm.processLocation(ch); + if (ch == EOF) { + handleEndOfTheHeader(sm); + return HEADER_END_STATE; + } if (ch == Constants.LineTerminator.CR) { CsvUtils.setCarriageTokenPresent(true); continue; @@ -367,6 +374,7 @@ public State transition(StateMachine sm, char[] buff, int i, int count) throws C state = HEADER_END_STATE; break; } + state = this; continue; } @@ -375,11 +383,14 @@ public State transition(StateMachine sm, char[] buff, int i, int count) throws C } else if (!sm.insideComment && ch == separator) { addHeader(sm); sm.columnIndex++; + state = this; continue; } else if (!sm.insideComment && ch == sm.config.textEnclosure) { + sm.prevState = this; state = HEADER_QUOTE_CHAR_STATE; break; } else if (!sm.insideComment && ch == sm.config.escapeChar) { + sm.prevState = this; state = HEADER_ESCAPE_CHAR_STATE; break; } else if (sm.insideComment && sm.isNewLineOrEof(ch)) { @@ -390,11 +401,15 @@ public State transition(StateMachine sm, char[] buff, int i, int count) throws C handleEndOfTheHeader(sm); state = HEADER_END_STATE; } else if (StateMachine.isWhitespace(ch, sm.config.lineTerminator)) { + if (sm.isValueStart) { + sm.append(ch); + } state = this; continue; } else { if (!sm.insideComment) { sm.append(ch); + sm.isValueStart = true; } state = this; continue; @@ -407,8 +422,13 @@ public State transition(StateMachine sm, char[] buff, int i, int count) throws C } private static void handleEndOfTheHeader(StateMachine sm) throws CsvParserException { + handleEndOfTheHeader(sm, true); + } + + private static void handleEndOfTheHeader(StateMachine sm, boolean trim) throws CsvParserException { + sm.isValueStart = false; if (!sm.peek().isBlank()) { - addHeader(sm); + addHeader(sm, trim); } 
finalizeHeaders(sm); sm.columnIndex = 0; @@ -456,8 +476,16 @@ private static void validateRemainingRecordFields(StateMachine sm) { } } - private static void addHeader(StateMachine sm) throws CsvParserException { + private static void addHeader(StateMachine sm) { + addHeader(sm, true); + } + + private static void addHeader(StateMachine sm, boolean trim) { + sm.isValueStart = false; String value = sm.value(); + if (trim) { + value = value.trim(); + } if (sm.expectedArrayElementType instanceof RecordType) { String fieldName = CsvUtils.getUpdatedHeaders( sm.updatedRecordFieldNames, value, sm.fields.contains(value)); @@ -472,16 +500,17 @@ private static void addHeader(StateMachine sm) throws CsvParserException { private static class HeaderEndState implements State { @Override - public State transition(StateMachine sm, char[] buff, int i, int count) throws CsvParserException { + public State transition(StateMachine sm, char[] buff, int i, int count) { return ROW_START_STATE; } } private static class RowStartState implements State { + char ch; + State state = ROW_START_STATE; + @Override public State transition(StateMachine sm, char[] buff, int i, int count) throws CsvParserException { - char ch; - State state = ROW_START_STATE; char separator = sm.config.delimiter; long[] skipLines = getSkipDataRows(sm.config.skipLines); @@ -505,6 +534,7 @@ public State transition(StateMachine sm, char[] buff, int i, int count) throws C } } else { sm.append(ch); + sm.isValueStart = true; } continue; } @@ -518,7 +548,7 @@ public State transition(StateMachine sm, char[] buff, int i, int count) throws C initiateNewRowType(sm); } if (!sm.insideComment && ch == sm.config.comment) { - handleEndOfTheRow(sm, ch); + handleEndOfTheRow(sm); sm.insideComment = true; if (ch == EOF) { state = ROW_END_STATE; @@ -526,9 +556,11 @@ public State transition(StateMachine sm, char[] buff, int i, int count) throws C } else if (!sm.insideComment && ch == separator) { addRowValue(sm); } else if 
(!sm.insideComment && ch == sm.config.textEnclosure) { + sm.prevState = this; state = STRING_QUOTE_CHAR_STATE; break; } else if (!sm.insideComment && ch == sm.config.escapeChar) { + sm.prevState = this; state = STRING_ESCAPE_VALUE_STATE; break; } else if (sm.insideComment && sm.isNewLineOrEof(ch)) { @@ -538,16 +570,21 @@ public State transition(StateMachine sm, char[] buff, int i, int count) throws C break; } } else if (isEndOfTheRowAndValueIsNotEmpty(sm, ch)) { - handleEndOfTheRow(sm, ch); + handleEndOfTheRow(sm); if (ch == EOF) { state = ROW_END_STATE; break; } } else if (StateMachine.isWhitespace(ch, sm.config.lineTerminator)) { + if (sm.isValueStart) { + sm.append(ch); + } + state = this; // ignore } else { if (!sm.insideComment) { sm.append(ch); + sm.isValueStart = true; } } } @@ -556,13 +593,23 @@ public State transition(StateMachine sm, char[] buff, int i, int count) throws C } } - private static void handleEndOfTheRow(StateMachine sm, char ch) throws CsvParserException { - handleCsvRow(sm); + private static void handleEndOfTheRow(StateMachine sm) throws CsvParserException { + handleEndOfTheRow(sm, true); + } + + private static void handleEndOfTheRow(StateMachine sm, boolean trim) throws CsvParserException { + sm.isValueStart = false; + handleCsvRow(sm, trim); checkRequiredFieldsAndLogError(sm.fieldHierarchy, sm.config.absentAsNilableType); } - private static void handleCsvRow(StateMachine sm) throws CsvParserException { - if (!sm.peek().isBlank()) { - addRowValue(sm); + + private static void handleCsvRow(StateMachine sm, boolean trim) throws CsvParserException { + String value = sm.peek(); + if (trim) { + value = value.trim(); + } + if (!value.isBlank()) { + addRowValue(sm, trim); } if (!sm.isCurrentCsvNodeEmpty) { finalizeTheRow(sm); @@ -605,10 +652,18 @@ private static void finalizeTheRow(StateMachine sm) { } private static void addRowValue(StateMachine sm) throws CsvParserException { + addRowValue(sm, true); + } + + private static void 
addRowValue(StateMachine sm, boolean trim) throws CsvParserException { Type type; + Field currentField = null; + sm.isValueStart = false; Type exptype = sm.expectedArrayElementType; String value = sm.value(); - Field currentField = null; + if (trim) { + value = value.trim(); + } if (exptype instanceof RecordType) { type = getExpectedRowTypeOfRecord(sm); @@ -689,7 +744,7 @@ private static Field getCurrentField(StateMachine sm) { private static class RowEndState implements State { @Override - public State transition(StateMachine sm, char[] buff, int i, int count) throws CsvParserException { + public State transition(StateMachine sm, char[] buff, int i, int count) { return ROW_END_STATE; } } @@ -704,6 +759,10 @@ public State transition(StateMachine sm, char[] buff, int i, int count) for (; i < count; i++) { ch = buff[i]; sm.processLocation(ch); + if (ch == EOF) { + handleEndOfTheRow(sm, false); + return ROW_END_STATE; + } if (ch == Constants.LineTerminator.CR) { CsvUtils.setCarriageTokenPresent(true); continue; @@ -714,25 +773,23 @@ public State transition(StateMachine sm, char[] buff, int i, int count) if (ch == sm.config.textEnclosure) { if (sm.isQuoteClosed) { sm.append(ch); + sm.isValueStart = true; continue; } sm.isQuoteClosed = true; } else if (ch == sm.config.delimiter && sm.isQuoteClosed) { - addRowValue(sm); + addRowValue(sm, false); state = ROW_START_STATE; sm.isQuoteClosed = false; break; } else if (sm.isNewLineOrEof(ch) && sm.isQuoteClosed) { - handleEndOfTheRow(sm, ch); - if (ch == EOF) { - state = ROW_END_STATE; - break; - } + handleEndOfTheRow(sm, false); state = ROW_START_STATE; sm.isQuoteClosed = false; break; } else if (ch == sm.config.escapeChar) { state = STRING_ESCAPE_VALUE_STATE; + sm.prevState = this; sm.isQuoteClosed = false; break; } else if (!sm.isQuoteClosed && !sm.peek().isEmpty() && ch == EOF) { @@ -745,6 +802,7 @@ public State transition(StateMachine sm, char[] buff, int i, int count) sm.append(ch); sm.isQuoteClosed = false; } + 
sm.isValueStart = true; state = this; } } @@ -763,6 +821,10 @@ public State transition(StateMachine sm, char[] buff, int i, int count) for (; i < count; i++) { ch = buff[i]; sm.processLocation(ch); + if (ch == EOF) { + handleEndOfTheRow(sm); + return ROW_END_STATE; + } if (ch == Constants.LineTerminator.CR) { CsvUtils.setCarriageTokenPresent(true); continue; @@ -773,18 +835,19 @@ public State transition(StateMachine sm, char[] buff, int i, int count) if (ch == sm.config.textEnclosure) { sm.isQuoteClosed = true; } else if (ch == sm.config.delimiter && sm.isQuoteClosed) { - addHeader(sm); + addHeader(sm, false); sm.columnIndex++; sm.isQuoteClosed = false; state = HEADER_START_STATE; break; } else if (sm.isNewLineOrEof(ch) && sm.isQuoteClosed) { - handleEndOfTheHeader(sm); + handleEndOfTheHeader(sm, false); state = HEADER_END_STATE; sm.isQuoteClosed = false; break; } else if (!sm.isQuoteClosed && ch == sm.config.escapeChar) { sm.isQuoteClosed = false; + sm.prevState = this; state = HEADER_ESCAPE_CHAR_STATE; break; } else if (!sm.isQuoteClosed && ch == EOF) { @@ -797,6 +860,7 @@ public State transition(StateMachine sm, char[] buff, int i, int count) sm.append(ch); sm.isQuoteClosed = false; } + sm.isValueStart = true; state = this; continue; } @@ -811,7 +875,7 @@ private static class StringValueUnicodeHexProcessingState extends UnicodeHexProc @Override protected State getSourceState() { - return STRING_QUOTE_CHAR_STATE; + return STRING_UNICODE_CHAR_STATE; } } @@ -824,7 +888,7 @@ private static class HeaderUnicodeHexProcessingState extends UnicodeHexProcessin @Override protected State getSourceState() { - return HEADER_QUOTE_CHAR_STATE; + return HEADER_UNICODE_CHAR_STATE; } } @@ -842,6 +906,10 @@ public State transition(StateMachine sm, char[] buff, int i, int count) throws C for (; i < count; i++) { ch = buff[i]; sm.processLocation(ch); + if (ch == EOF) { + handleEndOfTheRow(sm); + return ROW_END_STATE; + } if (ch == Constants.LineTerminator.CR) { 
CsvUtils.setCarriageTokenPresent(true); continue; @@ -851,10 +919,12 @@ public State transition(StateMachine sm, char[] buff, int i, int count) throws C if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f')) { sm.hexBuilder.append(ch); + sm.isValueStart = true; if (sm.hexBuilder.length() >= 4) { sm.append(this.extractUnicodeChar(sm)); this.reset(sm); - state = this.getSourceState(); + state = sm.prevState; + sm.prevState = this; break; } state = this; @@ -882,16 +952,15 @@ private static class HeaderEscapedCharacterProcessingState extends EscapedCharac @Override protected State getSourceState() { - return HEADER_QUOTE_CHAR_STATE; + return HEADER_ESCAPE_CHAR_STATE; } - } private static class StringValueEscapedCharacterProcessingState extends EscapedCharacterProcessingState { @Override protected State getSourceState() { - return STRING_QUOTE_CHAR_STATE; + return STRING_ESCAPE_VALUE_STATE; } } @@ -911,38 +980,42 @@ public State transition(StateMachine sm, char[] buff, int i, int count) throws C } else if (!(CsvUtils.isCarriageTokenPresent && ch == Constants.LineTerminator.LF)) { CsvUtils.setCarriageTokenPresent(false); } + if (ch == EOF) { + handleEndOfTheRow(sm); + return ROW_END_STATE; + } switch (ch) { case '"': sm.append(QUOTES); - state = this.getSourceState(); + state = sm.prevState; break; case '\\': sm.append(REV_SOL); - state = this.getSourceState(); + state = sm.prevState; break; case '/': sm.append(SOL); - state = this.getSourceState(); + state = sm.prevState; break; case 'b': sm.append(BACKSPACE); - state = this.getSourceState(); + state = sm.prevState; break; case 'f': sm.append(FORMFEED); - state = this.getSourceState(); + state = sm.prevState; break; case 'n': sm.append(NEWLINE); - state = this.getSourceState(); + state = sm.prevState; break; case 'r': sm.append(CR); - state = this.getSourceState(); + state = sm.prevState; break; case 't': sm.append(HZ_TAB); - state = this.getSourceState(); + state = sm.prevState; break; 
case 'u': if (this.getSourceState() == STRING_ESCAPE_VALUE_STATE) {