diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/stats/StatisticsCollection.scala b/spark/src/main/scala/org/apache/spark/sql/delta/stats/StatisticsCollection.scala index 506da043ca3..8efafde9682 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/stats/StatisticsCollection.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/stats/StatisticsCollection.scala @@ -773,6 +773,7 @@ object StatisticsCollection extends DeltaCommand { * Helper method to truncate the input string `x` to the given `prefixLen` length, while also * appending the unicode max character to the end of the truncated string. This ensures that any * value in this column is less than or equal to the max. + * Note: Input string `x` must be properly encoded in UTF-8. */ def truncateMaxStringAgg(prefixLen: Int)(x: String): String = { if (x == null || x.length <= prefixLen) { @@ -783,7 +784,12 @@ object StatisticsCollection extends DeltaCommand { // condition holds, or we run off the end of the string. // scalastyle:off nonascii val tieBreaker = '\ufffd' - x.take(prefixLen) + x.substring(prefixLen).takeWhile(_ >= tieBreaker) + tieBreaker + var ans = x.take(prefixLen) + x.substring(prefixLen).takeWhile(_ >= tieBreaker) + // Append a tie-breaker only if we truncated any characters from input string `x`. + if (ans.length < x.length) { + ans = ans + tieBreaker + } + ans // scalastyle:off nonascii } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/stats/StatsCollectionSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/stats/StatsCollectionSuite.scala index dfda5fcba6a..ace5f184faf 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/stats/StatsCollectionSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/stats/StatsCollectionSuite.scala @@ -369,11 +369,14 @@ class StatsCollectionSuite (s"abcde�", s"abcde�"), (s"abcd�abcd", s"abcd�a�"), (s"�abcd", s"�abcd"), - (s"abcdef�", s"abcdef��"), + (s"abcdef�", s"abcdef�"), (s"abcdef-abcdef�", s"abcdef�"), (s"abcdef�abcdef", s"abcdef��"), (s"abcdef��abcdef", s"abcdef���"), - (s"abcdef�abcdef�abcdef�abcdef", s"abcdef��") + (s"abcdef�abcdef�abcdef�abcdef", s"abcdef��"), + (s"漢字仮名한글தமி", s"漢字仮名한글�"), + (s"漢字仮名한글��", s"漢字仮名한글��"), + (s"漢字仮名한글", s"漢字仮名한글") ) inputToExpected.foreach { case (input, expected) =>