Skip to content

Commit

Permalink
[Spark] Append the tieBreaker unicode max character only if we actually truncated the string (delta-io#3222)
Browse files Browse the repository at this point in the history

## Description
This change avoids appending the tieBreaker character when no part of the
string was truncated.

## How was this patch tested?
Unit tests (new cases added to `StatsCollectionSuite`).
  • Loading branch information
sumeet-db authored Jun 12, 2024
1 parent 3aa8be0 commit 5106d27
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,7 @@ object StatisticsCollection extends DeltaCommand {
* Helper method to truncate the input string `x` to the given `prefixLen` length, appending the
* unicode max character to the end of the result only if characters were actually truncated
* from `x`. This ensures that any value in this column is less than or equal to the max.
* Note: Input string `x` must be properly encoded in UTF-8.
*/
def truncateMaxStringAgg(prefixLen: Int)(x: String): String = {
if (x == null || x.length <= prefixLen) {
Expand All @@ -783,7 +784,12 @@ object StatisticsCollection extends DeltaCommand {
// condition holds, or we run off the end of the string.
// scalastyle:off nonascii
val tieBreaker = '\ufffd'
x.take(prefixLen) + x.substring(prefixLen).takeWhile(_ >= tieBreaker) + tieBreaker
var ans = x.take(prefixLen) + x.substring(prefixLen).takeWhile(_ >= tieBreaker)
// Append a tie-breaker only if we truncated any characters from input string `x`.
if (ans.length < x.length) {
ans = ans + tieBreaker
}
ans
// scalastyle:off nonascii
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -369,11 +369,14 @@ class StatsCollectionSuite
(s"abcde�", s"abcde�"),
(s"abcd�abcd", s"abcd�a�"),
(s"�abcd", s"�abcd"),
(s"abcdef�", s"abcdef�"),
(s"abcdef�", s"abcdef�"),
(s"abcdef-abcdef�", s"abcdef�"),
(s"abcdef�abcdef", s"abcdef��"),
(s"abcdef��abcdef", s"abcdef���"),
(s"abcdef�abcdef�abcdef�abcdef", s"abcdef��")
(s"abcdef�abcdef�abcdef�abcdef", s"abcdef��"),
(s"漢字仮名한글தமி", s"漢字仮名한글�"),
(s"漢字仮名한글��", s"漢字仮名한글��"),
(s"漢字仮名한글", s"漢字仮名한글")
)
inputToExpected.foreach {
case (input, expected) =>
Expand Down

0 comments on commit 5106d27

Please sign in to comment.