Skip to content

Commit

Permalink
Merge pull request #163 from apache/XALANJ-2419
Browse files Browse the repository at this point in the history
XALANJ-2419: Erroneous serialization of astral characters
  • Loading branch information
jkesselm authored Jan 23, 2024
2 parents 3e49f1d + 9e67d12 commit 289dd5f
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 124 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1281,6 +1281,13 @@ else if (escapingNotNeeded(ch))
{
writer.write(ch);
}
else if (Encodings.isHighUTF16Surrogate(ch))
{
writeUTF16Surrogate(ch, chars, i, end);
i++; // two input characters processed
// this increments by one and the for()
// loop itself increments by another one.
}
else
{
writer.write("&#");
Expand Down Expand Up @@ -1427,17 +1434,15 @@ else if (
{
i = pos - 1;
}
else
else if (Encodings.isHighUTF16Surrogate(ch))
{
if (Encodings.isHighUTF16Surrogate(ch))
{

writeUTF16Surrogate(ch, chars, i, end);
i++; // two input characters processed
// this increments by one and the for()
// loop itself increments by another one.
}


writeUTF16Surrogate(ch, chars, i, end);
i++; // two input characters processed
// this increments by one and the for()
// loop itself increments by another one.
} else {

// What follows is kind of a hack to keep from escaping in the case
// of Shift_JIS and the like.

Expand Down Expand Up @@ -2333,4 +2338,6 @@ public int getLongestKeyLength()
return m_charBuffer.length;
}
}


}
28 changes: 22 additions & 6 deletions serializer/src/main/java/org/apache/xml/serializer/ToStream.java
Original file line number Diff line number Diff line change
Expand Up @@ -1594,13 +1594,24 @@ else if (m_encodingInfo.isInEncoding(ch)) {
// If the character is in the encoding, and
// not in the normal ASCII range, we also
// just let it be added to the clean characters

}
else {
// This is a fallback plan, we should never get here
// but if the character wasn't previously handled
// (i.e. isn't in the encoding, etc.) then what
// should we do? We choose to write out an entity
else if (Encodings.isHighUTF16Surrogate(ch) && i < end-1 && Encodings.isLowUTF16Surrogate(chars[i+1])) {
// So, this is a (valid) surrogate pair
if (! m_encodingInfo.isInEncoding(ch, chars[i+1])) {
int codepoint = Encodings.toCodePoint(ch, chars[i+1]);
writeOutCleanChars(chars, i, lastDirtyCharProcessed);
writer.write("&#");
writer.write(Integer.toString(codepoint));
writer.write(';');
lastDirtyCharProcessed = i+1;
}
i++; // skip the low surrogate, too
}
else {
// This is a fallback plan, we get here if the
// encoding doesn't contain ch and it's not part
// of a surrogate pair
// The right thing is to write out an entity
writeOutCleanChars(chars, i, lastDirtyCharProcessed);
writer.write("&#");
writer.write(Integer.toString(ch));
Expand Down Expand Up @@ -2166,6 +2177,11 @@ else if (m_encodingInfo.isInEncoding(ch)) {
// just write it out
writer.write(ch);
}
else if (Encodings.isHighUTF16Surrogate(ch))
{
writeUTF16Surrogate(ch, stringChars, i, len);
i++ ; // process two input characters
}
else {
// This is a fallback plan, we should never get here
// but if the character wasn't previously handled
Expand Down

This file was deleted.

0 comments on commit 289dd5f

Please sign in to comment.