XALANJ-2419

apache · Jan 22, 2024 · 9e67d12 · 9e67d12
1 parent 3e49f1d
commit 9e67d12
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 124 deletions.
diff --git a/serializer/src/main/java/org/apache/xml/serializer/ToHTMLStream.java b/serializer/src/main/java/org/apache/xml/serializer/ToHTMLStream.java
@@ -1281,6 +1281,13 @@ else if (escapingNotNeeded(ch))
                 {
                     writer.write(ch);
                 }
+                else if (Encodings.isHighUTF16Surrogate(ch))
+                {
+                    writeUTF16Surrogate(ch, chars, i, end);
+                    i++; // two input characters processed
+                         // this increments by one and the for()
+                         // loop itself increments by another one.
+                }
                 else
                 {
                     writer.write("&#");
@@ -1427,17 +1434,15 @@ else if (
                 {
                     i = pos - 1;
                 }
-                else
+                else if (Encodings.isHighUTF16Surrogate(ch))
                 {
-                    if (Encodings.isHighUTF16Surrogate(ch))
-                    {
-
-                            writeUTF16Surrogate(ch, chars, i, end);
-                            i++; // two input characters processed
-                                 // this increments by one and the for()
-                                 // loop itself increments by another one.
-                    }
-
+
+                    writeUTF16Surrogate(ch, chars, i, end);
+                    i++; // two input characters processed
+                         // this increments by one and the for()
+                         // loop itself increments by another one.
+                } else {
+
                     // The next is kind of a hack to keep from escaping in the case 
                     // of Shift_JIS and the like.
 
@@ -2333,4 +2338,6 @@ public int getLongestKeyLength()
             return m_charBuffer.length;
         }
     }
+
+
 }
diff --git a/serializer/src/main/java/org/apache/xml/serializer/ToStream.java b/serializer/src/main/java/org/apache/xml/serializer/ToStream.java
@@ -1594,13 +1594,24 @@ else if (m_encodingInfo.isInEncoding(ch)) {
                         // If the character is in the encoding, and
                         // not in the normal ASCII range, we also
                         // just leave it get added on to the clean characters
-
                     }
-                    else {
-                        // This is a fallback plan, we should never get here
-                        // but if the character wasn't previously handled
-                        // (i.e. isn't in the encoding, etc.) then what
-                        // should we do?  We choose to write out an entity
+                    else if (Encodings.isHighUTF16Surrogate(ch) && i < end-1 && Encodings.isLowUTF16Surrogate(chars[i+1])) {
+                    	// So, this is a (valid) surrogate pair
+                    	if (! m_encodingInfo.isInEncoding(ch, chars[i+1])) {
+                    		int codepoint = Encodings.toCodePoint(ch, chars[i+1]);
+                    		writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                    		writer.write("&#");
+                    		writer.write(Integer.toString(codepoint));
+                    		writer.write(';');
+                    		lastDirtyCharProcessed = i+1;
+                    	}
+                    	i++; // skip the low surrogate, too
+                    }
+                	else {
+                        // Fallback: we get here when the encoding does not
+                        // contain ch and ch does not start a valid surrogate
+                        // pair (that case is handled by the branch above).
+                        // The right thing to do is to write out an entity.
                         writeOutCleanChars(chars, i, lastDirtyCharProcessed);
                         writer.write("&#");
                         writer.write(Integer.toString(ch));
@@ -2166,6 +2177,11 @@ else if (m_encodingInfo.isInEncoding(ch)) {
                     // just write it out
                     writer.write(ch);
                 }
+                else if (Encodings.isHighUTF16Surrogate(ch))
+                {
+                    writeUTF16Surrogate(ch, stringChars, i, len);
+                    i++ ; // process two input characters
+                }
                 else {
                     // This is a fallback plan, we should never get here
                     // but if the character wasn't previously handled

diff --git a/xalansamples/src/main/java/samples/extensions/sql/extConnection/#ExternalConnection.java# b/xalansamples/src/main/java/samples/extensions/sql/extConnection/#ExternalConnection.java#