From a7760d0ca4de99cd27f6a501c5bb31fa4ce4a0c5 Mon Sep 17 00:00:00 2001 From: Mike Samuel Date: Mon, 19 Feb 2018 10:16:57 -0500 Subject: [PATCH] Defang MacOS & iOS crashing text sequences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Manish Goregaokar](https://manishearth.github.io/blog/2018/02/15/picking-apart-the-crashing-ios-string/) says > So, ultimately, the full set of cases that cause the crash are: > Any sequence <consonant1, virama, consonant2, ZWNJ, vowel> > in Devanagari, Bengali, and Telugu, where: ... We eliminate the ZWNJ which seems the minimally damaging thing to do to Telugu rendering per the article above: > a ZWNJ before a vowel doesn’t really do anything for most Indic scripts. This is needed as of February 2018, but hopefully not long after that. --- src/main/java/org/owasp/html/Encoding.java | 37 ++++++++++++++ .../org/owasp/html/HtmlSanitizerTest.java | 48 +++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/src/main/java/org/owasp/html/Encoding.java b/src/main/java/org/owasp/html/Encoding.java index 4a41237f..6781970c 100644 --- a/src/main/java/org/owasp/html/Encoding.java +++ b/src/main/java/org/owasp/html/Encoding.java @@ -203,6 +203,43 @@ private static void encodeHtmlOnto( output.append(plainText, pos, i).append(repl); pos = i + 1; } + } else if ((0x93A <= ch && ch <= 0xC4C) + && ( + // Devanagari vowel + ch <= 0x94F + || 0x93A <= ch && ch <= 0x94F + // Bengali vowels + || 0x985 <= ch && ch <= 0x994 + || 0x9BE <= ch && ch < 0x9CC // 0x9CC (Bengali AU) is ok + || 0x9E0 <= ch && ch <= 0x9E3 + // Telugu vowels + || 0xC05 <= ch && ch <= 0xC14 + || 0xC3E <= ch && ch != 0xC48 /* 0xC48 (Telugu AI) is ok */)) { + // https://manishearth.github.io/blog/2018/02/15/picking-apart-the-crashing-ios-string/ + // > So, ultimately, the full set of cases that cause the crash are: + // > Any sequence <consonant1, virama, consonant2, ZWNJ, vowel> + // > in Devanagari, Bengali, and Telugu, where: ... + + // TODO: This is needed as of February 2018, but hopefully not long after that. 
+ // We eliminate the ZWNJ which seems the minimally damaging thing to do to + // Telugu rendering per the article above: + // > a ZWNJ before a vowel doesn’t really do anything for most Indic scripts. + + if (pos < i) { + if (plainText.charAt(i - 1) == 0x200C /* ZWNJ */) { + output.append(plainText, pos, i - 1); + // Drop the ZWNJ on the floor. + pos = i; + } + } else if (output instanceof StringBuilder) { + StringBuilder sb = (StringBuilder) output; + int len = sb.length(); + if (len != 0) { + if (sb.charAt(len - 1) == 0x200C /* ZWNJ */) { + sb.setLength(len - 1); + } + } + } } else if (((char) 0xd800) <= ch) { if (ch <= ((char) 0xdfff)) { char next; diff --git a/src/test/java/org/owasp/html/HtmlSanitizerTest.java b/src/test/java/org/owasp/html/HtmlSanitizerTest.java index b842073c..d9165a63 100644 --- a/src/test/java/org/owasp/html/HtmlSanitizerTest.java +++ b/src/test/java/org/owasp/html/HtmlSanitizerTest.java @@ -366,6 +366,54 @@ public static final void testDuplicateAttributes() { } + @Test + public static final void testMacOSAndIOSQueryOfDeath() { + // https://manishearth.github.io/blog/2018/02/15/picking-apart-the-crashing-ios-string/ + String[][] tests = { + { + "\u0C1C\u0C4D\u0C1E\u200C\u0C3E", + "\u0C1C\u0C4D\u0C1E\u0C3E", + }, + { + "\u09B8\u09CD\u09B0\u200C\u09C1", + "\u09B8\u09CD\u09B0\u09C1", + }, + { + "\u0C1C\u0C4D\u0C1E\u200C\u0C3E", + "\u0C1C\u0C4D\u0C1E\u0C3E", + }, + { + "\u09B8\u09CD\u09B0\u200C\u09C1", + "\u09B8\u09CD\u09B0\u09C1", + }, + { + "జ్ఞ‌ా", + "\u0C1C\u0C4D\u0C1E\u0C3E", + }, + { + "జ్ఞ‌ా", + "\u0C1C\u0C4D\u0C1E\u0C3E", + }, + { + "স্র‌ু", + "\u09B8\u09CD\u09B0\u09C1", + }, + { + "স্র‌ু", + "\u09B8\u09CD\u09B0\u09C1", + }, + { + "\u0915\u094D\u0930\u200C\u093E", + "\u0915\u094D\u0930\u093E", + }, + }; + + for (int i = 0, n = tests.length; i < n; ++i) { + String[] test = tests[i]; + assertEquals(i + " : " + test[0], test[1], sanitize(test[0])); + } + } + private static String sanitize(@Nullable String html) { StringBuilder sb = new 
StringBuilder(); HtmlStreamRenderer renderer = HtmlStreamRenderer.create(