Merge pull request #2794 from matt335672/utf_changes_new

Improve Unicode support
neutrinolabs · Nov 2, 2023 · 50cff2e · 50cff2e
2 parents 76d12c5 + f5f67e2
commit 50cff2e
Show file tree

Hide file tree

Showing 33 changed files with 2,664 additions and 944 deletions.
diff --git a/common/Makefile.am b/common/Makefile.am
@@ -67,6 +67,7 @@ libcommon_la_SOURCES = \
   thread_calls.h \
   trans.c \
   trans.h \
+  unicode_defines.h \
   $(PIXMAN_SOURCES)
 
 libcommon_la_LIBADD = \

diff --git a/common/arch.h b/common/arch.h
@@ -46,6 +46,17 @@ typedef unsigned long uintptr_t;
 
 typedef int bool_t;
 
+// Define Unicode character types (char16_t and char32_t)
+//
+// Selection order:
+//  1. If HAVE_UCHAR_H is defined, take the types from <uchar.h>
+//  2. Otherwise, if HAVE_STDINT_H is defined, use the 'least' width
+//     integer types
+//  3. Otherwise use uint16_t / uint32_t
+//     NOTE(review): this last branch needs uint16_t / uint32_t to be
+//     declared by this point - presumably earlier in this header; confirm
+#if defined(HAVE_UCHAR_H)
+#include <uchar.h>
+#elif defined(HAVE_STDINT_H)
+typedef uint_least16_t char16_t;
+typedef uint_least32_t char32_t;
+#else
+typedef uint16_t char16_t;
+typedef uint32_t char32_t;
+#endif
+
 /* you can define L_ENDIAN or B_ENDIAN and NEED_ALIGN or NO_NEED_ALIGN
    in the makefile to override */
 
@@ -134,12 +145,10 @@ typedef bool_t tbool;
 typedef intptr_t tbus;
 typedef intptr_t tintptr;
 
-/* wide char, socket */
+/* socket */
 #if defined(_WIN32)
-typedef unsigned short twchar;
 typedef unsigned int tsock;
 #else
-typedef int twchar;
 typedef int tsock;
 #endif
 #endif /* DEFINED_Ts */

diff --git a/common/os_calls.c b/common/os_calls.c
@@ -159,20 +159,6 @@ g_init(const char *app_name)
 
     WSAStartup(2, &wsadata);
 #endif
-
-    /* In order to get g_mbstowcs and g_wcstombs to work properly with
-       UTF-8 non-ASCII characters, LC_CTYPE cannot be "C" or blank.
-       To select UTF-8 encoding without specifying any countries/languages,
-       "C.UTF-8" is used but provided in few systems.
-
-       See also: https://sourceware.org/glibc/wiki/Proposals/C.UTF-8 */
-    char *lc_ctype;
-    lc_ctype = setlocale(LC_CTYPE, "C.UTF-8");
-    if (lc_ctype == NULL)
-    {
-        /* use en_US.UTF-8 instead if not available */
-        setlocale(LC_CTYPE, "en_US.UTF-8");
-    }
 }
 
 /*****************************************************************************/

diff --git a/common/parse.c b/common/parse.c
@@ -27,7 +27,47 @@
 #include "arch.h"
 #include "parse.h"
 #include "log.h"
+#include "string_calls.h"
+#include "unicode_defines.h"
 
+/******************************************************************************/
+
+/*
+ * out_uint16_le_unchecked(s, v)
+ *
+ * Stores the low 16 bits of v at the stream's current position (s->p)
+ * and advances s->p by two octets. No check is made that the stream has
+ * room for the two octets - the caller must ensure this.
+ *
+ * If B_ENDIAN or NEED_ALIGN is defined, the value is written an octet at
+ * a time, least significant octet first. Otherwise a single
+ * 'unsigned short' store is made through a cast pointer.
+ * NOTE(review): the single-store form gives little-endian output only on
+ * a little-endian host which tolerates unaligned access - presumably
+ * what the absence of B_ENDIAN and NEED_ALIGN signifies; confirm.
+ */
+#if defined(B_ENDIAN) || defined(NEED_ALIGN)
+#define out_uint16_le_unchecked(s, v) do \
+    { \
+        *((s)->p) = (unsigned char)((v) >> 0); \
+        (s)->p++; \
+        *((s)->p) = (unsigned char)((v) >> 8); \
+        (s)->p++; \
+    } while (0)
+#else
+#define out_uint16_le_unchecked(s, v) do \
+    { \
+        *((unsigned short*)((s)->p)) = (unsigned short)(v); \
+        (s)->p += 2; \
+    } while (0)
+#endif
+
+/******************************************************************************/
+/*
+ * in_uint16_le_unchecked(s, v)
+ *
+ * Reads two octets from the stream's current position (s->p) into v and
+ * advances s->p by two octets. No check is made that two octets are
+ * available - the caller must ensure this.
+ *
+ * If B_ENDIAN or NEED_ALIGN is defined, the value is assembled an octet
+ * at a time, with the first octet as the least significant. Otherwise a
+ * single 'unsigned short' load is made through a cast pointer.
+ * NOTE(review): the single-load form decodes little-endian data only on
+ * a little-endian host which tolerates unaligned access - presumably
+ * what the absence of B_ENDIAN and NEED_ALIGN signifies; confirm.
+ */
+#if defined(B_ENDIAN) || defined(NEED_ALIGN)
+#define in_uint16_le_unchecked(s, v) do \
+    { \
+        (v) = (unsigned short) \
+              ( \
+                (*((unsigned char*)((s)->p + 0)) << 0) | \
+                (*((unsigned char*)((s)->p + 1)) << 8) \
+              ); \
+        (s)->p += 2; \
+    } while (0)
+#else
+#define in_uint16_le_unchecked(s, v) do \
+    { \
+        (v) = *((unsigned short*)((s)->p)); \
+        (s)->p += 2; \
+    } while (0)
+#endif
+
+/******************************************************************************/
 void
 parser_stream_overflow_check(const struct stream *s, int n, int is_out,
                              const char *file, int line)
@@ -64,3 +104,221 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
         }
     }
 }
+
+/******************************************************************************/
+/**
+ * Writes a UTF-8 string to a stream as little-endian UTF-16 words
+ *
+ * @param s Stream to write to
+ * @param v UTF-8 input
+ * @param vn Number of octets of input in v
+ * @param file Caller's source file (used by the developer stream check)
+ * @param line Caller's source line (used by the developer stream check)
+ *
+ * Characters of 0x10000 and above are written as a surrogate pair, high
+ * surrogate first. Space on the stream is only checked if
+ * USE_DEVEL_STREAMCHECK is defined.
+ */
+void
+out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
+                          unsigned int vn,
+                          const char *file, int line)
+{
+#ifdef USE_DEVEL_STREAMCHECK
+    // Equivalent to S_CHECK_REM_OUT(s, <octet_count>), but reporting
+    // the file and line of our caller
+    int octets_needed = utf8_as_utf16_word_count(v, vn) * 2;
+    parser_stream_overflow_check(s, octets_needed, 1, file, line);
+#endif
+
+    while (vn > 0)
+    {
+        char32_t ch = utf8_get_next_char(&v, &vn);
+
+        if (ch >= 0x10000)
+        {
+            /* Outside the BMP - two UTF-16 words are required */
+            char16_t low_word = LOW_SURROGATE_FROM_C32(ch);
+            char16_t high_word = HIGH_SURROGATE_FROM_C32(ch);
+            out_uint16_le_unchecked(s, high_word);
+            out_uint16_le_unchecked(s, low_word);
+        }
+        else
+        {
+            char16_t word = (char16_t)ch;
+            out_uint16_le_unchecked(s, word);
+        }
+    }
+}
+
+/******************************************************************************/
+/**
+ * Gets the next Unicode character from a code stream
+ * @param s Stream
+ * @return Unicode character
+ *
+ * Non-characters and illegally coded characters are mapped to
+ * UCS_REPLACEMENT_CHARACTER
+ *
+ * If a high surrogate is followed by a word which is not a low
+ * surrogate, UCS_REPLACEMENT_CHARACTER is returned for the high
+ * surrogate, and the following word is left on the stream to be
+ * read by the next call.
+ *
+ * @pre Two bytes are assumed to be available on the stream on entry
+ */
+static char32_t
+get_c32_from_stream(struct stream *s)
+{
+    char32_t c32 = UCS_REPLACEMENT_CHARACTER; // Assume failure
+    char16_t w;
+
+    in_uint16_le_unchecked(s, w);
+
+    if (IS_HIGH_SURROGATE(w))
+    {
+        /* A second word is needed, and may not be available */
+        if (s_check_rem(s, 2))
+        {
+            char16_t low;
+            in_uint16_le_unchecked(s, low);
+            if (IS_LOW_SURROGATE(low))
+            {
+                /* Valid surrogate pair */
+                char32_t v = C32_FROM_SURROGATE_PAIR(low, w);
+
+                /* Ignore some values which can be successfully encoded
+                 * in this way.
+                 *
+                 * The test must be made on the decoded value 'v'. At
+                 * this point c32 still holds the replacement
+                 * character, so testing c32 would never reject
+                 * anything */
+                if (!IS_PLANE_END_NON_CHARACTER(v))
+                {
+                    c32 = v;
+                }
+            }
+            else
+            {
+                /* Invalid low surrogate  - pop character back */
+                s->p -= 2;
+            }
+        }
+    }
+    else if (!IS_LOW_SURROGATE(w) &&
+             !IS_PLANE_END_NON_CHARACTER(w) &&
+             !IS_ARABIC_NON_CHARACTER(w))
+    {
+        /* Character from the Basic Multilingual Plane */
+        c32 = (char32_t)w;
+    }
+
+    return c32;
+}
+
+/******************************************************************************/
+/**
+ * Reads a fixed number of little-endian UTF-16 words from a stream,
+ * converting them to a '\0'-terminated UTF-8 string
+ *
+ * @param s Stream to read from
+ * @param n Number of UTF-16 words to read
+ * @param v Output buffer. Not written to if vn is 0
+ * @param vn Size of the output buffer in octets
+ * @param file Caller's source file (used by the developer stream check)
+ * @param line Caller's source line (used by the developer stream check)
+ * @return Sum of the lengths reported by utf_char32_to_utf8() for every
+ *         character read, plus one for the terminator. This total
+ *         includes characters which did not fit in the output buffer.
+ *
+ * Once a character has failed to fit in the output buffer, no further
+ * characters are stored.
+ */
+unsigned int
+in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
+                               char *v, unsigned int vn,
+                               const char *file, int line)
+{
+    char *const real_end = s->end;
+    unsigned int total = 1; /* The terminator is always counted */
+
+#ifdef USE_DEVEL_STREAMCHECK
+    // Equivalent to S_CHECK_REM(s, n*2), but reporting the file and
+    // line of our caller
+    parser_stream_overflow_check(s, n * 2, 0, file, line);
+#endif
+
+    // Pull the end of the stream in to the end of the string, so that
+    // s_check_rem() can be used to find the end of the UTF-16 input
+    if (s->end - s->p > (int)(n * 2))
+    {
+        s->end = s->p + (int)(n * 2);
+    }
+
+    while (s_check_rem(s, 2))
+    {
+        char encoded[MAXLEN_UTF8_CHAR];
+        unsigned int encoded_len;
+        unsigned int k;
+        char32_t ch = get_c32_from_stream(s);
+
+        encoded_len = utf_char32_to_utf8(ch, encoded);
+        total += encoded_len;
+
+        if (encoded_len + 1 > vn)
+        {
+            /* No room for this character and a terminator. If more
+             * than the terminator's octet is left, close the buffer
+             * off so a later, shorter character can't be stored after
+             * the one we've just dropped */
+            if (vn > 1)
+            {
+                vn = 1;
+            }
+            continue;
+        }
+
+        for (k = 0 ; k < encoded_len ; ++k)
+        {
+            *v++ = encoded[k];
+        }
+        vn -= encoded_len;
+    }
+
+    // Put the end of the stream back where it was
+    s->end = real_end;
+
+    if (vn > 0)
+    {
+        *v = '\0';
+    }
+
+    return total;
+}
+
+/******************************************************************************/
+/**
+ * Finds the value in_utf16_le_fixed_as_utf8() would return for the next
+ * n UTF-16 words on a stream, without consuming them
+ *
+ * @param s Stream
+ * @param n Number of UTF-16 words
+ * @return Result of in_utf16_le_fixed_as_utf8() with no output buffer
+ *
+ * The read position of the stream is the same on exit as on entry.
+ */
+unsigned int
+in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n)
+{
+    char *const start = s->p;
+    /* Dry run - a zero-length buffer means nothing is stored */
+    const unsigned int required = in_utf16_le_fixed_as_utf8(s, n, NULL, 0);
+
+    s->p = start;
+    return required;
+}
+
+/******************************************************************************/
+/**
+ * Reads little-endian UTF-16 words from a stream up to a terminator,
+ * converting them to a '\0'-terminated UTF-8 string
+ *
+ * @param s Stream to read from
+ * @param v Output buffer. Not written to if vn is 0
+ * @param vn Size of the output buffer in octets
+ * @return Sum of the lengths reported by utf_char32_to_utf8() for every
+ *         character before the terminator, plus one for the terminator.
+ *         This total includes characters which did not fit in the
+ *         output buffer.
+ *
+ * Reading stops after a zero character has been read, or when fewer
+ * than two octets remain on the stream. Once a character has failed to
+ * fit in the output buffer, no further characters are stored.
+ */
+unsigned int
+in_utf16_le_terminated_as_utf8(struct stream *s,
+                               char *v, unsigned int vn)
+{
+    unsigned int total = 1; /* The terminator is always counted */
+
+    while (s_check_rem(s, 2))
+    {
+        char encoded[MAXLEN_UTF8_CHAR];
+        unsigned int encoded_len;
+        unsigned int k;
+        char32_t ch = get_c32_from_stream(s);
+
+        if (ch == 0)
+        {
+            break;  // Terminator has been read from the stream
+        }
+
+        encoded_len = utf_char32_to_utf8(ch, encoded);
+        total += encoded_len;
+
+        if (encoded_len + 1 > vn)
+        {
+            /* No room for this character and a terminator. If more
+             * than the terminator's octet is left, close the buffer
+             * off so a later, shorter character can't be stored after
+             * the one we've just dropped */
+            if (vn > 1)
+            {
+                vn = 1;
+            }
+            continue;
+        }
+
+        for (k = 0 ; k < encoded_len ; ++k)
+        {
+            *v++ = encoded[k];
+        }
+        vn -= encoded_len;
+    }
+
+    if (vn > 0)
+    {
+        *v = '\0';
+    }
+
+    return total;
+}
+
+/******************************************************************************/
+/**
+ * Finds the value in_utf16_le_terminated_as_utf8() would return for the
+ * terminated UTF-16 string next on a stream, without consuming it
+ *
+ * @param s Stream
+ * @return Result of in_utf16_le_terminated_as_utf8() with no output
+ *         buffer
+ *
+ * The read position of the stream is the same on exit as on entry.
+ */
+unsigned int
+in_utf16_le_terminated_as_utf8_length(struct stream *s)
+{
+    char *const start = s->p;
+    /* Dry run - a zero-length buffer means nothing is stored */
+    const unsigned int required = in_utf16_le_terminated_as_utf8(s, NULL, 0);
+
+    s->p = start;
+    return required;
+}