Skip to content

Commit

Permalink
Merge pull request #2794 from matt335672/utf_changes_new
Browse files Browse the repository at this point in the history
Improve Unicode support
  • Loading branch information
matt335672 authored Nov 2, 2023
2 parents 76d12c5 + f5f67e2 commit 50cff2e
Show file tree
Hide file tree
Showing 33 changed files with 2,664 additions and 944 deletions.
1 change: 1 addition & 0 deletions common/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ libcommon_la_SOURCES = \
thread_calls.h \
trans.c \
trans.h \
unicode_defines.h \
$(PIXMAN_SOURCES)

libcommon_la_LIBADD = \
Expand Down
15 changes: 12 additions & 3 deletions common/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,17 @@ typedef unsigned long uintptr_t;

typedef int bool_t;

// Define Unicode character types (char16_t and char32_t).
// The source of the definitions is selected by HAVE_UCHAR_H and
// HAVE_STDINT_H (presumably set by the build configuration - confirm).
#if defined(HAVE_UCHAR_H)
// <uchar.h> is available: take char16_t / char32_t from the C library
#include <uchar.h>
#elif defined(HAVE_STDINT_H)
// No <uchar.h>: map onto the least-width integer types, which are the
// types C11 specifies for char16_t / char32_t
typedef uint_least16_t char16_t;
typedef uint_least32_t char32_t;
#else
// Neither header is flagged as available: fall back to uint16_t / uint32_t.
// NOTE(review): assumes these are already declared at this point - confirm
typedef uint16_t char16_t;
typedef uint32_t char32_t;
#endif

/* you can define L_ENDIAN or B_ENDIAN and NEED_ALIGN or NO_NEED_ALIGN
in the makefile to override */

Expand Down Expand Up @@ -134,12 +145,10 @@ typedef bool_t tbool;
typedef intptr_t tbus;
typedef intptr_t tintptr;

/* wide char, socket */
/* socket */
#if defined(_WIN32)
typedef unsigned short twchar;
typedef unsigned int tsock;
#else
typedef int twchar;
typedef int tsock;
#endif
#endif /* DEFINED_Ts */
Expand Down
14 changes: 0 additions & 14 deletions common/os_calls.c
Original file line number Diff line number Diff line change
Expand Up @@ -159,20 +159,6 @@ g_init(const char *app_name)

WSAStartup(2, &wsadata);
#endif

/* In order to get g_mbstowcs and g_wcstombs to work properly with
UTF-8 non-ASCII characters, LC_CTYPE cannot be "C" or blank.
To select UTF-8 encoding without specifying any countries/languages,
"C.UTF-8" is used but provided in few systems.
See also: https://sourceware.org/glibc/wiki/Proposals/C.UTF-8 */
char *lc_ctype;
lc_ctype = setlocale(LC_CTYPE, "C.UTF-8");
if (lc_ctype == NULL)
{
/* use en_US.UTF-8 instead if not available */
setlocale(LC_CTYPE, "en_US.UTF-8");
}
}

/*****************************************************************************/
Expand Down
258 changes: 258 additions & 0 deletions common/parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,47 @@
#include "arch.h"
#include "parse.h"
#include "log.h"
#include "string_calls.h"
#include "unicode_defines.h"

/******************************************************************************/

/*
 * out_uint16_le_unchecked(s, v)
 *
 * Stores the low 16 bits of v at (s)->p in little-endian octet order and
 * advances (s)->p by two. No check is made that the stream has room.
 */
#if !defined(B_ENDIAN) && !defined(NEED_ALIGN)
/* Neither B_ENDIAN nor NEED_ALIGN is set: use a single 16-bit store */
#define out_uint16_le_unchecked(s, v) do \
{ \
    *((unsigned short*)((s)->p)) = (unsigned short)(v); \
    (s)->p += 2; \
} while (0)
#else
/* B_ENDIAN or NEED_ALIGN is set: write low octet, then high octet */
#define out_uint16_le_unchecked(s, v) do \
{ \
    (s)->p[0] = (unsigned char)((v) >> 0); \
    (s)->p[1] = (unsigned char)((v) >> 8); \
    (s)->p += 2; \
} while (0)
#endif

/******************************************************************************/
/*
 * in_uint16_le_unchecked(s, v)
 *
 * Loads a little-endian 16-bit value from (s)->p into v and advances
 * (s)->p by two. No check is made that two octets are available.
 */
#if !defined(B_ENDIAN) && !defined(NEED_ALIGN)
/* Neither B_ENDIAN nor NEED_ALIGN is set: use a single 16-bit load */
#define in_uint16_le_unchecked(s, v) do \
{ \
    (v) = *((unsigned short*)((s)->p)); \
    (s)->p += 2; \
} while (0)
#else
/* B_ENDIAN or NEED_ALIGN is set: assemble the value from two octets */
#define in_uint16_le_unchecked(s, v) do \
{ \
    (v) = (unsigned short) \
          ( \
              (((unsigned char*)((s)->p))[0] << 0) | \
              (((unsigned char*)((s)->p))[1] << 8) \
          ); \
    (s)->p += 2; \
} while (0)
#endif

/******************************************************************************/
void
parser_stream_overflow_check(const struct stream *s, int n, int is_out,
const char *file, int line)
Expand Down Expand Up @@ -64,3 +104,221 @@ parser_stream_overflow_check(const struct stream *s, int n, int is_out,
}
}
}

/******************************************************************************/
/**
 * Writes a UTF-8 string to a stream as UTF-16LE words
 *
 * @param s Stream to write to
 * @param v UTF-8 input
 * @param vn Number of octets of input at v
 * @param file Caller's file name (used only by the development stream check)
 * @param line Caller's line number (used only by the development stream check)
 *
 * Characters below 0x10000 are written as a single word; anything else
 * is written as a high surrogate followed by a low surrogate.
 */
void
out_utf8_as_utf16_le_proc(struct stream *s, const char *v,
                          unsigned int vn,
                          const char *file, int line)
{
#ifdef USE_DEVEL_STREAMCHECK
    // Same as S_CHECK_REM_OUT(s, <octet_count>), but any failure is
    // attributed to the file and line passed in by the caller
    const int required_octets = utf8_as_utf16_word_count(v, vn) * 2;
    parser_stream_overflow_check(s, required_octets, 1, file, line);
#endif

    while (vn > 0)
    {
        const char32_t cp = utf8_get_next_char(&v, &vn);

        if (cp >= 0x10000)
        {
            /* Outside the BMP - emit a surrogate pair, high word first */
            char16_t trail = LOW_SURROGATE_FROM_C32(cp);
            char16_t lead = HIGH_SURROGATE_FROM_C32(cp);
            out_uint16_le_unchecked(s, lead);
            out_uint16_le_unchecked(s, trail);
        }
        else
        {
            char16_t unit = (char16_t)cp;
            out_uint16_le_unchecked(s, unit);
        }
    }
}

/******************************************************************************/
/**
 * Gets the next Unicode character from a UTF-16LE coded stream
 * @param s Stream
 * @return Unicode character
 *
 * One UTF-16 word is always consumed. If it is a high surrogate and a
 * low surrogate follows, the second word is consumed as well. If a high
 * surrogate is followed by a word which is not a low surrogate, that
 * following word is left on the stream for the next call.
 *
 * Non-characters and illegally coded characters are mapped to
 * UCS_REPLACEMENT_CHARACTER
 *
 * @pre Two bytes are assumed to be available on the stream on entry
 */
static char32_t
get_c32_from_stream(struct stream *s)
{
    char32_t c32 = UCS_REPLACEMENT_CHARACTER; // Assume failure
    char16_t w;

    in_uint16_le_unchecked(s, w);

    if (IS_HIGH_SURROGATE(w))
    {
        /* A high surrogate with nothing after it is left as a failure */
        if (s_check_rem(s, 2))
        {
            char16_t low;
            in_uint16_le_unchecked(s, low);
            if (IS_LOW_SURROGATE(low))
            {
                /* Valid surrogate pair */
                char32_t v = C32_FROM_SURROGATE_PAIR(low, w);

                /* Ignore some values which can be successfully encoded
                 * in this way.
                 *
                 * The decoded value (v) must be tested here. Testing
                 * c32 would only ever examine the replacement character
                 * assigned above, and so would never reject anything */
                if (!IS_PLANE_END_NON_CHARACTER(v))
                {
                    c32 = v;
                }
            }
            else
            {
                /* Invalid low surrogate - pop character back so it is
                 * decoded in its own right by the next call */
                s->p -= 2;
            }
        }
    }
    else if (!IS_LOW_SURROGATE(w) &&
             !IS_PLANE_END_NON_CHARACTER(w) &&
             !IS_ARABIC_NON_CHARACTER(w))
    {
        /* Character from the Basic Multilingual Plane */
        c32 = (char32_t)w;
    }

    return c32;
}

/******************************************************************************/
/**
 * Reads a fixed-size UTF-16LE field from a stream, converting it to UTF-8
 *
 * @param s Stream to read from
 * @param n Size of the field in UTF-16 words
 * @param v Output buffer for the UTF-8 result
 * @param vn Size of the output buffer in octets
 * @param file Caller's file name (used only by the development stream check)
 * @param line Caller's line number (used only by the development stream check)
 * @return Number of octets needed to hold the whole converted string,
 *         including the terminator. This may be more than vn.
 *
 * Characters which do not fit in the output buffer are dropped. If vn is
 * non-zero the output is always terminated.
 */
unsigned int
in_utf16_le_fixed_as_utf8_proc(struct stream *s, unsigned int n,
                               char *v, unsigned int vn,
                               const char *file, int line)
{
    char encoded[MAXLEN_UTF8_CHAR];
    unsigned int needed = 1; /* Start with the terminator */
    char *const real_end = s->end;

#ifdef USE_DEVEL_STREAMCHECK
    // Same as S_CHECK_REM(s, n*2), but any failure is attributed to
    // the file and line passed in by the caller
    parser_stream_overflow_check(s, n * 2, 0, file, line);
#endif

    // Pull the stream end in to the end of the field for the duration
    // of the read, so that s_check_rem() stops us at the field boundary
    if (s->end - s->p > (int)(n * 2))
    {
        s->end = s->p + (int)(n * 2);
    }

    while (s_check_rem(s, 2))
    {
        const char32_t cp = get_c32_from_stream(s);
        const unsigned int len = utf_char32_to_utf8(cp, encoded);

        needed += len;

        if (len + 1 <= vn)
        {
            /* The character fits with a terminator to follow. Copy it */
            unsigned int k = 0;
            while (k < len)
            {
                v[k] = encoded[k];
                ++k;
            }
            v += len;
            vn -= len;
        }
        else if (vn > 1)
        {
            /* This character is being dropped. Leave room for only the
             * terminator, so a later, shorter character cannot be
             * squeezed in after the one that was dropped */
            vn = 1;
        }
    }

    // Put the real stream end back
    s->end = real_end;

    if (vn > 0)
    {
        *v = '\0';
    }

    return needed;
}

/******************************************************************************/
/**
 * Works out the size of the UTF-8 buffer needed to read a fixed-size
 * UTF-16LE field, without consuming the field
 *
 * @param s Stream
 * @param n Size of the field in UTF-16 words
 * @return Octets needed for the converted string, including the terminator
 */
unsigned int
in_utf16_le_fixed_as_utf8_length(struct stream *s, unsigned int n)
{
    char *const start = s->p;
    unsigned int needed;

    /* Run the conversion with no output buffer just to get the size... */
    needed = in_utf16_le_fixed_as_utf8(s, n, NULL, 0);

    /* ...then rewind so the field can still be read by the caller */
    s->p = start;

    return needed;
}

/******************************************************************************/
/**
 * Reads a terminated UTF-16LE string from a stream, converting it to UTF-8
 *
 * @param s Stream to read from
 * @param v Output buffer for the UTF-8 result
 * @param vn Size of the output buffer in octets
 * @return Number of octets needed to hold the whole converted string,
 *         including the terminator. This may be more than vn.
 *
 * Reading stops after a zero character has been read, or when fewer than
 * two octets remain on the stream. Characters which do not fit in the
 * output buffer are dropped. If vn is non-zero the output is always
 * terminated.
 */
unsigned int
in_utf16_le_terminated_as_utf8(struct stream *s,
                               char *v, unsigned int vn)
{
    char encoded[MAXLEN_UTF8_CHAR];
    unsigned int needed = 1; /* Start with the terminator */

    while (s_check_rem(s, 2))
    {
        const char32_t cp = get_c32_from_stream(s);
        unsigned int len;

        if (cp == 0)
        {
            break; // Terminator encountered
        }

        len = utf_char32_to_utf8(cp, encoded);
        needed += len;

        if (len + 1 <= vn)
        {
            /* The character fits with a terminator to follow. Copy it */
            unsigned int k = 0;
            while (k < len)
            {
                v[k] = encoded[k];
                ++k;
            }
            v += len;
            vn -= len;
        }
        else if (vn > 1)
        {
            /* This character is being dropped. Leave room for only the
             * terminator, so a later, shorter character cannot be
             * squeezed in after the one that was dropped */
            vn = 1;
        }
    }

    if (vn > 0)
    {
        *v = '\0';
    }

    return needed;
}

/******************************************************************************/
/**
 * Works out the size of the UTF-8 buffer needed to read a terminated
 * UTF-16LE string, without consuming the string
 *
 * @param s Stream
 * @return Octets needed for the converted string, including the terminator
 */
unsigned int
in_utf16_le_terminated_as_utf8_length(struct stream *s)
{
    char *const start = s->p;
    unsigned int needed;

    /* Run the conversion with no output buffer just to get the size... */
    needed = in_utf16_le_terminated_as_utf8(s, NULL, 0);

    /* ...then rewind so the string can still be read by the caller */
    s->p = start;

    return needed;
}
Loading

0 comments on commit 50cff2e

Please sign in to comment.