Unicode: add unit tests and additional comments for valid_utf8()

AlekseyCherepanov · Sep 15, 2024 · f5d53d1 · f5d53d1
1 parent 1c8fd6b
commit f5d53d1
Show file tree

Hide file tree

Showing 5 changed files with 269 additions and 5 deletions.
diff --git a/src/Makefile.in b/src/Makefile.in
@@ -803,7 +803,10 @@ testfiles:
 ###############################################################################
 
 UNIT_TEST_OBJS = \
- tests/unit-tests.o tests/misc.o tests/common.o tests/memory.o tests/sha2.o
+ tests/unit-tests.o tests/misc.o tests/common.o tests/memory.o tests/sha2.o unicode.o
+
+UNIT_TEST_INCLUDED_PIECES = \
+ tests/test_valid_utf8.c
 
 tests/unit-tests.o: tests/unit-tests.c common.h memory.h misc.h
  $(CC) -o tests/unit-tests.o $(CFLAGS) -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c
@@ -823,7 +826,7 @@ tests/memory.o: memory.c arch.h misc.h jumbo.h autoconfig.h memory.h common.h jo
 # keep the 'easy name' build target of unit-tests The 'real' target is ../run/unit-tests[.exe]
 unit-tests: ../run/unit-tests@EXE_EXT@
 
-../run/unit-tests@EXE_EXT@: $(UNIT_TEST_OBJS)
+../run/unit-tests@EXE_EXT@: $(UNIT_TEST_OBJS) $(UNIT_TEST_INCLUDED_PIECES)
  $(LD) $(UNIT_TEST_OBJS) $(LDFLAGS) @OPENSSL_LIBS@ -o $@
  @ echo "Now Running the Unit Tests"
  @ ${POSSIBLE_WINE_MSG}

diff --git a/src/Makefile.legacy b/src/Makefile.legacy
@@ -337,8 +337,8 @@ default:
  @echo "beos-x86-any BeOS, x86"
  @echo "generic Any other Unix-like system with gcc"
 
-unit-tests:
- $(CC) -o ../run/unit-tests -Wall -O2 -fomit-frame-pointer -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c misc.c common.c memory.c sha2.c
+# Build the unit-test binary and run it right away. unicode.o is a
+# prerequisite and is linked in as an already-built object; the other
+# sources are compiled directly on this command line.
+unit-tests: unicode.o
+ $(CC) -o ../run/unit-tests -Wall -O2 -fomit-frame-pointer -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c misc.c common.c memory.c sha2.c unicode.o -lcrypto
  ../run/unit-tests
 
 linux-x86-64-avx512:

diff --git a/src/tests/test_valid_utf8.c b/src/tests/test_valid_utf8.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2024 Aleksey Cherepanov
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted.
+ */
+
+/* Test code for valid_utf8() from unicode.c vs single char UTF-8 sequences
+ *
+ * It tests only byte sequences that encode a single character, plus a
+ * few cases of an additional byte after a valid 2-byte sequence.
+ * Higher-level logic is not checked. The test is sparse: contiguous
+ * ranges of byte values are sampled, skipping parts of them. Dense
+ * checks are applied close to range borders to catch off-by-one
+ * mistakes. Valid sequences are limited to a single character (1-4
+ * bytes). Invalid sequences go up to 5 bytes and use even bigger steps
+ * for skipping. ASCII bytes at trailing positions are tested lightly.
+ *
+ * Description in PR#5531 contains a script to test valid_utf8()
+ * against Python3. https://github.com/openwall/john/pull/5531
+ */
+
+/* Related info about UTF-8
+ *
+ * Valid UTF-8 sequences of bytes (1-4 bytes long):
+ *
+ * 00..7F
+ *
+ * C2..DF 80..BF
+ *
+ * E0 A0..BF 80..BF
+ * ED 80..9F 80..BF
+ * Ex 80..BF 80..BF where Ex does not include E0 and ED
+ *
+ * F0 90..BF 80..BF 80..BF
+ * F4 80..8F 80..BF 80..BF notice 8F as upper bound in the second byte
+ * F1..F3 80..BF 80..BF 80..BF
+ *
+ * (X..Y denotes range from X to Y inclusive. X and Y are byte
+ * values written in hex.)
+ *
+ * Incomplete sequences are invalid.
+ *
+ * Range 80..BF is for trailing bytes (also called continuation
+ * bytes). It is not a valid starting byte. Adjacent values C0 and C1
+ * could be considered starting 2-bytes sequences but they are not
+ * valid in UTF-8.
+ *
+ * Each of the starting bytes E0, ED, F0 and F4 allows only a sub-range
+ * of 80..BF for the second byte. E0 and ED use opposite halves of the
+ * range (A0..BF and 80..9F). F0 and F4 split it 48:16 (90..BF and
+ * 80..8F); these sub-ranges do not overlap either.
+ *
+ * Valid UTF-8 text cannot include C0,C1,F5..FF bytes at any position.
+ * 80..BF are invalid at starting position.
+ * 00..7F,C2..F4 are invalid at any trailing position (actually they
+ * invalidate previous char while new starting byte itself can be a
+ * part of valid char, but the whole string would be invalid for
+ * purposes of valid_utf8()).
+ *
+ * See also https://en.wikipedia.org/wiki/UTF-8#Codepage_layout
+ */
+
+/* This file has up to 6 levels of nesting, so tab-width 4 might be
+ * helpful. Deep nesting is the price for simple regular structure. */
+
+/* True when byte value c lies in 80..BF, the range of trailing bytes. */
+#define is_trailing(c) (0x80 <= (c) && (c) < 0xC0)
+
+/* Within this file every valid_utf8() call first calls inc_test(); the
+ * comma operator keeps the function's own return value as the result.
+ * The inner name is not expanded again, so it refers to the real function. */
+#define valid_utf8(a) (inc_test(), valid_utf8((a)))
+/* Check cond. On failure: print the test name, the text of the condition
+ * and the bytes of `buf` up to its first NUL as formatted by hex(), call
+ * inc_failed_test() and return from the enclosing function. Usable only
+ * where a variable named `buf` is in scope and a bare `return;` is valid. */
+#define expect(cond) \
+ do { \
+ if (!(cond)) { \
+ printf("Failed %s(): check '%s' fails for this byte sequence: %s\n", \
+ Results.test_name, #cond, hex(buf, strlen((void *)buf))); \
+ inc_failed_test(); \
+ return; /* early exit for the whole test */ \
+ } \
+ } while (0)
+
+/*
+ * Exercise valid_utf8() on NUL-terminated byte sequences of 0 to 5 bytes
+ * built in a local buffer.
+ *
+ * Results expected by the checks below: 0 for an invalid sequence, 1 for
+ * the empty string or a single ASCII byte, 2 for exactly one valid
+ * multi-byte character (also when it is followed by one ASCII byte).
+ *
+ * The first failing expect() reports the offending bytes and returns
+ * from this function, skipping all remaining checks.
+ */
+void _test_valid_utf8(void)
+{
+	/* Up to 5 test bytes plus the terminating NUL. */
+	UTF8 buf[6] = { 0 };
+
+	/* Empty string is valid. */
+	expect(valid_utf8(buf) == 1);
+
+	/* 1 byte: ASCII is valid, non-ascii alone is invalid. */
+	for (int c = 0; c < 256; c++) {
+		buf[0] = c;
+		buf[1] = '\0';
+		expect(valid_utf8(buf) == (c < 128));
+	}
+
+	/*
+	 * Select the values tried for the third and fourth bytes: every
+	 * 16th value starting at 0x79, plus dense runs around both borders
+	 * of the 80..BF range for trailing bytes.
+	 */
+	unsigned char trailing_sparse_check[256] = { 0 };
+	for (int c = 0x79; c < 256; c += 16)
+		trailing_sparse_check[c] = 1;
+	for (int c = 0x80 - 8; c < 0x80 + 8; c++)
+		trailing_sparse_check[c] = 1;
+	for (int c = 0xBF - 8; c < 0xBF + 8; c++)
+		trailing_sparse_check[c] = 1;
+
+	/* Multi-byte test: either start is valid or we grow sequence (up to 5 bytes). */
+	for (int c1 = 128; c1 < 256; c1++) {
+		buf[0] = c1;
+		buf[1] = '\0';
+
+		/* Invalid starting byte would be checked with all endings. So
+		 * checks are sparse for invalid starting bytes. */
+		int step = (buf[0] < 0xC2 || 0xF4 < buf[0]) ? 15 : 1;
+
+		for (int c2 = 0x70, r2; c2 < 256; c2 += step) {
+			/* The second byte is checked sparsely only for invalid starts. */
+			buf[1] = c2;
+			buf[2] = '\0';
+			r2 = valid_utf8(buf);
+
+			if (0xC2 <= buf[0] && buf[0] < 0xE0 &&
+			    is_trailing(buf[1])) {
+
+				expect(r2 == 2);
+
+				/* Additional test with 41 and F5 after valid 2-bytes sequence */
+				buf[3] = '\0';
+				buf[2] = 'A';
+				expect(valid_utf8(buf) == 2);
+				buf[2] = 0xF5;
+				expect(valid_utf8(buf) == 0);
+
+				continue;
+			}
+
+			/* Not a complete 2-byte character: must be rejected as is. */
+			expect(r2 == 0);
+			for (int c3 = 0x79, r3; c3 < 256; c3++) {
+				if (0 == trailing_sparse_check[c3])
+					continue; /* run code below sparsely */
+				buf[2] = c3;
+				buf[3] = '\0';
+				r3 = valid_utf8(buf);
+
+				if ((buf[0] == 0xE0 &&
+				     0xA0 <= buf[1] && buf[1] < 0xC0 &&
+				     is_trailing(buf[2])) ||
+
+				    (buf[0] == 0xED &&
+				     0x80 <= buf[1] && buf[1] < 0xA0 &&
+				     is_trailing(buf[2])) ||
+
+				    (0xE1 <= buf[0] && buf[0] < 0xF0 && buf[0] != 0xED &&
+				     is_trailing(buf[1]) &&
+				     is_trailing(buf[2]))) {
+
+					expect(r3 == 2);
+					continue;
+				}
+
+				/* Not a complete 3-byte character either. */
+				expect(r3 == 0);
+				for (int c4 = 0x79, r4; c4 < 256; c4++) {
+					if (0 == trailing_sparse_check[c4])
+						continue; /* run code below sparsely */
+					buf[3] = c4;
+					buf[4] = '\0';
+					r4 = valid_utf8(buf);
+
+					if ((buf[0] == 0xF0 &&
+					     0x90 <= buf[1] && buf[1] < 0xC0 &&
+					     is_trailing(buf[2]) &&
+					     is_trailing(buf[3])) ||
+
+					    (buf[0] == 0xF4 &&
+					     0x80 <= buf[1] && buf[1] < 0x90 &&
+					     is_trailing(buf[2]) &&
+					     is_trailing(buf[3])) ||
+
+					    ((buf[0] == 0xF1 || buf[0] == 0xF2 || buf[0] == 0xF3) &&
+					     is_trailing(buf[1]) &&
+					     is_trailing(buf[2]) &&
+					     is_trailing(buf[3]))) {
+
+						expect(r4 == 2);
+						continue;
+					}
+
+					/* No valid 4-byte character: a fifth byte cannot fix it. */
+					expect(r4 == 0);
+					for (int c5 = 0x79; c5 < 256; c5 += 32) {
+						/* We test only a few values for the fifth byte. */
+						buf[4] = c5;
+						buf[5] = '\0';
+						expect(valid_utf8(buf) == 0);
+					}
+				}
+			}
+		}
+	}
+}
+
+/* Test entry point for valid_utf8(); called from main() in unit-tests.c.
+ * Brackets the checks with start_test()/end_test() and clears `failed`
+ * before running them.
+ * NOTE(review): `failed` is declared outside this file - presumably the
+ * harness's failure flag; confirm. */
+void test_valid_utf8()
+{
+ start_test(__FUNCTION__); /* passes "test_valid_utf8" as the test name */
+ failed = 0;
+ _test_valid_utf8(); /* returns early on the first failed expect() */
+ end_test();
+}
+
+#undef expect
+#undef is_trailing
+#undef valid_utf8
diff --git a/src/tests/unit-tests.c b/src/tests/unit-tests.c
@@ -41,9 +41,12 @@
 #include "../misc.h"
 #include "../memory.h"
 #include "../common.h"
+#include "../unicode.h"
 
 #include "../sha2.h"
 
+struct options_main options; /* fake symbol to compile with unicode.o */
+
 char *_fgetl_pad = NULL;
 #ifdef __sun
 /* Solaris fprintf() seems to get confused at around 16384 */
@@ -2439,6 +2442,9 @@ void test_sha2_c() {
  end_test();
 }
 
+/* Tests for unicode.c */
+#include "test_valid_utf8.c"
+
 int main() {
  start_of_run = clock();
 
@@ -2493,6 +2499,9 @@ int main() {
  set_unit_test_source("sha2.c");
  test_sha2_c();
 
+ set_unit_test_source("unicode.c");
+ test_valid_utf8();
+
  // perform dump listing of all processed functions.
  dump_stats();
 

diff --git a/src/unicode.c b/src/unicode.c
@@ -530,7 +530,48 @@ inline size_t strlen_any(const void *source)
  return len;
 }
 
-/* Check if a string is valid UTF-8 */
+/*
+ * Check if a string is valid UTF-8
+ *
+ * Valid UTF-8 sequences of bytes (1-4 bytes long):
+ *
+ * 00..7F
+ *
+ * C2..DF 80..BF
+ *
+ * E0 A0..BF 80..BF
+ * ED 80..9F 80..BF
+ * Ex 80..BF 80..BF where Ex does not include E0 and ED
+ *
+ * F0 90..BF 80..BF 80..BF
+ * F4 80..8F 80..BF 80..BF notice 8F as upper bound in the second byte
+ * F1..F3 80..BF 80..BF 80..BF
+ *
+ * (X..Y denotes range from X to Y inclusive. X and Y are byte
+ * values written in hex.)
+ *
+ * Incomplete sequences are invalid.
+ *
+ * Range 80..BF is for trailing bytes (also called continuation
+ * bytes). It is not a valid starting byte. Adjacent values C0 and C1
+ * could be considered starting 2-bytes sequences but they are not
+ * valid in UTF-8.
+ *
+ * Each of the starting bytes E0, ED, F0 and F4 allows only a sub-range
+ * of 80..BF for the second byte. E0 and ED use opposite halves of the
+ * range (A0..BF and 80..9F). F0 and F4 split it 48:16 (90..BF and
+ * 80..8F); these sub-ranges do not overlap either.
+ *
+ * Valid UTF-8 text cannot include C0,C1,F5..FF bytes at any position.
+ * 80..BF are invalid at starting position.
+ * 00..7F,C2..F4 are invalid at any trailing position (actually they
+ * invalidate previous char while new starting byte itself could be a
+ * part of a valid char, but even then the whole string would be
+ * invalid for purposes of valid_utf8()).
+ *
+ * See also unicode.h and tests/test_valid_utf8.c
+ * See also https://en.wikipedia.org/wiki/UTF-8#Codepage_layout
+ */
 int valid_utf8(const UTF8 *source)
 {
  UTF8 a;