diff --git a/src/Makefile.in b/src/Makefile.in index 5989f6581f..4bf7378223 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -803,7 +803,10 @@ testfiles: ############################################################################### UNIT_TEST_OBJS = \ - tests/unit-tests.o tests/misc.o tests/common.o tests/memory.o tests/sha2.o + tests/unit-tests.o tests/misc.o tests/common.o tests/memory.o tests/sha2.o unicode.o + +UNIT_TEST_INCLUDED_PIECES = \ + tests/test_valid_utf8.c tests/unit-tests.o: tests/unit-tests.c common.h memory.h misc.h $(CC) -o tests/unit-tests.o $(CFLAGS) -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c @@ -823,7 +826,7 @@ tests/memory.o: memory.c arch.h misc.h jumbo.h autoconfig.h memory.h common.h jo # keep the 'easy name' build target of unit-tests The 'real' target is ../run/unit-tests[.exe] unit-tests: ../run/unit-tests@EXE_EXT@ -../run/unit-tests@EXE_EXT@: $(UNIT_TEST_OBJS) +../run/unit-tests@EXE_EXT@: $(UNIT_TEST_OBJS) $(UNIT_TEST_INCLUDED_PIECES) $(LD) $(UNIT_TEST_OBJS) $(LDFLAGS) @OPENSSL_LIBS@ -o $@ @ echo "Now Running the Unit Tests" @ ${POSSIBLE_WINE_MSG} diff --git a/src/Makefile.legacy b/src/Makefile.legacy index 9be07db64f..11e3ffccb5 100644 --- a/src/Makefile.legacy +++ b/src/Makefile.legacy @@ -337,8 +337,8 @@ default: @echo "beos-x86-any BeOS, x86" @echo "generic Any other Unix-like system with gcc" -unit-tests: - $(CC) -o ../run/unit-tests -Wall -O2 -fomit-frame-pointer -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c misc.c common.c memory.c sha2.c +unit-tests: unicode.o + $(CC) -o ../run/unit-tests -Wall -O2 -fomit-frame-pointer -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c misc.c common.c memory.c sha2.c unicode.o -lcrypto ../run/unit-tests linux-x86-64-avx512: diff --git a/src/tests/test_valid_utf8.c b/src/tests/test_valid_utf8.c new file mode 100644 index 0000000000..b806a5dc40 --- /dev/null +++ b/src/tests/test_valid_utf8.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2024 Aleksey Cherepanov + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + */ + +/* Test code for valid_utf8() from unicode.c vs single char UTF-8 sequences + * + * It tests only a sequence of bytes for single character. Plus a few + * cases of additional byte after valid 2-bytes long sequence are + * tested. Higher-level logic is not checked. Test is sparse: checks + * of continuous blocks are applied skipping parts. Dense checks are + * applied close to borders to catch off-by-one mistakes. Valid + * sequences are limited to single character (1-4 bytes). Invalid + * sequences go up to 5 bytes and use even bigger steps for skipping. + * ASCII bytes at trailing positions are tested lightly. + * + * Description in PR#5531 contains a script to test valid_utf8() + * against Python3. https://github.com/openwall/john/pull/5531 + */ + +/* Related info about UTF-8 + * + * Valid UTF-8 sequences of bytes (1-4 bytes long): + * + * 00..7F + * + * C2..DF 80..BF + * + * E0 A0..BF 80..BF + * ED 80..9F 80..BF + * Ex 80..BF 80..BF where Ex does not include E0 and ED + * + * F0 90..BF 80..BF 80..BF + * F4 80..8F 80..BF 80..BF notice 8F as upper bound in the second byte + * F1..F3 80..BF 80..BF 80..BF + * + * (X..Y denotes range from X to Y inclusive. X and Y are byte + * values written in hex.) + * + * Incomplete sequences are invalid. + * + * Range 80..BF is for trailing bytes (also called continuation + * bytes). It is not a valid starting byte. Adjacent values C0 and C1 + * could be considered starting 2-bytes sequences but they are not + * valid in UTF-8. + * + * Each of E0,ED,F0,F4 starting bytes use a sub-range for trailing + * byte at the second position. E0/ED use different halves of the + * range for the second byte. F0/F4 allow the second byte in other + * proportions (48:16), not overlapping too. + * + * Valid UTF-8 text cannot include C0,C1,F5..FF bytes at any position. + * 80..BF are invalid at starting position. + * 00..7F,C2..F4 are invalid at any trailing position (actually they + * invalidate previous char while new starting byte itself can be a + * part of valid char, but the whole string would be invalid for + * purposes of valid_utf8()). + * + * See also https://en.wikipedia.org/wiki/UTF-8#Codepage_layout + */ + +/* This file has up to 6 levels of nesting, so tab-width 4 might be + * helpful. Deep nesting is the price for simple regular structure. */ + +#define is_trailing(c) (0x80 <= (c) && (c) < 0xC0) + +#define valid_utf8(a) (inc_test(), valid_utf8((a))) +#define expect(cond) \ + do { \ + if (!(cond)) { \ + printf("Failed %s(): check '%s' fails for this byte sequence: %s\n", \ + Results.test_name, #cond, hex(buf, strlen((void *)buf))); \ + inc_failed_test(); \ + return; /* early exit for the whole test */ \ + } \ + } while (0) + +void _test_valid_utf8() +{ + UTF8 buf[6] = {}; + + /* Empty string is valid. */ + expect(valid_utf8(buf) == 1); + + /* 1 byte: ASCII is valid, non-ascii alone is invalid. */ + for (int c = 0; c < 256; c++) { + buf[0] = c; + buf[1] = '\0'; + expect(valid_utf8(buf) == (c < 128)); + } + + /* Setup dense check around borders of 80..BF range for trailing bytes. */ + unsigned char trailing_sparse_check[256] = {}; + for (int c = 0x79; c < 256; c += 16) + trailing_sparse_check[c] = 1; + for (int c = 0x80 - 8; c < 0x80 + 8; c++) + trailing_sparse_check[c] = 1; + for (int c = 0xBF - 8; c < 0xBF + 8; c++) + trailing_sparse_check[c] = 1; + + /* Multi-byte test: either start is valid or we grow sequence (up to 5 bytes). */ + for (int c1 = 128; c1 < 256; c1++) { + buf[0] = c1; + buf[1] = '\0'; + + int step = 1; + + /* Invalid starting byte would be checked with all endings. So + * checks are sparse for invalid starting bytes. */ + if (buf[0] < 0xC2 || 0xF4 < buf[0]) + step = 15; /* sparse checks */ + else + step = 1; + + for (int c2 = 0x70, r2; c2 < 256; c2 += step) { + /* The second byte is checked sparsely only for invalid starts. */ + buf[1] = c2; + buf[2] = '\0'; + r2 = valid_utf8(buf); + + if (0xC2 <= buf[0] && buf[0] < 0xE0 && + is_trailing(buf[1])) { + + expect(r2 == 2); + + /* Additional test with 41 and F5 after valid 2-bytes sequence */ + buf[3] = '\0'; + buf[2] = 'A'; + expect(valid_utf8(buf) == 2); + buf[2] = 0xF5; + expect(valid_utf8(buf) == 0); + + continue; + } + + expect(r2 == 0); + for (int c3 = 0x79, r3; c3 < 256; c3++) { + if (0 == trailing_sparse_check[c3]) + continue; /* run code below sparsely */ + buf[2] = c3; + buf[3] = '\0'; + r3 = valid_utf8(buf); + + if ((buf[0] == 0xE0 && + 0xA0 <= buf[1] && buf[1] < 0xC0 && + is_trailing(buf[2])) || + + (buf[0] == 0xED && + 0x80 <= buf[1] && buf[1] < 0xA0 && + is_trailing(buf[2])) || + + (0xE1 <= buf[0] && buf[0] < 0xF0 && buf[0] != 0xED && + is_trailing(buf[1]) && + is_trailing(buf[2]))) { + + expect(r3 == 2); + continue; + } + + expect(r3 == 0); + for (int c4 = 0x79, r4; c4 < 256; c4++) { + if (0 == trailing_sparse_check[c4]) + continue; /* run code below sparsely */ + buf[3] = c4; + buf[4] = '\0'; + r4 = valid_utf8(buf); + + if ((buf[0] == 0xF0 && + 0x90 <= buf[1] && buf[1] < 0xC0 && + is_trailing(buf[2]) && + is_trailing(buf[3])) || + + (buf[0] == 0xF4 && + 0x80 <= buf[1] && buf[1] < 0x90 && + is_trailing(buf[2]) && + is_trailing(buf[3])) || + + ((buf[0] == 0xF1 || buf[0] == 0xF2 || buf[0] == 0xF3) && + is_trailing(buf[1]) && + is_trailing(buf[2]) && + is_trailing(buf[3]))) { + + expect(r4 == 2); + continue; + } + + expect(r4 == 0); + for (int c5 = 0x79; c5 < 256; c5 += 32) { + /* We test only a few values for the fifth byte. */ + buf[4] = c5; + buf[5] = '\0'; + expect(valid_utf8(buf) == 0); + } + } + } + } + } +} + +void test_valid_utf8() +{ + start_test(__FUNCTION__); + failed = 0; + _test_valid_utf8(); + end_test(); +} + +#undef expect +#undef is_trailing +#undef valid_utf8 diff --git a/src/tests/unit-tests.c b/src/tests/unit-tests.c index 05a1328b2d..9494b34295 100644 --- a/src/tests/unit-tests.c +++ b/src/tests/unit-tests.c @@ -41,9 +41,12 @@ #include "../misc.h" #include "../memory.h" #include "../common.h" +#include "../unicode.h" #include "../sha2.h" +struct options_main options; /* fake symbol to compile with unicode.o */ + char *_fgetl_pad = NULL; #ifdef __sun /* Solaris fprintf() seems to get confused at around 16384 */ @@ -2439,6 +2442,9 @@ void test_sha2_c() { end_test(); } +/* Tests for unicode.c */ +#include "test_valid_utf8.c" + int main() { start_of_run = clock(); @@ -2493,6 +2499,9 @@ int main() { set_unit_test_source("sha2.c"); test_sha2_c(); + set_unit_test_source("unicode.c"); + test_valid_utf8(); + // perform dump listing of all processed functions. dump_stats(); diff --git a/src/unicode.c b/src/unicode.c index 592d567fe9..2f15f406db 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -530,7 +530,48 @@ inline size_t strlen_any(const void *source) return len; } -/* Check if a string is valid UTF-8 */ +/* + * Check if a string is valid UTF-8 + * + * Valid UTF-8 sequences of bytes (1-4 bytes long): + * + * 00..7F + * + * C2..DF 80..BF + * + * E0 A0..BF 80..BF + * ED 80..9F 80..BF + * Ex 80..BF 80..BF where Ex does not include E0 and ED + * + * F0 90..BF 80..BF 80..BF + * F4 80..8F 80..BF 80..BF notice 8F as upper bound in the second byte + * F1..F3 80..BF 80..BF 80..BF + * + * (X..Y denotes range from X to Y inclusive. X and Y are byte + * values written in hex.) + * + * Incomplete sequences are invalid. + * + * Range 80..BF is for trailing bytes (also called continuation + * bytes). It is not a valid starting byte. Adjacent values C0 and C1 + * could be considered starting 2-bytes sequences but they are not + * valid in UTF-8. + * + * Each of E0,ED,F0,F4 starting bytes use a sub-range for trailing + * byte at the second position. E0/ED use different halves of the + * range for the second byte. F0/F4 allow the second byte in other + * proportions (48:16), not overlapping too. + * + * Valid UTF-8 text cannot include C0,C1,F5..FF bytes at any position. + * 80..BF are invalid at starting position. + * 00..7F,C2..F4 are invalid at any trailing position (actually they + * invalidate previous char while new starting byte itself could be a + * part of a valid char, but even then the whole string would be + * invalid for purposes of valid_utf8()). + * + * See also unicode.h and tests/test_valid_utf8.c + * See also https://en.wikipedia.org/wiki/UTF-8#Codepage_layout + */ int valid_utf8(const UTF8 *source) { UTF8 a;