Skip to content

Commit

Permalink
Unicode: add unit tests and additional comments for valid_utf8()
Browse files Browse the repository at this point in the history
  • Loading branch information
AlekseyCherepanov committed Sep 15, 2024
1 parent 1c8fd6b commit f5d53d1
Show file tree
Hide file tree
Showing 5 changed files with 269 additions and 5 deletions.
7 changes: 5 additions & 2 deletions src/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,10 @@ testfiles:
###############################################################################

UNIT_TEST_OBJS = \
tests/unit-tests.o tests/misc.o tests/common.o tests/memory.o tests/sha2.o
tests/unit-tests.o tests/misc.o tests/common.o tests/memory.o tests/sha2.o unicode.o

UNIT_TEST_INCLUDED_PIECES = \
tests/test_valid_utf8.c

tests/unit-tests.o: tests/unit-tests.c common.h memory.h misc.h
$(CC) -o tests/unit-tests.o $(CFLAGS) -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c
Expand All @@ -823,7 +826,7 @@ tests/memory.o: memory.c arch.h misc.h jumbo.h autoconfig.h memory.h common.h jo
# keep the 'easy name' build target of unit-tests The 'real' target is ../run/unit-tests[.exe]
unit-tests: ../run/unit-tests@EXE_EXT@

../run/unit-tests@EXE_EXT@: $(UNIT_TEST_OBJS)
../run/unit-tests@EXE_EXT@: $(UNIT_TEST_OBJS) $(UNIT_TEST_INCLUDED_PIECES)
$(LD) $(UNIT_TEST_OBJS) $(LDFLAGS) @OPENSSL_LIBS@ -o $@
@ echo "Now Running the Unit Tests"
@ ${POSSIBLE_WINE_MSG}
Expand Down
4 changes: 2 additions & 2 deletions src/Makefile.legacy
Original file line number Diff line number Diff line change
Expand Up @@ -337,8 +337,8 @@ default:
@echo "beos-x86-any BeOS, x86"
@echo "generic Any other Unix-like system with gcc"

unit-tests:
$(CC) -o ../run/unit-tests -Wall -O2 -fomit-frame-pointer -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c misc.c common.c memory.c sha2.c
unit-tests: unicode.o
$(CC) -o ../run/unit-tests -Wall -O2 -fomit-frame-pointer -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c misc.c common.c memory.c sha2.c unicode.o -lcrypto
../run/unit-tests

linux-x86-64-avx512:
Expand Down
211 changes: 211 additions & 0 deletions src/tests/test_valid_utf8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
/*
* Copyright (c) 2024 Aleksey Cherepanov
* Redistribution and use in source and binary forms, with or without
* modification, are permitted.
*/

/* Test code for valid_utf8() from unicode.c vs single char UTF-8 sequences
*
* It tests only byte sequences for a single character, plus a few
* cases of an additional byte after a valid 2-byte sequence.
* Higher-level logic is not checked. The test is sparse: contiguous
* ranges are checked with gaps, while dense checks are applied close
* to range borders to catch off-by-one mistakes. Valid sequences are
* limited to a single character (1-4 bytes). Invalid sequences go up
* to 5 bytes and use even bigger steps for skipping. ASCII bytes at
* trailing positions are tested lightly.
*
* Description in PR#5531 contains a script to test valid_utf8()
* against Python3. https://github.com/openwall/john/pull/5531
*/

/* Related info about UTF-8
*
* Valid UTF-8 sequences of bytes (1-4 bytes long):
*
* 00..7F
*
* C2..DF 80..BF
*
* E0 A0..BF 80..BF
* ED 80..9F 80..BF
* Ex 80..BF 80..BF where Ex does not include E0 and ED
*
* F0 90..BF 80..BF 80..BF
* F4 80..8F 80..BF 80..BF notice 8F as upper bound in the second byte
* F1..F3 80..BF 80..BF 80..BF
*
* (X..Y denotes range from X to Y inclusive. X and Y are byte
* values written in hex.)
*
* Incomplete sequences are invalid.
*
* Range 80..BF is for trailing bytes (also called continuation
* bytes). These values are not valid starting bytes. The adjacent
* values C0 and C1 look like starts of 2-byte sequences, but they
* could only produce overlong encodings of 00..7F, so they are not
* valid in UTF-8.
*
* Each of the starting bytes E0, ED, F0 and F4 allows only a sub-range
* of 80..BF for the trailing byte at the second position. E0 and ED
* use different halves of the range (A0..BF and 80..9F). F0 and F4
* split it in other proportions (48:16, i.e. 90..BF and 80..8F), and
* their sub-ranges do not overlap either.
*
* Valid UTF-8 text cannot include C0,C1,F5..FF bytes at any position.
* 80..BF are invalid at starting position.
* 00..7F,C2..F4 are invalid at any trailing position (actually they
* invalidate previous char while new starting byte itself can be a
* part of valid char, but the whole string would be invalid for
* purposes of valid_utf8()).
*
* See also https://en.wikipedia.org/wiki/UTF-8#Codepage_layout
*/

/* This file has up to 6 levels of nesting, so tab-width 4 might be
* helpful. Deep nesting is the price for simple regular structure. */

/* True when byte value c lies in 80..BF (0xC0 itself is excluded). */
#define is_trailing(c) (0x80 <= (c) && (c) < 0xC0)

/*
 * Wrap valid_utf8() so that every call written below this point first
 * calls inc_test(). A macro name is not re-expanded inside its own
 * replacement, so the inner valid_utf8 names the function itself
 * (presumably the one declared in unicode.h).
 */
#define valid_utf8(a) (inc_test(), valid_utf8((a)))
/*
 * Check one condition. On failure: print the test name, the failed
 * condition text and the hex dump of the current byte sequence, call
 * inc_failed_test(), and return from the enclosing function.
 *
 * The macro reads a variable named buf from the scope where it is used
 * and measures it with strlen(), so buf must be NUL-terminated. Because
 * of the bare "return;", it can only be used inside functions returning
 * void.
 */
#define expect(cond) \
do { \
if (!(cond)) { \
printf("Failed %s(): check '%s' fails for this byte sequence: %s\n", \
Results.test_name, #cond, hex(buf, strlen((void *)buf))); \
inc_failed_test(); \
return; /* early exit for the whole test */ \
} \
} while (0)

/*
 * Worker for test_valid_utf8(): feeds NUL-terminated byte sequences of
 * one character (1 to 5 bytes) to valid_utf8() and compares each result
 * with the value predicted from the UTF-8 byte ranges.
 *
 * Results expected by the checks below:
 *   1 - empty string or a single ASCII byte,
 *   2 - valid sequence containing a multi-byte character,
 *   0 - invalid sequence.
 *
 * The sequence grows one byte at a time: if the current prefix is
 * already a complete valid character it is checked and the loop moves
 * on, otherwise it must be invalid and one more byte is appended.
 *
 * The first failing expect() reports the sequence and returns from this
 * function, so at most one failure is counted per run.
 */
void _test_valid_utf8(void)
{
	/* 5 data bytes at most plus the terminating NUL. */
	UTF8 buf[6] = {0};

	/* Empty string is valid. */
	expect(valid_utf8(buf) == 1);

	/* 1 byte: ASCII is valid, non-ascii alone is invalid. */
	for (int c = 0; c < 256; c++) {
		buf[0] = c;
		buf[1] = '\0';
		expect(valid_utf8(buf) == (c < 128));
	}

	/*
	 * Select byte values used at the 3rd and 4th positions: every 16th
	 * value starting from 0x79, plus dense windows around both borders
	 * of the 80..BF range for trailing bytes.
	 */
	unsigned char trailing_sparse_check[256] = {0};
	for (int c = 0x79; c < 256; c += 16)
		trailing_sparse_check[c] = 1;
	for (int c = 0x80 - 8; c < 0x80 + 8; c++)
		trailing_sparse_check[c] = 1;
	for (int c = 0xBF - 8; c < 0xBF + 8; c++)
		trailing_sparse_check[c] = 1;

	/* Multi-byte test: either start is valid or we grow sequence (up to 5 bytes). */
	for (int c1 = 128; c1 < 256; c1++) {
		buf[0] = c1;
		buf[1] = '\0';

		/* Invalid starting byte would be checked with all endings. So
		 * checks are sparse for invalid starting bytes (anything outside
		 * C2..F4); valid starting bytes get every second byte value. */
		const int step = (buf[0] < 0xC2 || 0xF4 < buf[0]) ? 15 : 1;

		/* Start at 0x70 so that a few ASCII bytes are tried as well. */
		for (int c2 = 0x70, r2; c2 < 256; c2 += step) {
			/* The second byte is checked sparsely only for invalid starts. */
			buf[1] = c2;
			buf[2] = '\0';
			r2 = valid_utf8(buf);

			/* Complete 2-byte character: C2..DF 80..BF */
			if (0xC2 <= buf[0] && buf[0] < 0xE0 &&
			    is_trailing(buf[1])) {

				expect(r2 == 2);

				/* Additional test with 41 and F5 after valid 2-bytes sequence */
				buf[3] = '\0';
				buf[2] = 'A';
				expect(valid_utf8(buf) == 2);
				buf[2] = 0xF5;
				expect(valid_utf8(buf) == 0);

				continue;
			}

			/* Any other 2-byte prefix is incomplete or invalid. */
			expect(r2 == 0);
			for (int c3 = 0x79, r3; c3 < 256; c3++) {
				if (0 == trailing_sparse_check[c3])
					continue; /* run code below sparsely */
				buf[2] = c3;
				buf[3] = '\0';
				r3 = valid_utf8(buf);

				/* Complete 3-byte character: E0 and ED restrict the
				 * second byte, the other Ex bytes accept all of 80..BF. */
				if ((buf[0] == 0xE0 &&
				     0xA0 <= buf[1] && buf[1] < 0xC0 &&
				     is_trailing(buf[2])) ||

				    (buf[0] == 0xED &&
				     0x80 <= buf[1] && buf[1] < 0xA0 &&
				     is_trailing(buf[2])) ||

				    (0xE1 <= buf[0] && buf[0] < 0xF0 && buf[0] != 0xED &&
				     is_trailing(buf[1]) &&
				     is_trailing(buf[2]))) {

					expect(r3 == 2);
					continue;
				}

				expect(r3 == 0);
				for (int c4 = 0x79, r4; c4 < 256; c4++) {
					if (0 == trailing_sparse_check[c4])
						continue; /* run code below sparsely */
					buf[3] = c4;
					buf[4] = '\0';
					r4 = valid_utf8(buf);

					/* Complete 4-byte character: F0 and F4 restrict
					 * the second byte, F1..F3 accept all of 80..BF. */
					if ((buf[0] == 0xF0 &&
					     0x90 <= buf[1] && buf[1] < 0xC0 &&
					     is_trailing(buf[2]) &&
					     is_trailing(buf[3])) ||

					    (buf[0] == 0xF4 &&
					     0x80 <= buf[1] && buf[1] < 0x90 &&
					     is_trailing(buf[2]) &&
					     is_trailing(buf[3])) ||

					    ((buf[0] == 0xF1 || buf[0] == 0xF2 || buf[0] == 0xF3) &&
					     is_trailing(buf[1]) &&
					     is_trailing(buf[2]) &&
					     is_trailing(buf[3]))) {

						expect(r4 == 2);
						continue;
					}

					expect(r4 == 0);
					/* No 5-byte sequence is valid, whatever the last byte. */
					for (int c5 = 0x79; c5 < 256; c5 += 32) {
						/* We test only a few values for the fifth byte. */
						buf[4] = c5;
						buf[5] = '\0';
						expect(valid_utf8(buf) == 0);
					}
				}
			}
		}
	}
}

/*
 * Entry point of the valid_utf8() unit test, called from main() in
 * unit-tests.c.
 *
 * Brackets the worker with start_test()/end_test() and clears the
 * 'failed' variable before the worker runs (presumably the per-test
 * failure state of the unit-test harness - confirm in unit-tests.c).
 * The checks live in a separate worker because expect() leaves the
 * function on the first failure, and end_test() must still be reached.
 */
void test_valid_utf8(void)
{
	start_test(__FUNCTION__);
	failed = 0;
	_test_valid_utf8();
	end_test();
}

#undef expect
#undef is_trailing
#undef valid_utf8
9 changes: 9 additions & 0 deletions src/tests/unit-tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@
#include "../misc.h"
#include "../memory.h"
#include "../common.h"
#include "../unicode.h"

#include "../sha2.h"

struct options_main options; /* fake symbol to compile with unicode.o */

char *_fgetl_pad = NULL;
#ifdef __sun
/* Solaris fprintf() seems to get confused at around 16384 */
Expand Down Expand Up @@ -2439,6 +2442,9 @@ void test_sha2_c() {
end_test();
}

/* Tests for unicode.c */
#include "test_valid_utf8.c"

int main() {
start_of_run = clock();

Expand Down Expand Up @@ -2493,6 +2499,9 @@ int main() {
set_unit_test_source("sha2.c");
test_sha2_c();

set_unit_test_source("unicode.c");
test_valid_utf8();

// perform dump listing of all processed functions.
dump_stats();

Expand Down
43 changes: 42 additions & 1 deletion src/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,48 @@ inline size_t strlen_any(const void *source)
return len;
}

/* Check if a string is valid UTF-8 */
/*
* Check if a string is valid UTF-8
*
* Valid UTF-8 sequences of bytes (1-4 bytes long):
*
* 00..7F
*
* C2..DF 80..BF
*
* E0 A0..BF 80..BF
* ED 80..9F 80..BF
* Ex 80..BF 80..BF where Ex does not include E0 and ED
*
* F0 90..BF 80..BF 80..BF
* F4 80..8F 80..BF 80..BF notice 8F as upper bound in the second byte
* F1..F3 80..BF 80..BF 80..BF
*
* (X..Y denotes range from X to Y inclusive. X and Y are byte
* values written in hex.)
*
* Incomplete sequences are invalid.
*
* Range 80..BF is for trailing bytes (also called continuation
* bytes). These values are not valid starting bytes. The adjacent
* values C0 and C1 look like starts of 2-byte sequences, but they
* could only produce overlong encodings of 00..7F, so they are not
* valid in UTF-8.
*
* Each of the starting bytes E0, ED, F0 and F4 allows only a sub-range
* of 80..BF for the trailing byte at the second position. E0 and ED
* use different halves of the range (A0..BF and 80..9F). F0 and F4
* split it in other proportions (48:16, i.e. 90..BF and 80..8F), and
* their sub-ranges do not overlap either.
*
* Valid UTF-8 text cannot include C0,C1,F5..FF bytes at any position.
* 80..BF are invalid at starting position.
* 00..7F,C2..F4 are invalid at any trailing position (actually they
* invalidate previous char while new starting byte itself could be a
* part of a valid char, but even then the whole string would be
* invalid for purposes of valid_utf8()).
*
* See also unicode.h and tests/test_valid_utf8.c
* See also https://en.wikipedia.org/wiki/UTF-8#Codepage_layout
*/
int valid_utf8(const UTF8 *source)
{
UTF8 a;
Expand Down

0 comments on commit f5d53d1

Please sign in to comment.