Skip to content

Commit

Permalink
utils: Add utf8_is_valid function
Browse files Browse the repository at this point in the history
Add the 'utf8_is_valid' function to check whether a given string is valid UTF-8.

Signed-off-by: James Roy <rruuaanng@outlook.com>
  • Loading branch information
rruuaanng committed Nov 15, 2024
1 parent 2f23313 commit 0eabadd
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 4 deletions.
10 changes: 10 additions & 0 deletions include/zephyr/sys/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,16 @@ char *utf8_trunc(char *utf8_str);
*/
char *utf8_lcpy(char *dst, const char *src, size_t n);

/**
 * @brief Checks if the given string @p str is UTF-8 encoded.
 *
 * Scanning stops at the first NUL byte or after @p maxlen bytes, whichever
 * comes first, so an empty string (or a @p maxlen of 0) is reported as valid.
 * A NULL @p str is reported as invalid.
 *
 * @param str Target string
 * @param maxlen The maximum number of bytes of @p str to check
 *
 * @return true if @p str is UTF-8 encoded, or false otherwise.
 */
bool utf8_is_valid(const unsigned char *str, size_t maxlen);

#define __z_log2d(x) (32 - __builtin_clz(x) - 1)
#define __z_log2q(x) (64 - __builtin_clzll(x) - 1)
#define __z_log2(x) (sizeof(__typeof__(x)) > 4 ? __z_log2q(x) : __z_log2d(x))
Expand Down
51 changes: 47 additions & 4 deletions lib/utils/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,17 @@
#include <stdint.h>
#include <string.h>
#include <zephyr/sys/__assert.h>
#include <zephyr/sys/util.h>

/* Largest value of a single-byte (ASCII) UTF-8 code unit */
#define ASCII_CHAR 0x7F
/*
 * NOTE(review): presumably selects the top two bits of a byte so that
 * continuation bytes (10xxxxxx) can be told apart from lead bytes; its use
 * is outside this excerpt -- confirm.
 */
#define SEQUENCE_FIRST_MASK 0xC0
/* Lead-byte bit patterns: 110xxxxx (2 bytes), 1110xxxx (3), 11110xxx (4) */
#define SEQUENCE_LEN_2_BYTE 0xC0
#define SEQUENCE_LEN_3_BYTE 0xE0
#define SEQUENCE_LEN_4_BYTE 0xF0
/*
 * Inclusive lead-byte ranges for each sequence length.
 * The 2-byte range starts at 0xC2 because 0xC0 and 0xC1 could only begin
 * overlong encodings of ASCII; the 4-byte range ends at 0xF4 because larger
 * lead bytes would encode values above U+10FFFF.
 */
#define SEQUENCE_MIN_LEN_2_BYTE (SEQUENCE_LEN_2_BYTE + 2)
#define SEQUENCE_MAX_LEN_2_BYTE 0xDF
#define SEQUENCE_MIN_LEN_3_BYTE 0xE0
#define SEQUENCE_MAX_LEN_3_BYTE 0xEF
#define SEQUENCE_MIN_LEN_4_BYTE 0xF0
#define SEQUENCE_MAX_LEN_4_BYTE 0xF4

char *utf8_trunc(char *utf8_str)
{
Expand Down Expand Up @@ -46,11 +51,11 @@ char *utf8_trunc(char *utf8_str)
* matches the number of bytes we searched for the starting byte
*/
seq_start_byte = *last_byte_p;
if ((seq_start_byte & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_4_BYTE) {
if ((seq_start_byte & SEQUENCE_MIN_LEN_4_BYTE) == SEQUENCE_MIN_LEN_4_BYTE) {
if (bytes_truncated == 4) {
return utf8_str;
}
} else if ((seq_start_byte & SEQUENCE_LEN_3_BYTE) == SEQUENCE_LEN_3_BYTE) {
} else if ((seq_start_byte & SEQUENCE_MIN_LEN_3_BYTE) == SEQUENCE_MIN_LEN_3_BYTE) {
if (bytes_truncated == 3) {
return utf8_str;
}
Expand Down Expand Up @@ -79,3 +84,41 @@ char *utf8_lcpy(char *dst, const char *src, size_t n)

return dst;
}

/*
 * Check that str holds well-formed UTF-8 as defined by RFC 3629.
 *
 * At most len bytes are examined; scanning also stops at the first NUL byte,
 * so a NUL-terminated string shorter than len is never read past its
 * terminator.
 *
 * Returns false when:
 *   - str is NULL,
 *   - a byte that cannot start a sequence is found (0x80..0xC1, 0xF5..0xFF),
 *   - a multi-byte sequence is cut short by len or by the NUL terminator,
 *   - a continuation byte is not of the form 10xxxxxx,
 *   - the sequence is an overlong form, a UTF-16 surrogate (U+D800..U+DFFF)
 *     or a value above U+10FFFF.
 * Returns true otherwise, including for an empty string or len == 0.
 */
bool utf8_is_valid(const unsigned char *str, size_t len)
{
	size_t i = 0;

	if (str == NULL) {
		return false;
	}

	while (i < len && str[i] != '\0') {
		const unsigned char lead = str[i];
		/* Allowed range of the byte right after the lead byte. */
		unsigned char second_min = 0x80;
		unsigned char second_max = 0xBF;
		size_t nbyte;

		if (lead <= 0x7F) {
			/* Plain ASCII */
			i++;
			continue;
		}

		if (lead >= 0xC2 && lead <= 0xDF) {
			/* 0xC0 and 0xC1 would only encode overlong ASCII */
			nbyte = 2;
		} else if (lead >= 0xE0 && lead <= 0xEF) {
			nbyte = 3;
			if (lead == 0xE0) {
				/* E0 80..9F would be an overlong 2-byte value */
				second_min = 0xA0;
			} else if (lead == 0xED) {
				/* ED A0..BF would be a surrogate U+D800..U+DFFF */
				second_max = 0x9F;
			}
		} else if (lead >= 0xF0 && lead <= 0xF4) {
			nbyte = 4;
			if (lead == 0xF0) {
				/* F0 80..8F would be an overlong 3-byte value */
				second_min = 0x90;
			} else if (lead == 0xF4) {
				/* F4 90..BF would exceed U+10FFFF */
				second_max = 0x8F;
			}
		} else {
			/* Stray continuation byte or impossible lead byte */
			return false;
		}

		/* i < len holds here, so the subtraction cannot wrap. */
		if (nbyte > len - i) {
			return false;
		}

		/*
		 * Bytes are checked strictly in order and the first failure
		 * returns, so a NUL terminator inside the sequence (which is
		 * never a valid continuation byte) stops the scan before
		 * anything behind it is read.
		 */
		if (str[i + 1] < second_min || str[i + 1] > second_max) {
			return false;
		}
		for (size_t k = 2; k < nbyte; k++) {
			if ((str[i + k] & 0xC0) != 0x80) {
				return false;
			}
		}

		i += nbyte;
	}

	return true;
}
21 changes: 21 additions & 0 deletions tests/unit/util/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,27 @@ ZTEST(util, test_utf8_lcpy_truncated)
zassert_str_equal(dest_str, expected_result, "Failed to copy");
}

/* Exercise utf8_is_valid() with well-formed and malformed byte sequences. */
ZTEST(util, test_utf8_is_valid)
{
	/* Well-formed input must be accepted */
	zassert_true(utf8_is_valid("κόσμε", 11));
	zassert_true(utf8_is_valid("\x00", 1), "1 byte (U-00000000)");
	zassert_true(utf8_is_valid("\xc2\x80", 2), "2 bytes (U-00000080)");
	zassert_true(utf8_is_valid("\xef\xbf\xbf", 3), "(U-0000FFFF)");
	zassert_true(utf8_is_valid("\xed\x9f\xbf", 3), "U-0000D7FF");
	zassert_true(utf8_is_valid("\xef\xbf\xbf", 3), "Noncharacter U+FFFF");
	zassert_true(utf8_is_valid("\xef\xbf\xbe", 3), "Noncharacter U+FFFE");

	/* Malformed input must be rejected */
	zassert_false(utf8_is_valid("\x80", 1), "First continuation byte 0x80");
	zassert_false(utf8_is_valid("\xc0", 1), "2-bytes U+0000, last byte missing");
	zassert_false(utf8_is_valid("\xfe", 1), "impossible byte");
	zassert_false(utf8_is_valid("\xfe\xfe\xff\xff", 4), "several impossible bytes");
	zassert_false(utf8_is_valid("\xc0\x7f", 2), "no continuation byte");
	zassert_false(utf8_is_valid("\xc0\xaf", 2), "Overlong U+002F");
	zassert_false(utf8_is_valid("\xc1\xbf", 2), "Overlong U-0000007F");
	zassert_false(utf8_is_valid("\xc0\x80", 2), "2 bytes overlong U+0000");
	zassert_false(utf8_is_valid(NULL, 1), "NULL str argument");
}

ZTEST(util, test_utf8_lcpy_not_truncated)
{
/* dest_str size is based on storing 3 * € plus the null terminator */
Expand Down

0 comments on commit 0eabadd

Please sign in to comment.