Skip to content

Commit

Permalink
utils: Add utf8_is_valid function
Browse files Browse the repository at this point in the history
Add the 'utf8_is_valid' function, which checks whether a given string is valid UTF-8.

Signed-off-by: James Roy <rruuaanng@outlook.com>
  • Loading branch information
rruuaanng committed Nov 6, 2024
1 parent 2f23313 commit f44015d
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 4 deletions.
10 changes: 10 additions & 0 deletions include/zephyr/sys/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,16 @@ char *utf8_trunc(char *utf8_str);
*/
char *utf8_lcpy(char *dst, const char *src, size_t n);

/**
* @brief Checks if the given string @p str is UTF-8 encoded.
*
* At most @p maxlen bytes of @p str are examined, and checking stops early
* at the first NUL terminator. An empty string is reported as valid. A
* multi-byte sequence that is cut off by the terminator or by @p maxlen is
* reported as invalid.
*
* @param str Target string
* @param maxlen The maximum length of string @p str to check
*
* @return true if @p str is UTF-8 encoded, or false otherwise.
*/
bool utf8_is_valid(const char *str, size_t maxlen);

/*
 * Integer base-2 logarithm helpers built on the count-leading-zeros builtins.
 * Each one yields the bit index of the most significant set bit of x, i.e.
 * floor(log2(x)):
 *   __z_log2d - uses __builtin_clz and a width of 32 bits
 *   __z_log2q - uses __builtin_clzll and a width of 64 bits
 *   __z_log2  - picks the 64-bit form when sizeof(x) > 4, else the 32-bit form
 *
 * NOTE(review): the compiler documents __builtin_clz/__builtin_clzll as
 * undefined for an argument of 0, so x is assumed to be non-zero - confirm
 * that callers guarantee this.
 */
#define __z_log2d(x) (32 - __builtin_clz(x) - 1)
#define __z_log2q(x) (64 - __builtin_clzll(x) - 1)
#define __z_log2(x) (sizeof(__typeof__(x)) > 4 ? __z_log2q(x) : __z_log2d(x))
Expand Down
44 changes: 40 additions & 4 deletions lib/utils/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,17 @@
#include <stdint.h>
#include <string.h>
#include <zephyr/sys/__assert.h>
#include <zephyr/sys/util.h>

/* Largest byte value treated as a complete single-byte (ASCII) character. */
#define ASCII_CHAR 0x7F
/*
 * Bit patterns for classifying bytes of a multi-byte sequence.
 * NOTE(review): most users of these four definitions lie outside the visible
 * part of this file (presumably utf8_trunc) - confirm before changing them.
 */
#define SEQUENCE_FIRST_MASK 0xC0
#define SEQUENCE_LEN_2_BYTE 0xC0
#define SEQUENCE_LEN_3_BYTE 0xE0
#define SEQUENCE_LEN_4_BYTE 0xF0
/*
 * Inclusive lead-byte ranges that utf8_is_valid() accepts for 2-, 3- and
 * 4-byte sequences. The 2-byte range starts at 0xC2 (0xC0 + 2), so lead
 * bytes 0xC0 and 0xC1 are rejected; the 4-byte range ends at 0xF4, so lead
 * bytes 0xF5..0xFF are rejected. The 3- and 4-byte minimum values are also
 * used as bit masks by utf8_trunc().
 */
#define SEQUENCE_MIN_LEN_2_BYTE (SEQUENCE_LEN_2_BYTE + 2)
#define SEQUENCE_MAX_LEN_2_BYTE 0xDF
#define SEQUENCE_MIN_LEN_3_BYTE 0xE0
#define SEQUENCE_MAX_LEN_3_BYTE 0xEF
#define SEQUENCE_MIN_LEN_4_BYTE 0xF0
#define SEQUENCE_MAX_LEN_4_BYTE 0xF4

char *utf8_trunc(char *utf8_str)
{
Expand Down Expand Up @@ -46,11 +51,11 @@ char *utf8_trunc(char *utf8_str)
* matches the number of bytes we searched for the starting byte
*/
seq_start_byte = *last_byte_p;
if ((seq_start_byte & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_4_BYTE) {
if ((seq_start_byte & SEQUENCE_MIN_LEN_4_BYTE) == SEQUENCE_MIN_LEN_4_BYTE) {
if (bytes_truncated == 4) {
return utf8_str;
}
} else if ((seq_start_byte & SEQUENCE_LEN_3_BYTE) == SEQUENCE_LEN_3_BYTE) {
} else if ((seq_start_byte & SEQUENCE_MIN_LEN_3_BYTE) == SEQUENCE_MIN_LEN_3_BYTE) {
if (bytes_truncated == 3) {
return utf8_str;
}
Expand Down Expand Up @@ -79,3 +84,34 @@ char *utf8_lcpy(char *dst, const char *src, size_t n)

return dst;
}

/*
 * Check that str is a structurally well-formed UTF-8 byte sequence.
 *
 * At most maxlen bytes are examined and scanning stops at the first NUL
 * terminator, so an empty string is valid. Every byte must be either ASCII
 * (0x00..0x7F) or part of a 2-, 3- or 4-byte sequence whose lead byte lies in
 * 0xC2..0xDF, 0xE0..0xEF or 0xF0..0xF4 respectively and whose remaining bytes
 * are all continuation bytes (10xxxxxx). A sequence cut short by the
 * terminator or by maxlen makes the string invalid.
 *
 * Only the structure is checked: the decoded code point is not range-checked,
 * so for example the 3-byte forms of UTF-16 surrogates are still accepted.
 *
 * Returns true when the examined bytes are well formed, false otherwise.
 */
bool utf8_is_valid(const char *str, size_t maxlen)
{
	enum {
		UTF8_ASCII_MAX = 0x7F,
		UTF8_LEAD2_MIN = 0xC2, /* 0xC0/0xC1 could only encode overlong ASCII */
		UTF8_LEAD2_MAX = 0xDF,
		UTF8_LEAD3_MIN = 0xE0,
		UTF8_LEAD3_MAX = 0xEF,
		UTF8_LEAD4_MIN = 0xF0,
		UTF8_LEAD4_MAX = 0xF4, /* larger lead bytes exceed U+10FFFF */
		UTF8_CONT_MASK = 0xC0,
		UTF8_CONT_BITS = 0x80, /* continuation bytes look like 10xxxxxx */
	};
	/*
	 * Plain char may be signed, in which case every byte >= 0x80 would be
	 * negative and fail all of the range comparisons below. Always look at
	 * the data as unsigned bytes.
	 */
	const unsigned char *bytes = (const unsigned char *)str;
	size_t i = 0;

	while (i < maxlen && bytes[i] != '\0') {
		const unsigned char lead = bytes[i];
		size_t nbyte;

		if (lead <= UTF8_ASCII_MAX) {
			i++;
			continue;
		}

		if (lead >= UTF8_LEAD2_MIN && lead <= UTF8_LEAD2_MAX) {
			nbyte = 2;
		} else if (lead >= UTF8_LEAD3_MIN && lead <= UTF8_LEAD3_MAX) {
			nbyte = 3;
		} else if (lead >= UTF8_LEAD4_MIN && lead <= UTF8_LEAD4_MAX) {
			nbyte = 4;
		} else {
			/* Stray continuation byte or a lead byte UTF-8 never uses. */
			return false;
		}

		/* i < maxlen holds here, so the subtraction cannot wrap. */
		if (nbyte > maxlen - i) {
			return false;
		}

		/*
		 * A NUL inside the sequence is not a continuation byte, so this
		 * loop returns before ever reading past the terminator.
		 */
		for (size_t k = 1; k < nbyte; k++) {
			if ((bytes[i + k] & UTF8_CONT_MASK) != UTF8_CONT_BITS) {
				return false;
			}
		}

		i += nbyte;
	}

	return true;
}
22 changes: 22 additions & 0 deletions tests/unit/util/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,28 @@ ZTEST(util, test_utf8_lcpy_truncated)
zassert_str_equal(dest_str, expected_result, "Failed to copy");
}

ZTEST(util, test_utf8_is_valid)
{
/* Inputs that utf8_is_valid() is expected to accept. */
/* Five multi-byte characters (2- and 3-byte sequences), 11 bytes in total. */
zassert_true(utf8_is_valid("\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", 11));
/* The string starts with a NUL byte, so there is nothing to check. */
zassert_true(utf8_is_valid("\x00", 1));
/* 2-byte sequence whose lead byte is the smallest accepted one (0xC2). */
zassert_true(utf8_is_valid("\xc2\x80", 2));
/* 3-byte sequences with lead bytes 0xEF and 0xED. */
zassert_true(utf8_is_valid("\xef\xbf\xbf", 3));
zassert_true(utf8_is_valid("\xed\x9f\xbf", 3));
/*
 * NOTE(review): ED A0 80 and ED B0 80 are the 3-byte forms of UTF-16
 * surrogates (U+D800, U+DC00), which strict UTF-8 forbids; these two cases
 * pin the current lenient behaviour of accepting them.
 */
zassert_true(utf8_is_valid("\xed\xa0\x80", 3));
zassert_true(utf8_is_valid("\xed\xa0\x80\xed\xb0\x80", 6));
zassert_true(utf8_is_valid("\xef\xbf\xbe", 3));
/* Inputs that utf8_is_valid() is expected to reject. */
/* 0x80 is not in any accepted lead-byte range. */
zassert_false(utf8_is_valid("\x80", 1));
/* 0xC0 is below the smallest accepted 2-byte lead byte (0xC2). */
zassert_false(utf8_is_valid("\xc0", 1));
/* 0xFE and 0xFF are above the largest accepted lead byte (0xF4). */
zassert_false(utf8_is_valid("\xfe", 1));
zassert_false(utf8_is_valid("\xff", 1));
zassert_false(utf8_is_valid("\xfe\xfe\xff\xff", 4));
/* Lead bytes 0xC0 and 0xC1 are rejected whatever byte follows them. */
zassert_false(utf8_is_valid("\xc0\x7f", 2));
zassert_false(utf8_is_valid("\xc0\xaf", 2));
zassert_false(utf8_is_valid("\xc1\xbf", 2));
zassert_false(utf8_is_valid("\xc0\x80", 2));
}

ZTEST(util, test_utf8_lcpy_not_truncated)
{
/* dest_str size is based on storing 3 * € plus the null terminator */
Expand Down

0 comments on commit f44015d

Please sign in to comment.