Skip to content

Commit

Permalink
utils: Add utf8_is_valid function
Browse files Browse the repository at this point in the history
Add the 'utf8_is_valid' function, which checks whether a given string is UTF-8 encoded.

Signed-off-by: James Roy <rruuaanng@outlook.com>
  • Loading branch information
rruuaanng committed Nov 11, 2024
1 parent 2f23313 commit 5e338ae
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 4 deletions.
10 changes: 10 additions & 0 deletions include/zephyr/sys/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,16 @@ char *utf8_trunc(char *utf8_str);
*/
char *utf8_lcpy(char *dst, const char *src, size_t n);

/**
 * @brief Checks if the given string @p str is UTF-8 encoded.
 *
 * Only the bytes before the first NUL terminator, and at most @p maxlen
 * bytes, are examined. An empty string is reported as valid. A multi-byte
 * sequence that would extend past the examined bytes makes the result false.
 *
 * @param str Target string. Passing NULL makes the function return false.
 * @param maxlen The maximum number of bytes of @p str to check
 *
 * @return true if @p str is UTF-8 encoded, or false otherwise.
 */
bool utf8_is_valid(const unsigned char *str, size_t maxlen);

/*
 * Index of the most significant set bit of x, i.e. floor(log2(x)) for a
 * non-zero x.
 *
 * __z_log2d() counts leading zeros with __builtin_clz() and __z_log2q() with
 * __builtin_clzll(); __z_log2() selects the 'q' variant when the type of x is
 * wider than 4 bytes. The constants 32 and 64 assume those are the bit widths
 * of unsigned int and unsigned long long. The GCC builtins give an undefined
 * result for x == 0, so callers must not pass zero.
 */
#define __z_log2d(x) (32 - __builtin_clz(x) - 1)
#define __z_log2q(x) (64 - __builtin_clzll(x) - 1)
#define __z_log2(x) (sizeof(__typeof__(x)) > 4 ? __z_log2q(x) : __z_log2d(x))
Expand Down
53 changes: 49 additions & 4 deletions lib/utils/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,23 @@
* SPDX-License-Identifier: Apache-2.0
*/

#undef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L

#include <stdint.h>
#include <string.h>
#include <zephyr/sys/__assert.h>
#include <zephyr/sys/util.h>

/* Largest byte value that utf8_is_valid() treats as a single-byte character. */
#define ASCII_CHAR 0x7F
/*
 * NOTE(review): presumably selects the two top bits of a byte to tell lead
 * bytes from continuation bytes; its use is outside this view - confirm.
 */
#define SEQUENCE_FIRST_MASK 0xC0
/* Lead-byte bit prefixes: 110xxxxx, 1110xxxx and 11110xxx respectively. */
#define SEQUENCE_LEN_2_BYTE 0xC0
#define SEQUENCE_LEN_3_BYTE 0xE0
#define SEQUENCE_LEN_4_BYTE 0xF0
/*
 * Inclusive lead-byte ranges that utf8_is_valid() accepts as the start of a
 * 2-, 3- or 4-byte sequence. The 2-byte range starts at 0xC2, so the lead
 * bytes 0xC0 and 0xC1 are rejected, and the 4-byte range stops at 0xF4.
 */
#define SEQUENCE_MIN_LEN_2_BYTE (SEQUENCE_LEN_2_BYTE + 2)
#define SEQUENCE_MAX_LEN_2_BYTE 0xDF
#define SEQUENCE_MIN_LEN_3_BYTE 0xE0
#define SEQUENCE_MAX_LEN_3_BYTE 0xEF
#define SEQUENCE_MIN_LEN_4_BYTE 0xF0
#define SEQUENCE_MAX_LEN_4_BYTE 0xF4

char *utf8_trunc(char *utf8_str)
{
Expand Down Expand Up @@ -46,11 +54,11 @@ char *utf8_trunc(char *utf8_str)
* matches the number of bytes we searched for the starting byte
*/
seq_start_byte = *last_byte_p;
if ((seq_start_byte & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_4_BYTE) {
if ((seq_start_byte & SEQUENCE_MIN_LEN_4_BYTE) == SEQUENCE_MIN_LEN_4_BYTE) {
if (bytes_truncated == 4) {
return utf8_str;
}
} else if ((seq_start_byte & SEQUENCE_LEN_3_BYTE) == SEQUENCE_LEN_3_BYTE) {
} else if ((seq_start_byte & SEQUENCE_MIN_LEN_3_BYTE) == SEQUENCE_MIN_LEN_3_BYTE) {
if (bytes_truncated == 3) {
return utf8_str;
}
Expand Down Expand Up @@ -79,3 +87,40 @@ char *utf8_lcpy(char *dst, const char *src, size_t n)

return dst;
}

/* Return true if @p byte has the UTF-8 continuation form 10xxxxxx. */
static bool utf8_is_continuation(unsigned char byte)
{
	return (byte & 0xC0) == 0x80;
}

/*
 * Check whether @p str holds well-formed UTF-8.
 *
 * Only the bytes before the first NUL terminator, and at most @p maxlen bytes,
 * are examined. Returns false for a NULL pointer, for a byte that cannot start
 * a sequence, for a sequence cut short by the end of the examined bytes, for a
 * trailing byte that is not a continuation byte, for overlong encodings and
 * for code points above U+10FFFF. An empty string is valid.
 *
 * NOTE(review): encoded surrogates (0xED followed by 0xA0..0xBF) are still
 * accepted. RFC 3629 forbids them, but test_utf8_is_valid asserts that they
 * are valid, so rejecting them has to be done together with that test.
 */
bool utf8_is_valid(const unsigned char *str, size_t maxlen)
{
	size_t len;
	size_t i = 0;

	if (str == NULL) {
		return false;
	}

	/* strnlen() takes a char pointer; the cast only changes signedness. */
	len = strnlen((const char *)str, maxlen);

	while (i < len) {
		const unsigned char lead = str[i];
		size_t nbyte;

		if (lead <= ASCII_CHAR) {
			i++;
			continue;
		}

		if (lead >= SEQUENCE_MIN_LEN_2_BYTE && lead <= SEQUENCE_MAX_LEN_2_BYTE) {
			nbyte = 2;
		} else if (lead >= SEQUENCE_MIN_LEN_3_BYTE && lead <= SEQUENCE_MAX_LEN_3_BYTE) {
			nbyte = 3;
		} else if (lead >= SEQUENCE_MIN_LEN_4_BYTE && lead <= SEQUENCE_MAX_LEN_4_BYTE) {
			nbyte = 4;
		} else {
			/* Stray continuation byte, 0xC0/0xC1, or 0xF5..0xFF. */
			return false;
		}

		/*
		 * The sequence must fit in the examined bytes. i < len holds
		 * here, so the subtraction cannot wrap.
		 */
		if (nbyte > len - i) {
			return false;
		}

		/* Every byte after the lead byte must be 10xxxxxx. */
		for (size_t k = 1; k < nbyte; k++) {
			if (!utf8_is_continuation(str[i + k])) {
				return false;
			}
		}

		/*
		 * The first continuation byte is further restricted for three
		 * lead bytes: below 0xA0 after 0xE0 or below 0x90 after 0xF0
		 * is an overlong encoding, and above 0x8F after 0xF4 encodes
		 * a value beyond U+10FFFF.
		 */
		if ((lead == SEQUENCE_MIN_LEN_3_BYTE && str[i + 1] < 0xA0) ||
		    (lead == SEQUENCE_MIN_LEN_4_BYTE && str[i + 1] < 0x90) ||
		    (lead == SEQUENCE_MAX_LEN_4_BYTE && str[i + 1] > 0x8F)) {
			return false;
		}

		i += nbyte;
	}

	return true;
}
23 changes: 23 additions & 0 deletions tests/unit/util/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,29 @@ ZTEST(util, test_utf8_lcpy_truncated)
zassert_str_equal(dest_str, expected_result, "Failed to copy");
}

/*
 * Exercise utf8_is_valid() with strings it must accept, strings it must
 * reject, and a NULL pointer. The second argument is the maximum number of
 * bytes the function may examine.
 */
ZTEST(util, test_utf8_is_valid)
{
/* Test whether the verification function meets the requirements */
/* Greek text made of multi-byte sequences. */
zassert_true(utf8_is_valid("κόσμε", 11));
/* The first byte is NUL, so nothing is examined and the empty string is valid. */
zassert_true(utf8_is_valid("\x00", 1));
/* U+0080, the smallest code point that needs two bytes. */
zassert_true(utf8_is_valid("\xc2\x80", 2));
/* U+FFFF, the largest code point that fits in three bytes. */
zassert_true(utf8_is_valid("\xef\xbf\xbf", 3));
/* U+D7FF, the last code point before the surrogate range. */
zassert_true(utf8_is_valid("\xed\x9f\xbf", 3));
/*
 * U+D800 alone, then U+D800 U+DC00 as two 3-byte sequences.
 * NOTE(review): RFC 3629 forbids encoded surrogates, yet this test expects
 * them to be accepted - confirm that this leniency is intended.
 */
zassert_true(utf8_is_valid("\xed\xa0\x80", 3));
zassert_true(utf8_is_valid("\xed\xa0\x80\xed\xb0\x80", 6));
/* U+FFFE. */
zassert_true(utf8_is_valid("\xef\xbf\xbe", 3));
/* A continuation byte with no lead byte in front of it. */
zassert_false(utf8_is_valid("\x80", 1));
/* 0xC0 is below the smallest accepted 2-byte lead byte (0xC2). */
zassert_false(utf8_is_valid("\xc0", 1));
/* 0xFE and 0xFF are above the largest accepted lead byte (0xF4). */
zassert_false(utf8_is_valid("\xfe", 1));
zassert_false(utf8_is_valid("\xff", 1));
zassert_false(utf8_is_valid("\xfe\xfe\xff\xff", 4));
/* 0xC0 and 0xC1 lead bytes are rejected whatever byte follows them. */
zassert_false(utf8_is_valid("\xc0\x7f", 2));
zassert_false(utf8_is_valid("\xc0\xaf", 2));
zassert_false(utf8_is_valid("\xc1\xbf", 2));
zassert_false(utf8_is_valid("\xc0\x80", 2));
/* A NULL pointer is reported as invalid. */
zassert_false(utf8_is_valid(NULL, 1));
}

ZTEST(util, test_utf8_lcpy_not_truncated)
{
/* dest_str size is based on storing 3 * € plus the null terminator */
Expand Down

0 comments on commit 5e338ae

Please sign in to comment.