utils: Add utf8_is_valid function

Add the 'utf8_is_valid' function to check whether a given string is UTF-8 encoded. Signed-off-by: James Roy <rruuaanng@outlook.com>
zephyrproject-rtos · Nov 6, 2024 · f44015d · f44015d
1 parent 2f23313
commit f44015d
Show file tree

Hide file tree

Showing 3 changed files with 72 additions and 4 deletions.
diff --git a/include/zephyr/sys/util.h b/include/zephyr/sys/util.h
@@ -685,6 +685,16 @@ char *utf8_trunc(char *utf8_str);
  */
 char *utf8_lcpy(char *dst, const char *src, size_t n);
 
+/**
+ * @brief Checks if the given string @p str is UTF-8 encoded.
+ *
+ * @param str Target string
+ * @param maxlen Maximum number of bytes of @p str to check (stops early at a NUL terminator)
+ *
+ * @return true if @p str is UTF-8 encoded, or false otherwise.
+ */
+bool utf8_is_valid(const char *str, size_t maxlen);
+
 /*
  * Integer log2 helpers built on the compiler's count-leading-zeros builtins.
  * __z_log2d() uses the 32-bit builtin, __z_log2q() the 64-bit one, and
  * __z_log2() selects between them from sizeof(x).
  * NOTE(review): presumably x must be non-zero, since the clz builtins are
  * documented as undefined for 0 - confirm against callers.
  */
 #define __z_log2d(x) (32 - __builtin_clz(x) - 1)
 #define __z_log2q(x) (64 - __builtin_clzll(x) - 1)
 #define __z_log2(x) (sizeof(__typeof__(x)) > 4 ? __z_log2q(x) : __z_log2d(x))

diff --git a/lib/utils/utf8.c b/lib/utils/utf8.c
@@ -7,12 +7,17 @@
 #include <stdint.h>
 #include <string.h>
 #include <zephyr/sys/__assert.h>
+#include <zephyr/sys/util.h>
 
 #define ASCII_CHAR 0x7F /* largest value treated as a single-byte character */
 #define SEQUENCE_FIRST_MASK 0xC0
 #define SEQUENCE_LEN_2_BYTE 0xC0
-#define SEQUENCE_LEN_3_BYTE 0xE0
-#define SEQUENCE_LEN_4_BYTE 0xF0
+#define SEQUENCE_MIN_LEN_2_BYTE (SEQUENCE_LEN_2_BYTE + 2) /* 0xC2: lowest 2-byte lead byte */
+#define SEQUENCE_MAX_LEN_2_BYTE 0xDF /* highest 2-byte lead byte */
+#define SEQUENCE_MIN_LEN_3_BYTE 0xE0 /* lowest 3-byte lead byte */
+#define SEQUENCE_MAX_LEN_3_BYTE 0xEF /* highest 3-byte lead byte */
+#define SEQUENCE_MIN_LEN_4_BYTE 0xF0 /* lowest 4-byte lead byte */
+#define SEQUENCE_MAX_LEN_4_BYTE 0xF4 /* highest 4-byte lead byte accepted */
 
 char *utf8_trunc(char *utf8_str)
 {
@@ -46,11 +51,11 @@ char *utf8_trunc(char *utf8_str)
 	 * matches the number of bytes we searched for the starting byte
 	 */
 	seq_start_byte = *last_byte_p;
-	if ((seq_start_byte & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_4_BYTE) {
+	if ((seq_start_byte & SEQUENCE_MIN_LEN_4_BYTE) == SEQUENCE_MIN_LEN_4_BYTE) {
 		if (bytes_truncated == 4) {
 			return utf8_str;
 		}
-	} else if ((seq_start_byte & SEQUENCE_LEN_3_BYTE) == SEQUENCE_LEN_3_BYTE) {
+	} else if ((seq_start_byte & SEQUENCE_MIN_LEN_3_BYTE) == SEQUENCE_MIN_LEN_3_BYTE) {
 		if (bytes_truncated == 3) {
 			return utf8_str;
 		}
@@ -79,3 +84,34 @@ char *utf8_lcpy(char *dst, const char *src, size_t n)
 
 	return dst;
 }
+
+/*
+ * Check that the first strnlen(str, maxlen) bytes of str form a sequence of
+ * complete UTF-8 characters.
+ *
+ * A character is accepted when its lead byte is ASCII (<= 0x7F) or lies in one
+ * of the 2-, 3- or 4-byte lead ranges (0xC2-0xDF, 0xE0-0xEF, 0xF0-0xF4), the
+ * whole sequence fits inside the checked length, and every trailing byte has
+ * the continuation form 10xxxxxx.
+ *
+ * Lead-byte specific limits on the second byte are not enforced, so encoded
+ * surrogates (0xED 0xA0..0xBF), overlong 3-/4-byte forms and 0xF4 0x90.. are
+ * still reported as valid.
+ *
+ * Returns true when every character passes, including for an empty string,
+ * and false otherwise.
+ */
+bool utf8_is_valid(const char *str, size_t maxlen)
+{
+	/*
+	 * Inspect the data as unsigned bytes: plain char may be signed, in which
+	 * case bytes >= 0x80 are negative and would never match the lead ranges.
+	 */
+	const unsigned char *bytes = (const unsigned char *)str;
+	size_t len = strnlen(str, maxlen);
+	size_t i = 0;
+
+	while (i < len) {
+		unsigned char lead = bytes[i];
+		size_t nbyte;
+		size_t k;
+
+		if (lead <= ASCII_CHAR) {
+			i++;
+			continue;
+		}
+
+		if (lead >= SEQUENCE_MIN_LEN_2_BYTE && lead <= SEQUENCE_MAX_LEN_2_BYTE) {
+			nbyte = 2;
+		} else if (lead >= SEQUENCE_MIN_LEN_3_BYTE && lead <= SEQUENCE_MAX_LEN_3_BYTE) {
+			nbyte = 3;
+		} else if (lead >= SEQUENCE_MIN_LEN_4_BYTE && lead <= SEQUENCE_MAX_LEN_4_BYTE) {
+			nbyte = 4;
+		} else {
+			/* Stray continuation byte, overlong 0xC0/0xC1 or lead above 0xF4 */
+			return false;
+		}
+
+		/* The sequence must not run past the checked length */
+		if (nbyte > len - i) {
+			return false;
+		}
+
+		/* Every byte after the lead must look like 10xxxxxx */
+		for (k = 1; k < nbyte; k++) {
+			if ((bytes[i + k] & SEQUENCE_FIRST_MASK) != 0x80) {
+				return false;
+			}
+		}
+
+		i += nbyte;
+	}
+
+	return true;
+}
diff --git a/tests/unit/util/main.c b/tests/unit/util/main.c
@@ -867,6 +867,28 @@ ZTEST(util, test_utf8_lcpy_truncated)
 	zassert_str_equal(dest_str, expected_result, "Failed to copy");
 }
 
+ZTEST(util, test_utf8_is_valid)
+{
+	/* A byte string together with the maxlen handed to utf8_is_valid() */
+	struct utf8_case {
+		const char *bytes;
+		size_t maxlen;
+	};
+
+	/* Inputs the checker is expected to accept */
+	static const struct utf8_case accepted[] = {
+		{"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", 11}, /* mixed 2-/3-byte text */
+		{"\x00", 1},                     /* NUL only: empty string */
+		{"\xc2\x80", 2},                 /* U+0080 */
+		{"\xef\xbf\xbf", 3},             /* U+FFFF */
+		{"\xed\x9f\xbf", 3},             /* U+D7FF */
+		{"\xed\xa0\x80", 3},             /* encoded U+D800 */
+		{"\xed\xa0\x80\xed\xb0\x80", 6}, /* encoded U+D800 U+DC00 */
+		{"\xef\xbf\xbe", 3},             /* U+FFFE */
+	};
+
+	/* Inputs the checker is expected to reject */
+	static const struct utf8_case rejected[] = {
+		{"\x80", 1},             /* lone continuation byte */
+		{"\xc0", 1},             /* 0xC0 is never a valid lead byte */
+		{"\xfe", 1},             /* 0xFE is never a valid lead byte */
+		{"\xff", 1},             /* 0xFF is never a valid lead byte */
+		{"\xfe\xfe\xff\xff", 4}, /* only invalid lead bytes */
+		{"\xc0\x7f", 2},         /* 0xC0 lead */
+		{"\xc0\xaf", 2},         /* overlong '/' */
+		{"\xc1\xbf", 2},         /* overlong U+007F */
+		{"\xc0\x80", 2},         /* overlong NUL */
+	};
+	size_t idx;
+
+	for (idx = 0; idx < sizeof(accepted) / sizeof(accepted[0]); idx++) {
+		zassert_true(utf8_is_valid(accepted[idx].bytes, accepted[idx].maxlen));
+	}
+
+	for (idx = 0; idx < sizeof(rejected) / sizeof(rejected[0]); idx++) {
+		zassert_false(utf8_is_valid(rejected[idx].bytes, rejected[idx].maxlen));
+	}
+}
+
 ZTEST(util, test_utf8_lcpy_not_truncated)
 {
 	/* dest_str size is based on storing 3 * € plus the null terminator  */