1 /*
2  * Copyright (c) 2021 Nordic Semiconductor ASA
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  */
6 
7 #include <stdint.h>
8 #include <string.h>
9 #include <zephyr/sys/__assert.h>
10 #include <errno.h>
11 #include <zephyr/sys/util_utf8.h>
12 
/* Low seven bits; a byte that is unchanged by this mask is a 1-byte (ASCII) character */
#define ASCII_CHAR 0x7F
/* Top two bits of a byte: both set marks a sequence lead byte, 10 marks a continuation byte */
#define SEQUENCE_FIRST_MASK 0xC0
/* Lead-byte bit patterns for 2-, 3- and 4-byte sequences (110x xxxx, 1110 xxxx, 1111 0xxx) */
#define SEQUENCE_LEN_2_BYTE 0xC0
#define SEQUENCE_LEN_3_BYTE 0xE0
#define SEQUENCE_LEN_4_BYTE 0xF0
/* Most significant bit; clear for 1-byte characters, and the value of a masked continuation byte */
#define MSB_SET 0x80
19 
utf8_trunc(char * utf8_str)20 char *utf8_trunc(char *utf8_str)
21 {
22 	const size_t len = strlen(utf8_str);
23 
24 	if (len == 0U) {
25 		/* no-op */
26 		return utf8_str;
27 	}
28 
29 	char *last_byte_p = utf8_str + len - 1U;
30 	uint8_t bytes_truncated;
31 	char seq_start_byte;
32 
33 	if ((*last_byte_p & ASCII_CHAR) == *last_byte_p) {
34 		/* Not part of an UTF8 sequence, return */
35 		return utf8_str;
36 	}
37 
38 	/* Find the starting byte and NULL-terminate other bytes */
39 	bytes_truncated = 0;
40 	while ((*last_byte_p & SEQUENCE_FIRST_MASK) != SEQUENCE_FIRST_MASK &&
41 	       last_byte_p > utf8_str) {
42 		last_byte_p--;
43 		bytes_truncated++;
44 	}
45 	bytes_truncated++; /* include the starting byte */
46 
47 	/* Verify if the last character actually need to be truncated
48 	 * Handles the case where the number of bytes in the last UTF8-char
49 	 * matches the number of bytes we searched for the starting byte
50 	 */
51 	seq_start_byte = *last_byte_p;
52 	if ((seq_start_byte & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_4_BYTE) {
53 		if (bytes_truncated == 4) {
54 			return utf8_str;
55 		}
56 	} else if ((seq_start_byte & SEQUENCE_LEN_3_BYTE) == SEQUENCE_LEN_3_BYTE) {
57 		if (bytes_truncated == 3) {
58 			return utf8_str;
59 		}
60 	} else if ((seq_start_byte & SEQUENCE_LEN_2_BYTE) == SEQUENCE_LEN_2_BYTE) {
61 		if (bytes_truncated == 2) {
62 			return utf8_str;
63 		}
64 	}
65 
66 	/* NULL-terminate the unterminated starting byte */
67 	*last_byte_p = '\0';
68 
69 	return utf8_str;
70 }
71 
/*
 * Copy a UTF-8 string into a buffer of n bytes without leaving a partial
 * multi-byte character at the end.
 *
 * At most n - 1 bytes are copied from src with strncpy(), dst[n - 1] is
 * always set to '\0', and the result is then passed to utf8_trunc() to drop
 * an incomplete trailing sequence. When n is 0 nothing is written.
 *
 * dst: destination buffer of at least n bytes.
 * src: NUL-terminated source string.
 * n:   size of dst in bytes, including room for the terminator.
 *
 * Returns dst.
 */
char *utf8_lcpy(char *dst, const char *src, size_t n)
{
	if (n == 0) {
		/* No room even for the terminator: leave dst alone */
		return dst;
	}

	const size_t last = n - 1;

	strncpy(dst, src, last);
	dst[last] = '\0';

	/* With a one-byte buffer the result is already the empty string */
	if (last != 0) {
		utf8_trunc(dst);
	}

	return dst;
}
85 
utf8_count_chars(const char * s)86 int utf8_count_chars(const char *s)
87 {
88 	int count = 0;
89 	const char *p = s; /* getting a pointer to increment */
90 
91 	while (*p != '\0') {
92 		if ((*p & MSB_SET) == 0) { /* 1-byte character: 0xxxxxxx */
93 			p += 1;
94 		} else if ((*p & SEQUENCE_LEN_3_BYTE) == SEQUENCE_FIRST_MASK) {
95 			/* 2-byte character: 110xxxxx */
96 			if ((p[1] & SEQUENCE_FIRST_MASK) != MSB_SET) {
97 				/* invalid continuation */
98 				return -EINVAL;
99 			}
100 			p += 2;
101 		} else if ((*p & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_3_BYTE) {
102 			/* 3-byte character: 1110xxxx */
103 			if ((p[1] & SEQUENCE_FIRST_MASK) != MSB_SET
104 				|| (p[2] & SEQUENCE_FIRST_MASK) != MSB_SET) {
105 				/* invalid continuation */
106 				return -EINVAL;
107 			}
108 			p += 3;
109 		} else if ((*p & 0xF8) == SEQUENCE_LEN_4_BYTE) {
110 			/* 4-byte character: 11110xxx */
111 			if ((p[1] & SEQUENCE_FIRST_MASK) != MSB_SET
112 				|| (p[2] & SEQUENCE_FIRST_MASK) != MSB_SET
113 				|| (p[3] & SEQUENCE_FIRST_MASK) != MSB_SET) {
114 				/* invalid continuation */
115 				return -EINVAL;
116 			}
117 			p += 4;
118 		} else {
119 			/* Invalid UTF-8 byte (return) */
120 			return -EINVAL;
121 		}
122 		count++;
123 	}
124 
125 	return count;
126 }
127