1 /* 2 * Copyright (c) 2021 Nordic Semiconductor ASA 3 * 4 * SPDX-License-Identifier: Apache-2.0 5 */ 6 7 #include <stdint.h> 8 #include <string.h> 9 #include <zephyr/sys/__assert.h> 10 #include <errno.h> 11 #include <zephyr/sys/util_utf8.h> 12 13 #define ASCII_CHAR 0x7F 14 #define SEQUENCE_FIRST_MASK 0xC0 15 #define SEQUENCE_LEN_2_BYTE 0xC0 16 #define SEQUENCE_LEN_3_BYTE 0xE0 17 #define SEQUENCE_LEN_4_BYTE 0xF0 18 #define MSB_SET 0x80 19 utf8_trunc(char * utf8_str)20char *utf8_trunc(char *utf8_str) 21 { 22 const size_t len = strlen(utf8_str); 23 24 if (len == 0U) { 25 /* no-op */ 26 return utf8_str; 27 } 28 29 char *last_byte_p = utf8_str + len - 1U; 30 uint8_t bytes_truncated; 31 char seq_start_byte; 32 33 if ((*last_byte_p & ASCII_CHAR) == *last_byte_p) { 34 /* Not part of an UTF8 sequence, return */ 35 return utf8_str; 36 } 37 38 /* Find the starting byte and NULL-terminate other bytes */ 39 bytes_truncated = 0; 40 while ((*last_byte_p & SEQUENCE_FIRST_MASK) != SEQUENCE_FIRST_MASK && 41 last_byte_p > utf8_str) { 42 last_byte_p--; 43 bytes_truncated++; 44 } 45 bytes_truncated++; /* include the starting byte */ 46 47 /* Verify if the last character actually need to be truncated 48 * Handles the case where the number of bytes in the last UTF8-char 49 * matches the number of bytes we searched for the starting byte 50 */ 51 seq_start_byte = *last_byte_p; 52 if ((seq_start_byte & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_4_BYTE) { 53 if (bytes_truncated == 4) { 54 return utf8_str; 55 } 56 } else if ((seq_start_byte & SEQUENCE_LEN_3_BYTE) == SEQUENCE_LEN_3_BYTE) { 57 if (bytes_truncated == 3) { 58 return utf8_str; 59 } 60 } else if ((seq_start_byte & SEQUENCE_LEN_2_BYTE) == SEQUENCE_LEN_2_BYTE) { 61 if (bytes_truncated == 2) { 62 return utf8_str; 63 } 64 } 65 66 /* NULL-terminate the unterminated starting byte */ 67 *last_byte_p = '\0'; 68 69 return utf8_str; 70 } 71 utf8_lcpy(char * dst,const char * src,size_t n)72char *utf8_lcpy(char *dst, const char *src, size_t n) 73 { 74 if (n > 0) { 75 strncpy(dst, src, n - 1); 76 dst[n - 1] = '\0'; 77 78 if (n != 1) { 79 utf8_trunc(dst); 80 } 81 } 82 83 return dst; 84 } 85 utf8_count_chars(const char * s)86int utf8_count_chars(const char *s) 87 { 88 int count = 0; 89 const char *p = s; /* getting a pointer to increment */ 90 91 while (*p != '\0') { 92 if ((*p & MSB_SET) == 0) { /* 1-byte character: 0xxxxxxx */ 93 p += 1; 94 } else if ((*p & SEQUENCE_LEN_3_BYTE) == SEQUENCE_FIRST_MASK) { 95 /* 2-byte character: 110xxxxx */ 96 if ((p[1] & SEQUENCE_FIRST_MASK) != MSB_SET) { 97 /* invalid continuation */ 98 return -EINVAL; 99 } 100 p += 2; 101 } else if ((*p & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_3_BYTE) { 102 /* 3-byte character: 1110xxxx */ 103 if ((p[1] & SEQUENCE_FIRST_MASK) != MSB_SET 104 || (p[2] & SEQUENCE_FIRST_MASK) != MSB_SET) { 105 /* invalid continuation */ 106 return -EINVAL; 107 } 108 p += 3; 109 } else if ((*p & 0xF8) == SEQUENCE_LEN_4_BYTE) { 110 /* 4-byte character: 11110xxx */ 111 if ((p[1] & SEQUENCE_FIRST_MASK) != MSB_SET 112 || (p[2] & SEQUENCE_FIRST_MASK) != MSB_SET 113 || (p[3] & SEQUENCE_FIRST_MASK) != MSB_SET) { 114 /* invalid continuation */ 115 return -EINVAL; 116 } 117 p += 4; 118 } else { 119 /* Invalid UTF-8 byte (return) */ 120 return -EINVAL; 121 } 122 count++; 123 } 124 125 return count; 126 } 127