lib/utils/utf8.c

/*
 * Copyright (c) 2021 Nordic Semiconductor ASA
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include <stdint.h>
#include <string.h>
#include <zephyr/sys/__assert.h>
#include <errno.h>
#include <zephyr/sys/util_utf8.h>

/* Mask keeping the low 7 bits; a byte equal to its masked value is ASCII */
#define ASCII_CHAR 0x7F
/* Mask selecting the two most significant bits, which classify a byte:
 * 11xxxxxx is the lead byte of a multi-byte sequence, 10xxxxxx (MSB_SET)
 * is a continuation byte.
 */
#define SEQUENCE_FIRST_MASK 0xC0
/* Lead byte prefix of a 2-byte sequence: 110xxxxx */
#define SEQUENCE_LEN_2_BYTE 0xC0
/* Lead byte prefix of a 3-byte sequence: 1110xxxx */
#define SEQUENCE_LEN_3_BYTE 0xE0
/* Lead byte prefix of a 4-byte sequence: 11110xxx */
#define SEQUENCE_LEN_4_BYTE 0xF0
/* Most significant bit; set on every byte that is not plain ASCII */
#define MSB_SET 0x80

/*
 * Remove an incomplete multi-byte UTF-8 character from the end of a string.
 *
 * The string is modified in place by writing a NUL terminator; nothing is
 * ever appended. Only the tail of the string is inspected.
 *
 * - If the string is empty or ends with an ASCII byte it is left untouched.
 * - If it ends with a multi-byte sequence whose byte count matches what its
 *   lead byte announces, it is left untouched.
 * - If it ends with a lead byte followed by the wrong number of continuation
 *   bytes, the string is cut at that lead byte.
 * - If it ends with continuation bytes that have no lead byte in front of
 *   them, only those continuation bytes are removed; the characters before
 *   them are kept.
 *
 * utf8_str must point to a writable NUL-terminated string.
 * Returns utf8_str.
 */
char *utf8_trunc(char *utf8_str)
{
	const size_t len = strlen(utf8_str);

	if (len == 0U) {
		/* no-op */
		return utf8_str;
	}

	char *last_byte_p = utf8_str + len - 1U;

	if ((*last_byte_p & ASCII_CHAR) == *last_byte_p) {
		/* Ends with a single-byte character, nothing to trim */
		return utf8_str;
	}

	/* Step back over trailing continuation bytes (10xxxxxx) only.
	 * Stopping at the first byte that is not a continuation byte keeps
	 * the search from running across unrelated ASCII characters when the
	 * tail is malformed. The counter is a size_t so that arbitrarily long
	 * runs of continuation bytes cannot wrap it around.
	 */
	size_t seq_len = 1U; /* the byte we end up on counts as well */

	while ((*last_byte_p & SEQUENCE_FIRST_MASK) == MSB_SET &&
	       last_byte_p > utf8_str) {
		last_byte_p--;
		seq_len++;
	}

	const char seq_start_byte = *last_byte_p;

	if ((seq_start_byte & MSB_SET) == 0) {
		/* The continuation bytes follow an ASCII character and have
		 * no lead byte: keep that character, drop only the orphans.
		 */
		last_byte_p[1] = '\0';
		return utf8_str;
	}

	if ((seq_start_byte & SEQUENCE_FIRST_MASK) == SEQUENCE_FIRST_MASK) {
		/* Lead byte found: the sequence is complete only when the
		 * number of bytes present equals the length it announces.
		 */
		size_t expected_len;

		if ((seq_start_byte & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_4_BYTE) {
			expected_len = 4U;
		} else if ((seq_start_byte & SEQUENCE_LEN_3_BYTE) == SEQUENCE_LEN_3_BYTE) {
			expected_len = 3U;
		} else {
			expected_len = 2U;
		}

		if (seq_len == expected_len) {
			return utf8_str;
		}
	}

	/* Incomplete or malformed tail: terminate the string where it starts.
	 * This also covers a string made up of continuation bytes only, in
	 * which case the search stopped at the first byte of the string.
	 */
	*last_byte_p = '\0';

	return utf8_str;
}

/*
 * Bounded string copy that never leaves a partial UTF-8 character at the end.
 *
 * At most n - 1 bytes of src are copied into dst with strncpy() (which also
 * zero-fills the remainder of those n - 1 bytes when src is shorter), dst is
 * NUL-terminated at index n - 1, and utf8_trunc() is then applied to dst so
 * that a multi-byte character cut off by the size limit is removed.
 *
 * dst must have room for n bytes. When n is 0 nothing is written; when n is
 * 1 only the terminator is written.
 *
 * Returns dst.
 */
char *utf8_lcpy(char *dst, const char *src, size_t n)
{
	if (n == 0) {
		/* No room at all, not even for the terminator */
		return dst;
	}

	const size_t term_idx = n - 1;

	strncpy(dst, src, term_idx);
	dst[term_idx] = '\0';

	/* With room for the terminator only there is nothing to trim */
	if (term_idx != 0) {
		utf8_trunc(dst);
	}

	return dst;
}

/*
 * Count the characters (not bytes) in a NUL-terminated UTF-8 string.
 *
 * Each character is classified by its first byte as a 1-, 2-, 3- or 4-byte
 * sequence, and every following byte of the sequence must have the
 * continuation form 10xxxxxx.
 *
 * Returns the number of characters, or -EINVAL when a first byte does not
 * match any of the four forms or a continuation byte is missing or
 * malformed (this includes the string ending in the middle of a sequence).
 */
int utf8_count_chars(const char *s)
{
	const char *cur = s;
	int n_chars = 0;

	while (*cur != '\0') {
		const uint8_t lead = (uint8_t)*cur;
		size_t seq_len;

		/* Derive the sequence length from the lead byte pattern */
		if ((lead & MSB_SET) == 0) {
			seq_len = 1; /* 0xxxxxxx */
		} else if ((lead & SEQUENCE_LEN_3_BYTE) == SEQUENCE_FIRST_MASK) {
			seq_len = 2; /* 110xxxxx */
		} else if ((lead & SEQUENCE_LEN_4_BYTE) == SEQUENCE_LEN_3_BYTE) {
			seq_len = 3; /* 1110xxxx */
		} else if ((lead & 0xF8) == SEQUENCE_LEN_4_BYTE) {
			seq_len = 4; /* 11110xxx */
		} else {
			/* Not a valid first byte of a character */
			return -EINVAL;
		}

		/* Check the continuation bytes in order, bailing out on the
		 * first bad one. A NUL terminator is not a continuation byte,
		 * so the scan never reads past the end of the string.
		 */
		for (size_t i = 1; i < seq_len; i++) {
			if ((cur[i] & SEQUENCE_FIRST_MASK) != MSB_SET) {
				return -EINVAL;
			}
		}

		cur += seq_len;
		n_chars++;
	}

	return n_chars;
}