core: return number of bytes for UTF-8 char in function utf8_int_string

2026-06-30 06:46:38 +02:00 · 2022-12-17 20:25:07 +01:00
parent 6aedddd351
commit eb6cc0bc2a
4 changed files with 27 additions and 12 deletions
@@ -1073,10 +1073,9 @@ string_convert_escaped_chars (const char *string)
                        {
                            value = (value * 16) + HEX2DEC(ptr_string[i + 1]);
                        }
-                        utf8_int_string (value, utf_char);
+                        length = utf8_int_string (value, utf_char);
                        if (utf_char[0])
                        {
-                            length = strlen (utf_char);
                            memcpy (output + pos_output, utf_char, length);
                            pos_output += length;
                        }
@@ -331,21 +331,32 @@ utf8_char_int (const char *string)
 *
 * In case of error (if unicode value is > 0x1FFFFF), the string is set to an
 * empty string (string[0] == '\0').
+ *
+ * Returns the number of bytes in the UTF-8 char (not counting the final '\0'), or 0 if string is NULL, the unicode value is 0 or it is > 0x1FFFFF.
 */

-void
+int
 utf8_int_string (unsigned int unicode_value, char *string)
 {
+    int num_bytes;
+
+    num_bytes = 0;
+
    if (!string)
-        return;
+        return num_bytes;

    string[0] = '\0';

-    if (unicode_value <= 0x007F)
+    if (unicode_value == 0)
+    {
+        /* NUL char */
+    }
+    else if (unicode_value <= 0x007F)
    {
        /* UTF-8, 1 byte: 0vvvvvvv */
        string[0] = unicode_value;
        string[1] = '\0';
+        num_bytes = 1;
    }
    else if (unicode_value <= 0x07FF)
    {
@@ -353,6 +364,7 @@ utf8_int_string (unsigned int unicode_value, char *string)
        string[0] = 0xC0 | ((unicode_value >> 6) & 0x1F);
        string[1] = 0x80 | (unicode_value & 0x3F);
        string[2] = '\0';
+        num_bytes = 2;
    }
    else if (unicode_value <= 0xFFFF)
    {
@@ -361,6 +373,7 @@ utf8_int_string (unsigned int unicode_value, char *string)
        string[1] = 0x80 | ((unicode_value >> 6) & 0x3F);
        string[2] = 0x80 | (unicode_value & 0x3F);
        string[3] = '\0';
+        num_bytes = 3;
    }
    else if (unicode_value <= 0x1FFFFF)
    {
@@ -370,7 +383,10 @@ utf8_int_string (unsigned int unicode_value, char *string)
        string[2] = 0x80 | ((unicode_value >> 6) & 0x3F);
        string[3] = 0x80 | (unicode_value & 0x3F);
        string[4] = '\0';
+        num_bytes = 4;
    }
+
+    return num_bytes;
 }

 /*
@@ -36,7 +36,7 @@ extern const char *utf8_prev_char (const char *string_start,
                                   const char *string);
 extern const char *utf8_next_char (const char *string);
 extern int utf8_char_int (const char *string);
-extern void utf8_int_string (unsigned int unicode_value, char *string);
+extern int utf8_int_string (unsigned int unicode_value, char *string);
 extern wint_t utf8_wide_char (const char *string);
 extern int utf8_char_size (const char *string);
 extern int utf8_strlen (const char *string);
@@ -458,16 +458,16 @@ TEST(CoreUtf8, Convert)
    LONGS_EQUAL(0x92d, utf8_char_int (utf8_4bytes_truncated_3));

    /* convert unicode char to a string */
-    utf8_int_string (0, NULL);
-    utf8_int_string (0, result);
+    LONGS_EQUAL(0, utf8_int_string (0, NULL));
+    LONGS_EQUAL(0, utf8_int_string (0, result));
    STRCMP_EQUAL("", result);
-    utf8_int_string (235, result);
+    LONGS_EQUAL(2, utf8_int_string (L'ë', result));
    STRCMP_EQUAL("ë", result);
-    utf8_int_string (0x20ac, result);
+    LONGS_EQUAL(3, utf8_int_string (L'€', result));
    STRCMP_EQUAL("€", result);
-    utf8_int_string (0x2ee9, result);
+    LONGS_EQUAL(3, utf8_int_string (0x2ee9, result));
    STRCMP_EQUAL(UNICODE_CJK_YELLOW, result);
-    utf8_int_string (0x24b62, result);
+    LONGS_EQUAL(4, utf8_int_string (0x24b62, result));
    STRCMP_EQUAL(UNICODE_HAN_CHAR, result);

    /* get wide char */