From 1d3d7336ed6681747e7b5e1ddcff580147c686c8 Mon Sep 17 00:00:00 2001 From: Christian Hergert Date: Mon, 30 Sep 2024 11:28:21 -0700 Subject: [PATCH 1/5] glib/utf8: Use SIMD for UTF-8 validation This is based on the https://github.com/c-util/c-utf8 project and has been adapted for portability and integration into GLib. c-utf8 is dual licensed Apache-2.0 and LGPLv2.1+, the latter matching GLib. Notably, `case 0x01 ... 0x7F:` style switch/case labels have been converted to if/else which is more portable to non-GCC/Clang platforms while generating the same assembly, at least on x86_64 with GCC. Additionally, `__attribute__((aligned(n)))` is used in favor of `__builtin_assume_aligned(n)` because it is more portable to MSVC's `__declspec(align(n))` and also generates the same assembly as GCC's `__builtin_assume_aligned(n)`. For GCC x86_64 Linux on a Xeon 4214 this improved the throughput of g_utf8_validate() for ASCII from 750MB/s to around 10,000MB/s (13x). On GCC aarch64 Linux with an Apple Silicon M2 Pro we go from about 2,200 MB/s to 26,700 MB/s (12x). Closes: #3481 --- glib/gutf8.c | 388 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 234 insertions(+), 154 deletions(-) diff --git a/glib/gutf8.c b/glib/gutf8.c index dd9966df52..e52e1dc61b 100644 --- a/glib/gutf8.c +++ b/glib/gutf8.c @@ -1,7 +1,8 @@ /* gutf8.c - Operations on UTF-8 strings. * * Copyright (C) 1999 Tom Tromey - * Copyright (C) 2000 Red Hat, Inc. + * Copyright (C) 2000, 2015-2022 Red Hat, Inc. + * Copyright (C) 2022-2023 David Rheinsberg * * SPDX-License-Identifier: LGPL-2.1-or-later * @@ -1565,166 +1566,255 @@ g_ucs4_to_utf16 (const gunichar *str, return result; } -#define VALIDATE_BYTE(mask, expect) \ - G_STMT_START { \ - if (G_UNLIKELY((*(guchar *)p & (mask)) != (expect))) \ - goto error; \ - } G_STMT_END +/* SIMD-based UTF-8 validation originates in the c-utf8 project from + * https://github.com/c-util/c-utf8/ from the following authors: + * + * David Rheinsberg + * Evgeny Vereshchagin + * Jan Engelhardt + * Tom Gundersen + * + * It has been adapted for portability and integration. + * The original code is dual-licensed Apache-2.0 or LGPLv2.1+ + */ -/* see IETF RFC 3629 Section 4 */ +#define align_to(_val, _to) (((_val) + (_to) - 1) & ~((_to) - 1)) + +static inline guint8 +load_u8 (gconstpointer memory, + gsize offset) +{ + return ((const guint8 *)memory)[offset]; +} -static const gchar * -fast_validate (const char *str) +#if G_GNUC_CHECK_VERSION(4,8) || defined(__clang__) +# define _attribute_aligned(n) __attribute__((aligned(n))) +#elif defined(_MSC_VER) +# define _attribute_aligned(n) __declspec(align(n)) +#else +# define _attribute_aligned(n) +#endif +static inline gsize +load_word (gconstpointer memory, + gsize offset) { - const gchar *p; +#if GLIB_SIZEOF_VOID_P == 8 + _attribute_aligned(8) const guint8 *m = ((const guint8 *)memory) + offset; + + return ((guint64)m[0] << 0) | ((guint64)m[1] << 8) | + ((guint64)m[2] << 16) | ((guint64)m[3] << 24) | + ((guint64)m[4] << 32) | ((guint64)m[5] << 40) | + ((guint64)m[6] << 48) | ((guint64)m[7] << 56); +#else + _attribute_aligned(4) const guint8 *m = ((const guint8 *)memory) + offset; + + return ((guint)m[0] << 0) | ((guint)m[1] << 8) | + ((guint)m[2] << 16) | ((guint)m[3] << 24); +#endif +} - for (p = str; *p; p++) +/* The following constants are truncated on 32-bit machines */ +#define UTF8_ASCII_MASK ((gsize)0x8080808080808080L) +#define UTF8_ASCII_SUB ((gsize)0x0101010101010101L) + +static inline int +utf8_word_is_ascii (gsize word) +{ + /* True unless any byte is NULL or has the MSB set. */ + return ((((word - UTF8_ASCII_SUB) | word) & UTF8_ASCII_MASK) == 0); +} + +static void +utf8_verify_ascii (const char **strp, + gsize *lenp) +{ + const char *str = *strp; + gsize len = lenp ? *lenp : (gsize)-1; + + while (len > 0 && load_u8 (str, 0) < 128) { - if (*(guchar *)p < 128) - /* done */; - else - { - const gchar *last; + if ((gpointer) align_to ((guintptr) str, sizeof (gsize)) == str) + { + while (len >= 2 * sizeof (gsize)) + { + if (!utf8_word_is_ascii (load_word (str, 0)) || + !utf8_word_is_ascii (load_word (str, sizeof (gsize)))) + break; - last = p; - if (*(guchar *)p < 0xe0) /* 110xxxxx */ - { - if (G_UNLIKELY (*(guchar *)p < 0xc2)) - goto error; - } - else - { - if (*(guchar *)p < 0xf0) /* 1110xxxx */ - { - switch (*(guchar *)p++ & 0x0f) - { - case 0: - VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */ - break; - case 0x0d: - VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */ - break; - default: - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - } - } - else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */ - { - switch (*(guchar *)p++ & 0x07) - { - case 0: - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - if (G_UNLIKELY((*(guchar *)p & 0x30) == 0)) - goto error; - break; - case 4: - VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */ - break; - default: - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - } - p++; - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - } - else - goto error; - } + str += 2 * sizeof(gsize); + len -= 2 * sizeof(gsize); + } - p++; - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ + while (len > 0 && load_u8 (str, 0) < 128) + { + if G_UNLIKELY (load_u8 (str, 0) == 0x00) + goto out; - continue; + ++str; + --len; + } + } + else + { + if G_UNLIKELY (load_u8 (str, 0) == 0x00) + goto out; - error: - return last; - } + ++str; + --len; + } } - return p; +out: + *strp = str; + + if (lenp) + *lenp = len; } -static const gchar * -fast_validate_len (const char *str, - gssize max_len) +#define UTF8_CHAR_IS_TAIL(_x) (((_x) & 0xC0) == 0x80) +static void +utf8_verify (const char **strp, + gsize *lenp) { - const gchar *p; + const char *str = *strp; + gsize len = lenp ? *lenp : (gsize)-1; - g_assert (max_len >= 0); + /* See Unicode 10.0.0, Chapter 3, Section D92 */ - for (p = str; ((p - str) < max_len) && *p; p++) + while (len > 0) { - if (*(guchar *)p < 128) - /* done */; - else - { - const gchar *last; + guint8 b = load_u8 (str, 0); - last = p; - if (*(guchar *)p < 0xe0) /* 110xxxxx */ - { - if (G_UNLIKELY (max_len - (p - str) < 2)) - goto error; - - if (G_UNLIKELY (*(guchar *)p < 0xc2)) - goto error; - } - else - { - if (*(guchar *)p < 0xf0) /* 1110xxxx */ - { - if (G_UNLIKELY (max_len - (p - str) < 3)) - goto error; - - switch (*(guchar *)p++ & 0x0f) - { - case 0: - VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */ - break; - case 0x0d: - VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */ - break; - default: - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - } - } - else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */ - { - if (G_UNLIKELY (max_len - (p - str) < 4)) - goto error; - - switch (*(guchar *)p++ & 0x07) - { - case 0: - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - if (G_UNLIKELY((*(guchar *)p & 0x30) == 0)) - goto error; - break; - case 4: - VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */ - break; - default: - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - } - p++; - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ - } - else - goto error; - } + if (b == 0x00) + goto out; + + else if (b >= 0x01 && b <= 0x7F) + { + /* + * Special-case and optimize the ASCII case. + */ + utf8_verify_ascii ((const char **)&str, &len); + } - p++; - VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ + else if (b >= 0xC2 && b <= 0xDF) + { + if G_UNLIKELY (len < 2) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1))) + goto out; - continue; + str += 2; + len -= 2; - error: - return last; - } + } + + else if (b == 0xE0) + { + if G_UNLIKELY (len < 3) + goto out; + if G_UNLIKELY (load_u8 (str, 1) < 0xA0 || load_u8 (str, 1) > 0xBF) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2))) + goto out; + + str += 3; + len -= 3; + } + + else if (b >= 0xE1 && b <= 0xEC) + { + if G_UNLIKELY (len < 3) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1))) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2))) + goto out; + + str += 3; + len -= 3; + } + + else if (b == 0xED) + { + if G_UNLIKELY (len < 3) + goto out; + if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x9F) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2))) + goto out; + + str += 3; + len -= 3; + } + + else if (b >= 0xEE && b <= 0xEF) + { + if G_UNLIKELY (len < 3) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1))) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2))) + goto out; + + str += 3; + len -= 3; + } + + else if (b == 0xF0) + { + if G_UNLIKELY (len < 4) + goto out; + if G_UNLIKELY (load_u8 (str, 1) < 0x90 || load_u8 (str, 1) > 0xBF) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2))) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3))) + goto out; + + str += 4; + len -= 4; + } + + else if (b >= 0xF1 && b <= 0xF3) + { + if G_UNLIKELY (len < 4) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1))) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2))) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3))) + goto out; + + str += 4; + len -= 4; + } + + else if (b == 0xF4) + { + if G_UNLIKELY (len < 4) + goto out; + if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x8F) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2))) + goto out; + if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3))) + goto out; + + str += 4; + len -= 4; + } + + else goto out; } - return p; +out: + *strp = str; + + if (lenp) + *lenp = len; } /** @@ -1757,20 +1847,15 @@ g_utf8_validate (const char *str, const gchar **end) { - const gchar *p; - if (max_len >= 0) return g_utf8_validate_len (str, max_len, end); - p = fast_validate (str); + utf8_verify (&str, NULL); - if (end) - *end = p; + if (end != NULL) + *end = str; - if (*p != '\0') - return FALSE; - else - return TRUE; + return *str == 0; } /** @@ -1793,17 +1878,12 @@ g_utf8_validate_len (const char *str, const gchar **end) { - const gchar *p; - - p = fast_validate_len (str, max_len); + utf8_verify (&str, &max_len); - if (end) - *end = p; + if (end != NULL) + *end = str; - if (p != str + max_len) - return FALSE; - else - return TRUE; + return max_len == 0; } /** -- GitLab From b72650542c9285dcbb00f2a4bf6a226bdf758014 Mon Sep 17 00:00:00 2001 From: Christian Hergert Date: Tue, 1 Oct 2024 15:30:05 -0700 Subject: [PATCH 2/5] glib/utf8: Use SIMD for g_str_is_ascii() This moves g_str_is_ascii() from gstrfuncs.c to gutf8.c so that we can reuse the same SIMD code for ASCII validation. On Apple Silicon: Before: 3297 MB/s After: 26146 MB/s --- glib/gstrfuncs.c | 23 ----------------------- glib/gutf8.c | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/glib/gstrfuncs.c b/glib/gstrfuncs.c index 33faa80c27..f2c80e4e9c 100644 --- a/glib/gstrfuncs.c +++ b/glib/gstrfuncs.c @@ -1604,29 +1604,6 @@ g_ascii_strup (const gchar *str, return result; } -/** - * g_str_is_ascii: - * @str: a string - * - * Determines if a string is pure ASCII. A string is pure ASCII if it - * contains no bytes with the high bit set. - * - * Returns: true if @str is ASCII - * - * Since: 2.40 - */ -gboolean -g_str_is_ascii (const gchar *str) -{ - gsize i; - - for (i = 0; str[i]; i++) - if (str[i] & 0x80) - return FALSE; - - return TRUE; -} - /** * g_strdown: * @string: the string to convert diff --git a/glib/gutf8.c b/glib/gutf8.c index e52e1dc61b..2e43dc0444 100644 --- a/glib/gutf8.c +++ b/glib/gutf8.c @@ -1886,6 +1886,25 @@ g_utf8_validate_len (const char *str, return max_len == 0; } +/** + * g_str_is_ascii: + * @str: a string + * + * Determines if a string is pure ASCII. A string is pure ASCII if it + * contains no bytes with the high bit set. + * + * Returns: true if @str is ASCII + * + * Since: 2.40 + */ +gboolean +g_str_is_ascii (const gchar *str) +{ + utf8_verify_ascii (&str, NULL); + + return *str == 0; +} + /** * g_unichar_validate: * @ch: a Unicode character -- GitLab From e570263483791b4d093084df2c8a3a1af6482daf Mon Sep 17 00:00:00 2001 From: Philip Withnall Date: Thu, 3 Oct 2024 14:31:02 +0100 Subject: [PATCH 3/5] tests: Add basic unit tests for g_str_is_ascii() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turns out it’s not actually been explicitly tested before, even though it has full code coverage through being called by other code which is tested. Signed-off-by: Philip Withnall --- glib/tests/strfuncs.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/glib/tests/strfuncs.c b/glib/tests/strfuncs.c index 5d4dfa3aa4..05f08c0ac9 100644 --- a/glib/tests/strfuncs.c +++ b/glib/tests/strfuncs.c @@ -2719,6 +2719,27 @@ test_set_str (void) g_free (str); } +static void +test_str_is_ascii (void) +{ + const char *ascii_strings[] = { + "", + "hello", + "is it me you're looking for", + }; + const char *non_ascii_strings[] = { + "is it me you’re looking for", + "áccents", + "☺️", + }; + + for (size_t i = 0; i < G_N_ELEMENTS (ascii_strings); i++) + g_assert_true (g_str_is_ascii (ascii_strings[i])); + + for (size_t i = 0; i < G_N_ELEMENTS (non_ascii_strings); i++) + g_assert_false (g_str_is_ascii (non_ascii_strings[i])); +} + int main (int argc, char *argv[]) @@ -2775,6 +2796,7 @@ main (int argc, g_test_add_func ("/strfuncs/test-is-to-digit", test_is_to_digit); g_test_add_func ("/strfuncs/transliteration", test_transliteration); g_test_add_func ("/strfuncs/str-equal", test_str_equal); + g_test_add_func ("/strfuncs/str-is-ascii", test_str_is_ascii); return g_test_run(); } -- GitLab From 36e4bb98723f9997547aa479dbcbd08ea065b847 Mon Sep 17 00:00:00 2001 From: Philip Withnall Date: Thu, 3 Oct 2024 15:43:35 +0100 Subject: [PATCH 4/5] tests: Add some more UTF-8 validation corner cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The move to c-utf8 for validation has exposed a few new branches where our existing (fairly comprehensive) UTF-8 validation test suite didn’t check things. Add unit tests for those branches, so we keep code coverage. I’ve validated (with an independent UTF-8 decoder) that the test vectors are correctly marked as valid/invalid in the test data (so the tests aren’t just blindly coded to match the behaviour of the new validator code). Signed-off-by: Philip Withnall Helps: #3481 --- glib/tests/utf8-validate.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/glib/tests/utf8-validate.c b/glib/tests/utf8-validate.c index 6c230452c6..b77f0e18da 100644 --- a/glib/tests/utf8-validate.c +++ b/glib/tests/utf8-validate.c @@ -81,8 +81,9 @@ static Test global_test[] = { { "\xed\x9f\xbf", -1, 3, TRUE }, { "\xee\x80\x80", -1, 3, TRUE }, { "\xef\xbf\xbd", -1, 3, TRUE }, + { "\xf1\x80\x80\x80", -1, 4, TRUE }, { "\xf4\x8f\xbf\xbf", -1, 4, TRUE }, - { "\xf4\x90\x80\x80", -1, 0, FALSE }, + { "\xf4\x90\x80\x80", -1, 0, FALSE }, /* bigger than U+10FFFF */ /* malformed sequences */ /* continuation bytes */ { "\x80", -1, 0, FALSE }, @@ -94,6 +95,18 @@ static Test global_test[] = { { "\x80\xbf\x80\xbf\x80", -1, 0, FALSE }, { "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE }, { "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE }, + { "\xe0\xa0\x20", -1, 0, FALSE }, + { "\xe1\x80\x20", -1, 0, FALSE }, + { "\xed\x80\x20", -1, 0, FALSE }, + { "\xf0\xc0\x80\x80", -1, 0, FALSE }, + { "\xf0\x90\x20\x80", -1, 0, FALSE }, + { "\xf0\x90\x80\x20", -1, 0, FALSE }, + { "\xf1\x20\x80\x80", -1, 0, FALSE }, + { "\xf1\x80\x20\x80", -1, 0, FALSE }, + { "\xf1\x80\x80\x20", -1, 0, FALSE }, + { "\xf4\x7f\x80\x80", -1, 0, FALSE }, + { "\xf4\x80\x20\x80", -1, 0, FALSE }, + { "\xf4\x80\x80\x20", -1, 0, FALSE }, /* all possible continuation byte */ { "\x80", -1, 0, FALSE }, @@ -253,6 +266,9 @@ static Test global_test[] = { { "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE }, { "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE }, { "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE }, + { "\xe0\x9f\x80", -1, 0, FALSE }, + { "\xe0\xc0\x80", -1, 0, FALSE }, + { "\xf0\x8f\x80\x80", -1, 0, FALSE }, /* illegal code positions */ { "\x20\xed\xa0\x80\x20", -1, 1, FALSE }, { "\x20\xed\xad\xbf\x20", -1, 1, FALSE }, @@ -270,6 +286,14 @@ static Test global_test[] = { { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE }, { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE }, + /* ASCII boundaries */ + { "\x00", 1, 0, FALSE }, + { "\x01", -1, 1, TRUE }, + { "\x02", -1, 1, TRUE }, + { "\x7d", -1, 1, TRUE }, + { "\x7e", -1, 1, TRUE }, + { "\x7f", -1, 1, TRUE }, + { NULL, 0, 0, 0 } }; -- GitLab From 72384894b890dc2a45a0ced8c09162cac1c6b1ee Mon Sep 17 00:00:00 2001 From: Philip Withnall Date: Thu, 3 Oct 2024 15:46:01 +0100 Subject: [PATCH 5/5] gutf8: Remove dead branch condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This arm of the condition is always true, because 0x00 has been checked in the previous branch. This is not going to improve performance, but does mean we now have full branch coverage of the code via our unit tests, which gives some assurance that it’s all good. Signed-off-by: Philip Withnall Helps: #3481 --- glib/gutf8.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/glib/gutf8.c b/glib/gutf8.c index 2e43dc0444..51e01a020a 100644 --- a/glib/gutf8.c +++ b/glib/gutf8.c @@ -1690,7 +1690,7 @@ utf8_verify (const char **strp, if (b == 0x00) goto out; - else if (b >= 0x01 && b <= 0x7F) + else if (b <= 0x7F) { /* * Special-case and optimize the ASCII case. -- GitLab