From 1d3d7336ed6681747e7b5e1ddcff580147c686c8 Mon Sep 17 00:00:00 2001
From: Christian Hergert <chergert@redhat.com>
Date: Mon, 30 Sep 2024 11:28:21 -0700
Subject: [PATCH 1/5] glib/utf8: Use SIMD for UTF-8 validation

This is based on the https://github.com/c-util/c-utf8 project and has
been adapted for portability and integration into GLib. c-utf8 is dual
licensed Apache-2.0 and LGPLv2.1+, the latter matching GLib.

Notably, `case 0x01 ... 0x7F:` style case-range labels (a GNU C
extension) have been converted to if/else chains, which are portable to
compilers other than GCC and Clang while generating the same assembly,
at least on x86_64 with GCC.

Additionally, `__attribute__((aligned(n)))` is used in favor of
`__builtin_assume_aligned(n)` because it has a direct MSVC equivalent,
`__declspec(align(n))`, and with GCC it generates the same assembly as
`__builtin_assume_aligned(n)` does.

With GCC on x86_64 Linux (Xeon 4214), this improves the throughput of
g_utf8_validate() on ASCII input from 750 MB/s to around 10,000 MB/s
(13x).

With GCC on aarch64 Linux (Apple Silicon M2 Pro), it improves from about
2,200 MB/s to 26,700 MB/s (12x).

Closes: #3481
---
 glib/gutf8.c | 388 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 234 insertions(+), 154 deletions(-)

diff --git a/glib/gutf8.c b/glib/gutf8.c
index dd9966df52..e52e1dc61b 100644
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -1,7 +1,8 @@
 /* gutf8.c - Operations on UTF-8 strings.
  *
  * Copyright (C) 1999 Tom Tromey
- * Copyright (C) 2000 Red Hat, Inc.
+ * Copyright (C) 2000, 2015-2022 Red Hat, Inc.
+ * Copyright (C) 2022-2023 David Rheinsberg
  *
  * SPDX-License-Identifier: LGPL-2.1-or-later
  *
@@ -1565,166 +1566,255 @@ g_ucs4_to_utf16 (const gunichar  *str,
   return result;
 }
 
-#define VALIDATE_BYTE(mask, expect)                      \
-  G_STMT_START {                                         \
-    if (G_UNLIKELY((*(guchar *)p & (mask)) != (expect))) \
-      goto error;                                        \
-  } G_STMT_END
+/* SIMD-based UTF-8 validation originates in the c-utf8 project from
+ * https://github.com/c-util/c-utf8/ from the following authors:
+ *
+ *   David Rheinsberg <david@readahead.eu>
+ *   Evgeny Vereshchagin <evvers@ya.ru>
+ *   Jan Engelhardt <jengelh@inai.de>
+ *   Tom Gundersen <teg@jklm.no>
+ *
+ * It has been adapted for portability and integration.
+ * The original code is dual-licensed Apache-2.0 or LGPLv2.1+
+ */
 
-/* see IETF RFC 3629 Section 4 */
+#define align_to(_val, _to) (((_val) + (_to) - 1) & ~((_to) - 1))
+
+static inline guint8
+load_u8 (gconstpointer memory,
+         gsize         offset)
+{
+  return ((const guint8 *)memory)[offset];
+}
 
-static const gchar *
-fast_validate (const char *str)
+#if G_GNUC_CHECK_VERSION(4,8) || defined(__clang__)
+# define _attribute_aligned(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+# define _attribute_aligned(n) __declspec(align(n))
+#else
+# define _attribute_aligned(n)
+#endif
 
+static inline gsize
+load_word (gconstpointer memory,
+           gsize         offset)
 {
-  const gchar *p;
+#if GLIB_SIZEOF_VOID_P == 8
+  _attribute_aligned(8) const guint8 *m = ((const guint8 *)memory) + offset;
+
+  return ((guint64)m[0] <<  0) | ((guint64)m[1] <<  8) |
+         ((guint64)m[2] << 16) | ((guint64)m[3] << 24) |
+         ((guint64)m[4] << 32) | ((guint64)m[5] << 40) |
+         ((guint64)m[6] << 48) | ((guint64)m[7] << 56);
+#else
+  _attribute_aligned(4) const guint8 *m = ((const guint8 *)memory) + offset;
+
+  return ((guint)m[0] <<  0) | ((guint)m[1] <<  8) |
+         ((guint)m[2] << 16) | ((guint)m[3] << 24);
+#endif
+}
 
-  for (p = str; *p; p++)
+/* The following constants are truncated on 32-bit machines */
+#define UTF8_ASCII_MASK ((gsize)0x8080808080808080L)
+#define UTF8_ASCII_SUB  ((gsize)0x0101010101010101L)
+
+static inline int
+utf8_word_is_ascii (gsize word)
+{
+  /* True unless any byte is NUL (0x00) or has the MSB set. */
+  return ((((word - UTF8_ASCII_SUB) | word) & UTF8_ASCII_MASK) == 0);
+}
+
+static void
+utf8_verify_ascii (const char **strp,
+                   gsize       *lenp)
+{
+  const char *str = *strp;
+  gsize len = lenp ? *lenp : (gsize)-1;
+
+  while (len > 0 && load_u8 (str, 0) < 128)
     {
-      if (*(guchar *)p < 128)
-	/* done */;
-      else 
-	{
-	  const gchar *last;
+      if ((gpointer) align_to ((guintptr) str, sizeof (gsize)) == str)
+        {
+          while (len >= 2 * sizeof (gsize))
+            {
+              if (!utf8_word_is_ascii (load_word (str, 0)) ||
+                  !utf8_word_is_ascii (load_word (str, sizeof (gsize))))
+                break;
 
-	  last = p;
-	  if (*(guchar *)p < 0xe0) /* 110xxxxx */
-	    {
-	      if (G_UNLIKELY (*(guchar *)p < 0xc2))
-		goto error;
-	    }
-	  else
-	    {
-	      if (*(guchar *)p < 0xf0) /* 1110xxxx */
-		{
-		  switch (*(guchar *)p++ & 0x0f)
-		    {
-		    case 0:
-		      VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
-		      break;
-		    case 0x0d:
-		      VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
-		      break;
-		    default:
-		      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		    }
-		}
-	      else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
-		{
-		  switch (*(guchar *)p++ & 0x07)
-		    {
-		    case 0:
-		      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		      if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
-			goto error;
-		      break;
-		    case 4:
-		      VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
-		      break;
-		    default:
-		      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		    }
-		  p++;
-		  VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		}
-	      else
-		goto error;
-	    }
+              str += 2 * sizeof(gsize);
+              len -= 2 * sizeof(gsize);
+            }
 
-	  p++;
-	  VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
+          while (len > 0 && load_u8 (str, 0) < 128)
+            {
+              if G_UNLIKELY (load_u8 (str, 0) == 0x00)
+                goto out;
 
-	  continue;
+              ++str;
+              --len;
+            }
+        }
+      else
+        {
+          if G_UNLIKELY (load_u8 (str, 0) == 0x00)
+            goto out;
 
-	error:
-	  return last;
-	}
+          ++str;
+          --len;
+        }
     }
 
-  return p;
+out:
+  *strp = str;
+
+  if (lenp)
+    *lenp = len;
 }
 
-static const gchar *
-fast_validate_len (const char *str,
-		   gssize      max_len)
+#define UTF8_CHAR_IS_TAIL(_x) (((_x) & 0xC0) == 0x80)
 
+static void
+utf8_verify (const char **strp,
+             gsize       *lenp)
 {
-  const gchar *p;
+  const char *str = *strp;
+  gsize len = lenp ? *lenp : (gsize)-1;
 
-  g_assert (max_len >= 0);
+  /* See Unicode 10.0.0, Chapter 3, Section D92 */
 
-  for (p = str; ((p - str) < max_len) && *p; p++)
+  while (len > 0)
     {
-      if (*(guchar *)p < 128)
-	/* done */;
-      else 
-	{
-	  const gchar *last;
+      guint8 b = load_u8 (str, 0);
 
-	  last = p;
-	  if (*(guchar *)p < 0xe0) /* 110xxxxx */
-	    {
-	      if (G_UNLIKELY (max_len - (p - str) < 2))
-		goto error;
-	      
-	      if (G_UNLIKELY (*(guchar *)p < 0xc2))
-		goto error;
-	    }
-	  else
-	    {
-	      if (*(guchar *)p < 0xf0) /* 1110xxxx */
-		{
-		  if (G_UNLIKELY (max_len - (p - str) < 3))
-		    goto error;
-
-		  switch (*(guchar *)p++ & 0x0f)
-		    {
-		    case 0:
-		      VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
-		      break;
-		    case 0x0d:
-		      VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
-		      break;
-		    default:
-		      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		    }
-		}
-	      else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
-		{
-		  if (G_UNLIKELY (max_len - (p - str) < 4))
-		    goto error;
-
-		  switch (*(guchar *)p++ & 0x07)
-		    {
-		    case 0:
-		      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		      if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
-			goto error;
-		      break;
-		    case 4:
-		      VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
-		      break;
-		    default:
-		      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		    }
-		  p++;
-		  VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
-		}
-	      else
-		goto error;
-	    }
+      if (b == 0x00)
+        goto out;
+
+      else if (b >= 0x01 && b <= 0x7F)
+        {
+          /*
+           * Special-case and optimize the ASCII case.
+           */
+          utf8_verify_ascii ((const char **)&str, &len);
+        }
 
-	  p++;
-	  VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
+      else if (b >= 0xC2 && b <= 0xDF)
+        {
+          if G_UNLIKELY (len < 2)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
+            goto out;
 
-	  continue;
+          str += 2;
+          len -= 2;
 
-	error:
-	  return last;
-	}
+        }
+
+      else if (b == 0xE0)
+        {
+          if G_UNLIKELY (len < 3)
+            goto out;
+          if G_UNLIKELY (load_u8 (str, 1) < 0xA0 || load_u8 (str, 1) > 0xBF)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;
+
+          str += 3;
+          len -= 3;
+        }
+
+      else if (b >= 0xE1 && b <= 0xEC)
+        {
+          if G_UNLIKELY (len < 3)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;
+
+          str += 3;
+          len -= 3;
+        }
+
+      else if (b == 0xED)
+        {
+          if G_UNLIKELY (len < 3)
+            goto out;
+          if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x9F)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;
+
+          str += 3;
+          len -= 3;
+        }
+
+      else if (b >= 0xEE && b <= 0xEF)
+        {
+          if G_UNLIKELY (len < 3)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;
+
+          str += 3;
+          len -= 3;
+        }
+
+      else if (b == 0xF0)
+        {
+          if G_UNLIKELY (len < 4)
+            goto out;
+          if G_UNLIKELY (load_u8 (str, 1) < 0x90 || load_u8 (str, 1) > 0xBF)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
+            goto out;
+
+          str += 4;
+          len -= 4;
+        }
+
+      else if (b >= 0xF1 && b <= 0xF3)
+        {
+          if G_UNLIKELY (len < 4)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 1)))
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
+            goto out;
+
+          str += 4;
+          len -= 4;
+        }
+
+      else if (b == 0xF4)
+        {
+          if G_UNLIKELY (len < 4)
+            goto out;
+          if G_UNLIKELY (load_u8 (str, 1) < 0x80 || load_u8 (str, 1) > 0x8F)
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 2)))
+            goto out;
+          if G_UNLIKELY (!UTF8_CHAR_IS_TAIL (load_u8 (str, 3)))
+            goto out;
+
+          str += 4;
+          len -= 4;
+        }
+
+      else goto out;
     }
 
-  return p;
+out:
+  *strp = str;
+
+  if (lenp)
+    *lenp = len;
 }
 
 /**
@@ -1757,20 +1847,15 @@ g_utf8_validate (const char   *str,
 		 const gchar **end)
 
 {
-  const gchar *p;
-
   if (max_len >= 0)
     return g_utf8_validate_len (str, max_len, end);
 
-  p = fast_validate (str);
+  utf8_verify (&str, NULL);
 
-  if (end)
-    *end = p;
+  if (end != NULL)
+    *end = str;
 
-  if (*p != '\0')
-    return FALSE;
-  else
-    return TRUE;
+  return *str == 0;
 }
 
 /**
@@ -1793,17 +1878,12 @@ g_utf8_validate_len (const char   *str,
                      const gchar **end)
 
 {
-  const gchar *p;
-
-  p = fast_validate_len (str, max_len);
+  utf8_verify (&str, &max_len);
 
-  if (end)
-    *end = p;
+  if (end != NULL)
+    *end = str;
 
-  if (p != str + max_len)
-    return FALSE;
-  else
-    return TRUE;
+  return max_len == 0;
 }
 
 /**
-- 
GitLab


From b72650542c9285dcbb00f2a4bf6a226bdf758014 Mon Sep 17 00:00:00 2001
From: Christian Hergert <chergert@redhat.com>
Date: Tue, 1 Oct 2024 15:30:05 -0700
Subject: [PATCH 2/5] glib/utf8: Use SIMD for g_str_is_ascii()

This moves g_str_is_ascii() from gstrfuncs.c to gutf8.c so that we can
reuse the same SIMD code for ASCII validation.

On Apple Silicon:

 Before:  3297 MB/s
  After: 26146 MB/s
---
 glib/gstrfuncs.c | 23 -----------------------
 glib/gutf8.c     | 19 +++++++++++++++++++
 2 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/glib/gstrfuncs.c b/glib/gstrfuncs.c
index 33faa80c27..f2c80e4e9c 100644
--- a/glib/gstrfuncs.c
+++ b/glib/gstrfuncs.c
@@ -1604,29 +1604,6 @@ g_ascii_strup (const gchar *str,
   return result;
 }
 
-/**
- * g_str_is_ascii:
- * @str: a string
- *
- * Determines if a string is pure ASCII. A string is pure ASCII if it
- * contains no bytes with the high bit set.
- *
- * Returns: true if @str is ASCII
- *
- * Since: 2.40
- */
-gboolean
-g_str_is_ascii (const gchar *str)
-{
-  gsize i;
-
-  for (i = 0; str[i]; i++)
-    if (str[i] & 0x80)
-      return FALSE;
-
-  return TRUE;
-}
-
 /**
  * g_strdown:
  * @string: the string to convert
diff --git a/glib/gutf8.c b/glib/gutf8.c
index e52e1dc61b..2e43dc0444 100644
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -1886,6 +1886,25 @@ g_utf8_validate_len (const char   *str,
   return max_len == 0;
 }
 
+/**
+ * g_str_is_ascii:
+ * @str: a string
+ *
+ * Determines if a string is pure ASCII. A string is pure ASCII if it
+ * contains no bytes with the high bit set.
+ *
+ * Returns: true if @str is ASCII
+ *
+ * Since: 2.40
+ */
+gboolean
+g_str_is_ascii (const gchar *str)
+{
+  utf8_verify_ascii (&str, NULL);
+
+  return *str == 0;
+}
+
 /**
  * g_unichar_validate:
  * @ch: a Unicode character
-- 
GitLab


From e570263483791b4d093084df2c8a3a1af6482daf Mon Sep 17 00:00:00 2001
From: Philip Withnall <pwithnall@gnome.org>
Date: Thu, 3 Oct 2024 14:31:02 +0100
Subject: [PATCH 3/5] tests: Add basic unit tests for g_str_is_ascii()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It turns out it’s not actually been explicitly tested before, even
though it has full code coverage through being called by other code
which is tested.

Signed-off-by: Philip Withnall <pwithnall@gnome.org>
---
 glib/tests/strfuncs.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/glib/tests/strfuncs.c b/glib/tests/strfuncs.c
index 5d4dfa3aa4..05f08c0ac9 100644
--- a/glib/tests/strfuncs.c
+++ b/glib/tests/strfuncs.c
@@ -2719,6 +2719,27 @@ test_set_str (void)
   g_free (str);
 }
 
+static void
+test_str_is_ascii (void)
+{
+  const char *ascii_strings[] = {
+    "",
+    "hello",
+    "is it me you're looking for",
+  };
+  const char *non_ascii_strings[] = {
+    "is it me you’re looking for",
+    "áccents",
+    "☺️",
+  };
+
+  for (size_t i = 0; i < G_N_ELEMENTS (ascii_strings); i++)
+    g_assert_true (g_str_is_ascii (ascii_strings[i]));
+
+  for (size_t i = 0; i < G_N_ELEMENTS (non_ascii_strings); i++)
+    g_assert_false (g_str_is_ascii (non_ascii_strings[i]));
+}
+
 int
 main (int   argc,
       char *argv[])
@@ -2775,6 +2796,7 @@ main (int   argc,
   g_test_add_func ("/strfuncs/test-is-to-digit", test_is_to_digit);
   g_test_add_func ("/strfuncs/transliteration", test_transliteration);
   g_test_add_func ("/strfuncs/str-equal", test_str_equal);
+  g_test_add_func ("/strfuncs/str-is-ascii", test_str_is_ascii);
 
   return g_test_run();
 }
-- 
GitLab


From 36e4bb98723f9997547aa479dbcbd08ea065b847 Mon Sep 17 00:00:00 2001
From: Philip Withnall <pwithnall@gnome.org>
Date: Thu, 3 Oct 2024 15:43:35 +0100
Subject: [PATCH 4/5] tests: Add some more UTF-8 validation corner cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The move to c-utf8 for validation has exposed a few new branches where
our existing (fairly comprehensive) UTF-8 validation test suite didn’t
check things.

Add unit tests for those branches, so we keep code coverage.

I’ve validated (with an independent UTF-8 decoder) that the test vectors
are correctly marked as valid/invalid in the test data (so the tests
aren’t just blindly coded to match the behaviour of the new validator
code).

Signed-off-by: Philip Withnall <pwithnall@gnome.org>

Helps: #3481
---
 glib/tests/utf8-validate.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/glib/tests/utf8-validate.c b/glib/tests/utf8-validate.c
index 6c230452c6..b77f0e18da 100644
--- a/glib/tests/utf8-validate.c
+++ b/glib/tests/utf8-validate.c
@@ -81,8 +81,9 @@ static Test global_test[] = {
   { "\xed\x9f\xbf", -1, 3, TRUE },
   { "\xee\x80\x80", -1, 3, TRUE },
   { "\xef\xbf\xbd", -1, 3, TRUE },
+  { "\xf1\x80\x80\x80", -1, 4, TRUE },
   { "\xf4\x8f\xbf\xbf", -1, 4, TRUE },
-  { "\xf4\x90\x80\x80", -1, 0, FALSE },
+  { "\xf4\x90\x80\x80", -1, 0, FALSE }, /* bigger than U+10FFFF */
   /* malformed sequences */
   /* continuation bytes */
   { "\x80", -1, 0, FALSE },
@@ -94,6 +95,18 @@ static Test global_test[] = {
   { "\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
   { "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE },
   { "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
+  { "\xe0\xa0\x20", -1, 0, FALSE },
+  { "\xe1\x80\x20", -1, 0, FALSE },
+  { "\xed\x80\x20", -1, 0, FALSE },
+  { "\xf0\xc0\x80\x80", -1, 0, FALSE },
+  { "\xf0\x90\x20\x80", -1, 0, FALSE },
+  { "\xf0\x90\x80\x20", -1, 0, FALSE },
+  { "\xf1\x20\x80\x80", -1, 0, FALSE },
+  { "\xf1\x80\x20\x80", -1, 0, FALSE },
+  { "\xf1\x80\x80\x20", -1, 0, FALSE },
+  { "\xf4\x7f\x80\x80", -1, 0, FALSE },
+  { "\xf4\x80\x20\x80", -1, 0, FALSE },
+  { "\xf4\x80\x80\x20", -1, 0, FALSE },
 
   /* all possible continuation byte */
   { "\x80", -1, 0, FALSE },
@@ -253,6 +266,9 @@ static Test global_test[] = {
   { "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE },
   { "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE },
   { "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE },
+  { "\xe0\x9f\x80", -1, 0, FALSE },
+  { "\xe0\xc0\x80", -1, 0, FALSE },
+  { "\xf0\x8f\x80\x80", -1, 0, FALSE },
   /* illegal code positions */
   { "\x20\xed\xa0\x80\x20", -1, 1, FALSE },
   { "\x20\xed\xad\xbf\x20", -1, 1, FALSE },
@@ -270,6 +286,14 @@ static Test global_test[] = {
   { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
   { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
 
+  /* ASCII boundaries */
+  { "\x00", 1, 0, FALSE },
+  { "\x01", -1, 1, TRUE },
+  { "\x02", -1, 1, TRUE },
+  { "\x7d", -1, 1, TRUE },
+  { "\x7e", -1, 1, TRUE },
+  { "\x7f", -1, 1, TRUE },
+
   { NULL, 0, 0, 0 }
 };
 
-- 
GitLab


From 72384894b890dc2a45a0ced8c09162cac1c6b1ee Mon Sep 17 00:00:00 2001
From: Philip Withnall <pwithnall@gnome.org>
Date: Thu, 3 Oct 2024 15:46:01 +0100
Subject: [PATCH 5/5] gutf8: Remove dead branch condition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `b >= 0x01` arm of the condition is always true, because
`b == 0x00` has already been handled by the previous branch.

This is not going to improve performance, but does mean we now have full
branch coverage of the code via our unit tests, which gives some
assurance that it’s all good.

Signed-off-by: Philip Withnall <pwithnall@gnome.org>

Helps: #3481
---
 glib/gutf8.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/glib/gutf8.c b/glib/gutf8.c
index 2e43dc0444..51e01a020a 100644
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -1690,7 +1690,7 @@ utf8_verify (const char **strp,
       if (b == 0x00)
         goto out;
 
-      else if (b >= 0x01 && b <= 0x7F)
+      else if (b <= 0x7F)
         {
           /*
            * Special-case and optimize the ASCII case.
-- 
GitLab