From 1f4b017894e89c3ef853b179d2709ee559c30986 Mon Sep 17 00:00:00 2001
From: Guillaume Desmottes <guillaume.desmottes@collabora.com>
Date: Mon, 6 Jul 2020 11:17:00 +0200
Subject: [PATCH 1/9] glib: unicode: fix unicode standard link

---
 glib/gunicode.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/glib/gunicode.h b/glib/gunicode.h
index d729803804..3cfb7c7a04 100644
--- a/glib/gunicode.h
+++ b/glib/gunicode.h
@@ -207,7 +207,7 @@ typedef enum
  * Since new unicode versions may add new types here, applications should be ready 
  * to handle unknown values. They may be regarded as %G_UNICODE_BREAK_UNKNOWN.
  *
- * See [Unicode Line Breaking Algorithm](http://www.unicode.org/unicode/reports/tr14/).
+ * See [Unicode Line Breaking Algorithm](https://www.unicode.org/reports/tr14/).
  */
 typedef enum
 {
-- 
GitLab


From 5ac1eb5b2020e509487c920895da1757c122cb66 Mon Sep 17 00:00:00 2001
From: Guillaume Desmottes <guillaume.desmottes@collabora.com>
Date: Tue, 7 Jul 2020 16:50:51 +0200
Subject: [PATCH 2/9] glib: unicode: add tests for g_utf8_normalize()

Test corner cases and some examples from Unicode Standard Annex #15
http://unicode.org/reports/tr15/
---
 glib/tests/unicode.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/glib/tests/unicode.c b/glib/tests/unicode.c
index fa8bd1fa1f..1ee995ffa5 100644
--- a/glib/tests/unicode.c
+++ b/glib/tests/unicode.c
@@ -1610,6 +1610,45 @@ test_iso15924 (void)
 #undef PACK
 }
 
+static void
+test_normalize (void)
+{
+  guint i;
+  typedef struct
+  {
+    const gchar *str;
+    const gchar *nfd;
+    const gchar *nfc;
+    const gchar *nfkd;
+    const gchar *nfkc;
+  } Test;
+  Test tests[] = {
+    { "Äffin", "A\u0308ffin", "Äffin", "A\u0308ffin", "Äffin" },
+    { "Ä\uFB03n", "A\u0308\uFB03n", "Ä\uFB03n", "A\u0308ffin", "Äffin" },
+    { "Henry IV", "Henry IV", "Henry IV", "Henry IV", "Henry IV" },
+    { "Henry \u2163", "Henry \u2163", "Henry \u2163", "Henry IV", "Henry IV" },
+    { "non-utf\x88", NULL, NULL, NULL, NULL },
+    { "", "", "", "", "" },
+  };
+
+#define TEST(str, mode, expected)                         \
+  G_STMT_START { /* expands to one statement */           \
+    gchar *normalized = g_utf8_normalize (str, -1, mode); \
+    g_assert_cmpstr (normalized, ==, expected);           \
+    g_free (normalized);                                  \
+  } G_STMT_END
+
+  for (i = 0; i < G_N_ELEMENTS (tests); i++)
+    {
+      TEST (tests[i].str, G_NORMALIZE_NFD, tests[i].nfd);
+      TEST (tests[i].str, G_NORMALIZE_NFC, tests[i].nfc);
+      TEST (tests[i].str, G_NORMALIZE_NFKD, tests[i].nfkd);
+      TEST (tests[i].str, G_NORMALIZE_NFKC, tests[i].nfkc);
+    }
+
+#undef TEST
+}
+
 int
 main (int   argc,
       char *argv[])
@@ -1651,6 +1690,7 @@ main (int   argc,
   g_test_add_func ("/unicode/xdigit", test_xdigit);
   g_test_add_func ("/unicode/xdigit-value", test_xdigit_value);
   g_test_add_func ("/unicode/zero-width", test_zerowidth);
+  g_test_add_func ("/unicode/normalize", test_normalize);
 
   return g_test_run();
 }
-- 
GitLab


From a9772523dba6c9ac57105e8f5cad8e1d014f2956 Mon Sep 17 00:00:00 2001
From: Guillaume Desmottes <guillaume.desmottes@collabora.com>
Date: Wed, 8 Jul 2020 16:47:14 +0200
Subject: [PATCH 3/9] glib: unicode: add some empty string tests

---
 glib/tests/unicode.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/glib/tests/unicode.c b/glib/tests/unicode.c
index 1ee995ffa5..612c1097ba 100644
--- a/glib/tests/unicode.c
+++ b/glib/tests/unicode.c
@@ -437,6 +437,10 @@ test_strup (void)
   /* Tricky, comparing two unicode strings with an ASCII function */
   g_assert_cmpstr (str_up, ==, "AAZZ09X;\003E\357\274\241\357\274\241");
   g_free (str_up);
+
+  str_up = g_utf8_strup ("", 0);
+  g_assert_cmpstr (str_up, ==, "");
+  g_free (str_up);
 }
 
 /* Test that g_utf8_strdown() returns the correct value for various
@@ -462,6 +466,10 @@ test_strdown (void)
   /* Tricky, comparing two unicode strings with an ASCII function */
   g_assert_cmpstr (str_down, ==, "aazz09x;\003\007\357\275\201\357\275\201");
   g_free (str_down);
+
+  str_down = g_utf8_strdown ("", 0);
+  g_assert_cmpstr (str_down, ==, "");
+  g_free (str_down);
 }
 
 /* Test that g_utf8_casefold() returns the correct value for various
@@ -487,6 +495,10 @@ test_casefold (void)
   /* Tricky, comparing two unicode strings with an ASCII function */
   g_assert_cmpstr (str_casefold, ==, "aazz09x;\357\275\201\357\275\201");
   g_free (str_casefold);
+
+  str_casefold = g_utf8_casefold ("", 0);
+  g_assert_cmpstr (str_casefold, ==, "");
+  g_free (str_casefold);
 }
 
 /* Test that g_unichar_ismark() returns the correct value for various
-- 
GitLab


From 2975a623b38d5952ca80f640abb246a9e2ac2512 Mon Sep 17 00:00:00 2001
From: Guillaume Desmottes <guillaume.desmottes@collabora.com>
Date: Thu, 9 Jul 2020 10:05:28 +0200
Subject: [PATCH 4/9] glib: unicode: fix hand crafted casemap tests

These tests fail when using libicu, which is the reference Unicode
implementation, so I assume the expected values were wrong.
The original author also noted in a comment that they were not sure
about them, which suggests they were indeed wrong.
---
 glib/guniprop.c          | 40 +++-------------------------------------
 tests/casemap.txt        | 13 ++++++-------
 tests/gen-casemap-txt.py | 19 +++++++++----------
 3 files changed, 18 insertions(+), 54 deletions(-)

diff --git a/glib/guniprop.c b/glib/guniprop.c
index 619b399089..dde4ea792f 100644
--- a/glib/guniprop.c
+++ b/glib/guniprop.c
@@ -803,7 +803,6 @@ real_toupper (const gchar *str,
   const gchar *p = str;
   const char *last = NULL;
   gsize len = 0;
-  gboolean last_was_i = FALSE;
 
   while ((max_len < 0 || p < str + max_len) && *p)
     {
@@ -814,38 +813,6 @@ real_toupper (const gchar *str,
       last = p;
       p = g_utf8_next_char (p);
 
-      if (locale_type == LOCALE_LITHUANIAN)
-	{
-	  if (c == 'i')
-	    last_was_i = TRUE;
-	  else 
-	    {
-	      if (last_was_i)
-		{
-		  /* Nasty, need to remove any dot above. Though
-		   * I think only E WITH DOT ABOVE occurs in practice
-		   * which could simplify this considerably.
-		   */
-		  gsize decomp_len, i;
-		  gunichar decomp[G_UNICHAR_MAX_DECOMPOSITION_LENGTH];
-
-		  decomp_len = g_unichar_fully_decompose (c, FALSE, decomp, G_N_ELEMENTS (decomp));
-		  for (i=0; i < decomp_len; i++)
-		    {
-		      if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */)
-			len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL);
-		    }
-		  
-		  len += output_marks (&p, out_buffer ? out_buffer + len : NULL, TRUE);
-
-		  continue;
-		}
-
-	      if (!ISMARK (t))
-		last_was_i = FALSE;
-	    }
-	}
-
       if (locale_type == LOCALE_TURKIC && c == 'i')
 	{
 	  /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */
@@ -856,11 +823,10 @@ real_toupper (const gchar *str,
 	  /* Nasty, need to move it after other combining marks .. this would go away if
 	   * we normalized first.
 	   */
-	  len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE);
-
 	  /* And output as GREEK CAPITAL LETTER IOTA */
-	  len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL); 	  
-	}
+          len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL);
+          len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE);
+        }
       else if (IS (t,
 		   OR (G_UNICODE_LOWERCASE_LETTER,
 		   OR (G_UNICODE_TITLECASE_LETTER,
diff --git a/tests/casemap.txt b/tests/casemap.txt
index eccc12b1ea..332cbc1c64 100644
--- a/tests/casemap.txt
+++ b/tests/casemap.txt
@@ -10,16 +10,15 @@ tr_TR.UTF-8	i	i	İ	İ	# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
 tr_TR.UTF-8	I	ı	I	I	# I => LATIN SMALL LETTER DOTLESS I
 tr_TR.UTF-8	İ	i	İ	İ	# I => LATIN SMALL LETTER DOTLESS I
 # Test reordering of YPOGEGRAMMENI across other accents
-	ᾁ	ᾁ	ᾉ	ἉΙ	
+	ᾁ	ᾁ	ᾉ	ΑἹ	
 	ᾁ	ᾁ	ᾉ	ἉΙ	
 # Handling of final and nonfinal sigma
 	ΜΆΙΟΣ 	μάιος 	Μάιος 	ΜΆΙΟΣ 	
 	ΜΆΙΟΣ	μάιος	Μάιος	ΜΆΙΟΣ	
 	ΣΙΓΜΑ	σιγμα	Σιγμα	ΣΙΓΜΑ	
-# Lithuanian rule of i followed by letter with dot. Not at all sure
-# about the titlecase part here
-lt_LT	iė	iė	Ie	IE	
-lt_LT	iė	iė	Ie	IE	
+# Lithuanian rule of i followed by letter with dot.
+lt_LT	iė	iė	Ie	IĖ	
+lt_LT	iė	iė	Ie	IĖ	
 lt_LT	Ì	i̇̀	Ì	Ì	 # LATIN CAPITAL LETTER I WITH GRAVE
 lt_LT	Í	i̇́	Í	Í	 # LATIN CAPITAL LETTER I WITH ACUTE
 lt_LT	Ĩ	i̇̃	Ĩ	Ĩ	 # LATIN CAPITAL LETTER I WITH TILDE
@@ -29,8 +28,8 @@ lt_LT	Ĩ	i̇̃	Ĩ	Ĩ	 # LATIN CAPITAL LETTER I (with tilde above)
 lt_LT	Į́	į̇́	Į́	Į́	 # LATIN CAPITAL LETTER I (with ogonek and acute accent)
 lt_LT	J́	j̇́	J́	J́	 # LATIN CAPITAL LETTER J (with acute accent)
 lt_LT	Į́	į̇́	Į́	Į́	 # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
-lt_LT.UTF-8	iė	iė	Ie	IE	
-lt_LT.UTF-8	iė	iė	Ie	IE	
+lt_LT.UTF-8	iė	iė	Ie	IĖ	
+lt_LT.UTF-8	iė	iė	Ie	IĖ	
 lt_LT.UTF-8	Ì	i̇̀	Ì	Ì	 # LATIN CAPITAL LETTER I WITH GRAVE
 lt_LT.UTF-8	Í	i̇́	Í	Í	 # LATIN CAPITAL LETTER I WITH ACUTE
 lt_LT.UTF-8	Ĩ	i̇̃	Ĩ	Ĩ	 # LATIN CAPITAL LETTER I WITH TILDE
diff --git a/tests/gen-casemap-txt.py b/tests/gen-casemap-txt.py
index 62d59638a7..731fb59c1d 100755
--- a/tests/gen-casemap-txt.py
+++ b/tests/gen-casemap-txt.py
@@ -184,16 +184,15 @@ tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
 tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
 tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
 # Test reordering of YPOGEGRAMMENI across other accents
-\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
+\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0399\u0314\t
 \t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
 # Handling of final and nonfinal sigma
-\tΜΆΙΟΣ 	μάιος 	Μάιος 	ΜΆΙΟΣ \t
-\tΜΆΙΟΣ	μάιος	Μάιος	ΜΆΙΟΣ\t
-\tΣΙΓΜΑ	σιγμα	Σιγμα	ΣΙΓΜΑ\t
-# Lithuanian rule of i followed by letter with dot. Not at all sure
-# about the titlecase part here
-lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
-lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
+\tΜΆΙΟΣ 	μάιος 	Μάιος 	ΜΆΙΟΣ 	
+\tΜΆΙΟΣ	μάιος	Μάιος	ΜΆΙΟΣ	
+\tΣΙΓΜΑ	σιγμα	Σιγμα	ΣΙΓΜΑ	
+# Lithuanian rule of i followed by letter with dot.
+lt_LT\ti\u0117\ti\u0117\tIe\tIĖ\t
+lt_LT\tie\u0307\tie\u0307\tIe\tIĖ\t
 lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
 lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
 lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
@@ -203,8 +202,8 @@ lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with
 lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
 lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
 lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
-lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
-lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
+lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIĖ\t
+lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIĖ\t
 lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
 lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
 lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
-- 
GitLab


From 6cddb8d28394fa1ab85d0ba06eeb1659aea160bb Mon Sep 17 00:00:00 2001
From: Guillaume Desmottes <guillaume.desmottes@collabora.com>
Date: Mon, 6 Jul 2020 12:02:31 +0200
Subject: [PATCH 5/9] glib: add optional libicu dep

This will be used to remove the Unicode tables from GLib and so reduce
the binary size.

Depend on libicu 66 as that's the version implementing Unicode 13 that
our tests rely on.
---
 glib/meson.build  |  2 +-
 meson.build       | 10 ++++++++++
 meson_options.txt |  9 ++++++++-
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/glib/meson.build b/glib/meson.build
index 8c18e6de40..e2db4616f0 100644
--- a/glib/meson.build
+++ b/glib/meson.build
@@ -377,7 +377,7 @@ libglib = library('glib-2.0',
   # intl.lib is not compatible with SAFESEH
   link_args : [noseh_link_args, glib_link_flags, win32_ldflags],
   include_directories : configinc,
-  dependencies : pcre_deps + [thread_dep, librt] + libintl_deps + libiconv + platform_deps + [gnulib_libm_dependency, libm] + [libsysprof_capture_dep],
+  dependencies : pcre_deps + [thread_dep, librt, libicu_dep] + libintl_deps + libiconv + platform_deps + [gnulib_libm_dependency, libm] + [libsysprof_capture_dep],
   c_args : glib_c_args,
   objc_args : glib_c_args,
 )
diff --git a/meson.build b/meson.build
index 0d892fb2df..c5521243ea 100644
--- a/meson.build
+++ b/meson.build
@@ -2308,6 +2308,16 @@ if want_systemtap and enable_dtrace
   enable_systemtap = true
 endif
 
+unicode_opt = get_option('unicode')
+if unicode_opt == 'libicu'
+  warning('Building with \'unicode=libicu\' is meant to be used ONLY on custom embedded platforms, this feature is NOT meant to be used by general-purpose Linux distributions.')
+  libicu_dep = dependency('icu-uc', version: '>= 66')
+  glib_conf.set('HAVE_LIBICU', true)
+else
+  libicu_dep = []
+  glib_conf.set('HAVE_LIBICU', false)
+endif
+
 test_timeout = 60
 test_timeout_slow = 180
 
diff --git a/meson_options.txt b/meson_options.txt
index 072765361e..1beba38675 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -126,4 +126,11 @@ option('glib_checks',
 option('libelf',
        type : 'feature',
        value : 'auto',
-       description : 'Enable support for listing and extracting from ELF resource files with gresource tool')
\ No newline at end of file
+       description : 'Enable support for listing and extracting from ELF resource files with gresource tool')
+
+option('unicode',
+       type : 'combo',
+       choices : ['internal', 'libicu'],
+       value : 'internal',
+       yield : true,
+       description : 'Unicode implementation to use (\'internal\' = \'GLib\'s own implementation\'; \'libicu\' = \'External libicu. WARNING: should only be used on custom embedded platforms, this feature is NOT meant to be used by general-purpose Linux distributions\';)')
-- 
GitLab


From 958d7af6634313efb4c15f4c2ea90f1d1d8bbdc8 Mon Sep 17 00:00:00 2001
From: Guillaume Desmottes <guillaume.desmottes@collabora.com>
Date: Mon, 6 Jul 2020 12:43:49 +0200
Subject: [PATCH 6/9] glib: unicode: stop using gunibreak.h with libicu

Re-implement g_unichar_break_type() using libicu so we can remove the
tables from gunibreak.h in the glib binary and so reduce its size.
---
 glib/gunibreak.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)

diff --git a/glib/gunibreak.c b/glib/gunibreak.c
index 334acd3d42..22dafe20e3 100644
--- a/glib/gunibreak.c
+++ b/glib/gunibreak.c
@@ -20,6 +20,8 @@
 
 #include <stdlib.h>
 
+#ifndef HAVE_LIBICU
+
 #include "gunibreak.h"
 
 #define TPROP_PART1(Page, Char) \
@@ -39,6 +41,109 @@
       ? TPROP_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
       : G_UNICODE_BREAK_UNKNOWN))
 
+#else /* HAVE_LIBICU */
+
+#include <unicode/uchar.h>
+#include "gunicode.h"
+
+static GUnicodeBreakType
+u_line_break_to_g_unicode_break_type (ULineBreak code)
+{ /* Translate an ICU ULineBreak value into the corresponding GUnicodeBreakType. */
+  switch (code) /* no default: label; unmatched values reach the return after the switch */
+    {
+    case U_LB_UNKNOWN: /*[XX]*/
+      return G_UNICODE_BREAK_UNKNOWN;
+    case U_LB_AMBIGUOUS: /*[AI]*/
+      return G_UNICODE_BREAK_AMBIGUOUS;
+    case U_LB_ALPHABETIC: /*[AL]*/
+      return G_UNICODE_BREAK_ALPHABETIC;
+    case U_LB_BREAK_BOTH: /*[B2]*/
+      return G_UNICODE_BREAK_BEFORE_AND_AFTER;
+    case U_LB_BREAK_AFTER: /*[BA]*/
+      return G_UNICODE_BREAK_AFTER;
+    case U_LB_BREAK_BEFORE: /*[BB]*/
+      return G_UNICODE_BREAK_BEFORE;
+    case U_LB_MANDATORY_BREAK: /*[BK]*/
+      return G_UNICODE_BREAK_MANDATORY;
+    case U_LB_CONTINGENT_BREAK: /*[CB]*/
+      return G_UNICODE_BREAK_CONTINGENT;
+    case U_LB_CLOSE_PUNCTUATION: /*[CL]*/
+      return G_UNICODE_BREAK_CLOSE_PUNCTUATION;
+    case U_LB_COMBINING_MARK: /*[CM]*/
+      return G_UNICODE_BREAK_COMBINING_MARK;
+    case U_LB_CARRIAGE_RETURN: /*[CR]*/
+      return G_UNICODE_BREAK_CARRIAGE_RETURN;
+    case U_LB_EXCLAMATION: /*[EX]*/
+      return G_UNICODE_BREAK_EXCLAMATION;
+    case U_LB_GLUE: /*[GL]*/
+      return G_UNICODE_BREAK_NON_BREAKING_GLUE;
+    case U_LB_HYPHEN: /*[HY]*/
+      return G_UNICODE_BREAK_HYPHEN;
+    case U_LB_IDEOGRAPHIC: /*[ID]*/
+      return G_UNICODE_BREAK_IDEOGRAPHIC;
+    case U_LB_INSEPARABLE: /*[IN]*/
+      return G_UNICODE_BREAK_INSEPARABLE;
+    case U_LB_INFIX_NUMERIC: /*[IS]*/
+      return G_UNICODE_BREAK_INFIX_SEPARATOR;
+    case U_LB_LINE_FEED: /*[LF]*/
+      return G_UNICODE_BREAK_LINE_FEED;
+    case U_LB_NONSTARTER: /*[NS]*/
+      return G_UNICODE_BREAK_NON_STARTER;
+    case U_LB_NUMERIC: /*[NU]*/
+      return G_UNICODE_BREAK_NUMERIC;
+    case U_LB_OPEN_PUNCTUATION: /*[OP]*/
+      return G_UNICODE_BREAK_OPEN_PUNCTUATION;
+    case U_LB_POSTFIX_NUMERIC: /*[PO]*/
+      return G_UNICODE_BREAK_POSTFIX;
+    case U_LB_PREFIX_NUMERIC: /*[PR]*/
+      return G_UNICODE_BREAK_PREFIX;
+    case U_LB_QUOTATION: /*[QU]*/
+      return G_UNICODE_BREAK_QUOTATION;
+    case U_LB_COMPLEX_CONTEXT: /*[SA]*/
+      return G_UNICODE_BREAK_COMPLEX_CONTEXT;
+    case U_LB_SURROGATE: /*[SG]*/
+      return G_UNICODE_BREAK_SURROGATE;
+    case U_LB_SPACE: /*[SP]*/
+      return G_UNICODE_BREAK_SPACE;
+    case U_LB_BREAK_SYMBOLS: /*[SY]*/
+      return G_UNICODE_BREAK_SYMBOL;
+    case U_LB_ZWSPACE: /*[ZW]*/
+      return G_UNICODE_BREAK_ZERO_WIDTH_SPACE;
+    case U_LB_NEXT_LINE: /*[NL]*/
+      return G_UNICODE_BREAK_NEXT_LINE;
+    case U_LB_WORD_JOINER: /*[WJ]*/
+      return G_UNICODE_BREAK_WORD_JOINER;
+    case U_LB_H2: /*[H2]*/
+      return G_UNICODE_BREAK_HANGUL_LV_SYLLABLE;
+    case U_LB_H3: /*[H3]*/
+      return G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE;
+    case U_LB_JL: /*[JL]*/
+      return G_UNICODE_BREAK_HANGUL_L_JAMO;
+    case U_LB_JT: /*[JT]*/
+      return G_UNICODE_BREAK_HANGUL_T_JAMO;
+    case U_LB_JV: /*[JV]*/
+      return G_UNICODE_BREAK_HANGUL_V_JAMO;
+    case U_LB_CLOSE_PARENTHESIS: /*[CP]*/
+      return G_UNICODE_BREAK_CLOSE_PARANTHESIS;
+    case U_LB_CONDITIONAL_JAPANESE_STARTER: /*[CJ]*/
+      return G_UNICODE_BREAK_CONDITIONAL_JAPANESE_STARTER;
+    case U_LB_HEBREW_LETTER: /*[HL]*/
+      return G_UNICODE_BREAK_HEBREW_LETTER;
+    case U_LB_REGIONAL_INDICATOR: /*[RI]*/
+      return G_UNICODE_BREAK_REGIONAL_INDICATOR;
+    case U_LB_E_BASE: /*[EB]*/
+      return G_UNICODE_BREAK_EMOJI_BASE;
+    case U_LB_E_MODIFIER: /*[EM]*/
+      return G_UNICODE_BREAK_EMOJI_MODIFIER;
+    case U_LB_ZWJ: /*[ZWJ]*/
+      return G_UNICODE_BREAK_ZERO_WIDTH_JOINER;
+    case U_LB_COUNT: /* presumably ICU's enum-size marker, not a real class (TODO confirm); treated as unknown */
+      break;
+    }
+  return G_UNICODE_BREAK_UNKNOWN; /* reached for U_LB_COUNT and for any value without a case above */
+}
+#endif /* HAVE_LIBICU */
+
 /**
  * g_unichar_break_type:
  * @c: a Unicode character
@@ -55,5 +160,12 @@
 GUnicodeBreakType
 g_unichar_break_type (gunichar c)
 {
+#ifdef HAVE_LIBICU
+  gint32 line_break;
+
+  line_break = u_getIntPropertyValue (c, UCHAR_LINE_BREAK);
+  return u_line_break_to_g_unicode_break_type (line_break);
+#else /* !HAVE_LIBICU */
   return PROP (c);
+#endif
 }
-- 
GitLab


From b4090a7f48452ac3976c6227d494e9f9cd73028f Mon Sep 17 00:00:00 2001
From: Guillaume Desmottes <guillaume.desmottes@collabora.com>
Date: Mon, 6 Jul 2020 15:16:12 +0200
Subject: [PATCH 7/9] glib: unicode: stop using gunichartables.h with libicu

By using libicu we can remove the unicode tables and so reduce the glib
binary size at the cost of converting to/from UTF-16 for string
operations.
---
 glib/guniprop.c | 285 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 267 insertions(+), 18 deletions(-)

diff --git a/glib/guniprop.c b/glib/guniprop.c
index dde4ea792f..0dd3af8bc0 100644
--- a/glib/guniprop.c
+++ b/glib/guniprop.c
@@ -25,11 +25,11 @@
 #include <locale.h>
 
 #include "gmem.h"
+#include "gstrfuncs.h"
 #include "gstring.h"
 #include "gtestutils.h"
 #include "gtypes.h"
 #include "gunicode.h"
-#include "gunichartables.h"
 #include "gmirroringtable.h"
 #include "gscripttable.h"
 #include "gunicodeprivate.h"
@@ -37,6 +37,13 @@
 #include "gwin32.h"
 #endif
 
+#ifdef HAVE_LIBICU
+#include <unicode/uchar.h>
+#include <unicode/ustring.h>
+#else
+#include "gunichartables.h"
+#endif
+
 #define G_UNICHAR_FULLWIDTH_A 0xff21
 #define G_UNICHAR_FULLWIDTH_I 0xff29
 #define G_UNICHAR_FULLWIDTH_J 0xff2a
@@ -44,6 +51,8 @@
 #define G_UNICHAR_FULLWIDTH_a 0xff41
 #define G_UNICHAR_FULLWIDTH_f 0xff46
 
+#ifndef HAVE_LIBICU
+
 #define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
                           ? attr_table_part1[Page] \
                           : attr_table_part2[(Page) - 0xe00])
@@ -68,6 +77,81 @@
       ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
       : G_UNICODE_UNASSIGNED))
 
+#else /* HAVE_LIBICU */
+
+static GUnicodeType
+u_char_category_to_g_unicode_type (UCharCategory c)
+{ /* Translate an ICU UCharCategory value into the corresponding GUnicodeType. */
+  switch (c) /* no default: label; unmatched values reach the return after the switch */
+    {
+    case U_GENERAL_OTHER_TYPES:
+    case U_CHAR_CATEGORY_COUNT: /* presumably the enum-size marker (TODO confirm); grouped with "other types" */
+      return G_UNICODE_UNASSIGNED;
+    case U_UPPERCASE_LETTER:
+      return G_UNICODE_UPPERCASE_LETTER;
+    case U_LOWERCASE_LETTER:
+      return G_UNICODE_LOWERCASE_LETTER;
+    case U_TITLECASE_LETTER:
+      return G_UNICODE_TITLECASE_LETTER;
+    case U_MODIFIER_LETTER:
+      return G_UNICODE_MODIFIER_LETTER;
+    case U_OTHER_LETTER:
+      return G_UNICODE_OTHER_LETTER;
+    case U_NON_SPACING_MARK:
+      return G_UNICODE_NON_SPACING_MARK;
+    case U_ENCLOSING_MARK:
+      return G_UNICODE_ENCLOSING_MARK;
+    case U_COMBINING_SPACING_MARK:
+      return G_UNICODE_SPACING_MARK;
+    case U_DECIMAL_DIGIT_NUMBER:
+      return G_UNICODE_DECIMAL_NUMBER;
+    case U_LETTER_NUMBER:
+      return G_UNICODE_LETTER_NUMBER;
+    case U_OTHER_NUMBER:
+      return G_UNICODE_OTHER_NUMBER;
+    case U_SPACE_SEPARATOR:
+      return G_UNICODE_SPACE_SEPARATOR;
+    case U_LINE_SEPARATOR:
+      return G_UNICODE_LINE_SEPARATOR;
+    case U_PARAGRAPH_SEPARATOR:
+      return G_UNICODE_PARAGRAPH_SEPARATOR;
+    case U_CONTROL_CHAR:
+      return G_UNICODE_CONTROL;
+    case U_FORMAT_CHAR:
+      return G_UNICODE_FORMAT;
+    case U_PRIVATE_USE_CHAR:
+      return G_UNICODE_PRIVATE_USE;
+    case U_SURROGATE:
+      return G_UNICODE_SURROGATE;
+    case U_DASH_PUNCTUATION:
+      return G_UNICODE_DASH_PUNCTUATION;
+    case U_START_PUNCTUATION:
+      return G_UNICODE_OPEN_PUNCTUATION;
+    case U_END_PUNCTUATION:
+      return G_UNICODE_CLOSE_PUNCTUATION;
+    case U_CONNECTOR_PUNCTUATION:
+      return G_UNICODE_CONNECT_PUNCTUATION;
+    case U_OTHER_PUNCTUATION:
+      return G_UNICODE_OTHER_PUNCTUATION;
+    case U_MATH_SYMBOL:
+      return G_UNICODE_MATH_SYMBOL;
+    case U_CURRENCY_SYMBOL:
+      return G_UNICODE_CURRENCY_SYMBOL;
+    case U_MODIFIER_SYMBOL:
+      return G_UNICODE_MODIFIER_SYMBOL;
+    case U_OTHER_SYMBOL:
+      return G_UNICODE_OTHER_SYMBOL;
+    case U_INITIAL_PUNCTUATION:
+      return G_UNICODE_INITIAL_PUNCTUATION;
+    case U_FINAL_PUNCTUATION:
+      return G_UNICODE_FINAL_PUNCTUATION;
+    }
+  return G_UNICODE_UNASSIGNED; /* reached for any value without a case above */
+}
+
+#define TYPE(Char) u_char_category_to_g_unicode_type (u_charType (Char))
+
+#endif /* HAVE_LIBICU */
 
 #define IS(Type, Class)	(((guint)1 << (Type)) & (Class))
 #define OR(Type, Rest)	(((guint)1 << (Type)) | (Rest))
@@ -351,11 +435,15 @@ g_unichar_isupper (gunichar c)
 gboolean
 g_unichar_istitle (gunichar c)
 {
+#ifdef HAVE_LIBICU
+  return u_istitle (c);
+#else
   unsigned int i;
   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
     if (title_table[i][0] == c)
       return TRUE;
   return FALSE;
+#endif
 }
 
 /**
@@ -428,6 +516,7 @@ g_unichar_iszerowidth (gunichar c)
   return FALSE;
 }
 
+#ifndef HAVE_LIBICU
 static int
 interval_compare (const void *key, const void *elt)
 {
@@ -467,6 +556,7 @@ g_unichar_iswide_bsearch (gunichar ch)
 
   return FALSE;
 }
+#endif /* !HAVE_LIBICU */
 
 /**
  * g_unichar_iswide:
@@ -480,10 +570,16 @@ g_unichar_iswide_bsearch (gunichar ch)
 gboolean
 g_unichar_iswide (gunichar c)
 {
+#ifdef HAVE_LIBICU
+  UEastAsianWidth ea = (UEastAsianWidth) u_getIntPropertyValue (c, UCHAR_EAST_ASIAN_WIDTH);
+
+  return ea == U_EA_FULLWIDTH || ea == U_EA_WIDE;
+#else
   if (c < g_unicode_width_table_wide[0].start)
     return FALSE;
   else
     return g_unichar_iswide_bsearch (c);
+#endif
 }
 
 
@@ -512,6 +608,13 @@ g_unichar_iswide_cjk (gunichar c)
   if (g_unichar_iswide (c))
     return TRUE;
 
+#ifdef HAVE_LIBICU
+  {
+    UEastAsianWidth ea = (UEastAsianWidth) u_getIntPropertyValue (c, UCHAR_EAST_ASIAN_WIDTH);
+
+    return ea == U_EA_AMBIGUOUS;
+  }
+#else
   /* bsearch() is declared attribute(nonnull(1)) so we can't validly search
    * for a NULL key */
   if (c == 0)
@@ -525,6 +628,7 @@ g_unichar_iswide_cjk (gunichar c)
     return TRUE;
 
   return FALSE;
+#endif
 }
 
 
@@ -541,6 +645,9 @@ g_unichar_iswide_cjk (gunichar c)
 gunichar
 g_unichar_toupper (gunichar c)
 {
+#ifdef HAVE_LIBICU
+  return u_toupper (c);
+#else
   int t = TYPE (c);
   if (t == G_UNICODE_LOWERCASE_LETTER)
     {
@@ -566,6 +673,7 @@ g_unichar_toupper (gunichar c)
 	}
     }
   return c;
+#endif
 }
 
 /**
@@ -581,6 +689,9 @@ g_unichar_toupper (gunichar c)
 gunichar
 g_unichar_tolower (gunichar c)
 {
+#ifdef HAVE_LIBICU
+  return u_tolower (c);
+#else
   int t = TYPE (c);
   if (t == G_UNICODE_UPPERCASE_LETTER)
     {
@@ -607,6 +718,7 @@ g_unichar_tolower (gunichar c)
 	}
     }
   return c;
+#endif
 }
 
 /**
@@ -622,6 +734,9 @@ g_unichar_tolower (gunichar c)
 gunichar
 g_unichar_totitle (gunichar c)
 {
+#ifdef HAVE_LIBICU
+  return u_totitle (c);
+#else
   unsigned int i;
 
   /* We handle U+0000 explicitly because some elements in
@@ -640,6 +755,7 @@ g_unichar_totitle (gunichar c)
     return g_unichar_toupper (c);
 
   return c;
+#endif
 }
 
 /**
@@ -655,9 +771,13 @@ g_unichar_totitle (gunichar c)
 int
 g_unichar_digit_value (gunichar c)
 {
+#ifdef HAVE_LIBICU
+  return u_digit (c, 10);
+#else
   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
     return ATTTABLE (c >> 8, c & 0xff);
   return -1;
+#endif
 }
 
 /**
@@ -673,6 +793,9 @@ g_unichar_digit_value (gunichar c)
 int
 g_unichar_xdigit_value (gunichar c)
 {
+#ifdef HAVE_LIBICU
+  return u_digit (c, 16);
+#else
   if (c >= 'A' && c <= 'F')
     return c - 'A' + 10;
   if (c >= 'a' && c <= 'f')
@@ -684,6 +807,7 @@ g_unichar_xdigit_value (gunichar c)
   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
     return ATTTABLE (c >> 8, c & 0xff);
   return -1;
+#endif
 }
 
 /**
@@ -704,6 +828,7 @@ g_unichar_type (gunichar c)
  * Case mapping functions
  */
 
+#ifndef HAVE_LIBICU
 typedef enum {
   LOCALE_NORMAL,
   LOCALE_TURKIC,
@@ -874,6 +999,17 @@ real_toupper (const gchar *str,
 
   return len;
 }
+#else /* HAVE_LIBICU */
+static gchar *
+get_locale (void)
+{ /* Locale name passed to the ICU case-mapping calls; the callers g_free() the result. */
+#ifdef G_OS_WIN32
+  return g_win32_getlocale ();
+#else
+  return g_strdup (setlocale (LC_CTYPE, NULL)); /* NULL argument only queries the current LC_CTYPE locale */
+#endif
+}
+#endif /* HAVE_LIBICU */
 
 /**
  * g_utf8_strup:
@@ -894,24 +1030,64 @@ g_utf8_strup (const gchar *str,
 	      gssize       len)
 {
   gsize result_len;
-  LocaleType locale_type;
-  gchar *result;
+  gchar *result = NULL;
 
   g_return_val_if_fail (str != NULL, NULL);
 
-  locale_type = get_locale_type ();
-  
   /*
    * We use a two pass approach to keep memory management simple
    */
-  result_len = real_toupper (str, len, NULL, locale_type);
-  result = g_malloc (result_len + 1);
-  real_toupper (str, len, result, locale_type);
-  result[result_len] = '\0';
+#ifdef HAVE_LIBICU
+  {
+    UErrorCode error = U_ZERO_ERROR;
+    gunichar2 *orig_utf16 = NULL, *result_utf16 = NULL;
+    glong utf16_len;
+    gchar *locale;
+
+    locale = get_locale ();
+    orig_utf16 = g_utf8_to_utf16 (str, len, NULL, &utf16_len, NULL);
+
+    result_len = u_strToUpper (NULL, 0, orig_utf16, utf16_len, locale, &error);
+    /* Buffer Overflow is expected from the preflight operation */
+    if (error != U_BUFFER_OVERFLOW_ERROR && result_len > 0)
+      goto out;
+
+    if (result_len == 0)
+      {
+        result = g_strdup ("");
+        goto out;
+      }
+
+    result_utf16 = g_malloc ((result_len + 1) * 2);
+    error = U_ZERO_ERROR;
+    u_strToUpper (result_utf16, result_len + 1, orig_utf16, utf16_len, locale, &error);
+    if (U_FAILURE (error))
+      goto out;
+
+    result = g_utf16_to_utf8 (result_utf16, result_len, NULL, NULL, NULL);
+
+  out:
+    g_free (locale);
+    g_free (result_utf16);
+    g_free (orig_utf16);
+  }
+#else
+  {
+    LocaleType locale_type;
+
+    locale_type = get_locale_type ();
+
+    result_len = real_toupper (str, len, NULL, locale_type);
+    result = g_malloc (result_len + 1);
+    real_toupper (str, len, result, locale_type);
+    result[result_len] = '\0';
+  }
+#endif
 
   return result;
 }
 
+#ifndef HAVE_LIBICU
 /* traverses the string checking for characters with combining class == 230
  * until a base character is found */
 static gboolean
@@ -1065,6 +1241,7 @@ real_tolower (const gchar *str,
 
   return len;
 }
+#endif
 
 /**
  * g_utf8_strdown:
@@ -1084,20 +1261,59 @@ g_utf8_strdown (const gchar *str,
 		gssize       len)
 {
   gsize result_len;
-  LocaleType locale_type;
-  gchar *result;
+  gchar *result = NULL;
 
   g_return_val_if_fail (str != NULL, NULL);
 
-  locale_type = get_locale_type ();
-  
   /*
    * We use a two pass approach to keep memory management simple
    */
-  result_len = real_tolower (str, len, NULL, locale_type);
-  result = g_malloc (result_len + 1);
-  real_tolower (str, len, result, locale_type);
-  result[result_len] = '\0';
+#ifdef HAVE_LIBICU
+  {
+    UErrorCode error = U_ZERO_ERROR;
+    gunichar2 *orig_utf16 = NULL, *result_utf16 = NULL;
+    glong utf16_len;
+    gchar *locale;
+
+    locale = get_locale ();
+    orig_utf16 = g_utf8_to_utf16 (str, len, NULL, &utf16_len, NULL);
+
+    result_len = u_strToLower (NULL, 0, orig_utf16, utf16_len, locale, &error);
+    /* Buffer Overflow is expected from the preflight operation */
+    if (error != U_BUFFER_OVERFLOW_ERROR && result_len > 0)
+      goto out;
+
+    if (result_len == 0)
+      {
+        result = g_strdup ("");
+        goto out;
+      }
+
+    result_utf16 = g_malloc ((result_len + 1) * 2);
+    error = U_ZERO_ERROR;
+    u_strToLower (result_utf16, result_len + 1, orig_utf16, utf16_len, locale, &error);
+    if (U_FAILURE (error))
+      goto out;
+
+    result = g_utf16_to_utf8 (result_utf16, result_len, NULL, NULL, NULL);
+
+  out:
+    g_free (locale);
+    g_free (result_utf16);
+    g_free (orig_utf16);
+  }
+#else
+  {
+    LocaleType locale_type;
+
+    locale_type = get_locale_type ();
+
+    result_len = real_tolower (str, len, NULL, locale_type);
+    result = g_malloc (result_len + 1);
+    real_tolower (str, len, result, locale_type);
+    result[result_len] = '\0';
+  }
+#endif
 
   return result;
 }
@@ -1126,6 +1342,7 @@ gchar *
 g_utf8_casefold (const gchar *str,
 		 gssize       len)
 {
+#ifndef HAVE_LIBICU
   GString *result;
   const char *p;
 
@@ -1166,7 +1383,50 @@ g_utf8_casefold (const gchar *str,
       p = g_utf8_next_char (p);
     }
 
-  return g_string_free (result, FALSE); 
+  return g_string_free (result, FALSE);
+#else /* HAVE_LIBICU */
+  UErrorCode error = U_ZERO_ERROR;
+  gchar *result = NULL;
+  gunichar2 *orig_utf16 = NULL, *result_utf16 = NULL;
+  glong utf16_len = 0;
+  gint32 result_len;
+
+  g_return_val_if_fail (str != NULL, NULL);
+
+  orig_utf16 = g_utf8_to_utf16 (str, len, NULL, &utf16_len, NULL);
+  if (orig_utf16 == NULL)
+    {
+      /* @str could not be converted (e.g. invalid UTF-8): there is no
+       * UTF-16 text to hand to ICU, so produce an empty string. */
+      result = g_strdup ("");
+      goto out;
+    }
+
+  result_len = u_strFoldCase (NULL, 0, orig_utf16, utf16_len, U_FOLD_CASE_DEFAULT, &error);
+  /* Buffer Overflow is expected from the preflight operation */
+  if (error != U_BUFFER_OVERFLOW_ERROR && result_len != 0)
+    goto out;
+
+  if (result_len == 0)
+    {
+      /* Leave through the common exit so that orig_utf16 is released */
+      result = g_strdup ("");
+      goto out;
+    }
+
+  result_utf16 = g_malloc ((result_len + 1) * 2);
+  error = U_ZERO_ERROR;
+  u_strFoldCase (result_utf16, result_len + 1, orig_utf16, utf16_len, U_FOLD_CASE_DEFAULT, &error);
+  if (U_FAILURE (error))
+    goto out;
+
+  result = g_utf16_to_utf8 (result_utf16, result_len, NULL, NULL, NULL);
+
+out:
+  g_free (result_utf16);
+  g_free (orig_utf16);
+  return result;
+#endif
 }
 
 /**
-- 
GitLab


From 972d73f63ee2c8854b98ee0136797f1e21f5acae Mon Sep 17 00:00:00 2001
From: Guillaume Desmottes <guillaume.desmottes@collabora.com>
Date: Tue, 7 Jul 2020 12:42:27 +0200
Subject: [PATCH 8/9] glib: unicode: stop using gunicomp.h and gunidecomp.h
 with libicu

Same logic as the previous patches: stop using the internal Unicode tables
when built with libicu.

Fix #1333
---
 glib/gunidecomp.c | 182 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 180 insertions(+), 2 deletions(-)

diff --git a/glib/gunidecomp.c b/glib/gunidecomp.c
index feaa25c42d..dcc1a5294a 100644
--- a/glib/gunidecomp.c
+++ b/glib/gunidecomp.c
@@ -59,11 +59,19 @@
 #include <stdlib.h>
 
 #include "gunicode.h"
-#include "gunidecomp.h"
 #include "gmem.h"
-#include "gunicomp.h"
 #include "gunicodeprivate.h"
 
+#ifdef HAVE_LIBICU
+#include <string.h>
+#include <unicode/unorm2.h>
+
+#define COMBINING_CLASS(Char) u_getCombiningClass (Char)
+
+#else /* !HAVE_LIBICU */
+
+#include "gunidecomp.h"
+#include "gunicomp.h"
 
 #define CC_PART1(Page, Char) \
   ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
@@ -82,6 +90,8 @@
       ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
       : 0))
 
+#endif /* !HAVE_LIBICU */
+
 /**
  * g_unichar_combining_class:
  * @uc: a Unicode character
@@ -157,6 +167,7 @@ g_unicode_canonical_ordering (gunichar *string,
     }
 }
 
+#ifndef HAVE_LIBICU
 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
  * r should be null or have sufficient space. Calling with r == NULL will
  * only calculate the result_len; however, a buffer with space for three
@@ -229,6 +240,7 @@ find_decomposition (gunichar ch,
 
   return NULL;
 }
+#endif /* !HAVE_LIBICU */
 
 /**
  * g_unicode_canonical_decomposition:
@@ -247,6 +259,18 @@ gunichar *
 g_unicode_canonical_decomposition (gunichar ch,
 				   gsize   *result_len)
 {
+#ifdef HAVE_LIBICU
+  gunichar buffer[G_UNICHAR_MAX_DECOMPOSITION_LENGTH];
+  gunichar *res;
+
+  *result_len = g_unichar_fully_decompose (ch, FALSE, buffer, G_UNICHAR_MAX_DECOMPOSITION_LENGTH);
+  res = g_malloc ((*result_len + 1) * sizeof (gunichar));
+
+  memcpy (res, buffer, *result_len * sizeof (gunichar));
+  res[*result_len] = '\0';
+
+  return res;
+#else
   const gchar *decomp;
   const gchar *p;
   gunichar *r;
@@ -278,8 +302,10 @@ g_unicode_canonical_decomposition (gunichar ch,
     }
 
   return r;
+#endif
 }
 
+#ifndef HAVE_LIBICU
 /* L,V => LV and LV,T => LVT  */
 static gboolean
 combine_hangul (gunichar a,
@@ -366,12 +392,74 @@ combine (gunichar  a,
 
   return FALSE;
 }
+#endif /* !HAVE_LIBICU */
 
 gunichar *
 _g_utf8_normalize_wc (const gchar    *str,
 		      gssize          max_len,
 		      GNormalizeMode  mode)
 {
+#ifdef HAVE_LIBICU
+  const UNormalizer2 *norm;
+  UErrorCode error = U_ZERO_ERROR;
+  gunichar2 *orig_utf16, *res_utf16;
+  glong utf16_len;
+  gint32 result_len;
+  gunichar *res;
+
+  switch (mode)
+    {
+    case G_NORMALIZE_NFC:
+      norm = unorm2_getNFCInstance (&error);
+      break;
+    case G_NORMALIZE_NFKC:
+      norm = unorm2_getNFKCInstance (&error);
+      break;
+    case G_NORMALIZE_NFKD:
+      norm = unorm2_getNFKDInstance (&error);
+      break;
+    default:
+    case G_NORMALIZE_NFD:
+      norm = unorm2_getNFDInstance (&error);
+      break;
+    }
+
+  if (U_FAILURE (error))
+    return NULL;
+
+  orig_utf16 = g_utf8_to_utf16 (str, max_len, NULL, &utf16_len, NULL);
+  if (!orig_utf16)
+    return NULL;
+
+  result_len = unorm2_normalize (norm, orig_utf16, utf16_len, NULL, 0, &error);
+  /* Buffer Overflow is expected from the preflight operation */
+  if (error != U_BUFFER_OVERFLOW_ERROR && result_len > 0)
+    {
+      g_free (orig_utf16);
+      return NULL;
+    }
+
+  if (result_len == 0)
+    {
+      g_free (orig_utf16);
+      return g_utf8_to_ucs4 ("", 0, NULL, NULL, NULL);
+    }
+
+  res_utf16 = g_malloc (sizeof (gunichar2) * result_len);
+  error = U_ZERO_ERROR;
+  result_len = unorm2_normalize (norm, orig_utf16, utf16_len, res_utf16, result_len, &error);
+  g_free (orig_utf16);
+
+  if (U_FAILURE (error))
+    {
+      g_free (res_utf16);
+      return NULL;
+    }
+
+  res = g_utf16_to_ucs4 (res_utf16, result_len, NULL, NULL, NULL);
+  g_free (res_utf16);
+  return res;
+#else
   gsize n_wc;
   gunichar *wc_buffer;
   const char *p;
@@ -502,6 +590,7 @@ _g_utf8_normalize_wc (const gchar    *str,
   wc_buffer[n_wc] = 0;
 
   return wc_buffer;
+#endif
 }
 
 /**
@@ -548,12 +637,16 @@ g_utf8_normalize (const gchar    *str,
   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
   gchar *result;
 
+  if (!result_wc)
+    return NULL;
+
   result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
   g_free (result_wc);
 
   return result;
 }
 
+#ifndef HAVE_LIBICU
 static gboolean
 decompose_hangul_step (gunichar  ch,
                        gunichar *a,
@@ -582,6 +675,7 @@ decompose_hangul_step (gunichar  ch,
 
   return TRUE;
 }
+#endif /* !HAVE_LIBICU */
 
 /**
  * g_unichar_decompose:
@@ -622,6 +716,43 @@ g_unichar_decompose (gunichar  ch,
                      gunichar *a,
                      gunichar *b)
 {
+#ifdef HAVE_LIBICU
+  const UNormalizer2 *norm;
+  UErrorCode error = U_ZERO_ERROR;
+  /* A raw canonical mapping is at most two code points, and each of them
+   * may need a surrogate pair once encoded as UTF-16. */
+  gunichar2 out16[4];
+  gint32 len, pos = 0;
+  UChar32 c;
+
+  norm = unorm2_getNFDInstance (&error);
+  if (U_FAILURE (error))
+    goto no_decompose;
+
+  len = unorm2_getRawDecomposition (norm, ch, out16, G_N_ELEMENTS (out16), &error);
+  if (U_FAILURE (error) || len <= 0)
+    goto no_decompose;
+
+  /* Decode UTF-16 into code points instead of copying raw code units */
+  U16_NEXT (out16, pos, len, c);
+  *a = c;
+  if (pos < len)
+    {
+      U16_NEXT (out16, pos, len, c);
+      *b = c;
+    }
+  else
+    {
+      *b = 0;
+    }
+
+  return TRUE;
+
+no_decompose:
+  *a = ch;
+  *b = 0;
+  return FALSE;
+#else
   gint start = 0;
   gint end = G_N_ELEMENTS (decomp_step_table);
 
@@ -655,6 +776,7 @@ g_unichar_decompose (gunichar  ch,
   *b = 0;
 
   return FALSE;
+#endif
 }
 
 /**
@@ -689,11 +811,32 @@ g_unichar_compose (gunichar  a,
                    gunichar  b,
                    gunichar *ch)
 {
+#ifdef HAVE_LIBICU
+  const UNormalizer2 *norm;
+  UErrorCode error = U_ZERO_ERROR;
+  /* Signed on purpose: composePair() reports "no composition" with a
+   * negative value, which an unsigned gunichar could never compare < 0. */
+  UChar32 res = -1;
+
+  norm = unorm2_getNFCInstance (&error);
+  if (U_SUCCESS (error))
+    res = unorm2_composePair (norm, a, b);
+
+  if (res < 0)
+    {
+      *ch = 0;
+      return FALSE;
+    }
+
+  *ch = res;
+  return TRUE;
+#else
   if (combine (a, b, ch))
     return TRUE;
 
   *ch = 0;
   return FALSE;
+#endif
 }
 
 /**
@@ -733,6 +872,45 @@ g_unichar_fully_decompose (gunichar  ch,
 			   gunichar *result,
 			   gsize     result_len)
 {
+#ifdef HAVE_LIBICU
+  const UNormalizer2 *norm;
+  UErrorCode error = U_ZERO_ERROR;
+  /* ICU hands back UTF-16: reserve two code units per decomposed character
+   * so the whole mapping always fits, whatever @result_len is. */
+  gunichar2 out16[G_UNICHAR_MAX_DECOMPOSITION_LENGTH * 2];
+  gint32 len16, pos = 0;
+  gsize n_chars = 0;
+
+  if (compat)
+    norm = unorm2_getNFKDInstance (&error);
+  else
+    norm = unorm2_getNFDInstance (&error);
+
+  if (U_FAILURE (error))
+    goto no_decompose;
+
+  len16 = unorm2_getDecomposition (norm, ch, out16, G_N_ELEMENTS (out16), &error);
+  if (U_FAILURE (error) || len16 <= 0)
+    goto no_decompose;
+
+  /* Count every decomposed code point, but only store those that fit */
+  while (pos < len16)
+    {
+      UChar32 c;
+
+      U16_NEXT (out16, pos, len16, c);
+      if (result && n_chars < result_len)
+        result[n_chars] = c;
+      n_chars++;
+    }
+
+  return n_chars;
+
+no_decompose:
+  if (result && result_len >= 1)
+    *result = ch;
+  return 1;
+#else /* !HAVE_LIBICU */
   const gchar *decomp;
   const gchar *p;
 
@@ -764,4 +941,5 @@ g_unichar_fully_decompose (gunichar  ch,
   if (result && result_len >= 1)
     *result = ch;
   return 1;
+#endif
 }
-- 
GitLab


From 3224062f036f9c4ff3b514a30c0d473ebdf6dd31 Mon Sep 17 00:00:00 2001
From: Guillaume Desmottes <guillaume.desmottes@collabora.com>
Date: Thu, 16 Jul 2020 12:36:45 +0200
Subject: [PATCH 9/9] ci: add build with libicu support

Use libicu official binary release.
---
 .gitlab-ci.yml | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e79a667679..f3840b3049 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -537,3 +537,30 @@ dist-job:
       - "${CI_PROJECT_DIR}/_build/gobject-docs-$CI_COMMIT_TAG.tar.xz"
       - "${CI_PROJECT_DIR}/_build/gio-docs-$CI_COMMIT_TAG.tar.xz"
       - "${CI_PROJECT_DIR}/_build/meson-dist/glib-*.tar.xz"
+
+fedora-x86_64-libicu:
+  extends: .build
+  image: $FEDORA_IMAGE
+  stage: build
+  script:
+    - wget https://github.com/unicode-org/icu/releases/download/release-67-1/icu4c-67_1-Fedora31-x64.tgz
+    - sudo tar xf icu4c-67_1-Fedora31-x64.tgz -C /usr/local/ --strip-components=4
+    - echo /usr/local/lib | sudo tee -a /etc/ld.so.conf.d/local-lib.conf
+    - sudo ldconfig
+    - meson ${MESON_COMMON_OPTIONS_NO_WARNING}
+            --werror
+            -Dunicode=libicu
+            -Dpkg_config_path=/usr/local/lib/pkgconfig/
+            _build
+    - ninja -C _build
+    - .gitlab-ci/run-tests.sh
+  artifacts:
+    reports:
+      junit: "_build/${CI_JOB_NAME}-report.xml"
+    name: "glib-${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}"
+    when: always
+    paths:
+      - "_build/config.h"
+      - "_build/glib/glibconfig.h"
+      - "_build/meson-logs"
+      - "_build/${CI_JOB_NAME}-report.xml"
\ No newline at end of file
-- 
GitLab