diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e79a66767991242afbfd36e1dccc9fe28259c0da..f3840b304962bb3d3c2be4c396160428c07f1759 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -537,3 +537,30 @@ dist-job: - "${CI_PROJECT_DIR}/_build/gobject-docs-$CI_COMMIT_TAG.tar.xz" - "${CI_PROJECT_DIR}/_build/gio-docs-$CI_COMMIT_TAG.tar.xz" - "${CI_PROJECT_DIR}/_build/meson-dist/glib-*.tar.xz" + +fedora-x86_64-libicu: + extends: .build + image: $FEDORA_IMAGE + stage: build + script: + - wget https://github.com/unicode-org/icu/releases/download/release-67-1/icu4c-67_1-Fedora31-x64.tgz + - sudo tar xf icu4c-67_1-Fedora31-x64.tgz -C /usr/local/ --strip-components=4 + - echo /usr/local/lib | sudo tee -a /etc/ld.so.conf.d/local-lib.conf + - sudo ldconfig + - meson ${MESON_COMMON_OPTIONS_NO_WARNING} + --werror + -Dunicode=libicu + -Dpkg_config_path=/usr/local/lib/pkgconfig/ + _build + - ninja -C _build + - .gitlab-ci/run-tests.sh + artifacts: + reports: + junit: "_build/${CI_JOB_NAME}-report.xml" + name: "glib-${CI_JOB_NAME}-${CI_COMMIT_REF_NAME}" + when: always + paths: + - "_build/config.h" + - "_build/glib/glibconfig.h" + - "_build/meson-logs" + - "_build/${CI_JOB_NAME}-report.xml" \ No newline at end of file diff --git a/glib/gunibreak.c b/glib/gunibreak.c index 334acd3d420097651008765175f46b94c821b6a1..22dafe20e325f8b9c2d89fe7bf200967817908b0 100644 --- a/glib/gunibreak.c +++ b/glib/gunibreak.c @@ -20,6 +20,8 @@ #include +#ifndef HAVE_LIBICU + #include "gunibreak.h" #define TPROP_PART1(Page, Char) \ @@ -39,6 +41,109 @@ ? TPROP_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \ : G_UNICODE_BREAK_UNKNOWN)) +#else /* HAVE_LIBICU */ + +#include +#include "gunicode.h" + +static GUnicodeBreakType +u_line_break_to_g_unicode_break_type (ULineBreak code) +{ + switch (code) + { + case U_LB_UNKNOWN: /*[XX]*/ + return G_UNICODE_BREAK_UNKNOWN; + case U_LB_AMBIGUOUS: /*[AI]*/ + return G_UNICODE_BREAK_AMBIGUOUS; + case U_LB_ALPHABETIC: /*[AL]*/ + return G_UNICODE_BREAK_ALPHABETIC; + case U_LB_BREAK_BOTH: /*[B2]*/ + return G_UNICODE_BREAK_BEFORE_AND_AFTER; + case U_LB_BREAK_AFTER: /*[BA]*/ + return G_UNICODE_BREAK_AFTER; + case U_LB_BREAK_BEFORE: /*[BB]*/ + return G_UNICODE_BREAK_BEFORE; + case U_LB_MANDATORY_BREAK: /*[BK]*/ + return G_UNICODE_BREAK_MANDATORY; + case U_LB_CONTINGENT_BREAK: /*[CB]*/ + return G_UNICODE_BREAK_CONTINGENT; + case U_LB_CLOSE_PUNCTUATION: /*[CL]*/ + return G_UNICODE_BREAK_CLOSE_PUNCTUATION; + case U_LB_COMBINING_MARK: /*[CM]*/ + return G_UNICODE_BREAK_COMBINING_MARK; + case U_LB_CARRIAGE_RETURN: /*[CR]*/ + return G_UNICODE_BREAK_CARRIAGE_RETURN; + case U_LB_EXCLAMATION: /*[EX]*/ + return G_UNICODE_BREAK_EXCLAMATION; + case U_LB_GLUE: /*[GL]*/ + return G_UNICODE_BREAK_NON_BREAKING_GLUE; + case U_LB_HYPHEN: /*[HY]*/ + return G_UNICODE_BREAK_HYPHEN; + case U_LB_IDEOGRAPHIC: /*[ID]*/ + return G_UNICODE_BREAK_IDEOGRAPHIC; + case U_LB_INSEPARABLE: /*[IN]*/ + return G_UNICODE_BREAK_INSEPARABLE; + case U_LB_INFIX_NUMERIC: /*[IS]*/ + return G_UNICODE_BREAK_INFIX_SEPARATOR; + case U_LB_LINE_FEED: /*[LF]*/ + return G_UNICODE_BREAK_LINE_FEED; + case U_LB_NONSTARTER: /*[NS]*/ + return G_UNICODE_BREAK_NON_STARTER; + case U_LB_NUMERIC: /*[NU]*/ + return G_UNICODE_BREAK_NUMERIC; + case U_LB_OPEN_PUNCTUATION: /*[OP]*/ + return G_UNICODE_BREAK_OPEN_PUNCTUATION; + case U_LB_POSTFIX_NUMERIC: /*[PO]*/ + return G_UNICODE_BREAK_POSTFIX; + case U_LB_PREFIX_NUMERIC: /*[PR]*/ + return G_UNICODE_BREAK_PREFIX; + case U_LB_QUOTATION: /*[QU]*/ + return G_UNICODE_BREAK_QUOTATION; + case U_LB_COMPLEX_CONTEXT: /*[SA]*/ + return G_UNICODE_BREAK_COMPLEX_CONTEXT; + case U_LB_SURROGATE: /*[SG]*/ + return G_UNICODE_BREAK_SURROGATE; + case U_LB_SPACE: /*[SP]*/ + return G_UNICODE_BREAK_SPACE; + case U_LB_BREAK_SYMBOLS: /*[SY]*/ + return G_UNICODE_BREAK_SYMBOL; + case U_LB_ZWSPACE: /*[ZW]*/ + return G_UNICODE_BREAK_ZERO_WIDTH_SPACE; + case U_LB_NEXT_LINE: /*[NL]*/ + return G_UNICODE_BREAK_NEXT_LINE; + case U_LB_WORD_JOINER: /*[WJ]*/ + return G_UNICODE_BREAK_WORD_JOINER; + case U_LB_H2: /*[H2]*/ + return G_UNICODE_BREAK_HANGUL_LV_SYLLABLE; + case U_LB_H3: /*[H3]*/ + return G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE; + case U_LB_JL: /*[JL]*/ + return G_UNICODE_BREAK_HANGUL_L_JAMO; + case U_LB_JT: /*[JT]*/ + return G_UNICODE_BREAK_HANGUL_T_JAMO; + case U_LB_JV: /*[JV]*/ + return G_UNICODE_BREAK_HANGUL_V_JAMO; + case U_LB_CLOSE_PARENTHESIS: /*[CP]*/ + return G_UNICODE_BREAK_CLOSE_PARANTHESIS; + case U_LB_CONDITIONAL_JAPANESE_STARTER: /*[CJ]*/ + return G_UNICODE_BREAK_CONDITIONAL_JAPANESE_STARTER; + case U_LB_HEBREW_LETTER: /*[HL]*/ + return G_UNICODE_BREAK_HEBREW_LETTER; + case U_LB_REGIONAL_INDICATOR: /*[RI]*/ + return G_UNICODE_BREAK_REGIONAL_INDICATOR; + case U_LB_E_BASE: /*[EB]*/ + return G_UNICODE_BREAK_EMOJI_BASE; + case U_LB_E_MODIFIER: /*[EM]*/ + return G_UNICODE_BREAK_EMOJI_MODIFIER; + case U_LB_ZWJ: /*[ZWJ]*/ + return G_UNICODE_BREAK_ZERO_WIDTH_JOINER; + case U_LB_COUNT: + break; + } + return G_UNICODE_BREAK_UNKNOWN; +} +#endif /* HAVE_LIBICU */ + /** * g_unichar_break_type: * @c: a Unicode character @@ -55,5 +160,12 @@ GUnicodeBreakType g_unichar_break_type (gunichar c) { +#ifdef HAVE_LIBICU + gint32 line_break; + + line_break = u_getIntPropertyValue (c, UCHAR_LINE_BREAK); + return u_line_break_to_g_unicode_break_type (line_break); +#else /* !HAVE_LIBICU */ return PROP (c); +#endif } diff --git a/glib/gunicode.h b/glib/gunicode.h index d729803804bb1660c6332c63a59ab46c9b245bfe..3cfb7c7a04efd51d563a52c8a0932b369c6d217c 100644 --- a/glib/gunicode.h +++ b/glib/gunicode.h @@ -207,7 +207,7 @@ typedef enum * Since new unicode versions may add new types here, applications should be ready * to handle unknown values. They may be regarded as %G_UNICODE_BREAK_UNKNOWN. * - * See [Unicode Line Breaking Algorithm](http://www.unicode.org/unicode/reports/tr14/). + * See [Unicode Line Breaking Algorithm](https://www.unicode.org/reports/tr14/). */ typedef enum { diff --git a/glib/gunidecomp.c b/glib/gunidecomp.c index feaa25c42d3997ae56d0eab7ff66f38c9a26c259..dcc1a5294a52baf59ca4dafb29c611aa3e9fb8ec 100644 --- a/glib/gunidecomp.c +++ b/glib/gunidecomp.c @@ -59,11 +59,19 @@ #include #include "gunicode.h" -#include "gunidecomp.h" #include "gmem.h" -#include "gunicomp.h" #include "gunicodeprivate.h" +#ifdef HAVE_LIBICU +#include +#include + +#define COMBINING_CLASS(Char) u_getCombiningClass (Char) + +#else /* !HAVE_LIBICU */ + +#include "gunidecomp.h" +#include "gunicomp.h" #define CC_PART1(Page, Char) \ ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ @@ -82,6 +90,8 @@ ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \ : 0)) +#endif /* !HAVE_LIBICU */ + /** * g_unichar_combining_class: * @uc: a Unicode character @@ -157,6 +167,7 @@ g_unicode_canonical_ordering (gunichar *string, } } +#ifndef HAVE_LIBICU /* http://www.unicode.org/unicode/reports/tr15/#Hangul * r should be null or have sufficient space. Calling with r == NULL will * only calculate the result_len; however, a buffer with space for three @@ -229,6 +240,7 @@ find_decomposition (gunichar ch, return NULL; } +#endif /* !HAVE_LIBICU */ /** * g_unicode_canonical_decomposition: @@ -247,6 +259,18 @@ gunichar * g_unicode_canonical_decomposition (gunichar ch, gsize *result_len) { +#ifdef HAVE_LIBICU + gunichar buffer[G_UNICHAR_MAX_DECOMPOSITION_LENGTH]; + gunichar *res; + + *result_len = g_unichar_fully_decompose (ch, FALSE, buffer, G_UNICHAR_MAX_DECOMPOSITION_LENGTH); + res = g_malloc ((*result_len + 1) * sizeof (gunichar)); + + memcpy (res, buffer, *result_len * sizeof (gunichar)); + res[*result_len] = '\0'; + + return res; +#else const gchar *decomp; const gchar *p; gunichar *r; @@ -278,8 +302,10 @@ g_unicode_canonical_decomposition (gunichar ch, } return r; +#endif } +#ifndef HAVE_LIBICU /* L,V => LV and LV,T => LVT */ static gboolean combine_hangul (gunichar a, @@ -366,12 +392,74 @@ combine (gunichar a, return FALSE; } +#endif /* !HAVE_LIBICU */ gunichar * _g_utf8_normalize_wc (const gchar *str, gssize max_len, GNormalizeMode mode) { +#ifdef HAVE_LIBICU + const UNormalizer2 *norm; + UErrorCode error = U_ZERO_ERROR; + gunichar2 *orig_utf16, *res_utf16; + glong utf16_len; + gint32 result_len; + gunichar *res; + + switch (mode) + { + case G_NORMALIZE_NFC: + norm = unorm2_getNFCInstance (&error); + break; + case G_NORMALIZE_NFKC: + norm = unorm2_getNFKCInstance (&error); + break; + case G_NORMALIZE_NFKD: + norm = unorm2_getNFKDInstance (&error); + break; + default: + case G_NORMALIZE_NFD: + norm = unorm2_getNFDInstance (&error); + break; + } + + if (U_FAILURE (error)) + return NULL; + + orig_utf16 = g_utf8_to_utf16 (str, max_len, NULL, &utf16_len, NULL); + if (!orig_utf16) + return NULL; + + result_len = unorm2_normalize (norm, orig_utf16, utf16_len, NULL, 0, &error); + /* Buffer Overflow is expected from the preflight operation */ + if (error != U_BUFFER_OVERFLOW_ERROR && result_len > 0) + { + g_free (orig_utf16); + return NULL; + } + + if (result_len == 0) + { + g_free (orig_utf16); + return g_utf8_to_ucs4 ("", 0, NULL, NULL, NULL); + } + + res_utf16 = g_malloc (sizeof (gunichar2) * result_len); + error = U_ZERO_ERROR; + result_len = unorm2_normalize (norm, orig_utf16, utf16_len, res_utf16, result_len, &error); + g_free (orig_utf16); + + if (U_FAILURE (error)) + { + g_free (res_utf16); + return NULL; + } + + res = g_utf16_to_ucs4 (res_utf16, result_len, NULL, NULL, NULL); + g_free (res_utf16); + return res; +#else gsize n_wc; gunichar *wc_buffer; const char *p; @@ -502,6 +590,7 @@ _g_utf8_normalize_wc (const gchar *str, wc_buffer[n_wc] = 0; return wc_buffer; +#endif } /** @@ -548,12 +637,16 @@ g_utf8_normalize (const gchar *str, gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); gchar *result; + if (!result_wc) + return NULL; + result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL); g_free (result_wc); return result; } +#ifndef HAVE_LIBICU static gboolean decompose_hangul_step (gunichar ch, gunichar *a, @@ -582,6 +675,7 @@ decompose_hangul_step (gunichar ch, return TRUE; } +#endif /* !HAVE_LIBICU */ /** * g_unichar_decompose: @@ -622,6 +716,33 @@ g_unichar_decompose (gunichar ch, gunichar *a, gunichar *b) { +#ifdef HAVE_LIBICU + const UNormalizer2 *norm; + UErrorCode error = U_ZERO_ERROR; + gint32 len; + gunichar2 out16[2]; + + norm = unorm2_getNFDInstance (&error); + if (U_FAILURE (error)) + goto no_decompose; + + len = unorm2_getRawDecomposition (norm, ch, out16, 2, &error); + if (U_FAILURE (error) || len < 0) + goto no_decompose; + + *a = out16[0]; + if (len > 1) + *b = out16[1]; + else + *b = 0; + + return TRUE; + +no_decompose: + *a = ch; + *b = 0; + return FALSE; +#else gint start = 0; gint end = G_N_ELEMENTS (decomp_step_table); @@ -655,6 +776,7 @@ g_unichar_decompose (gunichar ch, *b = 0; return FALSE; +#endif } /** @@ -689,11 +811,28 @@ g_unichar_compose (gunichar a, gunichar b, gunichar *ch) { +#ifdef HAVE_LIBICU + const UNormalizer2 *norm; + UErrorCode error = U_ZERO_ERROR; + gunichar res; + + norm = unorm2_getNFCInstance (&error); + if (U_FAILURE (error)) + return FALSE; + res = unorm2_composePair (norm, a, b); + + if (res < 0) + return FALSE; + + *ch = res; + return TRUE; +#else if (combine (a, b, ch)) return TRUE; *ch = 0; return FALSE; +#endif } /** @@ -733,6 +872,44 @@ g_unichar_fully_decompose (gunichar ch, gunichar *result, gsize result_len) { +#ifdef HAVE_LIBICU + const UNormalizer2 *norm; + UErrorCode error = U_ZERO_ERROR; + gint32 len; + gunichar2 *out16; + guint i; + + if (compat) + norm = unorm2_getNFKDInstance (&error); + else + norm = unorm2_getNFDInstance (&error); + + if (U_FAILURE (error)) + goto no_decompose; + + /* Output of getDecomposition() is on 16 bits while ours in on 32 bits */ + out16 = g_malloc (sizeof (gunichar2) * result_len); + + len = unorm2_getDecomposition (norm, ch, out16, result_len, &error); + if (U_FAILURE (error) || len < 0) + { + g_free (out16); + goto no_decompose; + } + + for (i = 0; i < len; i++) + { + result[i] = out16[i]; + } + + g_free (out16); + return len; + +no_decompose: + if (result) + *result = ch; + return 1; +#else /* !HAVE_LIBICU */ const gchar *decomp; const gchar *p; @@ -764,4 +941,5 @@ g_unichar_fully_decompose (gunichar ch, if (result && result_len >= 1) *result = ch; return 1; +#endif } diff --git a/glib/guniprop.c b/glib/guniprop.c index 619b3990892a959adeb93078ce281aff5a245168..0dd3af8bc0a407614d45a827a71bbb494b05dca5 100644 --- a/glib/guniprop.c +++ b/glib/guniprop.c @@ -25,11 +25,11 @@ #include #include "gmem.h" +#include "gstrfuncs.h" #include "gstring.h" #include "gtestutils.h" #include "gtypes.h" #include "gunicode.h" -#include "gunichartables.h" #include "gmirroringtable.h" #include "gscripttable.h" #include "gunicodeprivate.h" @@ -37,6 +37,13 @@ #include "gwin32.h" #endif +#ifdef HAVE_LIBICU +#include +#include +#else +#include "gunichartables.h" +#endif + #define G_UNICHAR_FULLWIDTH_A 0xff21 #define G_UNICHAR_FULLWIDTH_I 0xff29 #define G_UNICHAR_FULLWIDTH_J 0xff2a @@ -44,6 +51,8 @@ #define G_UNICHAR_FULLWIDTH_a 0xff41 #define G_UNICHAR_FULLWIDTH_f 0xff46 +#ifndef HAVE_LIBICU + #define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \ ? attr_table_part1[Page] \ : attr_table_part2[(Page) - 0xe00]) @@ -68,6 +77,81 @@ ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \ : G_UNICODE_UNASSIGNED)) +#else /* HAVE_LIBICU */ + +static GUnicodeType +u_char_category_to_g_unicode_type (UCharCategory c) +{ + switch (c) + { + case U_GENERAL_OTHER_TYPES: + case U_CHAR_CATEGORY_COUNT: + return G_UNICODE_UNASSIGNED; + case U_UPPERCASE_LETTER: + return G_UNICODE_UPPERCASE_LETTER; + case U_LOWERCASE_LETTER: + return G_UNICODE_LOWERCASE_LETTER; + case U_TITLECASE_LETTER: + return G_UNICODE_TITLECASE_LETTER; + case U_MODIFIER_LETTER: + return G_UNICODE_MODIFIER_LETTER; + case U_OTHER_LETTER: + return G_UNICODE_OTHER_LETTER; + case U_NON_SPACING_MARK: + return G_UNICODE_NON_SPACING_MARK; + case U_ENCLOSING_MARK: + return G_UNICODE_ENCLOSING_MARK; + case U_COMBINING_SPACING_MARK: + return G_UNICODE_SPACING_MARK; + case U_DECIMAL_DIGIT_NUMBER: + return G_UNICODE_DECIMAL_NUMBER; + case U_LETTER_NUMBER: + return G_UNICODE_LETTER_NUMBER; + case U_OTHER_NUMBER: + return G_UNICODE_OTHER_NUMBER; + case U_SPACE_SEPARATOR: + return G_UNICODE_SPACE_SEPARATOR; + case U_LINE_SEPARATOR: + return G_UNICODE_LINE_SEPARATOR; + case U_PARAGRAPH_SEPARATOR: + return G_UNICODE_PARAGRAPH_SEPARATOR; + case U_CONTROL_CHAR: + return G_UNICODE_CONTROL; + case U_FORMAT_CHAR: + return G_UNICODE_FORMAT; + case U_PRIVATE_USE_CHAR: + return G_UNICODE_PRIVATE_USE; + case U_SURROGATE: + return G_UNICODE_SURROGATE; + case U_DASH_PUNCTUATION: + return G_UNICODE_DASH_PUNCTUATION; + case U_START_PUNCTUATION: + return G_UNICODE_OPEN_PUNCTUATION; + case U_END_PUNCTUATION: + return G_UNICODE_CLOSE_PUNCTUATION; + case U_CONNECTOR_PUNCTUATION: + return G_UNICODE_CONNECT_PUNCTUATION; + case U_OTHER_PUNCTUATION: + return G_UNICODE_OTHER_PUNCTUATION; + case U_MATH_SYMBOL: + return G_UNICODE_MATH_SYMBOL; + case U_CURRENCY_SYMBOL: + return G_UNICODE_CURRENCY_SYMBOL; + case U_MODIFIER_SYMBOL: + return G_UNICODE_MODIFIER_SYMBOL; + case U_OTHER_SYMBOL: + return G_UNICODE_OTHER_SYMBOL; + case U_INITIAL_PUNCTUATION: + return G_UNICODE_INITIAL_PUNCTUATION; + case U_FINAL_PUNCTUATION: + return G_UNICODE_FINAL_PUNCTUATION; + } + return G_UNICODE_UNASSIGNED; +} + +#define TYPE(Char) u_char_category_to_g_unicode_type (u_charType (Char)) + +#endif /* HAVE_LIBICU */ #define IS(Type, Class) (((guint)1 << (Type)) & (Class)) #define OR(Type, Rest) (((guint)1 << (Type)) | (Rest)) @@ -351,11 +435,15 @@ g_unichar_isupper (gunichar c) gboolean g_unichar_istitle (gunichar c) { +#ifdef HAVE_LIBICU + return u_istitle (c); +#else unsigned int i; for (i = 0; i < G_N_ELEMENTS (title_table); ++i) if (title_table[i][0] == c) return TRUE; return FALSE; +#endif } /** @@ -428,6 +516,7 @@ g_unichar_iszerowidth (gunichar c) return FALSE; } +#ifndef HAVE_LIBICU static int interval_compare (const void *key, const void *elt) { @@ -467,6 +556,7 @@ g_unichar_iswide_bsearch (gunichar ch) return FALSE; } +#endif /* !HAVE_LIBICU */ /** * g_unichar_iswide: @@ -480,10 +570,16 @@ g_unichar_iswide_bsearch (gunichar ch) gboolean g_unichar_iswide (gunichar c) { +#ifdef HAVE_LIBICU + UEastAsianWidth ea = (UEastAsianWidth) u_getIntPropertyValue (c, UCHAR_EAST_ASIAN_WIDTH); + + return ea == U_EA_FULLWIDTH || ea == U_EA_WIDE; +#else if (c < g_unicode_width_table_wide[0].start) return FALSE; else return g_unichar_iswide_bsearch (c); +#endif } @@ -512,6 +608,13 @@ g_unichar_iswide_cjk (gunichar c) if (g_unichar_iswide (c)) return TRUE; +#ifdef HAVE_LIBICU + { + UEastAsianWidth ea = (UEastAsianWidth) u_getIntPropertyValue (c, UCHAR_EAST_ASIAN_WIDTH); + + return ea == U_EA_AMBIGUOUS; + } +#else /* bsearch() is declared attribute(nonnull(1)) so we can't validly search * for a NULL key */ if (c == 0) @@ -525,6 +628,7 @@ g_unichar_iswide_cjk (gunichar c) return TRUE; return FALSE; +#endif } @@ -541,6 +645,9 @@ g_unichar_iswide_cjk (gunichar c) gunichar g_unichar_toupper (gunichar c) { +#ifdef HAVE_LIBICU + return u_toupper (c); +#else int t = TYPE (c); if (t == G_UNICODE_LOWERCASE_LETTER) { @@ -566,6 +673,7 @@ g_unichar_toupper (gunichar c) } } return c; +#endif } /** @@ -581,6 +689,9 @@ g_unichar_toupper (gunichar c) gunichar g_unichar_tolower (gunichar c) { +#ifdef HAVE_LIBICU + return u_tolower (c); +#else int t = TYPE (c); if (t == G_UNICODE_UPPERCASE_LETTER) { @@ -607,6 +718,7 @@ g_unichar_tolower (gunichar c) } } return c; +#endif } /** @@ -622,6 +734,9 @@ g_unichar_tolower (gunichar c) gunichar g_unichar_totitle (gunichar c) { +#ifdef HAVE_LIBICU + return u_totitle (c); +#else unsigned int i; /* We handle U+0000 explicitly because some elements in @@ -640,6 +755,7 @@ g_unichar_totitle (gunichar c) return g_unichar_toupper (c); return c; +#endif } /** @@ -655,9 +771,13 @@ g_unichar_totitle (gunichar c) int g_unichar_digit_value (gunichar c) { +#ifdef HAVE_LIBICU + return u_digit (c, 10); +#else if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) return ATTTABLE (c >> 8, c & 0xff); return -1; +#endif } /** @@ -673,6 +793,9 @@ g_unichar_digit_value (gunichar c) int g_unichar_xdigit_value (gunichar c) { +#ifdef HAVE_LIBICU + return u_digit (c, 16); +#else if (c >= 'A' && c <= 'F') return c - 'A' + 10; if (c >= 'a' && c <= 'f') @@ -684,6 +807,7 @@ g_unichar_xdigit_value (gunichar c) if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) return ATTTABLE (c >> 8, c & 0xff); return -1; +#endif } /** @@ -704,6 +828,7 @@ g_unichar_type (gunichar c) * Case mapping functions */ +#ifndef HAVE_LIBICU typedef enum { LOCALE_NORMAL, LOCALE_TURKIC, @@ -803,7 +928,6 @@ real_toupper (const gchar *str, const gchar *p = str; const char *last = NULL; gsize len = 0; - gboolean last_was_i = FALSE; while ((max_len < 0 || p < str + max_len) && *p) { @@ -814,38 +938,6 @@ real_toupper (const gchar *str, last = p; p = g_utf8_next_char (p); - if (locale_type == LOCALE_LITHUANIAN) - { - if (c == 'i') - last_was_i = TRUE; - else - { - if (last_was_i) - { - /* Nasty, need to remove any dot above. Though - * I think only E WITH DOT ABOVE occurs in practice - * which could simplify this considerably. - */ - gsize decomp_len, i; - gunichar decomp[G_UNICHAR_MAX_DECOMPOSITION_LENGTH]; - - decomp_len = g_unichar_fully_decompose (c, FALSE, decomp, G_N_ELEMENTS (decomp)); - for (i=0; i < decomp_len; i++) - { - if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */) - len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL); - } - - len += output_marks (&p, out_buffer ? out_buffer + len : NULL, TRUE); - - continue; - } - - if (!ISMARK (t)) - last_was_i = FALSE; - } - } - if (locale_type == LOCALE_TURKIC && c == 'i') { /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */ @@ -856,11 +948,10 @@ real_toupper (const gchar *str, /* Nasty, need to move it after other combining marks .. this would go away if * we normalized first. */ - len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE); - /* And output as GREEK CAPITAL LETTER IOTA */ - len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL); - } + len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL); + len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE); + } else if (IS (t, OR (G_UNICODE_LOWERCASE_LETTER, OR (G_UNICODE_TITLECASE_LETTER, @@ -908,6 +999,17 @@ real_toupper (const gchar *str, return len; } +#else /* HAVE_LIBICU */ +static gchar * +get_locale (void) +{ +#ifdef G_OS_WIN32 + return g_win32_getlocale (); +#else + return g_strdup (setlocale (LC_CTYPE, NULL)); +#endif +} +#endif /* HAVE_LIBICU */ /** * g_utf8_strup: @@ -928,24 +1030,64 @@ g_utf8_strup (const gchar *str, gssize len) { gsize result_len; - LocaleType locale_type; - gchar *result; + gchar *result = NULL; g_return_val_if_fail (str != NULL, NULL); - locale_type = get_locale_type (); - /* * We use a two pass approach to keep memory management simple */ - result_len = real_toupper (str, len, NULL, locale_type); - result = g_malloc (result_len + 1); - real_toupper (str, len, result, locale_type); - result[result_len] = '\0'; +#ifdef HAVE_LIBICU + { + UErrorCode error = U_ZERO_ERROR; + gunichar2 *orig_utf16 = NULL, *result_utf16 = NULL; + glong utf16_len; + gchar *locale; + + locale = get_locale (); + orig_utf16 = g_utf8_to_utf16 (str, len, NULL, &utf16_len, NULL); + + result_len = u_strToUpper (NULL, 0, orig_utf16, utf16_len, locale, &error); + /* Buffer Overflow is expected from the preflight operation */ + if (error != U_BUFFER_OVERFLOW_ERROR && result_len > 0) + goto out; + + if (result_len == 0) + { + result = g_strdup (""); + goto out; + } + + result_utf16 = g_malloc ((result_len + 1) * 2); + error = U_ZERO_ERROR; + u_strToUpper (result_utf16, result_len + 1, orig_utf16, utf16_len, locale, &error); + if (U_FAILURE (error)) + goto out; + + result = g_utf16_to_utf8 (result_utf16, result_len, NULL, NULL, NULL); + + out: + g_free (locale); + g_free (result_utf16); + g_free (orig_utf16); + } +#else + { + LocaleType locale_type; + + locale_type = get_locale_type (); + + result_len = real_toupper (str, len, NULL, locale_type); + result = g_malloc (result_len + 1); + real_toupper (str, len, result, locale_type); + result[result_len] = '\0'; + } +#endif return result; } +#ifndef HAVE_LIBICU /* traverses the string checking for characters with combining class == 230 * until a base character is found */ static gboolean @@ -1099,6 +1241,7 @@ real_tolower (const gchar *str, return len; } +#endif /** * g_utf8_strdown: @@ -1118,20 +1261,59 @@ g_utf8_strdown (const gchar *str, gssize len) { gsize result_len; - LocaleType locale_type; - gchar *result; + gchar *result = NULL; g_return_val_if_fail (str != NULL, NULL); - locale_type = get_locale_type (); - /* * We use a two pass approach to keep memory management simple */ - result_len = real_tolower (str, len, NULL, locale_type); - result = g_malloc (result_len + 1); - real_tolower (str, len, result, locale_type); - result[result_len] = '\0'; +#ifdef HAVE_LIBICU + { + UErrorCode error = U_ZERO_ERROR; + gunichar2 *orig_utf16 = NULL, *result_utf16 = NULL; + glong utf16_len; + gchar *locale; + + locale = get_locale (); + orig_utf16 = g_utf8_to_utf16 (str, len, NULL, &utf16_len, NULL); + + result_len = u_strToLower (NULL, 0, orig_utf16, utf16_len, locale, &error); + /* Buffer Overflow is expected from the preflight operation */ + if (error != U_BUFFER_OVERFLOW_ERROR && result_len > 0) + goto out; + + if (result_len == 0) + { + result = g_strdup (""); + goto out; + } + + result_utf16 = g_malloc ((result_len + 1) * 2); + error = U_ZERO_ERROR; + u_strToLower (result_utf16, result_len + 1, orig_utf16, utf16_len, locale, &error); + if (U_FAILURE (error)) + goto out; + + result = g_utf16_to_utf8 (result_utf16, result_len, NULL, NULL, NULL); + + out: + g_free (locale); + g_free (result_utf16); + g_free (orig_utf16); + } +#else + { + LocaleType locale_type; + + locale_type = get_locale_type (); + + result_len = real_tolower (str, len, NULL, locale_type); + result = g_malloc (result_len + 1); + real_tolower (str, len, result, locale_type); + result[result_len] = '\0'; + } +#endif return result; } @@ -1160,6 +1342,7 @@ gchar * g_utf8_casefold (const gchar *str, gssize len) { +#ifndef HAVE_LIBICU GString *result; const char *p; @@ -1200,7 +1383,39 @@ g_utf8_casefold (const gchar *str, p = g_utf8_next_char (p); } - return g_string_free (result, FALSE); + return g_string_free (result, FALSE); +#else /* HAVE_LIBICU */ + UErrorCode error = U_ZERO_ERROR; + gchar *result = NULL; + gunichar2 *orig_utf16 = NULL, *result_utf16 = NULL; + glong utf16_len; + gint32 result_len; + + g_return_val_if_fail (str != NULL, NULL); + + orig_utf16 = g_utf8_to_utf16 (str, len, NULL, &utf16_len, NULL); + + result_len = u_strFoldCase (NULL, 0, orig_utf16, utf16_len, U_FOLD_CASE_DEFAULT, &error); + /* Buffer Overflow is expected from the preflight operation */ + if (error != U_BUFFER_OVERFLOW_ERROR && result_len != 0) + goto out; + + if (result_len == 0) + return g_strdup (""); + + result_utf16 = g_malloc ((result_len + 1) * 2); + error = U_ZERO_ERROR; + u_strFoldCase (result_utf16, result_len + 1, orig_utf16, utf16_len, U_FOLD_CASE_DEFAULT, &error); + if (U_FAILURE (error)) + goto out; + + result = g_utf16_to_utf8 (result_utf16, result_len, NULL, NULL, NULL); + +out: + g_free (result_utf16); + g_free (orig_utf16); + return result; +#endif } /** diff --git a/glib/meson.build b/glib/meson.build index 8c18e6de405a389b2065d1f5f3abe6ea8fa2efc2..e2db4616f030ac1197b922a199bdf6b540edb035 100644 --- a/glib/meson.build +++ b/glib/meson.build @@ -377,7 +377,7 @@ libglib = library('glib-2.0', # intl.lib is not compatible with SAFESEH link_args : [noseh_link_args, glib_link_flags, win32_ldflags], include_directories : configinc, - dependencies : pcre_deps + [thread_dep, librt] + libintl_deps + libiconv + platform_deps + [gnulib_libm_dependency, libm] + [libsysprof_capture_dep], + dependencies : pcre_deps + [thread_dep, librt, libicu_dep] + libintl_deps + libiconv + platform_deps + [gnulib_libm_dependency, libm] + [libsysprof_capture_dep], c_args : glib_c_args, objc_args : glib_c_args, ) diff --git a/glib/tests/unicode.c b/glib/tests/unicode.c index fa8bd1fa1ff54f0deb6054a09866ab51ae47a362..612c1097baaa9d21841b8f9c2476ee5dd2061ccf 100644 --- a/glib/tests/unicode.c +++ b/glib/tests/unicode.c @@ -437,6 +437,10 @@ test_strup (void) /* Tricky, comparing two unicode strings with an ASCII function */ g_assert_cmpstr (str_up, ==, "AAZZ09X;\003E\357\274\241\357\274\241"); g_free (str_up); + + str_up = g_utf8_strup ("", 0); + g_assert_cmpstr (str_up, ==, ""); + g_free (str_up); } /* Test that g_utf8_strdown() returns the correct value for various @@ -462,6 +466,10 @@ test_strdown (void) /* Tricky, comparing two unicode strings with an ASCII function */ g_assert_cmpstr (str_down, ==, "aazz09x;\003\007\357\275\201\357\275\201"); g_free (str_down); + + str_down = g_utf8_strdown ("", 0); + g_assert_cmpstr (str_down, ==, ""); + g_free (str_down); } /* Test that g_utf8_casefold() returns the correct value for various @@ -487,6 +495,10 @@ test_casefold (void) /* Tricky, comparing two unicode strings with an ASCII function */ g_assert_cmpstr (str_casefold, ==, "aazz09x;\357\275\201\357\275\201"); g_free (str_casefold); + + str_casefold = g_utf8_casefold ("", 0); + g_assert_cmpstr (str_casefold, ==, ""); + g_free (str_casefold); } /* Test that g_unichar_ismark() returns the correct value for various @@ -1610,6 +1622,45 @@ test_iso15924 (void) #undef PACK } +static void +test_normalize (void) +{ + guint i; + typedef struct + { + const gchar *str; + const gchar *nfd; + const gchar *nfc; + const gchar *nfkd; + const gchar *nfkc; + } Test; + Test tests[] = { + { "Äffin", "A\u0308ffin", "Äffin", "A\u0308ffin", "Äffin" }, + { "Ä\uFB03n", "A\u0308\uFB03n", "Ä\uFB03n", "A\u0308ffin", "Äffin" }, + { "Henry IV", "Henry IV", "Henry IV", "Henry IV", "Henry IV" }, + { "Henry \u2163", "Henry \u2163", "Henry \u2163", "Henry IV", "Henry IV" }, + { "non-utf\x88", NULL, NULL, NULL, NULL }, + { "", "", "", "", "" }, + }; + +#define TEST(str, mode, expected) \ + { \ + gchar *normalized = g_utf8_normalize (str, -1, mode); \ + g_assert_cmpstr (normalized, ==, expected); \ + g_free (normalized); \ + } + + for (i = 0; i < G_N_ELEMENTS (tests); i++) + { + TEST (tests[i].str, G_NORMALIZE_NFD, tests[i].nfd); + TEST (tests[i].str, G_NORMALIZE_NFC, tests[i].nfc); + TEST (tests[i].str, G_NORMALIZE_NFKD, tests[i].nfkd); + TEST (tests[i].str, G_NORMALIZE_NFKC, tests[i].nfkc); + } + +#undef TEST +} + int main (int argc, char *argv[]) @@ -1651,6 +1702,7 @@ main (int argc, g_test_add_func ("/unicode/xdigit", test_xdigit); g_test_add_func ("/unicode/xdigit-value", test_xdigit_value); g_test_add_func ("/unicode/zero-width", test_zerowidth); + g_test_add_func ("/unicode/normalize", test_normalize); return g_test_run(); } diff --git a/meson.build b/meson.build index 0d892fb2df538b5599d3e45afb0b280d79e9c713..c5521243ea7401dd768c54af401bf711f3c2268b 100644 --- a/meson.build +++ b/meson.build @@ -2308,6 +2308,16 @@ if want_systemtap and enable_dtrace enable_systemtap = true endif +unicode_opt = get_option('unicode') +if unicode_opt == 'libicu' + warning('Building with \'unicode=libicu\' is meant to be used ONLY on custom embedded platforms, this feature is NOT meant to be used by general-purpose Linux distributions.') + libicu_dep = dependency('icu-uc', version: '>= 66') + glib_conf.set('HAVE_LIBICU', true) +else + libicu_dep = [] + glib_conf.set('HAVE_LIBICU', false) +endif + test_timeout = 60 test_timeout_slow = 180 diff --git a/meson_options.txt b/meson_options.txt index 072765361e386bbb99c01f108134139748c0a277..1beba38675da62e36863a77f482e9e41e2d81a77 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -126,4 +126,11 @@ option('glib_checks', option('libelf', type : 'feature', value : 'auto', - description : 'Enable support for listing and extracting from ELF resource files with gresource tool') \ No newline at end of file + description : 'Enable support for listing and extracting from ELF resource files with gresource tool') + +option('unicode', + type : 'combo', + choices : ['internal', 'libicu'], + value : 'internal', + yield : true, + description : 'Unicode implementation to use (\'internal\' = \'GLib\'s own implementation\'; \'libicu\' = \'External libicu. WARNING: should only be used on custom embedded platforms, this feature is NOT meant to be used by general-purpose Linux distributions\';)') diff --git a/tests/casemap.txt b/tests/casemap.txt index eccc12b1ea6d77130fed9ddaaa502431201a4685..332cbc1c64db35388796c9447d04df8919636390 100644 --- a/tests/casemap.txt +++ b/tests/casemap.txt @@ -10,16 +10,15 @@ tr_TR.UTF-8 i i İ İ # i => LATIN CAPITAL LETTER I WITH DOT ABOVE tr_TR.UTF-8 I ı I I # I => LATIN SMALL LETTER DOTLESS I tr_TR.UTF-8 İ i İ İ # I => LATIN SMALL LETTER DOTLESS I # Test reordering of YPOGEGRAMMENI across other accents - ᾁ ᾁ ᾉ ἉΙ + ᾁ ᾁ ᾉ ΑἹ ᾁ ᾁ ᾉ ἉΙ # Handling of final and nonfinal sigma ΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ ΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ ΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ -# Lithuanian rule of i followed by letter with dot. Not at all sure -# about the titlecase part here -lt_LT iė iė Ie IE -lt_LT iė iė Ie IE +# Lithuanian rule of i followed by letter with dot. +lt_LT iė iė Ie IĖ +lt_LT iė iė Ie IĖ lt_LT Ì i̇̀ Ì Ì # LATIN CAPITAL LETTER I WITH GRAVE lt_LT Í i̇́ Í Í # LATIN CAPITAL LETTER I WITH ACUTE lt_LT Ĩ i̇̃ Ĩ Ĩ # LATIN CAPITAL LETTER I WITH TILDE @@ -29,8 +28,8 @@ lt_LT Ĩ i̇̃ Ĩ Ĩ # LATIN CAPITAL LETTER I (with tilde above) lt_LT Į́ į̇́ Į́ Į́ # LATIN CAPITAL LETTER I (with ogonek and acute accent) lt_LT J́ j̇́ J́ J́ # LATIN CAPITAL LETTER J (with acute accent) lt_LT Į́ į̇́ Į́ Į́ # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) -lt_LT.UTF-8 iė iė Ie IE -lt_LT.UTF-8 iė iė Ie IE +lt_LT.UTF-8 iė iė Ie IĖ +lt_LT.UTF-8 iė iė Ie IĖ lt_LT.UTF-8 Ì i̇̀ Ì Ì # LATIN CAPITAL LETTER I WITH GRAVE lt_LT.UTF-8 Í i̇́ Í Í # LATIN CAPITAL LETTER I WITH ACUTE lt_LT.UTF-8 Ĩ i̇̃ Ĩ Ĩ # LATIN CAPITAL LETTER I WITH TILDE diff --git a/tests/gen-casemap-txt.py b/tests/gen-casemap-txt.py index 62d59638a787d35b63039d3642417fd01eb68ad6..731fb59c1d92b9fb762e0141a907a461409d4d5a 100755 --- a/tests/gen-casemap-txt.py +++ b/tests/gen-casemap-txt.py @@ -184,16 +184,15 @@ tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I # Test reordering of YPOGEGRAMMENI across other accents -\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t +\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0399\u0314\t \t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t # Handling of final and nonfinal sigma -\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ \t -\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ\t -\tΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ\t -# Lithuanian rule of i followed by letter with dot. Not at all sure -# about the titlecase part here -lt_LT\ti\u0117\ti\u0117\tIe\tIE\t -lt_LT\tie\u0307\tie\u0307\tIe\tIE\t +\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ +\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ +\tΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ +# Lithuanian rule of i followed by letter with dot. +lt_LT\ti\u0117\ti\u0117\tIe\tIĖ\t +lt_LT\tie\u0307\tie\u0307\tIe\tIĖ\t lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE @@ -203,8 +202,8 @@ lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent) lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent) lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent) -lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t -lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t +lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIĖ\t +lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIĖ\t lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE