/* decomp.c - Character decomposition.
 *
 *  Copyright (C) 1999, 2000 Tom Tromey
 *  Copyright 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

/**
 * SECTION:unicode
 * @Title: Unicode Manipulation
 * @Short_description: functions operating on Unicode characters and
 *     UTF-8 strings
 * @See_also: g_locale_to_utf8(), g_locale_from_utf8()
 *
 * This section describes a number of functions for dealing with
 * Unicode characters and strings. There are analogues of the
 * traditional `ctype.h` character classification and case conversion
 * functions, UTF-8 analogues of some string utility functions,
 * functions to perform normalization, case conversion and collation
 * on UTF-8 strings and finally functions to convert between the UTF-8,
 * UTF-16 and UCS-4 encodings of Unicode.
 *
 * The implementations of the Unicode functions in GLib are based
 * on the Unicode Character Data tables, which are available from
 * [www.unicode.org](http://www.unicode.org/).
 * GLib 2.8 supports Unicode 4.0, GLib 2.10 supports Unicode 4.1,
 * GLib 2.12 supports Unicode 5.0, GLib 2.16.3 supports Unicode 5.1,
 * GLib 2.30 supports Unicode 6.0.
 */

43 44 45
#include "config.h"

#include <stdlib.h>
Owen Taylor's avatar
Owen Taylor committed
46

47
#include "gunicode.h"
48
#include "gunidecomp.h"
49
#include "gmem.h"
50
#include "gunicomp.h"
51
#include "gunicodeprivate.h"
52 53


54 55 56 57 58 59 60 61 62
/* Two-stage combining-class lookup shared by both table halves.
 * Table[Page] is either an index selecting a row of cclass_data[], which is
 * then indexed by Char, or a value of at least G_UNICODE_MAX_TABLE_INDEX,
 * in which case the class is that value minus G_UNICODE_MAX_TABLE_INDEX
 * regardless of Char.  */
#define CC_LOOKUP(Table, Page, Char) \
  (((Table)[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[(Table)[Page]][Char]) \
   : ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Lookup for characters up to G_UNICODE_LAST_CHAR_PART1.  */
#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

/* Lookup for characters from 0xe0000 up to G_UNICODE_LAST_CHAR; Page is
 * relative to 0xe0000.  */
#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of Char.  Anything that falls outside both covered
 * ranges yields class 0.  Char is evaluated more than once, so it must
 * not have side effects.  */
#define COMBINING_CLASS(Char) \
  (((Char) > G_UNICODE_LAST_CHAR_PART1) \
   ? (((Char) < 0xe0000 || (Char) > G_UNICODE_LAST_CHAR) \
      ? 0 \
      : CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff)) \
   : CC_PART1 ((Char) >> 8, (Char) & 0xff))
70

71 72
/**
 * g_unichar_combining_class:
Matthias Clasen's avatar
Matthias Clasen committed
73
 * @uc: a Unicode character
74 75 76
 * 
 * Determines the canonical combining class of a Unicode character.
 * 
77
 * Returns: the combining class of the character
78 79 80
 *
 * Since: 2.14
 **/
81
gint
82
g_unichar_combining_class (gunichar uc)
83 84 85 86
{
  return COMBINING_CLASS (uc);
}

87 88 89 90 91 92 93 94 95 96 97
/* Constants for algorithmic Hangul syllable [de]composition.
 * The *Base values are the first code point of each range, the *Count
 * values the number of code points the algorithm uses from it.  */
#define SBase   0xAC00                /* precomposed syllables */
#define LBase   0x1100                /* L jamo */
#define VBase   0x1161                /* V jamo */
#define TBase   0x11A7                /* T jamo; TBase itself means "no T" */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount * TCount)     /* syllables per L jamo */
#define SCount  (LCount * NCount)     /* total precomposed syllables */

98 99 100 101 102 103 104 105 106 107
/**
 * g_unicode_canonical_ordering:
 * @string: a UCS-4 encoded string.
 * @len: the maximum length of @string to use.
 *
 * Computes the canonical ordering of a string in-place.  
 * This rearranges decomposed characters in the string 
 * according to their combining classes.  See the Unicode 
 * manual for more information. 
 **/
108 109
void
g_unicode_canonical_ordering (gunichar *string,
110
			      gsize     len)
111
{
112
  gsize i;
113 114 115 116 117 118 119 120 121 122 123 124
  int swap = 1;

  while (swap)
    {
      int last;
      swap = 0;
      last = COMBINING_CLASS (string[0]);
      for (i = 0; i < len - 1; ++i)
	{
	  int next = COMBINING_CLASS (string[i + 1]);
	  if (next != 0 && last > next)
	    {
125
	      gsize j;
126
	      /* Percolate item leftward through string.  */
127
	      for (j = i + 1; j > 0; --j)
128 129
		{
		  gunichar t;
130
		  if (COMBINING_CLASS (string[j - 1]) <= next)
131
		    break;
132 133 134
		  t = string[j];
		  string[j] = string[j - 1];
		  string[j - 1] = t;
135 136 137 138 139 140 141 142 143 144 145
		  swap = 1;
		}
	      /* We're re-entering the loop looking at the old
		 character again.  */
	      next = last;
	    }
	  last = next;
	}
    }
}

146 147 148 149 150
/* http://www.unicode.org/unicode/reports/tr15/#Hangul
 * r should be null or have sufficient space. Calling with r == NULL will
 * only calculate the result_len; however, a buffer with space for three
 * characters will always be big enough. */
static void
151
decompose_hangul (gunichar s,
152 153 154 155
                  gunichar *r,
                  gsize *result_len)
{
  gint SIndex = s - SBase;
156
  gint TIndex = SIndex % TCount;
157

158
  if (r)
159
    {
160 161
      r[0] = LBase + SIndex / NCount;
      r[1] = VBase + (SIndex % NCount) / TCount;
162 163
    }

164 165
  if (TIndex)
    {
166
      if (r)
167 168
	r[2] = TBase + TIndex;
      *result_len = 3;
169
    }
170 171
  else
    *result_len = 2;
172 173
}

174 175
/* returns a pointer to a null-terminated UTF-8 string */
static const gchar *
176 177
find_decomposition (gunichar ch,
		    gboolean compat)
178
{
179 180 181 182 183
  int start = 0;
  int end = G_N_ELEMENTS (decomp_table);
  
  if (ch >= decomp_table[start].ch &&
      ch <= decomp_table[end - 1].ch)
184
    {
185
      while (TRUE)
186 187 188 189
	{
	  int half = (start + end) / 2;
	  if (ch == decomp_table[half].ch)
	    {
190 191 192 193 194
	      int offset;

	      if (compat)
		{
		  offset = decomp_table[half].compat_offset;
195
		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
196 197 198
		    offset = decomp_table[half].canon_offset;
		}
	      else
199
		{
200
		  offset = decomp_table[half].canon_offset;
201
		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
202
		    return NULL;
203
		}
204
	      
205
	      return &(decomp_expansion_string[offset]);
206
	    }
207 208
	  else if (half == start)
	    break;
209 210 211 212 213 214 215
	  else if (ch > decomp_table[half].ch)
	    start = half;
	  else
	    end = half;
	}
    }

216 217 218
  return NULL;
}

219 220 221 222 223 224 225
/**
 * g_unicode_canonical_decomposition:
 * @ch: a Unicode character.
 * @result_len: location to store the length of the return value.
 *
 * Computes the canonical decomposition of a Unicode character.  
 * 
226
 * Returns: a newly allocated string of Unicode characters.
227
 *   @result_len is set to the resulting length of the string.
228 229 230
 *
 * Deprecated: 2.30: Use the more flexible g_unichar_fully_decompose()
 *   instead.
231
 **/
232 233
gunichar *
g_unicode_canonical_decomposition (gunichar ch,
234
				   gsize   *result_len)
235
{
236
  const gchar *decomp;
237
  const gchar *p;
238 239
  gunichar *r;

240
  /* Hangul syllable */
241
  if (ch >= SBase && ch < SBase + SCount)
242 243 244 245 246 247
    {
      decompose_hangul (ch, NULL, result_len);
      r = g_malloc (*result_len * sizeof (gunichar));
      decompose_hangul (ch, r, result_len);
    }
  else if ((decomp = find_decomposition (ch, FALSE)) != NULL)
248 249
    {
      /* Found it.  */
250
      int i;
251
      
252 253
      *result_len = g_utf8_strlen (decomp, -1);
      r = g_malloc (*result_len * sizeof (gunichar));
254
      
255 256
      for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++)
        r[i] = g_utf8_get_char (p);
257 258
    }
  else
259 260
    {
      /* Not in our table.  */
261
      r = g_malloc (sizeof (gunichar));
262 263 264 265 266 267
      *r = ch;
      *result_len = 1;
    }

  return r;
}
268

269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
/* Algorithmic Hangul composition: L,V => LV and LV,T => LVT.
 * Stores the composed syllable in *result and returns TRUE when a and b
 * form such a pair; otherwise returns FALSE and leaves *result alone.  */
static gboolean
combine_hangul (gunichar a,
                gunichar b,
                gunichar *result)
{
  gint l_index = a - LBase;
  gint v_index = b - VBase;
  gint s_index = a - SBase;
  gint t_index = b - TBase;

  /* L followed by V gives the LV syllable.  */
  if (l_index >= 0 && l_index < LCount
      && v_index >= 0 && v_index < VCount)
    {
      *result = SBase + (l_index * VCount + v_index) * TCount;
      return TRUE;
    }

  /* An LV syllable (no T part yet) followed by a real T gives LVT.  */
  if (s_index >= 0 && s_index < SCount && (s_index % TCount) == 0
      && t_index > 0 && t_index < TCount)
    {
      *result = a + t_index;
      return TRUE;
    }

  return FALSE;
}

297
/* Two-stage lookup of a character's composition index.  compose_table[Page]
 * is either an index selecting a row of compose_data[], which is then
 * indexed by Char, or a value of at least G_UNICODE_MAX_TABLE_INDEX, in
 * which case the result is that value minus G_UNICODE_MAX_TABLE_INDEX.  */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of Char, or 0 when its page lies beyond
 * COMPOSE_TABLE_LAST.  Every use of the argument is parenthesized so that
 * an expression argument (e.g. "x + 1") is shifted as a whole; Char is
 * evaluated more than once, so it must not have side effects.  */
#define COMPOSE_INDEX(Char) \
     ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
304

305
static gboolean
306 307 308 309 310 311
combine (gunichar  a,
	 gunichar  b,
	 gunichar *result)
{
  gushort index_a, index_b;

312 313 314
  if (combine_hangul (a, b, result))
    return TRUE;

315
  index_a = COMPOSE_INDEX(a);
316

317 318 319 320 321 322 323 324
  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
    {
      if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
	{
	  *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
	  return TRUE;
	}
      else
325
        return FALSE;
326 327 328
    }
  
  index_b = COMPOSE_INDEX(b);
329

330 331 332 333 334 335 336 337
  if (index_b >= COMPOSE_SECOND_SINGLE_START)
    {
      if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
	{
	  *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
	  return TRUE;
	}
      else
338
        return FALSE;
339 340 341
    }

  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
342
      index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START)
343 344 345 346 347 348 349 350 351 352 353 354 355 356 357
    {
      gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];

      if (res)
	{
	  *result = res;
	  return TRUE;
	}
    }

  return FALSE;
}

gunichar *
_g_utf8_normalize_wc (const gchar    *str,
358
		      gssize          max_len,
359 360 361 362 363 364 365 366 367 368 369 370 371
		      GNormalizeMode  mode)
{
  gsize n_wc;
  gunichar *wc_buffer;
  const char *p;
  gsize last_start;
  gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
			mode == G_NORMALIZE_NFKD);
  gboolean do_compose = (mode == G_NORMALIZE_NFC ||
			 mode == G_NORMALIZE_NFKC);

  n_wc = 0;
  p = str;
372
  while ((max_len < 0 || p < str + max_len) && *p)
373
    {
374
      const gchar *decomp;
375 376
      gunichar wc = g_utf8_get_char (p);

377
      if (wc >= SBase && wc < SBase + SCount)
378
        {
Manish Singh's avatar
Manish Singh committed
379
          gsize result_len;
380 381 382 383 384 385 386 387 388 389 390 391
          decompose_hangul (wc, NULL, &result_len);
          n_wc += result_len;
        }
      else 
        {
          decomp = find_decomposition (wc, do_compat);

          if (decomp)
            n_wc += g_utf8_strlen (decomp, -1);
          else
            n_wc++;
        }
392 393 394 395 396 397 398 399 400

      p = g_utf8_next_char (p);
    }

  wc_buffer = g_new (gunichar, n_wc + 1);

  last_start = 0;
  n_wc = 0;
  p = str;
401
  while ((max_len < 0 || p < str + max_len) && *p)
402 403
    {
      gunichar wc = g_utf8_get_char (p);
404
      const gchar *decomp;
405
      int cc;
406
      gsize old_n_wc = n_wc;
407
	  
408
      if (wc >= SBase && wc < SBase + SCount)
409
        {
Manish Singh's avatar
Manish Singh committed
410
          gsize result_len;
411 412 413
          decompose_hangul (wc, wc_buffer + n_wc, &result_len);
          n_wc += result_len;
        }
414
      else
415 416 417 418 419 420 421 422 423 424 425 426
        {
          decomp = find_decomposition (wc, do_compat);
          
          if (decomp)
            {
              const char *pd;
              for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
                wc_buffer[n_wc++] = g_utf8_get_char (pd);
            }
          else
            wc_buffer[n_wc++] = wc;
        }
427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462

      if (n_wc > 0)
	{
	  cc = COMBINING_CLASS (wc_buffer[old_n_wc]);

	  if (cc == 0)
	    {
	      g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
	      last_start = old_n_wc;
	    }
	}
      
      p = g_utf8_next_char (p);
    }

  if (n_wc > 0)
    {
      g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
      last_start = n_wc;
    }
	  
  wc_buffer[n_wc] = 0;

  /* All decomposed and reordered */ 

  if (do_compose && n_wc > 0)
    {
      gsize i, j;
      int last_cc = 0;
      last_start = 0;
      
      for (i = 0; i < n_wc; i++)
	{
	  int cc = COMBINING_CLASS (wc_buffer[i]);

	  if (i > 0 &&
463
	      (last_cc == 0 || last_cc < cc) &&
464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494
	      combine (wc_buffer[last_start], wc_buffer[i],
		       &wc_buffer[last_start]))
	    {
	      for (j = i + 1; j < n_wc; j++)
		wc_buffer[j-1] = wc_buffer[j];
	      n_wc--;
	      i--;
	      
	      if (i == last_start)
		last_cc = 0;
	      else
		last_cc = COMBINING_CLASS (wc_buffer[i-1]);
	      
	      continue;
	    }

	  if (cc == 0)
	    last_start = i;

	  last_cc = cc;
	}
    }

  wc_buffer[n_wc] = 0;

  return wc_buffer;
}

/**
 * g_utf8_normalize:
 * @str: a UTF-8 encoded string.
495
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
496
 * @mode: the type of normalization to perform.
497
 *
498
 * Converts a string into canonical form, standardizing
499 500
 * such issues as whether a character with an accent
 * is represented as a base character and combining
501 502 503 504
 * accent or as a single precomposed character. The
 * string has to be valid UTF-8, otherwise %NULL is
 * returned. You should generally call g_utf8_normalize()
 * before comparing two Unicode strings.
505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522
 *
 * The normalization mode %G_NORMALIZE_DEFAULT only
 * standardizes differences that do not affect the
 * text content, such as the above-mentioned accent
 * representation. %G_NORMALIZE_ALL also standardizes
 * the "compatibility" characters in Unicode, such
 * as SUPERSCRIPT THREE to the standard forms
 * (in this case DIGIT THREE). Formatting information
 * may be lost but for most text operations such
 * characters should be considered the same.
 *
 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
 * but returned a result with composed forms rather
 * than a maximally decomposed form. This is often
 * useful if you intend to convert the string to
 * a legacy encoding or pass it to a system with
 * less capable Unicode handling.
523
 *
524
 * Returns: a newly allocated string, that is the
525 526
 *   normalized form of @str, or %NULL if @str is not
 *   valid UTF-8.
527 528 529
 **/
gchar *
g_utf8_normalize (const gchar    *str,
530
		  gssize          len,
531 532
		  GNormalizeMode  mode)
{
533
  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
534
  gchar *result;
535

536 537 538 539 540
  result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
  g_free (result_wc);

  return result;
}
541 542 543 544 545 546

/* Performs one step of algorithmic Hangul decomposition.  Returns FALSE,
 * leaving *a and *b untouched, when ch is not a precomposed syllable;
 * otherwise stores the two parts in *a and *b and returns TRUE.  */
static gboolean
decompose_hangul_step (gunichar  ch,
                       gunichar *a,
                       gunichar *b)
{
  gint syllable, trailing;

  if (!(ch >= SBase && ch < SBase + SCount))
    return FALSE;  /* not a hangul syllable */

  syllable = ch - SBase;
  trailing = syllable % TCount;

  if (trailing == 0)
    {
      /* split LV -> L,V */
      *a = LBase + syllable / NCount;
      *b = VBase + (syllable % NCount) / TCount;
    }
  else
    {
      /* split LVT -> LV,T */
      *a = ch - trailing;
      *b = TBase + trailing;
    }

  return TRUE;
}

/**
 * g_unichar_decompose:
 * @ch: a Unicode character
 * @a: return location for the first component of @ch
 * @b: return location for the second component of @ch
 *
 * Performs a single decomposition step of the
 * Unicode canonical decomposition algorithm.
 *
 * This function does not include compatibility
 * decompositions. It does, however, include algorithmic
 * Hangul Jamo decomposition, as well as 'singleton'
 * decompositions which replace a character by a single
 * other character. In the case of singletons *@b will
 * be set to zero.
 *
 * If @ch is not decomposable, *@a is set to @ch and *@b
 * is set to zero.
 *
 * Note that the way Unicode decomposition pairs are
 * defined, it is guaranteed that @b would not decompose
 * further, but @a may itself decompose.  To get the full
 * canonical decomposition for @ch, one would need to
 * recursively call this function on @a.  Or use
 * g_unichar_fully_decompose().
 *
 * See
 * [UAX#15](http://unicode.org/reports/tr15/)
 * for details.
 *
 * Returns: %TRUE if the character could be decomposed
 *
 * Since: 2.30
 */
gboolean
g_unichar_decompose (gunichar  ch,
                     gunichar *a,
                     gunichar *b)
{
  gint lo = 0;
  gint hi = G_N_ELEMENTS (decomp_step_table);

  /* Hangul syllables are handled algorithmically, not via the table.  */
  if (decompose_hangul_step (ch, a, b))
    return TRUE;

  /* TODO use bsearch() */
  if (ch >= decomp_step_table[lo].ch &&
      ch <= decomp_step_table[hi - 1].ch)
    {
      for (;;)
        {
          gint mid = (lo + hi) / 2;
          const decomposition_step *step = &(decomp_step_table[mid]);

          if (ch == step->ch)
            {
              *a = step->a;
              *b = step->b;
              return TRUE;
            }

          /* The interval can no longer shrink: no entry for ch.  */
          if (mid == lo)
            break;

          if (ch > step->ch)
            lo = mid;
          else
            hi = mid;
        }
    }

  /* Not decomposable: report the character itself.  */
  *a = ch;
  *b = 0;

  return FALSE;
}

/**
 * g_unichar_compose:
 * @a: a Unicode character
 * @b: a Unicode character
 * @ch: return location for the composed character
 *
 * Performs a single composition step of the
 * Unicode canonical composition algorithm.
 *
 * This function includes algorithmic Hangul Jamo composition,
 * but it is not exactly the inverse of g_unichar_decompose().
 * No composition can have either of @a or @b equal to zero.
 * To be precise, this function composes if and only if
 * there exists a Primary Composite P which is canonically
 * equivalent to the sequence <@a,@b>.  See the Unicode
 * Standard for the definition of Primary Composite.
 *
 * If @a and @b do not compose a new character, @ch is set to zero.
 *
 * See
 * [UAX#15](http://unicode.org/reports/tr15/)
 * for details.
 *
 * Returns: %TRUE if the characters could be composed
 *
 * Since: 2.30
 */
gboolean
g_unichar_compose (gunichar  a,
                   gunichar  b,
                   gunichar *ch)
{
  gboolean composed = combine (a, b, ch);

  /* combine() only writes *ch on success; clear it otherwise.  */
  if (!composed)
    {
      *ch = 0;
      return FALSE;
    }

  return TRUE;
}
683 684 685 686 687

/**
 * g_unichar_fully_decompose:
 * @ch: a Unicode character.
 * @compat: whether perform canonical or compatibility decomposition
 * @result: (nullable): location to store decomposed result, or %NULL
 * @result_len: length of @result
 *
 * Computes the canonical or compatibility decomposition of a
 * Unicode character.  For compatibility decomposition,
 * pass %TRUE for @compat; for canonical decomposition
 * pass %FALSE for @compat.
 *
 * The decomposed sequence is placed in @result.  Only up to
 * @result_len characters are written into @result.  The length
 * of the full decomposition (irrespective of @result_len) is
 * returned by the function.  If @result is %NULL nothing is
 * written, whatever the value of @result_len, and only the length
 * is returned.  For canonical decomposition,
 * currently all decompositions are of length at most 4, but
 * this may change in the future (very unlikely though).
 * At any rate, Unicode does guarantee that a buffer of length
 * 18 is always enough for both compatibility and canonical
 * decompositions, so that is the size recommended. This is provided
 * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH.
 *
 * See
 * [UAX#15](http://unicode.org/reports/tr15/)
 * for details.
 *
 * Returns: the length of the full decomposition.
 *
 * Since: 2.30
 **/
gsize
g_unichar_fully_decompose (gunichar  ch,
                           gboolean  compat,
                           gunichar *result,
                           gsize     result_len)
{
  const gchar *decomp;
  const gchar *p;

  /* Hangul syllable */
  if (ch >= SBase && ch < SBase + SCount)
    {
      gsize len, i;
      gunichar buffer[3];

      /* Decompose into a local buffer so that a @result shorter than
       * the decomposition is never overrun.  */
      decompose_hangul (ch, result ? buffer : NULL, &len);
      if (result)
        for (i = 0; i < len && i < result_len; i++)
          result[i] = buffer[i];
      return len;
    }
  else if ((decomp = find_decomposition (ch, compat)) != NULL)
    {
      /* Found it.  */
      gsize len, i;

      len = g_utf8_strlen (decomp, -1);

      /* @result is nullable: like the other branches, only write when a
       * buffer was actually supplied, even if @result_len is non-zero.  */
      if (result)
        for (p = decomp, i = 0; i < len && i < result_len; p = g_utf8_next_char (p), i++)
          result[i] = g_utf8_get_char (p);

      return len;
    }

  /* Does not decompose */
  if (result && result_len >= 1)
    *result = ch;
  return 1;
}