gconvert.c 62.5 KB
Newer Older
1 2 3 4
/* GLIB - Library of useful routines for C programming
 *
 * gconvert.c: Convert between character sets using iconv
 * Copyright Red Hat Inc., 2000
 * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

21
#include "config.h"
22
#include "glibconfig.h"
Owen Taylor's avatar
1.3.14  
Owen Taylor committed
23

24
#ifndef G_OS_WIN32
25
#include <iconv.h>
26
#endif
27
#include <errno.h>
28
#include <stdio.h>
29
#include <string.h>
30 31
#include <stdlib.h>

32 33 34 35
#ifdef G_OS_WIN32
#include "win_iconv.c"
#endif

36 37
#ifdef G_PLATFORM_WIN32
#define STRICT
38
#include <windows.h>
39
#undef STRICT
40
#endif
41

Matthias Clasen's avatar
Matthias Clasen committed
42 43
#include "gconvert.h"

44
#include "gcharsetprivate.h"
Matthias Clasen's avatar
Matthias Clasen committed
45 46 47 48
#include "gslist.h"
#include "gstrfuncs.h"
#include "gtestutils.h"
#include "gthread.h"
49
#include "gthreadprivate.h"
Matthias Clasen's avatar
Matthias Clasen committed
50
#include "gunicode.h"
Matthias Clasen's avatar
Matthias Clasen committed
51
#include "gfileutils.h"
52
#include "genviron.h"
Matthias Clasen's avatar
Matthias Clasen committed
53

Owen Taylor's avatar
Owen Taylor committed
54
#include "glibintl.h"
55

56

Matthias Clasen's avatar
Matthias Clasen committed
57 58 59
/**
 * SECTION:conversions
 * @title: Character Set Conversion
 * @short_description: convert strings between different character sets
 *
 * The g_convert() family of functions wraps the functionality of iconv().
 * In addition to pure character set conversions, GLib has functions to
 * deal with the extra complications of encodings for file names.
 *
 * ## File Name Encodings
 *
 * Historically, UNIX has not had a defined encoding for file names:
 * a file name is valid as long as it does not have path separators
 * in it ("/"). However, displaying file names may require conversion:
 * from the character set in which they were created, to the character
 * set in which the application operates. Consider the Spanish file name
 * "Presentación.sxi". If the application which created it uses
 * ISO-8859-1 for its encoding,
 * |[
 * Character:  P  r  e  s  e  n  t  a  c  i  ó  n  .  s  x  i
 * Hex code:   50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69
 * ]|
 * However, if the application uses UTF-8, the actual file name on
 * disk would look like this:
 * |[
 * Character:  P  r  e  s  e  n  t  a  c  i  ó     n  .  s  x  i
 * Hex code:   50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69
 * ]|
 * GLib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use
 * GLib do the same thing. If you get a file name from the file system,
 * for example, from readdir() or from g_dir_read_name(), and you wish
 * to display the file name to the user, you will need to convert it
 * into UTF-8. The opposite case is when the user types the name of a
 * file they wish to save: the toolkit will give you that string in
 * UTF-8 encoding, and you will need to convert it to the character
 * set used for file names before you can create the file with open()
 * or fopen().
 *
 * By default, GLib assumes that file names on disk are in UTF-8
 * encoding. This is a valid assumption for file systems which
 * were created relatively recently: most applications use UTF-8
 * encoding for their strings, and that is also what they use for
 * the file names they create. However, older file systems may
 * still contain file names created in "older" encodings, such as
 * ISO-8859-1. In this case, for compatibility reasons, you may want
 * to instruct GLib to use that particular encoding for file names
 * rather than UTF-8. You can do this by specifying the encoding for
 * file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING]
 * environment variable. For example, if your installation uses
 * ISO-8859-1 for file names, you can put this in your `~/.profile`:
 * |[
 * export G_FILENAME_ENCODING=ISO-8859-1
 * ]|
 * GLib provides the functions g_filename_to_utf8() and
 * g_filename_from_utf8() to perform the necessary conversions.
 * These functions convert file names from the encoding specified
 * in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. This
 * [diagram][file-name-encodings-diagram] illustrates how
 * these functions are used to convert between UTF-8 and the
 * encoding for file names in the file system.
 *
 * ## Conversion between file name encodings # {#file-name-encodings-diagram}
 *
 * ![](file-name-encodings.png)
 *
 * ## Checklist for Application Writers
 *
 * This section is a practical summary of the detailed
 * things to do to make sure your applications process file
 * name encodings correctly.
 *
 * 1. If you get a file name from the file system from a function
 *    such as readdir() or gtk_file_chooser_get_filename(), you do
 *    not need to do any conversion to pass that file name to
 *    functions like open(), rename(), or fopen() -- those are "raw"
 *    file names which the file system understands.
 *
 * 2. If you need to display a file name, convert it to UTF-8 first
 *    by using g_filename_to_utf8(). If conversion fails, display a
 *    string like "Unknown file name". Do not convert this string back
 *    into the encoding used for file names if you wish to pass it to
 *    the file system; use the original file name instead.
 *
 *    For example, the document window of a word processor could display
 *    "Unknown file name" in its title bar but still let the user save
 *    the file, as it would keep the raw file name internally. This
 *    can happen if the user has not set the `G_FILENAME_ENCODING`
 *    environment variable even though they have files whose names are
 *    not encoded in UTF-8.
 *
 * 3. If your user interface lets the user type a file name for saving
 *    or renaming, convert it to the encoding used for file names in
 *    the file system by using g_filename_from_utf8(). Pass the converted
 *    file name to functions like fopen(). If conversion fails, ask the
 *    user to enter a different file name. This can happen if the user
 *    types Japanese characters when `G_FILENAME_ENCODING` is set to
 *    `ISO-8859-1`, for example.
 */

156 157 158 159 160 161
/* We try to terminate strings in unknown charsets with this many zero bytes
 * to ensure that multibyte strings really are nul-terminated when we return
 * them from g_convert() and friends.
 *
 * The conversion loops in this file reserve this many bytes at the end of
 * their output buffer and zero them with memset() once conversion stops.
 *
 * NOTE(review): 4 presumably matches the widest fixed-width code unit
 * (e.g. UCS-4/UTF-32) -- confirm.
 */
#define NUL_TERMINATOR_LENGTH 4

162
/* NOTE(review): presumably defines g_convert_error_quark(), the quark behind
 * the G_CONVERT_ERROR domain passed to g_set_error() throughout this file --
 * confirm against the G_DEFINE_QUARK definition.
 */
G_DEFINE_QUARK (g_convert_error, g_convert_error)
163

164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196
/*
 * try_conversion:
 * @to_codeset: destination codeset name handed to iconv_open()
 * @from_codeset: source codeset name handed to iconv_open()
 * @cd: (out): receives whatever iconv_open() returned
 *
 * Makes a single iconv_open() attempt.
 *
 * Returns: %FALSE only when the open failed with errno set to EINVAL,
 *   which is the one case in which callers go on to try other names.
 *   %TRUE otherwise, including when the open failed with a different
 *   errno, so callers must still compare *@cd against (iconv_t)-1.
 */
static gboolean
try_conversion (const char *to_codeset,
                const char *from_codeset,
                iconv_t    *cd)
{
  iconv_t handle = iconv_open (to_codeset, from_codeset);

  *cd = handle;

  if (handle != (iconv_t) -1)
    return TRUE;

  return (errno == EINVAL) ? FALSE : TRUE;
}

static gboolean
try_to_aliases (const char **to_aliases,
		const char  *from_codeset,
		iconv_t     *cd)
{
  if (to_aliases)
    {
      const char **p = to_aliases;
      while (*p)
	{
	  if (try_conversion (*p, from_codeset, cd))
	    return TRUE;

	  p++;
	}
    }

  return FALSE;
}

Havoc Pennington's avatar
docs  
Havoc Pennington committed
197
/**
198
 * g_iconv_open: (skip)
Havoc Pennington's avatar
docs  
Havoc Pennington committed
199 200 201
 * @to_codeset: destination codeset
 * @from_codeset: source codeset
 * 
202
 * Same as the standard UNIX routine iconv_open(), but
203
 * may be implemented via libiconv on UNIX flavors that lack
Havoc Pennington's avatar
docs  
Havoc Pennington committed
204 205
 * a native implementation.
 * 
Owen Taylor's avatar
Owen Taylor committed
206
 * GLib provides g_convert() and g_locale_to_utf8() which are likely
Havoc Pennington's avatar
docs  
Havoc Pennington committed
207 208
 * more convenient than the raw iconv wrappers.
 * 
209
 * Returns: a "conversion descriptor", or (GIConv)-1 if
210
 *  opening the converter failed.
Havoc Pennington's avatar
docs  
Havoc Pennington committed
211
 **/
212 213 214 215
GIConv
g_iconv_open (const gchar  *to_codeset,
	      const gchar  *from_codeset)
{
216
  iconv_t cd;
217
  
218 219 220
  if (!try_conversion (to_codeset, from_codeset, &cd))
    {
      const char **to_aliases = _g_charset_get_aliases (to_codeset);
221
      const char **from_aliases = _g_charset_get_aliases (from_codeset);
222 223 224 225 226 227 228

      if (from_aliases)
	{
	  const char **p = from_aliases;
	  while (*p)
	    {
	      if (try_conversion (to_codeset, *p, &cd))
229
		goto out;
230 231

	      if (try_to_aliases (to_aliases, *p, &cd))
232
		goto out;
233 234 235 236 237 238

	      p++;
	    }
	}

      if (try_to_aliases (to_aliases, from_codeset, &cd))
239
	goto out;
240 241
    }

242
 out:
243
  return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;
244 245
}

Havoc Pennington's avatar
docs  
Havoc Pennington committed
246
/**
247
 * g_iconv: (skip)
Havoc Pennington's avatar
docs  
Havoc Pennington committed
248 249 250 251 252 253
 * @converter: conversion descriptor from g_iconv_open()
 * @inbuf: bytes to convert
 * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
 * @outbuf: converted output bytes
 * @outbytes_left: inout parameter, bytes available to fill in @outbuf
 * 
254
 * Same as the standard UNIX routine iconv(), but
255
 * may be implemented via libiconv on UNIX flavors that lack
Havoc Pennington's avatar
docs  
Havoc Pennington committed
256 257
 * a native implementation.
 *
Owen Taylor's avatar
Owen Taylor committed
258
 * GLib provides g_convert() and g_locale_to_utf8() which are likely
Havoc Pennington's avatar
docs  
Havoc Pennington committed
259 260
 * more convenient than the raw iconv wrappers.
 * 
261 262 263 264 265 266 267
 * Note that the behaviour of iconv() for characters which are valid in the
 * input character set, but which have no representation in the output character
 * set, is implementation defined. This function may return success (with a
 * positive number of non-reversible conversions as replacement characters were
 * used), or it may return -1 and set an error such as %EILSEQ, in such a
 * situation.
 *
268
 * Returns: count of non-reversible conversions, or -1 on error
Havoc Pennington's avatar
docs  
Havoc Pennington committed
269
 **/
270
gsize 
271 272
g_iconv (GIConv   converter,
	 gchar  **inbuf,
273
	 gsize   *inbytes_left,
274
	 gchar  **outbuf,
275
	 gsize   *outbytes_left)
276 277 278 279 280 281
{
  iconv_t cd = (iconv_t)converter;

  return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
}

Havoc Pennington's avatar
docs  
Havoc Pennington committed
282
/**
283
 * g_iconv_close: (skip)
Havoc Pennington's avatar
docs  
Havoc Pennington committed
284 285
 * @converter: a conversion descriptor from g_iconv_open()
 *
286
 * Same as the standard UNIX routine iconv_close(), but
287
 * may be implemented via libiconv on UNIX flavors that lack
Havoc Pennington's avatar
docs  
Havoc Pennington committed
288
 * a native implementation. Should be called to clean up
Matthias Clasen's avatar
Matthias Clasen committed
289
 * the conversion descriptor from g_iconv_open() when
Havoc Pennington's avatar
docs  
Havoc Pennington committed
290 291
 * you are done converting things.
 *
Owen Taylor's avatar
Owen Taylor committed
292
 * GLib provides g_convert() and g_locale_to_utf8() which are likely
Havoc Pennington's avatar
docs  
Havoc Pennington committed
293 294
 * more convenient than the raw iconv wrappers.
 * 
295
 * Returns: -1 on error, 0 on success
Havoc Pennington's avatar
docs  
Havoc Pennington committed
296
 **/
297 298 299 300 301 302 303 304
gint
g_iconv_close (GIConv converter)
{
  iconv_t cd = (iconv_t)converter;

  return iconv_close (cd);
}

Matthias Clasen's avatar
Matthias Clasen committed
305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320
/*
 * open_converter:
 * @to_codeset: destination codeset name
 * @from_codeset: source codeset name
 * @error: location to store a #GError, or %NULL
 *
 * Opens a converter with g_iconv_open() and, on failure, reports the
 * problem through @error: %G_CONVERT_ERROR_NO_CONVERSION when errno is
 * EINVAL after the failed open, %G_CONVERT_ERROR_FAILED for any other errno.
 *
 * Returns: the descriptor from g_iconv_open(), or (GIConv)-1 on failure.
 */
static GIConv
open_converter (const gchar *to_codeset,
                const gchar *from_codeset,
                GError     **error)
{
  GIConv cd;

  cd = g_iconv_open (to_codeset, from_codeset);

  if (cd == (GIConv) -1)
    {
      /* Something went wrong.  */
      if (error)
        {
          if (errno == EINVAL)
            g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
                         _("Conversion from character set “%s” to “%s” is not supported"),
                         from_codeset, to_codeset);
          else
            g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
                         _("Could not open converter from “%s” to “%s”"),
                         from_codeset, to_codeset);
        }
    }

  return cd;
}

/*
 * close_converter:
 * @cd: a descriptor from open_converter(), possibly (GIConv)-1
 *
 * Closes @cd with g_iconv_close(). The failed-open sentinel (GIConv)-1 is
 * accepted and treated as a no-op.
 *
 * Returns: 0 for the sentinel, otherwise the result of g_iconv_close().
 */
static int
close_converter (GIConv cd)
{
  return (cd != (GIConv) -1) ? g_iconv_close (cd) : 0;
}

342
/**
 * g_convert_with_iconv: (skip)
 * @str:           (array length=len) (element-type guint8):
 *                 the string to convert.
 * @len:           the length of the string in bytes, or -1 if the string is
 *                 nul-terminated (Note that some encodings may allow nul
 *                 bytes to occur inside strings. In that case, using -1
 *                 for the @len parameter is unsafe)
 * @converter:     conversion descriptor from g_iconv_open()
 * @bytes_read:    (out) (optional): location to store the number of bytes in
 *                 the input string that were successfully converted, or %NULL.
 *                 Even if the conversion was successful, this may be
 *                 less than @len if there were partial characters
 *                 at the end of the input. If the error
 *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 *                 stored will be the byte offset after the last valid
 *                 input sequence.
 * @bytes_written: (out) (optional): the number of bytes stored in
 *                 the output buffer (not including the terminating nul).
 * @error:         location to store the error occurring, or %NULL to ignore
 *                 errors. Any of the errors in #GConvertError may occur.
 *
 * Converts a string from one character set to another.
 *
 * Note that you should use g_iconv() for streaming conversions.
 * Despite the fact that @bytes_read can return information about partial
 * characters, the g_convert_... functions are not generally suitable
 * for streaming. If the underlying converter maintains internal state,
 * then this won't be preserved across successive calls to g_convert(),
 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
 * this is the GNU C converter for CP1255 which does not emit a base
 * character until it knows that the next character is not a mark that
 * could combine with the base character.)
 *
 * Characters which are valid in the input character set, but which have no
 * representation in the output character set will result in a
 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error. This is in contrast to the iconv()
 * specification, which leaves this behaviour implementation defined. Note that
 * this is the same error code as is returned for an invalid byte sequence in
 * the input character set. To get defined behaviour for conversion of
 * unrepresentable characters, use g_convert_with_fallback().
 *
 * If @bytes_read is %NULL, trailing input that could not be converted is
 * reported as %G_CONVERT_ERROR_PARTIAL_INPUT instead of being silently
 * left unconsumed.
 *
 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
 *               If the conversion was successful, a newly allocated buffer
 *               containing the converted string, which must be freed with
 *               g_free(). Otherwise %NULL and @error will be set.
 **/
gchar*
g_convert_with_iconv (const gchar *str,
                      gssize       len,
                      GIConv       converter,
                      gsize       *bytes_read,
                      gsize       *bytes_written,
                      GError     **error)
{
  gchar *dest;
  gchar *outp;
  const gchar *p;
  gsize inbytes_remaining;
  gsize outbytes_remaining;
  gsize err;
  gsize outbuf_size;
  gboolean have_error = FALSE;
  gboolean done = FALSE;
  gboolean reset = FALSE;

  g_return_val_if_fail (converter != (GIConv) -1, NULL);

  if (len < 0)
    len = strlen (str);

  p = str;
  inbytes_remaining = len;
  outbuf_size = len + NUL_TERMINATOR_LENGTH;

  /* The last NUL_TERMINATOR_LENGTH bytes are never offered to iconv, so
   * the terminator written after the loop always fits.
   */
  outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
  outp = dest = g_malloc (outbuf_size);

  while (!done && !have_error)
    {
      if (reset)
        err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining);
      else
        err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);

      if (err == (gsize) -1)
        {
          switch (errno)
            {
            case EINVAL:
              /* Incomplete text, do not report an error */
              done = TRUE;
              break;
            case E2BIG:
              {
                /* Output buffer full: double it and retry from where
                 * iconv stopped.
                 */
                gsize used = outp - dest;

                outbuf_size *= 2;
                dest = g_realloc (dest, outbuf_size);

                outp = dest + used;
                outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
              }
              break;
            case EILSEQ:
              g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
                                   _("Invalid byte sequence in conversion input"));
              have_error = TRUE;
              break;
            default:
              {
                /* Save errno before calling anything that might change it */
                int errsv = errno;

                g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
                             _("Error during conversion: %s"),
                             g_strerror (errsv));
              }
              have_error = TRUE;
              break;
            }
        }
      else if (err > 0)
        {
          /* @err gives the number of replacement characters used. */
          g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
                               _("Unrepresentable character in conversion input"));
          have_error = TRUE;
        }
      else
        {
          if (!reset)
            {
              /* call g_iconv with NULL inbuf to cleanup shift state */
              reset = TRUE;
              inbytes_remaining = 0;
            }
          else
            done = TRUE;
        }
    }

  memset (outp, 0, NUL_TERMINATOR_LENGTH);

  if (bytes_read)
    *bytes_read = p - str;
  else
    {
      /* The caller cannot learn how much input was consumed, so leftover
       * input has to be turned into an error.
       */
      if ((p - str) != len)
        {
          if (!have_error)
            {
              g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
                                   _("Partial character sequence at end of input"));
              have_error = TRUE;
            }
        }
    }

  if (bytes_written)
    *bytes_written = outp - dest;	/* Doesn't include '\0' */

  if (have_error)
    {
      g_free (dest);
      return NULL;
    }
  else
    return dest;
}

512 513
/**
 * g_convert:
 * @str:           (array length=len) (element-type guint8):
 *                 the string to convert.
 * @len:           the length of the string in bytes, or -1 if the string is
 *                 nul-terminated (Note that some encodings may allow nul
 *                 bytes to occur inside strings. In that case, using -1
 *                 for the @len parameter is unsafe)
 * @to_codeset:    name of character set into which to convert @str
 * @from_codeset:  character set of @str.
 * @bytes_read:    (out) (optional): location to store the number of bytes in
 *                 the input string that were successfully converted, or %NULL.
 *                 Even if the conversion was successful, this may be
 *                 less than @len if there were partial characters
 *                 at the end of the input. If the error
 *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 *                 stored will be the byte offset after the last valid
 *                 input sequence.
 * @bytes_written: (out) (optional): the number of bytes stored in
 *                 the output buffer (not including the terminating nul).
 * @error:         location to store the error occurring, or %NULL to ignore
 *                 errors. Any of the errors in #GConvertError may occur.
 *
 * Converts a string from one character set to another.
 *
 * Note that you should use g_iconv() for streaming conversions.
 * Despite the fact that @bytes_read can return information about partial
 * characters, the g_convert_... functions are not generally suitable
 * for streaming. If the underlying converter maintains internal state,
 * then this won't be preserved across successive calls to g_convert(),
 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
 * this is the GNU C converter for CP1255 which does not emit a base
 * character until it knows that the next character is not a mark that
 * could combine with the base character.)
 *
 * Using extensions such as "//TRANSLIT" may not work (or may not work
 * well) on many platforms.  Consider using g_str_to_ascii() instead.
 *
 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
 *          If the conversion was successful, a newly allocated buffer
 *          containing the converted string, which must be freed with g_free().
 *          Otherwise %NULL and @error will be set.
 **/
gchar*
g_convert (const gchar *str,
           gssize       len,
           const gchar *to_codeset,
           const gchar *from_codeset,
           gsize       *bytes_read,
           gsize       *bytes_written,
           GError     **error)
{
  gchar *res;
  GIConv cd;

  g_return_val_if_fail (str != NULL, NULL);
  g_return_val_if_fail (to_codeset != NULL, NULL);
  g_return_val_if_fail (from_codeset != NULL, NULL);

  cd = open_converter (to_codeset, from_codeset, error);

  if (cd == (GIConv) -1)
    {
      /* No converter: nothing was read or written */
      if (bytes_read)
        *bytes_read = 0;

      if (bytes_written)
        *bytes_written = 0;

      return NULL;
    }

  res = g_convert_with_iconv (str, len, cd,
                              bytes_read, bytes_written,
                              error);

  close_converter (cd);

  return res;
}

593 594
/**
 * g_convert_with_fallback:
595 596
 * @str:          (array length=len) (element-type guint8):
 *                the string to convert.
597
 * @len:          the length of the string in bytes, or -1 if the string is
598 599 600
 *                 nul-terminated (Note that some encodings may allow nul
 *                 bytes to occur inside strings. In that case, using -1
 *                 for the @len parameter is unsafe)
601 602
 * @to_codeset:   name of character set into which to convert @str
 * @from_codeset: character set of @str.
603
 * @fallback:     UTF-8 string to use in place of characters not
604 605
 *                present in the target encoding. (The string must be
 *                representable in the target encoding). 
606 607
 *                If %NULL, characters not in the target encoding will 
 *                be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
608 609
 * @bytes_read:   (out) (optional): location to store the number of bytes in
 *                the input string that were successfully converted, or %NULL.
Matthias Clasen's avatar
Matthias Clasen committed
610
 *                Even if the conversion was successful, this may be 
611
 *                less than @len if there were partial characters
612
 *                at the end of the input.
613 614
 * @bytes_written: (out) (optional): the number of bytes stored in
 *                 the output buffer (not including the terminating nul).
Matthias Clasen's avatar
Matthias Clasen committed
615
 * @error:        location to store the error occurring, or %NULL to ignore
616 617
 *                errors. Any of the errors in #GConvertError may occur.
 *
Matthias Clasen's avatar
Matthias Clasen committed
618
 * Converts a string from one character set to another, possibly
619 620 621
 * including fallback sequences for characters not representable
 * in the output. Note that it is not guaranteed that the specification
 * for the fallback sequences in @fallback will be honored. Some
622
 * systems may do an approximate conversion from @from_codeset
623
 * to @to_codeset in their iconv() functions, 
Owen Taylor's avatar
Owen Taylor committed
624
 * in which case GLib will simply return that approximate conversion.
625
 *
626
 * Note that you should use g_iconv() for streaming conversions. 
627
 * Despite the fact that @bytes_read can return information about partial
628 629 630 631 632 633 634
 * characters, the g_convert_... functions are not generally suitable
 * for streaming. If the underlying converter maintains internal state,
 * then this won't be preserved across successive calls to g_convert(),
 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
 * this is the GNU C converter for CP1255 which does not emit a base
 * character until it knows that the next character is not a mark that
 * could combine with the base character.)
635
 *
636 637 638 639
 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
 *          If the conversion was successful, a newly allocated buffer
 *          containing the converted string, which must be freed with g_free().
 *          Otherwise %NULL and @error will be set.
640 641 642
 **/
gchar*
g_convert_with_fallback (const gchar *str,
643
			 gssize       len,    
644 645
			 const gchar *to_codeset,
			 const gchar *from_codeset,
646
			 const gchar *fallback,
647 648
			 gsize       *bytes_read,
			 gsize       *bytes_written,
649 650 651 652 653 654 655
			 GError     **error)
{
  gchar *utf8;
  gchar *dest;
  gchar *outp;
  const gchar *insert_str = NULL;
  const gchar *p;
656
  gsize inbytes_remaining;   
657
  const gchar *save_p = NULL;
658 659 660
  gsize save_inbytes = 0;
  gsize outbytes_remaining; 
  gsize err;
661
  GIConv cd;
662
  gsize outbuf_size;
663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690
  gboolean have_error = FALSE;
  gboolean done = FALSE;

  GError *local_error = NULL;
  
  g_return_val_if_fail (str != NULL, NULL);
  g_return_val_if_fail (to_codeset != NULL, NULL);
  g_return_val_if_fail (from_codeset != NULL, NULL);
     
  if (len < 0)
    len = strlen (str);
  
  /* Try an exact conversion; we only proceed if this fails
   * due to an illegal sequence in the input string.
   */
  dest = g_convert (str, len, to_codeset, from_codeset, 
		    bytes_read, bytes_written, &local_error);
  if (!local_error)
    return dest;

  if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
    {
      g_propagate_error (error, local_error);
      return NULL;
    }
  else
    g_error_free (local_error);

Havoc Pennington's avatar
Havoc Pennington committed
691 692
  local_error = NULL;
  
693 694 695 696
  /* No go; to proceed, we need a converter from "UTF-8" to
   * to_codeset, and the string as UTF-8.
   */
  cd = open_converter (to_codeset, "UTF-8", error);
697
  if (cd == (GIConv) -1)
698 699 700 701 702 703 704 705 706 707 708 709 710
    {
      if (bytes_read)
        *bytes_read = 0;
      
      if (bytes_written)
        *bytes_written = 0;
      
      return NULL;
    }

  utf8 = g_convert (str, len, "UTF-8", from_codeset, 
		    bytes_read, &inbytes_remaining, error);
  if (!utf8)
711
    {
712
      close_converter (cd);
713 714 715 716
      if (bytes_written)
        *bytes_written = 0;
      return NULL;
    }
717 718 719 720 721 722 723 724 725 726

  /* Now the heart of the code. We loop through the UTF-8 string, and
   * whenever we hit an offending character, we form fallback, convert
   * the fallback to the target codeset, and then go back to
   * converting the original string after finishing with the fallback.
   *
   * The variables save_p and save_inbytes store the input state
   * for the original string while we are converting the fallback
   */
  p = utf8;
727

728 729
  outbuf_size = len + NUL_TERMINATOR_LENGTH;
  outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
730 731 732 733
  outp = dest = g_malloc (outbuf_size);

  while (!done && !have_error)
    {
734
      gsize inbytes_tmp = inbytes_remaining;
735
      err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
736
      inbytes_remaining = inbytes_tmp;
737

738
      if (err == (gsize) -1)
739 740 741 742 743 744 745 746
	{
	  switch (errno)
	    {
	    case EINVAL:
	      g_assert_not_reached();
	      break;
	    case E2BIG:
	      {
747
		gsize used = outp - dest;
748

749 750 751 752
		outbuf_size *= 2;
		dest = g_realloc (dest, outbuf_size);
		
		outp = dest + used;
753
		outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
754 755 756 757 758 759 760 761 762
		
		break;
	      }
	    case EILSEQ:
	      if (save_p)
		{
		  /* Error converting fallback string - fatal
		   */
		  g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
763
			       _("Cannot convert fallback “%s” to codeset “%s”"),
764 765 766 767
			       insert_str, to_codeset);
		  have_error = TRUE;
		  break;
		}
768
	      else if (p)
769 770 771 772
		{
		  if (!fallback)
		    { 
		      gunichar ch = g_utf8_get_char (p);
773
		      insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",
774 775 776 777 778 779 780 781 782
						    ch);
		    }
		  else
		    insert_str = fallback;
		  
		  save_p = g_utf8_next_char (p);
		  save_inbytes = inbytes_remaining - (save_p - p);
		  p = insert_str;
		  inbytes_remaining = strlen (p);
783
		  break;
784
		}
785 786
              /* if p is null */
              G_GNUC_FALLTHROUGH;
787
	    default:
Christian Persch's avatar
Christian Persch committed
788 789 790 791 792 793 794 795
              {
                int errsv = errno;

                g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
                             _("Error during conversion: %s"),
                             g_strerror (errsv));
              }

796 797 798 799 800 801 802 803 804 805 806 807 808 809
	      have_error = TRUE;
	      break;
	    }
	}
      else
	{
	  if (save_p)
	    {
	      if (!fallback)
		g_free ((gchar *)insert_str);
	      p = save_p;
	      inbytes_remaining = save_inbytes;
	      save_p = NULL;
	    }
810 811 812 813 814 815
	  else if (p)
	    {
	      /* call g_iconv with NULL inbuf to cleanup shift state */
	      p = NULL;
	      inbytes_remaining = 0;
	    }
816 817 818 819 820 821 822
	  else
	    done = TRUE;
	}
    }

  /* Cleanup
   */
823
  memset (outp, 0, NUL_TERMINATOR_LENGTH);
824
  
825
  close_converter (cd);
826 827

  if (bytes_written)
828
    *bytes_written = outp - dest;	/* Doesn't include '\0' */
829 830 831 832 833 834 835 836 837 838 839 840 841

  g_free (utf8);

  if (have_error)
    {
      if (save_p && !fallback)
	g_free ((gchar *)insert_str);
      g_free (dest);
      return NULL;
    }
  else
    return dest;
}
842 843 844 845

/*
 * g_locale_to_utf8
 *
846 847 848
 * 
 */

849 850 851 852 853 854 855 856 857 858 859
/*
 * Validate @string as UTF-8. @len can be negative if @string is
 * nul-terminated, or a non-negative value in bytes. If @string ends in an
 * incomplete sequence, or contains any illegal sequences or nul codepoints,
 * %NULL will be returned and the error set to
 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
 * On success, @bytes_read and @bytes_written, if provided, will be set to
 * the number of bytes in @string up to @len or the terminating nul byte.
 * On error, @bytes_read will be set to the byte offset after the last valid
 * and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0.
 */
860 861 862
static gchar *
strdup_len (const gchar *string,
	    gssize       len,
863
	    gsize       *bytes_read,
864 865
	    gsize       *bytes_written,
	    GError     **error)
866 867
{
  gsize real_len;
868
  const gchar *end_valid;
869

870
  if (!g_utf8_validate (string, len, &end_valid))
871 872
    {
      if (bytes_read)
873
	*bytes_read = end_valid - string;
874 875 876
      if (bytes_written)
	*bytes_written = 0;

877 878
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
                           _("Invalid byte sequence in conversion input"));
879 880
      return NULL;
    }
881 882 883

  real_len = end_valid - string;

884 885 886 887 888 889 890 891
  if (bytes_read)
    *bytes_read = real_len;
  if (bytes_written)
    *bytes_written = real_len;

  return g_strndup (string, real_len);
}

892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910
/* Bit flags selecting which embedded-nul checks a checked conversion
 * performs; the values are distinct bits and may be OR-ed together.
 */
typedef enum
{
  /* Fail the conversion if the input contains a nul byte */
  CONVERT_CHECK_NO_NULS_IN_INPUT  = (1 << 0),
  /* Fail the conversion if the converted output contains a nul byte */
  CONVERT_CHECK_NO_NULS_IN_OUTPUT = (1 << 1)
} ConvertCheckFlags;

/*
 * Convert from @string in the encoding identified by @from_codeset,
 * returning a string in the encoding identified by @to_codeset.
 * @len can be negative if @string is nul-terminated, or a non-negative
 * value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags
 * to check the input, the output, or both, for embedded nul bytes.
 * On success, @bytes_read, if provided, will be set to the number of bytes
 * in @string up to @len or the terminating nul byte, and @bytes_written, if
 * provided, will be set to the number of output bytes written into the
 * returned buffer, excluding the terminating nul sequence.
 * On error, @bytes_read will be set to the byte offset after the last valid
 * sequence in @string, and @bytes_written will be set to 0.
 */
911
static gchar *
912 913 914 915 916 917 918 919
convert_checked (const gchar      *string,
                 gssize            len,
                 const gchar      *to_codeset,
                 const gchar      *from_codeset,
                 ConvertCheckFlags flags,
                 gsize            *bytes_read,
                 gsize            *bytes_written,
                 GError          **error)
920
{
921
  gchar *out;
922 923
  gsize outbytes;

924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942
  if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > 0)
    {
      const gchar *early_nul = memchr (string, '\0', len);
      if (early_nul != NULL)
        {
          if (bytes_read)
            *bytes_read = early_nul - string;
          if (bytes_written)
            *bytes_written = 0;

          g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
                               _("Embedded NUL byte in conversion input"));
          return NULL;
        }
    }

  out = g_convert (string, len, to_codeset, from_codeset,
                   bytes_read, &outbytes, error);
  if (out == NULL)
943 944 945 946 947
    {
      if (bytes_written)
        *bytes_written = 0;
      return NULL;
    }
948 949 950

  if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT)
      && memchr (out, '\0', outbytes) != NULL)
951
    {
952
      g_free (out);
953 954 955 956 957 958 959 960 961
      if (bytes_written)
        *bytes_written = 0;
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL,
                           _("Embedded NUL byte in conversion output"));
      return NULL;
    }

  if (bytes_written)
    *bytes_written = outbytes;
962
  return out;
963 964
}

965 966
/**
 * g_locale_to_utf8:
 * @opsysstring:   (array length=len) (element-type guint8): a string in the
 *                 encoding of the current locale. On Windows
 *                 this means the system codepage.
 * @len:           the length of the string, or -1 if the string is
 *                 nul-terminated (Note that some encodings may allow nul
 *                 bytes to occur inside strings. In that case, using -1
 *                 for the @len parameter is unsafe)
 * @bytes_read: (out) (optional): location to store the number of bytes in the
 *                 input string that were successfully converted, or %NULL.
 *                 Even if the conversion was successful, this may be
 *                 less than @len if there were partial characters
 *                 at the end of the input. If the error
 *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 *                 stored will be the byte offset after the last valid
 *                 input sequence.
 * @bytes_written: (out) (optional): the number of bytes stored in the output
 *                 buffer (not including the terminating nul).
 * @error:         location to store the error occurring, or %NULL to ignore
 *                 errors. Any of the errors in #GConvertError may occur.
 *
 * Converts a string which is in the encoding used for strings by
 * the C runtime (usually the same as that used by the operating
 * system) in the [current locale][setlocale] into a UTF-8 string.
 *
 * If the source encoding is not UTF-8 and the conversion output contains a
 * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
 * function returns %NULL.
 * If the source encoding is UTF-8, an embedded nul character is treated with
 * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
 * earlier versions of this library. Use g_convert() to produce output that
 * may contain embedded nul characters.
 *
 * Returns: (type utf8): The converted string, or %NULL on an error.
 **/
1001
gchar *
1002
g_locale_to_utf8 (const gchar  *opsysstring,
1003 1004 1005
		  gssize        len,            
		  gsize        *bytes_read,    
		  gsize        *bytes_written,
1006
		  GError      **error)
1007
{
1008
  const char *charset;
1009 1010

  if (g_get_charset (&charset))
1011
    return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1012
  else
1013 1014
    return convert_checked (opsysstring, len, "UTF-8", charset,
                            CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1015
                            bytes_read, bytes_written, error);
1016 1017
}

1018 1019 1020 1021
/**
 * g_locale_from_utf8:
 * @utf8string:    a UTF-8 encoded string 
 * @len:           the length of the string, or -1 if the string is
1022
 *                 nul-terminated.
1023
 * @bytes_read: (out) (optional): location to store the number of bytes in the
1024
 *                 input string that were successfully converted, or %NULL.
Matthias Clasen's avatar
Matthias Clasen committed
1025
 *                 Even if the conversion was successful, this may be 
1026
 *                 less than @len if there were partial characters
1027
 *                 at the end of the input. If the error
1028
 *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1029
 *                 stored will be the byte offset after the last valid
1030
 *                 input sequence.
1031 1032
 * @bytes_written: (out) (optional): the number of bytes stored in the output
 *                 buffer (not including the terminating nul).
Matthias Clasen's avatar
Matthias Clasen committed
1033
 * @error:         location to store the error occurring, or %NULL to ignore
1034 1035 1036 1037
 *                 errors. Any of the errors in #GConvertError may occur.
 * 
 * Converts a string from UTF-8 to the encoding used for strings by
 * the C runtime (usually the same as that used by the operating
Matthias Clasen's avatar
Matthias Clasen committed
1038 1039
 * system) in the [current locale][setlocale]. On Windows this means
 * the system codepage.
1040
 *
1041 1042
 * The input string shall not contain nul characters even if the @len
 * argument is positive. A nul character found inside the string will result
1043 1044 1045
 * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert
 * input that may contain embedded nul characters.
 *
1046 1047