/* gmarkup.c - Simple XML-like parser
 *
 *  Copyright 2000, 2003 Red Hat, Inc.
 *  Copyright 2007, 2008 Ryan Lortie <desrt@desrt.ca>
 *
 * GLib is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * GLib is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GLib; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 *   Boston, MA 02111-1307, USA.
 */

22
#include "config.h"
23

24
#include <stdarg.h>
25 26 27 28 29
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

30 31
#include "gmarkup.h"

32
#include "gslice.h"
33 34 35 36
#include "galloca.h"
#include "gstrfuncs.h"
#include "gstring.h"
#include "gtestutils.h"
Owen Taylor's avatar
Owen Taylor committed
37
#include "glibintl.h"
38

Matthias Clasen's avatar
Matthias Clasen committed
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
/**
 * SECTION:markup
 * @Title: Simple XML Subset Parser
 * @Short_description: parses a subset of XML
 * @See_also: <ulink url="http://www.w3.org/TR/REC-xml/">XML
 *     Specification</ulink>
 *
 * The "GMarkup" parser is intended to parse a simple markup format
 * that's a subset of XML. This is a small, efficient, easy-to-use
 * parser. It should not be used if you expect to interoperate with
 * other applications generating full-scale XML. However, it's very
 * useful for application data files, config files, etc. where you
 * know your application will be the only one writing the file.
 * Full-scale XML parsers should be able to parse the subset used by
 * GMarkup, so you can easily migrate to full-scale XML at a later
 * time if the need arises.
 *
 * GMarkup is not guaranteed to signal an error on all invalid XML;
 * the parser may accept documents that an XML parser would not.
 * However, XML documents which are not well-formed<footnote
 * id="wellformed">Being wellformed is a weaker condition than being
 * valid. See the <ulink url="http://www.w3.org/TR/REC-xml/">XML
 * specification</ulink> for definitions of these terms.</footnote>
 * are not considered valid GMarkup documents.
 *
 * Simplifications to XML include:
 * <itemizedlist>
 * <listitem>Only UTF-8 encoding is allowed</listitem>
 * <listitem>No user-defined entities</listitem>
 * <listitem>Processing instructions, comments and the doctype declaration
 * are "passed through" but are not interpreted in any way</listitem>
 * <listitem>No DTD or validation.</listitem>
 * </itemizedlist>
 *
 * The markup format does support:
 * <itemizedlist>
 * <listitem>Elements</listitem>
 * <listitem>Attributes</listitem>
 * <listitem>5 standard entities:
 *   <literal>&amp;amp; &amp;lt; &amp;gt; &amp;quot; &amp;apos;</literal>
 * </listitem>
 * <listitem>Character references</listitem>
 * <listitem>Sections marked as CDATA</listitem>
 * </itemizedlist>
 */

85
GQuark
86
g_markup_error_quark (void)
87
{
88
  return g_quark_from_static_string ("g-markup-error-quark");
89 90 91 92 93 94 95 96 97 98
}

/* States of the parser state machine, stored in
 * GMarkupParseContext.state.  A freshly created context starts in
 * STATE_START; mark_error() switches the context to STATE_ERROR.
 */
typedef enum
{
  STATE_START,
  STATE_AFTER_OPEN_ANGLE,
  STATE_AFTER_CLOSE_ANGLE,
  STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */
  STATE_INSIDE_OPEN_TAG_NAME,
  STATE_INSIDE_ATTRIBUTE_NAME,
  STATE_AFTER_ATTRIBUTE_NAME,
  STATE_BETWEEN_ATTRIBUTES,
  STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
  /* The two attribute-value states make unescape_gstring_inplace()
   * normalize whitespace; SQ/DQ presumably mean single-/double-quoted.
   */
  STATE_INSIDE_ATTRIBUTE_VALUE_SQ,
  STATE_INSIDE_ATTRIBUTE_VALUE_DQ,
  STATE_INSIDE_TEXT,
  STATE_AFTER_CLOSE_TAG_SLASH,
  STATE_INSIDE_CLOSE_TAG_NAME,
  STATE_AFTER_CLOSE_TAG_NAME,
  STATE_INSIDE_PASSTHROUGH,
  STATE_ERROR
} GMarkupParseState;

112 113 114 115 116 117 118
/* One saved frame of the subparser stack.  pop_subparser_stack()
 * copies these three values back into the context's
 * subparser_element, parser and user_data fields and then frees
 * the tracker.
 */
typedef struct
{
  const char *prev_element;          /* restored into context->subparser_element */
  const GMarkupParser *prev_parser;  /* restored into context->parser */
  gpointer prev_user_data;           /* restored into context->user_data */
} GMarkupRecursionTracker;

119 120 121 122 123 124 125 126 127
struct _GMarkupParseContext
{
  const GMarkupParser *parser;

  GMarkupParseFlags flags;

  gint line_number;
  gint char_number;

128 129
  GMarkupParseState state;

130 131 132 133 134 135 136 137
  gpointer user_data;
  GDestroyNotify dnotify;

  /* A piece of character data or an element that
   * hasn't "ended" yet so we haven't yet called
   * the callback for it.
   */
  GString *partial_chunk;
138
  GSList *spare_chunks;
139 140

  GSList *tag_stack;
141 142 143 144 145
  GSList *tag_stack_gstr;
  GSList *spare_list_nodes;

  GString **attr_names;
  GString **attr_values;
146 147
  gint cur_attr;
  gint alloc_attrs;
148 149

  const gchar *current_text;
Matthias Clasen's avatar
Matthias Clasen committed
150
  gssize       current_text_len;
151 152 153 154 155 156 157 158 159
  const gchar *current_text_end;

  /* used to save the start of the last interesting thingy */
  const gchar *start;

  const gchar *iter;

  guint document_empty : 1;
  guint parsing : 1;
160
  guint awaiting_pop : 1;
Matthias Clasen's avatar
Matthias Clasen committed
161
  gint balance;
162 163 164 165 166

  /* subparser support */
  GSList *subparser_stack; /* (GMarkupRecursionTracker *) */
  const char *subparser_element;
  gpointer held_user_data;
167 168
};

169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201
/*
 * Helpers to reduce our allocation overhead, we have
 * a well defined allocation lifecycle.
 */
/* Hands out a detached list node carrying @data, taking it from the
 * context's pool of spare nodes when one is available and allocating
 * a new one otherwise.
 */
static GSList *
get_list_node (GMarkupParseContext *context, gpointer data)
{
  GSList *recycled = context->spare_list_nodes;

  if (recycled == NULL)
    recycled = g_slist_alloc();
  else
    context->spare_list_nodes = g_slist_remove_link (recycled, recycled);

  recycled->data = data;
  return recycled;
}

/* Returns @node to the context's pool of spare list nodes after
 * clearing its data pointer.  The node itself is not freed.
 */
static void
free_list_node (GMarkupParseContext *context, GSList *node)
{
  GSList *pool = context->spare_list_nodes;

  node->data = NULL;
  context->spare_list_nodes = g_slist_concat (node, pool);
}

/* Empties @string in place: length becomes 0 and the buffer starts
 * with a NUL terminator.  The allocation is left untouched.
 */
static inline void
string_blank (GString *string)
{
  string->len = 0;
  string->str[string->len] = '\0';
}

202 203 204 205 206
/**
 * g_markup_parse_context_new:
 * @parser: a #GMarkupParser
 * @flags: one or more #GMarkupParseFlags
 * @user_data: user data to pass to #GMarkupParser functions
Matthias Clasen's avatar
Matthias Clasen committed
207 208 209
 * @user_data_dnotify: user data destroy notifier called when
 *     the parse context is freed
 *
210 211 212
 * Creates a new parse context. A parse context is used to parse
 * marked-up documents. You can feed any number of documents into
 * a context, as long as no errors occur; once an error occurs,
Matthias Clasen's avatar
Matthias Clasen committed
213 214 215
 * the parse context can't continue to parse text (you have to
 * free it and create a new parse context).
 *
216 217
 * Return value: a new #GMarkupParseContext
 **/
218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238
GMarkupParseContext *
g_markup_parse_context_new (const GMarkupParser *parser,
                            GMarkupParseFlags    flags,
                            gpointer             user_data,
                            GDestroyNotify       user_data_dnotify)
{
  GMarkupParseContext *context;

  g_return_val_if_fail (parser != NULL, NULL);

  context = g_new (GMarkupParseContext, 1);

  context->parser = parser;
  context->flags = flags;
  context->user_data = user_data;
  context->dnotify = user_data_dnotify;

  context->line_number = 1;
  context->char_number = 1;

  context->partial_chunk = NULL;
239 240
  context->spare_chunks = NULL;
  context->spare_list_nodes = NULL;
241 242 243

  context->state = STATE_START;
  context->tag_stack = NULL;
244
  context->tag_stack_gstr = NULL;
245 246 247 248
  context->attr_names = NULL;
  context->attr_values = NULL;
  context->cur_attr = -1;
  context->alloc_attrs = 0;
249 250 251 252 253 254 255 256 257 258 259

  context->current_text = NULL;
  context->current_text_len = -1;
  context->current_text_end = NULL;

  context->start = NULL;
  context->iter = NULL;

  context->document_empty = TRUE;
  context->parsing = FALSE;

260 261 262 263 264 265 266
  context->awaiting_pop = FALSE;
  context->subparser_stack = NULL;
  context->subparser_element = NULL;

  /* this is only looked at if awaiting_pop = TRUE.  initialise anyway. */
  context->held_user_data = NULL;

Matthias Clasen's avatar
Matthias Clasen committed
267 268
  context->balance = 0;

269 270 271
  return context;
}

272
static void
Matthias Clasen's avatar
Matthias Clasen committed
273
string_full_free (gpointer ptr)
274 275 276 277 278 279
{
  g_string_free (ptr, TRUE);
}

/* Forward declaration: defined further down, but called by
 * g_markup_parse_context_free() to release the attribute strings.
 */
static void clear_attributes (GMarkupParseContext *context);

280 281 282
/**
 * g_markup_parse_context_free:
 * @context: a #GMarkupParseContext
Matthias Clasen's avatar
Matthias Clasen committed
283 284 285 286 287 288
 *
 * Frees a #GMarkupParseContext.
 *
 * This function can't be called from inside one of the
 * #GMarkupParser functions or while a subparser is pushed.
 */
289 290 291 292 293
void
g_markup_parse_context_free (GMarkupParseContext *context)
{
  g_return_if_fail (context != NULL);
  g_return_if_fail (!context->parsing);
294 295
  g_return_if_fail (!context->subparser_stack);
  g_return_if_fail (!context->awaiting_pop);
296 297 298 299

  if (context->dnotify)
    (* context->dnotify) (context->user_data);

300 301 302
  clear_attributes (context);
  g_free (context->attr_names);
  g_free (context->attr_values);
303

Matthias Clasen's avatar
Matthias Clasen committed
304
  g_slist_free_full (context->tag_stack_gstr, string_full_free);
305 306
  g_slist_free (context->tag_stack);

Matthias Clasen's avatar
Matthias Clasen committed
307
  g_slist_free_full (context->spare_chunks, string_full_free);
308 309
  g_slist_free (context->spare_list_nodes);

310 311 312 313 314 315
  if (context->partial_chunk)
    g_string_free (context->partial_chunk, TRUE);

  g_free (context);
}

316 317
/* Forward declaration: defined further down, but called by
 * mark_error() to unwind pushed subparsers.
 */
static void pop_subparser_stack (GMarkupParseContext *context);

318 319 320 321 322 323 324 325
/* Puts the context into STATE_ERROR and reports @error to the
 * current parser's error callback (if it has one).  Every pushed
 * subparser is then popped in turn and the callback of each parser
 * restored this way is invoked with the same @error.  @error is not
 * freed here.
 */
static void
mark_error (GMarkupParseContext *context,
            GError              *error)
{
  context->state = STATE_ERROR;

  if (context->parser->error)
    (*context->parser->error) (context, error, context->user_data);

  /* report the error all the way up to free all the user-data */
  while (context->subparser_stack)
    {
      pop_subparser_stack (context);
      context->awaiting_pop = FALSE; /* already been freed */

      if (context->parser->error)
        (*context->parser->error) (context, error, context->user_data);
    }
}

Matthias Clasen's avatar
Matthias Clasen committed
338 339 340 341 342 343
/* Prototype for set_error(), declared ahead of its definition so the
 * printf-format attribute can be attached: @format is parameter 4 and
 * its variadic arguments start at parameter 5.
 */
static void
set_error (GMarkupParseContext  *context,
           GError              **error,
           GMarkupError          code,
           const gchar          *format,
           ...) G_GNUC_PRINTF (4, 5);
344

345
static void
Matthias Clasen's avatar
Matthias Clasen committed
346 347 348 349
set_error_literal (GMarkupParseContext  *context,
                   GError              **error,
                   GMarkupError          code,
                   const gchar          *message)
350 351 352 353 354 355 356 357 358 359 360 361 362 363 364
{
  GError *tmp_error;

  tmp_error = g_error_new_literal (G_MARKUP_ERROR, code, message);

  g_prefix_error (&tmp_error,
                  _("Error on line %d char %d: "),
                  context->line_number,
                  context->char_number);

  mark_error (context, tmp_error);

  g_propagate_error (error, tmp_error);
}

365
static void
Matthias Clasen's avatar
Matthias Clasen committed
366 367 368 369
set_error (GMarkupParseContext  *context,
           GError              **error,
           GMarkupError          code,
           const gchar          *format,
370 371 372
           ...)
{
  gchar *s;
373
  gchar *s_valid;
374 375 376 377 378 379
  va_list args;

  va_start (args, format);
  s = g_strdup_vprintf (format, args);
  va_end (args);

Matthias Clasen's avatar
Matthias Clasen committed
380 381 382
  /* Make sure that the GError message is valid UTF-8
   * even if it is complaining about invalid UTF-8 in the markup
   */
383
  s_valid = _g_utf8_make_valid (s);
384
  set_error_literal (context, error, code, s);
385

386
  g_free (s);
387
  g_free (s_valid);
388 389
}

390 391 392 393 394 395 396 397 398 399 400 401 402 403 404
/* Forwards an error produced elsewhere (@src) to the caller's @dest.
 * When the context was created with G_MARKUP_PREFIX_ERROR_POSITION
 * the message is first prefixed with the current line and character
 * number.  The context is marked as failed in either case.
 */
static void
propagate_error (GMarkupParseContext  *context,
                 GError              **dest,
                 GError               *src)
{
  const gboolean want_position =
    (context->flags & G_MARKUP_PREFIX_ERROR_POSITION) != 0;

  if (want_position)
    {
      g_prefix_error (&src,
                      _("Error on line %d char %d: "),
                      context->line_number,
                      context->char_number);
    }

  mark_error (context, src);
  g_propagate_error (dest, src);
}
405 406 407 408

/* True for the characters that most commonly terminate an element or
 * attribute name: '=', '/', '>' and a plain space.  Other whitespace
 * is checked separately by the callers (see advance_to_name_end()).
 * Note that the argument is evaluated up to four times.
 */
#define IS_COMMON_NAME_END_CHAR(c) \
  ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')

409
static gboolean
Matthias Clasen's avatar
Matthias Clasen committed
410 411 412
slow_name_validate (GMarkupParseContext  *context,
                    const gchar          *name,
                    GError              **error)
413
{
Matthias Clasen's avatar
Matthias Clasen committed
414
  const gchar *p = name;
415 416 417 418

  if (!g_utf8_validate (name, strlen (name), NULL))
    {
      set_error (context, error, G_MARKUP_ERROR_BAD_UTF8,
Matthias Clasen's avatar
Matthias Clasen committed
419
                 _("Invalid UTF-8 encoded text in name - not valid '%s'"), name);
420 421 422 423
      return FALSE;
    }

  if (!(g_ascii_isalpha (*p) ||
Matthias Clasen's avatar
Matthias Clasen committed
424 425 426 427
        (!IS_COMMON_NAME_END_CHAR (*p) &&
         (*p == '_' ||
          *p == ':' ||
          g_unichar_isalpha (g_utf8_get_char (p))))))
428 429
    {
      set_error (context, error, G_MARKUP_ERROR_PARSE,
Matthias Clasen's avatar
Matthias Clasen committed
430
                 _("'%s' is not a valid name "), name);
431 432 433 434 435 436 437
      return FALSE;
    }

  for (p = g_utf8_next_char (name); *p != '\0'; p = g_utf8_next_char (p))
    {
      /* is_name_char */
      if (!(g_ascii_isalnum (*p) ||
Matthias Clasen's avatar
Matthias Clasen committed
438 439 440 441 442 443 444 445 446 447 448
            (!IS_COMMON_NAME_END_CHAR (*p) &&
             (*p == '.' ||
              *p == '-' ||
              *p == '_' ||
              *p == ':' ||
              g_unichar_isalpha (g_utf8_get_char (p))))))
        {
          set_error (context, error, G_MARKUP_ERROR_PARSE,
                     _("'%s' is not a valid name: '%c' "), name, *p);
          return FALSE;
        }
449 450
    }
  return TRUE;
451 452
}

Matthias Clasen's avatar
Matthias Clasen committed
453
/*
454 455
 * Use me for elements, attributes etc.
 */
456
static gboolean
Matthias Clasen's avatar
Matthias Clasen committed
457 458 459
name_validate (GMarkupParseContext  *context,
               const gchar          *name,
               GError              **error)
460
{
461 462 463 464 465 466
  char mask;
  const char *p;

  /* name start char */
  p = name;
  if (G_UNLIKELY (IS_COMMON_NAME_END_CHAR (*p) ||
Matthias Clasen's avatar
Matthias Clasen committed
467
                  !(g_ascii_isalpha (*p) || *p == '_' || *p == ':')))
468
    goto slow_validate;
Matthias Clasen's avatar
Matthias Clasen committed
469

470 471 472 473 474 475
  for (mask = *p++; *p != '\0'; p++)
    {
      mask |= *p;

      /* is_name_char */
      if (G_UNLIKELY (!(g_ascii_isalnum (*p) ||
Matthias Clasen's avatar
Matthias Clasen committed
476 477 478 479 480 481
                        (!IS_COMMON_NAME_END_CHAR (*p) &&
                         (*p == '.' ||
                          *p == '-' ||
                          *p == '_' ||
                          *p == ':')))))
        goto slow_validate;
482 483 484 485 486 487 488 489 490
    }

  if (mask & 0x80) /* un-common / non-ascii */
    goto slow_validate;

  return TRUE;

 slow_validate:
  return slow_name_validate (context, name, error);
491 492
}

493
static gboolean
Matthias Clasen's avatar
Matthias Clasen committed
494 495 496 497
text_validate (GMarkupParseContext  *context,
               const gchar          *p,
               gint                  len,
               GError              **error)
498 499 500 501
{
  if (!g_utf8_validate (p, len, NULL))
    {
      set_error (context, error, G_MARKUP_ERROR_BAD_UTF8,
Matthias Clasen's avatar
Matthias Clasen committed
502
                 _("Invalid UTF-8 encoded text in name - not valid '%s'"), p);
503 504 505 506 507
      return FALSE;
    }
  else
    return TRUE;
}
508 509 510 511 512

/* Writes the UTF-8 encoding of @c into @buf as a NUL-terminated
 * string and returns @buf.  @buf must have room for at least 8
 * bytes, all of which are zeroed first.
 */
static gchar*
char_str (gunichar c,
          gchar   *buf)
{
  memset (buf, 0, 8);
  g_unichar_to_utf8 (c, buf);
  return buf;
}

/* Copies the first character of the UTF-8 string @utf8 into @buf as
 * a NUL-terminated string and returns @buf.  @buf has the same size
 * requirement as for char_str().
 */
static gchar*
utf8_str (const gchar *utf8,
          gchar       *buf)
{
  gunichar first = g_utf8_get_char (utf8);

  return char_str (first, buf);
}

static void
Matthias Clasen's avatar
Matthias Clasen committed
527 528 529 530 531
set_unescape_error (GMarkupParseContext  *context,
                    GError              **error,
                    const gchar          *remaining_text,
                    GMarkupError          code,
                    const gchar          *format,
532 533 534 535 536 537 538 539 540 541
                    ...)
{
  GError *tmp_error;
  gchar *s;
  va_list args;
  gint remaining_newlines;
  const gchar *p;

  remaining_newlines = 0;
  p = remaining_text;
542
  while (*p != '\0')
543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565
    {
      if (*p == '\n')
        ++remaining_newlines;
      ++p;
    }

  va_start (args, format);
  s = g_strdup_vprintf (format, args);
  va_end (args);

  tmp_error = g_error_new (G_MARKUP_ERROR,
                           code,
                           _("Error on line %d: %s"),
                           context->line_number - remaining_newlines,
                           s);

  g_free (s);

  mark_error (context, tmp_error);

  g_propagate_error (error, tmp_error);
}

566 567 568 569 570
/*
 * re-write the GString in-place, unescaping anything that escaped.
 * most XML does not contain entities, or escaping.
 */
static gboolean
Matthias Clasen's avatar
Matthias Clasen committed
571 572 573 574
unescape_gstring_inplace (GMarkupParseContext  *context,
                          GString              *string,
                          gboolean             *is_ascii,
                          GError              **error)
575
{
576 577 578
  char mask, *to;
  int line_num = 1;
  const char *from;
579
  gboolean normalize_attribute;
580

581 582 583 584 585
  *is_ascii = FALSE;

  /* are we unescaping an attribute or not ? */
  if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
      context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
586 587 588 589
    normalize_attribute = TRUE;
  else
    normalize_attribute = FALSE;

590 591 592 593 594 595
  /*
   * Meeks' theorum: unescaping can only shrink text.
   * for &lt; etc. this is obvious, for &#xffff; more
   * thought is required, but this is patently so.
   */
  mask = 0;
Matthias Clasen's avatar
Matthias Clasen committed
596
  for (from = to = string->str; *from != '\0'; from++, to++)
597
    {
598 599 600 601
      *to = *from;

      mask |= *to;
      if (*to == '\n')
Matthias Clasen's avatar
Matthias Clasen committed
602
        line_num++;
603
      if (normalize_attribute && (*to == '\t' || *to == '\n'))
Matthias Clasen's avatar
Matthias Clasen committed
604
        *to = ' ';
605
      if (*to == '\r')
Matthias Clasen's avatar
Matthias Clasen committed
606 607 608 609 610
        {
          *to = normalize_attribute ? ' ' : '\n';
          if (from[1] == '\n')
            from++;
        }
611
      if (*from == '&')
Matthias Clasen's avatar
Matthias Clasen committed
612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682
        {
          from++;
          if (*from == '#')
            {
              gboolean is_hex = FALSE;
              gulong l;
              gchar *end = NULL;

              from++;

              if (*from == 'x')
                {
                  is_hex = TRUE;
                  from++;
                }

              /* digit is between start and p */
              errno = 0;
              if (is_hex)
                l = strtoul (from, &end, 16);
              else
                l = strtoul (from, &end, 10);

              if (end == from || errno != 0)
                {
                  set_unescape_error (context, error,
                                      from, G_MARKUP_ERROR_PARSE,
                                      _("Failed to parse '%-.*s', which "
                                        "should have been a digit "
                                        "inside a character reference "
                                        "(&#234; for example) - perhaps "
                                        "the digit is too large"),
                                      end - from, from);
                  return FALSE;
                }
              else if (*end != ';')
                {
                  set_unescape_error (context, error,
                                      from, G_MARKUP_ERROR_PARSE,
                                      _("Character reference did not end with a "
                                        "semicolon; "
                                        "most likely you used an ampersand "
                                        "character without intending to start "
                                        "an entity - escape ampersand as &amp;"));
                  return FALSE;
                }
              else
                {
                  /* characters XML 1.1 permits */
                  if ((0 < l && l <= 0xD7FF) ||
                      (0xE000 <= l && l <= 0xFFFD) ||
                      (0x10000 <= l && l <= 0x10FFFF))
                    {
                      gchar buf[8];
                      char_str (l, buf);
                      strcpy (to, buf);
                      to += strlen (buf) - 1;
                      from = end;
                      if (l >= 0x80) /* not ascii */
                        mask |= 0x80;
                    }
                  else
                    {
                      set_unescape_error (context, error,
                                          from, G_MARKUP_ERROR_PARSE,
                                          _("Character reference '%-.*s' does not "
                                            "encode a permitted character"),
                                          end - from, from);
                      return FALSE;
                    }
                }
683 684
            }

685
          else if (strncmp (from, "lt;", 3) == 0)
Matthias Clasen's avatar
Matthias Clasen committed
686 687 688 689
            {
              *to = '<';
              from += 2;
            }
690
          else if (strncmp (from, "gt;", 3) == 0)
Matthias Clasen's avatar
Matthias Clasen committed
691 692 693 694
            {
              *to = '>';
              from += 2;
            }
695
          else if (strncmp (from, "amp;", 4) == 0)
Matthias Clasen's avatar
Matthias Clasen committed
696 697 698 699
            {
              *to = '&';
              from += 3;
            }
700
          else if (strncmp (from, "quot;", 5) == 0)
Matthias Clasen's avatar
Matthias Clasen committed
701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735
            {
              *to = '"';
              from += 4;
            }
          else if (strncmp (from, "apos;", 5) == 0)
            {
              *to = '\'';
              from += 4;
            }
          else
            {
              if (*from == ';')
                set_unescape_error (context, error,
                                    from, G_MARKUP_ERROR_PARSE,
                                    _("Empty entity '&;' seen; valid "
                                      "entities are: &amp; &quot; &lt; &gt; &apos;"));
              else
                {
                  const char *end = strchr (from, ';');
                  if (end)
                    set_unescape_error (context, error,
                                        from, G_MARKUP_ERROR_PARSE,
                                        _("Entity name '%-.*s' is not known"),
                                        end-from, from);
                  else
                    set_unescape_error (context, error,
                                        from, G_MARKUP_ERROR_PARSE,
                                        _("Entity did not end with a semicolon; "
                                          "most likely you used an ampersand "
                                          "character without intending to start "
                                          "an entity - escape ampersand as &amp;"));
                }
              return FALSE;
            }
        }
736 737
    }

738 739 740
  g_assert (to - string->str <= string->len);
  if (to - string->str != string->len)
    g_string_truncate (string, to - string->str);
741

742
  *is_ascii = !(mask & 0x80);
743

744
  return TRUE;
745 746
}

747
static inline gboolean
748
advance_char (GMarkupParseContext *context)
Matthias Clasen's avatar
Matthias Clasen committed
749
{
750 751
  context->iter++;
  context->char_number++;
752

753
  if (G_UNLIKELY (context->iter == context->current_text_end))
754
      return FALSE;
755 756

  else if (G_UNLIKELY (*context->iter == '\n'))
757
    {
758
      context->line_number++;
759 760
      context->char_number = 1;
    }
Matthias Clasen's avatar
Matthias Clasen committed
761

762
  return TRUE;
763 764
}

765
static inline gboolean
766 767 768 769 770
xml_isspace (char c)
{
  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

771 772 773 774 775
/* Advances the read position past any whitespace.  Stops at the
 * first non-whitespace character or at the end of the current text.
 * The character at the current position is examined before moving.
 */
static void
skip_spaces (GMarkupParseContext *context)
{
  do
    {
      if (!xml_isspace (*context->iter))
        return;
    }
  while (advance_char (context));
}

/* Advances the read position to the first character that ends a
 * name ('=', '/', '>' or any whitespace), or to the end of the
 * current text if no such character is found.
 */
static void
advance_to_name_end (GMarkupParseContext *context)
{
  do
    {
      if (IS_COMMON_NAME_END_CHAR (*(context->iter)))
        return;
      if (xml_isspace (*(context->iter)))
        return;
    }
  while (advance_char (context));
}

795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810
/* Gives up ownership of @str (which may be NULL).  Strings whose
 * buffer is larger than 256 bytes are freed outright; smaller ones
 * are blanked and put at the front of the context's spare_chunks
 * pool for reuse.
 */
static void
release_chunk (GMarkupParseContext *context, GString *str)
{
  if (str == NULL)
    return;

  if (str->allocated_len <= 256)
    {
      GSList *holder;

      string_blank (str);
      holder = get_list_node (context, str);
      context->spare_chunks = g_slist_concat (holder, context->spare_chunks);
    }
  else
    {
      /* large strings are unusual and worth freeing */
      g_string_free (str, TRUE);
    }
}

811 812 813 814 815 816
/* Appends the bytes in [@text_start, @text_end) to the context's
 * partial chunk.  If there is no partial chunk yet, one is taken
 * from the spare_chunks pool or, failing that, newly allocated.
 * This happens even when the range is empty.
 */
static void
add_to_partial (GMarkupParseContext *context,
                const gchar         *text_start,
                const gchar         *text_end)
{
  if (context->partial_chunk == NULL)
    { /* allocate a new chunk to parse into */

      if (context->spare_chunks != NULL)
        {
          GSList *node = context->spare_chunks;
          context->spare_chunks = g_slist_remove_link (context->spare_chunks, node);
          context->partial_chunk = node->data;
          free_list_node (context, node);
        }
      else
        context->partial_chunk = g_string_sized_new (MAX (28, text_end - text_start));
    }

  if (text_start != text_end)
    g_string_insert_len (context->partial_chunk, -1,
                         text_start, text_end - text_start);
}

835
static inline void
836
truncate_partial (GMarkupParseContext *context)
837 838
{
  if (context->partial_chunk != NULL)
839
    string_blank (context->partial_chunk);
840 841
}

842
static inline const gchar*
843 844 845 846 847
current_element (GMarkupParseContext *context)
{
  return context->tag_stack->data;
}

848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868
/* Removes the top frame of the subparser stack (which must not be
 * empty) and restores the parser, user_data and subparser_element
 * saved in it.  The user_data that was in effect before the call is
 * kept in held_user_data and awaiting_pop is raised.
 */
static void
pop_subparser_stack (GMarkupParseContext *context)
{
  GSList *top;
  GMarkupRecursionTracker *saved;

  g_assert (context->subparser_stack);

  top = context->subparser_stack;
  saved = top->data;

  /* remember the subparser's user_data before it is overwritten */
  context->held_user_data = context->user_data;
  context->awaiting_pop = TRUE;

  context->subparser_element = saved->prev_element;
  context->parser = saved->prev_parser;
  context->user_data = saved->prev_user_data;
  g_slice_free (GMarkupRecursionTracker, saved);

  context->subparser_stack = g_slist_delete_link (top, top);
}

869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892
/* Turns the partial chunk into a new entry at the head of the tag
 * stack.  The character data is pushed onto tag_stack and the owning
 * GString onto tag_stack_gstr; the context then has no partial chunk.
 * The partial chunk must not be NULL.
 */
static void
push_partial_as_tag (GMarkupParseContext *context)
{
  GString *tag = context->partial_chunk;
  GSList *name_node;
  GSList *gstr_node;

  /* sadly, this is exported by gmarkup_get_element_stack as-is */
  name_node = get_list_node (context, tag->str);
  gstr_node = get_list_node (context, tag);

  context->tag_stack = g_slist_concat (name_node, context->tag_stack);
  context->tag_stack_gstr = g_slist_concat (gstr_node, context->tag_stack_gstr);

  context->partial_chunk = NULL;
}

/* Removes the head entry of the tag stack: the GString holding the
 * name is released for reuse and both list nodes go back to the
 * spare node pool.  Both stacks must be non-empty.
 */
static void
pop_tag (GMarkupParseContext *context)
{
  GSList *name_node = context->tag_stack;
  GSList *gstr_node = context->tag_stack_gstr;

  release_chunk (context, gstr_node->data);

  context->tag_stack = g_slist_remove_link (name_node, name_node);
  context->tag_stack_gstr = g_slist_remove_link (gstr_node, gstr_node);

  free_list_node (context, name_node);
  free_list_node (context, gstr_node);
}

893 894 895 896 897 898 899 900 901 902 903 904
/* Pops the subparser stack if the element at the head of the tag
 * stack is the one recorded in subparser_element.  The comparison is
 * by pointer identity, not by string contents.
 */
static void
possibly_finish_subparser (GMarkupParseContext *context)
{
  const gchar *innermost = context->tag_stack->data;

  if (innermost != context->subparser_element)
    return;

  pop_subparser_stack (context);
}

/* Emits a critical warning if a popped subparser's user_data is
 * still being held (awaiting_pop set), then clears held_user_data
 * and awaiting_pop unconditionally.
 */
static void
ensure_no_outstanding_subparser (GMarkupParseContext *context)
{
  if (context->awaiting_pop)
    g_critical ("During the first end_element call after invoking a "
                "subparser you must pop the subparser stack and handle "
                "the freeing of the subparser user_data.  This can be "
                "done by calling the end function of the subparser.  "
                "Very probably, your program just leaked memory.");

  /* let valgrind watch the pointer disappear... */
  context->held_user_data = NULL;
  context->awaiting_pop = FALSE;
}

915 916 917
/* Returns the name of the attribute added most recently.  Asserts
 * that at least one attribute has been added.
 */
static const gchar*
current_attribute (GMarkupParseContext *context)
{
  g_assert (context->cur_attr >= 0);
  return context->attr_names[context->cur_attr]->str;
}

922
static void
923
add_attribute (GMarkupParseContext *context, GString *str)
924 925 926 927
{
  if (context->cur_attr + 2 >= context->alloc_attrs)
    {
      context->alloc_attrs += 5; /* silly magic number */
928 929
      context->attr_names = g_realloc (context->attr_names, sizeof(GString*)*context->alloc_attrs);
      context->attr_values = g_realloc (context->attr_values, sizeof(GString*)*context->alloc_attrs);
930 931
    }
  context->cur_attr++;
932
  context->attr_names[context->cur_attr] = str;
933 934
  context->attr_values[context->cur_attr] = NULL;
  context->attr_names[context->cur_attr+1] = NULL;
935
  context->attr_values[context->cur_attr+1] = NULL;
936 937
}

938 939 940 941 942 943 944 945 946 947 948 949 950
/* Release every recorded attribute name and value and reset cur_attr
 * to -1.
 *
 * Entries are walked from the last one down to index 0; each name and
 * value is passed to release_chunk() and its slot set to NULL.  The
 * arrays themselves are kept allocated for reuse.
 */
static void
clear_attributes (GMarkupParseContext *context)
{
  /* Go ahead and free the attributes. */
  for (; context->cur_attr >= 0; context->cur_attr--)
    {
      int pos = context->cur_attr;
      release_chunk (context, context->attr_names[pos]);
      release_chunk (context, context->attr_values[pos]);
      context->attr_names[pos] = context->attr_values[pos] = NULL;
    }

  /* sanity checks: nothing may be left behind in either array */
  g_assert (context->cur_attr == -1);
  g_assert (context->attr_names == NULL ||
            context->attr_names[0] == NULL);
  g_assert (context->attr_values == NULL ||
            context->attr_values[0] == NULL);
}
955 956

/* This has to be a separate function to ensure the alloca's
Matthias Clasen's avatar
Matthias Clasen committed
957 958 959
 * are unwound on exit - otherwise we grow & blow the stack
 * with large documents
 */
960
static inline void
Matthias Clasen's avatar
Matthias Clasen committed
961 962
emit_start_element (GMarkupParseContext  *context,
                    GError              **error)
963 964 965 966 967 968
{
  int i;
  const gchar *start_name;
  const gchar **attr_names;
  const gchar **attr_values;
  GError *tmp_error;
Matthias Clasen's avatar
Matthias Clasen committed
969

970 971 972 973 974 975 976 977 978
  attr_names = g_newa (const gchar *, context->cur_attr + 2);
  attr_values = g_newa (const gchar *, context->cur_attr + 2);
  for (i = 0; i < context->cur_attr + 1; i++)
    {
      attr_names[i] = context->attr_names[i]->str;
      attr_values[i] = context->attr_values[i]->str;
    }
  attr_names[i] = NULL;
  attr_values[i] = NULL;
Matthias Clasen's avatar
Matthias Clasen committed
979

980 981 982
  /* Call user callback for element start */
  tmp_error = NULL;
  start_name = current_element (context);
Matthias Clasen's avatar
Matthias Clasen committed
983

984 985 986
  if (context->parser->start_element &&
      name_validate (context, start_name, error))
    (* context->parser->start_element) (context,
Matthias Clasen's avatar
Matthias Clasen committed
987 988 989 990 991
                                        start_name,
                                        (const gchar **)attr_names,
                                        (const gchar **)attr_values,
                                        context->user_data,
                                        &tmp_error);
992
  clear_attributes (context);
Matthias Clasen's avatar
Matthias Clasen committed
993

994 995 996 997
  if (tmp_error != NULL)
    propagate_error (context, error, tmp_error);
}

998 999 1000 1001 1002 1003
/**
 * g_markup_parse_context_parse:
 * @context: a #GMarkupParseContext
 * @text: chunk of text to parse
 * @text_len: length of @text in bytes
 * @error: return location for a #GError
Matthias Clasen's avatar
Matthias Clasen committed
1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015
 *
 * Feed some data to the #GMarkupParseContext.
 *
 * The data need not be valid UTF-8; an error will be signaled if
 * it's invalid. The data need not be an entire document; you can
 * feed a document into the parser incrementally, via multiple calls
 * to this function. Typically, as you receive data from a network
 * connection or file, you feed each received chunk of data into this
 * function, aborting the process if an error occurs. Once an error
 * is reported, no further data may be fed to the #GMarkupParseContext;
 * all errors are fatal.
 *
1016
 * Return value: %FALSE if an error occurred, %TRUE on success
Matthias Clasen's avatar
Matthias Clasen committed
1017
 */
1018
gboolean
Matthias Clasen's avatar
Matthias Clasen committed
1019 1020 1021 1022
g_markup_parse_context_parse (GMarkupParseContext  *context,
                              const gchar          *text,
                              gssize                text_len,
                              GError              **error)
1023 1024 1025 1026 1027
{
  g_return_val_if_fail (context != NULL, FALSE);
  g_return_val_if_fail (text != NULL, FALSE);
  g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
  g_return_val_if_fail (!context->parsing, FALSE);
Matthias Clasen's avatar
Matthias Clasen committed
1028

1029 1030 1031 1032 1033
  if (text_len < 0)
    text_len = strlen (text);

  if (text_len == 0)
    return TRUE;
Matthias Clasen's avatar
Matthias Clasen committed
1034

1035
  context->parsing = TRUE;
Matthias Clasen's avatar
Matthias Clasen committed
1036

1037 1038 1039

  context->current_text = text;
  context->current_text_len = text_len;
1040
  context->current_text_end = context->current_text + text_len;
1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075
  context->iter = context->current_text;
  context->start = context->iter;

  if (context->current_text_len == 0)
    goto finished;

  while (context->iter != context->current_text_end)
    {
      switch (context->state)
        {
        case STATE_START:
          /* Possible next state: AFTER_OPEN_ANGLE */

          g_assert (context->tag_stack == NULL);

          /* whitespace is ignored outside of any elements */
          skip_spaces (context);

          if (context->iter != context->current_text_end)
            {
              if (*context->iter == '<')
                {
                  /* Move after the open angle */
                  advance_char (context);

                  context->state = STATE_AFTER_OPEN_ANGLE;

                  /* this could start a passthrough */
                  context->start = context->iter;

                  /* document is now non-empty */
                  context->document_empty = FALSE;
                }
              else
                {
1076 1077 1078 1079
                  set_error_literal (context,
                                     error,
                                     G_MARKUP_ERROR_PARSE,
                                     _("Document must begin with an element (e.g. <book>)"));
1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094
                }
            }
          break;

        case STATE_AFTER_OPEN_ANGLE:
          /* Possible next states: INSIDE_OPEN_TAG_NAME,
           *  AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH
           */
          if (*context->iter == '?' ||
              *context->iter == '!')
            {
              /* include < in the passthrough */
              const gchar *openangle = "<";
              add_to_partial (context, openangle, openangle + 1);
              context->start = context->iter;
Matthias Clasen's avatar
Matthias Clasen committed
1095
              context->balance = 1;
1096 1097 1098 1099 1100 1101 1102 1103 1104
              context->state = STATE_INSIDE_PASSTHROUGH;
            }
          else if (*context->iter == '/')
            {
              /* move after it */
              advance_char (context);

              context->state = STATE_AFTER_CLOSE_TAG_SLASH;
            }
1105
          else if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
1106 1107 1108 1109 1110 1111 1112 1113
            {
              context->state = STATE_INSIDE_OPEN_TAG_NAME;

              /* start of tag name */
              context->start = context->iter;
            }
          else
            {
Matthias Clasen's avatar
Matthias Clasen committed
1114 1115
              gchar buf[8];

1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152
              set_error (context,
                         error,
                         G_MARKUP_ERROR_PARSE,
                         _("'%s' is not a valid character following "
                           "a '<' character; it may not begin an "
                           "element name"),
                         utf8_str (context->iter, buf));
            }
          break;

          /* The AFTER_CLOSE_ANGLE state is actually sort of
           * broken, because it doesn't correspond to a range
           * of characters in the input stream as the others do,
           * and thus makes things harder to conceptualize
           */
        case STATE_AFTER_CLOSE_ANGLE:
          /* Possible next states: INSIDE_TEXT, STATE_START */
          if (context->tag_stack == NULL)
            {
              context->start = NULL;
              context->state = STATE_START;
            }
          else
            {
              context->start = context->iter;
              context->state = STATE_INSIDE_TEXT;
            }
          break;

        case STATE_AFTER_ELISION_SLASH:
          /* Possible next state: AFTER_CLOSE_ANGLE */

          {
            /* We need to pop the tag stack and call the end_element
             * function, since this is the close tag
             */
            GError *tmp_error = NULL;
Matthias Clasen's avatar
Matthias Clasen committed
1153

1154 1155
            g_assert (context->tag_stack != NULL);

1156 1157
            possibly_finish_subparser (context);

1158 1159 1160
            tmp_error = NULL;
            if (context->parser->end_element)
              (* context->parser->end_element) (context,
Matthias Clasen's avatar
Matthias Clasen committed
1161
                                                current_element (context),
1162 1163
                                                context->user_data,
                                                &tmp_error);
1164 1165

            ensure_no_outstanding_subparser (context);
Matthias Clasen's avatar
Matthias Clasen committed
1166

1167 1168 1169 1170
            if (tmp_error)
              {
                mark_error (context, tmp_error);
                g_propagate_error (error, tmp_error);
Matthias Clasen's avatar
Matthias Clasen committed
1171
              }
1172 1173 1174 1175 1176 1177 1178 1179 1180 1181
            else
              {
                if (*context->iter == '>')
                  {
                    /* move after the close angle */
                    advance_char (context);
                    context->state = STATE_AFTER_CLOSE_ANGLE;
                  }
                else
                  {
Matthias Clasen's avatar
Matthias Clasen committed
1182 1183
                    gchar buf[8];

1184 1185 1186 1187
                    set_error (context,
                               error,
                               G_MARKUP_ERROR_PARSE,
                               _("Odd character '%s', expected a '>' character "
Matthias Clasen's avatar
Matthias Clasen committed
1188
                                 "to end the empty-element tag '%s'"),
1189 1190 1191 1192
                               utf8_str (context->iter, buf),
                               current_element (context));
                  }
              }
Matthias Clasen's avatar
Matthias Clasen committed
1193
            pop_tag (context);
1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219
          }
          break;

        case STATE_INSIDE_OPEN_TAG_NAME:
          /* Possible next states: BETWEEN_ATTRIBUTES */

          /* if there's a partial chunk then it's the first part of the
           * tag name. If there's a context->start then it's the start
           * of the tag name in current_text, the partial chunk goes
           * before that start though.
           */
          advance_to_name_end (context);

          if (context->iter == context->current_text_end)
            {
              /* The name hasn't necessarily ended. Merge with
               * partial chunk, leave state unchanged.
               */
              add_to_partial (context, context->start, context->iter);
            }
          else
            {
              /* The name has ended. Combine it with the partial chunk
               * if any; push it on the stack; enter next state.
               */
              add_to_partial (context, context->start, context->iter);
Matthias Clasen's avatar
Matthias Clasen committed
1220
              push_partial_as_tag (context);
1221 1222 1223 1224 1225 1226 1227

              context->state = STATE_BETWEEN_ATTRIBUTES;
              context->start = NULL;
            }
          break;

        case STATE_INSIDE_ATTRIBUTE_NAME:
1228 1229 1230
          /* Possible next states: AFTER_ATTRIBUTE_NAME */

          advance_to_name_end (context);
Matthias Clasen's avatar
Matthias Clasen committed
1231
          add_to_partial (context, context->start, context->iter);
1232 1233 1234 1235 1236

          /* read the full name, if we enter the equals sign state
           * then add the attribute to the list (without the value),
           * otherwise store a partial chunk to be prepended later.
           */
Matthias Clasen's avatar