gmarkup.c 85 KB
Newer Older
1 2
/* gmarkup.c - Simple XML-like parser
 *
 *  Copyright 2000, 2003 Red Hat, Inc.
 *  Copyright 2007, 2008 Ryan Lortie <desrt@desrt.ca>
 *
 * GLib is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * GLib is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GLib; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 *   Boston, MA 02111-1307, USA.
 */

22
#include "config.h"
23

24
#include <stdarg.h>
25 26 27 28 29
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

30 31 32 33 34 35
#include "gmarkup.h"

#include "galloca.h"
#include "gstrfuncs.h"
#include "gstring.h"
#include "gtestutils.h"
Owen Taylor's avatar
Owen Taylor committed
36
#include "glibintl.h"
37

Matthias Clasen's avatar
Matthias Clasen committed
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
/**
 * SECTION:markup
 * @Title: Simple XML Subset Parser
 * @Short_description: parses a subset of XML
 * @See_also: <ulink url="http://www.w3.org/TR/REC-xml/">XML
 *     Specification</ulink>
 *
 * The "GMarkup" parser is intended to parse a simple markup format
 * that's a subset of XML. This is a small, efficient, easy-to-use
 * parser. It should not be used if you expect to interoperate with
 * other applications generating full-scale XML. However, it's very
 * useful for application data files, config files, etc. where you
 * know your application will be the only one writing the file.
 * Full-scale XML parsers should be able to parse the subset used by
 * GMarkup, so you can easily migrate to full-scale XML at a later
 * time if the need arises.
 *
 * GMarkup is not guaranteed to signal an error on all invalid XML;
 * the parser may accept documents that an XML parser would not.
 * However, XML documents which are not well-formed<footnote
 * id="wellformed">Being wellformed is a weaker condition than being
 * valid. See the <ulink url="http://www.w3.org/TR/REC-xml/">XML
 * specification</ulink> for definitions of these terms.</footnote>
 * are not considered valid GMarkup documents.
 *
 * Simplifications to XML include:
 * <itemizedlist>
 * <listitem>Only UTF-8 encoding is allowed</listitem>
 * <listitem>No user-defined entities</listitem>
 * <listitem>Processing instructions, comments and the doctype declaration
 * are "passed through" but are not interpreted in any way</listitem>
 * <listitem>No DTD or validation.</listitem>
 * </itemizedlist>
 *
 * The markup format does support:
 * <itemizedlist>
 * <listitem>Elements</listitem>
 * <listitem>Attributes</listitem>
 * <listitem>5 standard entities:
 *   <literal>&amp;amp; &amp;lt; &amp;gt; &amp;quot; &amp;apos;</literal>
 * </listitem>
 * <listitem>Character references</listitem>
 * <listitem>Sections marked as CDATA</listitem>
 * </itemizedlist>
 */

84
GQuark
85
g_markup_error_quark (void)
86
{
87
  return g_quark_from_static_string ("g-markup-error-quark");
88 89 90 91 92 93 94 95 96 97
}

/* States of the parser's state machine.  Each name says what the
 * parser has just consumed or what construct it is currently inside.
 * STATE_ERROR is entered by mark_error().
 */
typedef enum
{
  STATE_START,
  STATE_AFTER_OPEN_ANGLE,
  STATE_AFTER_CLOSE_ANGLE,
  STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */
  STATE_INSIDE_OPEN_TAG_NAME,
  STATE_INSIDE_ATTRIBUTE_NAME,
  STATE_AFTER_ATTRIBUTE_NAME,
  STATE_BETWEEN_ATTRIBUTES,
  STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
  STATE_INSIDE_ATTRIBUTE_VALUE_SQ, /* value delimited by single quotes */
  STATE_INSIDE_ATTRIBUTE_VALUE_DQ, /* value delimited by double quotes */
  STATE_INSIDE_TEXT,
  STATE_AFTER_CLOSE_TAG_SLASH,
  STATE_INSIDE_CLOSE_TAG_NAME,
  STATE_AFTER_CLOSE_TAG_NAME,
  STATE_INSIDE_PASSTHROUGH,
  STATE_ERROR
} GMarkupParseState;

111 112 113 114 115 116 117
/* One saved frame of the subparser stack.  pop_subparser_stack()
 * copies these three values back into the parse context
 * (subparser_element, parser and user_data respectively).
 */
typedef struct
{
  const char *prev_element;           /* restored into context->subparser_element */
  const GMarkupParser *prev_parser;   /* restored into context->parser */
  gpointer prev_user_data;            /* restored into context->user_data */
} GMarkupRecursionTracker;

118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
struct _GMarkupParseContext
{
  const GMarkupParser *parser;

  GMarkupParseFlags flags;

  gint line_number;
  gint char_number;

  gpointer user_data;
  GDestroyNotify dnotify;

  /* A piece of character data or an element that
   * hasn't "ended" yet so we haven't yet called
   * the callback for it.
   */
  GString *partial_chunk;
135
  GSList *spare_chunks;
136 137 138

  GMarkupParseState state;
  GSList *tag_stack;
139 140 141 142 143
  GSList *tag_stack_gstr;
  GSList *spare_list_nodes;

  GString **attr_names;
  GString **attr_values;
144 145
  gint cur_attr;
  gint alloc_attrs;
146 147

  const gchar *current_text;
Matthias Clasen's avatar
Matthias Clasen committed
148
  gssize       current_text_len;
149 150 151 152 153 154 155 156 157
  const gchar *current_text_end;

  /* used to save the start of the last interesting thingy */
  const gchar *start;

  const gchar *iter;

  guint document_empty : 1;
  guint parsing : 1;
158
  guint awaiting_pop : 1;
Matthias Clasen's avatar
Matthias Clasen committed
159
  gint balance;
160 161 162 163 164

  /* subparser support */
  GSList *subparser_stack; /* (GMarkupRecursionTracker *) */
  const char *subparser_element;
  gpointer held_user_data;
165 166
};

167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
/*
 * Allocation helpers: list nodes and string chunks are recycled
 * through per-context spare lists instead of being freed and
 * re-allocated each time.
 */

/* Returns a single GSList node carrying @data.  A node is taken from
 * context->spare_list_nodes when one is available, otherwise a new one
 * is allocated.
 */
static GSList *
get_list_node (GMarkupParseContext *context, gpointer data)
{
  GSList *recycled = context->spare_list_nodes;

  if (recycled == NULL)
    {
      recycled = g_slist_alloc ();
    }
  else
    {
      /* detach the head of the spare list and hand it out */
      context->spare_list_nodes = g_slist_remove_link (context->spare_list_nodes, recycled);
    }

  recycled->data = data;
  return recycled;
}

/* Clears @node's payload and puts the node at the front of the
 * context's spare-node list so get_list_node() can reuse it.
 */
static void
free_list_node (GMarkupParseContext *context, GSList *node)
{
  GSList *spares = context->spare_list_nodes;

  node->data = NULL;
  context->spare_list_nodes = g_slist_concat (node, spares);
}

/* Resets @string to the empty string by writing the terminator at
 * index 0 and zeroing the length; the buffer itself is left alone.
 */
static inline void
string_blank (GString *string)
{
  string->len = 0;
  string->str[0] = '\0';
}

200 201 202 203 204
/**
 * g_markup_parse_context_new:
 * @parser: a #GMarkupParser
 * @flags: one or more #GMarkupParseFlags
 * @user_data: user data to pass to #GMarkupParser functions
Matthias Clasen's avatar
Matthias Clasen committed
205 206 207
 * @user_data_dnotify: user data destroy notifier called when
 *     the parse context is freed
 *
208 209 210
 * Creates a new parse context. A parse context is used to parse
 * marked-up documents. You can feed any number of documents into
 * a context, as long as no errors occur; once an error occurs,
Matthias Clasen's avatar
Matthias Clasen committed
211 212 213
 * the parse context can't continue to parse text (you have to
 * free it and create a new parse context).
 *
214 215
 * Return value: a new #GMarkupParseContext
 **/
216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
GMarkupParseContext *
g_markup_parse_context_new (const GMarkupParser *parser,
                            GMarkupParseFlags    flags,
                            gpointer             user_data,
                            GDestroyNotify       user_data_dnotify)
{
  GMarkupParseContext *context;

  g_return_val_if_fail (parser != NULL, NULL);

  context = g_new (GMarkupParseContext, 1);

  context->parser = parser;
  context->flags = flags;
  context->user_data = user_data;
  context->dnotify = user_data_dnotify;

  context->line_number = 1;
  context->char_number = 1;

  context->partial_chunk = NULL;
237 238
  context->spare_chunks = NULL;
  context->spare_list_nodes = NULL;
239 240 241

  context->state = STATE_START;
  context->tag_stack = NULL;
242
  context->tag_stack_gstr = NULL;
243 244 245 246
  context->attr_names = NULL;
  context->attr_values = NULL;
  context->cur_attr = -1;
  context->alloc_attrs = 0;
247 248 249 250 251 252 253 254 255 256 257

  context->current_text = NULL;
  context->current_text_len = -1;
  context->current_text_end = NULL;

  context->start = NULL;
  context->iter = NULL;

  context->document_empty = TRUE;
  context->parsing = FALSE;

258 259 260 261 262 263 264
  context->awaiting_pop = FALSE;
  context->subparser_stack = NULL;
  context->subparser_element = NULL;

  /* this is only looked at if awaiting_pop = TRUE.  initialise anyway. */
  context->held_user_data = NULL;

Matthias Clasen's avatar
Matthias Clasen committed
265 266
  context->balance = 0;

267 268 269
  return context;
}

270
static void
Matthias Clasen's avatar
Matthias Clasen committed
271
string_full_free (gpointer ptr)
272 273 274 275 276 277
{
  g_string_free (ptr, TRUE);
}

/* Forward declaration: g_markup_parse_context_free() calls this before
 * its definition appears in the file. */
static void clear_attributes (GMarkupParseContext *context);

278 279 280
/**
 * g_markup_parse_context_free:
 * @context: a #GMarkupParseContext
Matthias Clasen's avatar
Matthias Clasen committed
281 282 283 284 285 286
 *
 * Frees a #GMarkupParseContext.
 *
 * This function can't be called from inside one of the
 * #GMarkupParser functions or while a subparser is pushed.
 */
287 288 289 290 291
void
g_markup_parse_context_free (GMarkupParseContext *context)
{
  g_return_if_fail (context != NULL);
  g_return_if_fail (!context->parsing);
292 293
  g_return_if_fail (!context->subparser_stack);
  g_return_if_fail (!context->awaiting_pop);
294 295 296 297

  if (context->dnotify)
    (* context->dnotify) (context->user_data);

298 299 300
  clear_attributes (context);
  g_free (context->attr_names);
  g_free (context->attr_values);
301

Matthias Clasen's avatar
Matthias Clasen committed
302
  g_slist_free_full (context->tag_stack_gstr, string_full_free);
303 304
  g_slist_free (context->tag_stack);

Matthias Clasen's avatar
Matthias Clasen committed
305
  g_slist_free_full (context->spare_chunks, string_full_free);
306 307
  g_slist_free (context->spare_list_nodes);

308 309 310 311 312 313
  if (context->partial_chunk)
    g_string_free (context->partial_chunk, TRUE);

  g_free (context);
}

314 315
/* Forward declaration: mark_error() calls this before its definition
 * appears in the file. */
static void pop_subparser_stack (GMarkupParseContext *context);

316 317 318 319 320 321 322 323
/* Puts @context into STATE_ERROR and reports @error to the current
 * parser's error callback (if it has one).  Any pushed subparsers are
 * then popped one by one, and the error callback of each parser that
 * becomes current is invoked too.
 */
static void
mark_error (GMarkupParseContext *context,
            GError              *error)
{
  context->state = STATE_ERROR;

  if (context->parser->error)
    (*context->parser->error) (context, error, context->user_data);

  /* report the error all the way up to free all the user-data */
  while (context->subparser_stack)
    {
      pop_subparser_stack (context);
      context->awaiting_pop = FALSE; /* already been freed */

      if (context->parser->error)
        (*context->parser->error) (context, error, context->user_data);
    }
}

Matthias Clasen's avatar
Matthias Clasen committed
336 337 338 339 340 341
/* Prototype carrying the printf-format attribute: @format is the 4th
 * parameter and the variadic arguments start at the 5th. */
static void
set_error (GMarkupParseContext  *context,
           GError              **error,
           GMarkupError          code,
           const gchar          *format,
           ...) G_GNUC_PRINTF (4, 5);
342

343
static void
Matthias Clasen's avatar
Matthias Clasen committed
344 345 346 347
set_error_literal (GMarkupParseContext  *context,
                   GError              **error,
                   GMarkupError          code,
                   const gchar          *message)
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362
{
  GError *tmp_error;

  tmp_error = g_error_new_literal (G_MARKUP_ERROR, code, message);

  g_prefix_error (&tmp_error,
                  _("Error on line %d char %d: "),
                  context->line_number,
                  context->char_number);

  mark_error (context, tmp_error);

  g_propagate_error (error, tmp_error);
}

363
static void
Matthias Clasen's avatar
Matthias Clasen committed
364 365 366 367
set_error (GMarkupParseContext  *context,
           GError              **error,
           GMarkupError          code,
           const gchar          *format,
368 369 370
           ...)
{
  gchar *s;
371
  gchar *s_valid;
372 373 374 375 376 377
  va_list args;

  va_start (args, format);
  s = g_strdup_vprintf (format, args);
  va_end (args);

Matthias Clasen's avatar
Matthias Clasen committed
378 379 380
  /* Make sure that the GError message is valid UTF-8
   * even if it is complaining about invalid UTF-8 in the markup
   */
381
  s_valid = _g_utf8_make_valid (s);
382
  set_error_literal (context, error, code, s);
383

384
  g_free (s);
385
  g_free (s_valid);
386 387
}

388 389 390 391 392 393 394 395 396 397 398 399 400 401 402
/* Forwards an error raised elsewhere (@src) to @dest.  When the
 * context was created with G_MARKUP_PREFIX_ERROR_POSITION the message
 * is first prefixed with the current line/char position.  The context
 * is marked as failed before the error is handed over.
 */
static void
propagate_error (GMarkupParseContext  *context,
                 GError              **dest,
                 GError               *src)
{
  if (context->flags & G_MARKUP_PREFIX_ERROR_POSITION)
    {
      gint line = context->line_number;
      gint column = context->char_number;

      g_prefix_error (&src, _("Error on line %d char %d: "), line, column);
    }

  mark_error (context, src);

  g_propagate_error (dest, src);
}
403 404 405 406

/* Non-zero when @c is one of '=', '/', '>' or a plain space.
 * Note that @c may be evaluated more than once. */
#define IS_COMMON_NAME_END_CHAR(c) \
  ('=' == (c) ||                   \
   '/' == (c) ||                   \
   '>' == (c) ||                   \
   ' ' == (c))

407
static gboolean
Matthias Clasen's avatar
Matthias Clasen committed
408 409 410
slow_name_validate (GMarkupParseContext  *context,
                    const gchar          *name,
                    GError              **error)
411
{
Matthias Clasen's avatar
Matthias Clasen committed
412
  const gchar *p = name;
413 414 415 416

  if (!g_utf8_validate (name, strlen (name), NULL))
    {
      set_error (context, error, G_MARKUP_ERROR_BAD_UTF8,
Matthias Clasen's avatar
Matthias Clasen committed
417
                 _("Invalid UTF-8 encoded text in name - not valid '%s'"), name);
418 419 420 421
      return FALSE;
    }

  if (!(g_ascii_isalpha (*p) ||
Matthias Clasen's avatar
Matthias Clasen committed
422 423 424 425
        (!IS_COMMON_NAME_END_CHAR (*p) &&
         (*p == '_' ||
          *p == ':' ||
          g_unichar_isalpha (g_utf8_get_char (p))))))
426 427
    {
      set_error (context, error, G_MARKUP_ERROR_PARSE,
Matthias Clasen's avatar
Matthias Clasen committed
428
                 _("'%s' is not a valid name "), name);
429 430 431 432 433 434 435
      return FALSE;
    }

  for (p = g_utf8_next_char (name); *p != '\0'; p = g_utf8_next_char (p))
    {
      /* is_name_char */
      if (!(g_ascii_isalnum (*p) ||
Matthias Clasen's avatar
Matthias Clasen committed
436 437 438 439 440 441 442 443 444 445 446
            (!IS_COMMON_NAME_END_CHAR (*p) &&
             (*p == '.' ||
              *p == '-' ||
              *p == '_' ||
              *p == ':' ||
              g_unichar_isalpha (g_utf8_get_char (p))))))
        {
          set_error (context, error, G_MARKUP_ERROR_PARSE,
                     _("'%s' is not a valid name: '%c' "), name, *p);
          return FALSE;
        }
447 448
    }
  return TRUE;
449 450
}

Matthias Clasen's avatar
Matthias Clasen committed
451
/*
452 453
 * Use me for elements, attributes etc.
 */
454
static gboolean
Matthias Clasen's avatar
Matthias Clasen committed
455 456 457
name_validate (GMarkupParseContext  *context,
               const gchar          *name,
               GError              **error)
458
{
459 460 461 462 463 464
  char mask;
  const char *p;

  /* name start char */
  p = name;
  if (G_UNLIKELY (IS_COMMON_NAME_END_CHAR (*p) ||
Matthias Clasen's avatar
Matthias Clasen committed
465
                  !(g_ascii_isalpha (*p) || *p == '_' || *p == ':')))
466
    goto slow_validate;
Matthias Clasen's avatar
Matthias Clasen committed
467

468 469 470 471 472 473
  for (mask = *p++; *p != '\0'; p++)
    {
      mask |= *p;

      /* is_name_char */
      if (G_UNLIKELY (!(g_ascii_isalnum (*p) ||
Matthias Clasen's avatar
Matthias Clasen committed
474 475 476 477 478 479
                        (!IS_COMMON_NAME_END_CHAR (*p) &&
                         (*p == '.' ||
                          *p == '-' ||
                          *p == '_' ||
                          *p == ':')))))
        goto slow_validate;
480 481 482 483 484 485 486 487 488
    }

  if (mask & 0x80) /* un-common / non-ascii */
    goto slow_validate;

  return TRUE;

 slow_validate:
  return slow_name_validate (context, name, error);
489 490
}

491
static gboolean
Matthias Clasen's avatar
Matthias Clasen committed
492 493 494 495
text_validate (GMarkupParseContext  *context,
               const gchar          *p,
               gint                  len,
               GError              **error)
496 497 498 499
{
  if (!g_utf8_validate (p, len, NULL))
    {
      set_error (context, error, G_MARKUP_ERROR_BAD_UTF8,
Matthias Clasen's avatar
Matthias Clasen committed
500
                 _("Invalid UTF-8 encoded text in name - not valid '%s'"), p);
501 502 503 504 505
      return FALSE;
    }
  else
    return TRUE;
}
506 507 508 509 510

/* Writes the UTF-8 encoding of @c into @buf and returns @buf.
 * The first 8 bytes of @buf are zeroed beforehand, so @buf must be at
 * least 8 bytes long; the zero fill is what leaves the result
 * NUL-terminated.
 */
static gchar*
char_str (gunichar c,
          gchar   *buf)
{
  memset (buf, 0, 8);
  g_unichar_to_utf8 (c, buf);
  return buf;
}

/* Copies the first UTF-8 character found at @utf8 into @buf as a
 * NUL-terminated string (see char_str() for the size requirement on
 * @buf) and returns @buf.
 */
static gchar*
utf8_str (const gchar *utf8,
          gchar       *buf)
{
  gunichar first = g_utf8_get_char (utf8);

  char_str (first, buf);
  return buf;
}

static void
Matthias Clasen's avatar
Matthias Clasen committed
525 526 527 528 529
set_unescape_error (GMarkupParseContext  *context,
                    GError              **error,
                    const gchar          *remaining_text,
                    GMarkupError          code,
                    const gchar          *format,
530 531 532 533 534 535 536 537 538 539
                    ...)
{
  GError *tmp_error;
  gchar *s;
  va_list args;
  gint remaining_newlines;
  const gchar *p;

  remaining_newlines = 0;
  p = remaining_text;
540
  while (*p != '\0')
541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563
    {
      if (*p == '\n')
        ++remaining_newlines;
      ++p;
    }

  va_start (args, format);
  s = g_strdup_vprintf (format, args);
  va_end (args);

  tmp_error = g_error_new (G_MARKUP_ERROR,
                           code,
                           _("Error on line %d: %s"),
                           context->line_number - remaining_newlines,
                           s);

  g_free (s);

  mark_error (context, tmp_error);

  g_propagate_error (error, tmp_error);
}

564 565 566 567 568
/*
 * re-write the GString in-place, unescaping anything that escaped.
 * most XML does not contain entities, or escaping.
 */
static gboolean
Matthias Clasen's avatar
Matthias Clasen committed
569 570 571 572
unescape_gstring_inplace (GMarkupParseContext  *context,
                          GString              *string,
                          gboolean             *is_ascii,
                          GError              **error)
573
{
574 575 576
  char mask, *to;
  int line_num = 1;
  const char *from;
577
  gboolean normalize_attribute;
578

579 580 581 582 583
  *is_ascii = FALSE;

  /* are we unescaping an attribute or not ? */
  if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
      context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
584 585 586 587
    normalize_attribute = TRUE;
  else
    normalize_attribute = FALSE;

588 589 590 591 592 593
  /*
   * Meeks' theorum: unescaping can only shrink text.
   * for &lt; etc. this is obvious, for &#xffff; more
   * thought is required, but this is patently so.
   */
  mask = 0;
Matthias Clasen's avatar
Matthias Clasen committed
594
  for (from = to = string->str; *from != '\0'; from++, to++)
595
    {
596 597 598 599
      *to = *from;

      mask |= *to;
      if (*to == '\n')
Matthias Clasen's avatar
Matthias Clasen committed
600
        line_num++;
601
      if (normalize_attribute && (*to == '\t' || *to == '\n'))
Matthias Clasen's avatar
Matthias Clasen committed
602
        *to = ' ';
603
      if (*to == '\r')
Matthias Clasen's avatar
Matthias Clasen committed
604 605 606 607 608
        {
          *to = normalize_attribute ? ' ' : '\n';
          if (from[1] == '\n')
            from++;
        }
609
      if (*from == '&')
Matthias Clasen's avatar
Matthias Clasen committed
610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680
        {
          from++;
          if (*from == '#')
            {
              gboolean is_hex = FALSE;
              gulong l;
              gchar *end = NULL;

              from++;

              if (*from == 'x')
                {
                  is_hex = TRUE;
                  from++;
                }

              /* digit is between start and p */
              errno = 0;
              if (is_hex)
                l = strtoul (from, &end, 16);
              else
                l = strtoul (from, &end, 10);

              if (end == from || errno != 0)
                {
                  set_unescape_error (context, error,
                                      from, G_MARKUP_ERROR_PARSE,
                                      _("Failed to parse '%-.*s', which "
                                        "should have been a digit "
                                        "inside a character reference "
                                        "(&#234; for example) - perhaps "
                                        "the digit is too large"),
                                      end - from, from);
                  return FALSE;
                }
              else if (*end != ';')
                {
                  set_unescape_error (context, error,
                                      from, G_MARKUP_ERROR_PARSE,
                                      _("Character reference did not end with a "
                                        "semicolon; "
                                        "most likely you used an ampersand "
                                        "character without intending to start "
                                        "an entity - escape ampersand as &amp;"));
                  return FALSE;
                }
              else
                {
                  /* characters XML 1.1 permits */
                  if ((0 < l && l <= 0xD7FF) ||
                      (0xE000 <= l && l <= 0xFFFD) ||
                      (0x10000 <= l && l <= 0x10FFFF))
                    {
                      gchar buf[8];
                      char_str (l, buf);
                      strcpy (to, buf);
                      to += strlen (buf) - 1;
                      from = end;
                      if (l >= 0x80) /* not ascii */
                        mask |= 0x80;
                    }
                  else
                    {
                      set_unescape_error (context, error,
                                          from, G_MARKUP_ERROR_PARSE,
                                          _("Character reference '%-.*s' does not "
                                            "encode a permitted character"),
                                          end - from, from);
                      return FALSE;
                    }
                }
681 682
            }

683
          else if (strncmp (from, "lt;", 3) == 0)
Matthias Clasen's avatar
Matthias Clasen committed
684 685 686 687
            {
              *to = '<';
              from += 2;
            }
688
          else if (strncmp (from, "gt;", 3) == 0)
Matthias Clasen's avatar
Matthias Clasen committed
689 690 691 692
            {
              *to = '>';
              from += 2;
            }
693
          else if (strncmp (from, "amp;", 4) == 0)
Matthias Clasen's avatar
Matthias Clasen committed
694 695 696 697
            {
              *to = '&';
              from += 3;
            }
698
          else if (strncmp (from, "quot;", 5) == 0)
Matthias Clasen's avatar
Matthias Clasen committed
699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733
            {
              *to = '"';
              from += 4;
            }
          else if (strncmp (from, "apos;", 5) == 0)
            {
              *to = '\'';
              from += 4;
            }
          else
            {
              if (*from == ';')
                set_unescape_error (context, error,
                                    from, G_MARKUP_ERROR_PARSE,
                                    _("Empty entity '&;' seen; valid "
                                      "entities are: &amp; &quot; &lt; &gt; &apos;"));
              else
                {
                  const char *end = strchr (from, ';');
                  if (end)
                    set_unescape_error (context, error,
                                        from, G_MARKUP_ERROR_PARSE,
                                        _("Entity name '%-.*s' is not known"),
                                        end-from, from);
                  else
                    set_unescape_error (context, error,
                                        from, G_MARKUP_ERROR_PARSE,
                                        _("Entity did not end with a semicolon; "
                                          "most likely you used an ampersand "
                                          "character without intending to start "
                                          "an entity - escape ampersand as &amp;"));
                }
              return FALSE;
            }
        }
734 735
    }

736 737 738
  g_assert (to - string->str <= string->len);
  if (to - string->str != string->len)
    g_string_truncate (string, to - string->str);
739

740
  *is_ascii = !(mask & 0x80);
741

742
  return TRUE;
743 744
}

745
static inline gboolean
746
advance_char (GMarkupParseContext *context)
Matthias Clasen's avatar
Matthias Clasen committed
747
{
748 749
  context->iter++;
  context->char_number++;
750

751
  if (G_UNLIKELY (context->iter == context->current_text_end))
752
      return FALSE;
753 754

  else if (G_UNLIKELY (*context->iter == '\n'))
755
    {
756
      context->line_number++;
757 758
      context->char_number = 1;
    }
Matthias Clasen's avatar
Matthias Clasen committed
759

760
  return TRUE;
761 762
}

763
static inline gboolean
764 765 766 767 768
xml_isspace (char c)
{
  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

769 770 771 772 773
/* Advances the iterator past whitespace.  Stops at the first
 * non-whitespace byte, or when advance_char() reports the end of the
 * current text.
 */
static void
skip_spaces (GMarkupParseContext *context)
{
  while (xml_isspace (*context->iter))
    {
      if (!advance_char (context))
        break;
    }
}

/* Advances the iterator until it rests on a character that ends a
 * name ('=', '/', '>', or any whitespace), or until advance_char()
 * reports the end of the current text.
 */
static void
advance_to_name_end (GMarkupParseContext *context)
{
  for (;;)
    {
      if (IS_COMMON_NAME_END_CHAR (*(context->iter)))
        return;
      if (xml_isspace (*(context->iter)))
        return;
      if (!advance_char (context))
        return;
    }
}

793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808
/* Disposes of a chunk the parser no longer needs.  NULL is ignored.
 * Strings whose buffer has grown beyond 256 bytes are freed outright;
 * smaller ones are blanked and pushed onto context->spare_chunks for
 * reuse by add_to_partial().
 */
static void
release_chunk (GMarkupParseContext *context, GString *str)
{
  if (str == NULL)
    return;

  if (str->allocated_len > 256)
    {
      /* large strings are unusual and worth freeing */
      g_string_free (str, TRUE);
    }
  else
    {
      GSList *holder;

      string_blank (str);
      holder = get_list_node (context, str);
      context->spare_chunks = g_slist_concat (holder, context->spare_chunks);
    }
}

809 810 811 812 813 814
/* Appends the bytes in [@text_start, @text_end) to the context's
 * partial chunk.  If there is no partial chunk yet, one is taken from
 * the spare-chunk pool or, failing that, newly allocated with room
 * for at least 28 bytes.
 */
static void
add_to_partial (GMarkupParseContext *context,
                const gchar         *text_start,
                const gchar         *text_end)
{
  if (context->partial_chunk == NULL)
    { /* allocate a new chunk to parse into */

      if (context->spare_chunks != NULL)
        {
          /* reuse a pooled string and recycle the node that held it */
          GSList *node = context->spare_chunks;
          context->spare_chunks = g_slist_remove_link (context->spare_chunks, node);
          context->partial_chunk = node->data;
          free_list_node (context, node);
        }
      else
        context->partial_chunk = g_string_sized_new (MAX (28, text_end - text_start));
    }

  if (text_start != text_end)
    g_string_insert_len (context->partial_chunk, -1,
                         text_start, text_end - text_start);
}

833
static inline void
834
truncate_partial (GMarkupParseContext *context)
835 836
{
  if (context->partial_chunk != NULL)
837
    string_blank (context->partial_chunk);
838 839
}

840
static inline const gchar*
841 842 843 844 845
current_element (GMarkupParseContext *context)
{
  return context->tag_stack->data;
}

846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866
/* Removes the top frame of the subparser stack.  The subparser's
 * user_data is parked in held_user_data with awaiting_pop set, and
 * the parser, user_data and subparser_element saved in the frame are
 * restored.  The stack must not be empty.
 */
static void
pop_subparser_stack (GMarkupParseContext *context)
{
  GSList *top;
  GMarkupRecursionTracker *frame;

  g_assert (context->subparser_stack);

  top = context->subparser_stack;
  frame = top->data;

  /* park the outgoing user data before it is overwritten below */
  context->held_user_data = context->user_data;
  context->awaiting_pop = TRUE;

  context->subparser_element = frame->prev_element;
  context->parser = frame->prev_parser;
  context->user_data = frame->prev_user_data;
  g_slice_free (GMarkupRecursionTracker, frame);

  context->subparser_stack = g_slist_delete_link (context->subparser_stack, top);
}

867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890
/* Turns the current partial chunk into the new innermost open tag.
 * The raw character pointer goes onto tag_stack and the owning
 * GString onto tag_stack_gstr; the context then has no partial chunk.
 */
static void
push_partial_as_tag (GMarkupParseContext *context)
{
  GString *name = context->partial_chunk;
  GSList *name_node;
  GSList *gstr_node;

  /* sadly, this is exported by gmarkup_get_element_stack as-is */
  name_node = get_list_node (context, name->str);
  context->tag_stack = g_slist_concat (name_node, context->tag_stack);

  gstr_node = get_list_node (context, name);
  context->tag_stack_gstr = g_slist_concat (gstr_node, context->tag_stack_gstr);

  context->partial_chunk = NULL;
}

/* Removes the innermost open tag from both tag stacks.  The GString
 * holding the name goes through release_chunk() and the two list
 * nodes are returned to the spare-node pool.
 */
static void
pop_tag (GMarkupParseContext *context)
{
  GSList *name_node = context->tag_stack;
  GSList *gstr_node = context->tag_stack_gstr;

  release_chunk (context, gstr_node->data);

  context->tag_stack = g_slist_remove_link (context->tag_stack, name_node);
  context->tag_stack_gstr = g_slist_remove_link (context->tag_stack_gstr, gstr_node);

  free_list_node (context, name_node);
  free_list_node (context, gstr_node);
}

891 892 893 894 895 896 897 898 899 900 901 902
/* Pops the subparser stack when the current element is the one
 * recorded in subparser_element.  The test compares pointers, not
 * string contents.
 */
static void
possibly_finish_subparser (GMarkupParseContext *context)
{
  if (current_element (context) != context->subparser_element)
    return;

  pop_subparser_stack (context);
}

/* Emits a critical warning if a popped subparser's user data is still
 * held by the context (awaiting_pop set), then clears the held
 * pointer and the flag regardless.
 */
static void
ensure_no_outstanding_subparser (GMarkupParseContext *context)
{
  if (context->awaiting_pop)
    g_critical ("During the first end_element call after invoking a "
                "subparser you must pop the subparser stack and handle "
                "the freeing of the subparser user_data.  This can be "
                "done by calling the end function of the subparser.  "
                "Very probably, your program just leaked memory.");

  /* let valgrind watch the pointer disappear... */
  context->held_user_data = NULL;
  context->awaiting_pop = FALSE;
}

913 914 915
/* Returns the name of the attribute at index cur_attr.  Asserts that
 * at least one attribute has been added. */
static const gchar*
current_attribute (GMarkupParseContext *context)
{
  g_assert (context->cur_attr >= 0);
  return context->attr_names[context->cur_attr]->str;
}

920
static void
921
add_attribute (GMarkupParseContext *context, GString *str)
922 923 924 925
{
  if (context->cur_attr + 2 >= context->alloc_attrs)
    {
      context->alloc_attrs += 5; /* silly magic number */
926 927
      context->attr_names = g_realloc (context->attr_names, sizeof(GString*)*context->alloc_attrs);
      context->attr_values = g_realloc (context->attr_values, sizeof(GString*)*context->alloc_attrs);
928 929
    }
  context->cur_attr++;
930
  context->attr_names[context->cur_attr] = str;
931 932
  context->attr_values[context->cur_attr] = NULL;
  context->attr_names[context->cur_attr+1] = NULL;
933
  context->attr_values[context->cur_attr+1] = NULL;
934 935
}

936 937 938 939 940 941 942 943 944 945 946 947 948
/* Release every collected attribute name and value.
 *
 * Entries are walked from index cur_attr down to 0; each name and
 * value is passed to release_chunk() and its slot set to NULL.  On
 * return cur_attr is -1 and, if the arrays exist, their first entry
 * is NULL.
 */
static void
clear_attributes (GMarkupParseContext *context)
{
  /* Go ahead and free the attributes. */
  while (context->cur_attr >= 0)
    {
      int pos = context->cur_attr;

      release_chunk (context, context->attr_names[pos]);
      release_chunk (context, context->attr_values[pos]);

      context->attr_values[pos] = NULL;
      context->attr_names[pos] = NULL;

      context->cur_attr--;
    }

  g_assert (context->cur_attr == -1);
  g_assert (context->attr_names == NULL || context->attr_names[0] == NULL);
  g_assert (context->attr_values == NULL || context->attr_values[0] == NULL);
}
953 954

/* This has to be a separate function to ensure the alloca's
Matthias Clasen's avatar
Matthias Clasen committed
955 956 957
 * are unwound on exit - otherwise we grow & blow the stack
 * with large documents
 */
958
static inline void
Matthias Clasen's avatar
Matthias Clasen committed
959 960
emit_start_element (GMarkupParseContext  *context,
                    GError              **error)
961 962 963 964 965 966
{
  int i;
  const gchar *start_name;
  const gchar **attr_names;
  const gchar **attr_values;
  GError *tmp_error;
Matthias Clasen's avatar
Matthias Clasen committed
967

968 969 970 971 972 973 974 975 976
  attr_names = g_newa (const gchar *, context->cur_attr + 2);
  attr_values = g_newa (const gchar *, context->cur_attr + 2);
  for (i = 0; i < context->cur_attr + 1; i++)
    {
      attr_names[i] = context->attr_names[i]->str;
      attr_values[i] = context->attr_values[i]->str;
    }
  attr_names[i] = NULL;
  attr_values[i] = NULL;
Matthias Clasen's avatar
Matthias Clasen committed
977

978 979 980
  /* Call user callback for element start */
  tmp_error = NULL;
  start_name = current_element (context);
Matthias Clasen's avatar
Matthias Clasen committed
981

982 983 984
  if (context->parser->start_element &&
      name_validate (context, start_name, error))
    (* context->parser->start_element) (context,
Matthias Clasen's avatar
Matthias Clasen committed
985 986 987 988 989
                                        start_name,
                                        (const gchar **)attr_names,
                                        (const gchar **)attr_values,
                                        context->user_data,
                                        &tmp_error);
990
  clear_attributes (context);
Matthias Clasen's avatar
Matthias Clasen committed
991

992 993 994 995
  if (tmp_error != NULL)
    propagate_error (context, error, tmp_error);
}

996 997 998 999 1000 1001
/**
 * g_markup_parse_context_parse:
 * @context: a #GMarkupParseContext
 * @text: chunk of text to parse
 * @text_len: length of @text in bytes
 * @error: return location for a #GError
Matthias Clasen's avatar
Matthias Clasen committed
1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013
 *
 * Feed some data to the #GMarkupParseContext.
 *
 * The data need not be valid UTF-8; an error will be signaled if
 * it's invalid. The data need not be an entire document; you can
 * feed a document into the parser incrementally, via multiple calls
 * to this function. Typically, as you receive data from a network
 * connection or file, you feed each received chunk of data into this
 * function, aborting the process if an error occurs. Once an error
 * is reported, no further data may be fed to the #GMarkupParseContext;
 * all errors are fatal.
 *
1014
 * Return value: %FALSE if an error occurred, %TRUE on success
Matthias Clasen's avatar
Matthias Clasen committed
1015
 */
1016
gboolean
Matthias Clasen's avatar
Matthias Clasen committed
1017 1018 1019 1020
g_markup_parse_context_parse (GMarkupParseContext  *context,
                              const gchar          *text,
                              gssize                text_len,
                              GError              **error)
1021 1022 1023 1024 1025
{
  g_return_val_if_fail (context != NULL, FALSE);
  g_return_val_if_fail (text != NULL, FALSE);
  g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
  g_return_val_if_fail (!context->parsing, FALSE);
Matthias Clasen's avatar
Matthias Clasen committed
1026

1027 1028 1029 1030 1031
  if (text_len < 0)
    text_len = strlen (text);

  if (text_len == 0)
    return TRUE;
Matthias Clasen's avatar
Matthias Clasen committed
1032

1033
  context->parsing = TRUE;
Matthias Clasen's avatar
Matthias Clasen committed
1034

1035 1036 1037

  context->current_text = text;
  context->current_text_len = text_len;
1038
  context->current_text_end = context->current_text + text_len;
1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073
  context->iter = context->current_text;
  context->start = context->iter;

  if (context->current_text_len == 0)
    goto finished;

  while (context->iter != context->current_text_end)
    {
      switch (context->state)
        {
        case STATE_START:
          /* Possible next state: AFTER_OPEN_ANGLE */

          g_assert (context->tag_stack == NULL);

          /* whitespace is ignored outside of any elements */
          skip_spaces (context);

          if (context->iter != context->current_text_end)
            {
              if (*context->iter == '<')
                {
                  /* Move after the open angle */
                  advance_char (context);

                  context->state = STATE_AFTER_OPEN_ANGLE;

                  /* this could start a passthrough */
                  context->start = context->iter;

                  /* document is now non-empty */
                  context->document_empty = FALSE;
                }
              else
                {
1074 1075 1076 1077
                  set_error_literal (context,
                                     error,
                                     G_MARKUP_ERROR_PARSE,
                                     _("Document must begin with an element (e.g. <book>)"));
1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092
                }
            }
          break;

        case STATE_AFTER_OPEN_ANGLE:
          /* Possible next states: INSIDE_OPEN_TAG_NAME,
           *  AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH
           */
          if (*context->iter == '?' ||
              *context->iter == '!')
            {
              /* include < in the passthrough */
              const gchar *openangle = "<";
              add_to_partial (context, openangle, openangle + 1);
              context->start = context->iter;
Matthias Clasen's avatar
Matthias Clasen committed
1093
              context->balance = 1;
1094 1095 1096 1097 1098 1099 1100 1101 1102
              context->state = STATE_INSIDE_PASSTHROUGH;
            }
          else if (*context->iter == '/')
            {
              /* move after it */
              advance_char (context);

              context->state = STATE_AFTER_CLOSE_TAG_SLASH;
            }
1103
          else if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
1104 1105 1106 1107 1108 1109 1110 1111
            {
              context->state = STATE_INSIDE_OPEN_TAG_NAME;

              /* start of tag name */
              context->start = context->iter;
            }
          else
            {
Matthias Clasen's avatar
Matthias Clasen committed
1112 1113
              gchar buf[8];

1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150
              set_error (context,
                         error,
                         G_MARKUP_ERROR_PARSE,
                         _("'%s' is not a valid character following "
                           "a '<' character; it may not begin an "
                           "element name"),
                         utf8_str (context->iter, buf));
            }
          break;

          /* The AFTER_CLOSE_ANGLE state is actually sort of
           * broken, because it doesn't correspond to a range
           * of characters in the input stream as the others do,
           * and thus makes things harder to conceptualize
           */
        case STATE_AFTER_CLOSE_ANGLE:
          /* Possible next states: INSIDE_TEXT, STATE_START */
          if (context->tag_stack == NULL)
            {
              context->start = NULL;
              context->state = STATE_START;
            }
          else
            {
              context->start = context->iter;
              context->state = STATE_INSIDE_TEXT;
            }
          break;

        case STATE_AFTER_ELISION_SLASH:
          /* Possible next state: AFTER_CLOSE_ANGLE */

          {
            /* We need to pop the tag stack and call the end_element
             * function, since this is the close tag
             */
            GError *tmp_error = NULL;
Matthias Clasen's avatar
Matthias Clasen committed
1151

1152 1153
            g_assert (context->tag_stack != NULL);

1154 1155
            possibly_finish_subparser (context);

1156 1157 1158
            tmp_error = NULL;
            if (context->parser->end_element)
              (* context->parser->end_element) (context,
Matthias Clasen's avatar
Matthias Clasen committed
1159
                                                current_element (context),
1160 1161
                                                context->user_data,
                                                &tmp_error);
1162 1163

            ensure_no_outstanding_subparser (context);
Matthias Clasen's avatar
Matthias Clasen committed
1164

1165 1166 1167 1168
            if (tmp_error)
              {
                mark_error (context, tmp_error);
                g_propagate_error (error, tmp_error);
Matthias Clasen's avatar
Matthias Clasen committed
1169
              }
1170 1171 1172 1173 1174 1175 1176 1177 1178 1179
            else
              {
                if (*context->iter == '>')
                  {
                    /* move after the close angle */
                    advance_char (context);
                    context->state = STATE_AFTER_CLOSE_ANGLE;
                  }
                else
                  {
Matthias Clasen's avatar
Matthias Clasen committed
1180 1181
                    gchar buf[8];

1182 1183 1184 1185
                    set_error (context,
                               error,
                               G_MARKUP_ERROR_PARSE,
                               _("Odd character '%s', expected a '>' character "
Matthias Clasen's avatar
Matthias Clasen committed
1186
                                 "to end the empty-element tag '%s'"),
1187 1188 1189 1190
                               utf8_str (context->iter, buf),
                               current_element (context));
                  }
              }
Matthias Clasen's avatar
Matthias Clasen committed
1191
            pop_tag (context);
1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217
          }
          break;

        case STATE_INSIDE_OPEN_TAG_NAME:
          /* Possible next states: BETWEEN_ATTRIBUTES */

          /* if there's a partial chunk then it's the first part of the
           * tag name. If there's a context->start then it's the start
           * of the tag name in current_text, the partial chunk goes
           * before that start though.
           */
          advance_to_name_end (context);

          if (context->iter == context->current_text_end)
            {
              /* The name hasn't necessarily ended. Merge with
               * partial chunk, leave state unchanged.
               */
              add_to_partial (context, context->start, context->iter);
            }
          else
            {
              /* The name has ended. Combine it with the partial chunk
               * if any; push it on the stack; enter next state.
               */
              add_to_partial (context, context->start, context->iter);
Matthias Clasen's avatar
Matthias Clasen committed
1218
              push_partial_as_tag (context);
1219 1220 1221 1222 1223 1224 1225

              context->state = STATE_BETWEEN_ATTRIBUTES;
              context->start = NULL;
            }
          break;

        case STATE_INSIDE_ATTRIBUTE_NAME:
1226 1227 1228
          /* Possible next states: AFTER_ATTRIBUTE_NAME */

          advance_to_name_end (context);
Matthias Clasen's avatar
Matthias Clasen committed
1229
          add_to_partial (context, context->start, context->iter);
1230 1231 1232 1233 1234

          /* read the full name, if we enter the equals sign state
           * then add the attribute to the list (without the value),
           * otherwise store a partial chunk to be prepended later.
           */
1235
          if (context->iter != context->current_text_end)
Matthias Clasen's avatar
Matthias Clasen committed
1236 1237
            context->state = STATE_AFTER_ATTRIBUTE_NAME;
          break;
1238