/* gmarkup.c - Simple XML-like parser
 *
 *  Copyright 2000, 2003 Red Hat, Inc.
 *  Copyright 2007, 2008 Ryan Lortie <desrt@desrt.ca>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

20
#include "config.h"
21

22
#include <stdarg.h>
23
24
25
26
27
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

28
29
#include "gmarkup.h"

30
#include "gatomic.h"
31
#include "gslice.h"
32
33
34
35
#include "galloca.h"
#include "gstrfuncs.h"
#include "gstring.h"
#include "gtestutils.h"
Owen Taylor's avatar
Owen Taylor committed
36
#include "glibintl.h"
Matthias Clasen's avatar
Matthias Clasen committed
37
#include "gthread.h"
38

Matthias Clasen's avatar
Matthias Clasen committed
39
40
/**
 * SECTION:markup
 * @Title: Simple XML Subset Parser
 * @Short_description: parses a subset of XML
 * @See_also: [XML Specification](http://www.w3.org/TR/REC-xml/)
 *
 * The "GMarkup" parser is intended to parse a simple markup format
 * that's a subset of XML. This is a small, efficient, easy-to-use
 * parser. It should not be used if you expect to interoperate with
 * other applications generating full-scale XML. However, it's very
 * useful for application data files, config files, etc. where you
 * know your application will be the only one writing the file.
 * Full-scale XML parsers should be able to parse the subset used by
 * GMarkup, so you can easily migrate to full-scale XML at a later
 * time if the need arises.
 *
 * GMarkup is not guaranteed to signal an error on all invalid XML;
 * the parser may accept documents that an XML parser would not.
 * However, XML documents which are not well-formed (a weaker
 * condition than being valid; see the
 * [XML specification](http://www.w3.org/TR/REC-xml/)
 * for definitions of these terms) are not considered valid GMarkup
 * documents.
 *
 * Simplifications to XML include:
 *
 * - Only UTF-8 encoding is allowed
 *
 * - No user-defined entities
 *
 * - Processing instructions, comments and the doctype declaration
 *   are "passed through" but are not interpreted in any way
 *
 * - No DTD or validation
 *
 * The markup format does support:
 *
 * - Elements
 *
 * - Attributes
 *
 * - 5 standard entities: &amp; &lt; &gt; &quot; &apos;
 *
 * - Character references
 *
 * - Sections marked as CDATA
 */

87
/* Error-quark definition for this module.
 * NOTE(review): presumably this is what backs the G_MARKUP_ERROR domain
 * passed to g_error_new*() by the error helpers in this file — confirm
 * against gmarkup.h.
 */
G_DEFINE_QUARK (g-markup-error-quark, g_markup_error)
88
89
90
91
92
93
94
95
96

/*
 * GMarkupParseState:
 *
 * The states of the parser; the current one is stored in
 * GMarkupParseContext.state.  A new context begins in STATE_START and
 * mark_error() moves it to STATE_ERROR.
 */
typedef enum
{
  STATE_START = 0,
  STATE_AFTER_OPEN_ANGLE,
  STATE_AFTER_CLOSE_ANGLE,
  /* the slash that obviates need for end element */
  STATE_AFTER_ELISION_SLASH,

  STATE_INSIDE_OPEN_TAG_NAME,
  STATE_INSIDE_ATTRIBUTE_NAME,
  STATE_AFTER_ATTRIBUTE_NAME,
  STATE_BETWEEN_ATTRIBUTES,
  STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
  /* in either of these two, unescaping also normalizes whitespace */
  STATE_INSIDE_ATTRIBUTE_VALUE_SQ,
  STATE_INSIDE_ATTRIBUTE_VALUE_DQ,

  STATE_INSIDE_TEXT,

  STATE_AFTER_CLOSE_TAG_SLASH,
  STATE_INSIDE_CLOSE_TAG_NAME,
  STATE_AFTER_CLOSE_TAG_NAME,

  STATE_INSIDE_PASSTHROUGH,

  STATE_ERROR
} GMarkupParseState;

110
111
112
113
114
115
116
/*
 * One entry of GMarkupParseContext.subparser_stack.  It remembers the
 * values that pop_subparser_stack() copies back into the context
 * (subparser_element, parser and user_data respectively) before the
 * tracker itself is released with g_slice_free().
 */
typedef struct
{
  const char *prev_element;
  const GMarkupParser *prev_parser;
  gpointer prev_user_data;
} GMarkupRecursionTracker;

117
118
119
120
struct _GMarkupParseContext
{
  const GMarkupParser *parser;

121
122
  volatile gint ref_count;

123
124
125
126
127
  GMarkupParseFlags flags;

  gint line_number;
  gint char_number;

128
129
  GMarkupParseState state;

130
131
132
133
134
135
136
137
  gpointer user_data;
  GDestroyNotify dnotify;

  /* A piece of character data or an element that
   * hasn't "ended" yet so we haven't yet called
   * the callback for it.
   */
  GString *partial_chunk;
138
  GSList *spare_chunks;
139
140

  GSList *tag_stack;
141
142
143
144
145
  GSList *tag_stack_gstr;
  GSList *spare_list_nodes;

  GString **attr_names;
  GString **attr_values;
146
147
  gint cur_attr;
  gint alloc_attrs;
148
149

  const gchar *current_text;
Matthias Clasen's avatar
Matthias Clasen committed
150
  gssize       current_text_len;
151
152
153
154
155
156
157
158
159
  const gchar *current_text_end;

  /* used to save the start of the last interesting thingy */
  const gchar *start;

  const gchar *iter;

  guint document_empty : 1;
  guint parsing : 1;
160
  guint awaiting_pop : 1;
Matthias Clasen's avatar
Matthias Clasen committed
161
  gint balance;
162
163
164
165
166

  /* subparser support */
  GSList *subparser_stack; /* (GMarkupRecursionTracker *) */
  const char *subparser_element;
  gpointer held_user_data;
167
168
};

169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/*
 * Helpers to reduce our allocation overhead, we have
 * a well defined allocation lifecycle.
 */
/* Returns a single detached list node carrying @data, taken from the
 * context's spare-node pool when one is available and freshly
 * allocated otherwise.
 */
static GSList *
get_list_node (GMarkupParseContext *context, gpointer data)
{
  GSList *recycled = context->spare_list_nodes;

  if (recycled == NULL)
    recycled = g_slist_alloc ();
  else
    context->spare_list_nodes = g_slist_remove_link (recycled, recycled);

  recycled->data = data;

  return recycled;
}

/* Puts @node at the front of the context's spare-node pool, clearing
 * its data pointer first.  The node is not deallocated here.
 */
static void
free_list_node (GMarkupParseContext *context, GSList *node)
{
  GSList *pool = context->spare_list_nodes;

  node->data = NULL;
  context->spare_list_nodes = g_slist_concat (node, pool);
}

/* Empties @string without touching its allocation. */
static inline void
string_blank (GString *string)
{
  string->len = 0;
  string->str[0] = '\0';
}

202
203
204
205
206
/**
 * g_markup_parse_context_new:
 * @parser: a #GMarkupParser
 * @flags: one or more #GMarkupParseFlags
 * @user_data: user data to pass to #GMarkupParser functions
Matthias Clasen's avatar
Matthias Clasen committed
207
208
209
 * @user_data_dnotify: user data destroy notifier called when
 *     the parse context is freed
 *
210
211
212
 * Creates a new parse context. A parse context is used to parse
 * marked-up documents. You can feed any number of documents into
 * a context, as long as no errors occur; once an error occurs,
Matthias Clasen's avatar
Matthias Clasen committed
213
214
215
 * the parse context can't continue to parse text (you have to
 * free it and create a new parse context).
 *
216
 * Returns: a new #GMarkupParseContext
217
 **/
218
219
220
221
222
223
224
225
226
227
228
229
GMarkupParseContext *
g_markup_parse_context_new (const GMarkupParser *parser,
                            GMarkupParseFlags    flags,
                            gpointer             user_data,
                            GDestroyNotify       user_data_dnotify)
{
  GMarkupParseContext *context;

  g_return_val_if_fail (parser != NULL, NULL);

  context = g_new (GMarkupParseContext, 1);

230
  context->ref_count = 1;
231
232
233
234
235
236
237
238
239
  context->parser = parser;
  context->flags = flags;
  context->user_data = user_data;
  context->dnotify = user_data_dnotify;

  context->line_number = 1;
  context->char_number = 1;

  context->partial_chunk = NULL;
240
241
  context->spare_chunks = NULL;
  context->spare_list_nodes = NULL;
242
243
244

  context->state = STATE_START;
  context->tag_stack = NULL;
245
  context->tag_stack_gstr = NULL;
246
247
248
249
  context->attr_names = NULL;
  context->attr_values = NULL;
  context->cur_attr = -1;
  context->alloc_attrs = 0;
250
251
252
253
254
255
256
257
258
259
260

  context->current_text = NULL;
  context->current_text_len = -1;
  context->current_text_end = NULL;

  context->start = NULL;
  context->iter = NULL;

  context->document_empty = TRUE;
  context->parsing = FALSE;

261
262
263
264
265
266
267
  context->awaiting_pop = FALSE;
  context->subparser_stack = NULL;
  context->subparser_element = NULL;

  /* this is only looked at if awaiting_pop = TRUE.  initialise anyway. */
  context->held_user_data = NULL;

Matthias Clasen's avatar
Matthias Clasen committed
268
269
  context->balance = 0;

270
271
272
  return context;
}

273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
/**
 * g_markup_parse_context_ref:
 * @context: a #GMarkupParseContext
 *
 * Atomically adds one reference to @context.
 *
 * Returns: the same @context
 *
 * Since: 2.36
 **/
GMarkupParseContext *
g_markup_parse_context_ref (GMarkupParseContext *context)
{
  g_return_val_if_fail (context != NULL, NULL);
  g_return_val_if_fail (context->ref_count > 0, NULL);

  /* the count is shared, so it is only changed atomically */
  g_atomic_int_inc (&context->ref_count);
  return context;
}

/**
 * g_markup_parse_context_unref:
 * @context: a #GMarkupParseContext
 *
 * Atomically drops one reference from @context.  The context is freed
 * with g_markup_parse_context_free() when the last reference goes away.
 *
 * Since: 2.36
 **/
void
g_markup_parse_context_unref (GMarkupParseContext *context)
{
  g_return_if_fail (context != NULL);
  g_return_if_fail (context->ref_count > 0);

  /* other references remain: nothing more to do */
  if (!g_atomic_int_dec_and_test (&context->ref_count))
    return;

  g_markup_parse_context_free (context);
}

313
static void
Matthias Clasen's avatar
Matthias Clasen committed
314
string_full_free (gpointer ptr)
315
316
317
318
319
320
{
  g_string_free (ptr, TRUE);
}

static void clear_attributes (GMarkupParseContext *context);

321
322
323
/**
 * g_markup_parse_context_free:
 * @context: a #GMarkupParseContext
Matthias Clasen's avatar
Matthias Clasen committed
324
325
326
327
328
329
 *
 * Frees a #GMarkupParseContext.
 *
 * This function can't be called from inside one of the
 * #GMarkupParser functions or while a subparser is pushed.
 */
330
331
332
333
334
void
g_markup_parse_context_free (GMarkupParseContext *context)
{
  g_return_if_fail (context != NULL);
  g_return_if_fail (!context->parsing);
335
336
  g_return_if_fail (!context->subparser_stack);
  g_return_if_fail (!context->awaiting_pop);
337
338
339
340

  if (context->dnotify)
    (* context->dnotify) (context->user_data);

341
342
343
  clear_attributes (context);
  g_free (context->attr_names);
  g_free (context->attr_values);
344

Matthias Clasen's avatar
Matthias Clasen committed
345
  g_slist_free_full (context->tag_stack_gstr, string_full_free);
346
347
  g_slist_free (context->tag_stack);

Matthias Clasen's avatar
Matthias Clasen committed
348
  g_slist_free_full (context->spare_chunks, string_full_free);
349
350
  g_slist_free (context->spare_list_nodes);

351
352
353
354
355
356
  if (context->partial_chunk)
    g_string_free (context->partial_chunk, TRUE);

  g_free (context);
}

357
358
static void pop_subparser_stack (GMarkupParseContext *context);

359
360
361
362
363
364
365
366
/* Puts the context into STATE_ERROR and reports @error to the error
 * callback of the current parser and then, after unwinding each pushed
 * subparser in turn, to the error callback of every parser below it.
 * @error is not consumed.
 */
static void
mark_error (GMarkupParseContext *context,
            GError              *error)
{
  context->state = STATE_ERROR;

  for (;;)
    {
      if (context->parser->error != NULL)
        context->parser->error (context, error, context->user_data);

      if (context->subparser_stack == NULL)
        break;

      /* report the error all the way up to free all the user-data */
      pop_subparser_stack (context);
      context->awaiting_pop = FALSE; /* already been freed */
    }
}

Matthias Clasen's avatar
Matthias Clasen committed
379
380
381
382
383
384
static void
set_error (GMarkupParseContext  *context,
           GError              **error,
           GMarkupError          code,
           const gchar          *format,
           ...) G_GNUC_PRINTF (4, 5);
385

386
static void
Matthias Clasen's avatar
Matthias Clasen committed
387
388
389
390
set_error_literal (GMarkupParseContext  *context,
                   GError              **error,
                   GMarkupError          code,
                   const gchar          *message)
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
{
  GError *tmp_error;

  tmp_error = g_error_new_literal (G_MARKUP_ERROR, code, message);

  g_prefix_error (&tmp_error,
                  _("Error on line %d char %d: "),
                  context->line_number,
                  context->char_number);

  mark_error (context, tmp_error);

  g_propagate_error (error, tmp_error);
}

406
/* printf-style front end to set_error_literal().
 *
 * Formats @format with the variadic arguments, sanitises the result so
 * that it is valid UTF-8, and reports it as a G_MARKUP_ERROR with @code
 * (position prefix, error callbacks and propagation into @error are all
 * done by set_error_literal()).
 */
G_GNUC_PRINTF(4, 5)
static void
set_error (GMarkupParseContext  *context,
           GError              **error,
           GMarkupError          code,
           const gchar          *format,
           ...)
{
  gchar *s;
  gchar *s_valid;
  va_list args;

  va_start (args, format);
  s = g_strdup_vprintf (format, args);
  va_end (args);

  /* Make sure that the GError message is valid UTF-8
   * even if it is complaining about invalid UTF-8 in the markup
   */
  s_valid = g_utf8_make_valid (s, -1);
  g_free (s);

  /* report the sanitised copy, not the raw formatted text */
  set_error_literal (context, error, code, s_valid);

  g_free (s_valid);
}

432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
/* Takes an error @src (typically one produced by a parser callback),
 * reports it through mark_error() and moves it into @dest.  The
 * line/char prefix is added only when the context was created with
 * G_MARKUP_PREFIX_ERROR_POSITION.
 */
static void
propagate_error (GMarkupParseContext  *context,
                 GError              **dest,
                 GError               *src)
{
  const gboolean want_position =
    (context->flags & G_MARKUP_PREFIX_ERROR_POSITION) != 0;

  if (want_position)
    g_prefix_error (&src,
                    _("Error on line %d char %d: "),
                    context->line_number,
                    context->char_number);

  mark_error (context, src);
  g_propagate_error (dest, src);
}
447
448
449
450

/* True when @c is one of '=', '/', '>' or ' '.  The name validators
 * never accept these bytes inside a name, and advance_to_name_end()
 * stops on them.  The argument is evaluated up to four times, so it
 * must be free of side effects.
 */
#define IS_COMMON_NAME_END_CHAR(c) \
  ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')

451
static gboolean
Matthias Clasen's avatar
Matthias Clasen committed
452
453
454
slow_name_validate (GMarkupParseContext  *context,
                    const gchar          *name,
                    GError              **error)
455
{
Matthias Clasen's avatar
Matthias Clasen committed
456
  const gchar *p = name;
457
458
459
460

  if (!g_utf8_validate (name, strlen (name), NULL))
    {
      set_error (context, error, G_MARKUP_ERROR_BAD_UTF8,
461
                 _("Invalid UTF-8 encoded text in name — not valid “%s”"), name);
462
463
464
465
      return FALSE;
    }

  if (!(g_ascii_isalpha (*p) ||
Matthias Clasen's avatar
Matthias Clasen committed
466
467
468
469
        (!IS_COMMON_NAME_END_CHAR (*p) &&
         (*p == '_' ||
          *p == ':' ||
          g_unichar_isalpha (g_utf8_get_char (p))))))
470
471
    {
      set_error (context, error, G_MARKUP_ERROR_PARSE,
472
                 _("“%s” is not a valid name"), name);
473
474
475
476
477
478
479
      return FALSE;
    }

  for (p = g_utf8_next_char (name); *p != '\0'; p = g_utf8_next_char (p))
    {
      /* is_name_char */
      if (!(g_ascii_isalnum (*p) ||
Matthias Clasen's avatar
Matthias Clasen committed
480
481
482
483
484
485
486
487
            (!IS_COMMON_NAME_END_CHAR (*p) &&
             (*p == '.' ||
              *p == '-' ||
              *p == '_' ||
              *p == ':' ||
              g_unichar_isalpha (g_utf8_get_char (p))))))
        {
          set_error (context, error, G_MARKUP_ERROR_PARSE,
488
                     _("“%s” is not a valid name: “%c”"), name, *p);
Matthias Clasen's avatar
Matthias Clasen committed
489
490
          return FALSE;
        }
491
492
    }
  return TRUE;
493
494
}

Matthias Clasen's avatar
Matthias Clasen committed
495
/*
496
497
 * Use me for elements, attributes etc.
 */
498
static gboolean
Matthias Clasen's avatar
Matthias Clasen committed
499
500
501
name_validate (GMarkupParseContext  *context,
               const gchar          *name,
               GError              **error)
502
{
503
504
505
506
507
508
  char mask;
  const char *p;

  /* name start char */
  p = name;
  if (G_UNLIKELY (IS_COMMON_NAME_END_CHAR (*p) ||
Matthias Clasen's avatar
Matthias Clasen committed
509
                  !(g_ascii_isalpha (*p) || *p == '_' || *p == ':')))
510
    goto slow_validate;
Matthias Clasen's avatar
Matthias Clasen committed
511

512
513
514
515
516
517
  for (mask = *p++; *p != '\0'; p++)
    {
      mask |= *p;

      /* is_name_char */
      if (G_UNLIKELY (!(g_ascii_isalnum (*p) ||
Matthias Clasen's avatar
Matthias Clasen committed
518
519
520
521
522
523
                        (!IS_COMMON_NAME_END_CHAR (*p) &&
                         (*p == '.' ||
                          *p == '-' ||
                          *p == '_' ||
                          *p == ':')))))
        goto slow_validate;
524
525
526
527
528
529
530
531
532
    }

  if (mask & 0x80) /* un-common / non-ascii */
    goto slow_validate;

  return TRUE;

 slow_validate:
  return slow_name_validate (context, name, error);
533
534
}

535
static gboolean
Matthias Clasen's avatar
Matthias Clasen committed
536
537
538
539
text_validate (GMarkupParseContext  *context,
               const gchar          *p,
               gint                  len,
               GError              **error)
540
541
542
543
{
  if (!g_utf8_validate (p, len, NULL))
    {
      set_error (context, error, G_MARKUP_ERROR_BAD_UTF8,
544
                 _("Invalid UTF-8 encoded text in name — not valid “%s”"), p);
545
546
547
548
549
      return FALSE;
    }
  else
    return TRUE;
}
550
551
552
553
554

/* Writes the UTF-8 encoding of @c into @buf and returns @buf.  The
 * first 8 bytes of @buf are zeroed beforehand, so the caller must
 * supply at least that much room.
 */
static gchar*
char_str (gunichar c,
          gchar   *buf)
{
  /* pre-zero the buffer; the encoder's output is not relied on to
   * terminate itself
   */
  memset (buf, '\0', 8);
  g_unichar_to_utf8 (c, buf);

  return buf;
}

560
561
562
/* Format the next UTF-8 character as a gchar* for printing in error output
 * when we encounter a syntax error. This correctly handles invalid UTF-8,
 * emitting it as hex escapes. */
563
564
565
566
static gchar*
utf8_str (const gchar *utf8,
          gchar       *buf)
{
567
568
569
570
571
572
573
574
575
576
  gunichar c = g_utf8_get_char_validated (utf8, -1);
  if (c == (gunichar) -1 || c == (gunichar) -2)
    {
      gchar *temp = g_strdup_printf ("\\x%02x", (guint)(guchar)*utf8);
      memset (buf, 0, 8);
      memcpy (buf, temp, strlen (temp));
      g_free (temp);
    }
  else
    char_str (c, buf);
577
578
579
  return buf;
}

580
/* Reports an error found while unescaping.
 *
 * @remaining_text is the not-yet-processed tail of the string being
 * unescaped; the newlines it contains are subtracted from the context's
 * line number to arrive at the line shown in the message.  The error is
 * reported through mark_error() and then propagated into @error.
 */
G_GNUC_PRINTF(5, 6)
static void
set_unescape_error (GMarkupParseContext  *context,
                    GError              **error,
                    const gchar          *remaining_text,
                    GMarkupError          code,
                    const gchar          *format,
                    ...)
{
  const gchar *scan;
  gint newlines_after = 0;
  gchar *detail;
  GError *located;
  va_list ap;

  for (scan = remaining_text; *scan != '\0'; scan++)
    if (*scan == '\n')
      newlines_after++;

  va_start (ap, format);
  detail = g_strdup_vprintf (format, ap);
  va_end (ap);

  located = g_error_new (G_MARKUP_ERROR,
                         code,
                         _("Error on line %d: %s"),
                         context->line_number - newlines_after,
                         detail);
  g_free (detail);

  mark_error (context, located);
  g_propagate_error (error, located);
}

621
622
623
624
625
/*
 * re-write the GString in-place, unescaping anything that escaped.
 * most XML does not contain entities, or escaping.
 */
static gboolean
Matthias Clasen's avatar
Matthias Clasen committed
626
627
628
629
unescape_gstring_inplace (GMarkupParseContext  *context,
                          GString              *string,
                          gboolean             *is_ascii,
                          GError              **error)
630
{
631
632
  char mask, *to;
  const char *from;
633
  gboolean normalize_attribute;
634

635
636
637
638
639
  *is_ascii = FALSE;

  /* are we unescaping an attribute or not ? */
  if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ ||
      context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
640
641
642
643
    normalize_attribute = TRUE;
  else
    normalize_attribute = FALSE;

644
  /*
645
   * Meeks' theorem: unescaping can only shrink text.
646
647
648
649
   * for &lt; etc. this is obvious, for &#xffff; more
   * thought is required, but this is patently so.
   */
  mask = 0;
Matthias Clasen's avatar
Matthias Clasen committed
650
  for (from = to = string->str; *from != '\0'; from++, to++)
651
    {
652
653
654
655
      *to = *from;

      mask |= *to;
      if (normalize_attribute && (*to == '\t' || *to == '\n'))
Matthias Clasen's avatar
Matthias Clasen committed
656
        *to = ' ';
657
      if (*to == '\r')
Matthias Clasen's avatar
Matthias Clasen committed
658
659
660
661
662
        {
          *to = normalize_attribute ? ' ' : '\n';
          if (from[1] == '\n')
            from++;
        }
663
      if (*from == '&')
Matthias Clasen's avatar
Matthias Clasen committed
664
665
666
667
        {
          from++;
          if (*from == '#')
            {
Matthias Clasen's avatar
Matthias Clasen committed
668
              gint base = 10;
Matthias Clasen's avatar
Matthias Clasen committed
669
670
671
672
673
674
675
              gulong l;
              gchar *end = NULL;

              from++;

              if (*from == 'x')
                {
Matthias Clasen's avatar
Matthias Clasen committed
676
                  base = 16;
Matthias Clasen's avatar
Matthias Clasen committed
677
678
679
680
                  from++;
                }

              errno = 0;
Matthias Clasen's avatar
Matthias Clasen committed
681
              l = strtoul (from, &end, base);
Matthias Clasen's avatar
Matthias Clasen committed
682
683
684
685
686

              if (end == from || errno != 0)
                {
                  set_unescape_error (context, error,
                                      from, G_MARKUP_ERROR_PARSE,
687
                                      _("Failed to parse “%-.*s”, which "
Matthias Clasen's avatar
Matthias Clasen committed
688
689
                                        "should have been a digit "
                                        "inside a character reference "
690
                                        "(&#234; for example) — perhaps "
Matthias Clasen's avatar
Matthias Clasen committed
691
                                        "the digit is too large"),
692
                                      (int)(end - from), from);
Matthias Clasen's avatar
Matthias Clasen committed
693
694
695
696
697
698
699
700
701
702
                  return FALSE;
                }
              else if (*end != ';')
                {
                  set_unescape_error (context, error,
                                      from, G_MARKUP_ERROR_PARSE,
                                      _("Character reference did not end with a "
                                        "semicolon; "
                                        "most likely you used an ampersand "
                                        "character without intending to start "
703
                                        "an entity — escape ampersand as &amp;"));
Matthias Clasen's avatar
Matthias Clasen committed
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
                  return FALSE;
                }
              else
                {
                  /* characters XML 1.1 permits */
                  if ((0 < l && l <= 0xD7FF) ||
                      (0xE000 <= l && l <= 0xFFFD) ||
                      (0x10000 <= l && l <= 0x10FFFF))
                    {
                      gchar buf[8];
                      char_str (l, buf);
                      strcpy (to, buf);
                      to += strlen (buf) - 1;
                      from = end;
                      if (l >= 0x80) /* not ascii */
                        mask |= 0x80;
                    }
                  else
                    {
                      set_unescape_error (context, error,
                                          from, G_MARKUP_ERROR_PARSE,
725
                                          _("Character reference “%-.*s” does not "
Matthias Clasen's avatar
Matthias Clasen committed
726
                                            "encode a permitted character"),
727
                                          (int)(end - from), from);
Matthias Clasen's avatar
Matthias Clasen committed
728
729
730
                      return FALSE;
                    }
                }
731
732
            }

733
          else if (strncmp (from, "lt;", 3) == 0)
Matthias Clasen's avatar
Matthias Clasen committed
734
735
736
737
            {
              *to = '<';
              from += 2;
            }
738
          else if (strncmp (from, "gt;", 3) == 0)
Matthias Clasen's avatar
Matthias Clasen committed
739
740
741
742
            {
              *to = '>';
              from += 2;
            }
743
          else if (strncmp (from, "amp;", 4) == 0)
Matthias Clasen's avatar
Matthias Clasen committed
744
745
746
747
            {
              *to = '&';
              from += 3;
            }
748
          else if (strncmp (from, "quot;", 5) == 0)
Matthias Clasen's avatar
Matthias Clasen committed
749
750
751
752
753
754
755
756
757
758
759
760
761
762
            {
              *to = '"';
              from += 4;
            }
          else if (strncmp (from, "apos;", 5) == 0)
            {
              *to = '\'';
              from += 4;
            }
          else
            {
              if (*from == ';')
                set_unescape_error (context, error,
                                    from, G_MARKUP_ERROR_PARSE,
763
                                    _("Empty entity “&;” seen; valid "
Matthias Clasen's avatar
Matthias Clasen committed
764
765
766
767
768
769
770
                                      "entities are: &amp; &quot; &lt; &gt; &apos;"));
              else
                {
                  const char *end = strchr (from, ';');
                  if (end)
                    set_unescape_error (context, error,
                                        from, G_MARKUP_ERROR_PARSE,
771
                                        _("Entity name “%-.*s” is not known"),
772
                                        (int)(end - from), from);
Matthias Clasen's avatar
Matthias Clasen committed
773
774
775
776
777
778
                  else
                    set_unescape_error (context, error,
                                        from, G_MARKUP_ERROR_PARSE,
                                        _("Entity did not end with a semicolon; "
                                          "most likely you used an ampersand "
                                          "character without intending to start "
779
                                          "an entity — escape ampersand as &amp;"));
Matthias Clasen's avatar
Matthias Clasen committed
780
781
782
783
                }
              return FALSE;
            }
        }
784
785
    }

786
787
788
  g_assert (to - string->str <= string->len);
  if (to - string->str != string->len)
    g_string_truncate (string, to - string->str);
789

790
  *is_ascii = !(mask & 0x80);
791

792
  return TRUE;
793
794
}

795
static inline gboolean
796
advance_char (GMarkupParseContext *context)
Matthias Clasen's avatar
Matthias Clasen committed
797
{
798
799
  context->iter++;
  context->char_number++;
800

801
  if (G_UNLIKELY (context->iter == context->current_text_end))
802
      return FALSE;
803
804

  else if (G_UNLIKELY (*context->iter == '\n'))
805
    {
806
      context->line_number++;
807
808
      context->char_number = 1;
    }
Matthias Clasen's avatar
Matthias Clasen committed
809

810
  return TRUE;
811
812
}

813
static inline gboolean
814
815
816
817
818
xml_isspace (char c)
{
  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

819
820
821
822
823
/* Advances the read position until it rests on a non-whitespace byte
 * or the end of the current text is reached.
 */
static void
skip_spaces (GMarkupParseContext *context)
{
  while (xml_isspace (*context->iter))
    {
      if (!advance_char (context))
        return;
    }
}

/* Advances the read position until it rests on a byte that ends a name
 * ('=', '/', '>' or whitespace) or the end of the current text is reached.
 */
static void
advance_to_name_end (GMarkupParseContext *context)
{
  for (;;)
    {
      const char c = *(context->iter);

      if (IS_COMMON_NAME_END_CHAR (c) || xml_isspace (c))
        return;

      if (!advance_char (context))
        return;
    }
}

843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
/* Gives up a GString the parser no longer needs.  NULL is ignored.
 * Strings whose allocation exceeds 256 bytes are freed outright; the
 * rest are blanked and pushed onto the context's spare_chunks list so
 * add_to_partial() can reuse them.
 */
static void
release_chunk (GMarkupParseContext *context, GString *str)
{
  GSList *holder;

  if (str == NULL)
    return;

  if (str->allocated_len <= 256)
    {
      string_blank (str);
      holder = get_list_node (context, str);
      context->spare_chunks = g_slist_concat (holder, context->spare_chunks);
    }
  else
    {
      /* large strings are unusual and worth freeing */
      g_string_free (str, TRUE);
    }
}

859
860
861
862
863
864
/* Appends the bytes in [@text_start, @text_end) to the partial chunk,
 * creating the chunk first if there is none: a recycled string from
 * spare_chunks when available, otherwise a new one sized for the text
 * (but at least 28 bytes).
 */
static void
add_to_partial (GMarkupParseContext *context,
                const gchar         *text_start,
                const gchar         *text_end)
{
  if (context->partial_chunk == NULL)
    {
      /* allocate a new chunk to parse into */
      GSList *spare = context->spare_chunks;

      if (spare == NULL)
        context->partial_chunk = g_string_sized_new (MAX (28, text_end - text_start));
      else
        {
          context->spare_chunks = g_slist_remove_link (spare, spare);
          context->partial_chunk = spare->data;
          free_list_node (context, spare);
        }
    }

  /* nothing to copy for an empty range */
  if (text_start == text_end)
    return;

  g_string_insert_len (context->partial_chunk, -1,
                       text_start, text_end - text_start);
}

883
static inline void
884
truncate_partial (GMarkupParseContext *context)
885
886
{
  if (context->partial_chunk != NULL)
887
    string_blank (context->partial_chunk);
888
889
}

890
static inline const gchar*
891
892
893
894
895
current_element (GMarkupParseContext *context)
{
  return context->tag_stack->data;
}

896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
/* Removes the top entry of subparser_stack and restores the parser,
 * user data and subparser element it recorded.  The user data that was
 * in effect until now is parked in held_user_data and awaiting_pop is
 * raised.  The stack must not be empty.
 */
static void
pop_subparser_stack (GMarkupParseContext *context)
{
  GSList *top;
  GMarkupRecursionTracker *tracker;

  g_assert (context->subparser_stack);

  top = context->subparser_stack;
  tracker = top->data;

  /* park the subparser's user data before it is overwritten below */
  context->held_user_data = context->user_data;
  context->awaiting_pop = TRUE;

  context->subparser_element = tracker->prev_element;
  context->parser = tracker->prev_parser;
  context->user_data = tracker->prev_user_data;
  g_slice_free (GMarkupRecursionTracker, tracker);

  context->subparser_stack = g_slist_delete_link (top, top);
}

917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
/* Turns the partial chunk into the new innermost open element: its
 * character data is pushed onto tag_stack, the GString itself onto
 * tag_stack_gstr, and the context no longer has a partial chunk.
 * The partial chunk must exist.
 */
static void
push_partial_as_tag (GMarkupParseContext *context)
{
  GString *name = context->partial_chunk;
  GSList *name_node;
  GSList *gstr_node;

  /* sadly, this is exported by gmarkup_get_element_stack as-is */
  name_node = get_list_node (context, name->str);
  gstr_node = get_list_node (context, name);

  context->tag_stack = g_slist_concat (name_node, context->tag_stack);
  context->tag_stack_gstr = g_slist_concat (gstr_node, context->tag_stack_gstr);

  context->partial_chunk = NULL;
}

/* Drops the innermost open element: its GString goes back through
 * release_chunk() and the head nodes of tag_stack and tag_stack_gstr
 * are unlinked and returned to the spare-node pool.  Both stacks must
 * be non-empty.
 */
static void
pop_tag (GMarkupParseContext *context)
{
  GSList *name_node = context->tag_stack;
  GSList *gstr_node = context->tag_stack_gstr;

  release_chunk (context, gstr_node->data);

  context->tag_stack = g_slist_remove_link (name_node, name_node);
  context->tag_stack_gstr = g_slist_remove_link (gstr_node, gstr_node);

  free_list_node (context, name_node);
  free_list_node (context, gstr_node);
}

941
942
943
944
945
946
947
948
949
950
951
952
/* Pops the subparser stack when the current element is the very string
 * (compared by pointer) recorded in subparser_element.
 */
static void
possibly_finish_subparser (GMarkupParseContext *context)
{
  if (current_element (context) != context->subparser_element)
    return;

  pop_subparser_stack (context);
}

/* Emits a critical warning when a popped subparser has not been
 * acknowledged (awaiting_pop still set), then unconditionally clears
 * held_user_data and awaiting_pop.
 */
static void
ensure_no_outstanding_subparser (GMarkupParseContext *context)
{
  if (context->awaiting_pop)
    {
      g_critical ("During the first end_element call after invoking a "
                  "subparser you must pop the subparser stack and handle "
                  "the freeing of the subparser user_data.  This can be "
                  "done by calling the end function of the subparser.  "
                  "Very probably, your program just leaked memory.");
    }

  context->awaiting_pop = FALSE;
  /* let valgrind watch the pointer disappear... */
  context->held_user_data = NULL;
}

963
964
965
/* Name of the attribute most recently added with add_attribute().
 * Asserts that there is one.
 */
static const gchar*
current_attribute (GMarkupParseContext *context)
{
  const GString *name;

  g_assert (context->cur_attr >= 0);

  name = context->attr_names[context->cur_attr];
  return name->str;
}

970
static void
971
add_attribute (GMarkupParseContext *context, GString *str)
972
973
974
975
{
  if (context->cur_attr + 2 >= context->alloc_attrs)
    {
      context->alloc_attrs += 5; /* silly magic number */
976
977
      context->attr_names = g_realloc (context->attr_names, sizeof(GString*)*context->alloc_attrs);
      context->attr_values = g_realloc (context->attr_values, sizeof(GString*)*context->alloc_attrs);
978
979
    }
  context->cur_attr++;
980
  context->attr_names[context->cur_attr] = str;
981
982
  context->attr_values[context->cur_attr] = NULL;
  context->attr_names[context->cur_attr+1] = NULL;
983
  context->attr_values[context->cur_attr+1] = NULL;
984
985
}

986
987
988
989
990
991
992
993
994
995
996
997
998
static void
clear_attributes (GMarkupParseContext *context)
{
  /* Go ahead and free the attributes. */
  for (; context->cur_attr >= 0; context->cur_attr--)
    {
      int pos = context->cur_attr;
      release_chunk (context, context->attr_names[pos]);
      release_chunk (context, context->attr_values[pos]);
      context->attr_names[pos] = context->attr_values[pos] = NULL;
    }
  g_assert (context->cur_attr == -1);
  g_assert (context->attr_names == NULL ||
Matthias Clasen's avatar
Matthias Clasen committed
999
            context->attr_names[0] == NULL);
1000
  g_assert (context->attr_values == NULL ||