/*
 * parserInternals.c : Internal routines (and obsolete ones) needed for the
 *                     XML and HTML parsers.
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */

10
#define IN_LIBXML
Bjorn Reese's avatar
Bjorn Reese committed
11 12
#include "libxml.h"

13
#if defined(WIN32) && !defined (__CYGWIN__)
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
#define XML_DIR_SEP '\\'
#else
#define XML_DIR_SEP '/'
#endif

#include <string.h>
#ifdef HAVE_CTYPE_H
#include <ctype.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_ZLIB_H
#include <zlib.h>
#endif

#include <libxml/xmlmemory.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>
#include <libxml/valid.h>
#include <libxml/entities.h>
#include <libxml/xmlerror.h>
#include <libxml/encoding.h>
#include <libxml/valid.h>
#include <libxml/xmlIO.h>
#include <libxml/uri.h>
50
#include <libxml/dict.h>
51
#include <libxml/SAX.h>
52 53 54
#ifdef LIBXML_CATALOG_ENABLED
#include <libxml/catalog.h>
#endif
55
#include <libxml/globals.h>
56
#include <libxml/chvalid.h>
57

58 59 60
#include "buf.h"
#include "enc.h"

61 62 63
/*
 * Various global defaults for parsing
 */
64

65
/**
66 67 68 69 70 71 72 73 74 75
 * xmlCheckVersion:
 * @version: the include version number
 *
 * check the compiled lib version against the include one.
 * This can warn or immediately kill the application
 */
void
xmlCheckVersion(int version) {
    int myversion = (int) LIBXML_VERSION;

76
    xmlInitParser();
77

78
    if ((myversion / 10000) != (version / 10000)) {
Daniel Veillard's avatar
Daniel Veillard committed
79
	xmlGenericError(xmlGenericErrorContext,
80 81
		"Fatal: program compiled against libxml %d using libxml %d\n",
		(version / 10000), (myversion / 10000));
Daniel Veillard's avatar
Daniel Veillard committed
82
	fprintf(stderr,
83 84
		"Fatal: program compiled against libxml %d using libxml %d\n",
		(version / 10000), (myversion / 10000));
85 86
    }
    if ((myversion / 100) < (version / 100)) {
Daniel Veillard's avatar
Daniel Veillard committed
87
	xmlGenericError(xmlGenericErrorContext,
88 89 90 91 92
		"Warning: program compiled against libxml %d using older %d\n",
		(version / 100), (myversion / 100));
    }
}

93 94 95

/************************************************************************
 *									*
 *		Some factorized error routines				*
 *									*
 ************************************************************************/


/**
 * xmlErrMemory:
 * @ctxt:  an XML parser context, may be NULL
 * @extra:  extra information appended to the message, may be NULL
 *
 * Handle a memory allocation failure. When a context is supplied it is
 * flagged with XML_ERR_NO_MEMORY, moved to the XML_PARSER_EOF state and
 * its SAX callbacks are disabled; then a fatal XML_ERR_NO_MEMORY error is
 * raised. Nothing is done if the context is already in the EOF state
 * with SAX disabled.
 */
void
xmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
{
    if (ctxt != NULL) {
        /* parsing was already stopped: do not report again */
        if ((ctxt->disableSAX != 0) && (ctxt->instate == XML_PARSER_EOF))
            return;

        ctxt->errNo = XML_ERR_NO_MEMORY;
        ctxt->instate = XML_PARSER_EOF;
        ctxt->disableSAX = 1;
    }

    if (extra == NULL) {
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
                        NULL, NULL, 0, 0, "Memory allocation failed\n");
        return;
    }

    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
                    XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
                    NULL, NULL, 0, 0,
                    "Memory allocation failed : %s\n", extra);
}

/**
131
 * __xmlErrEncoding:
132
 * @ctxt:  an XML parser context
133
 * @xmlerr:  the error number
134 135 136 137 138 139
 * @msg:  the error message
 * @str1:  an string info
 * @str2:  an string info
 *
 * Handle an encoding error
 */
140
void
141
__xmlErrEncoding(xmlParserCtxtPtr ctxt, xmlParserErrors xmlerr,
142
                 const char *msg, const xmlChar * str1, const xmlChar * str2)
143
{
144 145 146
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
        (ctxt->instate == XML_PARSER_EOF))
	return;
147
    if (ctxt != NULL)
148
        ctxt->errNo = xmlerr;
149
    __xmlRaiseError(NULL, NULL, NULL,
150
                    ctxt, NULL, XML_FROM_PARSER, xmlerr, XML_ERR_FATAL,
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
                    NULL, 0, (const char *) str1, (const char *) str2,
                    NULL, 0, 0, msg, str1, str2);
    if (ctxt != NULL) {
        ctxt->wellFormed = 0;
        if (ctxt->recovery == 0)
            ctxt->disableSAX = 1;
    }
}

/**
 * xmlErrInternal:
 * @ctxt:  an XML parser context, may be NULL
 * @msg:  the error message (used as the format string)
 * @str:  a string argument for @msg
 *
 * Raise a fatal XML_ERR_INTERNAL_ERROR. With a context, the error number
 * is recorded, the document is marked as not well formed and, unless the
 * context is in recovery mode, SAX callbacks are disabled. Nothing is
 * done if the context is already in the EOF state with SAX disabled.
 */
static void
xmlErrInternal(xmlParserCtxtPtr ctxt, const char *msg, const xmlChar * str)
{
    const int hasCtxt = (ctxt != NULL);

    if (hasCtxt) {
        if ((ctxt->disableSAX != 0) && (ctxt->instate == XML_PARSER_EOF))
            return;
        ctxt->errNo = XML_ERR_INTERNAL_ERROR;
    }

    __xmlRaiseError(NULL, NULL, NULL,
                    ctxt, NULL, XML_FROM_PARSER, XML_ERR_INTERNAL_ERROR,
                    XML_ERR_FATAL, NULL, 0, (const char *) str, NULL, NULL,
                    0, 0, msg, str);

    if (!hasCtxt)
        return;

    ctxt->wellFormed = 0;
    if (ctxt->recovery == 0)
        ctxt->disableSAX = 1;
}

/**
 * xmlErrEncodingInt:
 * @ctxt:  an XML parser context, may be NULL
 * @error:  the error number
 * @msg:  the error message (used as the format string)
 * @val:  an integer argument for @msg
 *
 * Raise a fatal encoding error whose message takes an integer argument.
 * With a context, the error number is recorded, the document is marked
 * as not well formed and, unless the context is in recovery mode, SAX
 * callbacks are disabled. Nothing is done if the context is already in
 * the EOF state with SAX disabled.
 */
static void
xmlErrEncodingInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
                  const char *msg, int val)
{
    const int hasCtxt = (ctxt != NULL);

    if (hasCtxt) {
        if ((ctxt->disableSAX != 0) && (ctxt->instate == XML_PARSER_EOF))
            return;
        ctxt->errNo = error;
    }

    __xmlRaiseError(NULL, NULL, NULL,
                    ctxt, NULL, XML_FROM_PARSER, error, XML_ERR_FATAL,
                    NULL, 0, NULL, NULL, NULL, val, 0, msg, val);

    if (!hasCtxt)
        return;

    ctxt->wellFormed = 0;
    if (ctxt->recovery == 0)
        ctxt->disableSAX = 1;
}

215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
/**
 * xmlIsLetter:
 * @c:  a Unicode character (int)
 *
 * Check whether the character is allowed by the production
 * [84] Letter ::= BaseChar | Ideographic
 *
 * Returns 1 if @c is a BaseChar or an Ideographic, 0 otherwise
 */
int
xmlIsLetter(int c)
{
    if (IS_BASECHAR(c))
        return 1;
    if (IS_IDEOGRAPHIC(c))
        return 1;
    return 0;
}

/************************************************************************
 *									*
 *		Input handling functions for progressive parsing	*
 *									*
 ************************************************************************/

/* #define DEBUG_INPUT */
/* #define DEBUG_STACK */
/* #define DEBUG_PUSH */


/* we need to keep enough input to show errors in context */
#define LINE_LEN        80

#ifdef DEBUG_INPUT
#define CHECK_BUFFER(in) check_buffer(in)

/*
 * check_buffer:
 * @in:  an XML parser input
 *
 * Debug-only consistency check (compiled when DEBUG_INPUT is defined).
 * Reports through the generic error handler when in->base does not
 * match the buffer content pointer, or when in->cur lies outside
 * [base, base + use], then dumps the buffer state.
 */
static void
check_buffer(xmlParserInputPtr in)
{
    if (in->base != xmlBufContent(in->buf->buffer)) {
        xmlGenericError(xmlGenericErrorContext,
                "xmlParserInput: base mismatch problem\n");
    }
    if (in->cur < in->base) {
        xmlGenericError(xmlGenericErrorContext,
                "xmlParserInput: cur < base problem\n");
    }
    if (in->cur > in->base + xmlBufUse(in->buf->buffer)) {
        xmlGenericError(xmlGenericErrorContext,
                "xmlParserInput: cur > base + use problem\n");
    }
    /*
     * Pointers are printed with %p and the offset/size values are cast to
     * the exact types named by the format: casting a pointer to int
     * truncates it where pointers are wider than int, and passing a
     * pointer difference to %d is a format/argument mismatch.
     */
    xmlGenericError(xmlGenericErrorContext,
            "buffer %p : content %p, cur %ld, use %lu\n",
            (void *) in, (const void *) xmlBufContent(in->buf->buffer),
            (long) (in->cur - in->base),
            (unsigned long) xmlBufUse(in->buf->buffer));
}

#else
#define CHECK_BUFFER(in)
#endif


/**
 * xmlParserInputRead:
 * @in:  an XML parser input
 * @len:  an indicative size for the lookahead
 *
275
 * This function was internal and is deprecated.
276
 *
277
 * Returns -1 as this is an error to use it.
278 279
 */
int
280 281
xmlParserInputRead(xmlParserInputPtr in ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED) {
    return(-1);
282 283 284 285 286 287 288 289 290 291
}

/**
 * xmlParserInputGrow:
 * @in:  an XML parser input
 * @len:  an indicative size for the lookahead
 *
 * This function increases the input available to the parser. The
 * base/cur/end pointers of @in are refreshed afterwards so that they stay
 * valid if the underlying buffer was moved, and already read data is kept.
 *
 * Returns the value reported by xmlParserInputBufferGrow(), -1 in case of
 * invalid arguments, and 0 when more than INPUT_CHUNK bytes are already
 * available past the current position or when the input has no read
 * callback.
 */
int
xmlParserInputGrow(xmlParserInputPtr in, int len)
{
    /* int, not size_t: the grow routine reports errors as -1 */
    int ret;
    size_t indx;
    const xmlChar *content;

    if ((in == NULL) || (len < 0))
        return(-1);
#ifdef DEBUG_INPUT
    xmlGenericError(xmlGenericErrorContext, "Grow\n");
#endif
    if (in->buf == NULL)
        return(-1);
    if (in->base == NULL)
        return(-1);
    if (in->cur == NULL)
        return(-1);
    if (in->buf->buffer == NULL)
        return(-1);

    CHECK_BUFFER(in);

    /*
     * Offset of the read position from the start of the buffer. It is
     * taken now, while in->base is still a live pointer, and reused below
     * if the buffer gets reallocated.
     */
    indx = in->cur - in->base;

    /* compare at full width: no truncation of the offset to unsigned int */
    if (xmlBufUse(in->buf->buffer) > indx + INPUT_CHUNK) {
        CHECK_BUFFER(in);
        return(0);
    }

    if (in->buf->readcallback == NULL)
        return(0);

    ret = xmlParserInputBufferGrow(in->buf, len);

    content = xmlBufContent(in->buf->buffer);
    if (in->base != content) {
        /*
         * The buffer has been reallocated: in->base may now be a freed
         * pointer, so it is only compared, never used for arithmetic.
         * in->cur and in->base were not modified since indx was computed.
         */
        in->base = content;
        in->cur = &content[indx];
    }
    in->end = xmlBufEnd(in->buf->buffer);

    CHECK_BUFFER(in);

    return(ret);
}

/**
 * xmlParserInputShrink:
 * @in:  an XML parser input
 *
 * This function removes used input for the parser. When more than
 * INPUT_CHUNK bytes have been consumed, all but the last LINE_LEN of
 * them are dropped from the buffer (enough is kept to show errors in
 * context). If the buffer then holds INPUT_CHUNK bytes or fewer, more
 * data is read and the input pointers are refreshed.
 */
void
xmlParserInputShrink(xmlParserInputPtr in)
{
    size_t consumedBytes;
    size_t dropped;
    size_t offset;
    const xmlChar *start;

#ifdef DEBUG_INPUT
    xmlGenericError(xmlGenericErrorContext, "Shrink\n");
#endif
    if ((in == NULL) || (in->buf == NULL) || (in->base == NULL) ||
        (in->cur == NULL) || (in->buf->buffer == NULL)) {
        return;
    }

    CHECK_BUFFER(in);

    consumedBytes = in->cur - xmlBufContent(in->buf->buffer);

    /*
     * Do not shrink on large buffers whose only a tiny fraction
     * was consumed
     */
    if (consumedBytes > INPUT_CHUNK) {
        dropped = xmlBufShrink(in->buf->buffer, consumedBytes - LINE_LEN);
        if (dropped > 0) {
            in->cur -= dropped;
            in->consumed += dropped;
        }
        in->end = xmlBufEnd(in->buf->buffer);
    }

    CHECK_BUFFER(in);

    /* enough data is still buffered: no need to read more */
    if (xmlBufUse(in->buf->buffer) > INPUT_CHUNK)
        return;

    xmlParserInputBufferRead(in->buf, 2 * INPUT_CHUNK);

    start = xmlBufContent(in->buf->buffer);
    if (in->base != start) {
        /* the buffer has been reallocated: rebase cur on the new content */
        offset = in->cur - in->base;
        in->base = start;
        in->cur = &start[offset];
    }
    in->end = xmlBufEnd(in->buf->buffer);

    CHECK_BUFFER(in);
}

/************************************************************************
 *									*
 *		UTF8 character input and related functions		*
 *									*
 ************************************************************************/

/**
 * xmlNextChar:
 * @ctxt:  the XML parser context
 *
 * Skip to the next char input char.
 */

void
419 420
xmlNextChar(xmlParserCtxtPtr ctxt)
{
421 422
    if ((ctxt == NULL) || (ctxt->instate == XML_PARSER_EOF) ||
        (ctxt->input == NULL))
423
        return;
424

425
    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446
        if ((*ctxt->input->cur == 0) &&
            (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) &&
            (ctxt->instate != XML_PARSER_COMMENT)) {
            /*
             * If we are at the end of the current entity and
             * the context allows it, we pop consumed entities
             * automatically.
             * the auto closing should be blocked in other cases
             */
            xmlPopInput(ctxt);
        } else {
            const unsigned char *cur;
            unsigned char c;

            /*
             *   2.11 End-of-Line Handling
             *   the literal two-character sequence "#xD#xA" or a standalone
             *   literal #xD, an XML processor must pass to the application
             *   the single character #xA.
             */
            if (*(ctxt->input->cur) == '\n') {
447
                ctxt->input->line++; ctxt->input->col = 1;
448 449 450 451 452 453 454 455 456 457
            } else
                ctxt->input->col++;

            /*
             * We are supposed to handle UTF8, check it's valid
             * From rfc2044: encoding of the Unicode values on UTF-8:
             *
             * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
             * 0000 0000-0000 007F   0xxxxxxx
             * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
Daniel Veillard's avatar
Daniel Veillard committed
458
             * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
459 460 461 462 463 464 465
             *
             * Check for the 0x110000 limit too
             */
            cur = ctxt->input->cur;

            c = *cur;
            if (c & 0x80) {
466 467
	        if (c == 0xC0)
		    goto encoding_error;
468
                if (cur[1] == 0) {
469
                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
470 471
                    cur = ctxt->input->cur;
                }
472 473 474 475 476
                if ((cur[1] & 0xc0) != 0x80)
                    goto encoding_error;
                if ((c & 0xe0) == 0xe0) {
                    unsigned int val;

477
                    if (cur[2] == 0) {
478
                        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
479 480
                        cur = ctxt->input->cur;
                    }
481 482 483
                    if ((cur[2] & 0xc0) != 0x80)
                        goto encoding_error;
                    if ((c & 0xf0) == 0xf0) {
484
                        if (cur[3] == 0) {
485
                            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
486 487
                            cur = ctxt->input->cur;
                        }
488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506
                        if (((c & 0xf8) != 0xf0) ||
                            ((cur[3] & 0xc0) != 0x80))
                            goto encoding_error;
                        /* 4-byte code */
                        ctxt->input->cur += 4;
                        val = (cur[0] & 0x7) << 18;
                        val |= (cur[1] & 0x3f) << 12;
                        val |= (cur[2] & 0x3f) << 6;
                        val |= cur[3] & 0x3f;
                    } else {
                        /* 3-byte code */
                        ctxt->input->cur += 3;
                        val = (cur[0] & 0xf) << 12;
                        val |= (cur[1] & 0x3f) << 6;
                        val |= cur[2] & 0x3f;
                    }
                    if (((val > 0xd7ff) && (val < 0xe000)) ||
                        ((val > 0xfffd) && (val < 0x10000)) ||
                        (val >= 0x110000)) {
507 508 509
			xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
					  "Char 0x%X out of allowed range\n",
					  val);
510 511 512 513 514 515 516 517 518 519 520 521
                    }
                } else
                    /* 2-byte code */
                    ctxt->input->cur += 2;
            } else
                /* 1-byte code */
                ctxt->input->cur++;

            ctxt->nbChars++;
            if (*ctxt->input->cur == 0)
                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
        }
522
    } else {
523 524 525 526 527 528 529
        /*
         * Assume it's a fixed length encoding (1) with
         * a compatible encoding for the ASCII set, since
         * XML constructs only use < 128 chars
         */

        if (*(ctxt->input->cur) == '\n') {
530
            ctxt->input->line++; ctxt->input->col = 1;
531 532 533 534 535 536
        } else
            ctxt->input->col++;
        ctxt->input->cur++;
        ctxt->nbChars++;
        if (*ctxt->input->cur == 0)
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
537
    }
538
    if ((*ctxt->input->cur == '%') && (!ctxt->html))
539
        xmlParserHandlePEReference(ctxt);
540
    if ((*ctxt->input->cur == 0) &&
541
        (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0))
542
        xmlPopInput(ctxt);
543
    return;
544
encoding_error:
545 546
    /*
     * If we detect an UTF8 error that probably mean that the
547
     * input encoding didn't get properly advertised in the
548 549 550 551
     * declaration header. Report the error and switch the encoding
     * to ISO-Latin-1 (if you don't like this policy, just declare the
     * encoding !)
     */
552 553 554 555 556 557 558 559 560 561 562 563 564 565
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
        (ctxt->input->end - ctxt->input->cur < 4)) {
	__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
		     "Input is not proper UTF-8, indicate encoding !\n",
		     NULL, NULL);
    } else {
        char buffer[150];

	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
			ctxt->input->cur[0], ctxt->input->cur[1],
			ctxt->input->cur[2], ctxt->input->cur[3]);
	__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
		     "Input is not proper UTF-8, indicate encoding !\n%s",
		     BAD_CAST buffer, NULL);
566
    }
567
    ctxt->charset = XML_CHAR_ENCODING_8859_1;
568
    ctxt->input->cur++;
569 570 571 572 573 574 575 576
    return;
}

/**
 * xmlCurrentChar:
 * @ctxt:  the XML parser context
 * @len:  pointer to the length of the char read
 *
 * The current char value, if using UTF-8 this may actually span multiple
 * bytes in the input buffer. Implement the end of line normalization:
 * 2.11 End-of-Line Handling
 * Wherever an external parsed entity or the literal entity value
 * of an internal parsed entity contains either the literal two-character
 * sequence "#xD#xA" or a standalone literal #xD, an XML processor
 * must pass to the application the single character #xA.
 * This behavior can conveniently be produced by normalizing all
 * line breaks to #xA on input, before parsing.)
 *
 * Returns the current char value, with its length in bytes stored in
 * @len. Returns 0 on invalid arguments or in the EOF state (@len is then
 * left untouched), and 0 with @len set to 0 when a malformed sequence
 * sits within the last 3 bytes of the buffered input.
 */

int
xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len)
{
    if ((ctxt == NULL) || (len == NULL) || (ctxt->input == NULL))
        return 0;
    if (ctxt->instate == XML_PARSER_EOF)
        return 0;

    /* fast path: printable ASCII needs no decoding in any charset */
    if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) {
        *len = 1;
        return (int) *ctxt->input->cur;
    }

    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
        /*
         * We are supposed to handle UTF8, check it's valid
         * From rfc2044: encoding of the Unicode values on UTF-8:
         *
         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
         * 0000 0000-0000 007F   0xxxxxxx
         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
         *
         * Check for the 0x110000 limit too
         */
        const unsigned char *seq = ctxt->input->cur;
        const unsigned char lead = *seq;
        unsigned int code;

        if (lead & 0x80) {
            /* a continuation byte or 0xC0 cannot start a sequence */
            if (((lead & 0x40) == 0) || (lead == 0xC0))
                goto encoding_error;
            if (seq[1] == 0) {
                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                seq = ctxt->input->cur;
            }
            if ((seq[1] & 0xc0) != 0x80)
                goto encoding_error;

            if ((lead & 0xe0) != 0xe0) {
                /* 2-byte code */
                *len = 2;
                code = (seq[0] & 0x1f) << 6;
                code |= seq[1] & 0x3f;
                if (code < 0x80)
                    goto encoding_error;
            } else {
                if (seq[2] == 0) {
                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                    seq = ctxt->input->cur;
                }
                if ((seq[2] & 0xc0) != 0x80)
                    goto encoding_error;

                if ((lead & 0xf0) == 0xf0) {
                    if (seq[3] == 0) {
                        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                        seq = ctxt->input->cur;
                    }
                    if (((lead & 0xf8) != 0xf0) ||
                        ((seq[3] & 0xc0) != 0x80))
                        goto encoding_error;
                    /* 4-byte code */
                    *len = 4;
                    code = (seq[0] & 0x7) << 18;
                    code |= (seq[1] & 0x3f) << 12;
                    code |= (seq[2] & 0x3f) << 6;
                    code |= seq[3] & 0x3f;
                    if (code < 0x10000)
                        goto encoding_error;
                } else {
                    /* 3-byte code */
                    *len = 3;
                    code = (seq[0] & 0xf) << 12;
                    code |= (seq[1] & 0x3f) << 6;
                    code |= seq[2] & 0x3f;
                    if (code < 0x800)
                        goto encoding_error;
                }
            }

            if (!IS_CHAR(code)) {
                xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
                                  "Char 0x%X out of allowed range\n", code);
            }
            return (int) code;
        }

        /*
         * 1-byte code: a zero byte is either the end of the buffered
         * data (try to get more) or a NUL inside the data (an error).
         */
        if (*ctxt->input->cur == 0)
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
        if ((*ctxt->input->cur == 0) &&
            (ctxt->input->end > ctxt->input->cur)) {
            xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
                              "Char 0x0 out of allowed range\n", 0);
        }
    }

    /*
     * Single byte character, reached for the UTF-8 1-byte case and for
     * any other charset.
     * Assume it's a fixed length encoding (1) with
     * a compatible encoding for the ASCII set, since
     * XML constructs only use < 128 chars
     */
    *len = 1;
    if (*ctxt->input->cur == 0xD) {
        /* "#xD#xA" and a lone #xD are both passed on as #xA */
        if (ctxt->input->cur[1] == 0xA) {
            ctxt->nbChars++;
            ctxt->input->cur++;
        }
        return 0xA;
    }
    return (int) *ctxt->input->cur;

encoding_error:
    /*
     * An encoding problem may arise from a truncated input buffer
     * splitting a character in the middle. In that case do not raise
     * an error but return 0 to indicate an end of stream problem
     */
    if (ctxt->input->end - ctxt->input->cur < 4) {
        *len = 0;
        return 0;
    }

    /*
     * If we detect an UTF8 error that probably mean that the
     * input encoding didn't get properly advertised in the
     * declaration header. Report the error and switch the encoding
     * to ISO-Latin-1 (if you don't like this policy, just declare the
     * encoding !)
     */
    {
        char buffer[150];

        snprintf(&buffer[0], sizeof(buffer) - 1,
                 "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
                 ctxt->input->cur[0], ctxt->input->cur[1],
                 ctxt->input->cur[2], ctxt->input->cur[3]);
        __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
                     "Input is not proper UTF-8, indicate encoding !\n%s",
                     BAD_CAST buffer, NULL);
    }
    ctxt->charset = XML_CHAR_ENCODING_8859_1;
    *len = 1;
    return (int) *ctxt->input->cur;
}

/**
 * xmlStringCurrentChar:
 * @ctxt:  the XML parser context
 * @cur:  pointer to the beginning of the char
 * @len:  pointer to the length of the char read
 *
744
 * The current char value, if using UTF-8 this may actually span multiple
745 746
 * bytes in the input buffer.
 *
747
 * Returns the current char value and its length
748 749 750
 */

int
751 752
xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len)
{
753
    if ((len == NULL) || (cur == NULL)) return(0);
754
    if ((ctxt == NULL) || (ctxt->charset == XML_CHAR_ENCODING_UTF8)) {
755 756 757 758 759 760 761
        /*
         * We are supposed to handle UTF8, check it's valid
         * From rfc2044: encoding of the Unicode values on UTF-8:
         *
         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
         * 0000 0000-0000 007F   0xxxxxxx
         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
Daniel Veillard's avatar
Daniel Veillard committed
762
         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799
         *
         * Check for the 0x110000 limit too
         */
        unsigned char c;
        unsigned int val;

        c = *cur;
        if (c & 0x80) {
            if ((cur[1] & 0xc0) != 0x80)
                goto encoding_error;
            if ((c & 0xe0) == 0xe0) {

                if ((cur[2] & 0xc0) != 0x80)
                    goto encoding_error;
                if ((c & 0xf0) == 0xf0) {
                    if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80))
                        goto encoding_error;
                    /* 4-byte code */
                    *len = 4;
                    val = (cur[0] & 0x7) << 18;
                    val |= (cur[1] & 0x3f) << 12;
                    val |= (cur[2] & 0x3f) << 6;
                    val |= cur[3] & 0x3f;
                } else {
                    /* 3-byte code */
                    *len = 3;
                    val = (cur[0] & 0xf) << 12;
                    val |= (cur[1] & 0x3f) << 6;
                    val |= cur[2] & 0x3f;
                }
            } else {
                /* 2-byte code */
                *len = 2;
                val = (cur[0] & 0x1f) << 6;
                val |= cur[1] & 0x3f;
            }
            if (!IS_CHAR(val)) {
800 801
	        xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
				  "Char 0x%X out of allowed range\n", val);
802 803 804 805 806 807 808
            }
            return (val);
        } else {
            /* 1-byte code */
            *len = 1;
            return ((int) *cur);
        }
809 810
    }
    /*
811
     * Assume it's a fixed length encoding (1) with
812
     * a compatible encoding for the ASCII set, since
813 814 815
     * XML constructs only use < 128 chars
     */
    *len = 1;
816
    return ((int) *cur);
817
encoding_error:
818

819 820 821 822 823 824 825 826 827 828
    /*
     * An encoding problem may arise from a truncated input buffer
     * splitting a character in the middle. In that case do not raise
     * an error but return 0 to endicate an end of stream problem
     */
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
        (ctxt->input->end - ctxt->input->cur < 4)) {
	*len = 0;
	return(0);
    }
829 830
    /*
     * If we detect an UTF8 error that probably mean that the
831
     * input encoding didn't get properly advertised in the
832 833 834 835
     * declaration header. Report the error and switch the encoding
     * to ISO-Latin-1 (if you don't like this policy, just declare the
     * encoding !)
     */
836 837 838 839 840 841 842 843 844
    {
        char buffer[150];

	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
			ctxt->input->cur[0], ctxt->input->cur[1],
			ctxt->input->cur[2], ctxt->input->cur[3]);
	__xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
		     "Input is not proper UTF-8, indicate encoding !\n%s",
		     BAD_CAST buffer, NULL);
845 846
    }
    *len = 1;
847
    return ((int) *cur);
848 849 850
}

/**
851
 * xmlCopyCharMultiByte:
852
 * @out:  pointer to an array of xmlChar
853 854
 * @val:  the char value
 *
Daniel Veillard's avatar
Daniel Veillard committed
855
 * append the char value in the array
856 857 858 859
 *
 * Returns the number of xmlChar written
 */
int
860
xmlCopyCharMultiByte(xmlChar *out, int val) {
861
    if (out == NULL) return(0);
862 863 864 865 866 867 868
    /*
     * We are supposed to handle UTF8, check it's valid
     * From rfc2044: encoding of the Unicode values on UTF-8:
     *
     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F   0xxxxxxx
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
Daniel Veillard's avatar
Daniel Veillard committed
869
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
870
     */
871 872 873 874 875 876 877
    if  (val >= 0x80) {
	xmlChar *savedout = out;
	int bits;
	if (val <   0x800) { *out++= (val >>  6) | 0xC0;  bits=  0; }
	else if (val < 0x10000) { *out++= (val >> 12) | 0xE0;  bits=  6;}
	else if (val < 0x110000)  { *out++= (val >> 18) | 0xF0;  bits=  12; }
	else {
878
	    xmlErrEncodingInt(NULL, XML_ERR_INVALID_CHAR,
879
		    "Internal error, xmlCopyCharMultiByte 0x%X out of bound\n",
880
			      val);
881 882
	    return(0);
	}
883 884 885
	for ( ; bits >= 0; bits-= 6)
	    *out++= ((val >> bits) & 0x3F) | 0x80 ;
	return (out - savedout);
886
    }
887 888 889
    *out = (xmlChar) val;
    return 1;
}
890

891 892 893
/**
 * xmlCopyChar:
 * @len:  Ignored, compatibility
894
 * @out:  pointer to an array of xmlChar
895 896
 * @val:  the char value
 *
Daniel Veillard's avatar
Daniel Veillard committed
897
 * append the char value in the array
898 899 900
 *
 * Returns the number of xmlChar written
 */
901

902
int
903
xmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) {
904
    if (out == NULL) return(0);
905 906 907
    /* the len parameter is ignored */
    if  (val >= 0x80) {
	return(xmlCopyCharMultiByte (out, val));
908 909
    }
    *out = (xmlChar) val;
910
    return 1;
911 912 913 914 915 916 917 918
}

/************************************************************************
 *									*
 *		Commodity functions to switch encodings			*
 *									*
 ************************************************************************/

919 920 921 922 923 924
/*
 * Forward declarations of two file-local (static) helpers. Their
 * definitions are not visible in this part of the file.
 * NOTE(review): presumably the shared implementation behind the public
 * encoding-switch entry points, with @len bounding the amount of input
 * to convert -- confirm against the definitions.
 */
static int
xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
                       xmlCharEncodingHandlerPtr handler, int len);
static int
xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
                          xmlCharEncodingHandlerPtr handler, int len);
925 926 927 928 929 930 931 932 933 934 935 936 937 938
/**
 * xmlSwitchEncoding:
 * @ctxt:  the parser context
 * @enc:  the encoding value (number)
 *
 * change the input functions when discovering the character encoding
 * of a given entity.
 *
 * Returns 0 in case of success, -1 otherwise
 */
int
xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
{
    xmlCharEncodingHandlerPtr handler;
939
    int len = -1;
940
    int ret;
941

942
    if (ctxt == NULL) return(-1);
943 944
    switch (enc) {
	case XML_CHAR_ENCODING_ERROR:
945
	    __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING,
946
	                   "encoding unknown\n", NULL, NULL);
947
	    return(-1);
948 949 950 951 952 953 954
	case XML_CHAR_ENCODING_NONE:
	    /* let's assume it's UTF-8 without the XML decl */
	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
	    return(0);
	case XML_CHAR_ENCODING_UTF8:
	    /* default encoding, no conversion should be needed */
	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
955 956 957 958 959 960

	    /*
	     * Errata on XML-1.0 June 20 2001
	     * Specific handling of the Byte Order Mark for
	     * UTF-8
	     */
961 962
	    if ((ctxt->input != NULL) &&
		(ctxt->input->cur[0] == 0xEF) &&
963 964 965 966
		(ctxt->input->cur[1] == 0xBB) &&
		(ctxt->input->cur[2] == 0xBF)) {
		ctxt->input->cur += 3;
	    }
967
	    return(0);
968 969 970 971 972 973 974 975 976 977
    case XML_CHAR_ENCODING_UTF16LE:
    case XML_CHAR_ENCODING_UTF16BE:
        /*The raw input characters are encoded
         *in UTF-16. As we expect this function
         *to be called after xmlCharEncInFunc, we expect
         *ctxt->input->cur to contain UTF-8 encoded characters.
         *So the raw UTF16 Byte Order Mark
         *has also been converted into
         *an UTF-8 BOM. Let's skip that BOM.
         */
978
        if ((ctxt->input != NULL) && (ctxt->input->cur != NULL) &&
979 980 981 982 983
            (ctxt->input->cur[0] == 0xEF) &&
            (ctxt->input->cur[1] == 0xBB) &&
            (ctxt->input->cur[2] == 0xBF)) {
            ctxt->input->cur += 3;
        }
984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010
        len = 90;
	break;
    case XML_CHAR_ENCODING_UCS2:
        len = 90;
	break;
    case XML_CHAR_ENCODING_UCS4BE:
    case XML_CHAR_ENCODING_UCS4LE:
    case XML_CHAR_ENCODING_UCS4_2143:
    case XML_CHAR_ENCODING_UCS4_3412:
        len = 180;
	break;
    case XML_CHAR_ENCODING_EBCDIC:
    case XML_CHAR_ENCODING_8859_1:
    case XML_CHAR_ENCODING_8859_2:
    case XML_CHAR_ENCODING_8859_3:
    case XML_CHAR_ENCODING_8859_4:
    case XML_CHAR_ENCODING_8859_5:
    case XML_CHAR_ENCODING_8859_6:
    case XML_CHAR_ENCODING_8859_7:
    case XML_CHAR_ENCODING_8859_8:
    case XML_CHAR_ENCODING_8859_9:
    case XML_CHAR_ENCODING_ASCII:
    case XML_CHAR_ENCODING_2022_JP:
    case XML_CHAR_ENCODING_SHIFT_JIS:
    case XML_CHAR_ENCODING_EUC_JP:
        len = 45;
	break;
1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026
    }
    handler = xmlGetCharEncodingHandler(enc);
    if (handler == NULL) {
	/*
	 * Default handlers.
	 */
	switch (enc) {
	    case XML_CHAR_ENCODING_ASCII:
		/* default encoding, no conversion should be needed */
		ctxt->charset = XML_CHAR_ENCODING_UTF8;
		return(0);
	    case XML_CHAR_ENCODING_UTF16LE:
		break;
	    case XML_CHAR_ENCODING_UTF16BE:
		break;
	    case XML_CHAR_ENCODING_UCS4LE:
1027
		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1028 1029
			       "encoding not supported %s\n",
			       BAD_CAST "USC4 little endian", NULL);
1030 1031
		break;
	    case XML_CHAR_ENCODING_UCS4BE:
1032
		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1033 1034
			       "encoding not supported %s\n",
			       BAD_CAST "USC4 big endian", NULL);
1035 1036
		break;
	    case XML_CHAR_ENCODING_EBCDIC:
1037
		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1038 1039
			       "encoding not supported %s\n",
			       BAD_CAST "EBCDIC", NULL);
1040 1041
		break;
	    case XML_CHAR_ENCODING_UCS4_2143:
1042
		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1043 1044
			       "encoding not supported %s\n",
			       BAD_CAST "UCS4 2143", NULL);
1045 1046
		break;
	    case XML_CHAR_ENCODING_UCS4_3412:
1047
		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1048 1049
			       "encoding not supported %s\n",
			       BAD_CAST "UCS4 3412", NULL);
1050 1051
		break;
	    case XML_CHAR_ENCODING_UCS2:
1052
		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1053 1054
			       "encoding not supported %s\n",
			       BAD_CAST "UCS2", NULL);
1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072
		break;
	    case XML_CHAR_ENCODING_8859_1:
	    case XML_CHAR_ENCODING_8859_2:
	    case XML_CHAR_ENCODING_8859_3:
	    case XML_CHAR_ENCODING_8859_4:
	    case XML_CHAR_ENCODING_8859_5:
	    case XML_CHAR_ENCODING_8859_6:
	    case XML_CHAR_ENCODING_8859_7:
	    case XML_CHAR_ENCODING_8859_8:
	    case XML_CHAR_ENCODING_8859_9:
		/*
		 * We used to keep the internal content in the
		 * document encoding however this turns being unmaintainable
		 * So xmlGetCharEncodingHandler() will return non-null
		 * values for this now.
		 */
		if ((ctxt->inputNr == 1) &&
		    (ctxt->encoding == NULL) &&
1073
		    (ctxt->input != NULL) &&
1074 1075 1076 1077 1078 1079
		    (ctxt->input->encoding != NULL)) {
		    ctxt->encoding = xmlStrdup(ctxt->input->encoding);
		}
		ctxt->charset = enc;
		return(0);
	    case XML_CHAR_ENCODING_2022_JP:
1080
		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1081 1082
			       "encoding not supported %s\n",
			       BAD_CAST "ISO-2022-JP", NULL);
1083 1084
		break;
	    case XML_CHAR_ENCODING_SHIFT_JIS:
1085
		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1086 1087
			       "encoding not supported %s\n",
			       BAD_CAST "Shift_JIS", NULL);
1088 1089
		break;
	    case XML_CHAR_ENCODING_EUC_JP:
1090
		__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1091 1092
			       "encoding not supported %s\n",
			       BAD_CAST "EUC-JP", NULL);
1093
		break;
1094 1095
	    default:
	        break;
1096 1097 1098 1099 1100
	}
    }
    if (handler == NULL)
	return(-1);
    ctxt->charset = XML_CHAR_ENCODING_UTF8;
1101 1102 1103 1104 1105 1106 1107 1108 1109
    ret = xmlSwitchToEncodingInt(ctxt, handler, len);
    if ((ret < 0) || (ctxt->errNo == XML_I18N_CONV_FAILED)) {
        /*
	 * on encoding conversion errors, stop the parser
	 */
        xmlStopParser(ctxt);
	ctxt->errNo = XML_I18N_CONV_FAILED;
    }
    return(ret);
1110 1111 1112
}

/**
 * xmlSwitchInputEncodingInt:
 * @ctxt:  the parser context
 * @input:  the input stream
 * @handler:  the encoding handler
 * @len:  the number of bytes to convert for the first line or -1
 *
 * change the input functions when discovering the character encoding
 * of a given entity.
 *
 * Returns 0 in case of success, -1 otherwise
 */
1124 1125 1126
static int
xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
                          xmlCharEncodingHandlerPtr handler, int len)
1127 1128 1129
{
    int nbchars;

1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141
    if (handler == NULL)
        return (-1);
    if (input == NULL)
        return (-1);
    if (input->buf != NULL) {
        if (input->buf->encoder != NULL) {
            /*
             * Check in case the auto encoding detection was already
             * triggered.
             */
            if (input->buf->encoder == handler)
                return (0);
1142

1143 1144 1145 1146 1147 1148 1149 1150 1151
            /*
             * "UTF-16" can be used for both LE and BE
             if ((!xmlStrncmp(BAD_CAST input->buf->encoder->name,
             BAD_CAST "UTF-16", 6)) &&
             (!xmlStrncmp(BAD_CAST handler->name,
             BAD_CAST "UTF-16", 6))) {
             return(0);
             }
             */
1152

1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164