camel-mime-parser.c 47.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
/*
 *  Copyright (C) 2000 Helix Code Inc.
 *
 *  Authors: Michael Zucchi <notzed@helixcode.com>
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Library General Public License
 *  as published by the Free Software Foundation; either version 2 of
 *  the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Library General Public License for more details.
 *
 *  You should have received a copy of the GNU Library General Public
 *  License along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/* What should hopefully be a fast mail parser */

23 24 25 26 27
/* Do not change this code without asking me (Michael Zucchi) first

   There is almost always a reason something was done a certain way.
 */

28 29 30 31 32 33 34 35 36 37 38 39
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

#include <string.h>

#include <stdio.h>
#include <errno.h>

#include <unicode.h>

40
#include <regex.h>
41
#include <ctype.h>
42

43
#include <glib.h>
44 45 46 47 48 49
#include "camel-mime-parser.h"
#include "camel-mime-utils.h"
#include "camel-mime-filter.h"
#include "camel-stream.h"
#include "camel-seekable-stream.h"

Not Zed's avatar
Not Zed committed
50 51 52 53
/* Debug wrappers: each one expands to nothing, so any statement wrapped in
   r(), h(), c() or d() is compiled out of the build. */
#define r(x) 
#define h(x) 
#define c(x) 
#define d(x) 
54

55 56
/*#define PURIFY*/

57 58 59
/* When defined, the pooled allocator (mempool_*) below is compiled in. */
#define MEMPOOL

/* Granularity, in bytes, that mempool_alloc() rounds request sizes to. */
#define STRUCT_ALIGN 4
60

61 62 63 64 65
#ifdef PURIFY
/* Watch-point ids returned by purify_watch()/purify_watch_n(); -1 until the
   first watch is installed by folder_read() or folder_seek(). */
int inend_id = -1,
  inbuffer_id = -1;
#endif

NotZed's avatar
NotZed committed
66
#if 0
/* Disabled allocation accounting: when enabled, every g_strdup/g_malloc/g_free
   in this file bumps an externally defined counter before doing the real call. */
extern int strdup_count;
extern int malloc_count;
extern int free_count;

#define g_strdup(x) (strdup_count++, g_strdup(x))
#define g_malloc(x) (malloc_count++, g_malloc(x))
#define g_free(x) (free_count++, g_free(x))
#endif
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116

#ifdef MEMPOOL
/* One fixed-size pool block.  Allocations are carved from the end of data[]
   towards the start; 'free' is the number of bytes still unused. */
typedef struct _MemPoolNode {
	struct _MemPoolNode *next;	/* next block in the pool's list */

	int free;			/* bytes still available at the front of data[] */
	char data[1];			/* really pool->blocksize bytes, allocated with the node */
} MemPoolNode;

/* A single over-threshold allocation, given its own malloc'd node. */
typedef struct _MemPoolThresholdNode {
	struct _MemPoolThresholdNode *next;	/* next large allocation in the list */
	char data[1];				/* really the (rounded) request size */
} MemPoolThresholdNode;

/* A simple pool allocator: small requests share blocks, large ones get
   individual nodes.  Nothing is freed individually, only by flushing. */
typedef struct _MemPool {
	int blocksize;		/* payload size of each shared block */
	int threshold;		/* requests of at least this (rounded) size bypass the blocks */
	struct _MemPoolNode *blocks;				/* shared blocks */
	struct _MemPoolThresholdNode *threshold_blocks;	/* individually allocated large requests */
} MemPool;

/* Pool allocator entry points, defined immediately below. */
MemPool *mempool_new(int blocksize, int threshold);
void *mempool_alloc(MemPool *pool, int size);
void mempool_flush(MemPool *pool, int freeall);
void mempool_free(MemPool *pool);

/* Create an empty pool.
   @blocksize: payload size of each shared block.
   @threshold: rounded request sizes at or above this get their own node;
   a value that is not below @blocksize is replaced by two thirds of it.
   Returns the new pool; release it with mempool_free(). */
MemPool *mempool_new(int blocksize, int threshold)
{
	MemPool *p = g_malloc(sizeof(*p));

	p->blocksize = blocksize;
	p->threshold = (threshold < blocksize) ? threshold : blocksize * 2 / 3;
	p->blocks = NULL;
	p->threshold_blocks = NULL;

	return p;
}

/* Allocate @size bytes from @pool.  The memory is uninitialised and lives
   until the pool is flushed or freed; it must not be freed individually. */
void *mempool_alloc(MemPool *pool, int size)
{
	MemPoolNode *blk;

	/* rounds to the next multiple of STRUCT_ALIGN strictly above size
	   (an already aligned size still grows by STRUCT_ALIGN) */
	size = (size + STRUCT_ALIGN) & (~(STRUCT_ALIGN-1));

	/* large requests get a dedicated node on the threshold list */
	if (size >= pool->threshold) {
		MemPoolThresholdNode *big = g_malloc(sizeof(*big) - sizeof(char) + size);

		big->next = pool->threshold_blocks;
		pool->threshold_blocks = big;
		return &big->data[0];
	}

	/* first fit: take the space from the tail of any block with room */
	for (blk = pool->blocks; blk != NULL; blk = blk->next) {
		if (blk->free >= size) {
			blk->free -= size;
			return &blk->data[blk->free];
		}
	}

	/* nothing fits, push a fresh block onto the front of the list */
	blk = g_malloc(sizeof(*blk) - sizeof(char) + pool->blocksize);
	blk->next = pool->blocks;
	pool->blocks = blk;
	blk->free = pool->blocksize - size;
	return &blk->data[blk->free];
}

/* Invalidate every allocation made from @pool.  Large (threshold) nodes are
   always released; shared blocks are released when @freeall is non-zero and
   otherwise just marked completely free for reuse. */
void mempool_flush(MemPool *pool, int freeall)
{
	MemPoolThresholdNode *big, *big_next;
	MemPoolNode *blk, *blk_next;

	for (big = pool->threshold_blocks; big != NULL; big = big_next) {
		big_next = big->next;
		g_free(big);
	}
	pool->threshold_blocks = NULL;

	if (!freeall) {
		/* keep the blocks, only reset their free counters */
		for (blk = pool->blocks; blk != NULL; blk = blk->next)
			blk->free = pool->blocksize;
		return;
	}

	for (blk = pool->blocks; blk != NULL; blk = blk_next) {
		blk_next = blk->next;
		g_free(blk);
	}
	pool->blocks = NULL;
}

/* Release @pool and everything allocated from it.  A NULL pool is ignored. */
void mempool_free(MemPool *pool)
{
	if (pool == NULL)
		return;

	mempool_flush(pool, 1);
	g_free(pool);
}
182

183 184 185 186 187 188 189 190 191 192 193 194 195
#endif












196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
#define SCAN_BUF 4096		/* size of read buffer */
#define SCAN_HEAD 128		/* headroom guaranteed to be before each read buffer */

/* a little hacky, but i couldn't be bothered renaming everything */
/* the scanner state struct doubles as the object's private data type */
#define _header_scan_state _CamelMimeParserPrivate
/* fetch the private scanner state pointer from a CamelMimeParser object */
#define _PRIVATE(o) (((CamelMimeParser *)(o))->priv)

/* Complete state of one scanner: input source, read buffer, and the stack
   of message parts currently being parsed. */
struct _header_scan_state {

	/* ---- global state ---- */

	enum _header_state state;	/* current parser state */

	/* scratch buffer used to build headers while scanning */
	char *outbuf;
	char *outptr;
	char *outend;

	int fd;			/* input, when reading from a file descriptor */
	CamelStream *stream;	/* input, when reading from a stream */

	/* input buffer */
	char *realbuf;		/* the real allocation, SCAN_HEAD*2 + SCAN_BUF bytes */
	char *inbuf;		/* start of the read area inside realbuf; the space before it */
				/* (up to SCAN_HEAD) is for use by filters so they don't copy all data */
	char *inptr;		/* next unread byte */
	char *inend;		/* one past the last valid byte; a '\n' sentinel is stored here */

	int atleast;		/* folder_read() refills once no more than this many bytes remain */

	int seek;		/* current offset to start of buffer */
	int unstep;		/* how many states to 'unstep' (repeat the current state) */

	unsigned int midline:1;		/* are we mid-line interrupted? */
	unsigned int scan_from:1;	/* do we care about From lines? */
	unsigned int scan_pre_from:1;	/* do we return pre-from data? */

	int start_of_from;	/* where from started */
	int start_of_headers;	/* where headers started from the last scan */

	int header_start;	/* start of last header, or -1 */

	/* filters to apply to all content before output */
	int filterid;		/* id of next filter */
	struct _header_scan_filter *filters;

	/* ---- per message/part info ---- */
	struct _header_scan_stack *parts;	/* innermost part; linked outwards via ->parent */

};

struct _header_scan_stack {
	struct _header_scan_stack *parent;

	enum _header_state savestate; /* state at invocation of this part */

251 252 253
#ifdef MEMPOOL
	MemPool *pool;		/* memory pool to keep track of headers/etc at this level */
#endif
254 255 256 257
	struct _header_raw *headers;	/* headers for this part */

	struct _header_content_type *content_type;

258 259 260 261 262
	/* I dont use GString's casue you can't efficiently append a buffer to them */
	GByteArray *pretext;	/* for multipart types, save the pre-boundary data here */
	GByteArray *posttext;	/* for multipart types, save the post-boundary data here */
	int prestage;		/* used to determine if it is a pre-boundary or post-boundary data segment */

263 264
	GByteArray *from_line;	/* the from line */

265
	char *boundary;		/* for multipart/ * boundaries, including leading -- and trailing -- for the final part */
266 267 268
	int boundarylen;	/* actual length of boundary, including leading -- if there is one */
	int boundarylenfinal;	/* length of boundary, including trailing -- if there is one */
	int atleast;		/* the biggest boundary from here to the parent */
269 270 271 272 273 274 275 276 277
};

/* One entry in the singly linked list of content filters. */
struct _header_scan_filter {
	struct _header_scan_filter *next;	/* next filter in the list, or NULL */
	int id;					/* id handed back by camel_mime_parser_filter_add() */
	CamelMimeFilter *filter;		/* the filter; a reference is held while listed */
};

static void folder_scan_step(struct _header_scan_state *s, char **databuffer, int *datalength);
278
static void folder_scan_drop_step(struct _header_scan_state *s);
279 280 281 282 283 284
static int folder_scan_init_with_fd(struct _header_scan_state *s, int fd);
static int folder_scan_init_with_stream(struct _header_scan_state *s, CamelStream *stream);
static struct _header_scan_state *folder_scan_init(void);
static void folder_scan_close(struct _header_scan_state *s);
static struct _header_scan_stack *folder_scan_content(struct _header_scan_state *s, int *lastone, char **data, int *length);
static struct _header_scan_stack *folder_scan_header(struct _header_scan_state *s, int *lastone);
285
static int folder_scan_skip_line(struct _header_scan_state *s, GByteArray *save);
286 287
static off_t folder_seek(struct _header_scan_state *s, off_t offset, int whence);
static off_t folder_tell(struct _header_scan_state *s);
288
static int folder_read(struct _header_scan_state *s);
289 290 291
#ifdef MEMPOOL
static void header_append_mempool(struct _header_scan_state *s, struct _header_scan_stack *h, char *header, int offset);
#endif
292 293 294 295

static void camel_mime_parser_class_init (CamelMimeParserClass *klass);
static void camel_mime_parser_init       (CamelMimeParser *obj);

296
/* Printable names for the parser states, indexed by state value and used
   only by the d() debug output.  The condition below is "#if 0" while d()
   expands to nothing, and "#if !0" once d(x) is defined to expand to x. */
#if d(!)0
static char *states[] = {
	"HSCAN_INITIAL",
	"HSCAN_PRE_FROM",	/* pre-from data */
	"HSCAN_FROM",		/* got 'From' line */
	"HSCAN_HEADER",		/* toplevel header */
	"HSCAN_BODY",		/* scanning body of message */
	"HSCAN_MULTIPART",	/* got multipart header */
	"HSCAN_MESSAGE",	/* rfc822/news message */

	"HSCAN_PART",		/* part of a multipart */

	"HSCAN_EOF",		/* end of file */
	"HSCAN_PRE_FROM_END",
	"HSCAN_FROM_END",
	"HSCAN_HEADER_END",	/* was misspelt "HSCAN_HEAER_END" */
	"HSCAN_BODY_END",
	"HSCAN_MULTIPART_END",
	"HSCAN_MESSAGE_END",
};
#endif
317

318
/* parent class functions, looked up once in camel_mime_parser_class_init() */
static CamelObjectClass *camel_mime_parser_parent;
319

Peter Williams's avatar
Peter Williams committed
320 321 322 323 324
/* Class initialiser: remember the parent (CamelObject) class functions. */
static void
camel_mime_parser_class_init (CamelMimeParserClass *klass)
{
	CamelType parent_type = camel_object_get_type ();

	camel_mime_parser_parent = camel_type_get_global_classfuncs (parent_type);
}
325

Peter Williams's avatar
Peter Williams committed
326 327
/* Instance initialiser: give the new object a fresh scanner state. */
static void
camel_mime_parser_init (CamelMimeParser *obj)
{
	_PRIVATE(obj) = folder_scan_init();
}

static void
336
camel_mime_parser_finalise(CamelObject *o)
337 338
{
	struct _header_scan_state *s = _PRIVATE(o);
339 340 341
#ifdef PURIFY
	purify_watch_remove_all();
#endif
342 343 344
	folder_scan_close(s);
}

Peter Williams's avatar
Peter Williams committed
345 346
CamelType
camel_mime_parser_get_type (void)
347
{
Peter Williams's avatar
Peter Williams committed
348
	static CamelType type = CAMEL_INVALID_TYPE;
349
	
Peter Williams's avatar
Peter Williams committed
350 351 352 353 354 355 356
	if (type == CAMEL_INVALID_TYPE) {
		type = camel_type_register (camel_object_get_type (), "CamelMimeParser",
					    sizeof (CamelMimeParser),
					    sizeof (CamelMimeParserClass),
					    (CamelObjectClassInitFunc) camel_mime_parser_class_init,
					    NULL,
					    (CamelObjectInitFunc) camel_mime_parser_init,
357
					    (CamelObjectFinalizeFunc) camel_mime_parser_finalise);
Peter Williams's avatar
Peter Williams committed
358 359 360
	}
	
	return type;
361 362 363 364 365 366 367 368 369 370 371 372
}

/**
 * camel_mime_parser_new:
 *
 * Create a new CamelMimeParser object.
 * 
 * Return value: A new CamelMimeParser object.
 **/
CamelMimeParser *
camel_mime_parser_new (void)
{
	/* instantiate through the object system, then cast to the parser type */
	return CAMEL_MIME_PARSER ( camel_object_new (camel_mime_parser_get_type ()));
}


378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393
/**
 * camel_mime_parser_filter_add:
 * @m: 
 * @mf: 
 * 
 * Add a filter that will be applied to any body content before it is passed
 * to the caller.  Filters may be pipelined to perform multi-pass operations
 * on the content, and are applied in the order they were added.
 *
 * Note that filters are only applied to the body content of messages, and once
 * a filter has been set, all content returned by a filter_step() with a state
 * of HSCAN_BODY will have passed through the filter.
 * 
 * Return value: An id that may be passed to filter_remove() to remove
 * the filter, or -1 if the operation failed.
 **/
394 395 396 397 398 399 400 401 402 403 404 405
int
camel_mime_parser_filter_add(CamelMimeParser *m, CamelMimeFilter *mf)
{
	struct _header_scan_state *s = _PRIVATE(m);
	struct _header_scan_filter **tail, *node;

	node = g_malloc(sizeof(*node));
	node->filter = mf;
	node->id = s->filterid++;
	/* never let the next id be -1 */
	if (s->filterid == -1)
		s->filterid++;
	node->next = NULL;
	/* the list keeps its own reference until the filter is removed */
	camel_object_ref((CamelObject *)mf);

	/* append at the tail so filters keep the order they were added in;
	   walk the link pointers rather than casting &s->filters to a node */
	tail = &s->filters;
	while (*tail)
		tail = &(*tail)->next;
	*tail = node;

	return node->id;
}

416 417 418 419 420 421 422 423
/**
 * camel_mime_parser_filter_remove:
 * @m: 
 * @id: 
 * 
 * Remove a processing filter from the pipeline.  There is no
 * restriction on the order the filters can be removed.
 **/
424 425 426 427 428 429 430 431 432 433
void
camel_mime_parser_filter_remove(CamelMimeParser *m, int id)
{
	struct _header_scan_state *s = _PRIVATE(m);
	struct _header_scan_filter **link = &s->filters;

	/* there should only be a single matching id, but scan the whole
	   list anyway */
	while (*link) {
		struct _header_scan_filter *old = *link;

		if (old->id == id) {
			/* drop the reference taken by filter_add() and unlink;
			   'link' stays put so the node that moved into this
			   slot is examined too instead of being skipped */
			camel_object_unref((CamelObject *)old->filter);
			*link = old->next;
			g_free(old);
		} else {
			link = &old->next;
		}
	}
}

444 445 446 447 448 449 450 451 452 453 454 455
/**
 * camel_mime_parser_header:
 * @m: 
 * @name: Name of header.
 * @offset: Pointer that can receive the offset of the header in
 * the stream from the start of parsing.
 * 
 * Lookup a header by name.
 * 
 * Return value: The header value, or NULL if the header is not
 * defined.
 **/
456 457 458 459 460 461 462 463 464 465 466 467
const char *
camel_mime_parser_header(CamelMimeParser *m, const char *name, int *offset)
{
	struct _header_scan_state *state = _PRIVATE(m);
	struct _header_scan_stack *part = state->parts;

	/* no current part, or a part without headers: nothing to look up */
	if (part == NULL || part->headers == NULL)
		return NULL;

	return header_raw_find(&part->headers, name, offset);
}

468 469 470 471 472 473 474 475 476
/**
 * camel_mime_parser_headers_raw:
 * @m: 
 * 
 * Get the list of the raw headers which are defined for the
 * current state of the parser.  These headers are valid
 * until the next call to parser_step(), or parser_drop_step().
 * 
 * Return value: The raw headers, or NULL if there are no headers
477
 * defined for the current part or state.  These are READ ONLY.
478
 **/
479 480 481 482 483 484 485 486 487 488
struct _header_raw *
camel_mime_parser_headers_raw(CamelMimeParser *m)
{
	struct _header_scan_state *state = _PRIVATE(m);

	/* headers belong to the innermost part on the stack, if there is one */
	return state->parts ? state->parts->headers : NULL;
}

489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541
/* View a byte array as a C string.  If the array is empty or does not
   already end in a NUL byte, one is appended to the array itself (so its
   length grows by one).  Returns NULL for a NULL array; the returned
   pointer is the array's own storage. */
static const char *
byte_array_to_string(GByteArray *array)
{
	int terminated;

	if (array == NULL)
		return NULL;

	terminated = array->len != 0 && array->data[array->len-1] == '\0';
	if (!terminated)
		g_byte_array_append(array, "", 1);

	return array->data;
}

/**
 * camel_mime_parser_preface:
 * @m: 
 * 
 * Retrieve the preface text for the current multipart.
 * Can only be used when the state is HSCAN_MULTIPART_END.
 * 
 * Return value: The preface text, or NULL if there wasn't any.
 **/
const char *
camel_mime_parser_preface(CamelMimeParser *m)
{
	struct _header_scan_state *state = _PRIVATE(m);

	/* without a current part there is no preface to return */
	if (state->parts == NULL)
		return NULL;

	return byte_array_to_string(state->parts->pretext);
}

/**
 * camel_mime_parser_postface:
 * @m: 
 * 
 * Retrieve the postface text for the current multipart.
 * Only returns valid data when the current state is
 * HSCAN_MULTIPART_END.
 * 
 * Return value: The postface text, or NULL if there wasn't any.
 **/
const char *
camel_mime_parser_postface(CamelMimeParser *m)
{
	struct _header_scan_state *state = _PRIVATE(m);

	/* without a current part there is no postface to return */
	if (state->parts == NULL)
		return NULL;

	return byte_array_to_string(state->parts->posttext);
}

542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564
/**
 * camel_mime_parser_from_line:
 * @m: 
 * 
 * Get the last scanned "From " line, from a recently scanned from.
 * This should only be called in the HSCAN_FROM state.  The
 * from line will include the closing \n found (if there was one).
 *
 * The return value will remain valid while in the HSCAN_FROM
 * state, or any deeper state.
 * 
 * Return value: The From line, or NULL if called out of context.
 **/
const char *
camel_mime_parser_from_line(CamelMimeParser *m)
{
	struct _header_scan_state *state = _PRIVATE(m);

	/* the From line is stored on the current part, if there is one */
	if (state->parts == NULL)
		return NULL;

	return byte_array_to_string(state->parts->from_line);
}
565

566 567 568 569 570 571 572 573 574 575 576 577 578 579 580
/**
 * camel_mime_parser_init_with_fd:
 * @m: 
 * @fd: A valid file descriptor.
 * 
 * Initialise the scanner with an fd.  The scanner's offsets
 * will be relative to the current file position of the file
 * descriptor.  As a result, seekable descriptors should
 * be seeked using the parser seek functions.
 * 
 * An initial buffer will be read from the file descriptor
 * immediately, although no parsing will occur.
 *
 * Return value: Returns -1 on error.
 **/
581 582 583 584 585 586 587 588
int
camel_mime_parser_init_with_fd(CamelMimeParser *m, int fd)
{
	/* all the work is done on the private scanner state */
	return folder_scan_init_with_fd(_PRIVATE(m), fd);
}

589 590 591 592 593 594 595 596 597 598 599 600 601 602 603
/**
 * camel_mime_parser_init_with_stream:
 * @m: 
 * @stream: 
 * 
 * Initialise the scanner with a source stream.  The scanner's
 * offsets will be relative to the current file position of
 * the stream.  As a result, seekable streams should only
 * be seeked using the parser seek function.
 *
 * An initial buffer will be read from the stream
 * immediately, although no parsing will occur.
 * 
 * Return value: -1 on error.
 **/
604 605 606 607 608 609 610 611
int
camel_mime_parser_init_with_stream(CamelMimeParser *m, CamelStream *stream)
{
	/* all the work is done on the private scanner state */
	return folder_scan_init_with_stream(_PRIVATE(m), stream);
}

612 613 614 615 616 617 618 619 620 621
/**
 * camel_mime_parser_scan_from:
 * @m: 
 * @scan_from: #TRUE if the scanner should scan From lines.
 * 
 * Tell the scanner if it should scan "^From " lines or not.
 *
 * If the scanner is scanning from lines, two additional
 * states HSCAN_FROM and HSCAN_FROM_END will be returned
 * to the caller during parsing.
622 623 624 625 626
 *
 * This may also be preceded by an optional
 * HSCAN_PRE_FROM state which contains the scanned data
 * found before the From line is encountered.  See also
 * scan_pre_from().
627
 **/
628 629 630 631 632 633 634
void
camel_mime_parser_scan_from(CamelMimeParser *m, int scan_from)
{
	struct _header_scan_state *s = _PRIVATE(m);

	/* scan_from is a 1-bit field: collapse any non-zero value to 1 first,
	   otherwise an even "true" value would be stored as 0 */
	s->scan_from = (scan_from != 0);
}

635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
/**
 * camel_mime_parser_scan_pre_from:
 * @m: 
 * @scan_pre_from: #TRUE if we want to get pre-from data.
 * 
 * Tell the scanner whether we want to know about the pre-from
 * data during a scan.  If we do, then we may get an additional
 * state HSCAN_PRE_FROM which returns the specified data.
 **/
void
camel_mime_parser_scan_pre_from(CamelMimeParser *m, int scan_pre_from)
{
	struct _header_scan_state *s = _PRIVATE(m);

	/* scan_pre_from is a 1-bit field: collapse any non-zero value to 1
	   first, otherwise an even "true" value would be stored as 0 */
	s->scan_pre_from = (scan_pre_from != 0);
}

651 652 653 654 655 656 657 658 659 660
/**
 * camel_mime_parser_content_type:
 * @m: 
 * 
 * Get the content type defined in the current part.
 * 
 * Return value: A content_type structure, or NULL if there
 * is no content-type defined for this part or state of the
 * parser.
 **/
661 662 663 664 665
struct _header_content_type *
camel_mime_parser_content_type(CamelMimeParser *m)
{
	struct _header_scan_state *state = _PRIVATE(m);

	/* FIXME: should this search up until it's found the 'right'
	   content-type?  can it? */
	return state->parts ? state->parts->content_type : NULL;
}

673 674 675 676 677 678 679 680 681 682 683
/**
 * camel_mime_parser_unstep:
 * @m: 
 * 
 * Cause the last step operation to repeat itself.  If this is 
 * called repeated times, then the same step will be repeated
 * that many times.
 *
 * Note that it is not possible to scan back using this function,
 * only to have a way of peeking the next state.
 **/
684 685 686 687 688 689 690
void camel_mime_parser_unstep(CamelMimeParser *m)
{
	struct _header_scan_state *state = _PRIVATE(m);

	/* one more camel_mime_parser_step() call will repeat the current state */
	state->unstep += 1;
}

691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713
/**
 * camel_mime_parser_drop_step:
 * @m: 
 * 
 * Drop the last step call.  This should only be used
 * in conjunction with seeking of the stream as the
 * stream may be in an undefined state relative to the
 * state of the parser.
 *
 * Use this call with care.
 **/
void camel_mime_parser_drop_step(CamelMimeParser *m)
{
	struct _header_scan_state *state = _PRIVATE(m);

	/* forget any pending repeats before dropping the step itself */
	state->unstep = 0;
	folder_scan_drop_step(state);
}

/**
 * camel_mime_parser_step:
 * @m: 
 * @databuffer: Pointer to accept a pointer to the data
714 715
 * associated with this step (if any).  May be #NULL,
 * in which case datalength is also ignored.
716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734
 * @datalength: Pointer to accept a pointer to the data
 * length associated with this step (if any).
 * 
 * Parse the next part of the MIME message.  If _unstep()
 * has been called, then continue to return the same state
 * for that many calls.
 *
 * If the step is HSCAN_BODY then the databuffer and datalength
 * pointers will be setup to point to the internal data buffer
 * of the scanner and may be processed as required.  Any
 * filters will have already been applied to this data.
 *
 * Refer to the state diagram elsewhere for a full listing of
 * the states an application is guaranteed to get from the
 * scanner.
 *
 * Return value: The current new state of the parser
 * is returned.
 **/
735 736 737 738 739
enum _header_state
camel_mime_parser_step(CamelMimeParser *m, char **databuffer, int *datalength)
{
	struct _header_scan_state *s = _PRIVATE(m);
	char *unused_data;
	int unused_length;

	d(printf("OLD STATE:  '%s' :\n", states[s->state]));

	if (s->unstep > 0) {
		/* a pending unstep: report the same state again without scanning */
		s->unstep--;
	} else {
		/* caller doesn't want the data: point both outputs at scratch
		   locals (datalength is ignored whenever databuffer is NULL) */
		if (databuffer == NULL) {
			databuffer = &unused_data;
			datalength = &unused_length;
		}

		folder_scan_step(s, databuffer, datalength);
	}

	d(printf("NEW STATE:  '%s' :\n", states[s->state]));

	return s->state;
}

760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788
/**
 * camel_mime_parser_read:
 * @m: 
 * @databuffer: 
 * @len: 
 * 
 * Read at most @len bytes from the internal mime parser buffer.
 *
 * Returns the address of the internal buffer in @databuffer,
 * and the length of useful data.
 *
 * @len may be specified as INT_MAX, in which case you will
 * get the full remainder of the buffer at each call.
 *
 * Note that no parsing of the data read through this function
 * occurs, so no state changes occur, but the seek position
 * is updated appropriately.
 *
 * Return value: The number of bytes available, or -1 on error.
 **/
int
camel_mime_parser_read(CamelMimeParser *m, const char **databuffer, int len)
{
	struct _header_scan_state *s = _PRIVATE(m);
	int avail;

	if (len == 0)
		return 0;

	d(printf("parser::read() reading %d bytes\n", len));

	avail = MIN(s->inend - s->inptr, len);
	d(printf("parser::read() there = %d bytes\n", avail));

	if (avail <= 0) {
		/* buffer exhausted, pull in more input and look again */
		if (folder_read(s) == -1)
			return -1;

		avail = MIN(s->inend - s->inptr, len);
		d(printf("parser::read() had to re-read, now there = %d bytes\n", avail));
	}

	/* hand out a window onto the internal buffer and consume it */
	*databuffer = s->inptr;
	s->inptr += avail;

	return avail;
}

811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830
/**
 * camel_mime_parser_tell:
 * @m: 
 * 
 * Return the current scanning offset.  The meaning of this
 * value will depend on the current state of the parser.
 *
 * An incomplete listing of the states:
 *
 * HSCAN_INITIAL, The start of the current message.
 * HSCAN_HEADER, HSCAN_MESSAGE, HSCAN_MULTIPART, the character
 * position immediately after the end of the header.
 * HSCAN_BODY, Position within the message of the start
 * of the current data block.
 * HSCAN_*_END, The position of the character starting
 * the next section of the scan (the last position + 1 of
 * the respective current state).
 * 
 * Return value: See above.
 **/
831 832 833 834 835 836 837
off_t camel_mime_parser_tell(CamelMimeParser *m)
{
	/* the position is tracked by the private scanner state */
	return folder_tell(_PRIVATE(m));
}

838 839 840 841 842 843 844 845 846 847 848
/**
 * camel_mime_parser_tell_start_headers:
 * @m: 
 * 
 * Find out the position within the file of where the
 * headers started, this is cached by the parser
 * at the time.
 * 
 * Return value: The header start position, or -1 if
 * no headers were scanned in the current state.
 **/
849 850 851 852 853 854 855
off_t camel_mime_parser_tell_start_headers(CamelMimeParser *m)
{
	struct _header_scan_state *state = _PRIVATE(m);
	off_t where = state->start_of_headers;	/* cached by the scanner */

	return where;
}

856 857 858 859 860 861 862 863 864 865
/**
 * camel_mime_parser_tell_start_from:
 * @m: 
 * 
 * If the parser is scanning From lines, then this returns
 * the position of the start of the From line.
 * 
 * Return value: The start of the from line, or -1 if there
 * was no From line, or From lines are not being scanned.
 **/
866 867 868 869 870 871 872
off_t camel_mime_parser_tell_start_from(CamelMimeParser *m)
{
	struct _header_scan_state *state = _PRIVATE(m);
	off_t where = state->start_of_from;	/* cached by the scanner */

	return where;
}

873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889
/**
 * camel_mime_parser_seek:
 * @m: 
 * @off: Number of bytes to offset the seek by.
 * @whence: SEEK_SET, SEEK_CUR, SEEK_END
 * 
 * Reset the source position to a known value.
 *
 * Note that if the source stream/descriptor was not
 * positioned at 0 to begin with, and an absolute seek
 * is specified (whence != SEEK_CUR), then the seek
 * position may not match the desired seek position.
 * 
 * Return value: The new seek offset, or -1 on
 * an error (for example, trying to seek on a non-seekable
 * stream or file descriptor).
 **/
890 891 892 893 894 895
off_t camel_mime_parser_seek(CamelMimeParser *m, off_t off, int whence)
{
	/* seeking is done on the private scanner state */
	return folder_seek(_PRIVATE(m), off, whence);
}

896 897 898 899 900 901 902 903
/**
 * camel_mime_parser_state:
 * @m: 
 * 
 * Get the current parser state.
 * 
 * Return value: The current parser state.
 **/
904 905 906 907 908 909
enum _header_state camel_mime_parser_state(CamelMimeParser *m)
{
	struct _header_scan_state *scanner = _PRIVATE(m);

	/* plain accessor, no scanning takes place */
	return scanner->state;
}

910 911 912 913 914 915 916 917 918 919 920 921 922
/**
 * camel_mime_parser_stream:
 * @m: 
 * 
 * Get the stream, if any, the parser has been initialised
 * with.  May be used to setup sub-streams, but should not
 * be read from directly (without saving and restoring
 * the seek position in between).
 * 
 * Return value: The stream from _init_with_stream(), or NULL
 * if the parser is reading from a file descriptor or is
 * uninitialised.
 **/
923 924 925 926 927 928
CamelStream *camel_mime_parser_stream(CamelMimeParser *m)
{
	struct _header_scan_state *scanner = _PRIVATE(m);

	/* plain accessor, no reference is added for the caller */
	return scanner->stream;
}

929 930 931 932 933 934 935 936 937 938 939 940 941 942
/**
 * camel_mime_parser_fd:
 * @m: 
 * 
 * Return the file descriptor, if any, the parser has been
 * initialised with.
 *
 * Should not be read from unless the parser is to terminate,
 * or the seek offset can be reset before the next parse
 * step.
 * 
 * Return value: The file descriptor or -1 if the parser
 * is reading from a stream or has not been initialised.
 **/
943 944 945 946 947 948
int camel_mime_parser_fd(CamelMimeParser *m)
{
	struct _header_scan_state *scanner = _PRIVATE(m);

	/* plain accessor for the descriptor the scanner reads from */
	return scanner->fd;
}

949 950 951 952 953 954 955 956 957 958 959 960 961
/* ********************************************************************** */
/*    Implementation							  */
/* ********************************************************************** */

/* read the next bit of data, ensure there is enough room 'atleast' bytes */
static int
folder_read(struct _header_scan_state *s)
{
	int len;
	int inoffset;

	/* more than 'atleast' bytes are still unread, no need to refill */
	if (s->inptr<s->inend-s->atleast)
		return s->inend-s->inptr;
#ifdef PURIFY
	purify_watch_remove(inend_id);
	purify_watch_remove(inbuffer_id);
#endif
	/* move any remaining bytes (at most the atleast limit) to the front of
	   the buffer; source and destination can overlap, so this has to be
	   memmove, not memcpy */
	inoffset = s->inend - s->inptr;
	if (inoffset>0) {
		memmove(s->inbuf, s->inptr, inoffset);
	}
	/* the unread tail now starts at inbuf whether or not the read below
	   succeeds, so rebase the pointers first; seek absorbs the shift so
	   that folder_tell() is unaffected */
	s->seek += s->inptr - s->inbuf;
	s->inptr = s->inbuf;
	s->inend = s->inbuf+inoffset;
	if (s->stream) {
		len = camel_stream_read(s->stream, s->inbuf+inoffset, SCAN_BUF-inoffset);
	} else {
		len = read(s->fd, s->inbuf+inoffset, SCAN_BUF-inoffset);
	}
	r(printf("read %d bytes, offset = %d\n", len, inoffset));
	if (len>0) {
		/* add on the last read block */
		s->inend += len;
		r(printf("content = %d '%.*s'\n",s->inend - s->inptr,  s->inend - s->inptr, s->inptr));
	}

	g_assert(s->inptr<=s->inend);
#ifdef PURIFY
	inend_id = purify_watch(&s->inend);
	inbuffer_id = purify_watch_n(s->inend+1, SCAN_HEAD-1, "rw");
#endif
	r(printf("content = %d '%.*s'\n", s->inend - s->inptr,  s->inend - s->inptr, s->inptr));
	/* set a sentinel, for the inner loops to check against */
	s->inend[0] = '\n';
	/* a failed read is not reported as such: the caller just gets the
	   number of bytes currently available */
	return s->inend-s->inptr;
}

/* return the current absolute position of the data pointer */
static off_t
folder_tell(struct _header_scan_state *s)
{
	/* bytes consumed from the current buffer, plus the buffer's own
	   offset within the input */
	return (s->inptr - s->inbuf) + s->seek;
}

/*
  need some way to prime the parser state, so this actually works for 
  other than top-level messages
*/
static off_t
folder_seek(struct _header_scan_state *s, off_t offset, int whence)
{
	off_t newoffset;
	int len;

	if (s->stream) {
		if (CAMEL_IS_SEEKABLE_STREAM(s->stream)) {
			/* NOTE: assumes whence seekable stream == whence libc, which is probably
			   the case (or bloody well should've been) */
1017
			newoffset = camel_seekable_stream_seek((CamelSeekableStream *)s->stream, offset, whence);
1018 1019 1020 1021 1022 1023 1024
		} else {
			newoffset = -1;
			errno = EINVAL;
		}
	} else {
		newoffset = lseek(s->fd, offset, whence);
	}
1025 1026 1027 1028
#ifdef PURIFY
	purify_watch_remove(inend_id);
	purify_watch_remove(inbuffer_id);
#endif
1029 1030 1031 1032 1033
	if (newoffset != -1) {
		s->seek = newoffset;
		s->inptr = s->inbuf;
		s->inend = s->inbuf;
		if (s->stream)
1034
			len = camel_stream_read(s->stream, s->inbuf, SCAN_BUF);
1035 1036
		else
			len = read(s->fd, s->inbuf, SCAN_BUF);
1037
		if (len>=0) {
1038
			s->inend = s->inbuf+len;
1039 1040
			s->inend[0] = '\n';
		} else
1041 1042
			newoffset = -1;
	}
1043 1044 1045 1046
#ifdef PURIFY
	inend_id = purify_watch(&s->inend);
	inbuffer_id = purify_watch_n(s->inend+1, SCAN_HEAD-1, "rw");
#endif
1047 1048 1049 1050 1051 1052
	return newoffset;
}

static void
folder_push_part(struct _header_scan_state *s, struct _header_scan_stack *h)
{
1053 1054 1055
	if (s->parts && s->parts->atleast > h->boundarylenfinal)
		h->atleast = s->parts->atleast;
	else
1056
		h->atleast = MAX(h->boundarylenfinal, 1);
1057

1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070
	h->parent = s->parts;
	s->parts = h;
}

static void
folder_pull_part(struct _header_scan_state *s)
{
	struct _header_scan_stack *h;

	h = s->parts;
	if (h) {
		s->parts = h->parent;
		g_free(h->boundary);
1071 1072 1073
#ifdef MEMPOOL
		mempool_free(h->pool);
#else
1074
		header_raw_clear(&h->headers);
1075
#endif
1076
		header_content_type_unref(h->content_type);
1077 1078 1079 1080
		if (h->pretext)
			g_byte_array_free(h->pretext, TRUE);
		if (h->posttext)
			g_byte_array_free(h->posttext, TRUE);
1081 1082
		if (h->from_line)
			g_byte_array_free(h->from_line, TRUE);
1083 1084 1085 1086 1087 1088 1089
		g_free(h);
	} else {
		g_warning("Header stack underflow!\n");
	}
}

static int
1090
folder_scan_skip_line(struct _header_scan_state *s, GByteArray *save)
1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106
{
	int atleast = s->atleast;
	register char *inptr, *inend, c;
	int len;

	s->atleast = 1;

	while ( (len = folder_read(s)) > 0 && len > s->atleast) { /* ensure we have at least enough room here */
		inptr = s->inptr;
		inend = s->inend-1;

		c = -1;
		while (inptr<inend
		       && (c = *inptr++)!='\n')
			;

1107 1108 1109
		if (save)
			g_byte_array_append(save, s->inptr, inptr-s->inptr);

1110 1111 1112 1113 1114 1115 1116 1117 1118 1119
		s->inptr = inptr;

		if (c=='\n') {
			s->atleast = atleast;
			return 0;
		}
	}

	s->atleast = atleast;

1120 1121
	return -1;		/* not found */
}
1122

1123
/* TODO: Is there any way to make this run faster?  It gets called a lot ... */
1124 1125 1126 1127
static struct _header_scan_stack *
folder_boundary_check(struct _header_scan_state *s, const char *boundary, int *lastone)
{
	struct _header_scan_stack *part;
1128 1129
	int len = s->atleast;	/* make sure we dont access past the buffer */
	
1130 1131 1132 1133
	h(printf("checking boundary marker upto %d bytes\n", len));
	part = s->parts;
	while (part) {
		h(printf("  boundary: %s\n", part->boundary));
1134
		h(printf("   against: '%.*s'\n", s->atleast, boundary));
1135 1136 1137 1138 1139
		if (part->boundary
		    && part->boundarylen <= len
		    && memcmp(boundary, part->boundary, part->boundarylen)==0) {
			h(printf("matched boundary: %s\n", part->boundary));
			/* again, make sure we're in range */
1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151
			if (part->boundarylenfinal <= len) {
				int extra = part->boundarylenfinal - part->boundarylen;
				
				/* check the extra stuff on an final boundary, normally -- for mime parts */
				if (extra>0) {
					*lastone = memcmp(&boundary[part->boundarylen],
							  &part->boundary[part->boundarylen],
							  extra) == 0;
				} else {
					*lastone = TRUE;
				}
				h(printf("checking lastone = %s\n", *lastone?"TRUE":"FALSE"));
1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163
			} else {
				h(printf("not enough room to check last one?\n"));
				*lastone = FALSE;
			}
			/*printf("ok, we found it! : %s \n", (*lastone)?"Last one":"More to come?");*/
			return part;
		}
		part = part->parent;
	}
	return NULL;
}

1164 1165 1166 1167 1168 1169
#ifdef MEMPOOL
/* Split the raw header text at its first ':' into a name and a value, copy
   both into h's memory pool and link the new node at the end of h->headers.
   The value runs from just after the ':' up to s->outptr.  offset is stored
   in the node unchanged.  Text without a ':' is silently dropped. */
static void
header_append_mempool(struct _header_scan_state *s, struct _header_scan_stack *h, char *header, int offset)
{
	struct _header_raw *node, *tail;
	char *colon, *value;
	int namelen, valuelen;

	colon = strchr(header, ':');
	if (colon == NULL)
		return;

	node = mempool_alloc(h->pool, sizeof(*node));
	node->next = NULL;

	/* everything before the ':' is the name */
	namelen = colon-header;
	node->name = mempool_alloc(h->pool, namelen+1);
	memcpy(node->name, header, namelen);
	node->name[namelen] = 0;

	/* everything after it, up to the end of the header buffer, is the value */
	value = colon+1;
	valuelen = s->outptr - value;
	node->value = mempool_alloc(h->pool, valuelen+1);
	memcpy(node->value, value, valuelen);
	node->value[valuelen] = 0;

	node->offset = offset;

	/* The list head itself is treated as a node so the walk needs no empty-list
	   case; presumably 'next' is the first member of struct _header_raw -
	   confirm against the struct definition. */
	for (tail = (struct _header_raw *)&h->headers; tail->next; tail = tail->next)
		;
	tail->next = node;
}

/* NOTE: the first argument is ignored; 's' and 'h' are picked up from the
   scope of the caller. */
#define header_raw_append_parse(a, b, c) (header_append_mempool(s, h, b, c))

#endif

1204 1205
/* Copy the string start->inptr into the header buffer (s->outbuf),
   growing the buffer if necessary,
   dropping one trailing \r (\n's assumed already removed),
   and recording the absolute start offset of the header in s->header_start
   if none is recorded yet (header_start == -1). */
/* Basically an optimised version of g_byte_array_append().

   The buffer is grown whenever the new text would fill the remaining space
   exactly, so at least one spare byte is left at s->outptr afterwards; the
   callers in this file use it to write a terminating 0.

   The body is wrapped in do { } while (0) so the macro behaves as a single
   statement (safe in an unbraced if/else), and every argument is
   parenthesised.  Arguments are still evaluated more than once, so they
   must be free of side effects.  The offset of outptr is taken before the
   realloc so the old buffer pointer is never used afterwards. */
#define header_append(s, start, inptr)								\
do {												\
	register int headerlen = (inptr)-(start);						\
												\
	if (headerlen > 0) {									\
		if (headerlen >= ((s)->outend - (s)->outptr)) {					\
			register char *outnew;							\
			register int used = (s)->outptr - (s)->outbuf;				\
			register int len = (((s)->outend - (s)->outbuf)+headerlen)*2+1;		\
			outnew = g_realloc((s)->outbuf, len);					\
			(s)->outptr = outnew + used;						\
			(s)->outbuf = outnew;							\
			(s)->outend = outnew + len;						\
		}										\
		if ((start)[headerlen-1] == '\r')						\
			headerlen--;								\
		memcpy((s)->outptr, (start), headerlen);					\
		(s)->outptr += headerlen;							\
	}											\
	if ((s)->header_start == -1)								\
		(s)->header_start = ((start)-(s)->inbuf) + (s)->seek;				\
} while (0)

1231 1232 1233
/* Scan a block of headers from the current input position.

   A new _header_scan_stack entry is allocated (zeroed, with its own memory
   pool when MEMPOOL is defined) and is always the return value; it is NOT
   pushed on the part stack here.

   Input is consumed line by line.  Each line is appended to the header
   buffer (s->outbuf) with header_append(); a line followed by a space or tab
   is a continuation and is joined onto the header being built, otherwise the
   completed header is handed to header_raw_append_parse() and the buffer is
   reset.  Scanning stops when
     - folder_boundary_check() matches at the start of a line,
     - a line leaves the header buffer empty (the blank line ending the
       headers), or
     - the input runs out, in which case whatever is left is added as a
       final, possibly truncated, header.

   *lastone is cleared on entry and is otherwise only written by
   folder_boundary_check().  s->atleast is restored to its entry value and
   s->header_start is reset to -1 on the header_done path. */
static struct _header_scan_stack *
folder_scan_header(struct _header_scan_state *s, int *lastone)
{
1234 1235
	int atleast = s->atleast, newatleast;
	char *start = NULL;
1236 1237
	int len;
	struct _header_scan_stack *h;
NotZed's avatar
NotZed committed
1238 1239
	char *inend;
	register char *inptr;
1240 1241 1242 1243

	h(printf("scanning first bit\n"));

	h = g_malloc0(sizeof(*h));
1244 1245 1246
#ifdef MEMPOOL
	h->pool = mempool_new(8192, 4096);
#endif
1247

1248 1249
	if (s->parts)
		newatleast = s->parts->atleast;
1250
	else
1251
		newatleast = 1;
1252
	*lastone = FALSE;
1253

1254 1255 1256 1257 1258 1259 1260 1261
	/* The first pass uses the part stack's atleast; when the read loop gives
	   up, the pass is repeated once with atleast = 1 to drain what is left. */
	do {
		s->atleast = newatleast;

		h(printf("atleast = %d\n", s->atleast));

		while ((len = folder_read(s))>0 && len >= s->atleast) { /* ensure we have at least enough room here */
			inptr = s->inptr;
			/* line starts are only examined while at least s->atleast bytes
			   remain, which is how much folder_boundary_check() may look at */
			inend = s->inend-s->atleast+1;
1262
			
1263 1264 1265 1266 1267 1268 1269 1270
			while (inptr<inend) {
				if (!s->midline) {
					if (folder_boundary_check(s, inptr, lastone)) {
						if ((s->outptr>s->outbuf))
							goto header_truncated; /* may not actually be truncated */
						
						goto header_done;
					}
1271
				}
NotZed's avatar
NotZed committed
1272
				
1273
				start = inptr;
1274

1275 1276 1277 1278 1279
				/* goto next line/sentinal */
				while ((*inptr++)!='\n')
					;
			
				g_assert(inptr<=s->inend+1);
NotZed's avatar
NotZed committed
1280
				
1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295
				/* check for sentinal or real end of line */
				if (inptr > inend) {
					h(printf("not at end of line yet, going further\n"));
					/* didn't find end of line within our allowed area */
					inptr = inend;
					s->midline = TRUE;
					header_append(s, start, inptr);
				} else {
					h(printf("got line part: '%.*s'\n", inptr-1-start, start));
					/* got a line, strip and add it, process it */
					s->midline = FALSE;
					header_append(s, start, inptr-1);

					/* check for end of headers */
					if (s->outbuf == s->outptr)
1296
						goto header_done;
1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316

					/* check for continuation/compress headers, we have atleast 1 char here to work with */
					if (inptr[0] ==  ' ' || inptr[0] == '\t') {
						h(printf("continuation\n"));
						/* TODO: this wont catch multiple space continuation across a read boundary, but
						   that is assumed rare, and not fatal anyway */
						/* the run of leading whitespace is collapsed to a single
						   space, written into the input buffer in place */
						do
							inptr++;
						while (*inptr == ' ' || *inptr == '\t');
						inptr--;
						*inptr = ' ';
					} else {
						/* otherwise, complete header, add it */
						s->outptr[0] = 0;
				
						h(printf("header '%.20s' at %d\n", s->outbuf, s->header_start));
						
						header_raw_append_parse(&h->headers, s->outbuf, s->header_start);
						s->outptr = s->outbuf;
						s->header_start = -1;
1317
					}
NotZed's avatar
NotZed committed
1318
				}
1319
			}
1320
			s->inptr = inptr;
1321
		}
1322 1323 1324
		h(printf("end of file?  read %d bytes\n", len));
		newatleast = 1;
	} while (s->atleast > 1);
1325 1326 1327 1328

	/* Input exhausted: anything still in the header buffer or the input
	   buffer is treated as one last header line, minus a trailing '\n'. */
	if ((s->outptr > s->outbuf) || s->inend > s->inptr) {
		start = s->inptr;
		inptr = s->inend;
1329 1330 1331 1332
		if (inptr > start) {
			if (inptr[-1] == '\n')
				inptr--;
		}
1333 1334
		goto header_truncated;
	}
1335
	
1336
	s->atleast = atleast;
1337
	
1338
	return h;
1339
	
1340
header_truncated:
1341
	header_append(s, start, inptr);
1342
	
1343
	s->outptr[0] = 0;
1344
	if (s->outbuf == s->outptr)
1345
		goto header_done;
1346
	
1347
	header_raw_append_parse(&h->headers, s->outbuf, s->header_start);
1348
	
1349
	s->outptr = s->outbuf;
1350
header_done:
1351 1352 1353 1354 1355 1356 1357 1358 1359
	s->inptr = inptr;
	s->atleast = atleast;
	s->header_start = -1;
	return h;
}

/* Scan content from the current input position up to the next boundary of
   any part on the stack, or to the end of the buffered data, whichever
   comes first.

   On a non-NULL return *data points into the input buffer at the start of
   the chunk and *length is its size.  When the chunk was ended by a
   boundary and is not empty, *length leaves out its last byte (the newline
   that closes the content before the boundary).

   Return value:
     - the part whose boundary matched, with *length == 0, when the boundary
       was found with at most one byte of content in front of it;
     - s->parts (the current top of the stack) when a chunk of content is
       being handed back;
     - NULL, with *length == 0 and *data untouched, when no input is left.

   *lastone is cleared on entry and is otherwise only written by
   folder_boundary_check().  s->atleast is restored before returning. */
static struct _header_scan_stack *
folder_scan_content(struct _header_scan_state *s, int *lastone, char **data, int *length)
{
1360
	int atleast = s->atleast, newatleast;
NotZed's avatar
NotZed committed
1361 1362
	register char *inptr;
	char *inend;
1363 1364
	char *start;
	int len;
1365
	struct _header_scan_stack *part;
1366
	int onboundary = FALSE;
1367

Not Zed's avatar
Not Zed committed
1368
	c(printf("scanning content\n"));
1369 1370

	part = s->parts;
1371
	if (part)
1372
		newatleast = part->atleast;
1373
	else
1374
		newatleast = 1;
1375
	*lastone = FALSE;
1376

1377 1378
	c(printf("atleast = %d\n", s->atleast));

1379 1380
	/* The first pass uses the part's atleast; when the read cannot satisfy
	   it the pass is repeated once with atleast = 1. */
	do {
		s->atleast = newatleast;
1381

1382 1383 1384 1385
		while ((len = folder_read(s))>0 && len >= s->atleast) { /* ensure we have at least enough room here */
			inptr = s->inptr;
			/* line starts are only examined while at least s->atleast bytes
			   remain, which is how much folder_boundary_check() may look at */
			inend = s->inend-s->atleast+1;
			start = inptr;
1386

1387
			c(printf("inptr = %p, inend = %p\n", inptr, inend));
1388

1389 1390 1391 1392
			while (inptr<inend) {
				if (!s->midline
				    && (part = folder_boundary_check(s, inptr, lastone))) {
					onboundary = TRUE;
1393

1394 1395 1396 1397
					/* since we truncate the boundary data, we need at least 1 char here spare,
					   to remain in the same state */
					if ( (inptr-start) > 1)
						goto content;
NotZed's avatar
NotZed committed
1398

1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413
					/* otherwise, jump to the state of the boundary we actually found */
					goto normal_exit;
				}
				
				/* goto the next line */
				while ((*inptr++)!='\n')
					;

				/* check the sentinal, if we went past the atleast limit, and reset it to there */
				if (inptr > inend) {
					s->midline = TRUE;
					inptr = inend;
				} else {
					s->midline = FALSE;
				}
1414
			}
1415

1416 1417 1418 1419 1420 1421
			c(printf("ran out of input, dumping what i have (%d) bytes midline = %s\n",
				 inptr-start, s->midline?"TRUE":"FALSE"));
			goto content;
		}
		newatleast = 1;
	} while (s->atleast > 1);
1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441

	c(printf("length read = %d\n", len));

	/* nothing more could be read: hand back whatever is still buffered */
	if (s->inend > s->inptr) {
		start = s->inptr;
		inptr = s->inend;
		goto content;
	}

	*length = 0;
	s->atleast = atleast;
	return NULL;

	/* 'content' reports the chunk against the current top of the stack;
	   'normal_exit' keeps the part returned by folder_boundary_check() */
content:
	part = s->parts;
normal_exit:
	s->atleast = atleast;
	s->inptr = inptr;

	*data = start;
1442
	/* if we hit a boundary, we should not include the closing \n */
1443
	if (onboundary && (inptr-start)>0)
1444 1445 1446
		*length = inptr-start-1;
	else
		*length = inptr-start;
1447

1448
	/*printf("got %scontent: '%.*s'\n", s->midline?"partial ":"", inptr-start, start);*/
1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460

	return part;
}


static void
folder_scan_close(struct _header_scan_state *s)
{
	g_free(s->realbuf);
	g_free(s->outbuf);
	while (s->parts)
		folder_pull_part(s);
1461 1462
	if (s->fd != -1)
		close(s->fd);
1463
	if (s->stream) {
Peter Williams's avatar
Peter Williams committed
1464
		camel_object_unref((CamelObject *)s->stream);
1465
	}
1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490
	g_free(s);
}


/* Allocate a scanner with no input source attached (fd == -1, no stream),
   an empty 1024 byte header buffer, an empty input buffer and the state
   machine at HSCAN_INITIAL. */
static struct _header_scan_state *
folder_scan_init(void)
{
	const int outsize = 1024;
	struct _header_scan_state *state;

	state = g_malloc(sizeof(*state));

	/* header (output) buffer */
	state->outbuf = g_malloc(outsize);
	state->outptr = state->outbuf;
	state->outend = state->outbuf+outsize;

	/* input buffer; the usable area starts SCAN_HEAD bytes into the allocation */
	state->realbuf = g_malloc(SCAN_BUF + SCAN_HEAD*2);
	state->inbuf = state->realbuf + SCAN_HEAD;
	state->inptr = state->inbuf;
	state->inend = state->inbuf;
	state->atleast = 0;

	/* input source */
	state->fd = -1;
	state->stream = NULL;

	/* position tracking */
	state->seek = 0;		/* current character position in file of the last read block */
	state->unstep = 0;
	state->header_start = -1;
	state->start_of_from = -1;
	state->start_of_headers = -1;

	/* scanning flags */
	state->midline = FALSE;
	state->scan_from = FALSE;
	state->scan_pre_from = FALSE;

	state->filters = NULL;
	state->filterid = 1;

	state->parts = NULL;

	state->state = HSCAN_INITIAL;
	return state;
}

static int
folder_scan_init_with_fd(struct _header_scan_state *s, int fd)
{
	int len;

	len = read(fd, s->inbuf, SCAN_BUF);
	if (len>=0) {
		s->inend = s->inbuf+len;
1519 1520
		s->inptr = s->inbuf;
		s->inend[0] = '\n';
1521 1522
		if (s->fd != -1)
			close(s->fd);
1523 1524
		s->fd = fd;
		if (s->stream) {
Peter Williams's avatar
Peter Williams committed
1525
			camel_object_unref((CamelObject *)s->stream);
1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538
			s->stream = NULL;
		}
		return 0;
	} else {
		return -1;
	}
}

/* Attach stream as the scanner's input and prime the input buffer with a
   first read from it.

   On success (returns 0) the scanner holds its own reference on stream, any
   previously held stream is unreffed and any previously held file
   descriptor is closed.  If the first read fails, -1 is returned and the
   scanner is left unchanged. */
static int
folder_scan_init_with_stream(struct _header_scan_state *s, CamelStream *stream)
{
	int len;

	len = camel_stream_read(stream, s->inbuf, SCAN_BUF);
	if (len < 0)
		return -1;

	s->inend = s->inbuf+len;
	s->inptr = s->inbuf;
	/* newline sentinel after the buffered data */
	s->inend[0] = '\n';

	/* Take the new reference before dropping the old one: if stream is the
	   one we already hold, unreffing first could destroy it before it is
	   referenced again. */
	camel_object_ref((CamelObject *)stream);
	if (s->stream)
		camel_object_unref((CamelObject *)s->stream);
	s->stream = stream;

	if (s->fd != -1) {
		close(s->fd);
		s->fd = -1;
	}
	return 0;
}

/* When defined, the HSCAN_INITIAL/HSCAN_PRE_FROM cases of folder_scan_step()
   contain the "From " line handling; otherwise they go straight to header
   scanning. */
#define USE_FROM

/* Advance the parser state machine by one step.

   The step taken depends on s->state:
     HSCAN_INITIAL   with s->scan_from set, pushes a pseudo part whose
                     boundary is "From " and continues as HSCAN_PRE_FROM;
                     otherwise goes straight to header scanning.
     HSCAN_PRE_FROM  scans content up to the next "From " line.  Content in
                     front of it is returned to the caller only when
                     s->scan_pre_from is set.  When the line is found it is
                     saved in from_line and the state becomes HSCAN_FROM; if
                     it is not found the state becomes HSCAN_EOF.
     HSCAN_FROM      scans a block of headers, works out the kind of part
                     from its Content-Type, pushes the part and sets the
                     state to HSCAN_HEADER, HSCAN_MULTIPART or HSCAN_MESSAGE.
     HSCAN_HEADER,
     HSCAN_BODY      scans body content; a non-empty chunk is run through the
                     filter chain and returned in *databuffer/*datalength.
                     When the content is finished the filters are completed
                     and the state becomes HSCAN_BODY_END.
     HSCAN_MULTIPART collects text before/after the sub-parts into
                     pretext/posttext; on a non-final boundary it recurses to
                     scan the sub-part's headers.  Ends in
                     HSCAN_MULTIPART_END.
     HSCAN_MESSAGE   recurses to scan the embedded message's headers.
     HSCAN_*_END     pops the part and moves to its saved state; keeps
                     stepping unless that state has the HSCAN_END bit set.
     HSCAN_EOF       does nothing.

   The local variable 'state' is not a parser state: it receives the
   'lastone' flag from folder_scan_header()/folder_scan_content(). */
static void
folder_scan_step(struct _header_scan_state *s, char **databuffer, int *datalength)
{
	struct _header_scan_stack *h, *hb;
	const char *content;
	const char *bound;
	int type;
	int state;
	struct _header_content_type *ct = NULL;
	struct _header_scan_filter *f;
	size_t presize;

/*	printf("\nSCAN PASS: state = %d '%s'\n", s->state, states[s->state]);*/

tail_recurse:
	d({
		printf("\nSCAN STACK:\n");
Not Zed's avatar
Not Zed committed
1577
		printf(" '%s' :\n", states[s->state]);
1578 1579
		hb = s->parts;
		while (hb) {
Not Zed's avatar
Not Zed committed
1580 1581 1582 1583 1584 1585 1586
			printf("  '%s' : %s ", states[hb->savestate], hb->boundary);
			if (hb->content_type) {
				printf("(%s/%s)", hb->content_type->type, hb->content_type->subtype);
			} else {
				printf("(default)");
			}
			printf("\n");
1587 1588 1589 1590 1591 1592 1593 1594
			hb = hb->parent;
		}
		printf("\n");
	});

	switch (s->state) {

#ifdef USE_FROM
1595
	case HSCAN_INITIAL:
1596 1597 1598 1599
		if (s->scan_from) {
			/* "From " is pushed as the boundary of a pseudo part, so the
			   ordinary boundary check finds the start of each message */
			h = g_malloc0(sizeof(*h));
			h->boundary = g_strdup("From ");
			h->boundarylen = strlen(h->boundary);
1600
			h->boundarylenfinal = h->boundarylen;
1601
			h->from_line = g_byte_array_new();
1602
			folder_push_part(s, h);
1603
			s->state = HSCAN_PRE_FROM;
1604 1605
		} else {
			s->start_of_from = -1;
1606
			goto scan_header;
1607 1608
		}
		/* falls through to HSCAN_PRE_FROM */

1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636
	case HSCAN_PRE_FROM:

		h = s->parts;
		do {
			hb = folder_scan_content(s, &state, databuffer, datalength);
			if (s->scan_pre_from && *datalength > 0) {
				d(printf("got pre-from content %d bytes\n", *datalength));
				return;
			}
		} while (hb==h && *datalength>0);

		/* zero length against our own part means the "From " boundary was hit */
		if (*datalength==0 && hb==h) {
			d(printf("found 'From '\n"));
			s->start_of_from = folder_tell(s);
			folder_scan_skip_line(s, h->from_line);
			h->savestate = HSCAN_INITIAL;
			s->state = HSCAN_FROM;
		} else {
			folder_pull_part(s);
			s->state = HSCAN_EOF;
		}
		return;
#else
	case HSCAN_INITIAL:
	case HSCAN_PRE_FROM:
#endif /* !USE_FROM */

	scan_header:
1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651
	case HSCAN_FROM:
		s->start_of_headers = folder_tell(s);
		h = folder_scan_header(s, &state);
#ifdef USE_FROM
		if (s->scan_from)
			h->savestate = HSCAN_FROM_END;
		else
#endif
			h->savestate = HSCAN_EOF;

		/* FIXME: should this check for MIME-Version: 1.0 as well? */

		type = HSCAN_HEADER;
		if ( (content = header_raw_find(&h->headers, "Content-Type", NULL))
		     && (ct = header_content_type_decode(content))) {
1652
			if (!strcasecmp(ct->type, "multipart")) {
1653 1654 1655 1656
				bound = header_content_type_param(ct, "boundary");
				if (bound) {
					d(printf("multipart, boundary = %s\n", bound));
					/* the stored marker is "--bound--": boundarylen covers
					   "--bound", boundarylenfinal also the closing "--" */
					h->boundarylen = strlen(bound)+2;
1657
					h->boundarylenfinal = h->boundarylen+2;
1658 1659 1660 1661
					h->boundary = g_malloc(h->boundarylen+3);
					sprintf(h->boundary, "--%s--", bound);
					type = HSCAN_MULTIPART;
				} else {
1662 1663 1664 1665
					header_content_type_unref(ct);
					ct = header_content_type_decode("text/plain");
/* We can't quite do this, as it will mess up all the offsets ... */
/*					header_raw_replace(&h->headers, "Content-Type", "text/plain", offset);*/
1666 1667
					g_warning("Multipart with no boundary, treating as text/plain");
				}
1668 1669
			} else if (!strcasecmp(ct->type, "message")) {
				if (!strcasecmp(ct->subtype, "rfc822")
1670
				    || !strcasecmp(ct->subtype, "news")
1671
				    /*|| !strcasecmp(ct->subtype, "partial")*/) {
1672 1673 1674
					type = HSCAN_MESSAGE;
				}
			}
Not Zed's avatar
Not Zed committed
1675 1676 1677 1678 1679 1680 1681 1682 1683 1684
		} else {
			/* make the default type for multipart/digest be message/rfc822 */
			if ((s->parts
			     && header_content_type_is(s->parts->content_type, "multipart", "digest"))) {
				ct = header_content_type_decode("message/rfc822");
				type = HSCAN_MESSAGE;
				d(printf("parent was multipart/digest, autoupgrading to message/rfc822?\n"));
				/* maybe we should do this too?
				   header_raw_append_parse(&h->headers, "Content-Type: message/rfc822", -1);*/
			}
1685 1686 1687 1688 1689
		}
		h->content_type = ct;
		folder_push_part(s, h);
		s->state = type;
		return;
1690
		
1691 1692
	case HSCAN_HEADER:
		s->state = HSCAN_BODY;
		/* falls through to HSCAN_BODY */
1693
		
1694 1695 1696 1697 1698
	case HSCAN_BODY:
		h = s->parts;
		*datalength = 0;
		presize = SCAN_HEAD;
		f = s->filters;
1699
		
1700
		do {
1701
			hb = folder_scan_content (s, &state, databuffer, datalength);
1702 1703 1704 1705

			d(printf ("\n\nOriginal content: '"));
			d(fwrite(*databuffer, sizeof(char), *datalength, stdout));
			d(printf("'\n"));
1706

1707
			if (*datalength > 0) {
1708
				/* each filter's output buffer, length and presize feed the next filter */
				while (f) {
1709 1710
					camel_mime_filter_filter(f->filter, *databuffer, *datalength, presize,
								 databuffer, datalength, &presize);
1711 1712 1713 1714
					d(printf ("Filtered content (%s): '",
						  camel_type_to_name(((CamelObject *)f->filter)->s.type)));
					d(fwrite(*databuffer, sizeof(char), *datalength, stdout));
					d(printf("'\n"));
1715 1716 1717 1718
					f = f->next;
				}
				return;
			}
1719 1720
		} while (hb == h && *datalength > 0);
		
1721
		/* check for any filter completion data */
1722 1723 1724 1725
		while (f) {
			camel_mime_filter_complete(f->filter, *databuffer, *datalength, presize,
						   databuffer, datalength, &presize);
			f = f->next;
1726
		}
1727

1728 1729
		if (*datalength > 0)
			return;
1730
		
1731 1732
		s->state = HSCAN_BODY_END;
		break;
1733
		
1734 1735 1736 1737 1738 1739
	case HSCAN_MULTIPART:
		h = s->parts;
		do {
			do {
				hb = folder_scan_content(s, &state, databuffer, datalength);
				if (*datalength>0) {
1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752
					/* instead of a new state, we'll just store it locally and provide
					   an accessor function */
					d(printf("Multipart %s Content %p: '%.*s'\n",
						 h->prestage>0?"post":"pre", h, *datalength, *databuffer));
					if (h->prestage > 0) {
						if (h->posttext == NULL)
							h->posttext = g_byte_array_new();
						g_byte_array_append(h->posttext, *databuffer, *datalength);
					} else {
						if (h->pretext == NULL)
							h->pretext = g_byte_array_new();
						g_byte_array_append(h->pretext, *databuffer, *datalength);
					}
1753 1754
				}
			} while (hb==h && *datalength>0);
1755
			h->prestage++;
1756 1757
			if (*datalength==0 && hb==h) {
				d(printf("got boundary: %s\n", hb->boundary));
1758
				folder_scan_skip_line(s, NULL);
1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799
				/* 'state' is the lastone flag: a non-final boundary starts a sub-part */
				if (!state) {
					s->state = HSCAN_FROM;
					folder_scan_step(s, databuffer, datalength);
					s->parts->savestate = HSCAN_MULTIPART; /* set return state for the new head part */
					return;
				}
			} else {
				break;
			}
		} while (1);

		s->state = HSCAN_MULTIPART_END;
		break;

	case HSCAN_MESSAGE:
		/* scan the embedded message's headers, then arrange for it to come
		   back through HSCAN_MESSAGE_END when it is finished */
		s->state = HSCAN_FROM;
		folder_scan_step(s, databuffer, datalength);
		s->parts->savestate = HSCAN_MESSAGE_END;
		break;

	case HSCAN_FROM_END:
	case HSCAN_BODY_END:
	case HSCAN_MULTIPART_END:
	case HSCAN_MESSAGE_END:
		s->state = s->parts->savestate;
		folder_pull_part(s);
		/* NOTE(review): HSCAN_END looks like a flag bit carried by the *_END
		   states - confirm against the state enum */
		if (s->state & HSCAN_END)
			return;
		goto tail_recurse;

	case HSCAN_EOF:
		return;

	default:
		g_warning("Invalid state in camel-mime-parser: %d", s->state);
		break;
	}

	return;
}

1800 1801 1802 1803 1804 1805 1806 1807 1808 1809
/* drops the current state back one */
static void
folder_scan_drop_step(struct _header_scan_state *s)
{
	switch (s->state) {
	case HSCAN_INITIAL:
	case HSCAN_EOF:
		return;

	case HSCAN_FROM:
1810
	case HSCAN_PRE_FROM:
1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829
		s->state = HSCAN_INITIAL;
		folder_pull_part(s);
		return;

	case HSCAN_MESSAGE:
	case HSCAN_HEADER:
	case HSCAN_MULTIPART:

	case HSCAN_FROM_END:
	case HSCAN_BODY_END:
	case HSCAN_MULTIPART_END:
	case HSCAN_MESSAGE_END:

		s->state = s->parts->savestate;
		folder_pull_part(s);
		if (s->state & HSCAN_END) {
			s->state &= ~HSCAN_END;
		}
		return;
Not Zed's avatar
Not Zed committed
1830 1831
	default:
		/* FIXME: not sure if this is entirely right */
1832 1833 1834
	}
}

1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864
#ifdef STANDALONE
int main(int argc, char **argv)
{
	int fd;
	struct _header_scan_state *s;
	char *data;
	int len;
	int state;
	char *name = "/tmp/evmail/Inbox";
	struct _header_scan_stack *h;
	int i;
	int attach = 0;

	if (argc==2)
		name = argv[1];

	printf("opening: %s", name);

	for (i=1;i<argc;i++) {
		const char *encoding = NULL, *charset = NULL;
		char *attachname;

		name = argv[i];
		printf("opening: %s", name);
		
		fd = open(name, O_RDONLY);
		if (fd==-1) {
			perror("Cannot open mailbox");
			exit(1);
		}
Not Zed's avatar
Not Zed committed
1865 1866
		s = folder_scan_init();
		folder_scan_init_with_fd(s, fd);
1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879
		s->scan_from = FALSE;
#if 0
		h = g_malloc0(sizeof(*h));
		h->savestate = HSCAN_EOF;
		folder_push_part(s, h);
#endif	
		while (s->state != HSCAN_EOF) {
			folder_scan_step(s, &data, &len);
			printf("\n -- PARSER STEP RETURN -- %d '%s'\n\n", s->state, states[s->state]);
			switch (s->state) {
			case HSCAN_HEADER:
				if (s->parts->content_type
				    && (charset = header_content_type_param(s->parts->content_type, "charset"))) {
1880
					if (strcasecmp(charset, "us-ascii")) {
Not Zed's avatar
Not Zed committed
1881
#if 0
1882
						folder_push_filter_charset(s, "UTF-8", charset);
Not Zed's avatar
Not Zed committed
1883
#endif
NotZed's avatar