From: chris mikkelson
Date: Wed, 10 Mar 2010 04:14:04 +0000 (-0600)
Subject: Preliminary implementation of message/rfc822 and multipart part processor.
X-Git-Url: https://git.mikk.net/?a=commitdiff_plain;h=dfd64cfde1bb6bbe0ecbd373501eb85318a3f54c;p=liburl

Preliminary implementation of message/rfc822 and multipart part processor.
---

diff --git a/Makefile b/Makefile
index 5af3b51..fa99451 100644
--- a/Makefile
+++ b/Makefile
@@ -3,8 +3,10 @@ CFLAGS=-g -Wall -Werror -I/usr/local/include
 .c.o:
 	$(CC) $(CFLAGS) -c $>
 
-SRCS=msgproc.c base64.c quoted-printable.c re_stream.c html.c text.c multipart.c
-OBJS=msgproc.o base64.o quoted-printable.o re_stream.o html.o text.o multipart.o
+SRCS=msgproc.c base64.c quoted-printable.c re_stream.c html.c text.c \
+	multipart.c message.c
+OBJS=msgproc.o base64.o quoted-printable.o re_stream.o html.o text.o \
+	multipart.o message.o
 
 default: liburl.a
 
diff --git a/message.c b/message.c
new file mode 100644
index 0000000..be32fec
--- /dev/null
+++ b/message.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2009 Christopher L. Mikkelson
+ * All Rights Reserved, for now.
+ */
+
+#include <pcre.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include "msgproc.h"
+
+/*
+ * Header-field patterns.  Each matches from the field name through any
+ * folded continuation lines, up to and including the first character of
+ * the next header line, or to the end of the saved header text.
+ */
+static const char *ctype_pat = "^Content-Type:.*?(^\\S|\\Z)";
+static pcre *ctype_re;
+static const char *cxfer_pat = "^Content-Transfer-Encoding:.*?(^\\S|\\Z)";
+static pcre *cxfer_re;
+
+/* Media type: group 1 is the text subtype, group 2 is "multipart". */
+static const char *type_pat = "text/(plain|html)|(multipart)/";
+static pcre *type_re;
+
+/* Transfer encoding: group 1 is the encoding token. */
+static const char *enc_pat = "(7bit|8bit|base64|quoted-printable)";
+static pcre *enc_re;
+
+/*
+ * Boundary parameter: group 1 is the boundary value, a run of RFC 2046
+ * boundary characters that may contain spaces but may not end in one.
+ */
+static const char *bound_pat =
+    "boundary\\s*=\\s*\"?([\\da-z'()+_,-./:=? ]*[\\da-z'()+_,-./:=?])\"?";
+static pcre *bound_re;
+
+/*
+ * Compile the module's regular expressions.  Only the first call does
+ * any work.  A pattern that fails to compile leaves its handle NULL.
+ */
+static void
+message_init(void)
+{
+	static int initialized = 0;
+	const char *etxt;
+	int epos;
+	int options = PCRE_MULTILINE | PCRE_DOTALL | PCRE_CASELESS;
+
+	if (initialized) return;
+	initialized = 1;
+
+	ctype_re = pcre_compile(ctype_pat, options, &etxt, &epos, 0);
+	cxfer_re = pcre_compile(cxfer_pat, options, &etxt, &epos, 0);
+	options = PCRE_CASELESS;
+	type_re = pcre_compile(type_pat, options, &etxt, &epos, 0);
+	enc_re = pcre_compile(enc_pat, options, &etxt, &epos, 0);
+	bound_re = pcre_compile(bound_pat, options, &etxt, &epos, 0);
+}
+
+static msgproc_module *text_module = &msgproc_text,
+	*html_module = &msgproc_html,
+	*multipart_module = &msgproc_multipart;
+
+#define HDRALLOC 512
+#define HDRINCR HDRALLOC
+
+/* Per-instance state: the header text saved so far and the scanner state. */
+struct message_state {
+	char *header;
+	int hdrlen, hdralloc;
+	enum {
+		START = 0,
+		CR, NL,
+		CR2,
+		BODY
+	} state;
+};
+
+/*
+ * Allocate the instance state and its initial header buffer.  On
+ * allocation failure the private pointer is set to NULL, which makes
+ * message_process() return immediately.
+ */
+static void
+message_start(msgproc *m)
+{
+	struct message_state *ms = calloc(1, sizeof(*ms));
+
+	if (ms) {
+		ms->header = malloc(HDRALLOC);
+		if (ms->header) {
+			ms->hdralloc = HDRALLOC;
+		} else {
+			free(ms);
+			ms = 0;
+		}
+	}
+	msgproc_setpriv(m, (void*)ms);
+}
+
+/* Release the instance state, then the instance itself. */
+static void
+message_finish(msgproc *m)
+{
+	struct message_state *ms = msgproc_getpriv(m);
+
+	if (ms) {
+		free(ms->header);
+		free(ms);
+	}
+	msgproc_free(m);
+}
+
+/*
+ * Append one byte to the saved header text, growing the buffer as needed.
+ * Returns the buffer, or NULL once it has been lost to a failed
+ * reallocf(), which frees the old buffer on failure.
+ */
+static inline char *
+append_hdr(char c, struct message_state *ms)
+{
+	if (!ms->header) return ms->header;
+	while (ms->header && ms->hdrlen >= ms->hdralloc) {
+		ms->hdralloc += HDRINCR;
+		ms->header = reallocf(ms->header, ms->hdralloc);
+	}
+	if (ms->header) {
+		ms->header[ms->hdrlen++] = c;
+	}
+	return ms->header;
+}
+
+/*
+ * Act on a Content-Transfer-Encoding header (h, hlen bytes): create a
+ * decoder module on m for base64 or quoted-printable.  7bit and 8bit need
+ * no decoder.  Returns 1 on success, 0 if no known encoding is named or
+ * the decoder could not be created.
+ */
+static int
+parse_cxfer(msgproc *m, char *h, size_t hlen)
+{
+	int match, res = 1;
+	int ovec[30];
+	char *enc;
+
+	match = pcre_exec(enc_re, 0, h, (int)hlen, 0, 0, ovec, 30);
+	if (match <= 0) return 0;
+	if (pcre_get_substring(h, ovec, match, 1, (const char **)&enc) < 0)
+		return 0;
+
+	/* NOTE(review): assumes msgproc_create() returns NULL on failure. */
+	if (!strcasecmp(enc, "quoted-printable")) {
+		if (!msgproc_create(m, &msgproc_quoted)) res = 0;
+	} else if (!strcasecmp(enc, "base64")) {
+		if (!msgproc_create(m, &msgproc_base64)) res = 0;
+	}
+	pcre_free_substring(enc);
+	return res;
+}
+
+/*
+ * Act on a Content-Type header (h, hlen bytes).  text/plain and text/html
+ * get the matching module, created on msgproc_next(m) when there is one
+ * and on m otherwise.  multipart gets the multipart module, created on m
+ * and handed the boundary string.
+ *
+ * Returns 1 when a text module was requested, 2 when a multipart module
+ * was created and already started, and 0 when the type is unsupported or
+ * the header is unusable.
+ */
+static int
+parse_ctype(msgproc *m, char *h, size_t hlen)
+{
+	int match, grp, res = 1;
+	int ovec[30];
+	char *type;
+	msgproc *next, *parent;
+
+	next = msgproc_next(m);
+	parent = next ? next : m;
+
+	match = pcre_exec(type_re, 0, h, (int)hlen, 0, 0, ovec, 30);
+	if (match <= 0) return 0;
+
+	/* The alternation puts "multipart" in group 2, not group 1. */
+	grp = (match > 2 && ovec[4] >= 0) ? 2 : 1;
+	if (pcre_get_substring(h, ovec, match, grp, (const char **)&type) < 0)
+		return 0;
+
+	if (!strcasecmp(type, "plain")) {
+		msgproc_create(parent, text_module);
+	} else if (!strcasecmp(type, "html")) {
+		msgproc_create(parent, html_module);
+	} else if (!strcasecmp(type, "multipart")) {
+		char *b;
+
+		/* Without a usable boundary the part cannot be split. */
+		res = 0;
+
+		/* XXX -- unlike text/plain and text/html,
+		   multipart parts cannot be encoded.
+		   Remove any previously-established
+		   encoding */
+		if (next) msgproc_finish(next);
+
+		match = pcre_exec(bound_re, 0, h, (int)hlen, 0, 0, ovec, 30);
+		if (match > 0 && pcre_get_substring(h, ovec, match, 1,
+		    (const char **)&b) >= 0) {
+			next = msgproc_create(m, multipart_module);
+			if (next) {
+				msgproc_start(next);
+				msgproc_set(next, 1, b, strlen(b));
+				res = 2;
+			}
+			/* NOTE(review): assumes msgproc_set() copies the
+			   boundary rather than keeping the pointer. */
+			pcre_free_substring(b);
+		}
+	} else {
+		res = 0;
+	}
+	pcre_free_substring(type);
+	return res;
+}
+
+/*
+ * Feed len bytes at buf through the message processor.  Bytes are saved
+ * as header text until the empty line that ends the headers.  At that
+ * point the Content-Transfer-Encoding and Content-Type headers are used
+ * to set up the downstream modules; every later byte, in this call and
+ * in subsequent ones, is handed to msgproc_next(m).
+ */
+static void
+message_process(msgproc *m, char *buf, size_t len)
+{
+	struct message_state *ms = msgproc_getpriv(m);
+	char *s, *h;
+	int res, match, ovec[30];
+	msgproc *next = 0;
+
+	if (!ms) return;
+	if (!ms->header) return;
+
+	for (s = buf; len > 0; s++, len--) {
+		if (ms->state == BODY) {
+			next = msgproc_next(m);
+			if (next)
+				msgproc_process(next, s, len);
+			return;
+		}
+		/* Header buffer lost to a failed allocation: give up. */
+		if (!append_hdr(*s, ms)) return;
+		switch(ms->state) {
+		case START:
+			if (*s == '\r') ms->state = CR;
+			if (*s == '\n') ms->state = NL;
+			break;
+		case CR:
+			if (*s == '\n') ms->state = NL;
+			else ms->state = START;
+			break;
+		case NL:
+			/* Only an empty line ends the headers; any other
+			   byte begins another header line. */
+			if (*s == '\r') ms->state = CR2;
+			else if (*s == '\n') ms->state = BODY;
+			else ms->state = START;
+			break;
+		case CR2:
+			if (*s == '\n') ms->state = BODY;
+			else ms->state = START;
+			break;
+		case BODY:
+			break;
+		}
+		if (ms->state != BODY) continue;
+
+		/* parse saved headers, look for non-default Content-Type
+		   and Content-Transfer-Encoding values. */
+
+		match = pcre_exec(cxfer_re, 0, ms->header, ms->hdrlen, 0, 0,
+				  ovec, 30);
+		if (match > 0) {
+			if (pcre_get_substring(ms->header, ovec, match, 0,
+			    (const char **)&h) < 0)
+				return;
+			res = parse_cxfer(m, h, strlen(h));
+			pcre_free_substring(h);
+			if (!res) return;
+		}
+
+		match = pcre_exec(ctype_re, 0, ms->header, ms->hdrlen, 0, 0,
+				  ovec, 30);
+		if (match > 0) {
+			if (pcre_get_substring(ms->header, ovec, match, 0,
+			    (const char **)&h) < 0)
+				return;
+			res = parse_ctype(m, h, strlen(h));
+			pcre_free_substring(h);
+			if (!res) return;
+			if (res == 2) continue;	/* XXX -- avoid a double
+						   start of multipart downstream,
+						   which would clobber the
+						   boundary value. Better fix
+						   would be to include a state
+						   in the core msgproc struct
+						   to prevent double starts. */
+		} else {
+			/* No Content-Type header: the RFC 2045 default
+			   is text/plain. */
+			next = msgproc_next(m);
+			msgproc_create(next ? next : m, text_module);
+		}
+
+		next = msgproc_next(m);
+		if (next) msgproc_start(next);
+	}
+}
+
+msgproc_module msgproc_message = {
+	MSGPROC_MESSAGE,	/* type */
+	message_init,		/* module init */
+	NULL,			/* set module parameter */
+	message_start,		/* start module instance */
+	NULL,			/* set module instance parameter */
+	message_process,	/* process data */
+	message_finish,		/* shut down, free module instance */
+	NULL			/* shut down, free module */
+};
+
+msgproc_module msgproc_part = {
+	MSGPROC_PART,		/* type */
+	message_init,		/* module init */
+	NULL,			/* set module parameter */
+	message_start,		/* start module instance */
+	NULL,			/* set module instance parameter */
+	message_process,	/* process data */
+	message_finish,		/* shut down, free module instance */
+	NULL			/* shut down, free module */
+};