From 4c33bd1ffb9fcb024ea70de24e69a111d3e675c2 Mon Sep 17 00:00:00 2001
From: chris mikkelson
Date: Sun, 25 Jan 2009 22:39:11 -0600
Subject: [PATCH 1/1] Beginnings of a URL extraction library for e-mail
 messages.

General plumbing is complete, as are the stream-based regexp matcher,
base64, and quoted-printable decoders. Next steps are basic header
parsing (regexp-based?), MIME boundary extraction and matching.
---
 base64.c           | 103 ++++++++++++++++++++++++++++++++++
 decoders.h         |  27 +++++++++
 msgproc.c          |  74 ++++++++++++++++++++++++
 msgproc.h          |  33 +++++++++++
 parser.c           |  89 +++++++++++++++++++++++++++++
 parser.h           |  40 +++++++++++++
 quoted-printable.c | 113 +++++++++++++++++++++++++++++++++++++
 re_stream.c        | 137 +++++++++++++++++++++++++++++++++++++++++++++
 re_stream.h        |  17 ++++++
 9 files changed, 633 insertions(+)
 create mode 100644 base64.c
 create mode 100644 decoders.h
 create mode 100644 msgproc.c
 create mode 100644 msgproc.h
 create mode 100644 parser.c
 create mode 100644 parser.h
 create mode 100644 quoted-printable.c
 create mode 100644 re_stream.c
 create mode 100644 re_stream.h

diff --git a/base64.c b/base64.c
new file mode 100644
index 0000000..752ae19
--- /dev/null
+++ b/base64.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2009 Christopher L. Mikkelson
+ * All Rights Reserved, for now.
+ */
+
+/* Adapted from phk@freebsd.dk's public domain implementation. 
*/
+
+#include <string.h>
+#include "decoders.h"
+
+static const char *b64c =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+/*
+ * Reverse lookup table: input byte -> 6-bit value, or -1 for a byte that
+ * is not in the base64 alphabet.  Declared signed char so that the -1
+ * marker stays negative on platforms where plain char is unsigned.
+ */
+static signed char i64[256];
+
+/* Fill in the reverse lookup table.  Call once before b64_decode(). */
+void
+base64_init(void)
+{
+	int i;
+	const char *p;
+
+	for (i = 0; i < 256; i++)
+		i64[i] = -1;
+	for (p = b64c, i = 0; *p; p++, i++)
+		i64[(unsigned char)*p] = i;
+}
+
+/* Reset the decoder state for a new stream.  Returns its argument. */
+struct b64_state *
+b64_start(struct b64_state *b64s)
+{
+	memset(b64s, 0, sizeof(*b64s));
+	return b64s;
+}
+
+/* Discard any partially decoded group left in the state. */
+void
+b64_stop(struct b64_state *b64s)
+{
+	memset(b64s, 0, sizeof(*b64s));
+}
+
+/*
+ * Decode up to len bytes of base64 text at s into out, which has room
+ * for size bytes.  The number of bytes written is stored in *outlen.
+ *
+ * Returns the number of input bytes consumed.  This is less than len
+ * when out cannot take another complete group; call again with the
+ * remaining input and more output space.  Returns -1, and resets the
+ * state, on a byte outside the base64 alphabet.
+ *
+ * CR and LF are skipped.  A group of four characters that is split
+ * across calls is carried in *b64s: u holds the bits collected so far,
+ * v holds the number of characters collected (bits 0-1) and the number
+ * of '=' padding characters among them (bits 2-3).
+ */
+int
+b64_decode(struct b64_state *b64s, char *s, int len,
+	char *out, int size, int *outlen)
+{
+	char *p = s;
+	char *end = s + len;
+	unsigned u = b64s->u;
+	int v = b64s->v & 3;
+	int pad = (b64s->v >> 2) & 3;
+	int c, l, n;
+
+	*outlen = 0;
+	while (p < end) {
+		c = (unsigned char)*p;
+		if (c == '\r' || c == '\n') {
+			p++;
+			continue;
+		}
+		/* Completing a group can emit three bytes; stop before
+		   consuming its last character if they might not fit. */
+		if (v == 3 && size < 3)
+			break;
+		if (c == '=') {
+			/* Padding contributes zero bits and no output. */
+			l = 0;
+			if (pad < 3)
+				pad++;
+		} else if ((l = i64[c]) < 0) {
+			b64s->u = 0;
+			b64s->v = 0;
+			return -1;
+		}
+		p++;
+		u = (u << 6) | (unsigned)l;
+		if (++v < 4)
+			continue;
+		/* Each '=' in the group removes one byte from its output. */
+		for (n = 3 - pad; n > 0; n--) {
+			*out++ = (u >> 16) & 0xff;
+			u <<= 8;
+			(*outlen)++;
+			size--;
+		}
+		u = 0;
+		v = 0;
+		pad = 0;
+	}
+	b64s->u = u;
+	b64s->v = v | (pad << 2);
+	return (int)(p - s);
+}
+
+#ifdef _UNIT_TEST
+#include <stdio.h>
+
+int main(void) {
+	char *intext = "aGVsbG8sIHdvcmxkCg==";
+	char *s, out[80];
+	int outlen, i;
+	struct b64_state b64s;
+
+	base64_init();
+
+	b64_start(&b64s);
+	for (s = intext; *s; s += 4) {
+		if (b64_decode(&b64s, s, 4, out, sizeof(out), &outlen) < 0)
+			return 1;
+		for (i = 0; i < outlen; i++) putchar(out[i]);
+	}
+	b64_stop(&b64s);
+	return 0;
+}
+#endif
diff --git a/decoders.h b/decoders.h
new file mode 100644
index 0000000..2835652
--- /dev/null
+++ b/decoders.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2009 Christopher L. Mikkelson
+ * All Rights Reserved, for now. 
+ */
+
+#ifndef DECODERS_H
+#define DECODERS_H
+
+/* Base64 decoder */
+
+/* Builds the decoder's lookup table; call before b64_decode(). */
+void base64_init(void);
+
+/* State carried between b64_decode() calls on one stream. */
+struct b64_state {
+	unsigned u;	/* bits collected for the current 4-character group */
+	int v;		/* progress through the current group */
+};
+
+struct b64_state *b64_start(struct b64_state *b64s);
+void b64_stop(struct b64_state *b64s);
+int b64_decode(struct b64_state *b64s, char *s, int len,
+	char *out, int size, int *outlen);
+
+/* Quoted-printable decoder */
+
+/* State carried between qp_decode() calls on one stream. */
+struct qp_state {
+	unsigned char c;	/* value of the first hex digit of an escape */
+	int state;		/* position in the decoder's state machine */
+};
+
+struct qp_state *qp_start(struct qp_state *qps);
+void qp_stop(struct qp_state *qps);
+int qp_decode(struct qp_state *qps, char *s, int len,
+	char *out, int size, int *olen);
+
+#endif /* DECODERS_H */
diff --git a/msgproc.c b/msgproc.c
new file mode 100644
index 0000000..1c84cce
--- /dev/null
+++ b/msgproc.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2009 Christopher L. Mikkelson
+ * All Rights Reserved, for now.
+ */
+
+#include <stdlib.h>
+
+#include "msgproc.h"
+
+/*
+ * Allocate a zero-filled stage and link it after parent, which may be
+ * NULL.  Returns the new stage, or NULL if the allocation fails; in
+ * that case parent->next is set to NULL.
+ */
+struct msgproc_stage *
+mps_alloc(struct msgproc_stage *parent)
+{
+	struct msgproc_stage *child = calloc(1, sizeof(*child));
+
+	/* mps_free() unlinks through prev, so record the back pointer. */
+	if (child)
+		child->prev = parent;
+	if (parent)
+		parent->next = child;
+	return child;
+}
+
+/*
+  void finish_type(struct msgproc_stage *m)
+  {
+  ...clean up
+  free(m->state);
+  mps_free(m);
+  }
+*/
+
+/*
+ * Finish the stage after m, detach m from its predecessor and free m.
+ * A NULL m is ignored.
+ */
+void
+mps_free(struct msgproc_stage *m)
+{
+	if (!m)
+		return;
+	mps_finish_next(m);
+	if (m->prev)
+		m->prev->next = NULL;
+	free(m);
+}
+
+/*
+  void process_decoder_type(struct msgproc_stage *m, char *buf, int len)
+  {
+  char tmp[BUFSIZ]; int tmplen;
+  ... 
process buf/len into tmp/tmplen
+  mps_process_next(m,tmp,tmplen);
+  }
+
+  void process_multipart(struct msgproc_stage *m, char *buf, int len) {
+    char tmp[BUFSIZ]; int tmplen;
+    (note: this will automatically skip over plain text prior to
+    boundary; no comment delimiters here because C comments do
+    not nest)
+    do {
+      while (!boundary && remaining input) {
+        copy stuff into tmp
+        mps_process_next(m, tmp, tmplen);
+      }
+      if (boundary) {
+        mps_finish_next(m);
+        tmplen = 0;
+        start_multipart_chunk(m);
+      }
+    } while input remains
+  }
+*/
+
+/*
+ * Pass buf/len to the stage that follows m.  Does nothing when there is
+ * no following stage or it has no process callback.  The callback is
+ * given its own stage (m->next), not the caller's.
+ */
+void
+mps_process_next(struct msgproc_stage *m, char *buf, int len)
+{
+	struct msgproc_stage *next = m->next;
+
+	if (next && next->process)
+		next->process(next, buf, len);
+}
+
+/*
+ * Finish the stage that follows m.  Does nothing when there is no
+ * following stage or it has no finish callback.
+ */
+void
+mps_finish_next(struct msgproc_stage *m)
+{
+	struct msgproc_stage *next = m->next;
+
+	if (next && next->finish) {
+		/* finish frees its argument, so drop our link to it first */
+		m->next = NULL;
+		next->finish(next);
+	}
+}
diff --git a/msgproc.h b/msgproc.h
new file mode 100644
index 0000000..8339a4c
--- /dev/null
+++ b/msgproc.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2009 Christopher L. Mikkelson
+ * All Rights Reserved, for now.
+ */
+
+struct msgproc_stage;
+
+/* Top-level handle for one message-processing pipeline. */
+struct msgproc_base {
+	void (*callback)();		/* NOTE(review): arguments not yet specified */
+	void *call_data;		/* presumably passed back to callback; confirm */
+	struct msgproc_stage *msgproc;	/* presumably the first stage; confirm */
+};
+
+/* One stage of the pipeline; stages form a doubly linked list. */
+struct msgproc_stage {
+	int type;
+	void *state;	/* stage-private data */
+	void (*process)(struct msgproc_stage *, char *, int);
+	void (*finish)(struct msgproc_stage *); /* finish frees its argument */
+	struct msgproc_base *base;
+	struct msgproc_stage *prev, *next;
+};
+
+/* main interface to the user. could wrap _start further... 
*/
+struct msgproc_base *msgproc_init(void);
+void msgproc_start(struct msgproc_base *, struct msgproc_stage *);
+void msgproc_process(struct msgproc_base *, char *, int);
+void msgproc_finish(struct msgproc_base *);
+
+/* Helpers for stage implementations (defined in msgproc.c). */
+struct msgproc_stage *mps_alloc(struct msgproc_stage *);
+void mps_free(struct msgproc_stage *);
+void mps_process_next(struct msgproc_stage *, char *, int);
+void mps_finish_next(struct msgproc_stage *);
diff --git a/parser.c b/parser.c
new file mode 100644
index 0000000..69de2a3
--- /dev/null
+++ b/parser.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2009 Christopher L. Mikkelson
+ * All Rights Reserved, for now.
+ */
+
+#include "msgproc.h"
+#include "parser.h"
+
+/*
+ * Allocate a stage after parent and install its callbacks.  The new
+ * stage shares parent's base when a parent is given.  Returns NULL if
+ * the allocation fails.
+ */
+static inline struct msgproc_stage *
+init_mps(struct msgproc_stage *parent,
+	void (*process)(struct msgproc_stage *, char *, int),
+	void (*finish)(struct msgproc_stage *))
+{
+	struct msgproc_stage *child = mps_alloc(parent);
+
+	if (child) {
+		if (parent)
+			child->base = parent->base;
+		child->process = process;
+		child->finish = finish;
+	}
+	return child;
+}
+
+/*
+ * Each start_* function appends one stage of the named kind after
+ * parent and returns it, or NULL if the allocation fails.
+ */
+
+struct msgproc_stage *
+start_text_parser(struct msgproc_stage *parent)
+{
+	return init_mps(parent, process_text, finish_text);
+}
+
+struct msgproc_stage *
+start_html_parser(struct msgproc_stage *parent)
+{
+	return init_mps(parent, process_html, finish_html);
+}
+
+struct msgproc_stage *
+start_base64_decoder(struct msgproc_stage *parent)
+{
+	return init_mps(parent, process_base64, finish_base64);
+}
+
+struct msgproc_stage *
+start_quoted_decoder(struct msgproc_stage *parent)
+{
+	return init_mps(parent, process_quoted, finish_quoted);
+}
+
+struct msgproc_stage *
+start_multipart_parser(struct msgproc_stage *parent)
+{
+	return init_mps(parent, process_multipart, finish_multipart);
+}
+
+/*
+ * A complete RFC 822 message is parsed like a MIME part, but starts out
+ * with content type text/plain and encoding 7bit.
+ */
+struct msgproc_stage *
+start_rfc822_parser(struct msgproc_stage *parent)
+{
+	struct msgproc_stage *m =
+		init_mps(parent, process_message_chunk, finish_chunk);
+
+	/* TBD: use the integer types */
+	if (m) {
+		set_contenttype(m, "text/plain");
+		set_encoding(m, "7bit");
+	}
+	return m;
+}
+
+struct msgproc_stage *
+start_chunk_parser(struct msgproc_stage *parent)
+{
+	return init_mps(parent, process_message_chunk, finish_chunk);
+}
diff --git a/parser.h b/parser.h
new file mode 100644
index 0000000..a6159ac
--- /dev/null
+++ b/parser.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2009 Christopher L. Mikkelson
+ * All Rights Reserved, for now.
+ */
+
+#ifndef PARSER_H
+#define PARSER_H
+
+struct msgproc_stage;
+
+/* Stage kinds.  NOTE(review): presumably stored in msgproc_stage.type. */
+#define PARSE_TEXT 0
+#define PARSE_HTML 1
+#define DECODE_BASE64 2
+#define DECODE_QUOTED 3
+#define PARSE_MULTIPART 4
+#define PARSE_RFC822 5
+
+struct msgproc_stage *start_text_parser(struct msgproc_stage *);
+struct msgproc_stage *start_html_parser(struct msgproc_stage *);
+struct msgproc_stage *start_base64_decoder(struct msgproc_stage *);
+struct msgproc_stage *start_quoted_decoder(struct msgproc_stage *);
+struct msgproc_stage *start_multipart_parser(struct msgproc_stage *);
+struct msgproc_stage *start_rfc822_parser(struct msgproc_stage *);
+struct msgproc_stage *start_chunk_parser(struct msgproc_stage *);
+
+/* these should probably be static inlines in .c file */
+void set_boundary(struct msgproc_stage *);
+void set_contenttype(struct msgproc_stage *, const char *);
+void set_encoding(struct msgproc_stage *, const char *);
+void set_disposition(struct msgproc_stage *);
+
+/* process callbacks installed by the start_* functions */
+void process_text(struct msgproc_stage *, char *, int);
+void process_html(struct msgproc_stage *, char *, int);
+void process_base64(struct msgproc_stage *, char *, int);
+void process_quoted(struct msgproc_stage *, char *, int);
+void process_multipart(struct msgproc_stage *, char *, int);
+/* used for rfc822 complete messages and MIME parts; former
+   has defaults for content-type (text/plain) and encoding (7bit) */
+void process_message_chunk(struct msgproc_stage *, char *, int);
+
+/* finish callbacks installed by the start_* functions; the signature
+   matches msgproc_stage.finish */
+void finish_text(struct msgproc_stage *);
+void finish_html(struct msgproc_stage *);
+void finish_base64(struct msgproc_stage *);
+void finish_quoted(struct msgproc_stage *);
+void finish_multipart(struct msgproc_stage *);
+void finish_chunk(struct msgproc_stage *);
+
+/* NOTE(review): nothing in this patch defines or calls the functions
+   below; kept as declared until their role is settled. */
+struct msgproc_stage *finish_text_parser(struct msgproc_stage *);
+struct msgproc_stage *finish_html_parser(struct msgproc_stage *);
+struct msgproc_stage *finish_base64_decoder(struct msgproc_stage *);
+struct msgproc_stage *finish_quoted_decoder(struct msgproc_stage *);
+struct msgproc_stage *finish_multipart_parser(struct msgproc_stage *);
+struct msgproc_stage *finish_rfc822_parser(struct msgproc_stage *);
+
+#endif /* PARSER_H */
diff --git a/quoted-printable.c b/quoted-printable.c
new file mode 100644
index 0000000..e80cfab
--- /dev/null
+++ b/quoted-printable.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2009 Christopher L. Mikkelson
+ * All Rights Reserved, for now. 
+ */
+
+#include <string.h>
+#include "decoders.h"
+
+/* Reset the decoder state for a new stream.  Returns its argument. */
+struct qp_state *
+qp_start(struct qp_state *qps)
+{
+	memset(qps, 0, sizeof(*qps));
+	return qps;
+}
+
+/* Discard any partially decoded escape left in the state. */
+void
+qp_stop(struct qp_state *qps)
+{
+	qp_start(qps);
+}
+
+/* Value of hex digit c, or -1 if c is not a hex digit.  Returns int so
+   the -1 is negative even where plain char is unsigned. */
+static inline int
+hexval(int c) {
+	if (c >= '0' && c <= '9') return c - '0';
+	if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+	if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+	return -1;
+}
+
+/*
+ * Decode up to len bytes of quoted-printable text at s into out, which
+ * has room for size bytes.  The number of bytes written is stored in
+ * *olen.  Returns the number of input bytes consumed; this is less than
+ * len when out fills up, in which case call again with the rest.
+ *
+ * Decoder states kept in qps->state:
+ *   0  plain text
+ *   1  seen '='
+ *   2  seen "=\r"
+ *   3  seen '=' and one hex digit (its value is in qps->c)
+ *   4  malformed escape
+ */
+int
+qp_decode(struct qp_state *qps, char *s, int len,
+	char *out, int size, int *olen)
+{
+	int consumed = 0;
+	int c, h;
+
+	*olen = 0;
+	for (; len > 0 && size > 0; s++, len--, consumed++) {
+		c = (unsigned char)*s;
+		if (qps->state == 4) {
+			/* error state, find next character which
+			   is not '=' and start processing there */
+			if (c == '=')
+				continue;
+			qps->state = 0;
+		}
+		switch (qps->state) {
+		case 0:
+			if (c == '=') {
+				qps->state = 1;
+			} else {
+				*out++ = (char)c;
+				(*olen)++;
+				size--;
+			}
+			break;
+		case 1:
+			if (c == '\r') {
+				qps->state = 2;
+				break;
+			}
+			if (c == '\n') {
+				/* "=\n": soft line break, emits nothing */
+				qps->state = 0;
+				break;
+			}
+			h = hexval(c);
+			if (h < 0) {
+				qps->state = 4;
+				break;
+			}
+			qps->c = (unsigned char)h;
+			qps->state = 3;
+			break;
+		case 2:
+			/* "=\r\n": soft line break, emits nothing */
+			qps->state = (c == '\n') ? 0 : 4;
+			break;
+		case 3:
+			h = hexval(c);
+			if (h < 0) {
+				qps->state = 4;
+				break;
+			}
+			*out++ = (char)((qps->c << 4) | h);
+			(*olen)++;
+			size--;
+			qps->state = 0;
+			break;
+		default:
+			/* unknown state: restart as plain text */
+			qps->state = 0;
+			break;
+		}
+	}
+	return consumed;
+}
+
+#ifdef _UNIT_TEST
+#include <string.h>
+#include <unistd.h>
+char *chunks[] = {
+	"the quick=20brown=\r",
+	"\nfox=3d jumped over\r\n",
+	0
+};
+int main(void) {
+	struct qp_state qps;
+	char out[80];
+	int off = 0, outinc;
+	char **c;
+
+	qp_start(&qps);
+	for (c=chunks; *c; c++) {
+		qp_decode(&qps, *c, strlen(*c), out + off, sizeof(out) - off, &outinc);
+		off += outinc;
+	}
+
+	if (write(1,out,off) != off)
+		return 1;
+	return 0;
+}
+#endif
diff --git a/re_stream.c b/re_stream.c
new file mode 100644
index 0000000..9a7f4a1
--- /dev/null
+++ b/re_stream.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2009 Christopher L. Mikkelson
+ * All Rights Reserved, for now. 
+ */
+/* Stream-based regular expression matching implementation */
+
+#include <stdlib.h>
+#include <string.h>
+#include <pcre.h>
+#include "re_stream.h"
+
+/* TODO: make these runtime tunables, add upper bound for allocation. */
+#ifndef RALLOC_INIT
+#define RALLOC_INIT 1024
+#endif
+#ifndef RALLOC_INCR
+#define RALLOC_INCR RALLOC_INIT
+#endif
+
+/*
+ * Prepare sre to match re against a new stream.  The options
+ * PCRE_ANCHORED, PCRE_NOTBOL and PCRE_NOTEOL are removed from flags.
+ *
+ * A result buffer already held by sre is reused, so *sre must be
+ * zero-filled before its first use.  Returns sre, or NULL if the
+ * result buffer cannot be allocated.
+ */
+struct stream_re *
+re_stream_start(struct stream_re *sre, pcre *re, int flags)
+{
+	sre->re = re;
+	sre->rlen = 0;
+	sre->flags = flags & ~(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL);
+	sre->state = 0;
+	if (!sre->result) {
+		sre->result = malloc(RALLOC_INIT);
+		if (!sre->result) return NULL;
+		sre->ralloc = RALLOC_INIT;
+	}
+	/* a reused buffer must not show the previous stream's match */
+	sre->result[0] = 0;
+	return sre;
+}
+
+/* Release the result buffer and zero-fill sre. */
+void
+re_stream_stop(struct stream_re *sre)
+{
+	free(sre->result);
+	memset(sre, 0, sizeof(*sre));
+}
+
+/* caller must make copy of result if they wish to use
+   it. Future calls may alter it. */
+char *
+re_stream_getresult(struct stream_re *sre)
+{
+	return sre->result;
+}
+
+/* returns 0 => no match, -1 => partial match, 1 => complete match */
+int
+re_stream_result(struct stream_re *sre)
+{
+	return sre->state;
+}
+
+/* TODO: rewrite below for above interface. Much simpler to use. 
*/
+/*
+ * Run the matcher over the next slen bytes of the stream, at sub.
+ * Matched text is appended to the result buffer, continuing a partial
+ * match left by the previous call.
+ *
+ * Returns the offset in sub at which the match ended, slen when nothing
+ * matched, or -1 if the result buffer could not be grown (the buffer is
+ * then released).  Query the outcome with re_stream_result().
+ */
+int
+re_stream_exec(struct stream_re *sre, char *sub, int slen)
+{
+	/* PCRE takes these sizes as element counts, not byte counts */
+	const int ovecsize = (int)(sizeof(sre->ovec) / sizeof(sre->ovec[0]));
+	const int wscount = (int)(sizeof(sre->wspace) / sizeof(sre->wspace[0]));
+	int match, flags, newlen, end = slen;
+
+	if (sre->state == -1) {
+		flags = PCRE_PARTIAL | PCRE_DFA_RESTART;
+		/* NOTE(review): if the restarted match fails, state is left
+		   at 1 (complete match); confirm that is intended. */
+		sre->state = 1;
+	} else {
+		flags = PCRE_PARTIAL;
+		sre->state = 0;
+	}
+
+	match = pcre_dfa_exec(sre->re, 0, sub, slen, 0, sre->flags | flags,
+		sre->ovec, ovecsize,
+		sre->wspace, wscount);
+
+	if (match > 0 || match == PCRE_ERROR_PARTIAL) {
+		end = sre->ovec[1];
+		sre->state = 1;
+		newlen = sre->rlen + sre->ovec[1] - sre->ovec[0];
+		if (newlen >= sre->ralloc) {
+			int want = sre->ralloc;
+			char *grown;
+
+			/* one increment may not cover a long match */
+			while (newlen >= want)
+				want += RALLOC_INCR;
+			grown = realloc(sre->result, want);
+			if (!grown) {
+				free(sre->result);
+				sre->result = NULL;
+				sre->ralloc = 0;
+				sre->rlen = 0;
+				return -1;
+			}
+			sre->result = grown;
+			sre->ralloc = want;
+		}
+		/* only substring 0 is wanted, so a count of 1 is enough */
+		pcre_copy_substring(sub, sre->ovec, 1,
+			0, sre->result + sre->rlen,
+			sre->ralloc - sre->rlen);
+		sre->rlen = newlen;
+
+		if (sre->ovec[1] == slen && match == PCRE_ERROR_PARTIAL)
+			sre->state = -1;
+	}
+	return end;
+}
+
+#ifdef _UNIT_TEST
+#include <stdio.h>
+#include <string.h>
+
+char *pat = "asdf+g";
+
+char *chunks[] = {"this is a",
+		"sd",
+		"ffffff",
+		"ghij",
+		0};
+
+int main(void) {
+	char **s = chunks;
+	const char *etxt;
+	int epos;
+	pcre *re;
+	/* re_stream_start() reuses a non-NULL result, so start zeroed */
+	struct stream_re sr = {0};
+	int r;
+
+	re = pcre_compile(pat, 0, &etxt, &epos, 0);
+	if (!re) {
+		fprintf(stderr, "pattern error at %d: %s\n", epos, etxt);
+		return 1;
+	}
+	if (!re_stream_start(&sr,re,0))
+		return 1;
+
+	for (s = chunks; *s; s++) {
+		int off = 0;
+		int slen = (int)strlen(*s);
+		do {
+			r = re_stream_exec(&sr, *s + off, slen - off);
+			if (r <= 0)
+				return 1;
+			printf("input = %s, processed %d, m=%d,"
+				" current match %s\n", *s + off, r,
+				re_stream_result(&sr),
+				re_stream_getresult(&sr));
+			off += r;
+		} while (off < slen);
+	}
+	puts(re_stream_getresult(&sr));
+	re_stream_stop(&sr);
+
+	return 0;
+}
+
+#endif
diff --git a/re_stream.h b/re_stream.h
new file mode 100644
index 0000000..797c571
--- /dev/null
+++ b/re_stream.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2009 Christopher L. Mikkelson
+ * All Rights Reserved, for now. 
+ */
+
+#ifndef RE_STREAM_H
+#define RE_STREAM_H
+
+#include <pcre.h>
+
+/* Stream-based regexp matching layer */
+struct stream_re {
+	pcre *re;			/* compiled pattern */
+	int rlen, ralloc, flags, state;	/* result length and capacity, PCRE
+					   options, last match outcome */
+	char *result;			/* text matched so far */
+	int ovec[30], wspace[50];	/* PCRE output vector and workspace */
+};
+
+/* The return types below match the definitions in re_stream.c. */
+struct stream_re *re_stream_start(struct stream_re *, pcre *, int);
+void re_stream_stop(struct stream_re *);
+/* returns 0 => no match, -1 => partial match, 1 => complete match */
+int re_stream_result(struct stream_re *);
+char *re_stream_getresult(struct stream_re *);
+int re_stream_exec(struct stream_re *, char *, int);
+
+#endif /* RE_STREAM_H */
-- 
2.50.1