From a00c29e62581f17ed78f5e4ded0a7de5a593fad5 Mon Sep 17 00:00:00 2001 From: chris mikkelson Date: Sun, 25 Jan 2009 23:44:14 -0600 Subject: [PATCH] Moved html and text parsers into separate files, and fleshed out their implementation in the process. Other parsers will also be in their own files, eventually. --- html.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ parser.c | 59 ++++++++++++++++---------------------------------- parser.h | 37 +++++++++++++++++++++---------- text.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 174 insertions(+), 52 deletions(-) create mode 100644 html.c create mode 100644 text.c diff --git a/html.c b/html.c new file mode 100644 index 0000000..58e2512 --- /dev/null +++ b/html.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2009 Christopher L. Mikkelson + * All Rights Reserved, for now. + */ + +#include <pcre.h> +#include "re_stream.h" + +#include "msgproc.h" +#include "parser.h" + +static const char *html_url_pattern = "<(a href|img src)=\"https?://[^\"]+"; +static pcre *html_url_re; + +void +init_html_parser(void) { + const char *etxt; int epos; + html_url_re = pcre_compile(html_url_pattern, 0, &etxt, &epos, 0); + if (!html_url_re) { + /* die -- TODO: report etxt/epos; html_url_re stays NULL */ + } +} + +struct msgproc_stage * +start_html_parser(struct msgproc_stage *parent) +{ + struct msgproc_stage *m; + m = init_mps(parent, process_html, finish_html_parser); + if (m) { + m->type = PARSE_HTML; + m->state = malloc(sizeof(struct stream_re)); + if (!m->state) { + msgproc_free(m); + return NULL; + } + re_stream_start((struct stream_re *)m->state, html_url_re, 0); + } + return m; +} + +void +process_html(struct msgproc_stage *m, char *buf, int size) +{ + struct stream_re *sr = (struct stream_re *)m->state; + int n; + + while (size > 0) { + n = re_stream_exec(sr, buf, size); + if (re_stream_result(sr) == 1 && m->base->callback) { + /* TODO: send more information back to callback... 
*/ + m->base->callback("html", re_stream_getresult(sr)); + } + size -= n; + buf += n; + } +} + +void +finish_html_parser(struct msgproc_stage *m) +{ + re_stream_stop((struct stream_re *)m->state); + free(m->state); + msgproc_free(m); +} diff --git a/parser.c b/parser.c index 69de2a3..df6f451 100644 --- a/parser.c +++ b/parser.c @@ -6,54 +6,31 @@ #include "msgproc.h" #include "parser.h" -static inline struct msgproc_stage * -init_mps(struct msgproc_stage *parent, - void (*process)(struct msgproc_stage *, char *, int), - void (*finish)(struct msgproc_stage *)) -{ - struct msgproc_stage *child = mps_alloc(parent); - if (child) { - if (parent) child->base = parent->base; - child->process = process; - child->finish = finish; - } - return child; -} -struct msgproc_stage * -start_text_parser(struct msgproc_stage *parent) -{ - return init_mps(parent, process_text, finish_text); -} - -struct msgproc_stage * -start_html_parser(struct msgproc_stage *parent) -{ - return init_mps(parent, process_html, finish_html); -} struct msgproc_stage * start_base64_decoder(struct msgproc_stage *parent) { - return init_mps(parent, process_base64, finish_base64); + return init_mps(parent, process_base64, finish_base64_decoder); } struct msgproc_stage * start_quoted_decoder(struct msgproc_stage *parent) { - return init_mps(parent, process_quoted, finish_quoted); + return init_mps(parent, process_quoted, finish_quoted_decoder); } struct msgproc_stage * start_multipart_parser(struct msgproc_stage *parent) { - return init_mps(parent, process_multipart, finish_multipart); + return init_mps(parent, process_multipart, finish_multipart_parser); } struct msgproc_stage * start_rfc822_parser(struct msgproc_stage *parent) { - struct msgproc_stage *m = init_mps(parent, process_chunk, finish_chunk); + struct msgproc_stage *m; + m = init_mps(parent, process_chunk, finish_chunk_parser); /* TBD: use the integer types */ set_contenttype(m, "text/plain"); set_encoding(m, "7bit"); @@ -63,17 +40,15 @@ 
start_rfc822_parser(struct msgproc_stage *parent) struct msgproc_stage * start_chunk_parser(struct msgproc_stage *parent) { - return init_mps(parent, process_chunk, finish_chunk); + return init_mps(parent, process_chunk, finish_chunk_parser); } /* these should probably be static inlines in .c file */ -void set_boundary(struct msgproc_stage *); -void set_contenttype(struct msgproc_stage *); -void set_encoding(struct msgproc_stage *); -void set_disposition(struct msgproc_stage *); +void set_boundary(struct msgproc_stage *, char *); +void set_contenttype(struct msgproc_stage *, char *); +void set_encoding(struct msgproc_stage *, char *); +void set_disposition(struct msgproc_stage *, char *); -void process_text(struct msgproc_stage *, char *, int); -void process_html(struct msgproc_stage *, char *, int); void process_base64(struct msgproc_stage *, char *, int); void process_quoted(struct msgproc_stage *, char *, int); void process_multipart(struct msgproc_stage *, char *, int); @@ -81,9 +56,11 @@ void process_multipart(struct msgproc_stage *, char *, int); has defaults for content-type (text/plain) and encoding (7bit) */ void process_message_chunk(struct msgproc_stage *, char *, int); -struct msgproc_stage *finish_text_parser(struct msgproc_stage*); -struct msgproc_stage *finish_html_parser(struct msgproc_stage*); -struct msgproc_stage *finish_base64_decoder(struct msgproc_stage*); -struct msgproc_stage *finish_quoted_decoder(struct msgproc_stage*); -struct msgproc_stage *finish_multipart_parser(struct msgproc_stage*); -struct msgproc_stage *finish_rfc822_parser(struct msgproc_stage*); +void +finish_base64_decoder(struct msgproc_stage*); +void +finish_quoted_decoder(struct msgproc_stage*); +void +finish_multipart_parser(struct msgproc_stage*); +void +finish_rfc822_parser(struct msgproc_stage*); diff --git a/parser.h b/parser.h index a6159ac..c1dca60 100644 --- a/parser.h +++ b/parser.h @@ -18,10 +18,10 @@ struct msgproc_stage *start_multipart_parser(struct 
msgproc_stage*); struct msgproc_stage *start_rfc822_parser(struct msgproc_stage*); /* these should probably be static inlines in .c file */ -void set_boundary(struct msgproc_stage *); -void set_contenttype(struct msgproc_stage *); -void set_encoding(struct msgproc_stage *); -void set_disposition(struct msgproc_stage *); +void set_boundary(struct msgproc_stage *, char *); +void set_contenttype(struct msgproc_stage *, char *); +void set_encoding(struct msgproc_stage *, char *); +void set_disposition(struct msgproc_stage *, char *); void process_text(struct msgproc_stage *, char *, int); void process_html(struct msgproc_stage *, char *, int); @@ -30,11 +30,26 @@ void process_quoted(struct msgproc_stage *, char *, int); void process_multipart(struct msgproc_stage *, char *, int); /* used for rfc822 complete messages and MIME parts; former has defaults for content-type (text/plain) and encoding (7bit) */ -void process_message_chunk(struct msgproc_stage *, char *, int); +void process_chunk(struct msgproc_stage *, char *, int); -struct msgproc_stage *finish_text_parser(struct msgproc_stage*); -struct msgproc_stage *finish_html_parser(struct msgproc_stage*); -struct msgproc_stage *finish_base64_decoder(struct msgproc_stage*); -struct msgproc_stage *finish_quoted_decoder(struct msgproc_stage*); -struct msgproc_stage *finish_multipart_parser(struct msgproc_stage*); -struct msgproc_stage *finish_rfc822_parser(struct msgproc_stage*); +void finish_text_parser(struct msgproc_stage *); +void finish_html_parser(struct msgproc_stage *); +void finish_base64_decoder(struct msgproc_stage *); +void finish_quoted_decoder(struct msgproc_stage *); +void finish_multipart_parser(struct msgproc_stage *); +void finish_chunk_parser(struct msgproc_stage *); + + +static inline struct msgproc_stage * +init_mps(struct msgproc_stage *parent, + void (*process)(struct msgproc_stage *, char *, int), + void (*finish)(struct msgproc_stage *)) +{ + struct msgproc_stage *child = mps_alloc(parent); + if 
(child) { + if (parent) child->base = parent->base; + child->process = process; + child->finish = finish; + } + return child; +} diff --git a/text.c b/text.c new file mode 100644 index 0000000..22eb12d --- /dev/null +++ b/text.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2009 Christopher L. Mikkelson + * All Rights Reserved, for now. + */ + +#include <pcre.h> +#include "re_stream.h" + +#include "msgproc.h" +#include "parser.h" + +static const char *text_url_pattern = "https?://\\S+"; +static pcre *text_url_re; + +void +init_text_parser(void) { + const char *etxt; int epos; + text_url_re = pcre_compile(text_url_pattern, 0, &etxt, &epos, 0); + if (!text_url_re) { + /* die -- TODO: report etxt/epos; text_url_re stays NULL */ + } +} + +struct msgproc_stage * +start_text_parser(struct msgproc_stage *parent) +{ + struct msgproc_stage *m; + m = init_mps(parent, process_text, finish_text_parser); + if (m) { + m->type = PARSE_TEXT; + m->state = malloc(sizeof(struct stream_re)); + if (!m->state) { + msgproc_free(m); + return NULL; + } + re_stream_start((struct stream_re *)m->state, text_url_re, 0); + } + return m; +} + +void +process_text(struct msgproc_stage *m, char *buf, int size) +{ + struct stream_re *sr = (struct stream_re *)m->state; + int n; + + while (size > 0) { + n = re_stream_exec(sr, buf, size); + if (re_stream_result(sr) == 1) { + /* TODO: send more information back to callback... */ + if (m->base->callback) + m->base->callback("text", + re_stream_getresult(sr)); + } + size -= n; + buf += n; + } +} + +void +finish_text_parser(struct msgproc_stage *m) +{ + re_stream_stop((struct stream_re *)m->state); + free(m->state); + msgproc_free(m); +} -- 2.50.1