From: chris mikkelson Date: Tue, 9 Mar 2010 18:25:21 +0000 (-0600) Subject: Convert multipart splitting to use stream regexps. Fix some X-Git-Url: https://git.mikk.net/?a=commitdiff_plain;h=7a3235dca83912c5b10606b70e80f110370e5a4a;p=liburl Convert multipart splitting to use stream regexps. Fix some stream regexp logic bugs found in the process. --- diff --git a/multipart.c b/multipart.c index 83e8f26..4707c77 100644 --- a/multipart.c +++ b/multipart.c @@ -3,8 +3,11 @@ * All Rights Reserved, for now. */ +#include /* XXX -- debugging */ #include #include +#include +#include "re_stream.h" #include "msgproc.h" /* needs to be published for other callers */ @@ -14,10 +17,8 @@ static msgproc_module *nextmod; struct multipart_state { - char *boundary; - int blen; - char *line; - int l; + struct stream_re *boundre; + pcre *bre; int state; }; @@ -41,117 +42,154 @@ setnextmod(int type, void *data, size_t size) /* setboundary: handles a MULTIPART_BOUNDARY event. Builds the pattern ^--<escaped boundary>(--)?$ from data, compiles it with PCRE_MULTILINE into mps->bre, and starts a stream_re on it that is stored in mps->boundre. Other event types, or a missing private state, are ignored. */ static void setboundary(msgproc *m, int type, void *data, size_t size) { + char *boundpat = 0; + char *b, *s; + const char *etxt; + int epos; + struct stream_re *sre = 0; struct multipart_state *mps = msgproc_getpriv(m); + if (type != MULTIPART_BOUNDARY) return; - if (mps) { - mps->boundary = malloc(size + 2); - if (!mps->boundary) return; - mps->boundary[0] = mps->boundary[1] = '-'; - memcpy(mps->boundary + 2, data, size); - mps->blen = size + 2; + if (!mps) return; + + boundpat = malloc(2*size + strlen("^--(--)?$") + 1); /* sizing: worst case is one backslash per boundary byte (2*size), plus the fixed prefix and suffix of the pattern, plus the terminating NUL */ + if (!boundpat) goto fail; + + strcpy(boundpat, "^--"); + s = boundpat + strlen("^--"); + /* escape PCRE pattern metacharacters in boundary */ + for (b = (char *)data; b < (char *)data + size; b++) { + switch(*b) { + case '(': + case ')': + case '?': + case '.': + case '+': /* pattern metacharacters allowed by RFC 1341 */ + case '\\': + *s++ = '\\'; + /* FALLTHROUGH */ + default: + *s++ = *b; + } } /* NOTE(review): only ( ) ? . + and backslash are escaped; any other PCRE metacharacter (such as * [ ] ^ $ | { }) present in data is copied into the pattern unescaped -- confirm that callers restrict the boundary to the RFC 1341 character set */ + strcpy(s, "(--)?$"); + + mps->bre = pcre_compile(boundpat, PCRE_MULTILINE, &etxt, &epos, 0); + if (!mps->bre) goto fail; + + /* fprintf(stderr, 
"multipart_init: boundpat = %s\n", boundpat); */ + + free(boundpat); /* no longer needed */ + boundpat = 0; + + sre = malloc(sizeof(struct stream_re)); + if (!sre) goto fail; + + re_stream_start(sre, mps->bre, 0); + + mps->boundre = sre; + return; + +fail: if (boundpat) free(boundpat); + if (mps->bre) pcre_free(mps->bre); /* NOTE(review): mps->bre is not reset to 0 after pcre_free here, while multipart_finish calls pcre_free(mps->bre) again whenever it is non-null -- looks like a double free when the stream_re malloc above fails; confirm */ } -#define STATE_INMATCH 0 -#define STATE_MATCH 1 -#define STATE_MATCHEOL 2 -#define STATE_NOMATCH 3 -#define STATE_NOMATCHEOL 4 -#define STATE_MATCHEND 5 -#define STATE_DONE 6 +#define STATE_PREAMBLE 0 +#define STATE_PARTS 1 +#define STATE_END 2 /* multipart_process: scans buf for boundary matches with re_stream_exec and forwards the bytes between boundaries to a downstream msgproc. A new downstream msgproc is created and started after each non-terminal boundary; a boundary whose match ends in -- moves the state to STATE_END, after which the loop stops consuming input. */ static void multipart_process(msgproc *m, char *buf, size_t len) { struct multipart_state *mps = msgproc_getpriv(m); + char *s = buf; + size_t p, l = len; + int ml; msgproc *next; - char *s, *t; - int r, n; - - /* r = number of characters in current line which match - boundary and resided in previous buffer. - - n = number of characters in current line which match - boundary and reside in current buffer. - - mps->l = number of characters in boundary matched by - current line (regardless of which buffer(s)). - - s = current character in buffer. - - t = beginning of current part in current buffer. - */ - - if (!mps) return; - if (mps->state == STATE_DONE) return; - if (!mps->boundary) return; - r = mps->l; - next = msgproc_next(m); - - for (s = t = buf; s < buf + len; s++) { - - switch (mps->state) { - case STATE_INMATCH: - if (mps->boundary[mps->l++] != *s) { - mps->state = STATE_NOMATCH; - n = mps->l = 0; - continue; + char *pmatch = 0; /* NOTE(review): the removed code returned early when mps was null; below, mps is dereferenced and mps->boundre is handed to the re_stream_* calls without any null check -- confirm that setboundary always succeeds before this runs */ + + /* fprintf(stderr, "multipart_process: %d chars\n", len); */ + + while (l > 0 && mps->state != STATE_END) { + /* re_stream_exec needs to be tweaked: + - set state to nomatch and process to + end of match if partial + match is not at end of input. 
+ - set state to nomatch and process to + beginning of match if previous state + was partial match and new match begins + after beginning of input + */ + if (re_stream_result(mps->boundre) == -1) + pmatch = strdup(re_stream_getresult(mps->boundre)); /* copy of the partial match left pending by the previous re_stream_exec call; released at the bottom of each loop iteration */ + + /* fprintf(stderr, "multipart_process: s=%p, l=%d, state = %d\n", + s, l, mps->state); + fwrite(s, p, 1, stderr); + fprintf(stderr,"\n"); + if (pmatch) + fprintf(stderr, "multipart_process pmatch = '%s'\n", + pmatch); */ + p = re_stream_exec(mps->boundre, s, l); + if (p < 0) break; /* NOTE(review): p is declared size_t, so this test can never be true; re_stream_exec returns int and returns -1 when reallocf fails, which would arrive here as a huge unsigned offset */ + /* fprintf(stderr, "multipart_process: p = %d, result = %d\n", + p, re_stream_result(mps->boundre)); */ + + switch(mps->state) { + case STATE_PREAMBLE: /* input ahead of the first boundary is consumed without being forwarded */ + if (re_stream_result(mps->boundre) == 1) { + s += p; + l -= p; + mps->state = STATE_PARTS; + next = msgproc_create(m, nextmod); + msgproc_start(next); + } else { + s += l; + l = 0; } - n++; - if (mps->l < mps->blen) continue; - mps->state = STATE_MATCH; - if (*s == '\r') mps->state = STATE_MATCHEOL; - if (*s == '-') mps->state = STATE_MATCHEND; - continue; - case STATE_MATCH: - if (*s == '\r') mps->state = STATE_MATCHEOL; - continue; - case STATE_NOMATCH: - if (*s == '\r') mps->state = STATE_NOMATCHEOL; - continue; - case STATE_MATCHEOL: - if (*s == '\n') { - mps->state = STATE_INMATCH; - n = mps->l; - mps->l = 0; + break; + case STATE_PARTS: + next = msgproc_next(m); + if (re_stream_result(mps->boundre) == 0) { + msgproc_process(next, s, l); + s += l; + l = 0; break; + } else if (re_stream_result(mps->boundre) == -1) { /* partial match: forward only the bytes ahead of the accumulated match text, then wait for more input */ + ml = strlen(re_stream_getresult(mps->boundre)); + if (p > ml) + msgproc_process(next, s, p - ml); + s += l; + l = 0; + break; + } else { /* full match */ + char *bm = re_stream_getresult(mps->boundre); + ml = strlen(bm); + if (p > ml) { + if (pmatch) + msgproc_process(next, pmatch, + strlen(pmatch)); + msgproc_process(next, s, p - ml); + } + msgproc_finish(next); + /* check for -- at end of match */ + /* fprintf(stderr, "endofmatch = %s\n", + bm + ml - 2); */ + if 
(!strcmp("--", bm + ml - 2)) { + mps->state = STATE_END; + s += l; + l = 0; + break; + } + next = msgproc_create(m, nextmod); + msgproc_start(next); + s += p; + l -= p; } - mps->state = STATE_MATCH; - continue; - case STATE_NOMATCHEOL: - if (*s == '\n') mps->state = STATE_INMATCH; - mps->state = STATE_NOMATCH; - continue;; - case STATE_MATCHEND: - if (*s != '-') { - mps->state = STATE_MATCH; - continue; - } - mps->state = STATE_DONE; - n = mps->l; - mps->l = 0; - break; - case STATE_DONE: - t=s; - continue; } - - /* "break" above lands down here, continue skips to next char */ - /* send stuff downstream. (including a chunk of - boundary if partial match happened at end - of previous segment) */ - if (r) { - msgproc_process(next, mps->boundary, r); - r = 0; - } - msgproc_process(next, t, s - t - n); - n = 0; - - if (mps->state == STATE_DONE) return; + if (pmatch) free(pmatch); + pmatch = 0; } - - /* reached end of buffer. Process what we have. */ - msgproc_process(next, t, s - t - n); } void @@ -159,7 +197,11 @@ multipart_finish(msgproc *m) { struct multipart_state *mps = msgproc_getpriv(m); /* teardown order: stop and free the stream matcher, release the compiled pattern, then free the private state and the msgproc itself */ if (mps) { - free(mps->boundary); + if (mps->boundre) { + re_stream_stop(mps->boundre); + free(mps->boundre); + } + if (mps->bre) pcre_free(mps->bre); } free(mps); msgproc_free(m); diff --git a/re_stream.c b/re_stream.c index dd99d35..5230f8d 100644 --- a/re_stream.c +++ b/re_stream.c @@ -4,6 +4,7 @@ */ /* Stream-based regular expression matching implementation */ +#include /* XXX -- debugging */ #include #include #include "re_stream.h" @@ -61,38 +62,65 @@ re_stream_result(struct stream_re *sre) /* re_stream_exec: runs pcre_dfa_exec over sub[0..slen), restarting a pending partial match when sre->state is -1, appends the matched text to sre->result, and leaves sre->state at 1 (full match), -1 (partial match reaching the end of the input) or 0 (no match). Returns the offset the caller should advance to (slen, the match start, or the match end depending on the case), or -1 if growing sre->result fails. */ int re_stream_exec(struct stream_re *sre, char *sub, int slen) { - int match, flags, newlen, end = slen; + int match, flags, newlen, start, end; if (sre->state == -1) { flags = PCRE_PARTIAL | PCRE_DFA_RESTART; - sre->state = 1; } else { flags = PCRE_PARTIAL; sre->state = 0; + sre->rlen = 0; + sre->result[0] = 0; } match = pcre_dfa_exec(sre->re, 0, sub, slen, 0, sre->flags | flags, 
sre->ovec, sizeof(sre->ovec), sre->wspace, sizeof(sre->wspace)); /* NOTE(review): pcre_dfa_exec takes ovecsize and wscount as numbers of elements, whereas sizeof yields bytes; if ovec and wspace are int arrays this overstates their capacity -- confirm against the struct stream_re declaration in re_stream.h */ + /* fprintf(stderr, "in (%d) = %s\n", slen, sub); + fprintf(stderr, "pcre_dfa_exec: match = %d\n", match); */ - if (match > 0 || match == PCRE_ERROR_PARTIAL) { - end = sre->ovec[1]; - sre->state = 1; - newlen = sre->rlen + sre->ovec[1] - sre->ovec[0]; - if (newlen >= sre->ralloc) { - sre->ralloc += RALLOC_INIT; - sre->result = reallocf(sre->result, sre->ralloc); - if (!sre->result) return -1; - } - pcre_copy_substring(sub, sre->ovec, sizeof(sre->ovec), - 0, sre->result + sre->rlen, - sre->ralloc - sre->rlen); - sre->rlen = newlen; - - if (sre->ovec[1] == slen && match == PCRE_ERROR_PARTIAL) - sre->state = -1; + if (match < 0 && match != PCRE_ERROR_PARTIAL) { + sre->result[0] = sre->rlen = 0; + sre->state = 0; + return slen; + } + + start = sre->ovec[0]; + end = sre->ovec[1]; + + if (match == PCRE_ERROR_PARTIAL && end < slen) { + /* partial match ending before end of input => no match */ + sre->result[0] = sre->rlen = 0; + sre->state = 0; + return end; } + + if (sre->state == -1 && start > 0) { + /* new match begins after buffer start, previous + buffer had partial match. set no match and + return start of next match to inform caller */ + sre->result[0] = sre->rlen = 0; + sre->state = 0; + return start; + } + + newlen = sre->rlen + end - start; + if (newlen >= sre->ralloc) { + sre->ralloc += RALLOC_INIT; /* NOTE(review): grows by a single RALLOC_INIT step; if newlen is still >= ralloc afterwards the copy below does not fit, and the return value of pcre_copy_substring is not checked -- confirm RALLOC_INIT always exceeds the longest possible match */ + sre->result = reallocf(sre->result, sre->ralloc); + if (!sre->result) return -1; + } + pcre_copy_substring(sub, sre->ovec, sizeof(sre->ovec), + 0, sre->result + sre->rlen, + sre->ralloc - sre->rlen); + sre->rlen = newlen; + + if (match == PCRE_ERROR_PARTIAL) + sre->state = -1; + else + sre->state = 1; + return end; }