git.mikk.net Git - liburl/commitdiff
Convert multipart splitting to use stream regexps. Fix some
author: chris mikkelson <chris@mikk.net>
Tue, 9 Mar 2010 18:25:21 +0000 (12:25 -0600)
committer: chris mikkelson <chris@mikk.net>
Tue, 9 Mar 2010 22:03:02 +0000 (16:03 -0600)
stream regexp logic bugs found in the process.

multipart.c
re_stream.c

index 83e8f2669d6c76209a1f232ee06324762308da5d..4707c776bd5cc71be1b3e779065f99f3386e293e 100644 (file)
@@ -3,8 +3,11 @@
  * All Rights Reserved, for now.
  */
 
+#include <stdio.h>   /* XXX -- debugging */
 #include <stdlib.h>
 #include <string.h>
+#include <pcre.h>
+#include "re_stream.h"
 #include "msgproc.h"
 
 /* needs to be published for other callers */
 static msgproc_module *nextmod;
 
 struct multipart_state {
-       char *boundary;
-       int blen;
-       char *line;
-       int l;
+       struct stream_re *boundre;
+       pcre *bre;
        int state;
 };
 
@@ -41,117 +42,154 @@ setnextmod(int type, void *data, size_t size)
 static void
 setboundary(msgproc *m, int type, void *data, size_t size)
 {
+       char *boundpat = 0;
+       char *b, *s;
+       const char *etxt;
+       int epos;
+       struct stream_re *sre = 0;
        struct multipart_state *mps = msgproc_getpriv(m);
+
        if (type != MULTIPART_BOUNDARY) return;
-       if (mps) {
-               mps->boundary = malloc(size + 2);
-               if (!mps->boundary) return;
-               mps->boundary[0] = mps->boundary[1] = '-';
-               memcpy(mps->boundary + 2, data, size);
-               mps->blen = size + 2;
+       if (!mps) return;
+
+       boundpat = malloc(2*size + strlen("^--(--)?$") + 1);
+       if (!boundpat) goto fail;
+
+       strcpy(boundpat, "^--");
+       s = boundpat + strlen("^--");
+       /* escape PCRE pattern metacharacters in boundary */
+       for (b = (char *)data; b < (char *)data + size; b++) {
+               switch(*b) {
+               case '(':
+               case ')':
+               case '?':
+               case '.':
+               case '+':       /* pattern metacharacters allowed by RFC 1341 */
+               case '\\':
+                       *s++ = '\\';
+                       /* FALLTHROUGH */
+               default:
+                       *s++ = *b;
+               }
        }
+       strcpy(s, "(--)?$");
+
+       mps->bre = pcre_compile(boundpat, PCRE_MULTILINE, &etxt, &epos, 0);
+       if (!mps->bre) goto fail;
+
+       /* fprintf(stderr, "multipart_init: boundpat = %s\n", boundpat); */
+       
+       free(boundpat);  /* no longer needed */
+       boundpat = 0;
+       
+       sre = malloc(sizeof(struct stream_re));
+       if (!sre) goto fail;
+
+       re_stream_start(sre, mps->bre, 0);
+
+       mps->boundre = sre;
+       return;
+
+fail:  if (boundpat) free(boundpat);
+       if (mps->bre) pcre_free(mps->bre);
 }
 
-#define STATE_INMATCH 0
-#define STATE_MATCH 1
-#define STATE_MATCHEOL 2
-#define STATE_NOMATCH 3
-#define STATE_NOMATCHEOL 4
-#define STATE_MATCHEND 5
-#define STATE_DONE 6
+#define STATE_PREAMBLE 0
+#define STATE_PARTS 1
+#define STATE_END 2
 
 static void
 multipart_process(msgproc *m, char *buf, size_t len)
 {
        struct multipart_state *mps = msgproc_getpriv(m);
+       char *s = buf;
+       size_t p, l = len;
+       int ml; 
        msgproc *next;
-       char *s, *t;
-       int r, n;
-
-       /* r = number of characters in current line which match
-               boundary and resided in previous buffer.
-
-          n = number of characters in current line which match
-               boundary and reside in current buffer.
-
-          mps->l = number of characters in boundary matched by
-                   current line (regardless of which buffer(s)).
-
-          s = current character in buffer.
-
-          t = beginning of current part in current buffer.
-       */
-
-       if (!mps) return;
-       if (mps->state == STATE_DONE) return;
-       if (!mps->boundary) return;
-       r = mps->l;
-       next = msgproc_next(m);
-
-       for (s = t = buf; s < buf + len; s++) {
-
-               switch (mps->state) {
-               case STATE_INMATCH: 
-                       if (mps->boundary[mps->l++] != *s) {
-                               mps->state = STATE_NOMATCH;
-                               n = mps->l = 0;
-                               continue;
+       char *pmatch = 0;
+
+       /* fprintf(stderr, "multipart_process: %d chars\n", len); */
+
+       while (l > 0 && mps->state != STATE_END) {      
+               /* re_stream_exec needs to be tweaked:
+                       - set state to nomatch and process to
+                       end of match if partial
+                       match is not at end of input.
+                       - set state to nomatch and process to
+                       beginning of match if previous state
+                       was partial match and new match begins
+                       after beginning of input
+               */
+               if (re_stream_result(mps->boundre) == -1)
+                       pmatch = strdup(re_stream_getresult(mps->boundre));
+                       
+               /* fprintf(stderr, "multipart_process: s=%p, l=%d, state = %d\n",
+                               s, l, mps->state);
+               fwrite(s, p, 1, stderr);
+               fprintf(stderr,"\n");
+               if (pmatch) 
+                       fprintf(stderr, "multipart_process pmatch = '%s'\n",
+                                       pmatch); */
+               p = re_stream_exec(mps->boundre, s, l);
+               if (p < 0) break;
+               /* fprintf(stderr, "multipart_process: p = %d, result = %d\n",
+                               p, re_stream_result(mps->boundre)); */
+
+               switch(mps->state) {
+               case STATE_PREAMBLE:
+                       if (re_stream_result(mps->boundre) == 1) {
+                               s += p;
+                               l -= p;
+                               mps->state = STATE_PARTS;
+                               next = msgproc_create(m, nextmod);
+                               msgproc_start(next);
+                       } else {
+                               s += l;
+                               l = 0;
                        }
-                       n++;
-                       if (mps->l < mps->blen) continue;
-                       mps->state = STATE_MATCH;
-                       if (*s == '\r') mps->state = STATE_MATCHEOL;
-                       if (*s == '-') mps->state = STATE_MATCHEND;
-                       continue;
-               case STATE_MATCH:
-                       if (*s == '\r') mps->state = STATE_MATCHEOL;
-                       continue;
-               case STATE_NOMATCH:
-                       if (*s == '\r') mps->state = STATE_NOMATCHEOL;
-                       continue;
-               case STATE_MATCHEOL:
-                       if (*s == '\n') {
-                               mps->state = STATE_INMATCH;
-                               n = mps->l;
-                               mps->l = 0;
+                       break;
+               case STATE_PARTS:
+                       next = msgproc_next(m);
+                       if (re_stream_result(mps->boundre) == 0) {
+                               msgproc_process(next, s, l);
+                               s += l;
+                               l = 0;
                                break;
+                       } else if (re_stream_result(mps->boundre) == -1) {
+                               ml = strlen(re_stream_getresult(mps->boundre));
+                               if (p > ml) 
+                                       msgproc_process(next, s, p - ml);
+                               s += l;
+                               l = 0;
+                               break;
+                       } else {  /* full match */
+                               char *bm = re_stream_getresult(mps->boundre);
+                               ml = strlen(bm);
+                               if (p > ml) {
+                                       if (pmatch)
+                                               msgproc_process(next, pmatch,
+                                                       strlen(pmatch));
+                                       msgproc_process(next, s, p - ml);
+                               }
+                               msgproc_finish(next);
+                               /* check for -- at end of match */
+                               /* fprintf(stderr, "endofmatch = %s\n",
+                                               bm + ml - 2); */
+                               if (!strcmp("--", bm + ml - 2)) {
+                                       mps->state = STATE_END;
+                                       s += l;
+                                       l = 0;
+                                       break;
+                               }
+                               next = msgproc_create(m, nextmod);
+                               msgproc_start(next);
+                               s += p;
+                               l -= p;
                        }
-                       mps->state = STATE_MATCH;
-                       continue;
-               case STATE_NOMATCHEOL:
-                       if (*s == '\n') mps->state = STATE_INMATCH;
-                       mps->state = STATE_NOMATCH;
-                       continue;;
-               case STATE_MATCHEND:
-                       if (*s != '-') {
-                               mps->state = STATE_MATCH;
-                               continue;
-                       }
-                       mps->state = STATE_DONE;
-                       n = mps->l;
-                       mps->l = 0;
-                       break;
-               case STATE_DONE:
-                       t=s;
-                       continue;
                }
-
-               /* "break" above lands down here, continue skips to next char */
-               /* send stuff downstream.  (including a chunk of
-                       boundary if partial match happened at end
-                       of previous segment) */
-               if (r) {
-                       msgproc_process(next, mps->boundary, r);
-                       r = 0;
-               } 
-               msgproc_process(next, t, s - t - n);
-               n = 0;
-
-               if (mps->state == STATE_DONE) return;
+               if (pmatch) free(pmatch);
+               pmatch = 0;
        }
-
-       /* reached end of buffer.  Process what we have. */
-       msgproc_process(next, t, s - t - n);
 }
 
 void
@@ -159,7 +197,11 @@ multipart_finish(msgproc *m)
 {
        struct multipart_state *mps = msgproc_getpriv(m);
        if (mps) {
-               free(mps->boundary);
+               if (mps->boundre) {
+                       re_stream_stop(mps->boundre);
+                       free(mps->boundre);
+               }
+               if (mps->bre) pcre_free(mps->bre);
        }
        free(mps);
        msgproc_free(m);
index dd99d35a1809cd9bc126d73cfe63f067198ea84c..5230f8d834bf48a374fd6c846959bec9f73fb6fe 100644 (file)
@@ -4,6 +4,7 @@
  */
 /* Stream-based regular expression matching implementation */
 
+#include <stdio.h>   /* XXX -- debugging */
 #include <pcre.h>
 #include <strings.h>
 #include "re_stream.h"
@@ -61,38 +62,65 @@ re_stream_result(struct stream_re *sre)
 int
 re_stream_exec(struct stream_re *sre, char *sub, int slen)
 {
-       int match, flags, newlen, end = slen;
+       int match, flags, newlen, start, end;
 
        if (sre->state == -1) {
                flags = PCRE_PARTIAL | PCRE_DFA_RESTART;
-               sre->state = 1;
        } else {
                flags = PCRE_PARTIAL;
                sre->state = 0;
+               sre->rlen = 0;
+               sre->result[0] = 0;
        }
 
        match = pcre_dfa_exec(sre->re, 0, sub, slen, 0, sre->flags | flags,
                        sre->ovec, sizeof(sre->ovec),
                        sre->wspace, sizeof(sre->wspace));
 
+       /* fprintf(stderr, "in (%d) = %s\n", slen, sub);
+       fprintf(stderr, "pcre_dfa_exec: match = %d\n", match); */
 
-       if (match > 0 || match == PCRE_ERROR_PARTIAL) {
-               end = sre->ovec[1];
-               sre->state = 1;
-               newlen = sre->rlen + sre->ovec[1] - sre->ovec[0];
-               if (newlen >= sre->ralloc) {
-                       sre->ralloc += RALLOC_INIT;
-                       sre->result = reallocf(sre->result, sre->ralloc);
-                       if (!sre->result) return -1;
-               }
-               pcre_copy_substring(sub, sre->ovec, sizeof(sre->ovec),
-                                       0, sre->result + sre->rlen, 
-                                       sre->ralloc - sre->rlen);
-               sre->rlen = newlen;
-
-               if (sre->ovec[1] == slen && match == PCRE_ERROR_PARTIAL) 
-                       sre->state = -1;
+       if (match < 0 && match != PCRE_ERROR_PARTIAL) {
+               sre->result[0] = sre->rlen = 0;
+               sre->state = 0;
+               return slen;
+       }
+
+       start = sre->ovec[0];
+       end = sre->ovec[1];
+
+       if (match == PCRE_ERROR_PARTIAL && end < slen) {
+               /* partial match ending before end of input => no match */
+               sre->result[0] = sre->rlen = 0;
+               sre->state = 0;
+               return end;
        }
+
+       if (sre->state == -1 && start > 0) {
+               /* new match begins after buffer start, previous
+                  buffer had partial match.  set no match and
+                  return start of next match to inform caller */
+               sre->result[0] = sre->rlen = 0;
+               sre->state = 0;
+               return start;
+       }
+
+       newlen = sre->rlen + end - start;
+       if (newlen >= sre->ralloc) {
+               sre->ralloc += RALLOC_INIT;
+               sre->result = reallocf(sre->result, sre->ralloc);
+               if (!sre->result) return -1;
+       }
+       pcre_copy_substring(sub, sre->ovec, sizeof(sre->ovec),
+                               0, sre->result + sre->rlen, 
+                               sre->ralloc - sre->rlen);
+       sre->rlen = newlen;
+
+       if (match == PCRE_ERROR_PARTIAL) 
+               sre->state = -1;
+       else
+               sre->state = 1;
+
        return end;
 }