]> git.mikk.net Git - liburl/commitdiff
URL extraction fixes -- Add more robust text/plain URL regexp,
author chris mikkelson <chris@mikk.net>
Thu, 11 Mar 2010 21:34:56 +0000 (15:34 -0600)
committer chris mikkelson <chris@mikk.net>
Thu, 11 Mar 2010 21:50:41 +0000 (15:50 -0600)
use html-specific regexp for html module, and extract both
anchor (<a href=) and image (<img src=) URLs.

text.c

diff --git a/text.c b/text.c
index 8c77eb20959c0b27e5a7dc653d91fc47cc9a6a58..c0b15d403a242d9ddf30f50b0957d739868eb607 100644 (file)
--- a/text.c
+++ b/text.c
@@ -4,14 +4,19 @@
  */
 
 #include <pcre.h>
+#include <string.h>
 #include <err.h>
 #include "re_stream.h"
 
 #include "msgproc.h"
 
-static const char *text_url_pattern = "https?://[^\\s]+(?=\\s|$|\\))";
+/*static const char *text_url_pattern = "https?://[^\\s]+(?=\\s|$|\\))"; */
+static const char *text_url_pattern =
+       "https?://[a-z0-9.]+(:[0-9]+)?"
+       "(/[0-9a-z_=./+&%?-]+)?"
+       "(?=\r?\n|[^0-9a-z_=./+&%?-])";
 static pcre *text_url_re;
-static const char *html_url_pattern = "(?<=<a href=\")https?://[^\"]+(?=\")"; 
+static const char *html_url_pattern = "<(a href|img src)=\"https?://[^\"]+(?=\")"; 
 static pcre *html_url_re;
 
 static void
@@ -22,11 +27,11 @@ text_init(void) {
        if (initialized) return;
        initialized = 1;
 
-       text_url_re = pcre_compile(text_url_pattern, 0, &etxt, &epos, 0);
+       text_url_re = pcre_compile(text_url_pattern, PCRE_CASELESS, &etxt, &epos, 0);
        if (!text_url_re) {
                errx(1, "text_url_pattern compile error\n");
        }
-       html_url_re = pcre_compile(html_url_pattern, 0, &etxt, &epos, 0);
+       html_url_re = pcre_compile(html_url_pattern, PCRE_CASELESS, &etxt, &epos, 0);
        if (!html_url_re) {
                errx(1, "html_url_pattern compile error\n");
        }
@@ -44,7 +49,7 @@ static void
 text_html_start(msgproc *m)
 {
        struct stream_re *s = malloc(sizeof(struct stream_re));
-       re_stream_start(s, text_url_re, 0);
+       re_stream_start(s, html_url_re, 0);
        msgproc_setpriv(m, (void *)s);
 }
 
@@ -57,7 +62,10 @@ text_process(msgproc *m, char *buf, size_t size)
        while (size > 0) {
                n = re_stream_exec(sr, buf, size);
                if (re_stream_result(sr) == 1) {
-                       char *url = re_stream_getresult(sr);
+                       char *s, *url = re_stream_getresult(sr);
+                       if (m->mp_mod->mpm_type == MSGPROC_HTML)
+                               if ((s = strchr(url, '"'))) 
+                                       url = s + 1;
                        if (m->callback) {
                                m->callback(m, url, m->call_data);
                        }