From 6ad4b4f27627595a66d05b4904cc6a24ae13e2c2 Mon Sep 17 00:00:00 2001 From: chris mikkelson Date: Thu, 11 Mar 2010 15:34:56 -0600 Subject: [PATCH] URL extraction fixes -- Add more robust text/plain URL regexp, use html-specific regexp for html module, and extract both anchor ( +#include #include #include "re_stream.h" #include "msgproc.h" -static const char *text_url_pattern = "https?://[^\\s]+(?=\\s|$|\\))"; +/*static const char *text_url_pattern = "https?://[^\\s]+(?=\\s|$|\\))"; */ +static const char *text_url_pattern = + "https?://[a-z0-9.]+(:[0-9]+)?" + "(/[0-9a-z_=./+&%?-]+)?" + "(?=\r?\n|[^0-9a-z_=./+&%?-])"; static pcre *text_url_re; -static const char *html_url_pattern = "(?<= 0) { n = re_stream_exec(sr, buf, size); if (re_stream_result(sr) == 1) { - char *url = re_stream_getresult(sr); + char *s, *url = re_stream_getresult(sr); + if (m->mp_mod->mpm_type == MSGPROC_HTML) + if ((s = strchr(url, '"'))) + url = s + 1; if (m->callback) { m->callback(m, url, m->call_data); } -- 2.50.1