Refactor adblock simplify parsing and to honor matching options

There is a known flaw in that URL rules may erroneously match domains.

The code path for WebKitGTK+ < 1.1.14 is now crashing.
This commit is contained in:
Alexander Butenko 2009-11-20 01:00:48 +01:00 committed by Christian Dywan
parent 5580b62e00
commit 2d514bfb6c

View file

@ -26,6 +26,13 @@
(__filter && (g_str_has_prefix (__filter, "http") \
|| g_str_has_prefix (__filter, "file")))
typedef struct
{
const gchar* page_uri;
const gchar* uri;
const gchar* query;
} Matcher;
static GHashTable* pattern;
static gchar* blockcss = NULL;
static gchar* blockcssprivate = NULL;
@ -41,21 +48,20 @@ adblock_build_js (const gchar* style,
return g_strdup_printf (
"window.addEventListener ('DOMContentLoaded',"
"function () {"
"var URL = location.href;"
"var sites = new Array(); %s;"
"var public = '%s';"
"for (var i in sites) {"
"if (URL.indexOf(i) != -1) {"
"public += sites[i];"
"break;"
"}}"
"public += ' {display: none !important;}';"
"var mystyle = document.createElement(\"style\");"
"mystyle.setAttribute(\"type\", \"text/css\");"
"mystyle.appendChild(document.createTextNode(public));"
"var head = document.getElementsByTagName(\"head\")[0];"
"if (head) head.appendChild(mystyle);"
"else document.documentElement.insertBefore(mystyle, document.documentElement.firstChild);"
" var URL = location.href;"
" var sites = new Array(); %s;"
" var public = '%s';"
" for (var i in sites) {"
" if (URL.indexOf(i) != -1) {"
" public += sites[i];"
" break;"
" }}"
" public += ' {display: none !important;}';"
" var mystyle = document.createElement('style');"
" mystyle.setAttribute('type', 'text/css');"
" mystyle.appendChild(document.createTextNode(public));"
" var head = document.getElementsByTagName('head')[0];"
" if (head) head.appendChild(mystyle);"
"}, true);",
private,
style);
@ -73,10 +79,6 @@ adblock_fixup_regexp (gchar* src)
/* FIXME: Avoid always allocating twice the string */
s = dst = g_malloc (strlen (src) * 2);
/* |http:// means ^http:// */
if (src[0] == '|')
src[0] = '^';
while (*src)
{
switch (*src)
@ -93,6 +95,15 @@ adblock_fixup_regexp (gchar* src)
case '|':
*s++ = '\\';
break;
case '/':
*s++ = '\\';
break;
/* FIXME: We actually need to match :[0-9]+ or '/'. Sign means
"here could be port number or nothing". So bla.com^ will match
bla.com/ or bla.com:8080/ but not bla.com.au/ */
case '^':
*src = '?';
break;
}
*s++ = *src;
src++;
@ -518,11 +529,37 @@ adblock_browser_populate_tool_menu_cb (MidoriBrowser* browser,
}
static gboolean
adblock_is_matched (const gchar* patt,
adblock_is_matched (const gchar* opts,
const GRegex* regex,
const gchar* uri)
Matcher* data)
{
return g_regex_match_full (regex, uri, -1, 0, 0, NULL, NULL);
gchar* patt;
if (g_regex_match_simple ("type=fulluri,", opts, G_REGEX_UNGREEDY, G_REGEX_MATCH_NOTEMPTY))
patt = g_strdup (data->uri);
else
patt = g_strdup (data->query);
if (g_regex_match_full (regex, patt, -1, 0, 0, NULL, NULL))
{
if (g_regex_match_simple (",third-party", opts,
G_REGEX_CASELESS, G_REGEX_MATCH_NOTEMPTY))
{
if (data->page_uri && g_regex_match_full (regex, data->page_uri, -1, 0, 0, NULL, NULL))
{
g_free (patt);
return FALSE;
}
}
/* TODO: Domain opt check */
g_free (patt);
return TRUE;
}
else
{
g_free (patt);
return FALSE;
}
}
#if HAVE_WEBKIT_RESOURCE_REQUEST
@ -534,10 +571,38 @@ adblock_resource_request_starting_cb (WebKitWebView* web_view,
WebKitNetworkResponse* response,
GtkWidget* image)
{
const gchar* uri = webkit_network_request_get_uri (request);
if (!strncmp (uri, "data", 4))
Matcher data;
const char *page_uri;
const gchar* uri;
SoupMessage* msg;
SoupURI* soup_uri;
uri = webkit_network_request_get_uri (request);
if (!strncmp (uri, "data", 4) || !strncmp (uri, "file", 4))
return;
if (g_hash_table_find (pattern, (GHRFunc) adblock_is_matched, (char*)uri))
msg = webkit_network_request_get_message (request);
if (!msg)
return;
if (msg->method && !strncmp (msg->method, "POST", 4))
return;
soup_uri = soup_uri_new (uri);
if (soup_uri->query)
data.query = g_strdup_printf ("%s?%s", soup_uri->path, soup_uri->query);
else
data.query = g_strdup (soup_uri->path);
soup_uri_free (soup_uri);
data.uri = uri;
page_uri = webkit_web_view_get_uri (web_view);
if (!page_uri || !strcmp (page_uri, "about:blank"))
page_uri = uri;
data.page_uri = page_uri;
if (g_hash_table_find (pattern, (GHRFunc) adblock_is_matched, &data))
{
#if 0
gchar* text;
@ -558,9 +623,32 @@ static void
adblock_session_request_queued_cb (SoupSession* session,
SoupMessage* msg)
{
SoupURI* soup_uri = soup_message_get_uri (msg);
gchar* uri = soup_uri ? soup_uri_to_string (soup_uri, FALSE) : g_strdup ("");
if (g_hash_table_find (pattern, (GHRFunc) adblock_is_matched, uri))
Matcher data;
SoupURI* soup_uri;
gchar* uri;
gchar* page_uri;
if (msg->method && !strncmp (msg->method, "POST", 4))
return;
/* FIXME: There is a crasher somewhere introduced with the refactoring */
soup_uri = soup_message_get_uri (msg);
uri = soup_uri_to_string (soup_uri, FALSE);
if (soup_uri->query)
data.query = g_strdup_printf ("%s?%s", soup_uri->path, soup_uri->query);
else
data.query = g_strdup (soup_uri->path);
soup_uri_free (soup_uri);
data.uri = uri;
page_uri = NULL; /* FIXME */
if (!page_uri || !strcmp (page_uri, "about:blank"))
page_uri = uri;
data.page_uri = page_uri;
if (g_hash_table_find (pattern, (GHRFunc) adblock_is_matched, &data))
{
/* FIXME: Update image tooltip */
@ -635,6 +723,27 @@ adblock_app_add_browser_cb (MidoriApp* app,
g_object_unref (statusbar);
}
static void
adblock_compile_regexp (GHashTable* tbl,
gchar* patt,
gchar* opts)
{
GRegex* regex;
GError* error = NULL;
/* TODO: Play with optimization flags */
regex = g_regex_new (patt, G_REGEX_OPTIMIZE,
G_REGEX_MATCH_NOTEMPTY, &error);
if (!error)
g_hash_table_insert (tbl, opts, regex);
else
{
g_warning ("%s: %s", G_STRFUNC, error->message);
g_error_free (error);
}
}
static void
adblock_frame_add (gchar* line)
{
@ -657,8 +766,9 @@ adblock_frame_add_private (gchar* line)
{
gchar** domains;
gint max, i;
domains = g_strsplit (data[0], ",", -1);
for (max = i = 0; domains [i]; i++)
for (max = i = 0; domains[i]; i++)
{
new_blockcss = g_strdup_printf ("%s;\nsites['%s']+=',%s'",
blockcssprivate, g_strstrip (domains[i]), data[1]);
@ -675,6 +785,56 @@ adblock_frame_add_private (gchar* line)
g_strfreev (data);
}
static void
adblock_add_url_pattern (gchar* line)
{
gchar* opts;
gchar** data;
gchar* patt;
gchar* parsed;
if (line[0] == '|' && line[1] == '|' )
{
(void)*line++;
(void)*line++;
data = g_strsplit (line, "$", 2);
parsed = adblock_fixup_regexp (data[0]);
patt = g_strdup_printf ("^https?://([a-z0-9\\.]+)?%s", parsed);
if (data[1])
opts = g_strdup_printf ("type=fulluri,regexp=%s,%s", patt, data[1]);
else
opts = g_strdup_printf ("type=fulluri,regexp=%s", patt);
g_strfreev (data);
g_free (parsed);
}
else if (line[0] == '|')
{
(void)*line++;
data = g_strsplit (line, "$", 2);
parsed = adblock_fixup_regexp (data[0]);
patt = g_strdup_printf ("^%s", parsed);
if (data[1])
opts = g_strdup_printf ("type=fulluri,regexp=%s,%s", patt, data[1]);
else
opts = g_strdup_printf ("type=fulluri,regexp=%s", patt);
g_strfreev (data);
g_free (parsed);
}
else
{
patt = adblock_fixup_regexp (line);
opts = g_strdup_printf ("regexp=%s", patt);
}
/* g_debug ("got: %s opts %s", patt, opts); */
adblock_compile_regexp (pattern, patt, opts);
g_free (patt);
}
static gchar*
adblock_parse_line (gchar* line)
{
@ -687,63 +847,42 @@ adblock_parse_line (gchar* line)
/* FIXME: No support for whitelisting */
if (line[0] == '@' && line[1] == '@')
return NULL;
/* FIXME: What is this? */
if (line[0] == '|' && line[1] == '|')
/* FIXME: No support for [include] and [exclude] tags */
if (line[0] == '[')
return NULL;
/* ditto */
if (strstr (line,"$"))
return NULL;
/* Got block hider */
if (line[0] == '#' && line[1] == '#' && (line[2] == '.' || line[2] == '#'
|| line[2] == 'A' || line[2] == 'a' || line[2] == 'D' || line[2] == 'U'))
/* Got CSS block hider */
if (line[0] == '#' && line[1] == '#' )
{
adblock_frame_add (line);
return NULL;
}
/* FIXME: Do we have smth else starting with ##? */
if (line[0] == '#' && line[1] == '#')
/* Some crazy lists do this */
if (line[0] == '#')
return NULL;
/* Got per domain CSS hider rule */
if (strstr (line,"##"))
{
adblock_frame_add_private (line);
return NULL;
}
/* FIXME: No support for [include] and [exclude] tags */
if (line[0] == '[')
return NULL;
return adblock_fixup_regexp (line);
/* Got URL blocker rule */
adblock_add_url_pattern (line);
return line;
}
static void
adblock_parse_file (gchar* path)
{
FILE* file;
gchar line[500];
if ((file = g_fopen (path, "r")))
{
gchar line[500];
GRegex* regex;
while (fgets (line, 500, file))
{
GError* error = NULL;
gchar* parsed;
parsed = adblock_parse_line (line);
if (!parsed)
continue;
regex = g_regex_new (parsed, G_REGEX_OPTIMIZE,
G_REGEX_MATCH_NOTEMPTY, &error);
if (error)
{
g_warning ("%s: %s", G_STRFUNC, error->message);
g_error_free (error);
g_free (parsed);
}
else
g_hash_table_insert (pattern, parsed, regex);
}
adblock_parse_line (line);
fclose (file);
}
}
@ -846,6 +985,7 @@ test_adblock_pattern (void)
temp = g_file_open_tmp ("midori_adblock_match_test_XXXXXX", &filename, NULL);
/* TODO: Update some tests and add new ones. */
g_file_set_contents (filename,
"*ads.foo.bar*\n"
"*ads.bogus.name*\n"