20d03e43aa
Previously the description was used in the treeview if the title was not present; however, this is not possible in some cases, as it may contain markup. The RSS code is changed so that if there is no title, the description is used as the title, but with the HTML stripped from it. We also have to consider that some description elements may simply contain an HTML tag (and therefore no text). In that case the URI is used as the title.
264 lines
6 KiB
C
264 lines
6 KiB
C
/*
|
|
Copyright (C) 2009 Dale Whittaker <dayul@users.sf.net>
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
See the file COPYING for the full license text.
|
|
*/
|
|
|
|
#include "feed-parse.h"
|
|
#include <time.h>
|
|
|
|
gchar*
|
|
feed_get_element_string (FeedParser* fparser)
|
|
{
|
|
xmlNodePtr node;
|
|
|
|
node = fparser->node;
|
|
|
|
if (!node->children ||
|
|
xmlIsBlankNode (node->children) ||
|
|
node->children->type != XML_TEXT_NODE
|
|
)
|
|
{
|
|
/* Some servers add required elements with no content,
|
|
* create a dummy string to handle it.
|
|
*/
|
|
return g_strdup (" ");
|
|
}
|
|
|
|
return (gchar*)xmlNodeListGetString (fparser->doc, node->children, 1);
|
|
}
|
|
|
|
static void
|
|
handle_markup_chars (void* user_data,
|
|
const xmlChar* ch,
|
|
int len)
|
|
{
|
|
if (len > 0)
|
|
{
|
|
gchar** markup;
|
|
gchar* temp;
|
|
|
|
markup = (gchar**)user_data;
|
|
temp = g_strndup ((gchar*)ch, len);
|
|
*markup = (*markup) ? g_strconcat (*markup, temp, NULL) : g_strdup (temp);
|
|
g_free (temp);
|
|
}
|
|
}
|
|
|
|
/*
 * feed_remove_markup:
 * @markup: string to clean; this function takes ownership of it
 *
 * If @markup contains a '<' followed somewhere by a '>', or contains
 * an '&', it is run through the HTML SAX parser and only the reported
 * character data is kept; @markup itself is then freed.  Otherwise
 * @markup is handed back untouched.
 *
 * Return value: @markup itself, or a newly allocated string holding the
 * collected text.  The result is NULL when the parser reported no
 * character data at all.
 */
gchar*
feed_remove_markup (gchar* markup)
{
    const xmlChar* open_tag;
    gboolean has_tag;
    gchar* plain;
    htmlSAXHandlerPtr handler;

    open_tag = xmlStrchr (BAD_CAST markup, '<');
    has_tag = (open_tag != NULL && xmlStrchr (open_tag, '>') != NULL);

    /* Nothing that looks like a tag or an entity: keep the input. */
    if (!has_tag && !xmlStrchr (BAD_CAST markup, '&'))
        return markup;

    plain = NULL;

    /* Only the "characters" callback is installed, every other SAX
     * event is ignored.
     */
    handler = g_new0 (htmlSAXHandler, 1);
    handler->characters = handle_markup_chars;
    htmlSAXParseDoc (BAD_CAST markup, "UTF-8", handler, &plain);
    g_free (handler);

    g_free (markup);

    return plain;
}
|
|
|
|
gchar*
|
|
feed_get_element_markup (FeedParser* fparser)
|
|
{
|
|
gchar* markup;
|
|
|
|
markup = feed_get_element_string (fparser);
|
|
return feed_remove_markup (markup);
|
|
}
|
|
|
|
gint64
|
|
feed_get_element_date (FeedParser* fparser)
|
|
{
|
|
time_t date;
|
|
gchar* content;
|
|
|
|
date = 0;
|
|
content = feed_get_element_string (fparser);
|
|
|
|
if (content)
|
|
{
|
|
SoupDate* sdate;
|
|
|
|
sdate = soup_date_new_from_string (content);
|
|
date = soup_date_to_time_t (sdate);
|
|
soup_date_free (sdate);
|
|
g_free (content);
|
|
}
|
|
return ((gint64)date);
|
|
}
|
|
|
|
KatzeItem*
|
|
feed_item_exists (KatzeArray* array,
|
|
KatzeItem* item)
|
|
{
|
|
const gchar* guid;
|
|
gchar* hstr;
|
|
guint hash;
|
|
|
|
guid = katze_item_get_token (item);
|
|
if (!guid)
|
|
{
|
|
hstr = g_strjoin (NULL,
|
|
katze_item_get_name (item),
|
|
katze_item_get_uri (item),
|
|
katze_item_get_text (item),
|
|
NULL);
|
|
hash = g_str_hash (hstr);
|
|
g_free (hstr);
|
|
|
|
hstr = g_strdup_printf ("%u", hash);
|
|
katze_item_set_token (item, hstr);
|
|
g_free (hstr);
|
|
|
|
guid = katze_item_get_token (item);
|
|
}
|
|
|
|
return (katze_array_find_token (array, guid));
|
|
}
|
|
|
|
void
|
|
feed_parse_node (FeedParser* fparser)
|
|
{
|
|
xmlNodePtr node;
|
|
xmlNodePtr child;
|
|
|
|
if (!*fparser->error)
|
|
{
|
|
if (fparser->preparse)
|
|
(*fparser->preparse) (fparser);
|
|
|
|
if (fparser->parse)
|
|
{
|
|
node = fparser->node;
|
|
child = node->last;
|
|
|
|
while (child)
|
|
{
|
|
if (child->type == XML_ELEMENT_NODE)
|
|
{
|
|
fparser->node = child;
|
|
|
|
(*fparser->parse) (fparser);
|
|
|
|
if (*fparser->error)
|
|
break;
|
|
}
|
|
child = child->prev;
|
|
}
|
|
fparser->node = node;
|
|
}
|
|
|
|
if (fparser->postparse)
|
|
(*fparser->postparse) (fparser);
|
|
}
|
|
}
|
|
|
|
static void
|
|
feed_parse_doc (xmlDocPtr doc,
|
|
GSList* parsers,
|
|
KatzeArray* array,
|
|
GError** error)
|
|
{
|
|
FeedParser* fparser;
|
|
xmlNodePtr root;
|
|
gboolean isvalid;
|
|
|
|
root = xmlDocGetRootElement (doc);
|
|
|
|
if (!root)
|
|
{
|
|
*error = g_error_new (FEED_PARSE_ERROR,
|
|
FEED_PARSE_ERROR_MISSING_ELEMENT,
|
|
_("Failed to find root element in feed XML data."));
|
|
return;
|
|
}
|
|
|
|
while (parsers)
|
|
{
|
|
fparser = (FeedParser*)parsers->data;
|
|
fparser->error = error;
|
|
fparser->doc = doc;
|
|
fparser->node = root;
|
|
|
|
if (fparser && fparser->isvalid)
|
|
{
|
|
isvalid = (*fparser->isvalid) (fparser);
|
|
|
|
if (*fparser->error)
|
|
return;
|
|
|
|
if (isvalid)
|
|
{
|
|
fparser->item = KATZE_ITEM (array);
|
|
|
|
if (fparser->update &&
|
|
(*fparser->update) (fparser))
|
|
feed_parse_node (fparser);
|
|
}
|
|
}
|
|
|
|
fparser->error = NULL;
|
|
fparser->doc = NULL;
|
|
fparser->node = NULL;
|
|
|
|
if (isvalid)
|
|
return;
|
|
|
|
parsers = g_slist_next (parsers);
|
|
}
|
|
|
|
*error = g_error_new (FEED_PARSE_ERROR,
|
|
FEED_PARSE_ERROR_INVALID_FORMAT,
|
|
_("Unsupported feed format."));
|
|
}
|
|
|
|
gboolean
|
|
parse_feed (gchar* data,
|
|
gint64 length,
|
|
GSList* parsers,
|
|
KatzeArray* array,
|
|
GError** error)
|
|
{
|
|
xmlDocPtr doc;
|
|
xmlErrorPtr xerror;
|
|
|
|
LIBXML_TEST_VERSION
|
|
|
|
doc = xmlReadMemory (
|
|
data, length, "feedfile.xml", NULL,
|
|
XML_PARSE_NOWARNING | XML_PARSE_NOERROR /*| XML_PARSE_RECOVER*/
|
|
);
|
|
|
|
if (doc)
|
|
{
|
|
feed_parse_doc (doc, parsers, array, error);
|
|
xmlFreeDoc (doc);
|
|
}
|
|
else
|
|
{
|
|
xerror = xmlGetLastError ();
|
|
*error = g_error_new (FEED_PARSE_ERROR,
|
|
FEED_PARSE_ERROR_PARSE,
|
|
_("Failed to parse XML feed: %s"),
|
|
xerror->message);
|
|
xmlResetLastError ();
|
|
}
|
|
xmlCleanupParser ();
|
|
xmlMemoryDump ();
|
|
|
|
return *error ? FALSE : TRUE;
|
|
}
|
|
|