package feed.parser;
import java.io.Reader;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Stack;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.lang3.StringUtils;
import org.caudexorigo.ErrorAnalyser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * StAX-based feed parser.
 *
 * <p>
 * The parser walks the XML event stream and, for every start element, builds an alias-qualified path
 * such as {@code /rss:rss/rss:channel/rss:title}. Namespace URIs are mapped to short aliases through
 * a static table, and both namespace URIs and local names are compared case-insensitively (they are
 * lower-cased with {@link Locale#ROOT}). The path is then looked up in two static registries: channel
 * processors and entry processors. Any processor found is handed the {@link FeedChannel} being built
 * together with the live {@link XMLStreamReader}.
 * </p>
 *
 * <p>
 * Paths are registered for documents rooted at {@code rss:rss}, {@code rdf:rdf} and
 * {@code atom:feed}.
 * </p>
 */
public class StaxParser
{
    private static final Logger log = LoggerFactory.getLogger(StaxParser.class);

    // Must be declared after 'log': the factory initializer logs if hardening is unsupported.
    private static final XMLInputFactory factory = newInputFactory();

    /** Lower-cased namespace URI -> alias used when building element paths. */
    private static final Map<String, String> ns_aliases = new HashMap<String, String>();

    /** Element path -> processor that updates channel-level data. */
    private static final Map<String, FeedChannelProcessor> feed_channel_processors = new HashMap<String, FeedChannelProcessor>();

    /** Element path -> processor that updates entry-level data. */
    private static final Map<String, FeedEntryProcessor> feed_entry_processors = new HashMap<String, FeedEntryProcessor>();

    static
    {
        registerNamespaceAlias("http://a9.com/-/spec/opensearchrss/1.0/", "opensearch");
        registerNamespaceAlias("http://backend.userland.com/blogChannelModule", "blogChannel");
        registerNamespaceAlias("http://backend.userland.com/creativeCommonsRssModule", "creativeCommons");
        registerNamespaceAlias("http://example.com/dtds/podcast-1.0.dtd", "itunes");
        registerNamespaceAlias("http://freshmeat.net/rss/fm/", "fm");
        registerNamespaceAlias("http://hacks.benhammersley.com/rss/streaming/", "str");
        registerNamespaceAlias("http://madskills.com/public/xml/rss/module/pingback/", "pingback");
        registerNamespaceAlias("http://madskills.com/public/xml/rss/module/trackback/", "trackback");
        registerNamespaceAlias("http://media.tangent.org/rss/1.0/", "audio");
        registerNamespaceAlias("http://my.theinfo.org/changed/1.0/rss/", "cp");
        registerNamespaceAlias("http://postneo.com/icbm/", "icbm");
        registerNamespaceAlias("http://prismstandard.org/namespaces/1.2/basic/", "prism");
        registerNamespaceAlias("http://purl.org/dc/elements/1.1/", "dc");
        registerNamespaceAlias("http://purl.org/dc/terms/", "dcterms");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/aggregation/", "ag");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/annotate/", "annotate");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/company", "co");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/content/", "content");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/email/", "email");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/event/", "ev");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/image/", "image");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/link/", "l");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/reference/", "ref");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/richequiv/", "reqv");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/search/", "search");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/servicestatus/", "ss");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/slash/", "slash");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/subscription/", "sub");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/syndication/", "sy");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/taxonomy/", "taxo");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/textinput/", "ti");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/threading/", "thr");
        registerNamespaceAlias("http://purl.org/rss/1.0/modules/wiki/", "wiki");
        registerNamespaceAlias("http://purl.org/rss/1.0/", "rdf");
        registerNamespaceAlias("http://rssnamespace.org/feedburner/ext/1.0", "feedburner");
        registerNamespaceAlias("http://schemas.xmlsoap.org/soap/envelope/", "soap");
        registerNamespaceAlias("http://webns.net/mvcb/", "admin");
        registerNamespaceAlias("http://web.resource.org/cc/", "cc");
        registerNamespaceAlias("http://wellformedweb.org/commentapi/", "wfw");
        registerNamespaceAlias("http://www.georss.org/georss", "georss");
        registerNamespaceAlias("http://www.itunes.com/dtds/podcast-1.0.dtd", "itunes");
        registerNamespaceAlias("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "rdf");
        registerNamespaceAlias("http://www.w3.org/1999/xhtml", "xhtml");
        registerNamespaceAlias("http://www.w3.org/2000/01/rdf-schema#", "rdfs");
        registerNamespaceAlias("http://www.w3.org/2003/01/geo/wgs84_pos#", "wgs84");
        registerNamespaceAlias("http://www.w3.org/2005/atom", "atom");
        registerNamespaceAlias("http://www.w3.org/XML/1998/namespace", "xml");
        registerNamespaceAlias("http://xmlns.com/foaf/0.1/", "foaf");
        registerNamespaceAlias("http://search.yahoo.com/mrss/", "media");
        // Elements without a namespace are treated as plain RSS.
        registerNamespaceAlias("", "rss");

        // Documents rooted at <rss>
        feed_channel_processors.put("/rss:rss/rss:channel/rss:title", new SimpleFeedChannelProcessor("title"));
        feed_channel_processors.put("/rss:rss/rss:channel/rss:link", new SimpleFeedChannelProcessor("link"));
        feed_channel_processors.put("/rss:rss/rss:channel/rss:description", new SimpleFeedChannelProcessor("description"));
        feed_channel_processors.put("/rss:rss/rss:channel/rss:language", new SimpleFeedChannelProcessor("language"));
        feed_channel_processors.put("/rss:rss/rss:channel/rss:item", new AddFeedEntryProcessor());
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/rss:title", new SimpleFeedEntryProcessor("title"));
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/rss:author", new SimpleFeedEntryProcessor("author"));
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/rss:guid", new GuidProcessor());
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/rss:pubdate", new SimpleFeedEntryProcessor("pubdate"));
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/rss:description", new SimpleFeedEntryProcessor("body"));
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/atom:summary", new SimpleFeedEntryProcessor("body"));
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/rss:link", new SimpleFeedEntryProcessor("link"));
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/feedburner:origlink", new SimpleFeedEntryProcessor("origlink"));
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/rss:category", new CategoryProcessor());
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/rss:tag", new CategoryProcessor());
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/rss:subject", new CategoryProcessor());
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/rss:subtitle", new CategoryProcessor());
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/dc:subject", new CategoryProcessor());
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/dc:date", new SimpleFeedEntryProcessor("pubdate"));
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/dc:creator", new SimpleFeedEntryProcessor("author"));
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/content:encoded", new SimpleFeedEntryProcessor("body"));
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/rss:enclosure", new EnclosureProcessor(Enclosure.Type.RSS));
        feed_entry_processors.put("/rss:rss/rss:channel/rss:item/media:content", new EnclosureProcessor(Enclosure.Type.YAHOO_MEDIA));

        // Documents rooted at <rdf:RDF>
        feed_channel_processors.put("/rdf:rdf/rdf:channel/rdf:title", new SimpleFeedChannelProcessor("title"));
        feed_channel_processors.put("/rdf:rdf/rdf:channel/rdf:link", new SimpleFeedChannelProcessor("link"));
        feed_channel_processors.put("/rdf:rdf/rdf:channel/rdf:description", new SimpleFeedChannelProcessor("description"));
        feed_channel_processors.put("/rdf:rdf/rdf:item", new AddFeedEntryProcessor());
        feed_entry_processors.put("/rdf:rdf/rdf:item/rdf:title", new SimpleFeedEntryProcessor("title"));
        feed_entry_processors.put("/rdf:rdf/rdf:item/dc:subject", new CategoryProcessor());
        feed_entry_processors.put("/rdf:rdf/rdf:item/rdf:link", new SimpleFeedEntryProcessor("link"));
        feed_entry_processors.put("/rdf:rdf/rdf:item/feedburner:origlink", new SimpleFeedEntryProcessor("origlink"));
        feed_entry_processors.put("/rdf:rdf/rdf:item/dc:creator", new SimpleFeedEntryProcessor("author"));
        feed_entry_processors.put("/rdf:rdf/rdf:item/dc:date", new SimpleFeedEntryProcessor("pubdate"));
        feed_entry_processors.put("/rdf:rdf/rdf:item/rdf:description", new SimpleFeedEntryProcessor("body"));

        // Documents rooted at <atom:feed>
        feed_channel_processors.put("/atom:feed/atom:title", new SimpleFeedChannelProcessor("title"));
        feed_channel_processors.put("/atom:feed/atom:subtitle", new SimpleFeedChannelProcessor("description"));
        feed_channel_processors.put("/atom:feed/atom:link", new AtomFeedChannelLinkProcessor("service.feed", "application/x.atom+xml"));
        feed_channel_processors.put("/atom:feed/atom:entry", new AddFeedEntryProcessor());
        feed_entry_processors.put("/atom:feed/atom:entry/atom:title", new SimpleFeedEntryProcessor("title"));
        feed_entry_processors.put("/atom:feed/atom:entry/feedburner:origlink", new SimpleFeedEntryProcessor("origlink"));
        feed_entry_processors.put("/atom:feed/atom:entry/atom:id", new GuidProcessor());
        feed_entry_processors.put("/atom:feed/atom:entry/atom:link", new AtomFeedEntryLinkProcessor("alternate", "text/html"));
        feed_entry_processors.put("/atom:feed/atom:entry/atom:author/atom:name", new SimpleFeedEntryProcessor("author"));
        feed_entry_processors.put("/atom:feed/atom:entry/atom:published", new SimpleFeedEntryProcessor("pubdate"));
        feed_entry_processors.put("/atom:feed/atom:entry/atom:category", new CategoryProcessor());
        feed_entry_processors.put("/atom:feed/atom:entry/atom:content", new AtomContentProcessor());
    }

    /**
     * Creates a parser. All parsing options are passed per call to {@code parse}.
     */
    public StaxParser()
    {
        super();
    }

    // Disabled InputStream-based overloads, kept from the original source for reference.
    // public FeedChannel parse(InputStream stream)
    // {
    // return parse(stream, true);
    // }
    //
    // public FeedChannel parse(InputStream stream, boolean failOnError)
    // {
    // try
    // {
    // XMLStreamReader staxXmlReader = factory.createXMLStreamReader(stream,
    // "UTF-8");
    // return _parse(staxXmlReader, failOnError);
    // }
    // catch (Throwable t)
    // {
    // if (failOnError)
    // {
    // throw new RuntimeException(t);
    // }
    // else
    // {
    // Throwable r = ErrorAnalyser.findRootCause(t);
    // log.error(r.getMessage(), r);
    // return new FeedChannel();
    // }
    // }
    // }

    /**
     * Parses a feed, failing on the first error and without HTML stripping.
     *
     * <p>
     * Equivalent to {@code parse(reader, true, false)}.
     * </p>
     *
     * @param reader source of the feed XML; it is not closed by this method
     * @return the parsed channel
     * @throws RuntimeException wrapping any error raised while parsing
     */
    public FeedChannel parse(Reader reader)
    {
        return parse(reader, true, false);
    }

    /**
     * Parses a feed from the given reader.
     *
     * @param reader source of the feed XML; it is not closed by this method
     * @param failOnError when {@code true}, any error is rethrown wrapped in a
     *            {@link RuntimeException}; when {@code false}, errors are logged, parsing continues
     *            where possible, and an empty channel is returned if the stream cannot be read at all
     * @param stripHtml flag handed to the {@link FeedChannel} constructor
     * @return the parsed channel, or a new empty channel if parsing failed and {@code failOnError} is
     *         {@code false}
     * @throws RuntimeException if {@code failOnError} is {@code true} and parsing fails
     */
    public FeedChannel parse(Reader reader, boolean failOnError, boolean stripHtml)
    {
        XMLStreamReader staxXmlReader = null;
        try
        {
            staxXmlReader = factory.createXMLStreamReader(reader);
            return _parse(staxXmlReader, failOnError, stripHtml);
        }
        catch (Throwable t)
        {
            if (failOnError)
            {
                throw new RuntimeException(t);
            }
            Throwable r = ErrorAnalyser.findRootCause(t);
            log.error(r.getMessage(), r);
            return new FeedChannel(stripHtml);
        }
        finally
        {
            // Releases parser resources only; XMLStreamReader.close() leaves the caller's Reader open.
            closeQuietly(staxXmlReader);
        }
    }

    /**
     * Drives the event loop: dispatches start elements to the registered processors and keeps the
     * element stack in step with end elements until the end of the document.
     *
     * @param staxXmlReader reader positioned at the start of the document
     * @param failOnError whether per-event errors abort parsing or are only logged
     * @param stripHtml flag handed to the {@link FeedChannel} constructor
     * @return the channel populated by the processors
     * @throws Throwable if advancing the reader fails, or a per-event error occurs while
     *             {@code failOnError} is {@code true}
     */
    private FeedChannel _parse(XMLStreamReader staxXmlReader, boolean failOnError, boolean stripHtml) throws Throwable
    {
        Stack<FeedXmlElement> stack = new Stack<FeedXmlElement>();
        FeedChannel feed_channel = new FeedChannel(stripHtml);
        int eventType = staxXmlReader.getEventType();
        do
        {
            try
            {
                if (eventType == XMLStreamConstants.START_ELEMENT)
                {
                    boolean entryProcessorFailed = dispatchStartElement(staxXmlReader, stack, feed_channel);
                    // Processors may advance the reader, so always re-read the event it is on now.
                    eventType = staxXmlReader.getEventType();
                    if (entryProcessorFailed)
                    {
                        // Handle the reader's current event in the next iteration instead of
                        // skipping it with next(). In a do/while, 'continue' jumps to the loop
                        // condition, so eventType must already reflect the reader's position.
                        // NOTE(review): presumably the usual case is a processor stopping at a
                        // nested start tag, which must still be pushed; confirm against the
                        // processor implementations.
                        continue;
                    }
                }
                if (eventType == XMLStreamConstants.END_ELEMENT)
                {
                    stack.pop();
                }
            }
            catch (Throwable t)
            {
                if (failOnError)
                {
                    throw new RuntimeException(t);
                }
                Throwable r = ErrorAnalyser.findRootCause(t);
                log.error(r.getMessage(), r);
            }
            eventType = staxXmlReader.next();
        }
        while (eventType != XMLStreamConstants.END_DOCUMENT);
        return feed_channel;
    }

    /**
     * Pushes the current start element on the stack and runs the channel and entry processors
     * registered for the resulting path.
     *
     * @return {@code true} if the entry processor failed with an {@link XMLStreamException} as root
     *         cause (the failure is logged); {@code false} otherwise
     * @throws RuntimeException wrapping the root cause if the entry processor failed for any other
     *             reason
     * @throws Throwable anything raised by the channel processor or by the reader accessors
     */
    private boolean dispatchStartElement(XMLStreamReader staxXmlReader, Stack<FeedXmlElement> stack, FeedChannel feed_channel) throws Throwable
    {
        String rawNs = staxXmlReader.getNamespaceURI();
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
        String ns = StringUtils.isBlank(rawNs) ? "" : rawNs.toLowerCase(Locale.ROOT);
        String lname = staxXmlReader.getLocalName().toLowerCase(Locale.ROOT);
        // The alias is null for namespaces that are not in the alias table.
        String alias_prefix = ns_aliases.get(ns);
        stack.push(new FeedXmlElement(alias_prefix, lname));

        String path = buildPath(stack);
        log.debug("Xml element path: {}", path);

        FeedChannelProcessor fc_proc = feed_channel_processors.get(path);
        if (fc_proc != null)
        {
            fc_proc.process(feed_channel, staxXmlReader);
        }

        FeedEntryProcessor fe_proc = feed_entry_processors.get(path);
        if (fe_proc == null)
        {
            return false;
        }
        try
        {
            fe_proc.process(feed_channel, staxXmlReader);
            return false;
        }
        catch (Throwable t)
        {
            Throwable r = ErrorAnalyser.findRootCause(t);
            if (r instanceof XMLStreamException)
            {
                log.error("Invalid feed element", r);
                return true;
            }
            throw new RuntimeException(r);
        }
    }

    /**
     * Renders the element stack, bottom to top, as a slash-separated path with a leading slash.
     *
     * @param stack open elements, outermost first
     * @return the path, or {@code "/"} for an empty stack
     */
    private String buildPath(Stack<FeedXmlElement> stack)
    {
        StringBuilder sb = new StringBuilder("/");
        boolean first = true;
        for (FeedXmlElement fxe : stack)
        {
            if (!first)
            {
                sb.append("/");
            }
            sb.append(fxe.toString());
            first = false;
        }
        return sb.toString();
    }

    /**
     * Registers a namespace alias under the lower-cased URI, matching the lower-casing applied to
     * namespace URIs read from the document.
     */
    private static void registerNamespaceAlias(String namespaceUri, String alias)
    {
        ns_aliases.put(namespaceUri.toLowerCase(Locale.ROOT), alias);
    }

    /**
     * Creates the shared input factory with external entity resolution switched off.
     *
     * <p>
     * NOTE(review): DTD support is left at the implementation default so that feeds declaring a
     * DOCTYPE keep parsing; consider also disabling {@link XMLInputFactory#SUPPORT_DTD} if that is
     * acceptable for the feeds consumed.
     * </p>
     */
    private static XMLInputFactory newInputFactory()
    {
        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
        try
        {
            inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        }
        catch (IllegalArgumentException e)
        {
            // Do not fail class initialization if the implementation rejects the property.
            log.warn("XMLInputFactory does not allow disabling external entities", e);
        }
        return inputFactory;
    }

    /**
     * Closes the stream reader if one was created, logging instead of propagating any failure so
     * that it cannot mask the outcome of the parse.
     */
    private static void closeQuietly(XMLStreamReader staxXmlReader)
    {
        if (staxXmlReader == null)
        {
            return;
        }
        try
        {
            staxXmlReader.close();
        }
        catch (Exception e)
        {
            log.debug("Failed to close XML stream reader", e);
        }
    }
}