package net.x4a42.volksempfaenger.feedparser;
import java.io.IOException;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.EmptyStackException;
import java.util.Locale;
import java.util.Stack;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.x4a42.volksempfaenger.Utils;
import net.x4a42.volksempfaenger.feedparser.Enums.AtomRel;
import net.x4a42.volksempfaenger.feedparser.Enums.Mime;
import net.x4a42.volksempfaenger.feedparser.Enums.Namespace;
import net.x4a42.volksempfaenger.feedparser.Enums.Tag;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class FeedParser {
public static void parseEvented(Reader reader, FeedParserListener listener)
throws FeedParserException, IOException {
try {
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser parser;
FeedHandler handler = new FeedHandler(listener);
parser = factory.newSAXParser();
parser.parse(new InputSource(reader), handler);
if (!handler.isFeed) {
throw new NotAFeedException();
}
} catch (ParserConfigurationException e) {
throw new FeedParserException(e);
} catch (SAXException e) {
throw new FeedParserException(e);
} catch (NullPointerException e) {
throw new FeedParserException("NullPointerException inside Parser",
e);
}
}
public static Feed parse(Reader reader) throws FeedParserException,
IOException {
LegacyFeedParserListener listener = new LegacyFeedParserListener();
FeedParser.parseEvented(reader, listener);
return listener.feed;
}
private static class LegacyFeedParserListener implements FeedParserListener {
public Feed feed;
private final ArrayList<FeedItem> feedItems = new ArrayList<FeedItem>();
private final ArrayList<Enclosure> enclosures = new ArrayList<Enclosure>();
@Override
public void onFeedItem(FeedItem feedItem) {
feedItems.add(feedItem);
feedItem.enclosures.addAll(enclosures);
enclosures.clear();
}
@Override
public void onFeed(Feed feed) {
this.feed = feed;
feed.items.addAll(feedItems);
feedItems.clear();
}
@Override
public void onEnclosure(Enclosure enclosure) {
enclosures.add(enclosure);
}
}
private static class FeedHandler extends DefaultHandler {
public boolean isFeed = false;
private final Feed feed = new Feed();
private FeedItem feedItem = new FeedItem();
private Enclosure enclosure = new Enclosure();
private final Stack<Tag> parents = new Stack<Tag>();
private boolean skipMode = false;
private boolean xhtmlMode = false;
private boolean currentRssItemHasHtml = false;
private boolean currentItemHasITunesSummaryAlternative = false;
private boolean currentAtomItemHasPublished = false;
private boolean hasITunesImage = false;
private int skipDepth = 0;
private final StringBuilder buffer = new StringBuilder();
private static final String ATOM_ATTR_HREF = "href";
private static final String ATOM_ATTR_REL = "rel";
private static final String ATOM_ATTR_TYPE = "type";
private static final String ATOM_ATTR_LENGTH = "length";
private static final String ATOM_ATTR_TITLE = "title";
private static final String RSS_ATTR_URL = "url";
private static final String RSS_ATTR_TYPE = "type";
private static final String RSS_ATTR_LENGTH = "length";
private final FeedParserListener listener;
public FeedHandler(FeedParserListener listener) {
super();
this.listener = listener;
}
@Override
public void startElement(String uri, String localName, String qName,
Attributes atts) throws SAXException {
if (skipMode) {
skipDepth++;
} else {
Namespace ns = getNamespace(uri);
Tag tag = getTag(ns, localName);
if (!isFeed) {
// is current element one of the toplevel elements
if (((ns == Namespace.ATOM) && tag == Tag.ATOM_FEED)
|| ((ns == Namespace.NONE) && tag == Tag.RSS_TOPLEVEL)
|| ((ns == Namespace.RSS) && tag == Tag.RSS_TOPLEVEL)) {
isFeed = true;
}
}
if (ns == Namespace.ATOM) {
onStartTagAtom(tag, atts);
} else if (ns == Namespace.NONE || ns == Namespace.RSS
|| ns == Namespace.RSS_CONTENT) {
onStartTagRss(tag, atts);
} else if (ns == Namespace.XHTML && xhtmlMode) {
onStartTagXHtml(localName, atts);
} else if (ns == Namespace.ITUNES) {
onStartTagITunes(tag, atts);
} else {
skipMode = true;
skipDepth = 0;
return;
}
parents.push(tag);
}
}
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
if (skipMode) {
return;
}
if (!safePeek(Tag.UNKNOWN) || xhtmlMode) {
if (safePeek(Tag.RSS_DESCRIPTION) && currentRssItemHasHtml) {
// we already have an HTML version of this, so just ignore
// the plaintext
return;
}
buffer.append(ch, start, length);
}
}
@Override
public void endElement(String uri, String localName, String qName)
throws SAXException {
if (skipMode) {
if (skipDepth == 0) {
skipMode = false;
} else {
skipDepth--;
}
} else {
Namespace ns = getNamespace(uri);
Tag tag;
try {
tag = parents.pop();
} catch (EmptyStackException e) {
return;
}
if (ns == Namespace.ATOM) {
onEndTagAtom(tag);
} else if (ns == Namespace.NONE || ns == Namespace.RSS
|| ns == Namespace.RSS_CONTENT) {
onEndTagRss(tag);
} else if (ns == Namespace.XHTML && xhtmlMode) {
onEndTagXHtml(localName);
} else if (ns == Namespace.ITUNES) {
onEndTagITunes(tag);
}
if (tag != Tag.UNKNOWN) {
// clear buffer
buffer.setLength(0);
}
}
}
private void onStartTagAtom(Tag tag, Attributes atts) {
switch (tag) {
case ATOM_ENTRY:
feedItem = new FeedItem();
feedItem.feed = feed;
currentItemHasITunesSummaryAlternative = false;
currentAtomItemHasPublished = false;
break;
case ATOM_CONTENT:
if (atts.getValue(ATOM_ATTR_TYPE).equals("xhtml")) {
xhtmlMode = true;
}
case ATOM_LINK:
String relString = atts.getValue(ATOM_ATTR_REL);
AtomRel rel = AtomRel.UNKNOWN;
if (relString != null) {
rel = getAtomRel(relString);
relString = null;
}
switch (rel) {
case ENCLOSURE:
if (safePeek(Tag.ATOM_ENTRY)) {
enclosure = new Enclosure();
enclosure.feedItem = feedItem;
enclosure.url = atts.getValue(ATOM_ATTR_HREF);
enclosure.mime = atts.getValue(ATOM_ATTR_TYPE);
enclosure.title = atts.getValue(ATOM_ATTR_TITLE);
String length = atts.getValue(ATOM_ATTR_LENGTH);
enclosure.size = safeParseLong(length);
onEnclosure();
}
break;
case ALTERNATE:
String mimeString = atts.getValue(ATOM_ATTR_TYPE);
Mime type = Mime.UNKNOWN;
if (mimeString != null) {
type = getMime(mimeString);
mimeString = null;
}
if (safePeek(Tag.ATOM_ENTRY)) {
if (type == Mime.UNKNOWN || type == Mime.HTML
|| type == Mime.XHTML) {
// actually there can be multiple
// "alternate links"
// this uses the LAST alternate link as the
// URL for
// the FeedItem
feedItem.url = atts.getValue(ATOM_ATTR_HREF);
}
} else if (safePeek(Tag.ATOM_FEED)) {
if (type == Mime.UNKNOWN || type == Mime.HTML
|| type == Mime.XHTML) {
// same issue as above with multiple
// alternate links
feed.website = atts.getValue(ATOM_ATTR_HREF);
}
}
break;
case SELF:
// I changed the tag below from Tag.ATOM_FEED to
// Tag.RSS_CHANNEL.
// I think this fixes an upstream bug:
// http://volksempfaenger.0x4a42.net/dev/ticket/216
if (safePeek(Tag.RSS_CHANNEL)) {
feed.url = atts.getValue(ATOM_ATTR_HREF);
}
break;
case PAYMENT:
if (safePeek(Tag.ATOM_ENTRY) || safePeek(Tag.RSS_ITEM)) {
String url = atts.getValue(ATOM_ATTR_HREF);
try {
if (new URL(url).getHost().equals("flattr.com")) {
feedItem.flattrUrl = url;
}
} catch (MalformedURLException e) {
// ignore if url is malformed
}
}
break;
default:
break;
}
break;
default:
break;
}
}
private void onStartTagRss(Tag tag, Attributes atts) {
switch (tag) {
case RSS_ITEM:
feedItem = new FeedItem();
feedItem.feed = feed;
currentRssItemHasHtml = false;
currentItemHasITunesSummaryAlternative = false;
break;
case RSS_ENCLOSURE:
if (safePeek(Tag.RSS_ITEM)) {
enclosure = new Enclosure();
enclosure.feedItem = feedItem;
enclosure.url = atts.getValue(RSS_ATTR_URL);
enclosure.mime = atts.getValue(RSS_ATTR_TYPE);
String length = atts.getValue(RSS_ATTR_LENGTH);
enclosure.size = safeParseLong(length);
onEnclosure();
}
break;
default:
break;
}
}
private void onStartTagXHtml(String name, Attributes atts) {
buffer.append("<");
buffer.append(name);
for (int i = 0; i < atts.getLength(); i++) {
buffer.append(" ");
buffer.append(atts.getLocalName(i));
buffer.append("=\"");
// escape double quotes (hope this works)
buffer.append(atts.getValue(i).replaceAll("\"", "\\\""));
buffer.append("\"");
}
buffer.append(">");
}
private void onStartTagITunes(Tag tag, Attributes atts) {
if (tag == Tag.ITUNES_IMAGE
&& (safePeek(Tag.RSS_CHANNEL) || safePeek(Tag.ATOM_FEED))) {
feed.image = atts.getValue("href");
hasITunesImage = true;
}
}
private void onEndTagAtom(Tag tag) {
switch (tag) {
case ATOM_TITLE:
if (safePeek(Tag.ATOM_FEED)) {
// feed title
feed.title = Utils.trimmedString(buffer);
} else if (safePeek(Tag.ATOM_ENTRY)) {
// entry title
feedItem.title = Utils.trimmedString(buffer);
}
break;
case ATOM_CONTENT:
if (xhtmlMode) {
xhtmlMode = false;
}
if (safePeek(Tag.ATOM_ENTRY)) {
feedItem.description = Utils.trimmedString(buffer);
currentItemHasITunesSummaryAlternative = true;
}
break;
case ATOM_PUBLISHED:
if (safePeek(Tag.ATOM_ENTRY)) {
try {
feedItem.date = parseAtomDate(buffer.toString());
currentAtomItemHasPublished = true;
} catch (IndexOutOfBoundsException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
break;
case ATOM_UPDATED:
if (safePeek(Tag.ATOM_ENTRY) && !currentAtomItemHasPublished) {
try {
feedItem.date = parseAtomDate(buffer.toString());
} catch (IndexOutOfBoundsException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
break;
case ATOM_SUBTITLE:
feed.description = Utils.trimmedString(buffer);
break;
case ATOM_ENTRY:
onFeedItem();
break;
case ATOM_ID:
if (safePeek(Tag.ATOM_ENTRY)) {
feedItem.itemId = Utils.trimmedString(buffer);
}
break;
case ATOM_ICON:
if (safePeek(Tag.ATOM_FEED) && !hasITunesImage) {
feed.image = Utils.trimmedString(buffer);
}
break;
case ATOM_FEED:
onFeed();
break;
default:
break;
}
}
private void onEndTagRss(Tag tag) {
switch (tag) {
case RSS_TITLE:
if (safePeek(Tag.RSS_CHANNEL)) {
feed.title = Utils.trimmedString(buffer);
} else if (safePeek(Tag.RSS_ITEM)) {
feedItem.title = Utils.trimmedString(buffer);
}
break;
case RSS_PUB_DATE:
if (safePeek(Tag.RSS_ITEM)) {
try {
feedItem.date = parseRssDate(buffer.toString());
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
break;
case RSS_LINK:
if (safePeek(Tag.RSS_ITEM)) {
feedItem.url = Utils.trimmedString(buffer);
} else if (safePeek(Tag.RSS_CHANNEL)) {
feed.website = Utils.trimmedString(buffer);
}
break;
case RSS_DESCRIPTION:
if (!currentRssItemHasHtml) {
if (safePeek(Tag.RSS_ITEM)) {
feedItem.description = Utils.trimmedString(buffer);
currentItemHasITunesSummaryAlternative = true;
} else if (safePeek(Tag.RSS_CHANNEL)) {
feed.description = Utils.trimmedString(buffer);
}
}
break;
case RSS_CONTENT_ENCODED:
currentRssItemHasHtml = true;
if (safePeek(Tag.RSS_ITEM)) {
feedItem.description = Utils.trimmedString(buffer);
currentItemHasITunesSummaryAlternative = true;
} else if (safePeek(Tag.RSS_CHANNEL)) {
feed.description = Utils.trimmedString(buffer);
}
break;
case RSS_ITEM:
onFeedItem();
currentRssItemHasHtml = false;
break;
case RSS_GUID:
if (safePeek(Tag.RSS_ITEM)) {
feedItem.itemId = Utils.trimmedString(buffer);
}
break;
case RSS_URL:
if (safePeek(Tag.RSS_IMAGE) && !hasITunesImage) {
Tag copy;
try {
copy = parents.pop();
} catch (EmptyStackException e) {
return;
}
if (safePeek(Tag.RSS_CHANNEL)) {
feed.image = Utils.trimmedString(buffer);
}
parents.push(copy);
}
case RSS_CHANNEL:
onFeed();
break;
default:
break;
}
}
private void onEndTagXHtml(String name) {
buffer.append("</");
buffer.append(name);
buffer.append(">");
}
private void onEndTagITunes(Tag tag) {
if (tag == Tag.ITUNES_SUMMARY
&& (safePeek(Tag.ATOM_ENTRY) || safePeek(Tag.RSS_ITEM))
&& !currentItemHasITunesSummaryAlternative) {
feedItem.description = Utils.trimmedString(buffer);
}
}
private Date parseAtomDate(String datestring)
throws java.text.ParseException, IndexOutOfBoundsException {
datestring = datestring.trim().toUpperCase(Locale.getDefault());
// dirty version - write a new one TODO
// Modified version of http://cokere.com/RFC3339Date.txt
/*
* I was working on an Atom (http://www.w3.org/2005/Atom) parser and
* discovered that I could not parse dates in the format defined by
* RFC 3339 using the SimpleDateFormat class. The reason was the ':'
* in the time zone. This code strips out the colon if it's there
* and tries four different formats on the resulting string
* depending on if it has a time zone, or if it has a fractional
* second part. There is a probably a better way to do this, and a
* more proper way. But this is a really small addition to a
* codebase (You don't need a jar, just throw this function in some
* static Utility class if you have one).
*
* Feel free to use this in your code, but I'd appreciate it if you
* keep this note in the code if you distribute it. Thanks!
*
* For people who might be googling: The date format parsed by this
* goes by: atomDateConstruct, xsd:dateTime, RFC3339 and is
* compatable with: ISO.8601.1988, W3C.NOTE-datetime-19980827 and
* W3C.REC-xmlschema-2-20041028 (that I know of)
*
*
* Copyright 2007, Chad Okere (ceothrow1 at gmail dotcom) OMG NO
* WARRENTY EXPRESSED OR IMPLIED!!!1
*/
// if there is no time zone, we don't need to do any special
// parsing.
if (datestring.charAt(datestring.length() - 1) == 'Z') {
try {
// spec for RFC3339
return formats[4].parse(datestring);
} catch (java.text.ParseException pe) {
// try again with optional decimals
// spec for RFC3339 (with fractional seconds)
return formats[5].parse(datestring);
}
}
// step one, split off the timezone.
String firstpart = datestring.substring(0,
datestring.lastIndexOf('-'));
String secondpart = datestring.substring(datestring
.lastIndexOf('-'));
// step two, remove the colon from the timezone offset
secondpart = secondpart.substring(0, secondpart.indexOf(':'))
+ secondpart.substring(secondpart.indexOf(':') + 1);
datestring = firstpart + secondpart;
try {
return formats[6].parse(datestring);// spec for RFC3339
} catch (java.text.ParseException pe) {
// try again with optional decimals
// spec for RFC3339 (with fractional seconds)
return formats[7].parse(datestring);
}
}
private static final SimpleDateFormat formats[] = new SimpleDateFormat[] {
new SimpleDateFormat("d MMM yy HH:mm z", Locale.US),
new SimpleDateFormat("d MMM yy HH:mm:ss z", Locale.US),
new SimpleDateFormat("d MMM yyyy HH:mm z", Locale.US),
new SimpleDateFormat("d MMM yyyy HH:mm:ss z", Locale.US),
new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.US),
new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSSSS'Z'",
Locale.US),
new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US),
new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSSSSZ", Locale.US) };
static {
formats[5].setLenient(true);
formats[7].setLenient(true);
}
private Date parseRssDate(String datestring) throws ParseException {
// dirty version - write a new one TODO
SimpleDateFormat format;
int commaPos = datestring.indexOf(',');
if (commaPos > -1) {
// remove weekday if present
datestring = datestring.substring(commaPos + 1);
}
datestring = datestring.trim();
if (datestring.length() > 8 && datestring.charAt(8) == ' ') {
if (datestring.length() > 16 && datestring.charAt(14) == ' ') {
format = formats[0];
} else {
format = formats[1];
}
} else {
if (datestring.length() > 16 && datestring.charAt(16) == ' ') {
format = formats[2];
} else {
format = formats[3];
}
}
return format.parse(datestring);
}
private static Namespace getNamespace(String nsString) {
return StringLookup.lookupNamespace(nsString);
}
private static Tag getTag(Namespace ns, String tagString) {
switch (ns) {
case ATOM:
return StringLookup.lookupAtomTag(tagString);
case RSS:
case NONE:
return StringLookup.lookupRssTag(tagString);
case RSS_CONTENT:
if (tagString.equals("encoded")) {
return Tag.RSS_CONTENT_ENCODED;
} else {
return Tag.UNKNOWN;
}
case ITUNES:
return StringLookup.lookupITunesTag(tagString);
default:
return Tag.UNKNOWN;
}
}
private static AtomRel getAtomRel(String relString) {
return StringLookup.lookupAtomRel(relString);
}
private static Mime getMime(String mimeString) {
return StringLookup.lookupMime(mimeString);
}
private void onEnclosure() {
listener.onEnclosure(enclosure);
}
private void onFeedItem() {
if (feedItem.itemId == null) {
if (feedItem.url != null) {
feedItem.itemId = feedItem.url;
} else if (feedItem.title != null) {
feedItem.itemId = feedItem.title;
} else {
return;
}
}
if (feedItem.date == null) {
return;
}
listener.onFeedItem(feedItem);
}
private void onFeed() {
listener.onFeed(feed);
}
private long safeParseLong(String number) {
if (number != null) {
try {
return Long.parseLong(number.trim());
} catch (NumberFormatException e) {
return 0;
}
}
return 0;
}
private boolean safePeek(Tag tag) {
try {
Tag parent = parents.peek();
if (tag == parent) {
return true;
} else {
return false;
}
} catch (EmptyStackException e) {
return false;
}
}
}
}