package info.persistent.pushbot.util;
import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;
import com.google.common.io.CharStreams;
import com.sun.syndication.feed.atom.Entry;
import com.sun.syndication.feed.rss.Guid;
import com.sun.syndication.feed.rss.Item;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.feed.synd.SyndLink;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
import org.jdom.Element;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Static helpers for parsing feeds and for extracting link URLs and entry
 * identifiers from parsed feeds.
 */
public class Feeds {
  public static final Logger logger = Logger.getLogger(Feeds.class.getName());

  /** Link relation value used to look up hub links. */
  public static final String HUB_RELATION = "hub";
  /** Link relation value used to look up self links. */
  public static final String SELF_RELATION = "self";

  /** Namespace URI that identifies Atom elements in foreign markup. */
  public static final String ATOM_NAMESPACE = "http://www.w3.org/2005/Atom";
  public static final String ATOM_LINK = "link";
  public static final String ATOM_REL_ATTRIBUTE = "rel";
  public static final String ATOM_HREF_ATTRIBUTE = "href";

  /**
   * Parses the given stream into a feed.
   *
   * <p>The stream is first read fully as UTF-8 and stripped of control
   * characters (everything below 0x20 except tab, line feed and carriage
   * return). If that read fails, the failure is logged and the parser is
   * handed the original stream as-is.
   *
   * @param inputStream the raw feed data
   * @return the parsed feed, or {@code null} if reading or parsing failed (the
   *     failure and the feed contents, when available, are logged at WARNING)
   */
  public static SyndFeed parseFeed(InputStream inputStream) {
    SyndFeedInput input = new SyndFeedInput();
    // Methods like {@link #getEntryId} rely on having access to the wire data
    input.setPreserveWireFeed(true);

    // Try to filter out control characters. We guess that the encoding is
    // UTF-8, per https://groups.google.com/group/pubsubhubbub/browse_thread/thread/cea55f2a9caa64fc
    // figuring out the real encoding requires too much bookkeeping.
    String inputString = null;
    try {
      inputString = CharStreams.toString(
          new InputStreamReader(inputStream, Charsets.UTF_8));
      inputStream = new ByteArrayInputStream(
          stripControlCharacters(inputString).getBytes(Charsets.UTF_8));
    } catch (IOException err) {
      logger.log(Level.WARNING, "Could not parse input as UTF-8, not " +
          "removing possible control characters", err);
    }

    try {
      return input.build(new XmlReader(inputStream));
    } catch (IOException err) {
      logger.log(Level.WARNING, "Feed read error", err);
      logger.log(Level.WARNING, "Feed contents: " + inputString);
      return null;
    } catch (IllegalArgumentException err) {
      logger.log(Level.WARNING, "Feed parse error 1", err);
      logger.log(Level.WARNING, "Feed contents: " + inputString);
      return null;
    } catch (FeedException err) {
      logger.log(Level.WARNING, "Feed parse error 2", err);
      logger.log(Level.WARNING, "Feed contents: " + inputString);
      return null;
    }
  }

  /**
   * Returns a copy of {@code text} without control characters: every char
   * below 0x20 is dropped, except tab (0x9), line feed (0xA) and carriage
   * return (0xD).
   */
  private static String stripControlCharacters(String text) {
    StringBuilder filtered = new StringBuilder(text.length());
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      if (c >= 0x20 || c == 0x9 || c == 0xA || c == 0xD) {
        filtered.append(c);
      }
    }
    return filtered.toString();
  }

  /**
   * Collects the URLs of all feed-level links that have the given relation.
   *
   * <p>Both the feed's own link list and Atom {@code <link>} elements found
   * in the feed's foreign markup are searched. Links whose URL cannot be
   * parsed are logged at INFO and skipped.
   *
   * @param feed the parsed feed to search
   * @param relation the {@code rel} value to match, e.g. {@link #HUB_RELATION}
   * @return the matching URLs, in the order they were found; possibly empty
   */
  @SuppressWarnings("unchecked")
  public static List<URL> getLinkUrl(SyndFeed feed, String relation) {
    List<URL> results = Lists.newArrayList();

    // Atom feeds can have links accessed directly.
    for (SyndLink link : ((List<SyndLink>) feed.getLinks())) {
      // Compare from the relation side: a link without a rel attribute has a
      // null rel, and must be skipped rather than trigger an NPE.
      if (relation.equals(link.getRel())) {
        addUrl(results, link.getHref(), relation);
      }
    }

    // If we have an Atom 1.0 <link> in an RSS feed, it's in the foreign markup
    // list.
    List<Element> elements = (List<Element>) feed.getForeignMarkup();
    for (Element element : elements) {
      if (element.getNamespaceURI().equals(ATOM_NAMESPACE) &&
          element.getName().equals(ATOM_LINK) &&
          relation.equals(element.getAttributeValue(ATOM_REL_ATTRIBUTE))) {
        String href = element.getAttributeValue(ATOM_HREF_ATTRIBUTE);
        if (hasText(href)) {
          addUrl(results, href, relation);
        }
      }
    }

    return results;
  }

  /**
   * Parses {@code href} as a URL and appends it to {@code results}; a
   * malformed value is logged at INFO and otherwise ignored.
   */
  private static void addUrl(List<URL> results, String href, String relation) {
    try {
      results.add(new URL(href));
    } catch (MalformedURLException err) {
      logger.log(Level.INFO, "Malformed " + relation + " URL", err);
    }
  }

  /**
   * Gets a (hopefully) unique and short identifier for this entry.
   *
   * <p>The identifier is a hash of the first non-empty value among: the Atom
   * ID, the RSS GUID, the entry link, the entry title, and finally the
   * entry's string representation.
   *
   * @param entry the entry to identify
   * @return the hashed identifier
   */
  public static String getEntryId(SyndEntry entry) {
    Object wireEntry = entry.getWireEntry();

    // Look for an Atom ID
    if (wireEntry instanceof Entry) {
      String atomId = ((Entry) wireEntry).getId();
      if (hasText(atomId)) {
        return hash(atomId);
      }
    }

    // Or an RSS GUID
    if (wireEntry instanceof Item) {
      Guid guid = ((Item) wireEntry).getGuid();
      if (guid != null && hasText(guid.getValue())) {
        return hash(guid.getValue());
      }
    }

    // Fall back on the item link
    if (hasText(entry.getLink())) {
      return hash(entry.getLink());
    }

    // Then the title
    if (hasText(entry.getTitle())) {
      return hash(entry.getTitle());
    }

    // Or the entry itself
    return hash(entry.toString());
  }

  /** Returns whether {@code s} is neither {@code null} nor empty. */
  private static boolean hasText(String s) {
    return s != null && !s.isEmpty();
  }

  /**
   * Gets a hash code for the input string.
   */
  private static String hash(String s) {
    // NOTE(review): the single-argument hashString overload is reportedly
    // deprecated/removed in newer Guava releases; switching to a different
    // overload may change the generated IDs - confirm before upgrading.
    return Hashing.murmur3_128().hashString(s).toString();
  }

  private Feeds() {
    // Not instantiable.
  }
}