package net.i2p.router.news; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Set; import net.i2p.I2PAppContext; import net.i2p.util.Log; import org.cybergarage.util.Debug; import org.cybergarage.xml.Attribute; import org.cybergarage.xml.Node; import org.cybergarage.xml.ParserException; /** * Parse out the news.xml file which is in Atom format (RFC4287). * * We use the XML parser from the UPnP library. * * @since 0.9.17 */ public class NewsXMLParser { private final I2PAppContext _context; private final Log _log; private List<NewsEntry> _entries; private List<CRLEntry> _crlEntries; private BlocklistEntries _blocklistEntries; private NewsMetadata _metadata; private XHTMLMode _mode; private static final Set<String> xhtmlWhitelist = new HashSet<String>(Arrays.asList(new String[] { "a", "b", "br", "div", "i", "p", "span", "font", "blockquote", "hr", "del", "ins", "em", "strong", "mark", "sub", "sup", "tt", "code", "strike", "s", "u", "h4", "h5", "h6", "ol", "ul", "li", "dl", "dt", "dd", "table", "tr", "td", "th", // put in by parser XMLParser.TEXT_NAME })); // http://www.w3.org/TR/html-markup/global-attributes.html#common.attrs.event-handler private static final Set<String> attributeBlacklist = new HashSet<String>(Arrays.asList(new String[] { "onabort", "onblur", "oncanplay", "oncanplaythrough", "onchange", "onclick", "oncontextmenu", "ondblclick", "ondrag", "ondragend", "ondragenter", "ondragleave", "ondragover", "ondragstart", "ondrop", "ondurationchange", "onemptied", "onended", "onerror", "onfocus", "oninput", "onivalid", "onkeydown", "onkeypress", "onkeyup", "onload", "onloadeddata", "onloadedmetadata", "onloadstart", "onmousedown", "onmousemove", "onmouseout", "onmouseover", "onmouseup", "onmousewheel", "onpause", "onplay", "onplaying", "onprogress", "onratechange", "onreadystatechange", "onreset", "onscroll", "onseeked", "onseeking", "onselect", "onshow", "onstalled", "onsubmit", "onsuspend", "ontimeupdate", "onvolumechange", "onwaiting" })); /** * The action taken when encountering a non-whitelisted * XHTML element or blacklisted attribute in the feed content. */ public enum XHTMLMode { /** abort the parsing on any non-whitelisted element or blacklisted attribute */ ABORT, /** remove only the non-whitelisted element, or element containing a blacklisted attribute */ REMOVE_ELEMENT, /** remove only the non-whitelisted element, remove only the blacklisted attribute */ REMOVE_ATTRIBUTE, /** skip the feed entry containing the non-whitelisted element or blacklisted attribute */ SKIP_ENTRY, /** disable all whitelist and blacklist checks */ ALLOW_ALL } public NewsXMLParser(I2PAppContext ctx) { _context = ctx; _log = ctx.logManager().getLog(NewsXMLParser.class); _mode = XHTMLMode.REMOVE_ELEMENT; } /** * Sets the action taken when encountering a non-whitelisted * XHTML element in the feed content. * Must be set before parse(). * Default REMOVE_ELEMENT. */ public void setXHTMLMode(XHTMLMode mode) { _mode = mode; } /** * Process the XML file. * * @param file XML content only. Any su3 or gunzip handling must have * already happened. * @return the root node * @throws IOException on any parse error */ public Node parse(File file) throws IOException { return parse(new BufferedInputStream(new FileInputStream(file))); } /** * Process the XML input stream. * * @param in XML content only. Any su3 or gunzip handling must have * already happened. * @return the root node * @throws IOException on any parse error */ public Node parse(InputStream in) throws IOException { _entries = null; _metadata = null; XMLParser parser = new XMLParser(_context); try { Node root = parser.parse(in); extract(root); return root; } catch (ParserException pe) { throw new I2PParserException(pe); } } /** * The news entries. * Must call parse() first. * * @return sorted, newest first, null if parse failed */ public List<NewsEntry> getEntries() { return _entries; } /** * The news metatdata. * Must call parse() first. * * @return null if parse failed */ public NewsMetadata getMetadata() { return _metadata; } /** * The news CRL entries. * Must call parse() first. * * @return unsorted, null if none * @since 0.9.26 */ public List<CRLEntry> getCRLEntries() { return _crlEntries; } /** * The blocklist entries. * Must call parse() first. * * @return null if none * @since 0.9.28 */ public BlocklistEntries getBlocklistEntries() { return _blocklistEntries; } private void extract(Node root) throws I2PParserException { if (!root.getName().equals("feed")) throw new I2PParserException("no feed in XML"); _metadata = extractNewsMetadata(root); _entries = extractNewsEntries(root); _crlEntries = extractCRLEntries(root); _blocklistEntries = extractBlocklistEntries(root); } private static NewsMetadata extractNewsMetadata(Node feed) throws I2PParserException { NewsMetadata rv = new NewsMetadata(); Node n = feed.getNode("title"); if (n != null) { rv.feedTitle = n.getValue(); if (rv.feedTitle != null) rv.feedTitle = rv.feedTitle.trim(); } n = feed.getNode("subtitle"); if (n != null) { rv.feedSubtitle = n.getValue(); if (rv.feedSubtitle != null) rv.feedSubtitle = rv.feedTitle.trim(); } n = feed.getNode("id"); if (n != null) { rv.feedID = n.getValue(); if (rv.feedTitle != null) rv.feedTitle = rv.feedTitle.trim(); } n = feed.getNode("updated"); if (n != null) { String v = n.getValue(); if (v != null) { long time = RFC3339Date.parse3339Date(v.trim()); if (time > 0) rv.feedUpdated = time; } } List<NewsMetadata.Release> releases = new ArrayList<NewsMetadata.Release>(); List<Node> releaseNodes = getNodes(feed, "i2p:release"); if (releaseNodes.size() == 0) throw new I2PParserException("no release data in XML"); for (Node r : releaseNodes) { NewsMetadata.Release release = new NewsMetadata.Release(); // release attributes String a = r.getAttributeValue("date"); if (a.length() > 0) { long time = RFC3339Date.parse3339Date(a.trim()); if (time > 0) release.date = time; } a = r.getAttributeValue("minVersion"); if (a.length() > 0) release.minVersion = a.trim(); a = r.getAttributeValue("minJavaVersion"); if (a.length() > 0) release.minJavaVersion = a.trim(); // release nodes n = r.getNode("i2p:version"); if (n != null) { release.i2pVersion = n.getValue(); if (release.i2pVersion != null) release.i2pVersion = release.i2pVersion.trim(); } List<NewsMetadata.Update> updates = new ArrayList<NewsMetadata.Update>(); List<Node> updateNodes = getNodes(r, "i2p:update"); if (updateNodes.size() == 0) throw new I2PParserException("no updates in release"); Set<String> types = new HashSet<String>(); for (Node u : updateNodes) { // returns "" for none String type = u.getAttributeValue("type"); if (type.isEmpty()) throw new I2PParserException("update with no type"); if (types.contains(type)) throw new I2PParserException("update with duplicate type"); NewsMetadata.Update update = new NewsMetadata.Update(); update.type = type.trim(); types.add(type); int totalSources = 0; Node t = u.getNode("i2p:torrent"); if (t != null) { // returns "" for none String href = t.getAttributeValue("href"); if (href.length() > 0) { update.torrent = href.trim(); totalSources += 1; } } if (totalSources == 0) throw new I2PParserException("no sources for update type " + type); updates.add(update); } Collections.sort(updates); release.updates = updates; releases.add(release); } Collections.sort(releases); rv.releases = releases; return rv; } /** * This does not check for any missing values. * Any field in any NewsEntry may be null. */ private List<NewsEntry> extractNewsEntries(Node feed) throws I2PParserException { List<NewsEntry> rv = new ArrayList<NewsEntry>(); List<Node> entries = getNodes(feed, "entry"); for (Node entry : entries) { NewsEntry e = new NewsEntry(); Node n = entry.getNode("title"); if (n != null) { e.title = n.getValue(); if (e.title != null) e.title = e.title.trim(); } n = entry.getNode("link"); if (n != null) { String a = n.getAttributeValue("href"); if (a.length() > 0) e.link = a.trim(); } n = entry.getNode("id"); if (n != null) { e.id = n.getValue(); if (e.id != null) e.id = e.id.trim(); } n = entry.getNode("updated"); if (n != null) { String v = n.getValue(); if (v != null) { long time = RFC3339Date.parse3339Date(v.trim()); if (time > 0) e.updated = time; } } n = entry.getNode("summary"); if (n != null) { e.summary = n.getValue(); if (e.summary != null) e.summary = e.summary.trim(); } n = entry.getNode("author"); if (n != null) { n = n.getNode("name"); if (n != null) { e.authorName = n.getValue(); if (e.authorName != null) e.authorName = e.authorName.trim(); } } n = entry.getNode("content"); if (n != null) { String a = n.getAttributeValue("type"); if (a.length() > 0) e.contentType = a; // now recursively sanitize // and convert everything in the content to string StringBuilder buf = new StringBuilder(256); for (int i = 0; i < n.getNNodes(); i++) { Node sn = n.getNode(i); try { boolean removed = validate(sn); if (removed) { i--; continue; } } catch (I2PParserException ipe) { switch (_mode) { case ABORT: throw ipe; case SKIP_ENTRY: if (_log.shouldLog(Log.WARN)) _log.warn("Skipping entry", ipe); e = null; break; case REMOVE_ATTRIBUTE: case REMOVE_ELEMENT: if (_log.shouldLog(Log.WARN)) _log.warn("Removing element", ipe); continue; case ALLOW_ALL: default: break; } } if (e == null) break; XMLParser.toString(buf, sn); } if (e == null) continue; e.content = buf.toString(); } rv.add(e); } Collections.sort(rv); return rv; } /** * This does not check for any missing values. * Any field in any CRLEntry may be null. * * @return null if none * @since 0.9.26 */ private static List<CRLEntry> extractCRLEntries(Node feed) throws I2PParserException { Node rev = feed.getNode("i2p:revocations"); if (rev == null) return null; List<Node> entries = getNodes(rev, "i2p:crl"); if (entries.isEmpty()) return null; List<CRLEntry> rv = new ArrayList<CRLEntry>(entries.size()); for (Node entry : entries) { CRLEntry e = new CRLEntry(); String a = entry.getAttributeValue("id"); if (a.length() > 0) e.id = a; a = entry.getAttributeValue("updated"); if (a.length() > 0) { long time = RFC3339Date.parse3339Date(a.trim()); if (time > 0) e.updated = time; } a = entry.getValue(); if (a != null) e.data = a.trim(); rv.add(e); } return rv; } /** * This does not check for any missing values. * Any field in a BlocklistEntry may be null. * Signature is verified here. * * @return null if none * @since 0.9.28 */ private BlocklistEntries extractBlocklistEntries(Node feed) throws I2PParserException { Node bl = feed.getNode("i2p:blocklist"); if (bl == null) return null; List<Node> entries = getNodes(bl, "i2p:block"); BlocklistEntries rv = new BlocklistEntries(entries.size()); String a = bl.getAttributeValue("signer"); if (a.length() > 0) rv.signer = a; a = bl.getAttributeValue("sig"); if (a.length() > 0) { rv.sig = a; } Node n = bl.getNode("updated"); if (n == null) return null; a = n.getValue(); if (a != null) { rv.supdated = a; long time = RFC3339Date.parse3339Date(a.trim()); if (time > 0) rv.updated = time; } for (Node entry : entries) { a = entry.getValue(); if (a != null) { rv.entries.add(a.trim()); } } List<Node> rentries = getNodes(bl, "i2p:unblock"); if (entries.isEmpty() && rentries.isEmpty()) return null; for (Node entry : rentries) { a = entry.getValue(); if (a != null) { rv.removes.add(a.trim()); } } rv.verify(_context); return rv; } /** * Helper to get all Nodes matching the name * * @return non-null */ public static List<Node> getNodes(Node node, String name) { List<Node> rv = new ArrayList<Node>(); int count = node.getNNodes(); for (int i = 0; i < count; i++) { Node n = node.getNode(i); if (n.getName().equals(name)) rv.add(n); } return rv; } /** * @throws I2PParserException if any node not in whitelist (depends on mode) * @return true if node was removed from parent (only for REMOVE_ELEMENT mode) */ private boolean validate(Node node) throws I2PParserException { String name = node.getName(); //if (_log.shouldLog(Log.DEBUG)) // _log.debug("Validating element: " + name); if (!xhtmlWhitelist.contains(name.toLowerCase(Locale.US))) { switch (_mode) { case ABORT: case SKIP_ENTRY: throw new I2PParserException("Invalid XHTML element \"" + name + '"'); case REMOVE_ATTRIBUTE: case REMOVE_ELEMENT: if (_log.shouldLog(Log.WARN)) _log.warn("Removing element: " + node); node.getParentNode().removeNode(node); return true; case ALLOW_ALL: if (_log.shouldLog(Log.WARN)) _log.warn("Allowing non-whitelisted element by configuration: " + node); break; } } for (int i = 0; i < node.getNAttributes(); i++) { Attribute attr = node.getAttribute(i); String aname = attr.getName(); if (attributeBlacklist.contains(aname.toLowerCase(Locale.US))) { switch (_mode) { case ABORT: case SKIP_ENTRY: throw new I2PParserException("Invalid XHTML element \"" + name + "\" due to attribute " + aname); case REMOVE_ELEMENT: if (_log.shouldLog(Log.WARN)) _log.warn("Removing element: " + node + " due to attribute " + aname); node.getParentNode().removeNode(node); return true; case REMOVE_ATTRIBUTE: if (_log.shouldLog(Log.WARN)) _log.warn("Removing attribute: " + aname + " from " + node); // sadly, no removeAttribute(int) if (node.removeAttribute(attr)) i--; break; case ALLOW_ALL: if (_log.shouldLog(Log.WARN)) _log.warn("Allowing blacklisted attribute by configuration: " + node); break; } } } int count = node.getNNodes(); for (int i = 0; i < node.getNNodes(); i++) { boolean removed = validate(node.getNode(i)); if (removed) i--; } return false; } /** * Extend IOE since cybergarage ParserException extends Exception */ private static class I2PParserException extends IOException { public I2PParserException(String s) { super(s); } public I2PParserException(Throwable t) { super("XML Parse Error", t); } } public static void main(String[] args) { if (args.length <= 0 || args.length > 2) { System.err.println("Usage: NewsXMLParser file.xml [parserMode]"); System.exit(1); } try { I2PAppContext ctx = new I2PAppContext(); Debug.initialize(ctx); NewsXMLParser parser = new NewsXMLParser(ctx); if (args.length > 1) { XHTMLMode mode = XHTMLMode.valueOf(args[1]); parser.setXHTMLMode(mode); } else { parser.setXHTMLMode(XHTMLMode.ABORT); } parser.parse(new File(args[0])); NewsMetadata ud = parser.getMetadata(); List<NewsEntry> entries = parser.getEntries(); NewsMetadata.Release latestRelease = ud.releases.get(0); System.out.println("Latest version is " + latestRelease.i2pVersion); System.out.println("Release timestamp: " + latestRelease.date); System.out.println("Feed timestamp: " + ud.feedUpdated); System.out.println("Found " + entries.size() + " news entries"); Set<String> uuids = new HashSet<String>(entries.size()); for (int i = 0; i < entries.size(); i++) { NewsEntry e = entries.get(i); System.out.println("\n****** News #" + (i+1) + ": " + e.title + '\n' + e.content); if (e.id == null) throw new IOException("missing ID"); if (e.title == null) throw new IOException("missing title"); if (e.content == null) throw new IOException("missing content"); if (e.authorName == null) throw new IOException("missing author"); if (e.updated == 0) throw new IOException("missing updated"); if (!uuids.add(e.id)) throw new IOException("duplicate ID"); } } catch (IOException ioe) { ioe.printStackTrace(); System.exit(1); } } }