/* * Copyright 2009 NCHOVY * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.krakenapps.rss.impl; import org.krakenapps.rss.RssCategory; import org.krakenapps.rss.RssChannel; import org.krakenapps.rss.RssEntry; import org.krakenapps.rss.RssFeed; import org.w3c.dom.*; import javax.xml.parsers.*; import javax.xml.xpath.*; import java.util.regex.Pattern; public class RssParser { private enum FeedType { RSS1, RSS2, ATOM } private String datePattern1 = "\\d{4}-\\d{2}-\\d{2}"; private String datePattern2 = "\\d{4}-\\d{2}-\\d{2} "; private RssFeed feed = new RssFeed(); public RssFeed parse(String url) { try { DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory .newInstance(); DocumentBuilder builder = documentBuilderFactory .newDocumentBuilder(); Document rssDoc = builder.parse(url); XPathFactory xpathFactory = XPathFactory.newInstance(); RssChannel channel = new RssChannel(); FeedType feedType = getFeedType(rssDoc); channel.setTitle(getChannelTitle(rssDoc, xpathFactory, feedType)); feed.setChannel(channel); parseEntries(feed, xpathFactory, rssDoc, feedType, channel .getTitle()); return feed; } catch (Exception e) { //logger.warn(e.getMessage()); return null; } } private String getChannelTitle(Document rssXml, XPathFactory xpathFactory, FeedType feedType) { try { XPath xpath = xpathFactory.newXPath(); XPathExpression xPathExpression = xpath .compile(getChannelXPath(feedType)); NodeList titleNodeList = (NodeList) xPathExpression.evaluate( rssXml, XPathConstants.NODESET); if (titleNodeList.getLength() == 0) return null; return titleNodeList.item(0).getTextContent(); } catch (XPathExpressionException e) { System.err.println(e.toString()); return null; } } private String getChannelXPath(FeedType feedType) { switch (feedType) { case ATOM: return "//feed/title"; default: return "//channel/title"; } } private RssEntry getEntry(Node entryNode, FeedType feedType, String channelTitle) { RssEntry entry = null; switch (feedType) { case ATOM: entry = parseAtomEntry(entryNode); break; case RSS2: entry = parseRss2Entry(entryNode); break; default: entry = parseRss1Entry(entryNode); break; } if (entry != null) entry.setSource(channelTitle); return entry; } private RssEntry parseAtomEntry(Node entryNode) { NodeList childNodeList = entryNode.getChildNodes(); RssEntry entry = new RssEntry(); for (int i = 0; i < childNodeList.getLength(); ++i) { Node childNode = childNodeList.item(i); if (childNode.getNodeName() == "author") { entry.setAuthor(childNode.getFirstChild().getTextContent()); } else if (childNode.getNodeName() == "title") { entry.setTitle(childNode.getTextContent()); } else if (childNode.getNodeName() == "link") { NamedNodeMap attr = childNode.getAttributes(); Node linkTypeNode = attr.getNamedItem("rel"); if (linkTypeNode.getTextContent().equals("alternate")) { Node newChildNode = attr.getNamedItem("href"); entry.setLink(newChildNode.getTextContent()); } } else if (childNode.getNodeName() == "id") { entry.setGuid(childNode.getTextContent()); } else if (childNode.getNodeName() == "published") { if (Pattern.matches(datePattern1, childNode.getTextContent()) || Pattern.matches(datePattern2, childNode .getTextContent())) feed.setIsHaveNotDate(true); entry.setCreatedAt(RssDateParser.parse(childNode .getTextContent())); entry.setIsHaveDateField(true); } else if (childNode.getNodeName() == "updated") { if (Pattern.matches(datePattern1, childNode.getTextContent()) || Pattern.matches(datePattern2, childNode .getTextContent())) feed.setIsHaveNotDate(true); entry.setModifiedAt(RssDateParser.parse(childNode .getTextContent())); } else if (childNode.getNodeName() == "content") { String textContent = childNode.getTextContent(); String description = textContent.replaceAll( "<!\\[CDATA\\[|\\]\\]>", ""); entry.setContent(description); } } return entry; } private RssEntry parseRss1Entry(Node entryNode) { NodeList childNodeList = entryNode.getChildNodes(); RssEntry entry = new RssEntry(); for (int i = 0; i < childNodeList.getLength(); ++i) { Node childNode = childNodeList.item(i); if (childNode.getNodeName() == "dc:creator") { entry.setAuthor(childNode.getTextContent()); } else if (childNode.getNodeName() == "title") { entry.setTitle(childNode.getTextContent()); } else if (childNode.getNodeName() == "link") { entry.setLink(childNode.getTextContent()); } else if (childNode.getNodeName() == "dc:date") { if (Pattern.matches(datePattern1, childNode.getTextContent()) || Pattern.matches(datePattern2, childNode .getTextContent())) feed.setIsHaveNotDate(true); entry.setCreatedAt(RssDateParser.parse(childNode .getTextContent())); entry.setIsHaveDateField(true); } else if (childNode.getNodeName() == "description") { String textContent = childNode.getTextContent(); String description = textContent.replaceAll( "<!\\[CDATA\\[|\\]\\]>", ""); entry.setContent(description); } } return entry; } private RssEntry parseRss2Entry(Node entryNode) { NodeList childNodeList = entryNode.getChildNodes(); RssEntry entry = new RssEntry(); for (int i = 0; i < childNodeList.getLength(); ++i) { Node childNode = childNodeList.item(i); if (childNode.getNodeName() == "author") { entry.setAuthor(childNode.getTextContent()); } else if (childNode.getNodeName() == "title") { String textContent = childNode.getTextContent(); String textContent2 = textContent.replaceAll( "<!\\[CDATA\\[|\\]\\]>", ""); String title = textContent2.replaceAll( "(<b>|</b>|<font [A-Za-z0-9=]*>|</font>)", ""); entry.setTitle(title); } else if (childNode.getNodeName() == "link") { entry.setLink(childNode.getTextContent()); } else if (childNode.getNodeName() == "guid") { entry.setGuid(childNode.getTextContent()); } else if (childNode.getNodeName() == "pubDate") { if (Pattern.matches(datePattern1, childNode.getTextContent()) || Pattern.matches(datePattern2, childNode .getTextContent())) feed.setIsHaveNotDate(true); entry.setCreatedAt(RssDateParser.parse(childNode .getTextContent())); entry.setIsHaveDateField(true); } else if (childNode.getNodeName() == "dc:date") { if (Pattern.matches(datePattern1, childNode.getTextContent()) || Pattern.matches(datePattern2, childNode .getTextContent())) feed.setIsHaveNotDate(true); entry.setCreatedAt(RssDateParser.parse(childNode .getTextContent())); entry.setIsHaveDateField(true); } else if (childNode.getNodeName() == "description") { String textContent = childNode.getTextContent(); String description = textContent.replaceAll( "<!\\[CDATA\\[|\\]\\]>", ""); entry.setContent(description); } else if (childNode.getNodeName() == "category") { RssCategory category = new RssCategory(); category.setName(childNode.getTextContent()); entry.getCategories().add(category); } } return entry; } private String getEntryXPath(FeedType feedType) { switch (feedType) { case ATOM: return "//entry"; default: return "//item"; } } private void parseEntries(RssFeed rssFeed, XPathFactory xpathFactory, Document rssXml, FeedType feedType, String channelTitle) { try { XPath xpath = xpathFactory.newXPath(); XPathExpression xPathExpression = xpath .compile(getEntryXPath(feedType)); NodeList entryNodeList = (NodeList) xPathExpression.evaluate( rssXml, XPathConstants.NODESET); for (int i = 0; i < entryNodeList.getLength(); ++i) { rssFeed.addEntry(getEntry(entryNodeList.item(i), feedType, channelTitle)); } } catch (XPathExpressionException e) { e.printStackTrace(); } } private FeedType getFeedType(Document feed) { if (feed.getDocumentElement().getTagName() == "feed") return FeedType.ATOM; else if (feed.getDocumentElement().getTagName() == "rss") return FeedType.RSS2; else return FeedType.RSS1; } }