/* * Copyright 2009 NCHOVY * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.araqne.rss.impl; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.apache.felix.ipojo.annotations.Component; import org.apache.felix.ipojo.annotations.Provides; import org.araqne.rss.FeedType; import org.araqne.rss.RssCategory; import org.araqne.rss.RssChannel; import org.araqne.rss.RssEntry; import org.araqne.rss.RssFeed; import org.araqne.rss.RssReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; @Component(name = "araqne-rss-reader") @Provides public class RssReaderImpl implements RssReader { private final Logger slog = LoggerFactory.getLogger(RssReaderImpl.class); @Override public RssFeed read(String url, boolean stripTag) { return readRssFeed(url.toString(), stripTag); } private RssFeed readRssFeed(String url, boolean stripTag) { try { RssFeed feed = new RssFeed(); DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = documentBuilderFactory.newDocumentBuilder(); XPathFactory xpathFactory = XPathFactory.newInstance(); Document rssXml = builder.parse(url); feed.setType(getFeedType(rssXml)); feed.setChannel(getChannel(rssXml, xpathFactory, feed.getType())); setEntries(feed, xpathFactory, rssXml, stripTag); return feed; } catch (Throwable t) { slog.error("araqne-rss: cannot read rss feed [" + url.toString() + "]", t); throw new IllegalStateException("cannot read rss feed"); } } private void setEntries(RssFeed feed, XPathFactory xpathFactory, Document rssXml, boolean stripTag) throws XPathExpressionException { XPath xpath = xpathFactory.newXPath(); XPathExpression xPathExpression = xpath.compile(getEntryXPath(feed.getType())); NodeList entryNodeList = (NodeList) xPathExpression.evaluate(rssXml, XPathConstants.NODESET); for (int i = 0; i < entryNodeList.getLength(); ++i) { feed.addEntry(getEntry(feed, entryNodeList.item(i), stripTag)); } } private RssEntry getEntry(RssFeed feed, Node entryNode, boolean stripTag) { RssEntry entry = null; switch (feed.getType()) { case ATOM: entry = parseAtomEntry(feed, entryNode, stripTag); break; case RSS2: entry = parseRss2Entry(feed, entryNode, stripTag); break; default: entry = parseRss1Entry(feed, entryNode, stripTag); break; } if (entry != null) entry.setSource(feed.getChannel().getTitle()); return entry; } private RssEntry parseAtomEntry(RssFeed feed, Node entryNode, boolean stripTag) { NodeList childNodeList = entryNode.getChildNodes(); RssEntry entry = new RssEntry(); for (int i = 0; i < childNodeList.getLength(); ++i) { Node childNode = childNodeList.item(i); if (childNode.getNodeName() == "author") { entry.setAuthor(childNode.getFirstChild().getTextContent()); } else if (childNode.getNodeName() == "title") { entry.setTitle(childNode.getTextContent()); } else if (childNode.getNodeName() == "link") { NamedNodeMap attr = childNode.getAttributes(); Node linkTypeNode = attr.getNamedItem("rel"); if (linkTypeNode.getTextContent().equals("alternate")) { Node newChildNode = attr.getNamedItem("href"); entry.setLink(newChildNode.getTextContent()); } } else if (childNode.getNodeName() == "id") { entry.setGuid(childNode.getTextContent()); } else if (childNode.getNodeName() == "published") { feed.setIsHaveNotDate(isMatchedDatePattern(childNode.getTextContent())); entry.setCreatedAt(RssDateParser.parse(childNode.getTextContent())); entry.setIsHaveDateField(true); } else if (childNode.getNodeName() == "updated") { feed.setIsHaveNotDate(isMatchedDatePattern(childNode.getTextContent())); entry.setModifiedAt(RssDateParser.parse(childNode.getTextContent())); } else if (childNode.getNodeName() == "content") { String textContent = childNode.getTextContent(); if (stripTag) { textContent = textContent.replaceAll("<(/)?([a-zA-Z]*)(\\s[a-zA-Z]*=[^>]*)?(\\s)*(/)?>", ""); textContent = textContent.replaceAll("<!--(.|\n|\r)*-->", ""); } entry.setContent(textContent.trim()); } } return entry; } private RssEntry parseRss1Entry(RssFeed feed, Node entryNode, boolean stripTag) { NodeList childNodeList = entryNode.getChildNodes(); RssEntry entry = new RssEntry(); for (int i = 0; i < childNodeList.getLength(); ++i) { Node childNode = childNodeList.item(i); if (childNode.getNodeName() == "dc:creator") { entry.setAuthor(childNode.getTextContent()); } else if (childNode.getNodeName() == "title") { entry.setTitle(childNode.getTextContent()); } else if (childNode.getNodeName() == "link") { entry.setLink(childNode.getTextContent()); } else if (childNode.getNodeName() == "dc:date") { feed.setIsHaveNotDate(isMatchedDatePattern(childNode.getTextContent())); entry.setCreatedAt(RssDateParser.parse(childNode.getTextContent())); entry.setIsHaveDateField(true); } else if (childNode.getNodeName() == "description") { String textContent = childNode.getTextContent(); if (stripTag) { textContent = textContent.replaceAll("<(/)?([a-zA-Z]*)(\\s[a-zA-Z]*=[^>]*)?(\\s)*(/)?>", ""); textContent = textContent.replaceAll("<!--(.|\n|\r)*-->", ""); } entry.setContent(textContent.trim()); } } return entry; } private RssEntry parseRss2Entry(RssFeed feed, Node entryNode, boolean stripTag) { NodeList childNodeList = entryNode.getChildNodes(); RssEntry entry = new RssEntry(); for (int i = 0; i < childNodeList.getLength(); ++i) { Node childNode = childNodeList.item(i); if (childNode.getNodeName() == "author") { entry.setAuthor(childNode.getTextContent()); } else if (childNode.getNodeName() == "title") { String textContent = childNode.getTextContent(); String textContent2 = textContent.replaceAll("<!\\[CDATA\\[|\\]\\]>", ""); String title = textContent2.replaceAll("(<b>|</b>|<font [A-Za-z0-9=]*>|</font>)", ""); entry.setTitle(title); } else if (childNode.getNodeName() == "link") { entry.setLink(childNode.getTextContent()); } else if (childNode.getNodeName() == "guid") { entry.setGuid(childNode.getTextContent()); } else if (childNode.getNodeName() == "pubDate") { feed.setIsHaveNotDate(isMatchedDatePattern(childNode.getTextContent())); entry.setCreatedAt(RssDateParser.parse(childNode.getTextContent())); entry.setIsHaveDateField(true); } else if (childNode.getNodeName() == "dc:date") { feed.setIsHaveNotDate(isMatchedDatePattern(childNode.getTextContent())); entry.setCreatedAt(RssDateParser.parse(childNode.getTextContent())); entry.setIsHaveDateField(true); } else if (childNode.getNodeName() == "description") { String textContent = childNode.getTextContent(); if (stripTag) { textContent = textContent.replaceAll("<(/)?([a-zA-Z]*)(\\s[a-zA-Z]*=[^>]*)?(\\s)*(/)?>", ""); textContent = textContent.replaceAll("<!--(.|\n|\r)*-->", ""); } entry.setContent(textContent.trim()); } else if (childNode.getNodeName() == "category") { RssCategory category = new RssCategory(); category.setName(childNode.getTextContent()); entry.getCategories().add(category); } } return entry; } private boolean isMatchedDatePattern(String content) { String datePattern1 = "\\d{4}-\\d{2}-\\d{2}"; String datePattern2 = "\\d{4}-\\d{2}-\\d{2} "; return Pattern.matches(datePattern1, content) || Pattern.matches(datePattern2, content); } private String getEntryXPath(FeedType feedType) { switch (feedType) { case ATOM: return "//entry"; default: return "//item"; } } private FeedType getFeedType(Document feed) { if (feed.getDocumentElement().getTagName() == "feed") return FeedType.ATOM; else if (feed.getDocumentElement().getTagName() == "rss") return FeedType.RSS2; else return FeedType.RSS1; } private RssChannel getChannel(Document rssXml, XPathFactory xpathFactory, FeedType feedType) throws XPathExpressionException { RssChannel channel = new RssChannel(); XPath xpath = xpathFactory.newXPath(); XPathExpression xPathExpression = xpath.compile(getChannelXPath(feedType)); NodeList titleNodeList = (NodeList) xPathExpression.evaluate(rssXml, XPathConstants.NODESET); String channelTitle = null; if (titleNodeList.getLength() != 0) channelTitle = titleNodeList.item(0).getTextContent(); channel.setTitle(channelTitle); return channel; } private String getChannelXPath(FeedType feedType) { switch (feedType) { case ATOM: return "//feed/title"; default: return "//channel/title"; } } }