package de.dfki.km.leech.parser.rss; import java.io.IOException; import java.io.InputStream; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Set; import org.apache.tika.exception.TikaException; import org.apache.tika.io.CloseShieldInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import com.sun.syndication.feed.synd.SyndEntry; import com.sun.syndication.feed.synd.SyndFeed; import com.sun.syndication.io.SyndFeedInput; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory; import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser; import de.dfki.km.leech.util.TikaUtils; /** * Feed parser. This version is derived from the Tika Feed Parser, but gives more metadata and every feed entry as a single document */ public class FeedParser2 extends AbstractParser { private static final long serialVersionUID = 1326997408920690592L; private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(MediaType.application("rss+xml"), MediaType.application("atom+xml")))); @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // set the encoding? try { CrawlerContext crawlerContext = context.get(CrawlerContext.class); if(crawlerContext == null) crawlerContext = new CrawlerContext(); IncrementalCrawlingHistory crawlingHistory = crawlerContext.getIncrementalCrawlingHistory(); String strMasterDataEntityId = metadata.get(IncrementalCrawlingHistory.dataEntityId); SyndFeed feed = new SyndFeedInput().build(new InputSource(new CloseShieldInputStream(stream))); String title = stripTags(feed.getTitleEx().getValue()); String description = stripTags(feed.getDescriptionEx().getValue()); metadata.set(Metadata.TITLE, title); metadata.set(Metadata.DESCRIPTION, description); // store the other fields in the metadata XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.element("h1", title); xhtml.element("p", description); xhtml.endDocument(); String strContentType = metadata.get(Metadata.CONTENT_TYPE); if(crawlerContext.getCrawlingDepth() != 0) { for (Object e : feed.getEntries()) { SyndEntry entry = (SyndEntry) e; String strLink = entry.getLink(); if(strLink != null) { XHTMLContentHandler xhtmlSubDoc = new XHTMLContentHandler(handler, metadata); xhtmlSubDoc.startDocument(); TikaUtils.clearMetadata(metadata); // hier wollen wir mit unseren dataexistsID und contentFingerprint prüfen, ob dieser Entry schon mal indexiert wurde metadata.add(IncrementalCrawlingHistory.dataEntityId, strLink); metadata.add(IncrementalCrawlingHistory.dataEntityContentFingerprint, entry.getPublishedDate().toString()); metadata.add(IncrementalCrawlingHistory.masterDataEntityId, strMasterDataEntityId); IncrementalCrawlingParser.performHistoryStuff(crawlingHistory, metadata); if(IncrementalCrawlingParser.UNMODIFIED.equals(metadata.get(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE))) continue; metadata.add(Metadata.CONTENT_TYPE, strContentType); metadata.add(Metadata.SOURCE, strLink); metadata.add(Metadata.TITLE, stripTags(entry.getTitle())); metadata.add(Metadata.CREATOR, entry.getAuthor()); metadata.add(Metadata.MODIFIED, new SimpleDateFormat("yyyy.MM.dd HH:mm:ss:SSS").format(entry.getPublishedDate())); xhtmlSubDoc.startElement("p"); String strCleanedText = stripTags(entry.getDescription().getValue()); xhtmlSubDoc.characters(strCleanedText.toCharArray(), 0, strCleanedText.length()); xhtmlSubDoc.endElement("p"); xhtmlSubDoc.endDocument(); } } } } catch (Exception e) { throw new TikaException("RSS parse error", e); } } protected static String stripTags(String value) { if(value == null) return ""; String[] parts = value.split("<[^>]*>"); StringBuffer buf = new StringBuffer(); for (String part : parts) buf.append(part); return buf.toString().trim(); } }