package de.dfki.km.leech; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory; import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser; public class SubDataEntityContentHandler extends XHTMLContentHandler { protected String m_strBodyText; protected Metadata m_metadata; public SubDataEntityContentHandler(ContentHandler handler, Metadata metadata, String strBodyText) { super(handler, metadata); m_metadata = metadata; m_strBodyText = strBodyText; } /** * Triggers the sub data entity handling WITHOUT considering a possibly set history. This is not always necessary, and better in performance * * @throws Exception */ public void triggerSubDataEntityHandling() throws SAXException { triggerSubDataEntityHandling(null); } /** * Triggers the sub data entity handling. If the context is set null, a possibly set history will be ignored cause of performance. Considering the history is not * always necessary in every parser. * * @param context4history null: a history in the context will be ignored. Otherwise, if a history is part of the context, it will be used for recognizing cycles and * other incremental indexing stuff. Thus, if an entity is indexed yet and is unmodified, it won't be indexed/handeled again, the method will ignore this * entity and do nothing. * @throws SAXException */ public void triggerSubDataEntityHandling(ParseContext context4history) throws SAXException { try { boolean bDoIt = false; // wir beziehen die history mit ein, wenn die entsprechenden Metadaten vorhanden sind if(context4history != null && m_metadata.get(IncrementalCrawlingHistory.dataEntityId) != null && m_metadata.get(IncrementalCrawlingHistory.dataEntityContentFingerprint) != null) { CrawlerContext crawlerContext = context4history.get(CrawlerContext.class); IncrementalCrawlingHistory crawlingHistory = null; if(crawlerContext != null) crawlingHistory = crawlerContext.getIncrementalCrawlingHistory(); // the entity will be processed in the case the crawlingHistory is null boolean bProcessEntity = IncrementalCrawlingParser.performHistoryStuff(crawlingHistory, m_metadata); if(bProcessEntity) { String strDataEntityModState = m_metadata.get(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE); if(!IncrementalCrawlingParser.UNMODIFIED.equals(strDataEntityModState)) { bDoIt = true; } } } else bDoIt = true; if(bDoIt) { // wenn wir keine Metadaten für eine Dublettenerkennung haben oder wir die history ignorieren wollen, dann tragen wir den Datensatz einfach ein startDocument(); // startElement("p"); if(m_strBodyText != null) characters(m_strBodyText.toCharArray(), 0, m_strBodyText.length()); // endElement("p"); endDocument(); } } catch (Exception e) { throw new RuntimeException(e); } } }