/* * Leech - crawling capabilities for Apache Tika * * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling * * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, * either version 3 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. * * Contact us by mail: christian.reuschling@dfki.de */ package de.dfki.km.leech.parser.incremental; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.nio.channels.FileLock; import java.util.Iterator; import java.util.UUID; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import de.dfki.km.leech.Leech; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.parser.CrawlerParser; import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory.Exist; import de.dfki.km.leech.util.TikaUtils; /** * A Parser decorator which enables incremental indexing during the crawl. 
For this, {@link IncrementalCrawlingParser} needs two entries inside the metadata passed to the parse method:<br>
 * <br>
 * <ul>
 * <li>{@link IncrementalCrawlingHistory}.dataEntityId: an identifier for a data entity that is independent of the content of this entity. It is only used to
 * identify the occurrence, not to check whether it has changed (e.g. a filename)</li>
 * <li>{@link IncrementalCrawlingHistory}.dataEntityContentFingerprint: some fingerprint/identifier that gives a hint whether the content of the data entity has
 * changed, e.g. the modified date of a file</li>
 * </ul>
 * These entries depend on the type of the data source, which is considered when the InputStream for the parse method is created. Thus, both
 * metadata entries are created in {@link TikaUtils} during stream creation.<br>
 * <br>
 * Depending on these entries, this decorator writes a data entity modification state (new, modified, unmodified, removed) into the metadata before delegating to
 * the wrapped parser. In the case of a cycle during a crawl (when a data entity is encountered a second time during a crawl), nothing will be delegated.
*
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public class IncrementalCrawlingParser extends ParserDecorator
{

    /**
     * Metadata key under which the modification state is stored. The misspelling inside the key is part of the runtime contract and is kept
     * on purpose.
     */
    static public final String DATA_ENTITY_MODIFICATION_STATE = "dataEntitiyModificationState";

    static public final String MODIFIED = "modified";

    static public final String NEW = "new";

    static public final String PROCESSED = "processed";

    static public final String REMOVED = "removed";

    static public final String ERROR = "error";

    private static final long serialVersionUID = 3823147926764040243L;

    static public final String UNMODIFIED = "unmodified";

    /** Used inside {@link #parse} to get the detector that determines the media type of the incoming stream. */
    protected Leech m_leech = new Leech();



    /**
     * Creates a new incremental crawling decorator around the given parser.
     *
     * @param parser the parser to wrap. {@link #parse} casts it to {@link CompositeParser}
     */
    public IncrementalCrawlingParser(Parser parser)
    {
        super(parser);
    }



    /**
     * Performs the incremental crawling history bookkeeping, then delegates to the wrapped parser (or to the {@link EmptyParser} if the entity can be
     * skipped). At crawling depth 0 the history is additionally started before and finished after the delegation, and all entities the history reports
     * as removed are sent to a sub-crawl content handler with the modification state {@link #REMOVED}.
     *
     * @param stream the stream of the data entity
     * @param handler the content handler that receives the parse events
     * @param metadata the metadata of the data entity. The data entity modification state will be written into it
     * @param context the parse context. A {@link CrawlerContext} will be created and registered if none is inside
     *
     * @throws IOException if releasing the temporary history lock or cleaning up temporary history directories fails
     * @throws SAXException declared by the overridden method
     * @throws TikaException for every failure during the crawl itself. Exceptions other than TikaException are wrapped
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException,
            TikaException
    {

        // We do all the incremental history work here and then parse the entity as usual. The CrawlerParsers may also evaluate the metadata
        // entries. At the end the history is closed again.

        IncrementalCrawlingHistory crawlingHistory = null;
        boolean bIsTmpHistory = false;
        FileLock tmpLock = null;
        FileOutputStream fosTmpLock = null;
        int iCurrentCrawlingDepth = 0;

        try
        {

            CrawlerContext crawlerContext = context.get(CrawlerContext.class);
            if(crawlerContext == null)
            {
                crawlerContext = new CrawlerContext();
                context.set(CrawlerContext.class, crawlerContext);
            }


            // The current crawling depth is needed to determine when a complete crawl has finished. A CrawlerParser updates this info in the
            // metadata.
            String strDepth = metadata.get(CrawlerParser.CURRENT_CRAWLING_DEPTH);
            if(strDepth != null) iCurrentCrawlingDepth = Integer.parseInt(strDepth);


            // ## the crawling history

            crawlingHistory = crawlerContext.getIncrementalCrawlingHistory();

            if(crawlingHistory == null && crawlerContext.getDetectCycles() && iCurrentCrawlingDepth == 0)
            {
                // we create a temporary crawling history that is deleted again at the end of the crawl
                File parentDir = new File(System.getProperty("java.io.tmpdir"));
                File fTmpHistory = new File(parentDir.getAbsolutePath() + "/leechTmp/" + UUID.randomUUID().toString().replaceAll("\\W", "_"));
                fTmpHistory.mkdirs();

                // additionally we create a lock file that controls whether the temporary directory may be deleted later or not
                fosTmpLock = new FileOutputStream(fTmpHistory.getAbsolutePath() + "/lock");
                tmpLock = fosTmpLock.getChannel().tryLock();

                crawlerContext.setIncrementalCrawlingHistoryPath(fTmpHistory.getAbsolutePath());
                crawlingHistory = crawlerContext.getIncrementalCrawlingHistory();
                bIsTmpHistory = true;
            }

            if(iCurrentCrawlingDepth == 0 && crawlingHistory != null) crawlingHistory.crawlStarted();


            // ## content and history

            boolean bProcessEntity = performHistoryStuff(crawlingHistory, metadata);

            if(bProcessEntity)
            {
                // If the entity is unmodified, we only want to go on if it is handled by a CrawlerParser - linked content might be modified
                // nevertheless.
                String strDataEntityModState = metadata.get(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE);

                MediaType type = m_leech.getDetector().detect(stream, metadata);
                Parser finalParser4Type = TikaUtils.getParser4Type((CompositeParser) getWrappedParser(), type, context);

                boolean bUnmodified = IncrementalCrawlingParser.UNMODIFIED.equals(strDataEntityModState);

                if(!bUnmodified || finalParser4Type instanceof CrawlerParser)
                {
                    getWrappedParser().parse(stream, handler, metadata, context);
                }
                else
                {
                    // the entity is unmodified and has no sub entities to follow - there is nothing to do
                    EmptyParser.INSTANCE.parse(createDummyStream(), handler, metadata, context);
                }
            }
            else
            {
                // the entity was already processed during this run - a cycle or a simple duplicate
                EmptyParser.INSTANCE.parse(createDummyStream(), handler, metadata, context);
            }


            // ## removed entities and finishing stuff

            if(iCurrentCrawlingDepth != 0 || crawlingHistory == null) return;

            Iterator<String> itRemovedDataEntitiesIDs = crawlingHistory.crawlFinished();

            while (itRemovedDataEntitiesIDs.hasNext() && !crawlerContext.stopRequested() && crawlerContext.getCheckForRemovedEntities())
            {
                // If the handler stays the same, this Metadata object has to remain the very same instance the given handler possibly holds.
                // This is the case for our DataSinkContentHandler.
                ContentHandler handler4RemovedData = TikaUtils.createContentHandler4SubCrawl(crawlerContext);

                // We clear the content of the Metadata object because we want to keep the reference (in case a handler holds it too), while
                // the content has to be filled anew for the sub object.
                TikaUtils.clearMetadata(metadata);

                metadata.set(DATA_ENTITY_MODIFICATION_STATE, REMOVED);
                String strDataEntityId2Remove = itRemovedDataEntitiesIDs.next();
                metadata.set(IncrementalCrawlingHistory.dataEntityId, strDataEntityId2Remove);

                EmptyParser.INSTANCE.parse(createDummyStream(), handler4RemovedData, metadata, context);
            }

        }
        catch (Exception e)
        {
            if(e instanceof TikaException) throw (TikaException) e;

            // look for the best available description of the entity for the error message
            String strUrlOrSource = metadata.get(Metadata.SOURCE);
            if(strUrlOrSource == null) strUrlOrSource = metadata.get(Metadata.RESOURCE_NAME_KEY);
            if(strUrlOrSource == null) strUrlOrSource = metadata.get(IncrementalCrawlingHistory.dataEntityId);
            if(strUrlOrSource == null) strUrlOrSource = "no entity id known in metadata";

            throw new TikaException("Error while crawling " + strUrlOrSource, e);
        }
        finally
        {
            // The nested finally blocks make sure that the lock and its stream are released even if an earlier cleanup step fails.
            try
            {
                if(crawlingHistory != null && iCurrentCrawlingDepth == 0) crawlingHistory.closeLuceneStuff();
            }
            finally
            {
                try
                {
                    if(tmpLock != null) tmpLock.release();
                }
                finally
                {
                    if(fosTmpLock != null) fosTmpLock.close();
                }
            }

            if(crawlingHistory != null && iCurrentCrawlingDepth == 0 && bIsTmpHistory) deleteUnlockedTmpHistories();
        }

    }



    /**
     * Creates the placeholder stream that is handed to the {@link EmptyParser} whenever an entity does not have to be parsed.
     *
     * @return a new stream over a fixed dummy text
     *
     * @throws IOException if the UTF-8 encoding is not supported
     */
    private static InputStream createDummyStream() throws IOException
    {
        return new ByteArrayInputStream("leech sucks - hopefully :)".getBytes("UTF-8"));
    }



    /**
     * Deletes all temporary history directories below &lt;java.io.tmpdir&gt;/leechTmp whose lock file is not locked. This also removes leftovers of
     * former crawls. Directories whose lock is held - by another process or by another crawl inside this JVM - are skipped.
     *
     * @throws IOException if a lock file can not be opened, locked, released or closed
     */
    private static void deleteUnlockedTmpHistories() throws IOException
    {
        File parentDir = new File(System.getProperty("java.io.tmpdir"));
        File leechTmpDir = new File(parentDir.getAbsolutePath() + "/leechTmp");

        // listFiles() returns null if the directory does not exist (anymore) or can not be read
        File[] historyDirs = leechTmpDir.listFiles();
        if(historyDirs == null) return;

        for (File historyDir : historyDirs)
        {
            if(!historyDir.isDirectory()) continue;

            FileOutputStream fosLock = new FileOutputStream(historyDir.getAbsolutePath() + "/lock");
            try
            {
                FileLock lock;
                try
                {
                    lock = fosLock.getChannel().tryLock();
                }
                catch (java.nio.channels.OverlappingFileLockException e)
                {
                    // the lock is held inside this JVM, i.e. the directory is still in use by another running crawl
                    lock = null;
                }

                if(lock == null) continue;

                try
                {
                    // unlocked - we can delete the directory
                    File[] subFiles = historyDir.listFiles();
                    if(subFiles != null)
                    {
                        for (File fSubFile : subFiles)
                            fSubFile.delete();
                    }

                    historyDir.delete();
                }
                finally
                {
                    lock.release();
                }
            }
            finally
            {
                fosLock.close();
            }
        }
    }



    /**
     * Writes the entries into the incremental crawling history and puts the data entity modification state into the metadata object. In the case
     * this data entity was already processed during this crawl (when we have a cycle or a double entry), the method returns false, which means that
     * it doesn't have to be processed again.
     *
     * @param crawlingHistory the crawling history. Can be null, in this case the data entity will be flagged as NEW in any case
     * @param metadata the metadata of the data entity. The method will put the data entity modification state into it
     *
     * @return true: process the data entity (it was not processed formerly in this crawl), false otherwise (it was processed during this crawl, we
     *         have a cycle)
     *
     * @throws Exception if an access to the crawling history fails
     */
    public static boolean performHistoryStuff(IncrementalCrawlingHistory crawlingHistory, Metadata metadata) throws Exception
    {

        // without a history there is no incremental crawling - everything is new
        if(crawlingHistory == null)
        {
            metadata.set(DATA_ENTITY_MODIFICATION_STATE, NEW);

            return true;
        }


        // we want incremental indexing - has the entity been there before?
        String strDataEntityId = metadata.get(IncrementalCrawlingHistory.dataEntityId);
        String strMasterDataEntityId = metadata.get(IncrementalCrawlingHistory.masterDataEntityId);

        Exist exist = crawlingHistory.exists(strDataEntityId);

        // If we have processed it in this crawl already, we do nothing at all - and we also don't follow links any further. In this case we have a
        // cycle.
        if(exist.equals(Exist.YES_PROCESSED))
        {
            metadata.set(DATA_ENTITY_MODIFICATION_STATE, PROCESSED);

            return false;
        }


        String strDataEntityContentFingerprint = metadata.get(IncrementalCrawlingHistory.dataEntityContentFingerprint);

        if(exist.equals(Exist.NOT))
        {
            metadata.set(DATA_ENTITY_MODIFICATION_STATE, NEW);
            crawlingHistory.addDataEntity(strDataEntityId, strDataEntityContentFingerprint, strMasterDataEntityId);

            return true;
        }


        // it has been there before - has it changed?
        boolean bExistsWithContent = crawlingHistory.existsWithContent(strDataEntityId, strDataEntityContentFingerprint);

        if(bExistsWithContent)
        {
            // not changed - we remember that it was still present during this crawl
            metadata.set(DATA_ENTITY_MODIFICATION_STATE, UNMODIFIED);
            crawlingHistory.updateDataEntityLastCrawledTime(strDataEntityId);

            return true;
        }


        // changed
        metadata.set(DATA_ENTITY_MODIFICATION_STATE, MODIFIED);
        crawlingHistory.updateDataEntity(strDataEntityId, strDataEntityContentFingerprint, strMasterDataEntityId);

        return true;
    }

}