/*
* Leech - crawling capabilities for Apache Tika
*
* Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
* either version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Contact us by mail: christian.reuschling@dfki.de
*/
package de.dfki.km.leech.parser;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.LinkedList;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import de.dfki.inquisition.collections.MultiValueHashMap;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser;
import de.dfki.km.leech.sax.DataSinkContentHandler;
import de.dfki.km.leech.util.ExceptionUtils;
import de.dfki.km.leech.util.TikaUtils;
/**
 * This is the base class for all crawling parsers. If you want to write a crawling parser, extend this class. CrawlerParser will first invoke
 * {@link #processCurrentDataEntity(InputStream, Metadata, ContentHandler, ParseContext)} to process the input stream and push it to a ContentHandler, simply the
 * standard Tika parsing way. Next, it will call {@link #getSubDataEntitiesInformation(InputStream, ContentHandler, Metadata, ParseContext)} to determine all
 * succeeding sub data entities of this data entity. CrawlerParser then iterates over these entries and gives them to
 * {@link #processSubDataEntity(MultiValueHashMap, Metadata, ContentHandler, ParseContext)} in order to further process the sub data entities individually. This is
 * the recursive call, which starts the whole parsing process again with a new entity. <br>
 * <br>
 * The crawling process can be configured with specific context classes, have a look into the 'config' package, especially {@link CrawlerContext}.
 *
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public abstract class CrawlerParser implements Parser
{

    private static final long serialVersionUID = -6707880965147815349L;

    /** Metadata key under which the crawling depth of the currently processed data entity is stored. */
    public static final String CURRENT_CRAWLING_DEPTH = "currentCrawlingDepth";

    /** Key inside a sub data entity information map that identifies the entity; {@link #parse} uses it for error reporting. */
    public static final String SOURCEID = "sourceId";



    /**
     * Gets information about all data entities that should be (sub)crawled by this crawler instance. This e.g. could be all files and directories inside the
     * current directory. You can return arbitrary information about a data entity - it will be offered as-is at the invocation of
     * {@link #processSubDataEntity(MultiValueHashMap, Metadata, ContentHandler, ParseContext)} in order to deal with it. <br>
     * <br>
     * To consider constraints given by the user for URL/datasource string filtering, use the potential CrawlerContext object inside the ParseContext and use the
     * URLFilter. The same holds for the stop request, which is also offered by the CrawlerContext. Leech deals automatically with stop requests and data entity
     * filtering, but you can enhance the performance when you filter sub entities early in this class. This is because otherwise there will be a stream
     * initialization or established connection before filtering. <br>
     * <br>
     * While creating the information map for a (sub) data entity, it is recommended to put at least one key entry with CrawlerParser.SOURCEID for use in potential
     * error messages, to identify a problematic data entity. In the case you do so, you can simply throw all Exceptions inside your implementation of
     * processSubDataEntity, the super class will deal with it.<br>
     * <br>
     *
     * @param stream the stream-parameter from the Parser.parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) invocation
     * @param handler the handler-parameter from the Parser.parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) invocation
     * @param metadata a copy of the metadata-parameter from the Parser.parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
     *            invocation
     * @param context the context-parameter from the Parser.parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) invocation
     *
     * @return an iterator with all information about a data entity that should be crawled, that is enough to deal with it inside the other method implementations
     *
     * @throws Exception on any failure; {@link #parse} wraps it into a TikaException
     */
    protected abstract Iterator<MultiValueHashMap<String, Object>> getSubDataEntitiesInformation(InputStream stream, ContentHandler handler, Metadata metadata,
            ParseContext context) throws Exception;



    /**
     * Crawls one data entity: first the entity itself, then all of its sub data entities. <br>
     * <br>
     * The steps are:
     * <ol>
     * <li>The {@link CrawlerContext} is taken from the ParseContext; if there is none, a default instance is used.</li>
     * <li>The current crawling depth is read from the metadata entry {@link #CURRENT_CRAWLING_DEPTH} (0 if absent).</li>
     * <li>Unless the metadata flags the entity as {@link IncrementalCrawlingParser#UNMODIFIED},
     * {@link #processCurrentDataEntity(InputStream, Metadata, ContentHandler, ParseContext)} is invoked with a second stream opened on the path behind the given
     * stream.</li>
     * <li>If the next depth does not exceed the configured crawling depth,
     * {@link #getSubDataEntitiesInformation(InputStream, ContentHandler, Metadata, ParseContext)} is asked for the sub entities, and each one is handed to
     * {@link #processSubDataEntity(MultiValueHashMap, Metadata, ContentHandler, ParseContext)} until the iterator is exhausted or a stop was requested. A failure
     * of a single sub entity is passed to {@link ExceptionUtils} and does not abort the loop here.</li>
     * <li>On the root invocation (depth 0), the metadata object is cleared at the end.</li>
     * </ol>
     * Be aware that the given metadata object is cleared and reused for every sub data entity.
     *
     * @param stream the stream of the data entity to crawl
     * @param handler the content handler, passed through to
     *            {@link #getSubDataEntitiesInformation(InputStream, ContentHandler, Metadata, ParseContext)}
     * @param metadata the metadata of the data entity to crawl; its content is modified during the crawl
     * @param context the parse context, potentially carrying a {@link CrawlerContext}
     *
     * @throws IOException if closing the temporary stream fails (failures inside the crawl itself are reported as TikaException)
     * @throws SAXException declared by the Parser interface; failures inside the crawl itself are reported as TikaException
     * @throws TikaException if the crawl fails; any other exception type is wrapped together with the source URL of this entity
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException
    {
        CrawlerContext crawlerContext = context.get(CrawlerContext.class);
        if(crawlerContext == null) crawlerContext = new CrawlerContext();

        // remember the source right now - the metadata content is cleared for every sub entity further below
        String strSourceURL = metadata.get(Metadata.SOURCE);
        int iCurrentCrawlingDepth = 0;
        TikaInputStream tmpStream = null;

        try
        {
            String strDepth = metadata.get(CURRENT_CRAWLING_DEPTH);
            if(strDepth != null) iCurrentCrawlingDepth = Integer.parseInt(strDepth);


            // ## the entity that is currently crawled (the potential container, with potentially own content)
            String strDataEntityModState = metadata.get(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE);
            if(!IncrementalCrawlingParser.UNMODIFIED.equals(strDataEntityModState))
            {
                // it is probably better to accept the temp file offer of the TikaInputStream here - the disk will presumably be faster than an
                // average internet connection, even for writing. If there is no file behind the stream (e.g. in the case of an http connection),
                // Tika will automatically create a temporary file.
                tmpStream = TikaInputStream.get((TikaInputStream.get(stream).getPath()));

                ContentHandler handler2use4recursiveCall = TikaUtils.createContentHandler4SubCrawl(crawlerContext);

                processCurrentDataEntity(tmpStream, metadata, handler2use4recursiveCall, context);
            }


            // ## the sub entities - we only deal with them if the maximum crawling depth is not reached yet
            Iterator<MultiValueHashMap<String, Object>> subDataEntitiesInformation;
            if(iCurrentCrawlingDepth + 1 > crawlerContext.getCrawlingDepth())
                subDataEntitiesInformation = new LinkedList<MultiValueHashMap<String, Object>>().iterator();
            else
            {
                // we hand over a copy of the metadata here, so that we can use the original object inside the loop (the iterator is possibly fed
                // by another thread while the loop is already running, and the metadata object should still be valid then - we modify the
                // content of the original inside the loop, though)
                subDataEntitiesInformation = getSubDataEntitiesInformation(stream, handler, TikaUtils.copyMetadata(metadata), context);
            }


            int iEntityIndex = 0;
            while (subDataEntitiesInformation.hasNext() && !crawlerContext.stopRequested())
            {
                MultiValueHashMap<String, Object> subDataEntityInfo = subDataEntitiesInformation.next();

                // for every entity we check whether we have to create a new handler
                ContentHandler handler2use4recursiveCall = TikaUtils.createContentHandler4SubCrawl(crawlerContext);

                try
                {
                    // we clear the content of the metadata object: we want to keep the reference (in case a handler holds it, too), but the
                    // content has to be filled anew for the sub entity
                    TikaUtils.clearMetadata(metadata);

                    // we also store the current depth, to enable the abort when the maximum depth is reached
                    metadata.set(CURRENT_CRAWLING_DEPTH, String.valueOf(iCurrentCrawlingDepth + 1));

                    processSubDataEntity(subDataEntityInfo, metadata, handler2use4recursiveCall, context);
                }
                catch (Throwable e)
                {
                    // a failing sub entity is reported, the remaining entities are still processed by this loop
                    Object sourceId = subDataEntityInfo.getFirst(SOURCEID);

                    ExceptionUtils.handleException(e, sourceId == null ? "noSourceId" : sourceId.toString(), metadata, crawlerContext, context,
                            iCurrentCrawlingDepth, handler2use4recursiveCall);
                }

                iEntityIndex++;
                if(iEntityIndex % 10000 == 0)
                {
                    // twice is full gc
                    System.gc();
                    System.gc();
                }
            }


            if(iCurrentCrawlingDepth != 0) return;

            // at the very end we also clear the metadata, in case somebody wants to reuse the object for another Leech invocation
            TikaUtils.clearMetadata(metadata);

        }
        catch (Exception e)
        {
            if(e instanceof TikaException) throw (TikaException) e;

            throw new TikaException("Error while crawling '" + strSourceURL + "'", e);
        }
        finally
        {
            try
            {
                if(tmpStream != null) tmpStream.close();
            }
            finally
            {
                // the stop notification sits in its own finally block: a failing close() above must not swallow it, otherwise somebody waiting
                // for the end of the crawl would never be notified
                // NOTE(review): this locks on the Boolean returned by stopRequested(); presumably the waiting side synchronizes on the very same
                // object - confirm against CrawlerContext before changing the lock object
                Boolean bStopRequested = crawlerContext.stopRequested();
                synchronized (bStopRequested)
                {
                    if(bStopRequested && iCurrentCrawlingDepth == 0) bStopRequested.notifyAll();
                }
            }
        }
    }



    /**
     * Processes the current data entity that should be parsed. This method extracts the content by e.g. delegating the stream to a specific Parser in order to
     * push the content to the ContentHandler, whereby the {@link #getSubDataEntitiesInformation(InputStream, ContentHandler, Metadata, ParseContext)} method
     * extracts all the links to other data entities out of this content, for further processing them individually. <br>
     * <br>
     * For example, the {@link HtmlCrawlerParser} simply delegates the parameters to a Tika HtmlParser object. {@link #parse} only invokes this method when the
     * metadata does not flag the data entity as unmodified (this info is inside the metadata, possibly generated by {@link IncrementalCrawlingParser}).
     *
     * @param stream a 'cloned' stream from the stream given from the parse method
     * @param metadata the metadata given from the parse method
     * @param handler the origin content handler instance from the parse method, OR an instance created newly at every data entity as configured inside
     *            CrawlerContext
     * @param context the ParseContext object given from the parse method
     *
     * @throws Exception on any failure; {@link #parse} wraps it into a TikaException
     */
    protected abstract void processCurrentDataEntity(InputStream stream, Metadata metadata, ContentHandler handler, ParseContext context) throws Exception;



    /**
     * Processes a sub data entity from this parsed 'container' data entity - this can be the recursive call, in the case you have other complex data types behind
     * your sub data entities, which need further parsing again. In this case, normally you invoke some kind of Leech.parse(...) method here, e.g.<br>
     * <br>
     * <code>
     * Parser parser = m_leech.getParser();<br>
     * parser.parse(stream, handler2use4recursiveCall, metadata, context);<br>
     * <br>
     * </code> In the other case, you have all the information yet, ready for the final handler. In this case, you can send it directly, without further
     * processing: <br>
     * <br>
     * <code>
     * SubDataEntityContentHandler subHandler = new SubDataEntityContentHandler(handler, metadata, strBody);<br>
     * if(ignoreHistory)<br>
     * subHandler.triggerSubDataEntityHandling();<br>
     * else<br>
     * subHandler.triggerSubDataEntityHandling(context);<br>
     * </code> <br>
     * The stream and possible additional metadata entries you get (or create) out of the information inside the subDataEntityInformation. Make sure that you
     * reuse the metadata object for the case that the handler has also an internal metadata member that must be the same object (as inside
     * {@link DataSinkContentHandler}).
     *
     * @param subDataEntityInformation one entry out of the formerly returned iterator from
     *            {@link #getSubDataEntitiesInformation(InputStream, ContentHandler, Metadata, ParseContext)}
     * @param metadata2use4recursiveCall the metadata object that should be used for handling / recursive calls. When invoked from {@link #parse}, it was cleared
     *            and contains only the {@link #CURRENT_CRAWLING_DEPTH} entry for the sub entity
     * @param handler2use4recursiveCall the origin content handler instance from the root crawl invocation, OR an instance created newly at every data entity as
     *            configured inside CrawlerContext
     * @param context the origin ParseContext instance given from the parse method
     *
     * @throws Exception on any failure; {@link #parse} catches it and passes it to {@link ExceptionUtils}
     */
    protected abstract void processSubDataEntity(MultiValueHashMap<String, Object> subDataEntityInformation, Metadata metadata2use4recursiveCall,
            ContentHandler handler2use4recursiveCall, ParseContext context) throws Exception;

}