/*
 * Leech - crawling capabilities for Apache Tika
 *
 * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * Contact us by mail: christian.reuschling@dfki.de
 */

package de.dfki.km.leech.util;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.parser.CrawlerParser;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser;
import de.dfki.km.leech.sax.DataSinkContentHandler;

/**
 * A simple utility class for dealing with Exceptions.
 *
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public class ExceptionUtils
{

    /**
     * Creates a String out of the stacktrace and the cause of a Throwable. This is useful for e.g. logging out Exception messages.
     * <p>
     * The trace is rendered with a character-based writer, so no characters are lost by a round trip through the platform default charset. The
     * cause chain does not need to be followed explicitly - {@link Throwable#printStackTrace(PrintWriter)} already prints it.
     *
     * @param t the according Throwable, must not be null
     *
     * @return the message String together with the stacktrace and the cause
     */
    public static String createStackTraceString(Throwable t)
    {
        StringWriter traceBuffer = new StringWriter();
        PrintWriter traceWriter = new PrintWriter(traceBuffer);

        t.printStackTrace(traceWriter);
        traceWriter.flush();

        return traceBuffer.toString();
    }



    /**
     * Gets a throwable, creates a string error message out of it (including the stacktrace), tries to create a {@link IncrementalCrawlingHistory}
     * .dataEntityId for identifying the entity, puts this all together into the metadata object, flags it as an error entity (with
     * metadata.set(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE, IncrementalCrawlingParser.ERROR);), and finally delegates it to an
     * EmptyParser with the given content handler and a dummy stream.<br>
     * This is for processing errors/error entities that occur during a crawl also by implementing the {@link ContentHandler} interface - or even
     * better the {@link DataSinkContentHandler} interface, which is recommended.
     *
     * @param e the exception that occurred during the crawl
     * @param strSourceId some referencing ID - in the case it is null, the method will get Metadata.SOURCE from the metadata or, in the case this is
     *            also null, Metadata.RESOURCE_NAME_KEY
     * @param metadata the metadata object for the data entity. Will be enhanced with the error message and given to the EmptyParser invocation
     * @param crawlerContext the original crawler configuration object. In the case it is null, a new default CrawlerContext is used
     * @param context the original ParseContext
     * @param iCurrentCrawlingDepth the current crawling depth. In the case the crawling process should be interrupted in the case of an exception,
     *            this method will also throw an exception. In the case the crawling depth is 0 in this case, the method will additionally show a log
     *            message
     * @param handler2use4recursiveCall the handler that is used for the crawling process
     *
     * @throws TikaException will be thrown in the case the crawling should be interrupted in the case of an exception (as configured inside the
     *             CrawlerContext)
     * @throws SAXException will be thrown in the case the EmptyParser.parse invocation will throw it
     */
    public static void handleException(Throwable e, String strSourceId, Metadata metadata, CrawlerContext crawlerContext, ParseContext context,
            int iCurrentCrawlingDepth, ContentHandler handler2use4recursiveCall) throws TikaException, SAXException
    {
        try
        {
            if(crawlerContext == null)
            {
                crawlerContext = new CrawlerContext();
            }

            String strEntityId = resolveEntityId(strSourceId, metadata);

            // flag the entity as an error entity and attach everything we know about the failure
            metadata.set(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE, IncrementalCrawlingParser.ERROR);
            metadata.set(IncrementalCrawlingHistory.dataEntityId, strEntityId);
            metadata.set("errorMessage", e.getMessage());
            metadata.set("errorStacktrace", ExceptionUtils.createStackTraceString(e));

            InputStream dummyStream = new ByteArrayInputStream("leech sucks - hopefully :)".getBytes("UTF-8"));

            boolean bInterrupt = crawlerContext.getInterruptIfException();

            // only the top-level invocation logs, so an interrupting error is not logged once per recursion level
            if(bInterrupt && iCurrentCrawlingDepth == 0)
            {
                Logger.getLogger(CrawlerParser.class.getName()).log(Level.SEVERE, "Error while processing " + strEntityId, e);
            }

            // we simply forward the error to the handler - the information is inside the metadata
            EmptyParser.INSTANCE.parse(dummyStream, handler2use4recursiveCall, metadata, context);

            if(bInterrupt)
            {
                throw new TikaException("Error while processing " + strEntityId, e);
            }
        }
        catch (UnsupportedEncodingException e1)
        {
            // log the encoding problem itself (e1), not the exception that is currently being handled
            Logger.getLogger(ExceptionUtils.class.getName()).log(Level.SEVERE, "Error", e1);
        }
    }



    /**
     * Determines the id that identifies the failed data entity: the given source id, else Metadata.SOURCE, else Metadata.RESOURCE_NAME_KEY, else
     * an explanatory placeholder text.
     */
    private static String resolveEntityId(String strSourceId, Metadata metadata)
    {
        if(strSourceId != null)
        {
            return strSourceId;
        }

        String strFromMetadata = metadata.get(Metadata.SOURCE);
        if(strFromMetadata == null)
        {
            strFromMetadata = metadata.get(Metadata.RESOURCE_NAME_KEY);
        }
        if(strFromMetadata != null)
        {
            return strFromMetadata;
        }

        return "no data entity id known - in the case of a sub-entity, set it inside the metadata at your implementation of getSubDataEntitiesInformation(..) "
                + "under the key CrawlerParser.SOURCEID. Otherwise you maybe try to process an unsupported/broken URL, or it is totally strange.";
    }
}