/*
Leech - crawling capabilities for Apache Tika
Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Contact us by mail: christian.reuschling@dfki.de
*/
package de.dfki.km.leech.config;
import java.io.IOException;
import java.util.LinkedList;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
import de.dfki.km.leech.detect.LeechDefaultDetector;
import de.dfki.km.leech.parser.DirectoryCrawlerParser;
import de.dfki.km.leech.parser.HtmlCrawlerParser;
import de.dfki.km.leech.parser.ImapCrawlerParser;
/**
 * The default configuration for Leech. It installs a {@link LeechDefaultDetector} as detector and puts the Leech crawler parsers
 * ({@link DirectoryCrawlerParser}, {@link HtmlCrawlerParser}, {@link ImapCrawlerParser}) together with the default Tika parser into one
 * {@link CompositeParser}. As a side effect of construction, a set of PDFBox java.util.logging loggers is switched off.
 *
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public class LeechConfig extends TikaConfig
{

    /**
     * Names of the java.util.logging loggers that are switched off by {@link #init()}. java.util.logging has no wildcard syntax - a name like
     * "org.apache.pdfbox.*" only addresses a logger with exactly this literal name. The package logger "org.apache.pdfbox" is used instead: its
     * level is inherited by every child logger that has no explicit level of its own.
     */
    private static final String[] PDFBOX_LOGGER_NAMES = {
            "org.apache.pdfbox.util.PDFStreamEngine",
            "org.apache.pdfbox.encoding.Encoding",
            "org.apache.pdfbox.pdfparser.BaseParser",
            "org.apache.pdfbox.pdmodel.font.PDSimpleFont",
            "org.apache.pdfbox.pdfparser.XrefTrailerResolver",
            "org.apache.pdfbox.filter.FlateFilter",
            "org.apache.pdfbox.pdfparser.PDFParser",
            "org.apache.pdfbox.util.operator.SetTextFont",
            "org.apache.pdfbox" };

    /** The lazily created instance handed out by {@link #getDefaultLeechConfig()}. */
    static protected LeechConfig m_defaultLeechConfigSingleton;



    /**
     * Returns the shared default Leech configuration and creates it on the first call. The method is synchronized, thus concurrent first callers
     * can not create several instances.
     *
     * @return the shared default configuration instance
     *
     * @throws RuntimeException if the configuration could not be created. The original {@link IOException} or {@link TikaException} is attached
     *             as cause.
     */
    public static synchronized TikaConfig getDefaultLeechConfig()
    {
        try
        {
            if(m_defaultLeechConfigSingleton == null)
            {
                m_defaultLeechConfigSingleton = new LeechConfig();
            }

            return m_defaultLeechConfigSingleton;
        }
        catch (IOException e)
        {
            throw new RuntimeException("Unable to read default leech configuration", e);
        }
        catch (TikaException e)
        {
            throw new RuntimeException("Unable to access default leech configuration", e);
        }
    }



    /** The detector returned by {@link #getDetector()}, set inside {@link #init()}. */
    protected Detector m_detector;

    /** The composite parser returned by {@link #getParser()}, set inside {@link #init()}. */
    protected CompositeParser m_parser;

    /**
     * Strong references to the loggers that were switched off. According to the java.util.logging documentation, a logger without a strong
     * reference may be garbage collected at any time, which would also drop the level that was set on it.
     */
    private final LinkedList<Logger> m_llPdfBoxLogger = new LinkedList<>();



    /**
     * Creates the default Tika configuration and extends it with the Leech parsers and detector by calling {@link #init()}.
     *
     * @throws TikaException declared by the {@link TikaConfig} default constructor
     * @throws IOException declared by the {@link TikaConfig} default constructor
     */
    public LeechConfig() throws TikaException, IOException
    {
        super();

        // NOTE: init() is protected and thus overridable - a subclass implementation runs before the subclass fields are initialized
        init();
    }



    /**
     * @return the {@link LeechDefaultDetector} created in {@link #init()}
     */
    @Override
    public Detector getDetector()
    {
        return m_detector;
    }



    /**
     * @return the media type registry of the underlying {@link TikaConfig}
     */
    @Override
    public MediaTypeRegistry getMediaTypeRegistry()
    {
        return super.getMediaTypeRegistry();
    }



    /**
     * @return the mime repository of the underlying {@link TikaConfig}
     */
    @Override
    public MimeTypes getMimeRepository()
    {
        return super.getMimeRepository();
    }



    /**
     * @return the composite parser created in {@link #init()}, containing the default Tika parser and the Leech crawler parsers
     */
    @Override
    public Parser getParser()
    {
        return m_parser;
    }



    /**
     * Builds the composite parser and the detector of this configuration and switches the PDFBox loggers off.
     */
    protected void init()
    {
        LinkedList<Parser> llParsers = new LinkedList<>();

        // the default parser out of the TikaConfig
        llParsers.add(super.getParser());

        // the Leech datasource crawler parsers - the last ones get the priority, so we are able to override e.g. the original html parser here
        llParsers.add(new DirectoryCrawlerParser());
        llParsers.add(new HtmlCrawlerParser());
        llParsers.add(new ImapCrawlerParser());

        m_parser = new CompositeParser(this.getMediaTypeRegistry(), llParsers);
        m_detector = new LeechDefaultDetector(m_parser);

        // the loggers are stored inside a field, because the level setting is only reliable as long as the Logger objects are strongly referenced
        for (String strLoggerName : PDFBOX_LOGGER_NAMES)
        {
            Logger logger = Logger.getLogger(strLoggerName);

            logger.setLevel(Level.OFF);
            m_llPdfBoxLogger.add(logger);
        }
    }
}