/* * Leech - crawling capabilities for Apache Tika * * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling * * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free * Software Foundation, either version 3 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. * * Contact us by mail: christian.reuschling@dfki.de */ package de.dfki.km.leech.util; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.xml.sax.ContentHandler; import de.dfki.inquisition.text.StringUtils; import de.dfki.km.leech.config.CrawlerContext; import de.dfki.km.leech.parser.DirectoryCrawlerParser; public class TikaUtils { /** * Clears all data from this metadta object * * @param metadata2clear */ public static void clearMetadata(Metadata metadata2clear) { for (String strKey : metadata2clear.names()) metadata2clear.remove(strKey); } /** * Creates a new metadta object and copies all data from the given object inside * * @param metadata2copy * * @return the copy */ public static Metadata copyMetadata(Metadata metadata2copy) { Metadata metadataCopy = new Metadata(); for (String strKey : metadata2copy.names()) for (String strVal : metadata2copy.getValues(strKey)) metadataCopy.add(strKey, strVal); return metadataCopy; } /** * Copies all data from one metadata object to another * * @param metadata2copyFrom source * @param metadata2copyTo target */ public static void copyMetadataFromTo(Metadata metadata2copyFrom, Metadata metadata2copyTo) { for (String strKey : metadata2copyFrom.names()) for (String strVal : metadata2copyFrom.getValues(strKey)) metadata2copyTo.add(strKey, strVal); } /** * Returns the content handler object given inside crawlerContext or creates a new handler according to the configured class (also configured * inside crawlerContext) * * @param crawlerContext the crawlerContext configuration. The method will throw an exception in the case it is null * * @return the content handler object given inside crawlerContext or creates a new handler according to the configured class (also configured * inside crawlerContext) */ static public ContentHandler createContentHandler4SubCrawl(CrawlerContext crawlerContext) { if(crawlerContext == null) throw new IllegalStateException("no crawlerContext was set"); ContentHandler handler2use4recursiveCall = crawlerContext.getContentHandler(); if(!StringUtils.nullOrWhitespace(crawlerContext.getContentHandlerClassName())) try { handler2use4recursiveCall = (ContentHandler) Class.forName(crawlerContext.getContentHandlerClassName()).newInstance(); } catch (Exception e) { Logger.getLogger(DirectoryCrawlerParser.class.getName()).log(Level.SEVERE, "Error during the instantiation of the configured content handler " + crawlerContext.getContentHandlerClassName(), e); } if(handler2use4recursiveCall == null) throw new IllegalStateException("No ContentHandler was set. Have a look into the class CrawlerContext"); return handler2use4recursiveCall; } /** * Determines the configured Parser inside a CompositeParser for a given type. * * @param compoParser the CompositeParser that offers several parser implementaions for several types * @param type the type we want to have the according parser for * @param context the parseContext object. Can be null for context-insensitive processing * * @return the parser for the given type */ public static Parser getParser4Type(CompositeParser compoParser, MediaType type, ParseContext context) { Map<MediaType, Parser> map; if(context != null) map = compoParser.getParsers(context); else map = compoParser.getParsers(); // We always work on the normalised, canonical form if(type != null) type = compoParser.getMediaTypeRegistry().normalize(type); while (type != null) { // Try finding a parser for the type Parser parser = map.get(type); if(parser != null) { if(parser instanceof CompositeParser) return getParser4Type((CompositeParser) parser, type, context); return parser; } // Failing that, try for the parent of the type type = compoParser.getMediaTypeRegistry().getSupertype(type); } return compoParser.getFallback(); } }