/* * Leech - crawling capabilities for Apache Tika * * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling * * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, * either version 3 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. * * Contact us by mail: christian.reuschling@dfki.de */ package de.dfki.km.leech.detect; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import org.apache.tika.config.ServiceLoader; import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.Detector; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.Parser; import de.dfki.km.leech.util.TikaUtils; /** * A detector implementation that detects everything from the tika DefaultDetector, plus some extra datasource detectors (e.g. for directories) * * @author Christian Reuschling, Dipl.Ing.(BA) */ public class LeechDefaultDetector extends CompositeDetector { private static final long serialVersionUID = -4879286813440313595L; protected static List<Detector> getDefaultDetectors(MimeTypes types, ServiceLoader loader) { List<Detector> detectors = new ArrayList<Detector>(); detectors.add(new DirectoryDatasourceDetector()); detectors.add(new ImapDatasourceDetector()); detectors.addAll(loader.loadServiceProviders(Detector.class)); detectors.add(types); return detectors; } private CompositeParser m_usedCompoParserFromCrawlConfig = null; protected final MediaTypeRegistry registry; public LeechDefaultDetector() { this(MimeTypes.getDefaultMimeTypes()); } public LeechDefaultDetector(ClassLoader loader) { this(MimeTypes.getDefaultMimeTypes(), loader); } /** * Initializes the detector with the Parser instance that is used for crawling. In the case this parser is known, the detector checks whether a known media type will * result into an EmptyParser. In this case, it will try to detect the a possibly better media type by checking the stream. * * @param compoParser the Parser instance that is used for crawling */ public LeechDefaultDetector(CompositeParser compoParser) { this(MimeTypes.getDefaultMimeTypes()); this.m_usedCompoParserFromCrawlConfig = compoParser; } public LeechDefaultDetector(MimeTypes types) { this(types, new ServiceLoader()); } public LeechDefaultDetector(MimeTypes types, ClassLoader loader) { this(types, new ServiceLoader(loader)); } private LeechDefaultDetector(MimeTypes types, ServiceLoader loader) { super(types.getMediaTypeRegistry(), getDefaultDetectors(types, loader)); registry = types.getMediaTypeRegistry(); } @Override public MediaType detect(InputStream input, Metadata metadata) throws IOException { // wenn in den Metadaten schon eins drin steht, dann werten wir den stream hier nicht nochmal extra aus. // wenn wir den media type schon wissen, aber lediglich der EmptyParser damit assoziiert wäre, dann kucken wir trotzdem noch mal nach // (magic bytes und so) String strType = metadata.get(Metadata.CONTENT_TYPE); if(strType != null) { // wenn es schon bekannt ist, checken wir ab, was für ein Parser hierfür ausgewählt werden würde. MediaType mediaType = MediaType.parse(strType); // wir müssen noch abchecken, ob es nicht noch einen spezialisierteren Parser gibt if(registry != null) { MediaType detectedType2check = super.detect(input, metadata); if((mediaType == null && detectedType2check != null) || registry.isSpecializationOf(detectedType2check.getBaseType(), mediaType.getBaseType())) { metadata.remove(Metadata.CONTENT_TYPE); metadata.set(Metadata.CONTENT_TYPE, detectedType2check.toString()); mediaType = detectedType2check; } } if(m_usedCompoParserFromCrawlConfig == null) return mediaType; Parser parser4Type = TikaUtils.getParser4Type(m_usedCompoParserFromCrawlConfig, mediaType, null); if(!(parser4Type instanceof EmptyParser)) return mediaType; } return super.detect(input, metadata); } }