/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.config; import static org.apache.tika.TikaTest.assertContains; import static org.apache.tika.TikaTest.assertNotContained; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.util.List; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.executable.ExecutableParser; import org.apache.tika.parser.xml.XMLParser; import org.junit.Test; /** * Junit test class for {@link TikaConfig}, which cover things * that {@link TikaConfigTest} can't do due to a need for the * full set of parsers */ public class TikaParserConfigTest extends AbstractTikaConfigTest { @Test public void testMimeExcludeInclude() throws Exception { TikaConfig config = getConfig("TIKA-1558-blacklist.xml"); assertNotNull(config.getParser()); assertNotNull(config.getDetector()); Parser parser = config.getParser(); MediaType PDF = MediaType.application("pdf"); MediaType JPEG = MediaType.image("jpeg"); // Has two parsers assertEquals(CompositeParser.class, parser.getClass()); CompositeParser cParser = (CompositeParser)parser; assertEquals(2, cParser.getAllComponentParsers().size()); // Both are decorated assertTrue(cParser.getAllComponentParsers().get(0) instanceof ParserDecorator); assertTrue(cParser.getAllComponentParsers().get(1) instanceof ParserDecorator); ParserDecorator p0 = (ParserDecorator)cParser.getAllComponentParsers().get(0); ParserDecorator p1 = (ParserDecorator)cParser.getAllComponentParsers().get(1); // DefaultParser will be wrapped with excludes assertEquals(DefaultParser.class, p0.getWrappedParser().getClass()); assertNotContained(PDF, p0.getSupportedTypes(context)); assertContains(PDF, p0.getWrappedParser().getSupportedTypes(context)); assertNotContained(JPEG, p0.getSupportedTypes(context)); assertContains(JPEG, p0.getWrappedParser().getSupportedTypes(context)); // Will have an empty parser for PDF assertEquals(EmptyParser.class, p1.getWrappedParser().getClass()); assertEquals(1, p1.getSupportedTypes(context).size()); assertContains(PDF, p1.getSupportedTypes(context)); assertNotContained(PDF, p1.getWrappedParser().getSupportedTypes(context)); } @Test public void testParserExcludeFromDefault() throws Exception { TikaConfig config = getConfig("TIKA-1558-blacklist.xml"); assertNotNull(config.getParser()); assertNotNull(config.getDetector()); CompositeParser parser = (CompositeParser)config.getParser(); MediaType PE_EXE = MediaType.application("x-msdownload"); MediaType ELF = MediaType.application("x-elf"); // Get the DefaultParser from the config ParserDecorator confWrappedParser = (ParserDecorator)parser.getParsers().get(MediaType.APPLICATION_XML); assertNotNull(confWrappedParser); DefaultParser confParser = (DefaultParser)confWrappedParser.getWrappedParser(); // Get a fresh "default" DefaultParser DefaultParser normParser = new DefaultParser(config.getMediaTypeRegistry()); // The default one will offer the Executable Parser assertContains(PE_EXE, normParser.getSupportedTypes(context)); assertContains(ELF, normParser.getSupportedTypes(context)); boolean hasExec = false; for (Parser p : normParser.getParsers().values()) { if (p instanceof ExecutableParser) { hasExec = true; break; } } assertTrue(hasExec); // The one from the config won't assertNotContained(PE_EXE, confParser.getSupportedTypes(context)); assertNotContained(ELF, confParser.getSupportedTypes(context)); for (Parser p : confParser.getParsers().values()) { if (p instanceof ExecutableParser) fail("Shouldn't have the Executable Parser from config"); } } /** * TIKA-1558 It should be possible to exclude Parsers from being picked up by * DefaultParser. */ @Test public void defaultParserBlacklist() throws Exception { TikaConfig config = new TikaConfig(); assertNotNull(config.getParser()); assertNotNull(config.getDetector()); CompositeParser cp = (CompositeParser) config.getParser(); List<Parser> parsers = cp.getAllComponentParsers(); boolean hasXML = false; for (Parser p : parsers) { if (p instanceof XMLParser) { hasXML = true; break; } } assertTrue("Default config should include an XMLParser.", hasXML); // This custom TikaConfig should exclude XMLParser and all of its subclasses. config = getConfig("TIKA-1558-blacklistsub.xml"); cp = (CompositeParser) config.getParser(); parsers = cp.getAllComponentParsers(); for (Parser p : parsers) { if (p instanceof XMLParser) fail("Custom config should not include an XMLParser (" + p.getClass() + ")."); } } }