/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.BodyContentHandler;
import org.gagravarr.tika.FlacParser;
import org.gagravarr.tika.OpusParser;
import org.gagravarr.tika.VorbisParser;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * Tests for {@link AutoDetectParser}.
 *
 * <p>Most tests parse a document from {@code /test-documents/} and check that
 * the content type ends up correct in the metadata no matter which resource
 * name / content type hints were supplied up front. The remaining tests cover
 * zip bomb handling, the Ogg/FLAC audio parsers and the constructor that
 * takes an explicit detector and parser list.
 */
public class AutoDetectParserTest {

    /** Configuration shared by every parser instance created in this test. */
    private final TikaConfig tika = TikaConfig.getDefaultConfig();

    // Easy to read constants for the MIME types:
    private static final String RAW        = "application/octet-stream";
    private static final String EXCEL      = "application/vnd.ms-excel";
    private static final String HTML       = "text/html; charset=ISO-8859-1";
    private static final String PDF        = "application/pdf";
    private static final String POWERPOINT = "application/vnd.ms-powerpoint";
    private static final String KEYNOTE    = "application/vnd.apple.keynote";
    private static final String PAGES      = "application/vnd.apple.pages";
    private static final String NUMBERS    = "application/vnd.apple.numbers";
    private static final String CHM        = "application/vnd.ms-htmlhelp";
    private static final String RTF        = "application/rtf";
    private static final String PLAINTEXT  = "text/plain; charset=ISO-8859-1";
    private static final String UTF8TEXT   = "text/plain; charset=UTF-8";
    private static final String WORD       = "application/msword";
    private static final String XML        = "application/xml";
    private static final String RSS        = "application/rss+xml";
    private static final String BMP        = "image/bmp";
    private static final String GIF        = "image/gif";
    private static final String JPEG       = "image/jpeg";
    private static final String PNG        = "image/png";
    private static final String OGG_VORBIS = "audio/vorbis";
    private static final String OGG_OPUS   = "audio/opus";
    private static final String OGG_FLAC   = "audio/x-oggflac";
    private static final String FLAC_NATIVE = "audio/x-flac";
    private static final String OPENOFFICE = "application/vnd.oasis.opendocument.text";

    /**
     * This is where a single test is done: the resource is parsed with the
     * stated name and stated type placed in the metadata as hints, then the
     * resulting content type and (optionally) the extracted text are checked.
     *
     * @param tp the parameters encapsulated in a TestParams instance
     * @throws Exception if the resource cannot be read or parsed
     */
    private void assertAutoDetect(TestParams tp) throws Exception {
        try (InputStream input =
                     AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName)) {
            if (input == null) {
                fail("Could not open stream from specified resource: "
                        + tp.resourceRealName);
            }

            // Seed the metadata with the (possibly missing or wrong) hints
            Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName);
            metadata.set(Metadata.CONTENT_TYPE, tp.statedType);

            ContentHandler handler = new BodyContentHandler();
            new AutoDetectParser(tika).parse(input, handler, metadata);

            assertEquals("Bad content type: " + tp,
                    tp.realType, metadata.get(Metadata.CONTENT_TYPE));

            // A null fragment means "do not check the extracted text"
            if (tp.expectedContentFragment != null) {
                assertTrue("Expected content not found: " + tp,
                        handler.toString().contains(tp.expectedContentFragment));
            }
        }
    }

    /**
     * Convenience method -- its sole purpose of existence is to make the
     * call to it more readable than it would be if a TestParams instance
     * would need to be instantiated there.
     *
     * @param resourceRealName real name of resource
     * @param resourceStatedName stated name -- will a bad name fool us?
     * @param realType - the real MIME type
     * @param statedType - stated MIME type - will a wrong one fool us?
     * @param expectedContentFragment - something expected in the text
     * @throws Exception if the resource cannot be read or parsed
     */
    private void assertAutoDetect(String resourceRealName,
                                  String resourceStatedName,
                                  String realType,
                                  String statedType,
                                  String expectedContentFragment) throws Exception {
        assertAutoDetect(new TestParams(resourceRealName, resourceStatedName,
                realType, statedType, expectedContentFragment));
    }

    /**
     * Checks detection of a document under {@code /test-documents/} against
     * all nine combinations of stated resource name (correct, absent, bogus)
     * and stated content type (correct, absent, wrong).
     *
     * @param resource file name relative to {@code /test-documents/}
     * @param type the MIME type expected in the metadata after parsing
     * @param content text expected in the extracted content, or
     *        {@code null} to skip the content check
     * @throws Exception if the resource cannot be read or parsed
     */
    private void assertAutoDetect(String resource, String type, String content)
            throws Exception {
        resource = "/test-documents/" + resource;

        // The correct MIME type should be determined regardless of the
        // stated type (ContentType hint) and the stated URL name.
        // Try different combinations of correct and incorrect arguments:
        final String wrongMimeType = RAW;
        assertAutoDetect(resource, resource, type, type,          content);
        assertAutoDetect(resource, resource, type, null,          content);
        assertAutoDetect(resource, resource, type, wrongMimeType, content);

        assertAutoDetect(resource, null, type, type,          content);
        assertAutoDetect(resource, null, type, null,          content);
        assertAutoDetect(resource, null, type, wrongMimeType, content);

        final String badResource = "a.xyz";
        assertAutoDetect(resource, badResource, type, type,          content);
        assertAutoDetect(resource, badResource, type, null,          content);
        assertAutoDetect(resource, badResource, type, wrongMimeType, content);
    }

    @Test
    public void testKeynote() throws Exception {
        assertAutoDetect("testKeynote.key", KEYNOTE, "A sample presentation");
    }

    @Test
    public void testPages() throws Exception {
        assertAutoDetect("testPages.pages", PAGES, "Sample pages document");
    }

    @Test
    public void testNumbers() throws Exception {
        assertAutoDetect("testNumbers.numbers", NUMBERS, "Checking Account: 300545668");
    }

    @Test
    public void testChm() throws Exception {
        assertAutoDetect("testChm.chm", CHM,
                "If you do not specify a window type or a window name, the main window is used.");
    }

    @Test
    public void testEpub() throws Exception {
        assertAutoDetect(
                "testEPUB.epub", "application/epub+zip",
                "The previous headings were subchapters");
    }

    @Test
    public void testExcel() throws Exception {
        assertAutoDetect("testEXCEL.xls", EXCEL, "Sample Excel Worksheet");
    }

    @Test
    public void testHTML() throws Exception {
        assertAutoDetect("testHTML.html", HTML, "Test Indexation Html");
    }

    @Test
    public void testOpenOffice() throws Exception {
        assertAutoDetect("testOpenOffice2.odt", OPENOFFICE,
                "This is a sample Open Office document");
    }

    @Test
    public void testPDF() throws Exception {
        assertAutoDetect("testPDF.pdf", PDF, "Content Analysis Toolkit");
    }

    @Test
    public void testPowerpoint() throws Exception {
        assertAutoDetect("testPPT.ppt", POWERPOINT, "Sample Powerpoint Slide");
    }

    @Test
    public void testRdfXml() throws Exception {
        assertAutoDetect("testRDF.rdf", "application/rdf+xml", "");
    }

    @Test
    public void testRTF() throws Exception {
        assertAutoDetect("testRTF.rtf", RTF, "indexation Word");
    }

    @Test
    public void testText() throws Exception {
        assertAutoDetect("testTXT.txt", PLAINTEXT, "indexation de Txt");
    }

    @Test
    public void testTextNonASCIIUTF8() throws Exception {
        assertAutoDetect("testTXTNonASCIIUTF8.txt", UTF8TEXT,
                "The quick brown fox jumps over the lazy dog");
    }

    @Test
    public void testWord() throws Exception {
        assertAutoDetect("testWORD.doc", WORD, "Sample Word Document");
    }

    @Test
    public void testXML() throws Exception {
        assertAutoDetect("testXML.xml", XML, "Lius");
    }

    /**
     * Parses the RSS sample once, with "feed" as the stated resource name
     * and the RSS MIME type as the stated content type.
     */
    @Test
    public void testRss() throws Exception {
        assertAutoDetect("/test-documents/rsstest.rss", "feed", RSS, RSS,
                "Sample RSS File for Junit test");
    }

    /** Image formats: only the detected type is checked, not the content. */
    @Test
    public void testImages() throws Exception {
        assertAutoDetect("testBMP.bmp", BMP, null);
        assertAutoDetect("testGIF.gif", GIF, null);
        assertAutoDetect("testJPEG.jpg", JPEG, null);
        assertAutoDetect("testPNG.png", PNG, null);
    }

    /**
     * Make sure that zip bomb attacks are prevented.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a>
     */
    @Test
    public void testZipBombPrevention() throws Exception {
        try (InputStream tgz = AutoDetectParserTest.class.getResourceAsStream(
                "/test-documents/TIKA-216.tgz")) {
            // Report a missing fixture clearly instead of as an unrelated error
            if (tgz == null) {
                fail("Could not open stream from specified resource: "
                        + "/test-documents/TIKA-216.tgz");
            }
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler(-1);
            new AutoDetectParser(tika).parse(tgz, handler, metadata);
            fail("Zip bomb was not detected");
        } catch (TikaException e) {
            // expected
        }
    }

    /**
     * Make sure XML parse errors don't trigger ZIP bomb detection.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-1322">TIKA-1322</a>
     */
    @Test
    public void testNoBombDetectedForInvalidXml() throws Exception {
        // create zip with ten empty / invalid XML files, 1.xml .. 10.xml
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (ZipOutputStream zos = new ZipOutputStream(baos)) {
            for (int i = 1; i <= 10; i++) {
                zos.putNextEntry(new ZipEntry(i + ".xml"));
                zos.closeEntry();
            }
        }
        // The archive is complete once the ZipOutputStream has been closed;
        // the test passes if parsing it does not throw.
        new AutoDetectParser(tika).parse(
                new ByteArrayInputStream(baos.toByteArray()),
                new BodyContentHandler(-1),
                new Metadata());
    }

    /**
     * Test to ensure that the Ogg Audio parsers (Vorbis, Opus, Flac etc)
     * have been correctly included, and are available
     */
    @SuppressWarnings("deprecation")
    @Test
    public void testOggFlacAudio() throws Exception {
        // The four test files should all have similar test data;
        // testFiles[i] is expected to be detected as mediaTypes[i]
        String[] testFiles = new String[] {
                "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga",
                "testOPUS.opus"
        };
        MediaType[] mediaTypes = new MediaType[] {
                MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE),
                MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS)
        };

        // Check we can load the parsers, and they claim to do the right things
        VorbisParser vParser = new VorbisParser();
        assertNotNull("Parser not found for " + mediaTypes[0],
                vParser.getSupportedTypes(new ParseContext()));

        FlacParser fParser = new FlacParser();
        assertNotNull("Parser not found for " + mediaTypes[1],
                fParser.getSupportedTypes(new ParseContext()));
        assertNotNull("Parser not found for " + mediaTypes[2],
                fParser.getSupportedTypes(new ParseContext()));

        OpusParser oParser = new OpusParser();
        assertNotNull("Parser not found for " + mediaTypes[3],
                oParser.getSupportedTypes(new ParseContext()));

        // Check we found the parser
        CompositeParser parser = (CompositeParser) tika.getParser();
        for (MediaType mt : mediaTypes) {
            assertNotNull("Parser not found for " + mt, parser.getParsers().get(mt));
        }

        // Have each file parsed, and check
        for (int i = 0; i < testFiles.length; i++) {
            String file = testFiles[i];
            try (InputStream input = AutoDetectParserTest.class.getResourceAsStream(
                    "/test-documents/" + file)) {
                if (input == null) {
                    fail("Could not find test file " + file);
                }
                Metadata metadata = new Metadata();
                ContentHandler handler = new BodyContentHandler();
                new AutoDetectParser(tika).parse(input, handler, metadata);

                assertEquals("Incorrect content type for " + file,
                        mediaTypes[i].toString(), metadata.get(Metadata.CONTENT_TYPE));

                // Check some of the common metadata
                // Old style metadata
                assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
                assertEquals("Test Title", metadata.get(Metadata.TITLE));
                // New style metadata
                assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
                assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));

                // Check some of the XMPDM metadata
                // (the album is not checked for the Opus file)
                if (!file.endsWith(".opus")) {
                    assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
                }
                assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
                assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
                assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));

                // Check some of the text
                String content = handler.toString();
                assertTrue(content.contains("Test Title"));
                assertTrue(content.contains("Test Artist"));
            }
        }
    }

    /**
     * Test case for TIKA-514. Provide constructor for AutoDetectParser that has explicit
     * list of supported parsers.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-514">TIKA-514</a>
     */
    @Test
    public void testSpecificParserList() throws Exception {
        AutoDetectParser parser = new AutoDetectParser(new MyDetector(), new MyParser());

        Metadata metadata = new Metadata();
        try (InputStream is = new ByteArrayInputStream("test".getBytes(UTF_8))) {
            parser.parse(is, new BodyContentHandler(), metadata, new ParseContext());
        }

        // MyParser leaves this marker behind, proving it was the parser used
        assertEquals("value", metadata.get("MyParser"));
    }

    /** The media type that links {@link MyDetector} to {@link MyParser}. */
    private static final MediaType MY_MEDIA_TYPE = new MediaType("application", "x-myparser");

    /**
     * A test detector which always returns the type supported
     * by the test parser
     */
    @SuppressWarnings("serial")
    private static class MyDetector implements Detector {
        @Override
        public MediaType detect(InputStream input, Metadata metadata) throws IOException {
            return MY_MEDIA_TYPE;
        }
    }

    /**
     * A test parser that supports only {@link #MY_MEDIA_TYPE} and, instead of
     * reading the stream, records a "MyParser" = "value" metadata entry.
     */
    @SuppressWarnings("serial")
    private static class MyParser extends AbstractParser {
        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            Set<MediaType> supportedTypes = new HashSet<>();
            supportedTypes.add(MY_MEDIA_TYPE);
            return supportedTypes;
        }

        @Override
        public void parse(InputStream stream, ContentHandler handler,
                          Metadata metadata, ParseContext context) {
            metadata.add("MyParser", "value");
        }
    }

    /**
     * Minimal class to encapsulate all parameters -- the main reason for
     * its existence is to aid in debugging via its toString() method.
     *
     * Getters and setters intentionally not provided.
     */
    private static class TestParams {

        private final String resourceRealName;
        private final String resourceStatedName;
        private final String realType;
        private final String statedType;
        private final String expectedContentFragment;

        private TestParams(String resourceRealName,
                           String resourceStatedName,
                           String realType,
                           String statedType,
                           String expectedContentFragment) {
            this.resourceRealName = resourceRealName;
            this.resourceStatedName = resourceStatedName;
            this.realType = realType;
            this.statedType = statedType;
            this.expectedContentFragment = expectedContentFragment;
        }

        /**
         * Produces a string like the following:
         *
         * <pre>
         * Test parameters:
         *   resourceRealName        = /test-documents/testEXCEL.xls
         *   resourceStatedName      = null
         *   realType                = application/vnd.ms-excel
         *   statedType              = null
         *   expectedContentFragment = Sample Excel Worksheet
         * </pre>
         */
        @Override
        public String toString() {
            return "Test parameters:\n"
                    + " resourceRealName = " + resourceRealName + "\n"
                    + " resourceStatedName = " + resourceStatedName + "\n"
                    + " realType = " + realType + "\n"
                    + " statedType = " + statedType + "\n"
                    + " expectedContentFragment = " + expectedContentFragment + "\n";
        }
    }
}