/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
import org.junit.Test;

/**
 * Junit test class for {@link ContainerAwareDetector}
 * <p>
 * Every test feeds a document from the {@code /test-documents/} classpath
 * folder to a {@link DefaultDetector} built on the default MIME repository
 * and checks the media type that comes back.
 */
public class TestContainerAwareDetector {

    /** Classpath folder that holds all the sample documents used below. */
    private static final String TEST_DOCUMENTS = "/test-documents/";

    private final TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
    private final MimeTypes mimeTypes = tikaConfig.getMimeRepository();
    private final Detector detector = new DefaultDetector(mimeTypes);

    /**
     * Opens a sample document from the test-documents folder.
     * <p>
     * Fails with a descriptive message, rather than a bare
     * {@link NullPointerException}, when the document is not on the classpath.
     *
     * @param name file name relative to the test-documents folder
     * @return a stream over the document; the caller must close it
     * @throws IOException if the stream cannot be opened
     */
    private static TikaInputStream openTestDocument(String name) throws IOException {
        URL resource = TestContainerAwareDetector.class.getResource(TEST_DOCUMENTS + name);
        assertNotNull("Test document not found on classpath: " + TEST_DOCUMENTS + name, resource);
        return TikaInputStream.get(resource);
    }

    /**
     * Asserts the type detected from the file contents alone (no resource name).
     *
     * @param file sample document to detect
     * @param type expected media type
     */
    private void assertTypeByData(String file, String type) throws Exception {
        assertTypeByNameAndData(file, null, type);
    }

    /**
     * Asserts the type detected from the file contents plus its own file name.
     *
     * @param file sample document to detect; also supplied as the resource name
     * @param type expected media type
     */
    private void assertTypeByNameAndData(String file, String type) throws Exception {
        assertTypeByNameAndData(file, file, type);
    }

    /**
     * Asserts both the data-only and the name-and-data detection results.
     *
     * @param file          sample document to detect
     * @param byData        expected media type when only the data is supplied
     * @param byNameAndData expected media type when the file name is supplied too
     */
    private void assertType(String file, String byData, String byNameAndData) throws Exception {
        assertTypeByData(file, byData);
        assertTypeByNameAndData(file, byNameAndData);
    }

    /**
     * Asserts the detector result for the given data and (possibly different)
     * resource name, without checking the plain mime magic result.
     *
     * @param dataFile sample document to detect
     * @param name     resource name to put in the metadata, or {@code null} for none
     * @param type     expected media type from the detector
     */
    private void assertTypeByNameAndData(String dataFile, String name, String type) throws Exception {
        assertTypeByNameAndData(dataFile, name, type, null);
    }

    /**
     * Asserts the detection results for the given data and resource name.
     * Both checks run against the same open stream.
     *
     * @param dataFile         sample document to detect
     * @param name             resource name to put in the metadata, or {@code null} for none
     * @param typeFromDetector expected media type from the full detector
     * @param typeFromMagic    expected media type from {@code mimeTypes} alone,
     *                         or {@code null} to skip that check
     */
    private void assertTypeByNameAndData(String dataFile, String name,
            String typeFromDetector, String typeFromMagic) throws Exception {
        try (TikaInputStream stream = openTestDocument(dataFile)) {
            Metadata m = new Metadata();
            if (name != null) {
                m.add(Metadata.RESOURCE_NAME_KEY, name);
            }

            // Mime Magic version is likely to be less precise
            if (typeFromMagic != null) {
                assertEquals(
                        MediaType.parse(typeFromMagic),
                        mimeTypes.detect(stream, m));
            }

            // All being well, the detector should get it perfect
            assertEquals(
                    MediaType.parse(typeFromDetector),
                    detector.detect(stream, m));
        }
    }

    /**
     * OLE2 based formats, by data only, by matching name, and with a
     * misleading name (where the data is expected to win).
     */
    @Test
    public void testDetectOLE2() throws Exception {
        // Microsoft office types known by POI
        assertTypeByData("testEXCEL.xls", "application/vnd.ms-excel");
        assertTypeByData("testWORD.doc", "application/msword");
        assertTypeByData("testPPT.ppt", "application/vnd.ms-powerpoint");

        assertTypeByData("test-outlook.msg", "application/vnd.ms-outlook");
        assertTypeByData("test-outlook2003.msg", "application/vnd.ms-outlook");
        assertTypeByData("testVISIO.vsd", "application/vnd.visio");
        assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher");
        assertTypeByData("testWORKS.wps", "application/vnd.ms-works");
        assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works");

        // older Works Word Processor files can't be recognized
        // they were created with Works Word Processor 7.0 (hence the text inside)
        // and exported to the older formats with the "Save As" feature
        // NOTE(review): the assertions below do expect these to be detected as
        // Works files, which contradicts the first line above - confirm which is current
        assertTypeByData("testWORKSWordProcessor3.0.wps", "application/vnd.ms-works");
        assertTypeByData("testWORKSWordProcessor4.0.wps", "application/vnd.ms-works");
        assertTypeByData("testWORKSSpreadsheet7.0.xlr", "application/x-tika-msworks-spreadsheet");
        assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project");
        assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project");

        // Excel95 can be detected but not parsed
        assertTypeByData("testEXCEL_95.xls", "application/vnd.ms-excel");

        // Try some ones that POI doesn't handle, that are still OLE2 based
        assertTypeByData("testCOREL.shw", "application/x-corelpresentations");
        assertTypeByData("testQUATTRO.qpw", "application/x-quattro-pro; version=9");
        assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro; version=7-8");

        assertTypeByData("testHWP_5.0.hwp", "application/x-hwp-v5");

        // With the filename and data
        assertTypeByNameAndData("testEXCEL.xls", "application/vnd.ms-excel");
        assertTypeByNameAndData("testWORD.doc", "application/msword");
        assertTypeByNameAndData("testPPT.ppt", "application/vnd.ms-powerpoint");

        // With the wrong filename supplied, data will trump filename
        assertTypeByNameAndData("testEXCEL.xls", "notWord.doc", "application/vnd.ms-excel");
        assertTypeByNameAndData("testWORD.doc", "notExcel.xls", "application/msword");
        assertTypeByNameAndData("testPPT.ppt", "notWord.doc", "application/vnd.ms-powerpoint");

        // With a filename of a totally different type, data will trump filename
        assertTypeByNameAndData("testEXCEL.xls", "notPDF.pdf", "application/vnd.ms-excel");
        assertTypeByNameAndData("testEXCEL.xls", "notPNG.png", "application/vnd.ms-excel");
    }

    /**
     * There is no way to distinguish "proper" StarOffice files from templates.
     * All templates have the same extension but their actual type depends on
     * the magic. Our current MimeTypes class doesn't allow us to use the same
     * glob pattern in more than one mimetype.
     *
     * @throws Exception if a sample document cannot be read or detected
     */
    @Test
    public void testDetectStarOfficeFiles() throws Exception {
        assertType("testStarOffice-5.2-calc.sdc",
                "application/vnd.stardivision.calc",
                "application/vnd.stardivision.calc");
        assertType("testVORCalcTemplate.vor",
                "application/vnd.stardivision.calc",
                "application/vnd.stardivision.calc");
        assertType("testStarOffice-5.2-draw.sda",
                "application/vnd.stardivision.draw",
                "application/vnd.stardivision.draw");
        assertType("testVORDrawTemplate.vor",
                "application/vnd.stardivision.draw",
                "application/vnd.stardivision.draw");
        assertType("testStarOffice-5.2-impress.sdd",
                "application/vnd.stardivision.impress",
                "application/vnd.stardivision.impress");
        assertType("testVORImpressTemplate.vor",
                "application/vnd.stardivision.impress",
                "application/vnd.stardivision.impress");
        assertType("testStarOffice-5.2-writer.sdw",
                "application/vnd.stardivision.writer",
                "application/vnd.stardivision.writer");
        assertType("testVORWriterTemplate.vor",
                "application/vnd.stardivision.writer",
                "application/vnd.stardivision.writer");
    }

    /**
     * Checks that the stream has no open container before detection, and
     * holds an {@link NPOIFSFileSystem} as its open container afterwards.
     */
    @Test
    public void testOpenContainer() throws Exception {
        try (TikaInputStream stream = openTestDocument("testPPT.ppt")) {
            assertNull(stream.getOpenContainer());
            assertEquals(
                    MediaType.parse("application/vnd.ms-powerpoint"),
                    detector.detect(stream, new Metadata()));
            assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem);
        }
    }

    /**
     * EPub uses a similar mimetype entry to OpenDocument for storing
     * the mimetype within the parent zip file
     */
    @Test
    public void testDetectEPub() throws Exception {
        assertTypeByData("testEPUB.epub", "application/epub+zip");
        assertTypeByData("testiBooks.ibooks", "application/x-ibooks+zip");
    }

    @Test
    public void testDetectLotusNotesEml() throws Exception {
        // Lotus .eml files aren't guaranteed to have any of the magic
        // matches as the first line, but should have X-Notes-Item and Message-ID
        assertTypeByData("testLotusEml.eml", "message/rfc822");
    }

    @Test
    public void testDetectODF() throws Exception {
        assertTypeByData("testODFwithOOo3.odt", "application/vnd.oasis.opendocument.text");
        assertTypeByData("testOpenOffice2.odf", "application/vnd.oasis.opendocument.formula");
    }

    /**
     * OOXML formats, by data only, by matching name, and with a
     * misleading name (where the data is expected to win).
     */
    @Test
    public void testDetectOOXML() throws Exception {
        assertTypeByData("testEXCEL.xlsx",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
        assertTypeByData("testWORD.docx",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
        assertTypeByData("testPPT.pptx",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation");

        // Check some of the less common OOXML types
        assertTypeByData("testPPT.pptm",
                "application/vnd.ms-powerpoint.presentation.macroenabled.12");
        assertTypeByData("testPPT.ppsx",
                "application/vnd.openxmlformats-officedocument.presentationml.slideshow");
        assertTypeByData("testPPT.ppsm",
                "application/vnd.ms-powerpoint.slideshow.macroEnabled.12");
        assertTypeByData("testDOTM.dotm",
                "application/vnd.ms-word.template.macroEnabled.12");
        assertTypeByData("testEXCEL.strict.xlsx",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
        assertTypeByData("testPPT.xps",
                "application/vnd.ms-xpsdocument");

        assertTypeByData("testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.12");
        assertTypeByData("testVISIO.vsdx", "application/vnd.ms-visio.drawing");
        assertTypeByData("testVISIO.vssm", "application/vnd.ms-visio.stencil.macroenabled.12");
        assertTypeByData("testVISIO.vssx", "application/vnd.ms-visio.stencil");
        assertTypeByData("testVISIO.vstm", "application/vnd.ms-visio.template.macroenabled.12");
        assertTypeByData("testVISIO.vstx", "application/vnd.ms-visio.template");

        // .xlsb is an OOXML file containing the binary parts, and not
        // an OLE2 file as you might initially expect!
        assertTypeByData("testEXCEL.xlsb",
                "application/vnd.ms-excel.sheet.binary.macroEnabled.12");

        // With the filename and data
        assertTypeByNameAndData("testEXCEL.xlsx",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
        assertTypeByNameAndData("testWORD.docx",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
        assertTypeByNameAndData("testPPT.pptx",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation");

        // With the wrong filename supplied, data will trump filename
        assertTypeByNameAndData("testEXCEL.xlsx", "notWord.docx",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
        assertTypeByNameAndData("testWORD.docx", "notExcel.xlsx",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
        assertTypeByNameAndData("testPPT.pptx", "notWord.docx",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation");

        // With an incorrect filename of a different container type, data trumps filename
        assertTypeByNameAndData("testEXCEL.xlsx", "notOldExcel.xls",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    }

    /**
     * Password Protected OLE2 files are fairly straightforward to detect, as they
     * have the same structure as regular OLE2 files. (Core streams may be encrypted
     * however)
     */
    @Test
    public void testDetectProtectedOLE2() throws Exception {
        assertTypeByData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel");
        assertTypeByData("testWORD_protected_passtika.doc", "application/msword");
        assertTypeByData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint");
        assertTypeByNameAndData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel");
        assertTypeByNameAndData("testWORD_protected_passtika.doc", "application/msword");
        assertTypeByNameAndData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint");
    }

    /**
     * Password Protected OOXML files are much more tricky beasts to work with.
     * They have a very different structure to regular OOXML files, and instead
     * of being ZIP based they are actually an OLE2 file which contains the
     * OOXML structure within an encrypted stream.
     * This makes detecting them much harder...
     */
    @Test
    public void testDetectProtectedOOXML() throws Exception {
        // Encrypted Microsoft Office OOXML files have OLE magic but
        // special streams, so we can tell they're Protected OOXML
        assertTypeByData("testEXCEL_protected_passtika.xlsx",
                "application/x-tika-ooxml-protected");
        assertTypeByData("testWORD_protected_passtika.docx",
                "application/x-tika-ooxml-protected");
        assertTypeByData("testPPT_protected_passtika.pptx",
                "application/x-tika-ooxml-protected");

        // At the moment, we can't use the name to specialise
        // See discussions on TIKA-790 for details
        assertTypeByNameAndData("testEXCEL_protected_passtika.xlsx",
                "application/x-tika-ooxml-protected");
        assertTypeByNameAndData("testWORD_protected_passtika.docx",
                "application/x-tika-ooxml-protected");
        assertTypeByNameAndData("testPPT_protected_passtika.pptx",
                "application/x-tika-ooxml-protected");
    }

    /**
     * Check that temporary files created by Tika are removed after
     * closing TikaInputStream.
     */
    @Test
    public void testRemovalTempfiles() throws Exception {
        assertRemovalTempfiles("testWORD.docx");
        assertRemovalTempfiles("test-documents.zip");
    }

    /**
     * Counts the entries in the {@code java.io.tmpdir} directory whose name
     * starts with {@code apache-tika-}.
     * <p>
     * {@link File#listFiles(FilenameFilter)} returns {@code null} when the
     * directory cannot be listed; that is reported as a test failure with a
     * clear message instead of a {@link NullPointerException}.
     *
     * @return the number of matching entries
     */
    private int countTemporaryFiles() {
        File tmpDir = new File(System.getProperty("java.io.tmpdir"));
        File[] tikaTempFiles = tmpDir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.startsWith("apache-tika-");
            }
        });
        assertNotNull("Unable to list temporary directory: " + tmpDir, tikaTempFiles);
        return tikaTempFiles.length;
    }

    /**
     * Runs detection on the given document and asserts that the number of
     * Tika temporary files is the same after the stream is closed as before
     * it was opened.
     *
     * @param fileName sample document to detect
     */
    private void assertRemovalTempfiles(String fileName) throws Exception {
        int numberOfTempFiles = countTemporaryFiles();

        try (TikaInputStream stream = openTestDocument(fileName)) {
            detector.detect(stream, new Metadata());
        }

        assertEquals(numberOfTempFiles, countTemporaryFiles());
    }

    @Test
    public void testDetectIWork() throws Exception {
        assertTypeByData("testKeynote.key", "application/vnd.apple.keynote");
        assertTypeByData("testNumbers.numbers", "application/vnd.apple.numbers");
        assertTypeByData("testPages.pages", "application/vnd.apple.pages");
    }

    /**
     * iWork 2013 documents. Only the Keynote sample is expected to come back
     * with a specific type here; the Numbers and Pages samples are expected
     * to be reported as {@code UNKNOWN13}.
     */
    @Test
    public void testDetectIWork2013() throws Exception {
        assertTypeByData("testKeynote2013.key",
                IWork13PackageParser.IWork13DocumentType.KEYNOTE13.getType().toString());
        assertTypeByData("testNumbers2013.numbers",
                IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType().toString());
        assertTypeByData("testPages2013.pages",
                IWork13PackageParser.IWork13DocumentType.UNKNOWN13.getType().toString());
    }

    @Test
    public void testDetectKMZ() throws Exception {
        assertTypeByData("testKMZ.kmz", "application/vnd.google-earth.kmz");
    }

    @Test
    public void testDetectIPA() throws Exception {
        assertTypeByNameAndData("testIPA.ipa", "application/x-itunes-ipa");
        assertTypeByData("testIPA.ipa", "application/x-itunes-ipa");
    }

    @Test
    public void testASiC() throws Exception {
        assertTypeByData("testASiCE.asice", "application/vnd.etsi.asic-e+zip");
        assertTypeByData("testASiCS.asics", "application/vnd.etsi.asic-s+zip");
        assertTypeByNameAndData("testASiCE.asice", "application/vnd.etsi.asic-e+zip");
        assertTypeByNameAndData("testASiCS.asics", "application/vnd.etsi.asic-s+zip");
    }

    @Test
    public void testDetectZip() throws Exception {
        assertTypeByData("test-documents.zip", "application/zip");
        assertTypeByData("test-zip-of-zip.zip", "application/zip");

        // JAR based formats
        assertTypeByData("testJAR.jar", "application/java-archive");
        assertTypeByData("testWAR.war", "application/x-tika-java-web-archive");
        assertTypeByData("testEAR.ear", "application/x-tika-java-enterprise-archive");
        assertTypeByData("testAPK.apk", "application/vnd.android.package-archive");

        // JAR with HTML files in it
        assertTypeByNameAndData("testJAR_with_HTML.jar", "testJAR_with_HTML.jar",
                "application/java-archive", "application/java-archive");
    }

    @Test
    public void testTarWithNoMagic() throws Exception {
        assertTypeByData("testTAR_no_magic.tar", "application/x-tar");
    }

    @Test
    public void testLZMAOOM() throws Exception {
        assertTypeByData("testLZMA_oom", "application/x-lzma");
    }

    @Test
    public void testCompressOOM() throws Exception {
        assertTypeByData("testZ_oom.Z", "application/x-compress");
    }

    /**
     * Reads exactly the first {@code n} bytes of a sample document and wraps
     * them in an in-memory {@link TikaInputStream}.
     *
     * @param name sample document, relative to the test-documents folder
     * @param n    number of leading bytes to keep
     * @return a stream over the first {@code n} bytes; the caller must close it
     * @throws IOException if the document holds fewer than {@code n} bytes
     *                     or cannot be read
     */
    private TikaInputStream getTruncatedFile(String name, int n) throws IOException {
        try (InputStream input = TestContainerAwareDetector.class.getResourceAsStream(
                TEST_DOCUMENTS + name)) {
            // getResourceAsStream returns null for a missing resource; fail clearly
            assertNotNull("Test document not found on classpath: " + TEST_DOCUMENTS + name, input);

            byte[] bytes = new byte[n];
            int m = 0;
            // A single read may return fewer bytes than asked for, so loop until full
            while (m < bytes.length) {
                int i = input.read(bytes, m, bytes.length - m);
                if (i != -1) {
                    m += i;
                } else {
                    throw new IOException("Unexpected end of stream");
                }
            }
            return TikaInputStream.get(bytes);
        }
    }

    /**
     * Truncated container files: with the data alone only the generic
     * container type is expected, while supplying the file name as well is
     * expected to yield the specific type.
     */
    @Test
    public void testTruncatedFiles() throws Exception {
        // First up a truncated OOXML (zip) file

        // With only the data supplied, the best we can do is the container
        Metadata m = new Metadata();
        try (TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300)) {
            assertEquals(
                    MediaType.application("x-tika-ooxml"),
                    detector.detect(xlsx, m));
        }

        // With truncated data + filename, we can use the filename to specialise
        m = new Metadata();
        m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
        try (TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300)) {
            assertEquals(
                    MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
                    detector.detect(xlsx, m));
        }

        // Now a truncated OLE2 file
        m = new Metadata();
        try (TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400)) {
            assertEquals(
                    MediaType.application("x-tika-msoffice"),
                    detector.detect(xls, m));
        }

        // Finally a truncated OLE2 file, with a filename available
        m = new Metadata();
        m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xls");
        try (TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400)) {
            assertEquals(
                    MediaType.application("vnd.ms-excel"),
                    detector.detect(xls, m));
        }
    }
}