/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.mime; import static java.nio.charset.StandardCharsets.US_ASCII; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.ByteArrayInputStream; import java.lang.reflect.Field; import java.util.ArrayList; import java.util.List; import java.util.Set; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.junit.Before; import org.junit.Test; /** * These tests try to ensure that the MimeTypesReader * has correctly processed the mime-types.xml file. * To do this, it tests that various aspects of the * mime-types.xml file have ended up correctly as * globs, matches, magics etc. * * If you make updates to mime-types.xml, then the * checks in this test may no longer hold true. * As such, if tests here start failing after your * changes, please review the test details, and * update it to match the new state of the file! */ public class MimeTypesReaderTest { private MimeTypes mimeTypes; private List<Magic> magics; @SuppressWarnings("unchecked") @Before public void setUp() throws NoSuchFieldException, SecurityException, IllegalArgumentException, IllegalAccessException{ this.mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository(); Field magicsField = mimeTypes.getClass().getDeclaredField("magics"); magicsField.setAccessible(true); magics = (List<Magic>)magicsField.get(mimeTypes); } @Test public void testHtmlMatches() throws Exception { int minMatches = 10; // Check on the type MimeType html = mimeTypes.forName("text/html"); assertTrue(html.hasMagic()); assertTrue( "There should be at least "+minMatches+" HTML matches, found " + html.getMagics().size(), html.getMagics().size() >= minMatches ); // Check on the overall magics List<Magic> htmlMagics = new ArrayList<Magic>(); for(Magic magic : magics) { if(magic.getType().toString().equals("text/html")) { htmlMagics.add(magic); } } assertTrue( "There should be at least "+minMatches+" HTML matches, found " + htmlMagics.size(), htmlMagics.size() >= minMatches ); } @Test public void testExcelMatches() throws Exception { int minMatches = 4; // Check on the type MimeType excel = mimeTypes.forName("application/vnd.ms-excel"); assertTrue(excel.hasMagic()); assertTrue( "There should be at least "+minMatches+" Excel matches, found " + excel.getMagics().size(), excel.getMagics().size() >= minMatches ); // Check on the overall magics List<Magic> excelMagics = new ArrayList<Magic>(); for(Magic magic : magics) { if(magic.getType().toString().equals("application/vnd.ms-excel")) { excelMagics.add(magic); } } assertTrue( "There should be at least "+minMatches+" Excel matches, found " + excelMagics.size(), excelMagics.size() >= minMatches ); } /** * @since TIKA-515 */ @Test public void testReadComment() { try { assertNotNull(this.mimeTypes.forName("application/msword") .getDescription()); } catch (Exception e) { fail(e.getMessage()); } } /** * @since TIKA-1012 */ @Test public void testReadExtendedMetadata() throws Exception { MimeType mime = this.mimeTypes.forName("image/bmp"); assertEquals("BMP", mime.getAcronym()); assertEquals("com.microsoft.bmp", mime.getUniformTypeIdentifier()); assertEquals("http://en.wikipedia.org/wiki/BMP_file_format", mime.getLinks().get(0).toString()); mime = this.mimeTypes.forName("application/xml"); assertEquals("XML", mime.getAcronym()); assertEquals("public.xml", mime.getUniformTypeIdentifier()); assertEquals("http://en.wikipedia.org/wiki/Xml", mime.getLinks().get(0).toString()); } @Test public void testReadParameterHierarchy() throws Exception { MimeType mimeBTree4 = this.mimeTypes.forName("application/x-berkeley-db;format=btree;version=4"); MediaType mtBTree4 = mimeBTree4.getType(); // Canonicalised with spaces assertEquals("application/x-berkeley-db; format=btree; version=4", mimeBTree4.toString()); assertEquals("application/x-berkeley-db; format=btree; version=4", mtBTree4.toString()); // Parent has one parameter MediaType mtBTree = this.mimeTypes.getMediaTypeRegistry().getSupertype(mtBTree4); assertEquals("application/x-berkeley-db; format=btree", mtBTree.toString()); // Parent has several children, for versions 2 through 4 Set<MediaType> mtBTreeChildren = this.mimeTypes.getMediaTypeRegistry().getChildTypes(mtBTree); assertTrue(mtBTreeChildren.toString(), mtBTreeChildren.size() >= 3); assertTrue(mtBTreeChildren.toString(), mtBTreeChildren.contains(mtBTree4)); // Parent of that has none MediaType mtBD = this.mimeTypes.getMediaTypeRegistry().getSupertype(mtBTree); assertEquals("application/x-berkeley-db", mtBD.toString()); // If we use one with parameters not known in the media registry, // getting the parent will return the non-parameter version MediaType mtAlt = MediaType.application("x-berkeley-db; format=unknown; version=42"); MediaType mtAltP = this.mimeTypes.getMediaTypeRegistry().getSupertype(mtAlt); assertEquals("application/x-berkeley-db", mtAltP.toString()); } /** * TIKA-746 Ensures that the custom mimetype maps were also * loaded and used */ @Test public void testCustomMimeTypes() { // Check that it knows about our three special ones String helloWorld = "hello/world"; String helloWorldFile = "hello/world-file"; String helloXWorld = "hello/x-world-hello"; try { assertNotNull(this.mimeTypes.forName(helloWorld)); assertNotNull(this.mimeTypes.forName(helloWorldFile)); assertNotNull(this.mimeTypes.forName(helloXWorld)); } catch (Exception e) { fail(e.getMessage()); } // Check that the details come through as expected try { MimeType hw = this.mimeTypes.forName(helloWorld); MimeType hwf = this.mimeTypes.forName(helloWorldFile); MimeType hxw = this.mimeTypes.forName(helloXWorld); // The parent has no comments, globs, magic etc assertEquals("", hw.getDescription()); assertEquals("", hw.getExtension()); assertEquals(0, hw.getExtensions().size()); assertEquals(0, hw.getMagics().size()); // The file one does assertEquals("A \"Hello World\" file", hwf.getDescription()); assertEquals(".hello.world", hwf.getExtension()); assertEquals(1, hwf.getMagics().size()); // The alternate one has most assertEquals("", hxw.getDescription()); assertEquals(".x-hello-world", hxw.getExtension()); assertEquals(1, hxw.getMagics().size()); // Check that we can correct detect with the file one: // By name Metadata m = new Metadata(); m.add(Metadata.RESOURCE_NAME_KEY, "test.hello.world"); assertEquals(hwf.toString(), this.mimeTypes.detect(null, m).toString()); m = new Metadata(); m.add(Metadata.RESOURCE_NAME_KEY, "test.x-hello-world"); assertEquals(hxw.toString(), this.mimeTypes.detect(null, m).toString()); // By contents - picks the x one as that sorts later m = new Metadata(); ByteArrayInputStream s = new ByteArrayInputStream( "Hello, World!".getBytes(US_ASCII)); assertEquals(hxw.toString(), this.mimeTypes.detect(s, m).toString()); } catch (Exception e) { fail(e.getMessage()); } } @Test public void testGetExtensionForPowerPoint() throws Exception { MimeType mt = this.mimeTypes.forName("application/vnd.ms-powerpoint"); String ext = mt.getExtension(); assertEquals(".ppt",ext); assertEquals(".ppt",mt.getExtensions().get(0)); } @Test public void testGetRegisteredMimesWithParameters() throws Exception { //TIKA-1692 // Media Type always keeps details / parameters String name = "application/xml; charset=UTF-8"; MediaType mt = MediaType.parse(name); assertEquals(name, mt.toString()); // Mime type loses details not in the file MimeType mimeType = this.mimeTypes.getRegisteredMimeType(name); assertEquals("application/xml", mimeType.toString()); assertEquals(".xml", mimeType.getExtension()); // But on well-known parameters stays name = "application/dita+xml; format=map"; mt = MediaType.parse(name); assertEquals(name, mt.toString()); mimeType = this.mimeTypes.getRegisteredMimeType(name); assertEquals(name, mimeType.toString()); assertEquals(".ditamap", mimeType.getExtension()); } }