/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.mime;
// Junit imports
import static java.nio.charset.StandardCharsets.UTF_16BE;
import static java.nio.charset.StandardCharsets.UTF_16LE;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNotSame;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Test;
/**
*
* Test Suite for the {@link MimeTypes} repository.
*
*/
public class TestMimeTypes {
private Tika tika;
private MimeTypes repo;
private URL u;
private static final File f = new File("/a/b/c/x.pdf");
@Before
public void setUp() throws Exception{
TikaConfig config = TikaConfig.getDefaultConfig();
repo = config.getMimeRepository();
tika = new Tika(config);
u = new URL("http://mydomain.com/x.pdf?x=y");
}
@Test
public void testCaseSensitivity() {
String type = tika.detect("test.PDF");
assertNotNull(type);
assertEquals(type, tika.detect("test.pdf"));
assertEquals(type, tika.detect("test.PdF"));
assertEquals(type, tika.detect("test.pdF"));
}
@Test
public void testNetCDF() throws Exception {
assertTypeByData("application/x-netcdf", "sresa1b_ncar_ccsm3_0_run1_200001.nc");
}
@Test
public void testLoadMimeTypes() throws MimeTypeException {
assertNotNull(repo.forName("application/octet-stream"));
assertNotNull(repo.forName("text/x-tex"));
}
/**
* Tests MIME type determination based solely on the URL's extension.
*/
@Test
public void testGuessMimeTypes() throws Exception {
assertTypeByName("application/pdf", "x.pdf");
assertEquals("application/pdf", tika.detect(u.toExternalForm()));
assertEquals("application/pdf", tika.detect(f.getPath()));
assertTypeByName("text/plain", "x.txt");
assertTypeByName("text/html", "x.htm");
assertTypeByName("text/html", "x.html");
assertTypeByName("application/xhtml+xml", "x.xhtml");
assertTypeByName("application/xml", "x.xml");
assertTypeByName("application/zip", "x.zip");
assertTypeByName("application/vnd.oasis.opendocument.text", "x.odt");
assertTypeByName("application/octet-stream", "x.unknown");
// Test for the MS Office media types and file extensions listed in
// http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx
assertTypeByName("application/msword", "x.doc");
assertTypeByName("application/msword", "x.dot");
assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "x.docx");
assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.template", "x.dotx");
assertTypeByName("application/vnd.ms-word.document.macroenabled.12", "x.docm");
assertTypeByName("application/vnd.ms-word.template.macroenabled.12", "x.dotm");
assertTypeByName("application/vnd.ms-excel", "x.xls");
assertTypeByName("application/vnd.ms-excel", "x.xlt");
assertTypeByName("application/vnd.ms-excel", "x.xla");
assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "x.xlsx");
assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.template", "x.xltx");
assertTypeByName("application/vnd.ms-excel.sheet.macroenabled.12", "x.xlsm");
assertTypeByName("application/vnd.ms-excel.template.macroenabled.12", "x.xltm");
assertTypeByName("application/vnd.ms-excel.addin.macroenabled.12", "x.xlam");
assertTypeByName("application/vnd.ms-excel.sheet.binary.macroenabled.12", "x.xlsb");
assertTypeByName("application/vnd.ms-powerpoint", "x.ppt");
assertTypeByName("application/vnd.ms-powerpoint", "x.pot");
assertTypeByName("application/vnd.ms-powerpoint", "x.pps");
assertTypeByName("application/vnd.ms-powerpoint", "x.ppa");
assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.presentation", "x.pptx");
assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.template", "x.potx");
assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.slideshow", "x.ppsx");
assertTypeByName("application/vnd.ms-powerpoint.addin.macroenabled.12", "x.ppam");
assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", "x.pptm");
assertTypeByName("application/vnd.ms-powerpoint.template.macroenabled.12", "x.potm");
assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "x.ppsm");
}
/**
* Note - detecting container formats by mime magic is very very
* iffy, as we can't be sure where things will end up.
* People really ought to use the container aware detection...
*/
@Test
public void testOLE2Detection() throws Exception {
// These have the properties block near the start, so our mime
// magic will spot them
assertTypeByData("application/vnd.ms-excel", "testEXCEL.xls");
// This one quite legitimately doesn't have its properties block
// as one of the first couple of entries
// As such, our mime magic can't figure it out...
assertTypeByData("application/x-tika-msoffice", "testWORD.doc");
assertTypeByData("application/x-tika-msoffice", "testPPT.ppt");
// By name + data:
// Those we got right to start with are fine
assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL.xls");
// And the name lets us specialise the generic OOXML
// ones to their actual type
assertTypeByNameAndData("application/vnd.ms-powerpoint", "testPPT.ppt");
assertTypeByNameAndData("application/msword", "testWORD.doc");
}
/**
* Files generated by Works 7.0 Spreadsheet application use the OLE2
* structure and resemble Excel files (they contain a "Workbook"). They are
* not Excel though. They are distinguished from Excel files with an
* additional top-level entry in below the root of the POI filesystem.
*
* @throws Exception
*/
@Test
public void testWorksSpreadsheetDetection() throws Exception {
assertTypeDetection("testWORKSSpreadsheet7.0.xlr",
// with name-only, everything should be all right
"application/x-tika-msworks-spreadsheet",
// this is possible due to MimeTypes guessing the type
// based on the WksSSWorkBook near the beginning of the
// file
"application/x-tika-msworks-spreadsheet",
// this is right, the magic-based detection works, there is
// no need for the name-based detection to refine it
"application/x-tika-msworks-spreadsheet");
}
@Test
public void testHFATypes() throws Exception {
assertTypeByData("application/x-erdas-hfa","testHFA.hfa");
}
@Test
public void testStarOfficeDetection() throws Exception {
assertTypeDetection("testVORCalcTemplate.vor",
"application/x-staroffice-template",
"application/vnd.stardivision.calc",
"application/vnd.stardivision.calc");
assertTypeDetection("testVORDrawTemplate.vor",
"application/x-staroffice-template",
"application/vnd.stardivision.draw",
"application/vnd.stardivision.draw");
assertTypeDetection("testVORImpressTemplate.vor",
"application/x-staroffice-template",
"application/vnd.stardivision.impress",
"application/vnd.stardivision.impress");
assertTypeDetection("testVORWriterTemplate.vor",
"application/x-staroffice-template",
"application/vnd.stardivision.writer",
"application/vnd.stardivision.writer");
assertTypeDetection("testStarOffice-5.2-calc.sdc",
"application/vnd.stardivision.calc",
"application/vnd.stardivision.calc",
"application/vnd.stardivision.calc");
assertTypeDetection("testStarOffice-5.2-draw.sda",
"application/vnd.stardivision.draw",
"application/vnd.stardivision.draw",
"application/vnd.stardivision.draw");
assertTypeDetection("testStarOffice-5.2-impress.sdd",
"application/vnd.stardivision.impress",
"application/vnd.stardivision.impress",
"application/vnd.stardivision.impress");
assertTypeDetection("testStarOffice-5.2-writer.sdw",
"application/vnd.stardivision.writer",
"application/vnd.stardivision.writer",
"application/vnd.stardivision.writer");
}
/**
* Files generated by Works Word Processor versions 3.0 and 4.0 use the
* OLE2 structure. They don't resemble Word though.
*
* @throws Exception
*/
@Test
public void testOldWorksWordProcessorDetection() throws Exception {
assertTypeDetection(
"testWORKSWordProcessor3.0.wps",
// .wps is just like any other works extension
"application/vnd.ms-works",
// this is due to MatOST substring
"application/vnd.ms-works",
// magic-based detection works, no need to refine it
"application/vnd.ms-works");
// files in version 4.0 are no different from those in version 3.0
assertTypeDetection(
"testWORKSWordProcessor4.0.wps",
"application/vnd.ms-works",
"application/vnd.ms-works",
"application/vnd.ms-works");
}
/**
* Files from Excel 2 through 4 are based on the BIFF record
* structure, but without a wrapping OLE2 structure.
* Excel 5 and Excel 95+ work on OLE2
*/
@Test
public void testOldExcel() throws Exception {
// With just a name, we'll think everything's a new Excel file
assertTypeByName("application/vnd.ms-excel","testEXCEL_4.xls");
assertTypeByName("application/vnd.ms-excel","testEXCEL_5.xls");
assertTypeByName("application/vnd.ms-excel","testEXCEL_95.xls");
// With data, we can work out if it's old or new style
assertTypeByData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls");
assertTypeByData("application/x-tika-msoffice","testEXCEL_5.xls");
assertTypeByData("application/x-tika-msoffice","testEXCEL_95.xls");
assertTypeByNameAndData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls");
assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_5.xls");
assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_95.xls");
}
/**
* Note - detecting container formats by mime magic is very very
* iffy, as we can't be sure where things will end up.
* People really ought to use the container aware detection...
*/
@Test
public void testOoxmlDetection() throws Exception {
// These two do luckily have [Content_Types].xml near the start,
// so our mime magic will spot them
assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx");
assertTypeByData("application/x-tika-ooxml", "testPPT.pptx");
// This one quite legitimately doesn't have its [Content_Types].xml
// file as one of the first couple of entries
// As such, our mime magic can't figure it out...
assertTypeByData("application/zip", "testWORD.docx");
// POI-generated files have the rels first not Content Types
assertTypeByData("application/x-tika-ooxml", "testEXCEL_poi.xlsx");
// If we give the filename as well as the data, we can
// specialise the ooxml generic one to the correct type
assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL.xlsx");
assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation", "testPPT.pptx");
assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "testWORD.docx");
assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL_poi.xlsx");
// Test a few of the less usual ones
assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12","testEXCEL.xlsb");
assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12", "testPPT.pptm");
assertTypeByNameAndData("application/vnd.ms-powerpoint.template.macroenabled.12", "testPPT.potm");
assertTypeByNameAndData("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "testPPT.ppsm");
}
/**
* Note - container based formats, needs container detection
* to be properly correct
*/
@Test
public void testVisioDetection() throws Exception {
// By Name, should get it right
assertTypeByName("application/vnd.visio", "testVISIO.vsd");
assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
assertTypeByName("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
assertTypeByName("application/vnd.ms-visio.stencil", "testVISIO.vssx");
assertTypeByName("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
assertTypeByName("application/vnd.ms-visio.template", "testVISIO.vstx");
// By Name and Data, should get it right
assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd");
assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
assertTypeByNameAndData("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
assertTypeByNameAndData("application/vnd.ms-visio.stencil", "testVISIO.vssx");
assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
assertTypeByNameAndData("application/vnd.ms-visio.template", "testVISIO.vstx");
// By Data only, will get the container parent
assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd");
assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdm");
assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdx");
assertTypeByData("application/x-tika-ooxml", "testVISIO.vssm");
assertTypeByData("application/x-tika-ooxml", "testVISIO.vssx");
assertTypeByData("application/x-tika-ooxml", "testVISIO.vstm");
assertTypeByData("application/x-tika-ooxml", "testVISIO.vstx");
}
/**
* Note - detecting container formats by mime magic is very very
* iffy, as we can't be sure where things will end up.
* People really ought to use the container aware detection...
*/
@Test
public void testIWorkDetection() throws Exception {
// By name is easy
assertTypeByName("application/vnd.apple.keynote", "testKeynote.key");
assertTypeByName("application/vnd.apple.numbers", "testNumbers.numbers");
assertTypeByName("application/vnd.apple.pages", "testPages.pages");
// We can't do it by data, as we'd need to unpack
// the zip file to check the XML
assertTypeByData("application/zip", "testKeynote.key");
assertTypeByNameAndData("application/vnd.apple.keynote", "testKeynote.key");
assertTypeByNameAndData("application/vnd.apple.numbers", "testNumbers.numbers");
assertTypeByNameAndData("application/vnd.apple.pages", "testPages.pages");
}
@Test
public void testArchiveDetection() throws Exception {
assertTypeByName("application/x-archive", "test.ar");
assertTypeByName("application/zip", "test.zip");
assertTypeByName("application/x-tar", "test.tar");
assertTypeByName("application/gzip", "test.tgz"); // See GZIP, not tar contents of it
assertTypeByName("application/x-cpio", "test.cpio");
assertTypeByName("application/vnd.ms-cab-compressed", "test.cab");
// TODO Add an example .deb and .udeb, then check these
// Check the mime magic patterns for them work too
assertTypeByData("application/x-archive", "testARofText.ar");
assertTypeByData("application/x-archive", "testARofSND.ar");
assertTypeByData("application/zip", "test-documents.zip");
assertTypeByData("application/x-gtar", "test-documents.tar"); // GNU TAR
assertTypeByData("application/gzip", "test-documents.tgz"); // See GZIP, not tar contents of it
assertTypeByData("application/x-cpio", "test-documents.cpio");
assertTypeByData("application/vnd.ms-cab-compressed", "test-documents.cab");
// For spanned zip files, the .zip file doesn't have the header, it's the other parts
assertTypeByData("application/octet-stream", "test-documents-spanned.zip");
assertTypeByData("application/zip", "test-documents-spanned.z01");
}
@Test
public void testFeedsDetection() throws Exception {
assertType("application/rss+xml", "rsstest.rss");
assertType("application/atom+xml", "testATOM.atom");
assertTypeByData("application/rss+xml", "rsstest.rss");
assertTypeByName("application/rss+xml", "rsstest.rss");
assertTypeByData("application/atom+xml", "testATOM.atom");
assertTypeByName("application/atom+xml", "testATOM.atom");
}
@Test
public void testFitsDetection() throws Exception {
// FITS image created using imagemagick convert of testJPEG.jpg
assertType("application/fits", "testFITS.fits");
assertTypeByData("application/fits", "testFITS.fits");
assertTypeByName("application/fits", "testFITS.fits");
// Shorter Header pattern (16 rather than 20 spaces)
assertTypeByData("application/fits", "testFITS_ShorterHeader.fits");
}
@Test
public void testJpegDetection() throws Exception {
assertType("image/jpeg", "testJPEG.jpg");
assertTypeByData("image/jpeg", "testJPEG.jpg");
assertTypeByName("image/jpeg", "x.jpg");
assertTypeByName("image/jpeg", "x.JPG");
assertTypeByName("image/jpeg", "x.jpeg");
assertTypeByName("image/jpeg", "x.JPEG");
assertTypeByName("image/jpeg", "x.jpe");
assertTypeByName("image/jpeg", "x.jif");
assertTypeByName("image/jpeg", "x.jfif");
assertTypeByName("image/jpeg", "x.jfi");
assertType("image/jp2", "testJPEG.jp2");
assertTypeByData("image/jp2", "testJPEG.jp2");
assertTypeByName("image/jp2", "x.jp2");
}
@Test
public void testBpgDetection() throws Exception {
assertType("image/x-bpg", "testBPG.bpg");
assertTypeByData("image/x-bpg", "testBPG.bpg");
assertTypeByData("image/x-bpg", "testBPG_commented.bpg");
assertTypeByName("image/x-bpg", "x.bpg");
}
@Test
public void testIcnsDetection() throws Exception {
assertType("image/icns", "testICNS.icns");
assertTypeByData("image/icns", "testICNS_basic.icns");
assertTypeByData("image/icns", "testICNS.icns");
assertTypeByName("image/icns", "testICNS.icns");
}
@Test
public void testTiffDetection() throws Exception {
assertType("image/tiff", "testTIFF.tif");
assertTypeByData("image/tiff", "testTIFF.tif");
assertTypeByName("image/tiff", "x.tiff");
assertTypeByName("image/tiff", "x.tif");
assertTypeByName("image/tiff", "x.TIF");
}
@Test
public void testGifDetection() throws Exception {
assertType("image/gif", "testGIF.gif");
assertTypeByData("image/gif", "testGIF.gif");
assertTypeByName("image/gif", "x.gif");
assertTypeByName("image/gif", "x.GIF");
}
@Test
public void testPngDetection() throws Exception {
assertType("image/png", "testPNG.png");
assertTypeByData("image/png", "testPNG.png");
assertTypeByName("image/png", "x.png");
assertTypeByName("image/png", "x.PNG");
}
@Test
public void testWEBPDetection() throws Exception {
assertType("image/webp", "testWEBP.webp");
assertTypeByData("image/webp", "testWEBP.webp");
assertTypeByName("image/webp", "x.webp");
assertTypeByName("image/webp", "x.WEBP");
}
@Test
public void testBmpDetection() throws Exception {
assertType("image/bmp", "testBMP.bmp");
assertTypeByData("image/bmp", "testBMP.bmp");
assertTypeByName("image/bmp", "x.bmp");
assertTypeByName("image/bmp", "x.BMP");
assertTypeByName("image/bmp", "x.dib");
assertTypeByName("image/bmp", "x.DIB");
//false positive check -- contains part of BMP signature
assertType("text/plain", "testBMPfp.txt");
}
@Test
public void testPnmDetection() throws Exception {
assertType("image/x-portable-bitmap", "testPBM.pbm");
assertType("image/x-portable-graymap", "testPGM.pgm");
assertType("image/x-portable-pixmap", "testPPM.ppm");
assertTypeByData("image/x-portable-bitmap", "testPBM.pbm");
assertTypeByData("image/x-portable-graymap", "testPGM.pgm");
assertTypeByData("image/x-portable-pixmap", "testPPM.ppm");
assertTypeByName("image/x-portable-anymap", "x.pnm");
assertTypeByName("image/x-portable-anymap", "x.PNM");
assertTypeByName("image/x-portable-bitmap", "x.pbm");
assertTypeByName("image/x-portable-bitmap", "x.PBM");
assertTypeByName("image/x-portable-graymap", "x.pgm");
assertTypeByName("image/x-portable-graymap", "x.PGM");
assertTypeByName("image/x-portable-pixmap", "x.ppm");
assertTypeByName("image/x-portable-pixmap", "x.PPM");
}
@Test
public void testPictDetection() throws Exception {
assertType("image/x-pict", "testPICT.pct");
assertTypeByData("image/x-pict", "testPICT.pct");
assertTypeByName("image/x-pict", "x.pic");
assertTypeByName("image/x-pict", "x.PCT");
}
@Test
public void testCgmDetection() throws Exception {
// TODO: Need a test image file
assertTypeByName("image/cgm", "x.cgm");
assertTypeByName("image/cgm", "x.CGM");
}
@Test
public void testRdfXmlDetection() throws Exception {
assertTypeByName("application/rdf+xml", "x.rdf");
assertTypeByName("application/rdf+xml", "x.owl");
}
@Test
public void testSvgDetection() throws Exception {
assertType("image/svg+xml", "testSVG.svg");
assertTypeByData("image/svg+xml", "testSVG.svg");
assertTypeByName("image/svg+xml", "x.svg");
assertTypeByName("image/svg+xml", "x.SVG");
// Should *.svgz be svg or gzip
assertType("application/gzip", "testSVG.svgz");
assertTypeByData("application/gzip", "testSVG.svgz");
assertTypeByName("image/svg+xml", "x.svgz");
assertTypeByName("image/svg+xml", "x.SVGZ");
}
@Test
public void testPdfDetection() throws Exception {
// PDF extension by name is enough
assertTypeByName("application/pdf", "x.pdf");
assertTypeByName("application/pdf", "x.PDF");
// For normal PDFs, can get by name or data or both
assertType("application/pdf", "testPDF.pdf");
assertTypeByData("application/pdf", "testPDF.pdf");
// PDF with a BoM works both ways too
assertType("application/pdf", "testPDF_bom.pdf");
assertTypeByData("application/pdf", "testPDF_bom.pdf");
}
@Test
public void testSwfDetection() throws Exception {
assertTypeByName("application/x-shockwave-flash", "x.swf");
assertTypeByName("application/x-shockwave-flash", "x.SWF");
assertTypeByName("application/x-shockwave-flash", "test1.swf");
assertTypeByName("application/x-shockwave-flash", "test2.swf");
assertTypeByName("application/x-shockwave-flash", "test3.swf");
}
@Test
public void testAutoCADDetection() throws Exception {
assertTypeByName("image/vnd.dwg", "x.dwg");
assertTypeByData("image/vnd.dwg", "testDWG2004.dwg");
assertTypeByData("image/vnd.dwg", "testDWG2007.dwg");
assertTypeByData("image/vnd.dwg", "testDWG2010.dwg");
// From name, gets the common parent type
assertTypeByName("model/vnd.dwf", "x.dwf");
// With the data, can work out it's the v6 zip-based flavour
assertTypeByData("model/vnd.dwf; version=6", "testDWF2010.dwf");
// From name, gets the common parent type
assertTypeByName("image/vnd.dxf", "x.dxf");
// With the data, can work out it's the ASCII flavour
assertTypeByData("image/vnd.dxf; format=ascii", "testDXF_ascii.dxf");
// TODO Get a sample Binary DXF file and test
}
@Test
public void testprtDetection() throws Exception {
assertTypeByName("application/x-prt", "x.prt");
assertTypeByData("application/x-prt", "testCADKEY.prt");
}
/**
* Formats which are based on plain text
*/
@Test
public void testTextBasedFormatsDetection() throws Exception {
assertTypeByName("text/plain", "testTXT.txt");
assertType( "text/plain", "testTXT.txt");
assertTypeByName("text/css", "testCSS.css");
assertType( "text/css", "testCSS.css");
assertTypeByName("text/csv", "testCSV.csv");
assertType( "text/csv", "testCSV.csv");
assertTypeByName("text/html", "testHTML.html");
assertType( "text/html", "testHTML.html");
assertTypeByName("application/javascript", "testJS.js");
assertType( "application/javascript", "testJS.js");
}
@Test
public void testJavaDetection() throws Exception {
// TODO Classloader doesn't seem to find the .class file in test-documents
//assertTypeDetection("AutoDetectParser.class", "application/java-vm");
// OSX Native Extension
assertTypeDetection("testJNILIB.jnilib", "application/x-java-jnilib");
}
@Test
public void testXmlAndHtmlDetection() throws Exception {
assertTypeByData("application/xml", "<?xml version=\"1.0\" encoding=\"UTF-8\"?><records><record/></records>"
.getBytes(UTF_8));
assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
.getBytes(UTF_16LE));
assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
.getBytes(UTF_16BE));
assertTypeByData("application/xml", "<!-- XML without processing instructions --><records><record/></records>"
.getBytes(UTF_8));
assertTypeByData("text/html", "<html><body>HTML</body></html>"
.getBytes(UTF_8));
assertTypeByData("text/html", "<!-- HTML comment --><html><body>HTML</body></html>"
.getBytes(UTF_8));
}
@Test
public void testWmfDetection() throws Exception {
assertTypeByName("image/wmf", "x.wmf");
assertTypeByData("image/wmf", "testWMF.wmf");
assertTypeByName("image/wmf", "x.WMF");
assertTypeByName("image/emf", "x.emf");
assertTypeByData("image/emf", "testEMF.emf");
assertTypeByName("image/emf", "x.EMF");
// TODO: Need a test wmz file
assertTypeByName("application/x-ms-wmz", "x.wmz");
assertTypeByName("application/x-ms-wmz", "x.WMZ");
// TODO: Need a test emz file
assertTypeByName("application/gzip", "x.emz");
assertTypeByName("application/gzip", "x.EMZ");
}
@Test
public void testPsDetection() throws Exception {
// TODO: Need a test postscript file
assertTypeByName("application/postscript", "x.ps");
assertTypeByName("application/postscript", "x.PS");
assertTypeByName("application/postscript", "x.eps");
assertTypeByName("application/postscript", "x.epsf");
assertTypeByName("application/postscript", "x.epsi");
}
@Test
public void testMicrosoftMultiMediaDetection() throws Exception {
assertTypeByName("video/x-ms-asf", "x.asf");
assertTypeByName("video/x-ms-wmv", "x.wmv");
assertTypeByName("audio/x-ms-wma", "x.wma");
assertTypeByData("video/x-ms-asf", "testASF.asf");
assertTypeByData("video/x-ms-wmv", "testWMV.wmv");
assertTypeByData("audio/x-ms-wma", "testWMA.wma");
}
/**
* All 3 DITA types are in theory handled by the same mimetype,
* but we specialise them
*/
@Test
public void testDITADetection() throws Exception {
assertTypeByName("application/dita+xml; format=topic", "test.dita");
assertTypeByName("application/dita+xml; format=map", "test.ditamap");
assertTypeByName("application/dita+xml; format=val", "test.ditaval");
assertTypeByData("application/dita+xml; format=task", "testDITA.dita");
assertTypeByData("application/dita+xml; format=concept", "testDITA2.dita");
assertTypeByData("application/dita+xml; format=map", "testDITA.ditamap");
assertTypeByNameAndData("application/dita+xml; format=task", "testDITA.dita");
assertTypeByNameAndData("application/dita+xml; format=concept", "testDITA2.dita");
assertTypeByNameAndData("application/dita+xml; format=map", "testDITA.ditamap");
// These are all children of the official type
assertEquals("application/dita+xml",
repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.ditamap")).toString());
assertEquals("application/dita+xml",
repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.dita")).toString());
// Concept inherits from topic
assertEquals("application/dita+xml; format=topic",
repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA2.dita")).toString());
}
/**
* @since TIKA-194
*/
@Test
public void testJavaRegex() throws Exception{
MimeType testType = new MimeType(MediaType.parse("foo/bar"));
this.repo.add(testType);
assertNotNull(repo.forName("foo/bar"));
String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}";
this.repo.addPattern(testType, pattern, true);
String testFileName = "rtg_sst_grb_0.5.12345678";
assertEquals("foo/bar", tika.detect(testFileName));
MimeType testType2 = new MimeType(MediaType.parse("foo/bar2"));
this.repo.add(testType2);
assertNotNull(repo.forName("foo/bar2"));
this.repo.addPattern(testType2, pattern, false);
assertNotSame("foo/bar2", tika.detect(testFileName));
}
@Test
public void testRawDetection() throws Exception {
assertTypeByName("image/x-raw-adobe", "x.dng");
assertTypeByName("image/x-raw-adobe", "x.DNG");
assertTypeByName("image/x-raw-hasselblad", "x.3fr");
assertTypeByName("image/x-raw-fuji", "x.raf");
assertTypeByName("image/x-raw-canon", "x.crw");
assertTypeByName("image/x-raw-canon", "x.cr2");
assertTypeByName("image/x-raw-kodak", "x.k25");
assertTypeByName("image/x-raw-kodak", "x.kdc");
assertTypeByName("image/x-raw-kodak", "x.dcs");
assertTypeByName("image/x-raw-kodak", "x.drf");
assertTypeByName("image/x-raw-minolta", "x.mrw");
assertTypeByName("image/x-raw-nikon", "x.nef");
assertTypeByName("image/x-raw-nikon", "x.nrw");
assertTypeByName("image/x-raw-olympus", "x.orf");
assertTypeByName("image/x-raw-pentax", "x.ptx");
assertTypeByName("image/x-raw-pentax", "x.pef");
assertTypeByName("image/x-raw-sony", "x.arw");
assertTypeByName("image/x-raw-sony", "x.srf");
assertTypeByName("image/x-raw-sony", "x.sr2");
assertTypeByName("image/x-raw-sigma", "x.x3f");
assertTypeByName("image/x-raw-epson", "x.erf");
assertTypeByName("image/x-raw-mamiya", "x.mef");
assertTypeByName("image/x-raw-leaf", "x.mos");
assertTypeByName("image/x-raw-panasonic", "x.raw");
assertTypeByName("image/x-raw-panasonic", "x.rw2");
assertTypeByName("image/x-raw-phaseone", "x.iiq");
assertTypeByName("image/x-raw-red", "x.r3d");
assertTypeByName("image/x-raw-imacon", "x.fff");
assertTypeByName("image/x-raw-logitech", "x.pxn");
assertTypeByName("image/x-raw-casio", "x.bay");
assertTypeByName("image/x-raw-rawzor", "x.rwz");
}
/**
* Tests that we correctly detect the font types
*/
@Test
public void testFontDetection() throws Exception {
assertTypeByName("application/x-font-adobe-metric", "x.afm");
assertTypeByData("application/x-font-adobe-metric", "testAFM.afm");
assertTypeByName("application/x-font-printer-metric", "x.pfm");
// TODO Get a sample .pfm file
assertTypeByData(
"application/x-font-printer-metric",
new byte[] {0x00, 0x01, 256-0xb1, 0x0a, 0x00, 0x00, 0x43, 0x6f,
0x70, 0x79, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20}
);
assertTypeByName("application/x-font-type1", "x.pfa");
// TODO Get a sample .pfa file
assertTypeByData(
"application/x-font-type1",
new byte[] {0x25, 0x21, 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f,
0x62, 0x65, 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31,
0x2e, 0x30, 0x20, 0x20, 0x2d, 0x2a, 0x2d, 0x20}
);
assertTypeByName("application/x-font-type1", "x.pfb");
// TODO Get a sample .pfm file
assertTypeByData(
"application/x-font-type1",
new byte[] {-0x80, 0x01, 0x09, 0x05, 0x00, 0x00, 0x25, 0x21,
0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f, 0x62, 0x65,
0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31, 0x2e, 0x30 }
);
}
/**
* Tests MimeTypes.getMimeType(URL), which examines both the byte header
* and, if necessary, the URL's extension.
*/
@Test
public void testMimeDeterminationForTestDocuments() throws Exception {
assertType("text/html", "testHTML.html");
assertType("application/zip", "test-documents.zip");
assertType("text/html", "testHTML_utf8.html");
assertType(
"application/vnd.oasis.opendocument.text",
"testOpenOffice2.odt");
assertType("application/pdf", "testPDF.pdf");
assertType("application/rtf", "testRTF.rtf");
assertType("text/plain", "testTXT.txt");
assertType("application/xml", "testXML.xml");
assertType("audio/basic", "testAU.au");
assertType("audio/x-aiff", "testAIFF.aif");
assertType("audio/x-wav", "testWAV.wav");
assertType("audio/midi", "testMID.mid");
assertType("application/x-msaccess", "testACCESS.mdb");
assertType("application/x-font-ttf", "testTrueType3.ttf");
}
@Test
public void test7ZipDetection() throws Exception {
assertTypeByName("application/x-7z-compressed","test-documents.7z");
assertTypeByData("application/x-7z-compressed","test-documents.7z");
assertTypeByNameAndData("application/x-7z-compressed", "test-documents.7z");
}
@Test
public void testWebArchiveDetection() throws Exception {
assertTypeByName("application/x-webarchive","x.webarchive");
assertTypeByData("application/x-bplist","testWEBARCHIVE.webarchive");
assertTypeByNameAndData("application/x-webarchive", "testWEBARCHIVE.webarchive");
}
/**
* KML, and KMZ (zipped KML)
*/
@Test
public void testKMLZDetection() throws Exception {
assertTypeByName("application/vnd.google-earth.kml+xml","testKML.kml");
assertTypeByData("application/vnd.google-earth.kml+xml","testKML.kml");
assertTypeByNameAndData("application/vnd.google-earth.kml+xml", "testKML.kml");
assertTypeByName("application/vnd.google-earth.kmz","testKMZ.kmz");
assertTypeByNameAndData("application/vnd.google-earth.kmz", "testKMZ.kmz");
// By data only, mimetype magic only gets us to a .zip
// We need to use the Zip Aware detector to get the full type
assertTypeByData("application/zip","testKMZ.kmz");
}
@Test
public void testCreativeSuite() throws IOException {
assertTypeDetection("testINDD.indd", "application/x-adobe-indesign");
assertTypeDetection("testPSD.psd", "image/vnd.adobe.photoshop");
}
@Test
public void testAMR() throws IOException {
// AMR matches on name, data or both
assertTypeDetection("testAMR.amr", "audio/amr");
// AMR-WB subtype shares extension, so needs data to identify
assertTypeDetection("testAMR-WB.amr", "audio/amr", "audio/amr-wb", "audio/amr-wb");
// Ditto for the AMR-WB+ subtype, which we don't have a sample file of yet
//assertTypeDetection("testAMR-WB+.amr", "audio/amr", "audio/amr-wb+", "audio/amr-wb+");
}
@Test
public void testEmail() throws IOException {
// EMLX
assertTypeDetection("testEMLX.emlx", "message/x-emlx");
// Groupwise
assertTypeDetection("testGroupWiseEml.eml", "message/rfc822");
// Lotus
assertTypeDetection("testLotusEml.eml", "message/rfc822");
// MBOX
assertTypeDetection("headers.mbox", "application/mbox");
// Thunderbird - doesn't currently work by name
assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
}
@Test
public void testAxCrypt() throws Exception {
// test-TXT.txt encrypted with a key of "tika"
assertTypeDetection("testTXT-tika.axx", "application/x-axcrypt");
}
@Test
public void testWindowsEXE() throws Exception {
assertTypeByName("application/x-msdownload", "x.dll");
assertTypeByName("application/x-ms-installer", "x.msi");
assertTypeByName("application/x-dosexec", "x.exe");
assertTypeByData("application/x-msdownload; format=pe", "testTinyPE.exe");
assertTypeByNameAndData("application/x-msdownload; format=pe", "testTinyPE.exe");
// A jar file with part of a PE header, but not a full one
// should still be detected as a zip or jar (without/with name)
assertTypeByData("application/zip", "testJAR_with_PEHDR.jar");
assertTypeByNameAndData("application/java-archive", "testJAR_with_PEHDR.jar");
}
@Test
public void testMatroskaDetection() throws Exception {
assertType("video/x-matroska", "testMKV.mkv");
// TODO: Need custom detector data detection, see TIKA-1180
assertTypeByData("application/x-matroska", "testMKV.mkv");
assertTypeByNameAndData("video/x-matroska", "testMKV.mkv");
assertTypeByName("video/x-matroska", "x.mkv");
assertTypeByName("video/x-matroska", "x.MKV");
assertTypeByName("audio/x-matroska", "x.mka");
assertTypeByName("audio/x-matroska", "x.MKA");
}
@Test
public void testWebMDetection() throws Exception {
assertType("video/webm", "testWEBM.webm");
// TODO: Need custom detector data detection, see TIKA-1180
assertTypeByData("application/x-matroska", "testWEBM.webm");
assertTypeByNameAndData("video/webm", "testWEBM.webm");
assertTypeByName("video/webm", "x.webm");
assertTypeByName("video/webm", "x.WEBM");
}
/** Test getMimeType(byte[]) */
@Test
public void testGetMimeType_byteArray() throws IOException {
// Plain text detection
assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
assertText(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
assertText(new byte[] { 'a', 'b', 'c' });
assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B });
assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C });
}
@Test
public void testBerkeleyDB() throws IOException {
assertTypeByData(
"application/x-berkeley-db; format=btree; version=2",
"testBDB_btree_2.db");
assertTypeByData(
"application/x-berkeley-db; format=btree; version=3",
"testBDB_btree_3.db");
assertTypeByData(
"application/x-berkeley-db; format=btree; version=4",
"testBDB_btree_4.db");
// V4 and V5 share the same btree format
assertTypeByData(
"application/x-berkeley-db; format=btree; version=4",
"testBDB_btree_5.db");
assertTypeByData(
"application/x-berkeley-db; format=hash; version=2",
"testBDB_hash_2.db");
assertTypeByData(
"application/x-berkeley-db; format=hash; version=3",
"testBDB_hash_3.db");
assertTypeByData(
"application/x-berkeley-db; format=hash; version=4",
"testBDB_hash_4.db");
assertTypeByData(
"application/x-berkeley-db; format=hash; version=5",
"testBDB_hash_5.db");
}
/**
* CBOR typically contains HTML
*/
@Test
public void testCBOR() throws IOException {
assertTypeByNameAndData("application/cbor", "NUTCH-1997.cbor");
assertTypeByData("application/cbor", "NUTCH-1997.cbor");
}
@Test
public void testZLIB() throws IOException {
// ZLIB encoded versions of testTXT.txt
assertTypeByData("application/zlib", "testTXT.zlib");
assertTypeByData("application/zlib", "testTXT.zlib0");
assertTypeByData("application/zlib", "testTXT.zlib5");
assertTypeByData("application/zlib", "testTXT.zlib9");
}
@Test
public void testTextFormats() throws Exception {
assertType("application/x-bibtex-text-file", "testBIBTEX.bib");
assertTypeByData("application/x-bibtex-text-file", "testBIBTEX.bib");
}
@Test
public void testCodeFormats() throws Exception {
assertType("text/x-csrc", "testC.c");
assertType("text/x-chdr", "testH.h");
assertTypeByData("text/x-csrc", "testC.c");
assertTypeByData("text/x-chdr", "testH.h");
assertTypeByName("text/x-java-source", "testJAVA.java");
assertType("text/x-java-properties", "testJAVAPROPS.properties");
assertType("text/x-matlab", "testMATLAB.m");
assertType("text/x-matlab", "testMATLAB_wtsgaus.m");
assertType("text/x-matlab", "testMATLAB_barcast.m");
assertTypeByData("text/x-matlab", "testMATLAB.m");
assertTypeByData("text/x-matlab", "testMATLAB_wtsgaus.m");
assertTypeByData("text/x-matlab", "testMATLAB_barcast.m");
// By name, or by name+data, gets it as JS
assertTypeByName("application/javascript", "testJS.js");
assertTypeByName("application/javascript", "testJS_HTML.js");
assertType("application/javascript", "testJS.js");
assertType("application/javascript", "testJS_HTML.js");
// With data only, because we have no JS file magic, can't be
// detected. One will come through as plain text, the other
// as HTML due to <html> in it. TODO Add JS magic. See TIKA-1141
//assertTypeByData("application/javascript", "testJS.js");
//assertTypeByData("application/javascript", "testJS_HTML.js");
}
@Test
public void testWebVTT() throws Exception {
// With the most common text header
assertType("text/vtt", "testWebVTT.vtt");
assertTypeByData("text/vtt", "testWebVTT.vtt");
// With no text header, just plain WebVTT one
assertType("text/vtt", "testWebVTT_simple.vtt");
assertTypeByData("text/vtt", "testWebVTT_simple.vtt");
// With a custom text header
assertType("text/vtt", "testWebVTT_header.vtt");
assertTypeByData("text/vtt", "testWebVTT_header.vtt");
}
@Test
public void testMIF() throws Exception {
assertType("application/vnd.mif", "testMIF.mif");
assertTypeByData("application/vnd.mif", "testMIF.mif");
}
@Test
public void testPKCSSignatures() throws Exception {
// PKCS7 Signed XML files
assertType("application/pkcs7-signature", "testPKCS17Sig.xml.p7m");
assertType("application/pkcs7-signature", "testPKCS17Sig-v2.xml.p7m");
assertType("application/pkcs7-signature", "testPKCS17Sig-v3.xml.p7m");
assertType("application/pkcs7-signature", "testPKCS17Sig-v4.xml.p7m");
assertTypeByData("application/pkcs7-signature", "testPKCS17Sig.xml.p7m");
assertTypeByData("application/pkcs7-signature", "testPKCS17Sig-v2.xml.p7m");
assertTypeByData("application/pkcs7-signature", "testPKCS17Sig-v3.xml.p7m");
assertTypeByData("application/pkcs7-signature", "testPKCS17Sig-v4.xml.p7m");
}
@Test
public void testVandICalendars() throws Exception {
assertType("text/calendar", "testICalendar.ics");
assertType("text/x-vcalendar", "testVCalendar.vcs");
assertTypeByData("text/calendar", "testICalendar.ics");
assertTypeByData("text/x-vcalendar", "testVCalendar.vcs");
}
@Test
public void testASX() throws Exception {
assertType("application/x-ms-asx", "testWindowsMediaMeta.asx");
assertTypeByData("application/x-ms-asx", "testWindowsMediaMeta.asx");
}
@Test
public void testMSOwner() throws Exception {
assertType("application/x-ms-owner", "testMSOwnerFile");
}
@Test
public void testDJVU() throws Exception {
assertType("image/vnd.djvu", "testDJVU.djvu");
assertTypeByData("image/vnd.djvu", "testDJVU.djvu");
}
@Test
public void testEndNoteImport() throws Exception {
assertType("application/x-endnote-refer", "testEndNoteImportFile.enw");
assertTypeByData("application/x-endnote-refer", "testEndNoteImportFile.enw");
}
@Test
public void testStataDTA() throws Exception {
// Filename only gives base type
assertTypeByName("application/x-stata-dta", "testStataDTA.dta");
// With data too, can get specific version
assertTypeByData("application/x-stata-dta; version=13", "testStataDTA.dta");
// Name + data gets specific version as well
assertType("application/x-stata-dta; version=13", "testStataDTA.dta");
}
@Test
public void testOneNote() throws Exception {
// With name or data we can get the full details
assertTypeByName("application/onenote; format=one", "testOneNote.one");
assertTypeByData("application/onenote; format=one", "testOneNote.one");
// TODO Get sample .onetoc2 and .onepkg files
}
@Test
public void testMSWriteFile() throws Exception {
//This file is govdocs1's 746255.doc
assertTypeByName("application/x-mswrite", "testMSWriteFile.wri");
assertTypeByData("application/x-mswrite", "testMSWriteFile.wri");
}
@Test
public void testSASProgramming() throws Exception {
// Data files we have magic for
assertTypeByName("application/x-sas-data-v6", "testSAS.sd2");
assertTypeByData("application/x-sas-data-v6", "testSAS.sd2");
assertTypeByName("application/x-sas-data", "testSAS.sas7bdat");
assertTypeByData("application/x-sas-data", "testSAS.sas7bdat");
assertTypeByName("application/x-sas-xport", "testSAS.xpt");
assertTypeByData("application/x-sas-xport", "testSAS.xpt");
// Programs we don't, so must have mime type to detect
assertTypeByName("application/x-sas", "testSAS.sas");
assertTypeByData("text/plain", "testSAS.sas");
}
private void assertText(byte[] prefix) throws IOException {
assertMagic("text/plain", prefix);
}
private void assertNotText(byte[] prefix) throws IOException {
assertMagic("application/octet-stream", prefix);
}
private void assertMagic(String expected, byte[] prefix) throws IOException {
MediaType type =
repo.detect(new ByteArrayInputStream(prefix), new Metadata());
assertNotNull(type);
assertEquals(expected, type.toString());
}
private void assertType(String expected, String filename) throws Exception {
try (InputStream stream = TestMimeTypes.class.getResourceAsStream(
"/test-documents/" + filename)) {
assertNotNull("Test file not found: " + filename, stream);
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
assertEquals(expected, repo.detect(stream, metadata).toString());
}
}
private void assertTypeByName(String expected, String filename)
throws IOException {
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
assertEquals(expected, repo.detect(null, metadata).toString());
}
private void assertTypeByData(String expected, String filename)
throws IOException {
try (InputStream stream = TestMimeTypes.class.getResourceAsStream(
"/test-documents/" + filename)) {
assertNotNull("Test file not found: " + filename, stream);
Metadata metadata = new Metadata();
assertEquals(expected, repo.detect(stream, metadata).toString());
}
}
private void assertTypeByData(String expected, byte[] data)
throws IOException {
try (InputStream stream = new ByteArrayInputStream(data)) {
Metadata metadata = new Metadata();
assertEquals(expected, repo.detect(stream, metadata).toString());
}
}
private void assertTypeDetection(String filename, String type)
throws IOException {
assertTypeDetection(filename, type, type, type);
}
private void assertTypeDetection(String filename, String byName, String byData,
String byNameAndData) throws IOException {
assertTypeByName(byName, filename);
assertTypeByData(byData, filename);
assertTypeByNameAndData(byNameAndData, filename);
}
private void assertTypeByNameAndData(String expected, String filename)
throws IOException {
assertEquals(expected, getTypeByNameAndData(filename).toString());
}
private MediaType getTypeByNameAndData(String filename) throws IOException {
try (InputStream stream = TestMimeTypes.class.getResourceAsStream(
"/test-documents/" + filename)) {
assertNotNull("Test document not found: " + filename, stream);
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
return repo.detect(stream, metadata);
}
}
}