/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.mime;
import static java.nio.charset.StandardCharsets.UTF_16BE;
import static java.nio.charset.StandardCharsets.UTF_16LE;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Test;
public class MimeDetectionTest {
private MimeTypes mimeTypes;
private MediaTypeRegistry registry;
/** @inheritDoc */
@Before
public void setUp() {
this.mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository();
this.registry = mimeTypes.getMediaTypeRegistry();
}
@Test
public void testDetection() throws Exception {
testFile("image/svg+xml", "circles.svg");
testFile("image/svg+xml", "circles-with-prefix.svg");
testFile("image/png", "datamatrix.png");
testFile("text/html", "test.html");
testFile("application/xml", "test-iso-8859-1.xml");
testFile("application/xml", "test-utf8.xml");
testFile("application/xml", "test-utf8-bom.xml");
testFile("application/xml", "test-utf16le.xml");
testFile("application/xml", "test-utf16be.xml");
testFile("application/xml", "test-long-comment.xml");
testFile("application/xslt+xml", "stylesheet.xsl");
testUrl(
"application/rdf+xml",
"http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
"test-difficult-rdf1.xml");
testUrl(
"application/rdf+xml",
"http://www.w3.org/2002/07/owl#",
"test-difficult-rdf2.xml");
// add evil test from TIKA-327
testFile("text/html", "test-tika-327.html");
// add another evil html test from TIKA-357
testFile("text/html", "testlargerbuffer.html");
// test fragment of HTML with <div> (TIKA-1102)
testFile("text/html", "htmlfragment");
// test binary CGM detection (TIKA-1170)
testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
// test HTML detection of malformed file, previously identified as image/cgm (TIKA-1170)
testFile("text/html", "test-malformed-header.html.bin");
//test GCMD Directory Interchange Format (.dif) TIKA-1561
testFile("application/dif+xml", "brwNIMS_2014.dif");
}
@Test
public void testByteOrderMark() throws Exception {
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)),
new Metadata()));
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)),
new Metadata()));
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)),
new Metadata()));
}
@Test
public void testSuperTypes() {
assertTrue(registry.isSpecializationOf(
MediaType.parse("text/something; charset=UTF-8"),
MediaType.parse("text/something")));
assertTrue(registry.isSpecializationOf(
MediaType.parse("text/something; charset=UTF-8"),
MediaType.TEXT_PLAIN));
assertTrue(registry.isSpecializationOf(
MediaType.parse("text/something; charset=UTF-8"),
MediaType.OCTET_STREAM));
assertTrue(registry.isSpecializationOf(
MediaType.parse("text/something"),
MediaType.TEXT_PLAIN));
assertTrue(registry.isSpecializationOf(
MediaType.parse("application/something+xml"),
MediaType.APPLICATION_XML));
assertTrue(registry.isSpecializationOf(
MediaType.parse("application/something+zip"),
MediaType.APPLICATION_ZIP));
assertTrue(registry.isSpecializationOf(
MediaType.APPLICATION_XML,
MediaType.TEXT_PLAIN));
assertTrue(registry.isSpecializationOf(
MediaType.parse("application/vnd.apple.iwork"),
MediaType.APPLICATION_ZIP));
}
@SuppressWarnings("unused")
private void testUrlOnly(String expected, String url) throws IOException{
InputStream in = new URL(url).openStream();
testStream(expected, url, in);
}
private void testUrl(String expected, String url, String file) throws IOException{
InputStream in = getClass().getResourceAsStream(file);
testStream(expected, url, in);
}
private void testFile(String expected, String filename) throws IOException {
InputStream in = getClass().getResourceAsStream(filename);
testStream(expected, filename, in);
}
private void testStream(String expected, String urlOrFileName, InputStream in) throws IOException{
assertNotNull("Test stream: ["+urlOrFileName+"] is null!", in);
if (!in.markSupported()) {
in = new java.io.BufferedInputStream(in);
}
try {
Metadata metadata = new Metadata();
String mime = this.mimeTypes.detect(in, metadata).toString();
assertEquals(urlOrFileName + " is not properly detected: detected.", expected, mime);
//Add resource name and test again
metadata.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
mime = this.mimeTypes.detect(in, metadata).toString();
assertEquals(urlOrFileName + " is not properly detected after adding resource name.", expected, mime);
} finally {
in.close();
}
}
private void assertNotNull(String string, InputStream in) {
// TODO Auto-generated method stub
}
/**
* Test for type detection of empty documents.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
*/
@Test
public void testEmptyDocument() throws IOException {
assertEquals(MediaType.OCTET_STREAM, mimeTypes.detect(
new ByteArrayInputStream(new byte[0]), new Metadata()));
Metadata namehint = new Metadata();
namehint.set(Metadata.RESOURCE_NAME_KEY, "test.txt");
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
new ByteArrayInputStream(new byte[0]), namehint));
Metadata typehint = new Metadata();
typehint.set(Metadata.CONTENT_TYPE, "text/plain");
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
new ByteArrayInputStream(new byte[0]), typehint));
}
/**
* Test for things like javascript files whose content is enclosed in XML
* comment delimiters, but that aren't actually XML.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
*/
@Test
public void testNotXML() throws IOException {
assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(
new ByteArrayInputStream("<!-- test -->".getBytes(UTF_8)),
new Metadata()));
}
/**
* Tests that when we repeatedly test the detection of a document
* that can be detected with Mime Magic, that we consistently
* detect it correctly. See TIKA-391 for more details.
*/
@Test
public void testMimeMagicStability() throws IOException {
for(int i=0; i<100; i++) {
testFile("application/vnd.ms-excel", "test.xls");
}
}
/**
* Tests that when two magic matches both apply, and both
* have the same priority, we use the name to pick the
* right one based on the glob, or the first one we
* come across if not. See TIKA-1292 for more details.
*/
@Test
public void testMimeMagicClashSamePriority() throws IOException {
byte[] helloWorld = "Hello, World!".getBytes(UTF_8);
MediaType helloType = MediaType.parse("hello/world-file");
MediaType helloXType = MediaType.parse("hello/x-world-hello");
Metadata metadata;
// With a filename, picks the right one
metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, "test.hello.world");
assertEquals(helloType, mimeTypes.detect(new ByteArrayInputStream(helloWorld), metadata));
metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, "test.x-hello-world");
assertEquals(helloXType, mimeTypes.detect(new ByteArrayInputStream(helloWorld), metadata));
// Without, goes for the one that sorts last
metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, "testingTESTINGtesting");
assertEquals(helloXType, mimeTypes.detect(new ByteArrayInputStream(helloWorld), metadata));
}
}