/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.detect;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.junit.Test;
import static java.nio.charset.StandardCharsets.US_ASCII;
import static java.nio.charset.StandardCharsets.UTF_16BE;
import static java.nio.charset.StandardCharsets.UTF_16LE;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
/**
 * Test cases for the {@link MagicDetector} class.
 * <p>
 * Every test builds a detector for a single media type and then checks which
 * inputs are expected to be reported as that type and which are expected to
 * fall back to {@link MediaType#OCTET_STREAM}.
 */
public class MagicDetectorTest {

    /**
     * A {@code null} input stream is expected to be reported as
     * {@code application/octet-stream}.
     */
    @Test
    public void testDetectNull() throws Exception {
        MediaType html = new MediaType("text", "html");
        Detector detector = new MagicDetector(html, "<html".getBytes(US_ASCII));
        assertEquals(
                MediaType.OCTET_STREAM,
                detector.detect(null, new Metadata()));
    }

    /**
     * A detector built from a pattern only is expected to match the exact,
     * case-sensitive pattern at the very start of the stream.
     */
    @Test
    public void testDetectSimple() throws Exception {
        MediaType html = new MediaType("text", "html");
        Detector detector = new MagicDetector(html, "<html".getBytes(US_ASCII));

        assertDetect(detector, html, "<html");
        assertDetect(detector, html, "<html><head/><body/></html>");
        assertDetect(detector, MediaType.OCTET_STREAM, "<HTML");
        assertDetect(detector, MediaType.OCTET_STREAM, "<?xml?><html");
        assertDetect(detector, MediaType.OCTET_STREAM, " <html");
        assertDetect(detector, MediaType.OCTET_STREAM, "");
    }

    /**
     * With an offset range of 0..64 the pattern is expected to be found at
     * any start offset up to and including 64, but not at offset 65.
     */
    @Test
    public void testDetectOffsetRange() throws Exception {
        MediaType html = new MediaType("text", "html");
        Detector detector = new MagicDetector(
                html, "<html".getBytes(US_ASCII), null, 0, 64);

        assertDetect(detector, html, "<html");
        assertDetect(detector, html, "<html><head/><body/></html>");
        assertDetect(detector, html, "<?xml?><html/>");
        assertDetect(detector, html, "\n <html");
        assertDetect(detector, html, "\u0000<html");
        assertDetect(detector, MediaType.OCTET_STREAM, "<htm");
        assertDetect(detector, MediaType.OCTET_STREAM, " html");
        assertDetect(detector, MediaType.OCTET_STREAM, "<HTML");

        // Last offset inside the range, then the first offset outside of it
        assertDetect(detector, html, padding(64) + "<html");
        assertDetect(detector, MediaType.OCTET_STREAM, padding(65) + "<html");

        assertDetect(detector, MediaType.OCTET_STREAM, "");
    }

    /**
     * Same offset range as {@link #testDetectOffsetRange()}, but with a mask
     * of {@code 0xdf} on the letters so that both upper and lower case
     * spellings of the tag are expected to match.
     */
    @Test
    public void testDetectMask() throws Exception {
        MediaType html = new MediaType("text", "html");
        // Clearing bit 0x20 maps ASCII lower case letters onto upper case
        byte up = (byte) 0xdf;
        Detector detector = new MagicDetector(
                html,
                new byte[] { '<', 'H', 'T', 'M', 'L' },
                new byte[] { (byte) 0xff, up, up, up, up },
                0, 64);

        assertDetect(detector, html, "<html");
        assertDetect(detector, html, "<HTML><head/><body/></html>");
        assertDetect(detector, html, "<?xml?><HtMl/>");
        assertDetect(detector, html, "\n <html");
        assertDetect(detector, html, "\u0000<HTML");
        assertDetect(detector, MediaType.OCTET_STREAM, "<htm");
        assertDetect(detector, MediaType.OCTET_STREAM, " html");

        // The filler must be exactly 64 / 65 characters long so that the tag
        // starts on the last offset inside the range and on the first offset
        // outside of it; a shorter filler would leave both inside the range.
        assertDetect(detector, html, padding(64) + "<html");
        assertDetect(detector, MediaType.OCTET_STREAM, padding(65) + "<html");

        assertDetect(detector, MediaType.OCTET_STREAM, "");
    }

    /**
     * A regular expression allowing at most 144 arbitrary characters before
     * the {@code %PDF-} marker is expected to match with 144 leading
     * characters and to fail with 145.
     */
    @Test
    public void testDetectRegExPDF() throws Exception {
        MediaType pdf = new MediaType("application", "pdf");
        Detector detector = new MagicDetector(
                pdf, "(?s)\\A.{0,144}%PDF-".getBytes(US_ASCII), null, true, 0, 0);

        assertDetect(detector, pdf, "%PDF-1.0");

        // Exactly at the limit of the .{0,144} quantifier, then one past it
        assertDetect(detector, pdf, padding(144) + "%PDF-1.0");
        assertDetect(detector, MediaType.OCTET_STREAM, padding(145) + "%PDF-1.0");

        assertDetect(detector, MediaType.OCTET_STREAM, "");
    }

    /**
     * A regular expression containing greedy {@code .*} parts is expected to
     * match an XHTML document that has a title element.
     */
    @Test
    public void testDetectRegExGreedy() throws Exception {
        String pattern =
                "(?s)\\x3chtml xmlns=\"http://www\\.w3\\.org/1999/xhtml"
                + "\".*\\x3ctitle\\x3e.*\\x3c/title\\x3e";
        MediaType xhtml = new MediaType("application", "xhtml+xml");
        Detector detector = new MagicDetector(xhtml,
                pattern.getBytes(US_ASCII), null,
                true, 0, 8192);

        assertDetect(detector, xhtml,
                "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
                + "<head><title>XHTML test document</title></head>");
    }

    /**
     * The alternations in the pattern list only the all-upper-case and the
     * all-lower-case spelling of each keyword, so a mixed-case doctype is
     * expected not to match.
     */
    @Test
    public void testDetectRegExOptions() throws Exception {
        String pattern =
                "(?s)\\A.{0,1024}\\x3c\\!(?:DOCTYPE|doctype) (?:HTML|html) "
                + "(?:PUBLIC|public) \"-//.{1,16}//(?:DTD|dtd) .{0,64}"
                + "(?:HTML|html) 4\\.01";

        // Upper case keywords
        String data =
                "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\""
                + "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>"
                + "<HEAD><TITLE>HTML document</TITLE></HEAD>"
                + "<BODY><P>Hello world!</BODY></HTML>";

        // Lower case keywords
        String data1 =
                "<!DOCTYPE html PUBLIC \"-//W3C//dtd html 4.01//EN\""
                + "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>"
                + "<HEAD><TITLE>HTML document</TITLE></HEAD>"
                + "<BODY><P>Hello world!</BODY></HTML>";

        // Mixed case keywords, not covered by any alternative in the pattern
        String data2 =
                "<!DoCtYpE hTmL pUbLiC \"-//W3C//dTd HtMl 4.01//EN\""
                + "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>"
                + "<HEAD><TITLE>HTML document</TITLE></HEAD>"
                + "<BODY><P>Hello world!</BODY></HTML>";

        MediaType html = new MediaType("text", "html");
        Detector detector = new MagicDetector(
                html, pattern.getBytes(US_ASCII), null, true, 0, 0);

        assertDetect(detector, html, data);
        assertDetect(detector, html, data1);
        assertDetect(detector, MediaType.OCTET_STREAM, data2);
    }

    /**
     * Detection is expected to succeed even when a bulk read on the stream
     * returns fewer bytes than were requested.
     */
    @Test
    public void testDetectStreamReadProblems() throws Exception {
        byte[] data = "abcdefghijklmnopqrstuvwxyz0123456789".getBytes(US_ASCII);
        MediaType testMT = new MediaType("application", "test");
        Detector detector = new MagicDetector(testMT, data, null, false, 0, 0);

        // Deliberately prevent InputStream.read(...) from reading the entire
        // buffer in one go
        InputStream stream = new RestrictiveInputStream(data);
        assertEquals(testMT, detector.detect(stream, new Metadata()));
    }

    /**
     * Exercises detectors created through {@code MagicDetector.parse} for
     * the "string", "unicodeLE", "unicodeBE" and "stringignorecase" types,
     * all with the offset specification "0:20".
     */
    @Test
    public void testDetectString() throws Exception {
        String data = "abcdEFGhijklmnoPQRstuvwxyz0123456789";
        MediaType testMT = new MediaType("application", "test");
        Detector detector;

        // Check regular String matching
        detector = MagicDetector.parse(testMT, "string", "0:20", "abcd", null);
        assertDetect(detector, testMT, data.getBytes(US_ASCII));
        detector = MagicDetector.parse(testMT, "string", "0:20", "cdEFGh", null);
        assertDetect(detector, testMT, data.getBytes(US_ASCII));

        // Check Little Endian and Big Endian utf-16 strings
        detector = MagicDetector.parse(testMT, "unicodeLE", "0:20", "cdEFGh", null);
        assertDetect(detector, testMT, data.getBytes(UTF_16LE));
        detector = MagicDetector.parse(testMT, "unicodeBE", "0:20", "cdEFGh", null);
        assertDetect(detector, testMT, data.getBytes(UTF_16BE));

        // Check case ignoring String matching
        detector = MagicDetector.parse(testMT, "stringignorecase", "0:20", "BcDeFgHiJKlm", null);
        assertDetect(detector, testMT, data.getBytes(US_ASCII));
    }

    /**
     * Builds a filler string of exactly the requested length. The filler
     * consists of '.' characters only, which do not occur in any of the
     * magic markers placed after it by the tests in this class.
     *
     * @param length number of characters to generate
     * @return a string of {@code length} '.' characters
     */
    private static String padding(int length) {
        StringBuilder filler = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            filler.append('.');
        }
        return filler.toString();
    }

    /**
     * Encodes the given text as US-ASCII and delegates to
     * {@link #assertDetect(Detector, MediaType, byte[])}.
     *
     * @param detector detector under test
     * @param type     media type the detector is expected to report
     * @param data     stream content, encoded as US-ASCII
     */
    private void assertDetect(Detector detector, MediaType type, String data) {
        byte[] bytes = data.getBytes(US_ASCII);
        assertDetect(detector, type, bytes);
    }

    /**
     * Runs the detector over the given bytes, asserts the reported media
     * type, and then asserts that the complete content can still be read
     * from the stream afterwards.
     *
     * @param detector detector under test
     * @param type     media type the detector is expected to report
     * @param bytes    stream content
     */
    private void assertDetect(Detector detector, MediaType type, byte[] bytes) {
        try {
            InputStream stream = new ByteArrayInputStream(bytes);
            assertEquals(type, detector.detect(stream, new Metadata()));

            // Test that the stream has been reset
            for (int i = 0; i < bytes.length; i++) {
                assertEquals(bytes[i], (byte) stream.read());
            }
            assertEquals(-1, stream.read());
        } catch (IOException e) {
            // Keep the exception details visible in the failure report
            fail("Unexpected exception from MagicDetector: " + e);
        }
    }

    /**
     * InputStream class that does not read in all available bytes in
     * one go.
     */
    private static class RestrictiveInputStream extends ByteArrayInputStream {

        RestrictiveInputStream(byte[] buf) {
            super(buf);
        }

        /**
         * Prevent reading the entire len of bytes if requesting more
         * than 10 bytes: such requests are shortened by 10 bytes before
         * being passed on to the superclass.
         */
        @Override
        public int read(byte[] b, int off, int len) {
            if (len > 10) {
                return super.read(b, off, len - 10);
            } else {
                return super.read(b, off, len);
            }
        }
    }
}