package org.archive.wayback.replay.mimetype;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.wayback.core.Resource;
import org.archive.wayback.replay.GzipDecodingResource;
import org.archive.wayback.replay.charset.CharsetDetector;
import org.archive.wayback.replay.charset.StandardCharsetDetector;
/**
* Simple {@link MimeTypeDetector} implementation.
* It's ad-hoc and not customizable, just tested against many samples.
*/
public class SimpleMimeTypeDetector implements MimeTypeDetector {
    private static final Logger logger = Logger.getLogger(SimpleMimeTypeDetector.class.getName());

    /**
     * default value for {@code sniffLength}.
     */
    public static final int DEFAULT_SNIFF_LENGTH = 1536;

    /**
     * minimum size of sniffing byte buffer to allocate (to
     * prevent {@code ArrayIndexOutOfBoundsException}.)
     * {@link #detectBinaryTypes(byte[])} reads up to index 6 unconditionally.
     */
    protected static final int MINIMUM_SNIFF_BUFFER_SIZE = 10;

    private int sniffLength = DEFAULT_SNIFF_LENGTH;

    private CharsetDetector charsetDetector = new StandardCharsetDetector();

    /**
     * number of bytes to read from resource.
     * @param sniffLength number of payload bytes to look at
     */
    public void setSniffLength(int sniffLength) {
        this.sniffLength = sniffLength;
    }

    /**
     * @return number of bytes read from resource for sniffing.
     */
    public int getSniffLength() {
        return sniffLength;
    }

    /**
     * {@link CharsetDetector} to use for detecting character encoding.
     * <p>{@link StandardCharsetDetector} is used by default.</p>
     * @param charsetDetector detector to use; {@code null} restores the
     *        default {@link StandardCharsetDetector}
     */
    public void setCharsetDetector(CharsetDetector charsetDetector) {
        if (charsetDetector != null) {
            this.charsetDetector = charsetDetector;
        } else {
            this.charsetDetector = new StandardCharsetDetector();
        }
    }

    /** Generic type returned when content looks binary but the exact format is unknown. */
    private static final String BINARY_FILE = "application/octet-stream";

    /**
     * Return a mimetype if {@code bytes} looks like a beginning of a
     * binary file.
     * <p>Looks for well-known binary format MAGICs. Unicode byte order
     * marks are explicitly treated as "not binary".</p>
     * @param bytes array of bytes; must be at least 7 bytes long
     *        ({@link #peekContent(Resource)} guarantees
     *        {@link #MINIMUM_SNIFF_BUFFER_SIZE}).
     * @return detected mimetype, or {@code null} if no binary
     *   format is detected.
     */
    private String detectBinaryTypes(byte[] bytes) {
        // NB: there are a lot of specific mimetypes we can detect by file
        // magic, but we don't want to make too much effort here.
        // "application/octet-stream" is fine if file looks very much like
        // binary but don't know what exactly. Returned value is not really
        // used beyond being non-null. We're only concerned with file formats
        // frequently sent out with imprecise Content-Type.
        switch (bytes[0]) {
        case (byte)0xFF:
            if (bytes[1] == (byte)0xFE) {
                // UTF-16LE BOM
                return null;
            } else if ((bytes[1] & 0xFE) == 0xFA) {
                // audio/mp3 - <FF><FA> or <FF><FB>.
                // The masked value is a non-negative int, so it must be
                // compared with the int 0xFA; (byte)0xFA is -6 and would
                // never match.
                return "audio/mp3";
            } else if (bytes[1] == (byte)0xD8) {
                // image/jpeg - commonly <FF><D8><FF><E0> or <FF><D8><FF><E1>
                return "image/jpeg";
            }
            // WordPerfect (<FF>WPC) falls in this category.
            // (WordPerfect also has <D8>WPC)
            return BINARY_FILE;
        case (byte)0xFE:
            if (bytes[1] == (byte)0xFF) {
                // UTF-16BE BOM
                return null;
            }
            break;
        case (byte)0xF7:
            if (bytes[1] == 0x02 && bytes[2] == 0x01) {
                // unconfirmed.
                return "application/x-dvi";
            }
            break;
        case (byte)0xEF:
            if (bytes[1] == (byte)0xBB && bytes[2] == (byte)0xBF) {
                // UTF-8 BOM
                return null;
            }
            break;
        case (byte)0xD0:
            if (bytes[1] == (byte)0xCF && bytes[2] == 0x11 && bytes[3] == (byte)0xE0) {
                // OLE2 compound file (<D0><CF><11><E0><A1><B1><1A><E1>),
                // e.g. MS Word.Document.8
                return BINARY_FILE;
            }
            break;
        case (byte)0xCA:
            if (bytes[1] == (byte)0xFE && bytes[2] == (byte)0xBA && bytes[3] == (byte)0xBE) {
                // application/java (class files)
                return "application/java";
            }
            break;
        case (byte)0x89:
            if (bytes[1] == 'P' && bytes[2] == 'N' && bytes[3] == 'G') {
                return "image/png";
            }
            break;
        case 0x00:
            if (bytes[1] == 0x00) {
                // Windows icon resource falls in this category.
                return BINARY_FILE;
            } else if (bytes[1] == 0x01) {
                // TTF?
                return BINARY_FILE;
            }
            break;
        case 0x01:
            if (bytes[1] == (byte)0xB3 || bytes[1] == (byte)0xBA) {
                return "video/mpeg";
            } else if (bytes[1] == 0x00) {
                // very likely is a binary file
                return BINARY_FILE;
            }
            break;
        case 0x1F:
            if (bytes[1] == (byte)0x8B) {
                return "application/x-gzip";
            } else if (bytes[1] == (byte)0x9D) {
                // followed by 0x90
                return "application/x-compress";
            }
            break;
        case '%':
            if (bytes[1] == 'P' && bytes[2] == 'D' && bytes[3] == 'F' && bytes[4] == '-') {
                return "application/pdf";
            } else if (bytes[1] == '!' && bytes[2] == 'P' && bytes[3] == 'S' && bytes[4] == '-') {
                // application/postscript, Type 1 font.
                return "application/postscript";
            }
            break;
        case 'B':
            if (bytes[1] == 'Z' && bytes[2] == 'h') {
                return "application/x-bzip2";
            }
            break;
        case 'F':
            if (bytes[1] == 'W' && bytes[2] == 'S') {
                // followed by <04> or <05>. Also 'CWS<07>?
                return "application/x-shockwave-flash";
            } else if (bytes[1] == 'L' && bytes[2] == 'V' && bytes[3] == 0x01) {
                return "video/x-flv";
            }
            break;
        case 'G':
            if (bytes[1] == 'I' && bytes[2] == 'F' && bytes[3] == '8') {
                // GIF87a or GIF89a
                return "image/gif";
            }
            break;
        case 'M':
            if (bytes[1] == 'Z') {
                // MS-DOS Executable (MZ <00-FF><00-01>)
                if (bytes[3] == 0x00 || bytes[3] == 0x01) {
                    return "application/x-dosexec";
                }
            } else if (bytes[1] == 'S' && bytes[2] == 'C' && bytes[3] == 'F') {
                // MS cab file
                return "application/vnd.ms-cab-compressed";
            } else if (bytes[1] == 'T' && bytes[2] == 'h' && bytes[3] == 'd') {
                // MIDI
                return "audio/midi"; // or "application/x-midi"
            }
            break;
        case 'P':
            if (bytes[1] == 'K' && bytes[2] == 0x03 && bytes[3] == 0x04) {
                return "application/zip";
            } else if (bytes[1] == 'E' && bytes[2] == 0x00 && bytes[3] == 0x00 && bytes[4] == 'M' && bytes[5] == 'S') {
                // Windows PE
                return BINARY_FILE;
            }
            break;
        case 'm':
            if (bytes[1] == 'o' && bytes[2] == 'o' && bytes[3] == 'v') {
                return "video/quicktime";
            } else if (bytes[1] == 'd' && bytes[2] == 'a' && bytes[3] == 't') {
                return "video/quicktime";
            }
            break;
        case '{':
            if (bytes[1] == '\\' && bytes[2] == 'r' && bytes[3] == 't' && bytes[4] == 'f' && bytes[5] == '1') {
                return "application/rtf";
            }
            break;
        }
        // Signature that does not start at offset 0, so it is checked
        // regardless of the first byte.
        if (bytes[2] == '-' && bytes[3] == 'l' && bytes[4] == 'h' && bytes[5] == '5' && bytes[6] == '-') {
            // LZH archive
            return BINARY_FILE;
        }
        // Other formats we may want to add
        // "RIFF" <?><?><?><?> "WAVEfmt " - "audio/wav"
        // <DB><A5><2D><00> - commonly with suffix .doc. files command doesn't know this format.
        // <C5><D0><D3><C6><1E><00><00><00> - some EPS has a binary header starting with this, before "%!PS-"
        return null;
    }

    /**
     * Read first {@code sniffLength} bytes of {@code resource}'s payload,
     * decoding {@code Content-Encoding} if any. Reset {@code resource}'s
     * read position back to zero.
     * @param resource Resource to load bytes from
     * @return bytes, zero-padded if payload is shorter. The array is at
     *   least {@link #MINIMUM_SNIFF_BUFFER_SIZE} bytes long.
     * @throws IOException if reading or resetting {@code resource} fails
     */
    protected byte[] peekContent(Resource resource) throws IOException {
        byte[] bbuffer = new byte[Math.max(sniffLength, MINIMUM_SNIFF_BUFFER_SIZE)];
        String encoding = resource.getHeader("content-encoding");
        // NOTE(review): the value returned by read() is ignored in both
        // branches, so a short read silently leaves the tail zero-filled.
        // Confirm whether Resource.read() can return fewer bytes than asked
        // before turning this into a read loop (mark limit must still hold).
        if ("gzip".equalsIgnoreCase(encoding) || "x-gzip".equalsIgnoreCase(encoding)) {
            // use larger readlimit, because gzip-ed data can be larger than the original
            // at low compression level.
            resource.mark(sniffLength + 100);
            // z is deliberately not closed here; resource is reset and
            // reused by the caller afterwards.
            @SuppressWarnings("resource")
            Resource z = new GzipDecodingResource(resource);
            z.read(bbuffer, 0, sniffLength);
            resource.reset();
        } else {
            resource.mark(sniffLength);
            resource.read(bbuffer, 0, sniffLength);
            resource.reset();
        }
        return bbuffer;
    }

    /**
     * Guess the content type of {@code resource} from the beginning of its
     * payload: binary magics first, then text signatures for HTML,
     * JavaScript / JSON and CSS.
     * @param resource resource to examine
     * @return detected mimetype, or {@code null} if nothing was recognized
     *   or the payload could not be read / decoded.
     */
    @Override
    public String sniff(Resource resource) {
        // This sniffer only works with HTTP response record.
        // TODO: check record type.
        byte[] bbuffer;
        try {
            bbuffer = peekContent(resource);
        } catch (IOException ex) {
            // Caveat: IOException from reset() (i.e. mark got invalidated) will have major
            // consequences. Should we re-throw some runtime exception?
            logger.warning("error reading " + sniffLength + " from resource: " + ex.getMessage());
            return null;
        }
        // Spare decoding and regexp-matching for clearly-binary files.
        // Most mimetype detector libraries are overkill since we don't
        // need to know the details (ex. bitrate of MP3, PDF version).
        // So we use our own detector.
        String ctype = detectBinaryTypes(bbuffer);
        if (ctype != null) {
            // Very likely a binary file.
            return ctype;
        }
        // Try decoding as text and look for signature patterns.
        // It doesn't need to be too complex, since want to differentiate
        // only a handful of text types: HTML, JAVASCRIPT, JSON and CSS.
        String encoding;
        try {
            encoding = charsetDetector.getCharset(resource, null);
        } catch (IOException ex1) {
            // IO error at this stage means we won't be able to sniff
            // content type either.
            logger.warning("error detecting charset of resource: " + ex1.getMessage());
            return null;
        }
        if (encoding == null) {
            // new String(byte[], String) throws NullPointerException for a
            // null charset name; without a charset we cannot sniff text.
            return null;
        }
        String text;
        try {
            text = new String(bbuffer, encoding);
        } catch (UnsupportedEncodingException ex) {
            // unlikely to happen, already checked by CharsetDetector.
            return null;
        }
        // strip off BOM - all variants are decoded into \ufeff.
        if (text.length() > 0 && text.charAt(0) == '\ufeff') {
            text = text.substring(1);
        }
        ctype = detectHTML(text);
        if (ctype != null) {
            return ctype;
        }
        ctype = detectJavaScript(text);
        if (ctype != null) {
            return ctype;
        }
        // detectCSS() yields either "text/css" or null.
        return detectCSS(text);
    }

    // XML declaration. Attributes after version (encoding, standalone) are
    // optional, and the match stops at the first "?>".
    private static final Pattern RE_XML_PROLOGUE = Pattern
        .compile("\\s*<\\?xml\\s+version=\"[.\\d]+\"(?:\\s+.*?)?\\s*\\?>");
    private static final Pattern RE_HTML_ELEMENTS = Pattern
        .compile("(?i)\\s*<(HTML|HEAD|STYLE|SCRIPT|META|BODY)(\\s|>)");
    private static final Pattern RE_DOCTYPE_HTML = Pattern
        .compile("(?i)\\s*<!DOCTYPE\\s+HTML");
    private static final Pattern RE_SGML_COMMENT = Pattern
        .compile("(?s)\\s*<!--.*?-->");
    private static final Pattern RE_END_TAG = Pattern
        .compile("(?i)</[a-z][a-z0-9]*>");

    /**
     * Detect HTML content.
     * <p>Skips an optional XML declaration and any leading comments, then
     * looks for an HTML DOCTYPE or a well-known HTML element at that
     * position, or for any end tag anywhere after it.</p>
     * @param text decoded beginning of the payload
     * @return {@code "text/html"} or {@code null}
     */
    protected String detectHTML(String text) {
        int pos = 0;
        {
            Matcher m = RE_XML_PROLOGUE.matcher(text);
            if (m.lookingAt()) {
                pos = m.end();
            }
        }
        {
            // skip any number of consecutive leading comments
            Matcher m = RE_SGML_COMMENT.matcher(text);
            m.region(pos, text.length());
            while (m.lookingAt()) {
                pos = m.end();
                m.region(pos, text.length());
            }
        }
        {
            Matcher m = RE_DOCTYPE_HTML.matcher(text);
            m.region(pos, text.length());
            if (m.lookingAt()) {
                return "text/html";
            }
        }
        {
            Matcher m = RE_HTML_ELEMENTS.matcher(text);
            m.region(pos, text.length());
            if (m.lookingAt()) {
                return "text/html";
            }
        }
        {
            // last resort: an end tag anywhere in the remaining text
            Matcher m = RE_END_TAG.matcher(text);
            m.region(pos, text.length());
            if (m.find()) {
                return "text/html";
            }
        }
        return null;
    }

    // "var NAME" at the beginning of a line; NAME may be a single character.
    private static final Pattern RE_JS_VAR = Pattern
        .compile("(?m)^var\\s+[_a-zA-Z$][_a-zA-Z$0-9]*");
    private static final Pattern RE_JS_FUNCTION = Pattern
        .compile("(?s)function(?:\\s+[a-zA-Z0-9_$]+\\s*)?\\(");
    private static final Pattern RE_JSON_HEAD = Pattern
        .compile("\\s*\\{\\s*\"");

    /**
     * Detect JavaScript or JSON content.
     * <p>A {@code var} declaration at the beginning of a line or a
     * {@code function} expression anywhere yields JavaScript; text starting
     * with <code>{"</code> yields JSON.</p>
     * @param text decoded beginning of the payload
     * @return {@code "text/javascript"}, {@code "application/json"} or
     *   {@code null}
     */
    protected String detectJavaScript(String text) {
        {
            Matcher m = RE_JS_VAR.matcher(text);
            if (m.find()) {
                return "text/javascript";
            }
        }
        {
            Matcher m = RE_JS_FUNCTION.matcher(text);
            if (m.find()) {
                return "text/javascript";
            }
        }
        {
            Matcher m = RE_JSON_HEAD.matcher(text);
            if (m.lookingAt()) {
                // TODO: if resource has content-type "text/javascript", just
                // use it.
                return "application/json";
            }
        }
        return null;
    }

    // (?s): CSS comments commonly span multiple lines (license headers etc.)
    private static final Pattern RE_CSS_COMMENT = Pattern.compile("(?s)\\s*/\\*.*?\\*/");
    private static final Pattern RE_CSS_AT_RULE = Pattern.compile("\\s*@(import|media|document|charset|font-face|keyframes|namespace|supports)\\s+");
    private static final String RE_CSS_SIMPLE_SELECTOR = "(?:(?:[-a-z0-9]+|\\*)(?:[.#:][-_a-z0-9]+|\\[.+?\\])*|(?:[.#:][-_a-z0-9]+|\\[.+?\\])+)";
    private static final Pattern RE_CSS_RULESET_START = Pattern
        .compile("(?i)\\s*" + RE_CSS_SIMPLE_SELECTOR + "(?:[\\s,+>]+" + RE_CSS_SIMPLE_SELECTOR + ")*\\s*\\{");
    private static final Pattern RE_CSS_DECLARATION = Pattern
        .compile("(?i)\\s*[-a-z]+\\s*:\\s*[^;}]+[;}]");

    /**
     * Detect CSS content.
     * <p>After skipping leading comments, text is taken as CSS if it starts
     * with a known at-rule, or with a rule set ({@code selector {}) whose
     * first item, after optional comments, is a declaration.</p>
     * @param text decoded beginning of the payload
     * @return {@code "text/css"} or {@code null}
     */
    protected String detectCSS(String text) {
        // CSS (they are rarely returned with mimetype "unk" or "text/html")
        int pos = 0;
        Matcher cm = RE_CSS_COMMENT.matcher(text);
        {
            // skip any number of consecutive leading comments
            cm.region(pos, text.length());
            while (cm.lookingAt()) {
                pos = cm.end();
                cm.region(pos, text.length());
            }
        }
        {
            Matcher m = RE_CSS_AT_RULE.matcher(text);
            m.region(pos, text.length());
            if (m.lookingAt()) {
                return "text/css";
            }
        }
        {
            Matcher m = RE_CSS_RULESET_START.matcher(text);
            m.region(pos, text.length());
            if (m.lookingAt()) {
                // skip comments between "{" and the first declaration
                pos = m.end();
                cm.region(pos, text.length());
                while (cm.lookingAt()) {
                    pos = cm.end();
                    cm.region(pos, text.length());
                }
                Matcher sm = RE_CSS_DECLARATION.matcher(text);
                sm.region(pos, text.length());
                if (sm.lookingAt()) {
                    return "text/css";
                }
            }
        }
        return null;
    }
}