/*******************************************************************************
 * Copyright (c) 2011 Subgraph.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     Subgraph - initial API and implementation
 ******************************************************************************/
package com.subgraph.vega.internal.analysis;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.http.Header;

import com.subgraph.vega.api.analysis.MimeType;
import com.subgraph.vega.api.http.requests.IHttpResponse;

/**
 * Maps an HTTP response to a {@link MimeType}, either from its declared
 * <code>Content-Type</code> header or by inspecting the beginning of the
 * response body for well-known markers and magic numbers.
 */
public class MimeDetector {
	/** Content-Type prefixes that are reported as {@link MimeType#MIME_ASC_GENERIC}. */
	private static final List<String> genericAsciiPrefixes = new ArrayList<String>();

	/** Lookup from a Content-Type name to the MimeType it is reported as. */
	private static final Map<String, MimeType> nameMap = new HashMap<String, MimeType>();

	/**
	 * Lower-case substrings whose presence in a textual body marks it as HTML.
	 * Kept as a constant so the list is not rebuilt on every sniffing call.
	 */
	private static final List<String> htmlMarkers = Arrays.asList(
			"<html", "<meta", "<head", "<title", "<body", "</body",
			"<!doctype", "<!--", "<style", "<script", "<font", "<span",
			"<div", "<img", "<form", "<br", "<td", "<h1", "<li", "<p>",
			"href=");

	static {
		genericAsciiPrefixes.addAll(Arrays.asList("text/x-", "text/vnd.", "application/x-httpd-"));

		// Every MimeType is reachable through its canonical name ...
		for(MimeType mt: MimeType.values()) {
			nameMap.put(mt.getCanonicalName(), mt);
		}

		// ... and some additionally through alternative names.
		addExtraNames(MimeType.MIME_ASC_GENERIC, "text/csv");
		addExtraNames(MimeType.MIME_ASC_JAVASCRIPT, "application/x-javascript", "application/json", "text/javascript");
		addExtraNames(MimeType.MIME_ASC_RTF, "application/rtf");
		addExtraNames(MimeType.MIME_XML_GENERIC, "application/xml");
		addExtraNames(MimeType.MIME_IMG_BMP, "image/bmp", "image/x-icon");
		addExtraNames(MimeType.MIME_AV_WAV, "audio/wav");
		addExtraNames(MimeType.MIME_AV_RA, "audio/x-pn-realaudio", "audio/x-realaudio");
		addExtraNames(MimeType.MIME_AV_MPEG, "video/mp4");
		addExtraNames(MimeType.MIME_AV_FLV, "video/x-flv");
		addExtraNames(MimeType.MIME_AV_WMEDIA, "audio/x-ms-wma", "video/x-ms-asf");
		addExtraNames(MimeType.MIME_BIN_ZIP, "application/x-zip-compressed");
		addExtraNames(MimeType.MIME_BIN_GZIP, "application/x-gunzip", "application/x-tar-gz");
		addExtraNames(MimeType.MIME_BIN_GENERIC, "application/octet-stream");
	}

	/**
	 * Registers additional Content-Type names for a MIME type.
	 *
	 * @param mime  the type the names resolve to
	 * @param names the names to register; an existing entry with the same name is replaced
	 */
	static void addExtraNames(MimeType mime, String ... names) {
		for(String name : names) {
			nameMap.put(name, mime);
		}
	}

	private final CSSDetector cssDetector = new CSSDetector();
	private final JavascriptDetector jsDetector = new JavascriptDetector();

	/**
	 * Returns the MIME type declared by the response's first
	 * <code>Content-Type</code> header.
	 *
	 * @param response the response to inspect
	 * @return the declared type, or {@link MimeType#MIME_NONE} if the header is
	 *         absent or its value is not recognized
	 */
	MimeType getDeclaredMimeType(IHttpResponse response) {
		if(!response.getRawResponse().containsHeader("Content-Type")) {
			return MimeType.MIME_NONE;
		}
		return headerToMimeType(response.getRawResponse().getFirstHeader("Content-Type"));
	}

	/**
	 * Resolves a Content-Type header to a MIME type.
	 *
	 * The raw header value is tried first. If that fails, the value is retried
	 * with any parameters (such as <code>; charset=UTF-8</code>) removed and
	 * the media type lower-cased, because media types are case-insensitive and
	 * are commonly sent with parameters.
	 *
	 * @param hdr the header, may be null
	 * @return the resolved type, or {@link MimeType#MIME_NONE} if it cannot be resolved
	 */
	private MimeType headerToMimeType(Header hdr) {
		if(hdr == null || hdr.getValue() == null) {
			return MimeType.MIME_NONE;
		}

		final String rawValue = hdr.getValue();
		final MimeType exactMatch = nameMap.get(rawValue);
		if(exactMatch != null) {
			return exactMatch;
		}

		final String mediaType = normalizeContentType(rawValue);
		final MimeType normalizedMatch = nameMap.get(mediaType);
		if(normalizedMatch != null) {
			return normalizedMatch;
		}

		for(String prefix : genericAsciiPrefixes) {
			if(mediaType.startsWith(prefix)) {
				return MimeType.MIME_ASC_GENERIC;
			}
		}
		return MimeType.MIME_NONE;
	}

	/**
	 * Reduces a Content-Type header value to its bare media type: everything
	 * from the first ';' on is dropped, surrounding whitespace is trimmed and
	 * the result is lower-cased independently of the default locale.
	 */
	private static String normalizeContentType(String value) {
		final int paramStart = value.indexOf(';');
		final String mediaType = (paramStart < 0) ? value : value.substring(0, paramStart);
		return mediaType.trim().toLowerCase(Locale.ROOT);
	}

	/**
	 * Guesses the MIME type of a response from its body.
	 *
	 * The CSS and Javascript detectors are consulted first with the whole
	 * response; otherwise at most the first 1024 characters of the body are
	 * handed to the textual or binary sniffer.
	 *
	 * @param response the response to inspect
	 * @return the sniffed type, or {@link MimeType#MIME_NONE} if the response has no body string
	 */
	MimeType getSniffedMimeType(IHttpResponse response) {
		final String body = response.getBodyAsString();
		if(body == null) {
			return MimeType.MIME_NONE;
		}

		final String buffer = (body.length() > 1024) ? body.substring(0, 1024) : body;

		if(cssDetector.isBodyCSS(response)) {
			return MimeType.MIME_ASC_CSS;
		}
		if(jsDetector.isBodyJavascript(response)) {
			return MimeType.MIME_ASC_JAVASCRIPT;
		}
		if(response.isMostlyAscii()) {
			return getSniffedMimeTypeForAscii(buffer);
		}
		return getSniffedMimeTypeForBinary(buffer);
	}

	/**
	 * Classifies textual content by looking for well-known markers.
	 *
	 * Checks run from most to least specific; the first match wins.
	 *
	 * @param buffer the beginning of the body, must not be null
	 * @return the detected type, {@link MimeType#MIME_ASC_GENERIC} if nothing matches
	 */
	MimeType getSniffedMimeTypeForAscii(String buffer) {
		if(buffer.startsWith("%!PS")) {
			return MimeType.MIME_ASC_POSTSCRIPT;
		}
		if(buffer.startsWith("{\\rtf")) {
			return MimeType.MIME_ASC_RTF;
		}
		if(buffer.startsWith("%PDF")) {
			return MimeType.MIME_EXT_PDF;
		}
		if(buffer.contains("<OpenSearch")) {
			return MimeType.MIME_XML_OPENSEARCH;
		}
		if(containsAny(buffer, "<channel>", "<description>", "<item>", "<rdf:RDF")) {
			return MimeType.MIME_XML_RSS;
		}
		if(containsAny(buffer, "<feed", "<updated>")) {
			return MimeType.MIME_XML_ATOM;
		}

		// Locale.ROOT keeps the case folding stable: with a Turkish default
		// locale 'I' would fold to a dotless 'i' and "<TITLE" would not match.
		final String lower = buffer.toLowerCase(Locale.ROOT);

		if(containsAny(lower, "<wml", "<!doctype wml ")) {
			return MimeType.MIME_XML_WML;
		}
		if(lower.contains("<cross-domain-policy>")) {
			return MimeType.MIME_XML_CROSSDOMAIN;
		}
		if(containsAny(buffer, "<?xml", "<!DOCTYPE")) {
			if(lower.contains("<!doctype html") || buffer.contains("http://www.w3.org/1999/xhtml")) {
				return MimeType.MIME_XML_XHTML;
			}
			return MimeType.MIME_XML_GENERIC;
		}

		for(String marker : htmlMarkers) {
			if(lower.contains(marker)) {
				return MimeType.MIME_ASC_HTML;
			}
		}

		if(containsAny(buffer, "<![CDATA[", "</", "/>")) {
			return MimeType.MIME_XML_GENERIC;
		}
		return MimeType.MIME_ASC_GENERIC;
	}

	/** Returns true if the text contains at least one of the given substrings. */
	private static boolean containsAny(String text, String ... needles) {
		for(String needle : needles) {
			if(text.contains(needle)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Classifies binary content by the magic numbers at its start.
	 *
	 * Each char of the buffer is compared against a byte value.
	 * NOTE(review): this presumes the body string maps each byte to one char
	 * of the same value - confirm against how the body string is decoded.
	 *
	 * @param buffer the beginning of the body, must not be null
	 * @return the detected type, {@link MimeType#MIME_BIN_GENERIC} if nothing matches
	 */
	MimeType getSniffedMimeTypeForBinary(String buffer) {
		final char c0 = charAt(buffer, 0);
		final char c1 = charAt(buffer, 1);
		final char c2 = charAt(buffer, 2);
		final char c3 = charAt(buffer, 3);

		/* Images */
		if(c0 == 0xFF && c1 == 0xD8 && c2 == 0xFF) {
			return MimeType.MIME_IMG_JPEG;
		}
		if(buffer.startsWith("GIF8")) {
			return MimeType.MIME_IMG_GIF;
		}
		if(c0 == 0x89 && buffer.startsWith("PNG", 1)) {
			return MimeType.MIME_IMG_PNG;
		}
		if(buffer.startsWith("BM")) {
			return MimeType.MIME_IMG_BMP;
		}
		if(buffer.startsWith("II") && c2 == 42) {
			return MimeType.MIME_IMG_TIFF;
		}

		/* RIFF containers: the char at offsets 8 and 9 selects the sub-format */
		if(buffer.startsWith("RIFF")) {
			if(charAt(buffer, 8) != 'A') {
				return MimeType.MIME_AV_WAV;
			}
			return (charAt(buffer, 9) == 'C') ? MimeType.MIME_IMG_ANI : MimeType.MIME_AV_AVI;
		}

		if(c0 == 0 && c1 == 0 && c2 != 0 && c3 == 0) {
			return MimeType.MIME_IMG_BMP;
		}

		/* Audio / video */
		if(c0 == 0x30 && c1 == 0x26 && c2 == 0xB2) {
			return MimeType.MIME_AV_WMEDIA;
		}
		if(c0 == 0xFF && c1 == 0xFB) {
			return MimeType.MIME_AV_MP3;
		}
		if(c0 == 0x00 && c1 == 0x00 && c2 == 0x01 && (c3 >> 4) == 0x0B) {
			return MimeType.MIME_AV_MPEG;
		}
		if(buffer.regionMatches(true, 0, "OggS", 0, 4)) {
			return MimeType.MIME_AV_OGG;
		}
		if(c0 == 0x28 && buffer.startsWith("RMF", 1)) {
			return MimeType.MIME_AV_RA;
		}
		if(c0 == 0x2E && buffer.startsWith("RMF", 1)) {
			return MimeType.MIME_AV_RV;
		}
		if(buffer.startsWith("free", 4) || buffer.startsWith("mdat", 4)
				|| buffer.startsWith("wide", 4) || buffer.startsWith("pnot", 4)
				|| buffer.startsWith("skip", 4) || buffer.startsWith("moov", 4)) {
			return MimeType.MIME_AV_QT;
		}
		if(buffer.startsWith("FLV")) {
			return MimeType.MIME_AV_FLV;
		}

		/* Browser plugin and document formats */
		// SWF signatures: "FWS" (uncompressed), "CWS" (zlib), "ZWS" (LZMA).
		// "FCWS" is not a known signature but was accepted before, so it stays.
		if(buffer.startsWith("FWS") || buffer.startsWith("CWS")
				|| buffer.startsWith("ZWS") || buffer.startsWith("FCWS")) {
			return MimeType.MIME_EXT_FLASH;
		}
		if(buffer.startsWith("%PDF")) {
			return MimeType.MIME_EXT_PDF;
		}
		if(buffer.startsWith("PK") && c2 < 6 && c3 < 7) {
			return buffer.contains("META-INF/") ? MimeType.MIME_EXT_JAR : MimeType.MIME_BIN_ZIP;
		}
		if(c0 == 0xCA && c1 == 0xFE && c2 == 0xBA && c3 == 0xBE) {
			return MimeType.MIME_EXT_CLASS;
		}
		if(buffer.length() > 512 && c0 == 0xD0 && c1 == 0xCF && c2 == 0x11 && c3 == 0xE0) {
			// The char at offset 512 selects the document kind.
			switch(buffer.charAt(512)) {
			case 0xEC:
				return MimeType.MIME_EXT_WORD;
			case 0xFD:
			case 0x09:
				return MimeType.MIME_EXT_EXCEL;
			case 0x00:
			case 0x0F:
			case 0xA0:
				return MimeType.MIME_EXT_PPNT;
			default:
				return MimeType.MIME_BIN_GENERIC;
			}
		}

		/* Archives */
		if(c0 == 0x1F && c1 == 0x8B && c2 == 0x08) {
			return MimeType.MIME_BIN_GZIP;
		}
		if(buffer.startsWith("MSCF") && charAt(buffer, 4) == 0x00) {
			return MimeType.MIME_BIN_CAB;
		}
		return MimeType.MIME_BIN_GENERIC;
	}

	/**
	 * Bounds-tolerant variant of {@link String#charAt(int)}.
	 *
	 * @param buffer the string to read from
	 * @param idx    a non-negative index
	 * @return the char at idx, or 0xFFFF if idx lies beyond the end of buffer
	 */
	char charAt(String buffer, int idx) {
		if(idx >= buffer.length()) {
			return 0xFFFF;
		}
		return buffer.charAt(idx);
	}
}