package org.archive.wayback.replay.mimetype; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileFilter; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import junit.framework.TestCase; import org.archive.io.warc.TestWARCReader; import org.archive.io.warc.TestWARCRecordInfo; import org.archive.io.warc.WARCRecord; import org.archive.io.warc.WARCRecordInfo; import org.archive.wayback.core.Resource; import org.archive.wayback.resourcestore.jwat.JWATResource; import org.archive.wayback.resourcestore.resourcefile.WarcResource; /** * Test for {@link SimpleMimeTypeDetector} */ public class SimpleMimeTypeDetectorTest extends TestCase { SimpleMimeTypeDetector cut; /* (non-Javadoc) * @see junit.framework.TestCase#setUp() */ protected void setUp() throws Exception { super.setUp(); cut = new SimpleMimeTypeDetector(); } public static Resource createTestResource(String ctype, byte[] payloadBytes, boolean compressed) throws IOException { WARCRecordInfo recinfo = compressed ? TestWARCRecordInfo .createCompressedHttpResponse(ctype, payloadBytes) : TestWARCRecordInfo.createHttpResponse(ctype, payloadBytes); TestWARCReader ar = new TestWARCReader(recinfo); WARCRecord rec = ar.get(0); WarcResource resource = new WarcResource(rec, ar); resource.parseHeaders(); return resource; } protected byte[] getTestContent(String filename) throws IOException { ByteArrayOutputStream bao = new ByteArrayOutputStream(); InputStream is = getClass().getResourceAsStream(filename); assertNotNull("test resource " + filename + " is missing", is); byte[] buf = new byte[8192]; int n; while ((n = is.read(buf)) > 0) { bao.write(buf, 0, n); } return bao.toByteArray(); } protected String detectMimeType(String filename, String indexContentType, String recordContentType) throws IOException { //WaybackRequest wbRequest = new WaybackRequest(); //CaptureSearchResult result = new CaptureSearchResult(); //result.setMimeType(indexContentType); Resource resource = createTestResource(recordContentType, getTestContent(filename), false); String mimetype = cut.sniff(resource); return mimetype; } protected String detectMimeType(String filename, String indexContentType) throws IOException { String recordContentType = "unk".equals(indexContentType) ? null : indexContentType; return detectMimeType(filename, indexContentType, recordContentType); } // Content-Type sniffing tests public void testContentSniffing_HTML() throws Exception { // CDX writer writes "unk" in content-type field if Content-Type // header is missing. assertEquals("text/html", detectMimeType("html/1.html", "unk")); assertEquals("text/html", detectMimeType("html/2.html", "unk")); assertEquals("text/html", detectMimeType("html/3.html", "unk")); assertEquals("text/html", detectMimeType("html/4.html", "text/html")); assertEquals("text/html", detectMimeType("html/5.html", "text/html")); assertEquals("text/html", detectMimeType("html/6.html", "text/html")); } /** * Detect pattern explosion caused by overly flexible regular * expression. * @throws Exception */ public void testContentSniffing_runawayRegexp() throws Exception { // 7.html has a sequence of TABs and LFs. One example found in production. ExecutorService exec = Executors.newSingleThreadExecutor(); Future<String> future = exec.submit(new Callable<String>() { @Override public String call() throws Exception { return detectMimeType("html/7.html", "text/html"); } }); try { String result = future.get(10, TimeUnit.SECONDS); assertEquals(null, result); } catch (TimeoutException ex) { fail("sniff did not finish within 10 seconds"); future.cancel(true); } } public void testContentSniffing_JavaScript() throws Exception { assertEquals("text/javascript", detectMimeType("js/1.js", "text/html")); } public void testContentSniffing_JavaScript_compressed() throws Exception { Resource resource = createTestResource("text/html", getTestContent("js/1.js"), true); String mimetype = cut.sniff(resource); assertEquals("text/javascript", mimetype); // resource's payload stream must be positioned at the beginning, // which is confirmed by testing if the first two bytes are GZIP MAGIC. byte[] bytes = new byte[2]; resource.read(bytes); assertTrue("resource is properly reset to position 0", bytes[0] == (byte)0x1f && bytes[1] == (byte)0x8b); } public void testContentSniffing_JSON() throws Exception { assertEquals("application/json", detectMimeType("json/1.json", "application/javascript")); } public void testContentSniffing_CSS() throws Exception { assertEquals("text/css", detectMimeType("css/1.css", "unk")); assertEquals("text/css", detectMimeType("css/2.css", "unk")); assertEquals("text/css", detectMimeType("css/3.css", "unk")); assertEquals("text/css", detectMimeType("css/4.css", "unk")); } public void testContentSniffing_Binary() throws Exception { assertEquals("application/pdf", detectMimeType("bin/1.pdf", "unk")); assertEquals("image/png", detectMimeType("bin/2.png", "text/html")); } /** * Main entry point for testing against real-world samples. * <p> * Needs following JARs to run: * <ul> * <li>wayback-core</li> * <li>junit 3</li> * <li>commons-httpclient 3.1</li> * <li>juniversalchardet</li> * <li>jwat-{arc,warc,gzip,common}</li> * </ul> */ public static void main(String[] args) { if (args.length < 1) { System.err.println("Arguments: INPUT-FILE"); System.err.println(" INPUT-FILE is a file containing just one gzip-ed W/ARC record,"); System.err.println(" or a directory containing such files (must have .gz suffix)"); System.exit(1); } File input = new File(args[0]); SimpleMimeTypeDetector detector = new SimpleMimeTypeDetector(); if (input.isDirectory()) { File[] inputFiles = input.listFiles(new FileFilter() { @Override public boolean accept(File pathname) { return pathname.isFile() && pathname.getName().endsWith(".gz"); } }); for (File f : inputFiles) { detectFile(detector, f); } } else { detectFile(detector, input); } } public static void detectFile(MimeTypeDetector detector, File file) { try { InputStream is = new FileInputStream(file); Resource resource = JWATResource.getResource(is, 0); String contentType = resource.getHeader("content-type"); if (contentType == null) contentType = "-"; else { int p = contentType.indexOf(';'); if (p >= 0) { contentType = contentType.substring(0, p).trim(); } } String mimeType = detector.sniff(resource); if (mimeType == null) mimeType = "-"; System.out.println(file.getPath() + "\t" + contentType + "\t" + mimeType); } catch (Exception ex) { System.out.println(file.getPath() + "\t" + "-" + "\tERROR " + ex.getMessage()); } } }