/**
 *
 */
package org.archive.wayback.resourcestore.resourcefile;

import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.Map;

import junit.framework.TestCase;

import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.warc.TestWARCReader;
import org.archive.io.warc.TestWARCRecordInfo;
import org.archive.io.warc.WARCRecord;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.wayback.core.Resource;
import org.archive.wayback.replay.GzipDecodingResource;
import org.archive.wayback.replay.TextReplayRenderer;
import org.archive.wayback.replay.charset.CharsetDetector;
import org.archive.wayback.replay.charset.StandardCharsetDetector;
import org.archive.wayback.resourcestore.jwat.JWATResourceTest;

/**
 * Tests for {@link WarcResource} built from synthetic WARC records.
 *
 * <p>TODO: add more tests. It has only tests relevant to recent
 * changes.</p>
 *
 * <p>Every test closes its {@link Resource} in a {@code finally} block so
 * that a failing assertion does not leave the resource open.</p>
 *
 * @contributor kenji
 */
public class WarcResourceTest extends TestCase {

	/**
	 * Charset used whenever a test payload is converted to bytes, so that
	 * results do not depend on the platform default charset.
	 */
	private static final String PAYLOAD_CHARSET = "UTF-8";

	/** Date pattern the {@code Date} header is expected to parse with. */
	private static final String HTTP_DATE_PATTERN = "EEE, dd MMM yyyy HH:mm:ss z";

	/* (non-Javadoc)
	 * @see junit.framework.TestCase#setUp()
	 */
	@Override
	protected void setUp() throws Exception {
		super.setUp();
	}

	/**
	 * Create a test {@link Resource} from {@link WARCRecordInfo} {@code recinfo}.
	 * <p>Override this method to run tests on different implementations of
	 * Resource.</p>
	 * @param recinfo description of the WARC record to build the resource from
	 * @return Resource wrapping the first record of a {@link TestWARCReader}
	 *         created from {@code recinfo}
	 * @throws Exception if the reader or the resource cannot be created
	 * @see JWATResourceTest
	 */
	protected Resource createResource(WARCRecordInfo recinfo) throws Exception {
		TestWARCReader ar = new TestWARCReader(recinfo);
		WARCRecord rec = ar.get(0);
		return new WarcResource(rec, ar);
	}

	/**
	 * Reads from {@code content} with a single {@code read} call and asserts
	 * that exactly {@code expectedLength} bytes were returned.
	 * <p>The buffer is one byte larger than expected so that a resource
	 * yielding extra bytes is detected as well.</p>
	 * @param content resource to read from
	 * @param expectedLength number of bytes the read is expected to return
	 * @throws Exception if reading fails
	 */
	private static void assertContentLength(Resource content, int expectedLength)
			throws Exception {
		byte[] buf = new byte[expectedLength + 1];
		int n = content.read(buf);
		assertEquals("content length", buf.length - 1, n);
	}

	/**
	 * Asserts that {@code res} has a {@code Date} header that parses with
	 * {@link #HTTP_DATE_PATTERN}.
	 * @param res resource whose headers have already been parsed
	 * @throws Exception if the header value does not parse
	 */
	private static void assertHasHttpDateHeader(Resource res) throws Exception {
		String date = res.getHeader("Date");
		assertNotNull("has date header", date);
		// parse() throws ParseException on a malformed value, failing the test.
		new SimpleDateFormat(HTTP_DATE_PATTERN, Locale.ENGLISH).parse(date);
	}

	/**
	 * plain HTTP response (without any transfer/content-encoding)
	 * @throws Exception
	 */
	public void testPlainHttpRecord() throws Exception {
		String payload = "hogehogehogehogehoge";
		WARCRecordInfo recinfo = TestWARCRecordInfo.createHttpResponse(payload);
		Resource res = createResource(recinfo);
		try {
			res.parseHeaders();

			assertEquals("statusCode", 200, res.getStatusCode());
			assertEquals("content-type", "text/plain", res.getHeader("Content-Type"));
			assertContentLength(res, payload.getBytes(PAYLOAD_CHARSET).length);
		} finally {
			res.close();
		}
	}

	/**
	 * uncompressed, but chunked-encoded HTTP response
	 * @throws Exception
	 */
	public void testPlainChunkedHttpRecord() throws Exception {
		String payload = "hogehogehogehogehoge";
		byte[] payloadBytes = payload.getBytes(PAYLOAD_CHARSET);
		WARCRecordInfo recinfo = new TestWARCRecordInfo(
				TestWARCRecordInfo.buildHttpResponseBlock("200 OK",
						"text/plain", payloadBytes, true));
		Resource res = createResource(recinfo);
		try {
			res.parseHeaders();

			assertEquals("statusCode", 200, res.getStatusCode());
			assertEquals("content-type", "text/plain", res.getHeader("Content-Type"));
			assertContentLength(res, payloadBytes.length);
		} finally {
			res.close();
		}
	}

	/**
	 * gzip-compressed HTTP response.
	 * @throws Exception
	 */
	public void testCompressedHttpRecord() throws Exception {
		String payload = "hogehogehogehogehoge";
		byte[] payloadBytes = payload.getBytes(PAYLOAD_CHARSET);
		String ctype = "text/plain";
		WARCRecordInfo recinfo = new TestWARCRecordInfo(
				TestWARCRecordInfo.buildCompressedHttpResponseBlock(ctype,
						payloadBytes));
		Resource res = createResource(recinfo);
		try {
			res.parseHeaders();

			assertEquals("statusCode", 200, res.getStatusCode());
			assertEquals("content-type", ctype, res.getHeader("Content-Type"));

			Resource zres = TextReplayRenderer.decodeResource(res);
			assertTrue("wrapped with GzipDecodingResource",
					(zres instanceof GzipDecodingResource));

			// Length is checked on the decoded view, i.e. the original payload.
			assertContentLength(zres, payloadBytes.length);
		} finally {
			res.close();
		}
	}

	/**
	 * gzip-compressed, chunked-encoded HTTP response.
	 * @throws Exception
	 */
	public void testCompressedChunkedHttpRecord() throws Exception {
		String payload = "hogehogehogehogehoge";
		byte[] payloadBytes = payload.getBytes(PAYLOAD_CHARSET);
		String ctype = "text/plain";
		WARCRecordInfo recinfo = new TestWARCRecordInfo(
				TestWARCRecordInfo.buildCompressedHttpResponseBlock(ctype,
						payloadBytes, true));
		Resource res = createResource(recinfo);
		try {
			res.parseHeaders();

			assertEquals("statusCode", 200, res.getStatusCode());
			assertEquals("content-type", ctype, res.getHeader("Content-Type"));

			Resource zres = TextReplayRenderer.decodeResource(res);
			assertTrue("wrapped with GzipDecodingResource",
					(zres instanceof GzipDecodingResource));

			// Length is checked on the decoded view, i.e. the original payload.
			assertContentLength(zres, payloadBytes.length);
		} finally {
			res.close();
		}
	}

	// TODO: add more tests on various Transfer-Encoding and Content-Encoding.
	// TODO: add more tests on corner cases.

	/**
	 * metadata record with render-able content like site screenshot image.
	 *
	 * HTTP status is assumed to be 200, and Content-Type of WARC header
	 * becomes Content-Type of replay response.
	 * @throws Exception
	 */
	public void testMetadataRecord() throws Exception {
		// 1-dot transparent GIF found everywhere if you wonder :-)
		final byte[] block = new byte[] {
				71, 73, 70, 56, 57, 97, 1, 0, 1, 0, -128, 0, 0, -64, -64, -64,
				0, 0, 0, 33, -7, 4, 1, 0, 0, 0, 0, 44, 0, 0, 0, 0,
				1, 0, 1, 0, 0, 2, 2, 68, 1, 0, 59, 13, 10, 13, 10
		};
		final String ct = "image/gif";
		WARCRecordInfo recinfo = new TestWARCRecordInfo(block);
		recinfo.setType(WARCRecordType.metadata);
		recinfo.setMimetype(ct);
		Resource res = createResource(recinfo);
		try {
			// must not fail
			res.parseHeaders();

			// should return assumed 200
			assertEquals("statusCode", 200, res.getStatusCode());
			// content-type is what's specified in WARC header.
			assertEquals("content-type", ct, res.getHeader("Content-Type"));
			// must have Date header, in HTTP Date format.
			assertHasHttpDateHeader(res);

			// block as content
			byte[] buf = new byte[block.length + 1];
			int n = res.read(buf);
			assertEquals("content length", block.length, n);
			for (int i = 0; i < block.length; i++) {
				assertEquals("byte " + i, block[i], buf[i]);
			}
		} finally {
			res.close();
		}
	}

	/** WARC-Profile URI for identical-payload-digest revisits (not referenced by the tests in this class). */
	final String REVISIT_WARC_PROFILE = "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest";

	/**
	 * new, current revisit record, which has just HTTP response line and
	 * headers part of the capture.
	 * <p>Expectations:
	 * TextReplayRenderer receives revisit WarcResource as {@code httpHeaderResource},
	 * and calls following methods on it:</p>
	 * <ul>
	 * <li>{@link WarcResource#getStatusCode()}</li>
	 * <li>{@link WarcResource#getHttpHeaders()} (ok to return null)</li>
	 * </ul>
	 * @throws Exception
	 */
	public void testRevisitRecord() throws Exception {
		final String ct = "text/html";
		WARCRecordInfo recinfo = TestWARCRecordInfo.createRevisitHttpResponse(ct, 1345);
		Resource res = createResource(recinfo);
		try {
			res.parseHeaders();

			// these are from this record.
			assertEquals("statusCode", 200, res.getStatusCode());
			assertEquals("content-type", ct, res.getHeader("Content-Type"));

			StandardCharsetDetector csd = new StandardCharsetDetector();
			// assuming WaybackRequest (3rd parameter) is not used in getCharset().
			// The return value is ignored: the call only has to complete
			// without throwing.
			csd.getCharset(res, res, null);
		} finally {
			res.close();
		}
	}

	/**
	 * old revisit record, which has zero-length block (no HTTP response
	 * line, no HTTP headers).
	 *
	 * in this case, {@link WarcResource#getStatusCode()} should not fail, but
	 * either return special value or throw an appropriate exception signifying
	 * there's no HTTP status line recorded in this resource, and thus ReplayRenderer
	 * should fallback on using payloadResource for the info instead.
	 * {@link WarcResource#getHttpHeaders()} must not return null, but should
	 * return empty Map object, so that {@link CharsetDetector} can return null
	 * without failing.
	 *
	 * <p>NOTE: the paragraph above describes the desired behavior. The
	 * assertions in this test pin what is currently expected instead:
	 * status code {@code 0} and {@code null} headers.</p>
	 *
	 * for the better, this fallback may be encapsulated in
	 * virtual Resource combining httpHeaderResource and payloadResource.
	 *
	 * related issue: https://webarchive.jira.com/browse/ACC-126
	 * @throws Exception
	 * @see TextReplayRenderer
	 * @see StandardCharsetDetector#getCharset(org.archive.wayback.core.Resource, org.archive.wayback.core.Resource, org.archive.wayback.core.WaybackRequest)
	 */
	public void testOldRevisitRecord() throws Exception {
		final String ct = "text/html";
		WARCRecordInfo recinfo = TestWARCRecordInfo.createRevisitHttpResponse(ct, 1345, false);
		Resource res = createResource(recinfo);
		try {
			res.parseHeaders();

			// should either return special value or throw appropriate exception (TBD)
			int scode = res.getStatusCode();
			assertEquals("status code", 0, scode);

			Map<String, String> headers = res.getHttpHeaders();
			// Desired behavior is a non-null (empty) map; switch this to
			// assertNotNull once the implementation returns one.
			assertNull("headers", headers);
		} finally {
			res.close();
		}
	}

	/**
	 * URL-agnostic revisit record: besides status and content type, the
	 * resource must expose the target URI and date of the capture it
	 * refers to.
	 * @throws Exception
	 */
	public void testUrlAgnosticRevisitRecord() throws Exception {
		final String ctype = "text/html";
		WARCRecordInfo recinfo = TestWARCRecordInfo
				.createUrlAgnosticRevisitHttpResponse(ctype, 1345);
		Resource res = createResource(recinfo);
		try {
			res.parseHeaders();

			// these are from this record.
			assertEquals("statusCode", 200, res.getStatusCode());
			assertEquals("content-type", ctype, res.getHeader("Content-Type"));

			assertEquals("http://example.com/", res.getRefersToTargetURI());
			assertEquals("20140101101010", res.getRefersToDate());

			StandardCharsetDetector csd = new StandardCharsetDetector();
			// assuming WaybackRequest (3rd parameter) is not used in getCharset().
			// The return value is ignored: the call only has to complete
			// without throwing.
			csd.getCharset(res, res, null);
		} finally {
			res.close();
		}
	}

	/**
	 * resource record, typically used for archiving ftp fetches.
	 * @throws Exception
	 */
	public void testResourceRecord() throws Exception {
		final String ct = "text/plain";
		final byte[] block = "blahblahblah\n".getBytes(PAYLOAD_CHARSET);
		WARCRecordInfo recinfo = new TestWARCRecordInfo(block);
		recinfo.setType(WARCRecordType.resource);
		recinfo.setUrl("ftp://ftp.example.com/afile.txt");
		recinfo.setMimetype(ct);
		Resource res = createResource(recinfo);
		try {
			res.parseHeaders();

			int scode = res.getStatusCode();
			assertEquals("statusCode", 200, scode);

			Map<String, String> headers = res.getHttpHeaders();
			assertNotNull("headers", headers);

			assertEquals("content-type", ct, res.getHeader("Content-Type"));
			// must have Date header, in HTTP Date format.
			assertHasHttpDateHeader(res);
		} finally {
			res.close();
		}
	}

	// TODO: there can be revisit for ftp fetches, right?
}