package org.archive.io.warc;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.ParseException;
import java.util.Date;
import java.util.zip.GZIPOutputStream;
import org.archive.format.ArchiveFileConstants;
import org.archive.format.warc.WARCConstants;
import org.archive.util.DateUtils;
/**
 * {@link WARCRecordInfo} with default values and convenience factory methods
 * for building WARC record content in tests.
 *
 * <p>All text written by the block builders in this class (HTTP status line,
 * headers, chunk framing) is encoded as UTF-8 regardless of the platform
 * default charset, so generated blocks are byte-for-byte reproducible.</p>
 *
 * TODO: use existing well-tested HTTP library for generating HTTP content-block.
 *
 * @see TestWARCReader
 * @contributor kenji
 *
 */
public class TestWARCRecordInfo extends WARCRecordInfo implements WARCConstants, ArchiveFileConstants {

    /** Value of {@code WARC-Profile} header set on revisit records created by this class. */
    public final static String REVISIT_WARC_PROFILE =
            "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest";

    /** Charset name used for every String-to-bytes conversion in the block builders. */
    private static final String BLOCK_CHARSET = "UTF-8";

    /** Maximum number of payload bytes put in a single chunk by {@link #writeChunked}. */
    private static final int MAX_CHUNK_SIZE = 0x1000;

    /**
     * Creates a {@code response} record info whose content block is {@code content}.
     * Defaults: URL {@code http://test.example.com/}, mimetype
     * {@code application/http; msgtype=response}, record id {@code uri:recordidentifier},
     * and the creation date obtained from {@code DateUtils.getLog14Date()}.
     * @param content complete content block (for HTTP captures: status line, headers and payload)
     */
    public TestWARCRecordInfo(byte[] content) {
        this.type = WARCRecordType.response;
        this.url = "http://test.example.com/";
        this.mimetype = "application/http; msgtype=response";
        try {
            this.recordId = new URI("uri:recordidentifier");
        } catch (URISyntaxException ex) {
            // the literal above is a valid URI, so this is not expected to happen.
            throw new RuntimeException("unexpected error", ex);
        }
        this.contentStream = new ByteArrayInputStream(content);
        this.contentLength = content.length;
        // NB: create14DigitDate must be in ISOZ format (name "14DigitDate" is confusing)
        this.create14DigitDate = DateUtils.getLog14Date();
    }

    /**
     * translates DT14 (YYYYmmddHHMMSS) to ISOZ format used in WARC-Date header.
     * Parsing is done by {@code DateUtils.parse14DigitDate} and formatting by
     * {@code DateUtils.getLog14Date(Date)}.
     * @param dt14 timestamp in DT14 format
     * @return ISOZ (YYYY-mm-ddTHH:MM:SSZ), as produced by {@code DateUtils.getLog14Date(Date)}
     * @exception IOException dt14 is in bad format (wrapping ParseException to simplify error handling)
     */
    public static String dt14ToISOZ(String dt14) throws IOException {
        try {
            Date date = DateUtils.parse14DigitDate(dt14);
            return DateUtils.getLog14Date(date);
        } catch (ParseException ex) {
            throw new IOException("invalid DT14 " + dt14, ex);
        }
    }

    /**
     * utility method for updating create14DigitDate from DT14.
     * @param dt14 DT14 (YYYYmmddHHMMSS)
     * @throws IOException dt14 is in bad format.
     */
    public void setCreate14DigitDateFromDT14(String dt14) throws IOException {
        create14DigitDate = dt14ToISOZ(dt14);
    }

    // factory methods

    /**
     * return TestWARCRecordInfo for HTTP Response with entity {@code payload}.
     * Content-Type is {@code text/plain}, and {@code payload} is encoded in UTF-8.
     * @param payload entity text
     * @return TestWARCRecordInfo with default values set to key properties.
     * @throws IOException if building the content block fails
     */
    public static TestWARCRecordInfo createHttpResponse(String payload)
            throws IOException {
        return new TestWARCRecordInfo(
                buildHttpResponseBlock("text/plain", payload.getBytes(BLOCK_CHARSET)));
    }

    /**
     * return TestWARCRecordInfo for HTTP Response with entity {@code payload}.
     * @param ctype Content-Type value
     * @param payloadBytes payload bytes
     * @return WARCRecordInfo with default values set to key properties.
     * @throws IOException if building the content block fails
     */
    public static TestWARCRecordInfo createHttpResponse(String ctype, byte[] payloadBytes)
            throws IOException {
        return new TestWARCRecordInfo(buildHttpResponseBlock(ctype, payloadBytes));
    }

    /**
     * return TestWARCRecordInfo for HTTP Response with response status line {@code status},
     * entity {@code payload} of content-type {@code ctype}.
     * @param status status line, such as {@code "200 OK"}
     * @param ctype content-type
     * @param payloadBytes payload bytes
     * @return TestWARCRecordInfo
     * @throws IOException if building the content block fails
     */
    public static TestWARCRecordInfo createHttpResponse(String status,
            String ctype, byte[] payloadBytes) throws IOException {
        return new TestWARCRecordInfo(buildHttpResponseBlock(status, ctype, payloadBytes));
    }

    /**
     * return TestWARCRecordInfo for "200 OK" HTTP Response whose entity is
     * {@code payloadBytes} compressed with gzip ({@code Content-Encoding: gzip}).
     * @param ctype Content-Type value
     * @param payloadBytes uncompressed payload bytes
     * @return TestWARCRecordInfo
     * @throws IOException if compressing or building the content block fails
     */
    public static TestWARCRecordInfo createCompressedHttpResponse(String ctype,
            byte[] payloadBytes) throws IOException {
        return new TestWARCRecordInfo(buildCompressedHttpResponseBlock(ctype, payloadBytes));
    }

    /**
     * short cut for {@link #createRevisitHttpResponse(String, int, boolean, boolean)}
     * with {@code gzipContent = false}.
     * @param ctype Content-Type
     * @param len Content-Length (arbitrary)
     * @param withHeader include HTTP status line and headers
     * @return TestWARCRecordInfo of type {@code revisit}
     * @throws IOException if building the content block fails
     */
    public static TestWARCRecordInfo createRevisitHttpResponse(String ctype, int len, boolean withHeader)
            throws IOException {
        return createRevisitHttpResponse(ctype, len, withHeader, false);
    }

    /**
     * creates TestWARCRecordInfo of type {@code revisit}, with
     * {@code WARC-Truncated: length} and {@code WARC-Profile} =
     * {@link #REVISIT_WARC_PROFILE} extra headers.
     * @param ctype Content-Type
     * @param len Content-Length (arbitrary)
     * @param withHeader include HTTP status line and headers
     * @param gzipContent if true, block will have "Content-Encoding: gzip" header
     * @return TestWARCRecordInfo of type {@code revisit}
     * @throws IOException if building the content block fails
     */
    public static TestWARCRecordInfo createRevisitHttpResponse(String ctype, int len, boolean withHeader, boolean gzipContent)
            throws IOException {
        TestWARCRecordInfo recinfo = new TestWARCRecordInfo(
                buildRevisitHttpResponseBlock(ctype, len, withHeader, gzipContent));
        recinfo.setType(WARCRecordType.revisit);
        recinfo.addExtraHeader("WARC-Truncated", "length");
        recinfo.addExtraHeader("WARC-Profile", REVISIT_WARC_PROFILE);
        return recinfo;
    }

    /**
     * short cut for {@link #createRevisitHttpResponse(String, int, boolean)}
     * with {@code withHeader = true}.
     * @param ctype Content-Type
     * @param len Content-Length (arbitrary)
     * @return TestWARCRecordInfo of type {@code revisit}
     * @throws IOException if building the content block fails
     */
    public static TestWARCRecordInfo createRevisitHttpResponse(String ctype, int len)
            throws IOException {
        return createRevisitHttpResponse(ctype, len, true);
    }

    /**
     * creates TestWARCRecordInfo with URL-Agnostic Revisit WARC record content.
     * In addition to the headers set by
     * {@link #createRevisitHttpResponse(String, int, boolean, boolean)}:
     * <ul>
     * <li>{@code WARC-Refers-To-Target-URI} = {@code http://example.com/}</li>
     * <li>{@code WARC-Refers-To-Date} = {@code 2014-01-01T10:10:10Z}</li>
     * </ul>
     * @param ctype Content-Type
     * @param len Content-Length (arbitrary)
     * @return TestWARCRecordInfo
     * @throws IOException if building the content block fails
     */
    public static TestWARCRecordInfo createUrlAgnosticRevisitHttpResponse(
            String ctype, int len) throws IOException {
        // same base record as a regular revisit with HTTP headers and no gzip marker.
        TestWARCRecordInfo recinfo = createRevisitHttpResponse(ctype, len, true, false);
        recinfo.addExtraHeader("WARC-Refers-To-Target-URI", "http://example.com/");
        recinfo.addExtraHeader("WARC-Refers-To-Date", "2014-01-01T10:10:10Z");
        return recinfo;
    }

    /**
     * short cut for generating "200 OK" {@code text/plain} HTTP response content-block.
     * {@code payload} is encoded in UTF-8 (same as {@link #createHttpResponse(String)}).
     * @param payload entity text
     * @return content-block bytes with HTTP status line, HTTP headers and payload.
     * @throws IOException if building the content block fails
     */
    public static byte[] buildHttpResponseBlock(String payload) throws IOException {
        return buildHttpResponseBlock("text/plain", payload.getBytes(BLOCK_CHARSET));
    }

    /**
     * short cut for generating "200 OK" HTTP response content-block.
     * @param ctype HTTP Content-Type, such as {@code "text/plain"}, {@code "image/gif"}
     * @param payloadBytes payload bytes
     * @return content-block bytes with HTTP status line, HTTP headers and payload.
     * @throws IOException if building the content block fails
     */
    public static byte[] buildHttpResponseBlock(String ctype, byte[] payloadBytes) throws IOException {
        return buildHttpResponseBlock("200 OK", ctype, payloadBytes);
    }

    /**
     * creates the Writer used for status line and headers, with explicit charset
     * so that output does not depend on the platform default.
     */
    private static Writer newHeaderWriter(OutputStream out) throws IOException {
        return new OutputStreamWriter(out, BLOCK_CHARSET);
    }

    /**
     * writes {@code data} to {@code out} in chunked transfer-encoding: chunks of at most
     * {@link #MAX_CHUNK_SIZE} bytes, each preceded by its size in hexadecimal, followed by
     * the terminating zero-size chunk. Empty {@code data} produces just the terminator.
     */
    private static void writeChunked(OutputStream out, byte[] data) throws IOException {
        final byte[] crlf = CRLF.getBytes(BLOCK_CHARSET);
        int offset = 0;
        while (offset < data.length) {
            int n = Math.min(MAX_CHUNK_SIZE, data.length - offset);
            out.write(Integer.toHexString(n).getBytes(BLOCK_CHARSET));
            out.write(crlf);
            out.write(data, offset, n);
            out.write(crlf);
            offset += n;
        }
        out.write(("0" + CRLF + CRLF).getBytes(BLOCK_CHARSET));
    }

    /**
     * common implementation for HTTP response content-block with entity.
     * Header order is: status line, {@code Transfer-Encoding} or {@code Content-Length},
     * {@code Content-Type}, {@code Content-Encoding}.
     * @param status status code and text, ex. {@code "200 OK"}
     * @param ctype Content-Type value; header is omitted if null
     * @param contentEncoding Content-Encoding value; header is omitted if null
     * @param bodyBytes entity bytes, written as is (already encoded if contentEncoding is given)
     * @param chunked if true, use chunked transfer-encoding
     */
    private static byte[] buildResponseBlock(String status, String ctype,
            String contentEncoding, byte[] bodyBytes, boolean chunked) throws IOException {
        ByteArrayOutputStream blockbuf = new ByteArrayOutputStream();
        Writer bw = newHeaderWriter(blockbuf);
        bw.write("HTTP/1.0 " + status + CRLF);
        if (chunked) {
            bw.write("Transfer-Encoding: chunked" + CRLF);
        } else {
            bw.write("Content-Length: " + bodyBytes.length + CRLF);
        }
        if (ctype != null) {
            bw.write("Content-Type: " + ctype + CRLF);
        }
        if (contentEncoding != null) {
            bw.write("Content-Encoding: " + contentEncoding + CRLF);
        }
        bw.write(CRLF);
        // headers must reach blockbuf before entity bytes are written to it directly.
        bw.flush();
        if (chunked) {
            writeChunked(blockbuf, bodyBytes);
        } else {
            blockbuf.write(bodyBytes);
        }
        bw.close();
        return blockbuf.toByteArray();
    }

    /** returns {@code data} compressed with gzip. */
    private static byte[] gzip(byte[] data) throws IOException {
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        GZIPOutputStream zout = new GZIPOutputStream(buf);
        try {
            zout.write(data);
        } finally {
            // close() writes the gzip trailer; must happen before toByteArray().
            zout.close();
        }
        return buf.toByteArray();
    }

    /**
     * return content-block bytes for HTTP response.
     * @param status HTTP status code and status text separated by a space. ex. {@code "200 OK"}.
     * @param ctype HTTP Content-Type; if null, Content-Type header is omitted
     * @param payloadBytes payload bytes
     * @param chunked if true, use chunked transfer-encoding
     * @return content-block bytes with HTTP status line, HTTP headers and payload.
     * @throws IOException if building the content block fails
     */
    public static byte[] buildHttpResponseBlock(String status, String ctype, byte[] payloadBytes, boolean chunked)
            throws IOException {
        return buildResponseBlock(status, ctype, null, payloadBytes, chunked);
    }

    /**
     * return content-block bytes for HTTP response, without chunked transfer-encoding.
     * @param status HTTP status code and status text separated by a space. ex. {@code "200 OK"}.
     * @param ctype HTTP Content-Type; if null, Content-Type header is omitted
     * @param payloadBytes payload bytes
     * @return content-block bytes with HTTP status line, HTTP headers and payload.
     * @throws IOException if building the content block fails
     */
    public static byte[] buildHttpResponseBlock(String status, String ctype, byte[] payloadBytes) throws IOException {
        return buildHttpResponseBlock(status, ctype, payloadBytes, false);
    }

    /**
     * short cut for generating {@code "302 Moved Temporarily"} redirect content-block.
     * @param location value for Location header
     * @return content-block bytes with HTTP status line and HTTP headers (no entity).
     * @throws IOException if building the content block fails
     */
    public static byte[] buildHttpRedirectResponseBlock(String location) throws IOException {
        return buildHttpRedirectResponseBlock("302 Moved Temporarily", location);
    }

    /**
     * return content-block bytes for HTTP redirect response with empty entity
     * ({@code Content-Length: 0}, {@code Content-Type: text/html}).
     * @param statusline status code and text, expected to start with {@code "3"}
     *   (checked by {@code assert} only)
     * @param location value for Location header
     * @return content-block bytes with HTTP status line and HTTP headers (no entity).
     * @throws IOException if building the content block fails
     */
    public static byte[] buildHttpRedirectResponseBlock(String statusline,
            String location) throws IOException {
        assert statusline.startsWith("3");
        ByteArrayOutputStream blockbuf = new ByteArrayOutputStream();
        Writer bw = newHeaderWriter(blockbuf);
        bw.write("HTTP/1.0 " + statusline + CRLF);
        bw.write("Content-Length: 0" + CRLF);
        bw.write("Content-Type: text/html" + CRLF);
        bw.write("Location: " + location + CRLF);
        bw.write(CRLF);
        bw.close();
        return blockbuf.toByteArray();
    }

    /**
     * return content-block bytes for "200 OK" HTTP response whose entity is
     * {@code payloadBytes} compressed with gzip. Content-Length (when not chunked)
     * is the length of compressed entity.
     * @param ctype HTTP Content-Type; if null, Content-Type header is omitted
     * @param payloadBytes uncompressed payload bytes
     * @param chunked if true, use chunked transfer-encoding
     * @return content-block bytes with HTTP status line, HTTP headers and gzipped payload.
     * @throws IOException if compressing or building the content block fails
     */
    public static byte[] buildCompressedHttpResponseBlock(String ctype,
            byte[] payloadBytes, boolean chunked) throws IOException {
        return buildResponseBlock("200 OK", ctype, "gzip", gzip(payloadBytes), chunked);
    }

    /**
     * short cut for {@link #buildCompressedHttpResponseBlock(String, byte[], boolean)}
     * without chunked transfer-encoding.
     * @param ctype HTTP Content-Type; if null, Content-Type header is omitted
     * @param payloadBytes uncompressed payload bytes
     * @return content-block bytes with HTTP status line, HTTP headers and gzipped payload.
     * @throws IOException if compressing or building the content block fails
     */
    public static byte[] buildCompressedHttpResponseBlock(String ctype, byte[] payloadBytes) throws IOException {
        return buildCompressedHttpResponseBlock(ctype, payloadBytes, false);
    }

    /**
     * generates WARC content for new revisit record.
     * @param ctype value for Content-Type; if null, Content-Type header is omitted
     * @param len value for Content-Length; header is omitted if negative
     * @param withHeader include HTTP status line and headers.
     *   passing false generates old-style revisit content block (empty).
     * @param gzipContent if true, block will have "Content-Encoding: gzip" header.
     *   (this shall match the compress-ness of previous capture).
     * @return record content as byte array
     * @throws IOException if building the content block fails
     */
    public static byte[] buildRevisitHttpResponseBlock(String ctype, int len,
            boolean withHeader, boolean gzipContent) throws IOException {
        ByteArrayOutputStream blockbuf = new ByteArrayOutputStream();
        if (withHeader) {
            Writer bw = newHeaderWriter(blockbuf);
            bw.write("HTTP/1.0 200 OK" + CRLF);
            if (len >= 0) {
                bw.write("Content-Length: " + len + CRLF);
            }
            if (ctype != null) {
                bw.write("Content-Type: " + ctype + CRLF);
            }
            if (gzipContent) {
                bw.write("Content-Encoding: gzip" + CRLF);
            }
            bw.write(CRLF);
            bw.close();
        }
        return blockbuf.toByteArray();
    }

    // POPULAR PAYLOAD SAMPLES

    /**
     * ubiquitous 1-pixel transparent GIF, if you wonder.
     * Note the data ends with four extra bytes {@code 13, 10, 13, 10} (CR LF CR LF)
     * after byte {@code 59}. The array is shared; do not modify its elements.
     */
    public static final byte[] PAYLOAD_GIF = new byte[] {
        71, 73, 70, 56, 57, 97, 1, 0, 1, 0, -128, 0, 0, -64, -64, -64,
        0, 0, 0, 33, -7, 4, 1, 0, 0, 0, 0, 44, 0, 0, 0, 0,
        1, 0, 1, 0, 0, 2, 2, 68, 1, 0, 59, 13, 10, 13, 10
    };
}