package org.archive.wayback.resourcestore.jwat;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;
import org.apache.commons.lang.time.DateUtils;
import org.archive.format.warc.WARCConstants;
import org.archive.util.ArchiveUtils;
import org.archive.wayback.core.Resource;
import org.archive.wayback.exception.ResourceNotAvailableException;
import org.archive.wayback.replay.HttpHeaderOperation;
import org.jwat.arc.ArcReader;
import org.jwat.arc.ArcReaderFactory;
import org.jwat.arc.ArcRecordBase;
import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.common.HeaderLine;
import org.jwat.common.HttpHeader;
import org.jwat.common.Payload;
import org.jwat.common.UriProfile;
import org.jwat.gzip.GzipEntry;
import org.jwat.gzip.GzipReader;
import org.jwat.warc.WarcReader;
import org.jwat.warc.WarcReaderFactory;
import org.jwat.warc.WarcRecord;
/**
* JWATResource -- created by Nick Clarke for interfacing with JWAT ARC/WARC Readers
* Originally forked from https://bitbucket.org/nclarkekb/jwat-wayback-resourcestore
*
* @see JWATFlexResourceStore
*/
public class JWATResource extends Resource implements WARCConstants {

    /** Push-back wrapper around the raw stream handed to {@link #getResource}. */
    protected ByteCountingPushBackInputStream pbin;

    /** Gzip reader over {@link #pbin}; stays {@code null} when the input is not gzipped. */
    protected GzipReader gzipReader;

    /** Gzip entry the record is read from; stays {@code null} when the input is not gzipped. */
    protected GzipEntry gzipEntry;

    /** ARC reader; set only when the input was detected as an ARC record. */
    protected ArcReader arcReader;

    /** ARC record returned by {@link #arcReader}; may be {@code null}. */
    protected ArcRecordBase arcRecord;

    /** WARC reader; set only when the input was detected as a WARC record. */
    protected WarcReader warcReader;

    /** WARC record returned by {@link #warcReader}; may be {@code null}. */
    protected WarcRecord warcRecord;

    /** Stream handed to {@code setInputStream}; {@code null} means no replayable payload. */
    protected InputStream payloadStream;

    /** Headers exposed by {@link #getHttpHeaders()}; {@code null} when none were collected. */
    protected Map<String, String> headers = null;

    /** Payload length exposed by {@link #getRecordLength()}. */
    protected long length = 0;

    /** Status code exposed by {@link #getStatusCode()}. */
    protected int status = 0;

    /**
     * Resolves the WARC-Type header of a record to a {@link WARCRecordType}.
     *
     * @param rec record whose type header is inspected
     * @return the matching record type
     * @throws ResourceNotAvailableException if the header is absent, has no
     *         value, or names a type that is not a {@link WARCRecordType} constant
     */
    private static WARCRecordType getWARCRecordType(WarcRecord rec)
            throws ResourceNotAvailableException {
        HeaderLine rectypeHeader = rec.getHeader(HEADER_KEY_TYPE);
        // A header without a value is treated like a missing header;
        // valueOf(null) would otherwise fail with a NullPointerException.
        if (rectypeHeader == null || rectypeHeader.value == null) {
            throw new ResourceNotAvailableException("WARC-Type header is missing");
        }
        try {
            return WARCRecordType.valueOf(rectypeHeader.value);
        } catch (IllegalArgumentException ex) {
            throw new ResourceNotAvailableException(
                    "unrecognized WARC-Type \"" + rectypeHeader.value + "\"");
        }
    }

    /**
     * Reads one ARC or WARC record (optionally gzip-wrapped) from {@code rin}
     * and exposes it as a {@link Resource}.
     *
     * <p>If this method exits with an exception, everything opened so far is
     * closed on a best-effort basis; that includes the wrapper around
     * {@code rin}.
     *
     * @param rin stream positioned at the start of the record
     * @param offset offset passed through to the JWAT reader's
     *        {@code getNextRecordFrom}
     * @return the resource, or {@code null} when the record yields no payload
     *         stream (no record was read, or a WARC record type other than
     *         response, revisit, metadata or resource)
     * @throws IOException on read errors
     * @throws ResourceNotAvailableException if the gzip entry is invalid, the
     *         input is neither ARC nor WARC, or the WARC-Type is
     *         missing/unrecognized
     */
    public static Resource getResource(InputStream rin, long offset)
            throws IOException, ResourceNotAvailableException {
        JWATResource r = new JWATResource();
        r.pbin = new ByteCountingPushBackInputStream(rin, 32);
        boolean needsCleanup = true;
        try {
            ByteCountingPushBackInputStream in = openRecordStream(r);
            HttpHeader httpHeader = null;
            if (ArcReaderFactory.isArcRecord(in)) {
                httpHeader = readArcRecord(r, in, offset);
            } else if (WarcReaderFactory.isWarcRecord(in)) {
                httpHeader = readWarcRecord(r, in, offset);
            } else {
                throw new ResourceNotAvailableException("Unknown archive record");
            }
            if (r.payloadStream == null) {
                // Closed explicitly here so that a close failure still
                // propagates to the caller, as it always did on this path.
                needsCleanup = false;
                r.close();
                return null;
            }
            // Must precede header copying: setChunkedEncoding() is applied
            // after the input stream has been installed.
            r.setInputStream(r.payloadStream);
            if (httpHeader != null) {
                copyHttpHeaders(r, httpHeader);
            }
            needsCleanup = false;
            return r;
        } finally {
            if (needsCleanup) {
                closeQuietly(r);
            }
        }
    }

    /**
     * Returns the stream the record should be parsed from: the gzip entry's
     * content when the input is gzipped, otherwise the raw push-back stream.
     */
    private static ByteCountingPushBackInputStream openRecordStream(JWATResource r)
            throws IOException, ResourceNotAvailableException {
        if (!GzipReader.isGzipped(r.pbin)) {
            return r.pbin;
        }
        r.gzipReader = new GzipReader(r.pbin);
        r.gzipEntry = r.gzipReader.getNextEntry();
        if (r.gzipEntry == null) {
            throw new ResourceNotAvailableException("GZip entry is invalid");
        }
        return new ByteCountingPushBackInputStream(
                new BufferedInputStream(r.gzipEntry.getInputStream(), 8192), 32);
    }

    /**
     * Reads an ARC record into {@code r} and returns its HTTP header, or
     * {@code null} when there is no record or no HTTP header.
     */
    private static HttpHeader readArcRecord(JWATResource r,
            ByteCountingPushBackInputStream in, long offset) throws IOException {
        r.arcReader = ArcReaderFactory.getReaderUncompressed();
        r.arcReader.setUriProfile(UriProfile.RFC3986_ABS_16BIT_LAX);
        r.arcReader.setBlockDigestEnabled(false);
        r.arcReader.setPayloadDigestEnabled(false);
        r.arcRecord = r.arcReader.getNextRecordFrom(in, offset);
        if (r.arcRecord == null) {
            return null;
        }
        Payload payload = r.arcRecord.getPayload();
        HttpHeader httpHeader = (payload != null) ? r.arcRecord.getHttpHeader() : null;
        assignPayload(r, payload, httpHeader, 200);
        return httpHeader;
    }

    /**
     * Reads a WARC record into {@code r} and returns its HTTP header, or
     * {@code null} when there is none. For metadata/resource records the
     * Content-Type and Date headers are synthesized from the WARC headers.
     */
    private static HttpHeader readWarcRecord(JWATResource r,
            ByteCountingPushBackInputStream in, long offset)
            throws IOException, ResourceNotAvailableException {
        r.warcReader = WarcReaderFactory.getReaderUncompressed();
        r.warcReader.setWarcTargetUriProfile(UriProfile.RFC3986_ABS_16BIT_LAX);
        r.warcReader.setBlockDigestEnabled(false);
        r.warcReader.setPayloadDigestEnabled(false);
        r.warcRecord = r.warcReader.getNextRecordFrom(in, offset);
        if (r.warcRecord == null) {
            return null;
        }
        WARCRecordType rectype = getWARCRecordType(r.warcRecord);
        if (rectype == WARCRecordType.response || rectype == WARCRecordType.revisit) {
            Payload payload = r.warcRecord.getPayload();
            HttpHeader httpHeader = (payload != null) ? r.warcRecord.getHttpHeader() : null;
            // A revisit without any payload reports status 0: look in the original.
            int emptyStatus = (rectype == WARCRecordType.revisit) ? 0 : 200;
            assignPayload(r, payload, httpHeader, emptyStatus);
            return httpHeader;
        }
        if (rectype == WARCRecordType.metadata || rectype == WARCRecordType.resource) {
            // Record body is the payload, assume 200 status. A missing payload
            // is served as an empty body, matching the other record kinds.
            assignPayload(r, r.warcRecord.getPayload(), null, 200);
            applySyntheticHeaders(r);
        }
        // Any other record type leaves payloadStream null.
        return null;
    }

    /**
     * Chooses the replay stream, length and status: the HTTP payload when an
     * HTTP header was parsed, else the complete record payload with status
     * 200, else an empty stream with {@code emptyStatus}.
     */
    private static void assignPayload(JWATResource r, Payload payload,
            HttpHeader httpHeader, int emptyStatus) throws IOException {
        if (httpHeader != null) {
            r.payloadStream = httpHeader.getPayloadInputStream();
            r.length = httpHeader.payloadLength;
            r.status = httpHeader.statusCode;
        } else if (payload != null) {
            r.payloadStream = payload.getInputStreamComplete();
            r.length = payload.getTotalLength();
            r.status = 200;
        } else {
            r.payloadStream = new ByteArrayInputStream(new byte[0]);
            r.length = 0;
            r.status = emptyStatus;
        }
    }

    /**
     * Builds the essential headers for a non-HTTP WARC record from its
     * content-type and WARC-Date headers. Leaves {@code r.headers} untouched
     * when neither is available.
     */
    private static void applySyntheticHeaders(JWATResource r) {
        HeaderLine ctHeader = r.warcRecord.getHeader("content-type");
        String contentType = (ctHeader != null) ? ctHeader.value : null;
        HeaderLine dateHeader = r.warcRecord.getHeader(HEADER_KEY_DATE);
        String httpDate = (dateHeader != null) ? warcDateToHttpDate(dateHeader.value) : null;
        if (contentType == null && httpDate == null) {
            return;
        }
        Map<String, String> synthetic = new Hashtable<String, String>();
        if (contentType != null) {
            synthetic.put("Content-Type", contentType);
        }
        if (httpDate != null) {
            synthetic.put("Date", httpDate);
        }
        r.headers = synthetic;
    }

    /**
     * Translates an ISOZ WARC-Date value ({@code yyyy-MM-dd'T'HH:mm:ss'Z'})
     * to an RFC 1123 HTTP date.
     *
     * @return the HTTP date, or {@code null} when the value is missing or
     *         does not match the pattern in full
     */
    private static String warcDateToHttpDate(String warcDate) {
        if (warcDate == null) {
            return null;
        }
        SimpleDateFormat isoz = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ROOT);
        // The trailing 'Z' is a quoted literal, so the parser never sees a
        // zone designator; it must be told explicitly that the value is UTC,
        // otherwise the JVM default time zone is applied.
        isoz.setTimeZone(TimeZone.getTimeZone("UTC"));
        ParsePosition pos = new ParsePosition(0);
        Date parsed = isoz.parse(warcDate, pos);
        // Reject trailing garbage: the whole value has to be consumed.
        if (parsed == null || pos.getIndex() != warcDate.length()) {
            return null;
        }
        return org.archive.util.DateUtils.getRFC1123Date(parsed);
    }

    /**
     * Copies the record's HTTP headers into {@code r.headers} under
     * lower-cased names and switches on chunked decoding when the
     * transfer-encoding header asks for it.
     */
    private static void copyHttpHeaders(JWATResource r, HttpHeader httpHeader)
            throws IOException {
        r.headers = new Hashtable<String, String>();
        for (HeaderLine headerLine : httpHeader.getHeaderList()) {
            // Hashtable rejects null keys and values; skip such lines rather
            // than fail the whole resource.
            if (headerLine.name == null || headerLine.value == null) {
                continue;
            }
            // Locale.ROOT keeps case mapping independent of the JVM default
            // locale (e.g. Turkish maps 'I' to a dotless 'i').
            String name = headerLine.name.toLowerCase(Locale.ROOT);
            if (name.equals("transfer-encoding")
                    && HttpHeaderOperation.HTTP_CHUNKED_ENCODING_HEADER
                            .equals(headerLine.value.toUpperCase(Locale.ROOT))) {
                r.setChunkedEncoding();
            }
            r.headers.put(name, headerLine.value);
        }
    }

    /** Closes {@code r}, discarding any IOException; used on failure paths only. */
    private static void closeQuietly(JWATResource r) {
        try {
            r.close();
        } catch (IOException ignored) {
            // Best-effort cleanup: the exception already in flight matters more.
        }
    }

    /**
     * @return the collected headers, or {@code null} when the record
     *         provided none
     */
    @Override
    public Map<String, String> getHttpHeaders() {
        return headers;
    }

    /** @return the payload length recorded when the record was read */
    @Override
    public long getRecordLength() {
        return length;
    }

    /** @return the status code recorded when the record was read */
    @Override
    public int getStatusCode() {
        return status;
    }

    /**
     * @return the value of the WARC-Refers-To-Target-URI header, or
     *         {@code null} for non-WARC records or when the header is absent
     */
    @Override
    public String getRefersToTargetURI() {
        if (warcRecord != null) {
            HeaderLine h = warcRecord.getHeader("WARC-Refers-To-Target-URI");
            if (h != null) {
                return h.value;
            }
        }
        return null;
    }

    /**
     * @return the WARC-Refers-To-Date header converted to a 14-digit
     *         timestamp, or {@code null} for non-WARC records, when the
     *         header is absent, or when its value cannot be parsed
     */
    @Override
    public String getRefersToDate() {
        if (warcRecord != null) {
            HeaderLine h = warcRecord.getHeader("WARC-Refers-To-Date");
            if (h != null) {
                Date date = ArchiveUtils.parse14DigitISODate(h.value, null);
                if (date != null) {
                    return ArchiveUtils.get14DigitDate(date);
                }
            }
        }
        return null;
    }

    /**
     * Closes the record, readers and streams in the order: WARC, ARC, gzip,
     * raw input. Every close is attempted even if an earlier one throws; if
     * several throw, the last exception is the one that propagates.
     *
     * @throws IOException if any of the underlying closes fails
     */
    @Override
    public void close() throws IOException {
        try {
            closeWarc();
        } finally {
            try {
                closeArc();
            } finally {
                closeContainer();
            }
        }
    }

    /** Closes the WARC record, then the WARC reader. */
    private void closeWarc() throws IOException {
        try {
            if (warcRecord != null) {
                warcRecord.close();
            }
        } finally {
            if (warcReader != null) {
                warcReader.close();
            }
        }
    }

    /** Closes the ARC record, then the ARC reader. */
    private void closeArc() throws IOException {
        try {
            if (arcRecord != null) {
                arcRecord.close();
            }
        } finally {
            if (arcReader != null) {
                arcReader.close();
            }
        }
    }

    /** Closes the gzip entry, the gzip reader, then the raw input wrapper. */
    private void closeContainer() throws IOException {
        try {
            if (gzipEntry != null) {
                gzipEntry.close();
            }
        } finally {
            try {
                if (gzipReader != null) {
                    gzipReader.close();
                }
            } finally {
                if (pbin != null) {
                    pbin.close();
                }
            }
        }
    }
}