/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.resourcestore.resourcefile;
import java.io.IOException;
import java.text.ParseException;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.StatusLine;
import org.apache.commons.httpclient.util.EncodingUtil;
import org.archive.format.arc.ARCConstants;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.warc.WARCRecord;
import org.archive.util.ArchiveUtils;
import org.archive.util.DateUtils;
import org.archive.util.LaxHttpParser;
import org.archive.wayback.core.Resource;
import org.archive.wayback.replay.HttpHeaderOperation;
/**
 * {@link Resource} implementation backed by a single {@link WARCRecord} and
 * an associated {@link ArchiveReader}; both are closed by {@link #close()}.
 *
 * <p>HTTP status and headers are not available until {@link #parseHeaders()}
 * has been called. For {@code response} and {@code revisit} records they are
 * parsed from the record payload; for {@code metadata} and {@code resource}
 * records a status of 200 and a minimal header map are synthesized from the
 * WARC header. For any other record type, and for records whose length is
 * not positive, the status stays 0 and {@link #getHttpHeaders()} returns
 * {@code null}.
 */
public class WarcResource extends Resource {
    /** {@code WARC-Profile} value that {@link #isRevisitNotModified()} tests for. */
    public static final String PROFILE_REVISIT_SERVER_NOT_MODIFIED =
        "http://netpreserve.org/warc/1.0/revisit/server-not-modified";

    /**
     * Format of the record date accepted when synthesizing an HTTP
     * {@code Date} header. The trailing {@code 'Z'} is a quoted literal, so
     * the parser must be pinned to UTC explicitly.
     */
    private static final String WARC_DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ss'Z'";

    private WARCRecord rec = null;
    private ArchiveReader reader = null;
    private Map<String, String> headers = null;
    private long length = 0;
    private int status = 0;
    private boolean parsedHeaders = false;

    /**
     * @param rec record supplying the WARC header and the payload stream
     * @param reader reader to be closed together with {@code rec}
     */
    public WarcResource(WARCRecord rec, ArchiveReader reader) {
        this.rec = rec;
        this.reader = reader;
    }

    /**
     * @param bytes Array of bytes to examine for an EOL.
     * @return Count of end-of-line characters or zero if none.
     */
    private int getEolCharsCount(byte [] bytes) {
        int count = 0;
        if (bytes != null && bytes.length >= 1 &&
                bytes[bytes.length - 1] == '\n') {
            count++;
            if (bytes.length >= 2 && bytes[bytes.length - 2] == '\r') {
                count++;
            }
        }
        return count;
    }

    /**
     * Populates the status code and HTTP header map for this record. Only the
     * first successful call does any work; later calls return immediately.
     *
     * @throws RecoverableIOException if the {@code WARC-Type} header is
     *         missing or unrecognized, or if a {@code response}/{@code revisit}
     *         record does not start with a parsable HTTP status line
     * @throws IOException if reading from the record fails
     */
    public void parseHeaders() throws IOException {
        if (parsedHeaders) {
            return;
        }
        // If warc or arc record is 0 length, don't do any more parsing!
        // Hopefully caller code will check this before proceeding as well
        if (getRecordLength() <= 0) {
            parsedHeaders = true;
            return;
        }
        WARCRecordType rectype = readRecordType();
        if (rectype == WARCRecordType.response || rectype == WARCRecordType.revisit) {
            parseHttpHeaders();
        } else if (rectype == WARCRecordType.metadata || rectype == WARCRecordType.resource) {
            synthesizeHeaders();
        }
        // Other record types are left with status 0 and no header map.
        parsedHeaders = true;
    }

    /** Reads the {@code WARC-Type} header and maps it to a {@link WARCRecordType}. */
    private WARCRecordType readRecordType() throws IOException {
        // WARCRecord should have getRecordType() method returning WARCRecordType.
        String rectypeStr = (String) rec.getHeader().getHeaderValue("WARC-Type");
        if (rectypeStr == null) {
            // Enum.valueOf(null) throws NullPointerException, which the
            // IllegalArgumentException handler below would not catch.
            throw new RecoverableIOException("missing WARC-Type header");
        }
        try {
            return WARCRecordType.valueOf(rectypeStr);
        } catch (IllegalArgumentException ex) {
            throw new RecoverableIOException("unrecognized WARC-Type \"" + rectypeStr + "\"");
        }
    }

    /**
     * Parses the HTTP status line and headers from the start of the record
     * payload, then hands the remaining stream to {@code setInputStream}.
     */
    private void parseHttpHeaders() throws IOException {
        // NOTE(review): readRawLine presumably returns null when no bytes are
        // left to read; the null case is handled explicitly below.
        byte[] statusBytes = LaxHttpParser.readRawLine(rec);
        int eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            String seen = (statusBytes == null)
                ? "null"
                : EncodingUtil.getString(statusBytes, 0, statusBytes.length,
                        ARCConstants.DEFAULT_ENCODING);
            throw new RecoverableIOException("Failed to read http status where one " +
                " was expected: " + seen);
        }
        String statusLineStr = EncodingUtil.getString(statusBytes, 0,
                statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
        if ((statusLineStr == null) ||
                !StatusLine.startsWithHTTP(statusLineStr)) {
            throw new RecoverableIOException("Failed parse of http status line.");
        }
        StatusLine statusLine = new StatusLine(statusLineStr);
        this.status = statusLine.getStatusCode();

        Header[] tmpHeaders = LaxHttpParser.parseHeaders(rec,
                ARCConstants.DEFAULT_ENCODING);
        headers = new Hashtable<String, String>();
        // The stream is installed before the loop so that setChunkedEncoding()
        // is invoked after setInputStream(), as in the original statement order.
        this.setInputStream(rec);
        for (Header header : tmpHeaders) {
            String name = header.getName();
            String value = header.getValue();
            headers.put(name, value);
            // Locale.ROOT keeps the comparison independent of the JVM default
            // locale (e.g. Turkish maps 'i' to a dotted capital I).
            if (name.toUpperCase(Locale.ROOT).contains(
                        HttpHeaderOperation.HTTP_TRANSFER_ENC_HEADER)
                    && value.toUpperCase(Locale.ROOT).contains(
                        HttpHeaderOperation.HTTP_CHUNKED_ENCODING_HEADER)) {
                setChunkedEncoding();
            }
        }
    }

    /**
     * Builds a status of 200 and a header map holding the WARC
     * {@code Content-Type} and, when it can be parsed, the record date as an
     * HTTP {@code Date} header; the whole payload becomes the input stream.
     */
    private void synthesizeHeaders() throws IOException {
        status = 200;
        headers = new HashMap<String, String>();
        String ct = (String) rec.getHeader().getHeaderValue("Content-Type");
        if (ct != null) {
            headers.put("Content-Type", ct);
        }
        // necessary?
        String date = rec.getHeader().getDate();
        if (date != null) {
            Date d = parseWarcDate(date);
            // Best effort: an unparsable date simply yields no Date header.
            if (d != null) {
                headers.put("Date", DateUtils.getRFC1123Date(d));
            }
        }
        setInputStream(rec);
    }

    /**
     * Parses a date in {@link #WARC_DATE_PATTERN} as UTC.
     *
     * @param warcDate text to parse, not {@code null}
     * @return the parsed date, or {@code null} if the text does not match the
     *         pattern in its entirety
     */
    private static Date parseWarcDate(String warcDate) {
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
        SimpleDateFormat format = new SimpleDateFormat(WARC_DATE_PATTERN, Locale.ROOT);
        format.setTimeZone(TimeZone.getTimeZone("UTC"));
        ParsePosition pos = new ParsePosition(0);
        Date parsed = format.parse(warcDate, pos);
        if (parsed == null || pos.getIndex() != warcDate.length()) {
            return null;
        }
        return parsed;
    }

    /**
     * @return the header map built by {@link #parseHeaders()}, or {@code null}
     *         if headers have not been parsed or none were produced for this
     *         record
     */
    @Override
    public Map<String, String> getHttpHeaders() {
        return headers;
    }

    /**
     * @return the header of the underlying WARC record
     */
    public ArchiveRecordHeader getWarcHeaders() {
        return rec.getHeader();
    }

    /**
     * @return the content length reported by the record header, or 0 if the
     *         record has no header; the value is cached once it is non-zero
     */
    @Override
    public long getRecordLength() {
        if ((length == 0) && (rec.getHeader() != null)) {
            length = rec.getHeader().getContentLength();
        }
        return length;
    }

    /**
     * @return the status set by {@link #parseHeaders()}, or 0 if none was set
     */
    @Override
    public int getStatusCode() {
        return status;
    }

    /**
     * Closes the record and then the reader. The reader is closed even when
     * closing the record fails, so that it is not leaked.
     *
     * @throws IOException if closing the record or the reader fails
     */
    @Override
    public void close() throws IOException {
        try {
            rec.close();
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    /**
     * @return value of the {@code WARC-Refers-To-Target-URI} header field, or
     *         {@code null} if absent
     */
    public String getRefersToTargetURI() {
        return (String) getWarcHeaders().getHeaderFields().get(
                "WARC-Refers-To-Target-URI");
    }

    /**
     * @return the {@code WARC-Refers-To-Date} header field converted to a
     *         14-digit date string, or {@code null} if the field is absent or
     *         cannot be parsed
     */
    public String getRefersToDate() {
        String dateString = (String) getWarcHeaders().getHeaderFields().get(
                "WARC-Refers-To-Date");
        if (dateString != null) {
            Date date = ArchiveUtils.parse14DigitISODate(dateString, null);
            if (date != null) {
                return ArchiveUtils.get14DigitDate(date);
            }
        }
        return null;
    }

    /**
     * whether this Resource is {@code server-not-modified} revisit.
     * (this method used to be {@code AccessPoint#isWarcRevisitNotModified(Resource)}.
     * Not made a part of {@code Resource} interface because it was unused.)
     * @return {@code true} if it is
     */
    public boolean isRevisitNotModified() {
        Map<String, Object> warcHeaders = getWarcHeaders().getHeaderFields();
        String warcProfile = (String) warcHeaders.get("WARC-Profile");
        return PROFILE_REVISIT_SERVER_NOT_MODIFIED.equals(warcProfile);
    }
}