/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.core;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.httpclient.ChunkedInputStream;
/**
* Abstraction on top of a document stored in a WaybackCollection.
*
* TODO: This implementation needs some pretty drastic refactoring. May have to
* wait for 2.0. This should be a byte-oriented record, and allow wrapping the
* interior byte-stream in one of the more full-featured HTTP libraries
* (jetty/apache-http-client/w3c-http-reference).
*
* For now, it is a system-wide assumption that all resources are HTTP based.
*
* TODO: Some code downcasts Resource to its sub-classes to gain access to
* methods only available in specific implementations. Consider adding more
* methods to make downcasting unnecessary. More sub-classes are expected, for
* example to encapsulate a revisit-original pair of Resources as a single Resource.
*
* @see org.archive.wayback.ResourceStore#retrieveResource(CaptureSearchResult)
*
* @author Brad Tofel
*/
public abstract class Resource extends InputStream {

    /**
     * Maximum number of bytes examined (not counting a trailing LF after CR)
     * when probing for a chunk-size line in {@link #setChunkedEncoding()}.
     */
    private static final int CHUNK_PEEK_LIMIT = 50;

    /**
     * Stream that every {@code InputStream} method of this class delegates to.
     * {@code null} until {@link #setInputStream(InputStream)} has been called.
     */
    private InputStream is;

    @Override
    public abstract void close() throws IOException;

    /**
     * Assumes an HTTP resource - return the HTTP response code
     * @return the HTTP response code from the HTTP message
     */
    public abstract int getStatusCode();

    /**
     * @return the size in bytes of the record payload, including HTTP header
     */
    public abstract long getRecordLength();

    /**
     * Assumes an HTTP response - return the HTTP headers, not including the
     * HTTP Message header
     * @return key-value Map of HTTP headers
     */
    public abstract Map<String,String> getHttpHeaders();

    // URL-Agnostic Revisit Support

    /**
     * return {@code WARC-Refers-To-Target-URI} WARC record header value or
     * equivalent.
     * Default implementation returns {@code null}.
     * @return header value (URI)
     */
    public String getRefersToTargetURI() {
        return null;
    }

    /**
     * return {@code WARC-Refers-To-Date} WARC record header value or
     * equivalent.
     * Default implementation returns {@code null}
     * @return 14-digit timestamp string ({@code yyyyMMddHHmmss})
     */
    public String getRefersToDate() {
        return null;
    }

    /**
     * Hook for parsing the record's headers. The default implementation does
     * nothing; per the original note, it is implemented in the warc/arc
     * readers.
     * @throws IOException declared for overriding implementations
     */
    public void parseHeaders() throws IOException {
        // Implemented in warc/arc reader
    }

    /**
     * Look up a single HTTP header value by name, ignoring case.
     * <p>
     * The comparison is locale-independent, so the result does not change
     * with the JVM default locale. If several keys differ only by case, the
     * first one encountered in the map's iteration order wins.
     *
     * @param headerName name of the header to look up; may be {@code null}
     * @return the value mapped to the first matching key, or {@code null} if
     *         {@link #getHttpHeaders()} returns {@code null}, if
     *         {@code headerName} is {@code null}, or if no key matches
     */
    public String getHeader(String headerName) {
        Map<String, String> httpHeaders = getHttpHeaders();
        if (httpHeaders == null || headerName == null) {
            return null;
        }
        for (Map.Entry<String, String> header : httpHeaders.entrySet()) {
            // equalsIgnoreCase tolerates a null key and avoids the
            // default-locale case mapping done by toUpperCase().
            if (headerName.equalsIgnoreCase(header.getKey())) {
                return header.getValue();
            }
        }
        return null;
    }

    /**
     * Ensure an underlying stream has been set.
     * @throws IOException if {@link #setInputStream(InputStream)} has not
     *         been called yet
     */
    private void validate() throws IOException {
        if (is == null) {
            throw new IOException("No InputStream");
        }
    }

    /**
     * Set the stream this Resource reads from. A stream that does not support
     * mark/reset is wrapped in a {@link BufferedInputStream} so that
     * {@link #setChunkedEncoding()} can peek ahead and rewind.
     *
     * @param is the stream to delegate to; must not be {@code null}
     * @throws NullPointerException if {@code is} is {@code null}
     */
    protected void setInputStream(InputStream is) {
        if (is == null) {
            throw new NullPointerException("is must not be null");
        }
        this.is = is.markSupported() ? is : new BufferedInputStream(is);
    }

    /**
     * indicate that there is a {@code Transfer-Encoding: chunked} header, so the input
     * data should be dechunked as it is read. This method actually peeks
     * ahead to verify that there is a hex-encoded chunk length before
     * assuming the data is chunked. The stream is rewound to its position
     * before the peek; only if the data looks chunked is the underlying
     * stream replaced by a {@link ChunkedInputStream} wrapping it.
     * @throws IOException for usual reasons, including when no underlying
     *         stream has been set
     */
    public void setChunkedEncoding() throws IOException {
        validate();
        // The probe reads at most CHUNK_PEEK_LIMIT + 1 bytes, so the mark
        // stays valid for the reset() below.
        is.mark(CHUNK_PEEK_LIMIT + 2);
        boolean isChunked = startsWithChunkSize();
        is.reset();
        if (isChunked) {
            setInputStream(new ChunkedInputStream(is));
        }
    }

    /**
     * Read ahead on the underlying stream and decide whether it begins with a
     * line of hex digits terminated by CRLF or a plain LF. Blank lines before
     * the digits and spaces around them are tolerated. The caller is
     * responsible for mark/reset around this call.
     *
     * @return {@code true} if a chunk-size line was found within the peek
     *         limit
     * @throws IOException if reading the underlying stream fails
     */
    private boolean startsWithChunkSize() throws IOException {
        int hexFound = 0;
        for (int read = 0; read < CHUNK_PEEK_LIMIT; read++) {
            int c = is.read();
            if (c == '\n') {
                // plain LF terminator; must have read at least 1 hex char
                if (hexFound > 0) {
                    return true;
                }
                // otherwise a blank line: keep looking
            } else if (c == '\r') {
                if (hexFound > 0) {
                    // A CR ending the digits must be followed by LF. Deciding
                    // here (instead of skipping the byte) keeps the number of
                    // bytes consumed within the mark limit.
                    return is.read() == '\n';
                }
                // otherwise part of a blank line: keep looking
            } else if (isHex(c)) {
                hexFound++;
            } else if (c != ' ') {
                // not a hex digit or space (this includes EOF, -1):
                // not a chunked stream.
                return false;
            }
        }
        return false;
    }

    /**
     * @param c a byte value as returned by {@link InputStream#read()}
     * @return {@code true} if {@code c} is an ASCII hex digit
     *         ({@code 0-9}, {@code a-f} or {@code A-F})
     */
    private boolean isHex(int c) {
        return ((c >= '0') && (c <= '9'))
                || ((c >= 'a') && (c <= 'f'))
                || ((c >= 'A') && (c <= 'F'));
    }

    /**
     * Delegates to the underlying stream.
     * @throws IOException if no underlying stream has been set, or if the
     *         underlying stream throws
     */
    @Override
    public int available() throws IOException {
        validate();
        return is.available();
    }

    /**
     * Delegates to the underlying stream; does nothing when no underlying
     * stream has been set.
     */
    @Override
    public void mark(int readlimit) {
        if (is != null) {
            is.mark(readlimit);
        }
    }

    /**
     * @return {@code false} when no underlying stream has been set, otherwise
     *         whatever the underlying stream reports
     */
    @Override
    public boolean markSupported() {
        return (is != null) && is.markSupported();
    }

    /**
     * Delegates to the underlying stream.
     * @throws IOException if no underlying stream has been set, or if the
     *         underlying stream throws
     */
    @Override
    public int read() throws IOException {
        validate();
        return is.read();
    }

    /**
     * Delegates to the underlying stream.
     * @throws IOException if no underlying stream has been set, or if the
     *         underlying stream throws
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        validate();
        return is.read(b, off, len);
    }

    /**
     * Delegates to the underlying stream.
     * @throws IOException if no underlying stream has been set, or if the
     *         underlying stream throws
     */
    @Override
    public int read(byte[] b) throws IOException {
        validate();
        return is.read(b);
    }

    /**
     * Delegates to the underlying stream.
     * @throws IOException if no underlying stream has been set, or if the
     *         underlying stream throws
     */
    @Override
    public void reset() throws IOException {
        validate();
        is.reset();
    }

    /**
     * Delegates to the underlying stream.
     * @throws IOException if no underlying stream has been set, or if the
     *         underlying stream throws
     */
    @Override
    public long skip(long n) throws IOException {
        validate();
        return is.skip(n);
    }
}