/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.resourcestore.indexer;
import java.io.File;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.StatusLine;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.EncodingUtil;
import org.archive.format.warc.WARCConstants;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.arc.ARCConstants;
import org.archive.io.warc.WARCRecord;
import org.archive.util.LaxHttpParser;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.resourceindex.filters.WARCRevisitAnnotationFilter;
import org.archive.wayback.util.Adapter;
import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
/**
* Adapts certain WARCRecords into SearchResults. DNS and response records are
* mostly straightforward, but SearchResult objects generated from revisit
* records contain lots of "placeholder" fields, which are expected to be
* understood by later processes traversing a stream of SearchResult objects.
*
* @author brad
* @version $Date$, $Revision$
* @see WARCRevisitAnnotationFilter
*/
public class WARCRecordToSearchResultAdapter
implements Adapter<WARCRecord,CaptureSearchResult>{
private static final Logger LOGGER =
Logger.getLogger(WARCRecordToSearchResultAdapter.class.getName());
private static final String VERSION = "0.1.0";
private static final String WARC_FILEDESC_VERSION =
"warc/warcinfo" + VERSION;
private final static String DEFAULT_VALUE = "-";
private UrlCanonicalizer canonicalizer = null;
private HTTPRecordAnnotater annotater = null;
private boolean processAll = false;
public WARCRecordToSearchResultAdapter() {
canonicalizer = new IdentityUrlCanonicalizer();
annotater = new HTTPRecordAnnotater();
}
/*
* This just calls adaptInner, returning null if an Exception is thrown:
*/
public CaptureSearchResult adapt(WARCRecord rec) {
try {
return adaptInner(rec);
} catch (IOException e) {
e.printStackTrace();
return null;
} catch (OutOfMemoryError e) {
e.printStackTrace();
return null;
}
}
private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException {
ArchiveRecordHeader header = rec.getHeader();
String typeStr = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
WARCRecordType type;
try {
type = WARCRecordType.valueOf(typeStr);
} catch (IllegalArgumentException e) {
LOGGER.warning("Skipping unrecognized record type : " + typeStr);
return null;
}
CaptureSearchResult result = genericResult(rec);
switch (type) {
case response:
String mime = annotater.transformHTTPMime(header.getMimetype());
if(mime != null && mime.equals("text/dns")) {
// close to complete reading, then the digest is legit
// TODO: DO we want to use the WARC header digest for this?
rec.close();
result.setDigest(transformWARCDigest(rec.getDigestStr()));
result.setMimeType(mime);
} else {
result = adaptWARCHTTPResponse(result,rec);
}
break;
case revisit:
// also set the mime type:
result.setMimeType("warc/revisit");
break;
case request:
if(processAll) {
// also set the mime type:
result.setMimeType("warc/request");
} else {
result = null;
}
break;
case metadata:
if(processAll) {
// also set the mime type:
result.setMimeType("warc/metadata");
} else {
result = null;
}
break;
case warcinfo:
result.setMimeType(WARC_FILEDESC_VERSION);
break;
default:
LOGGER.info("Skipping record type : " + type);
break;
}
return result;
}
// ALL HELPER METHODS BELOW:
/*
* Extract all common WARC fields into a CaptureSearchResult. This is the
* same for all WARC record types:
*
* file, offset, timestamp, digest, urlKey, originalUrl
*/
private CaptureSearchResult genericResult(WARCRecord rec) {
CaptureSearchResult result = new CaptureSearchResult();
result.setMimeType(DEFAULT_VALUE);
result.setHttpCode(DEFAULT_VALUE);
result.setRedirectUrl(DEFAULT_VALUE);
ArchiveRecordHeader header = rec.getHeader();
String file = transformWARCFilename(header.getReaderIdentifier());
long offset = header.getOffset();
result.setCaptureTimestamp(transformWARCDate(header.getDate()));
result.setFile(file);
result.setOffset(offset);
result.setDigest(transformWARCDigest(header.getHeaderValue(
WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
String origUrl = header.getUrl();
if(origUrl == null) {
String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
if(type.equals(WARCConstants.WARCRecordType.warcinfo)) {
String filename = header.getHeaderValue(
WARCConstants.HEADER_KEY_FILENAME).toString();
result.setOriginalUrl("filedesc:"+filename);
result.setUrlKey("filedesc:"+filename);
} else {
result.setOriginalUrl(DEFAULT_VALUE);
result.setUrlKey(DEFAULT_VALUE);
}
} else {
result.setOriginalUrl(origUrl);
try {
String urlKey = canonicalizer.urlStringToKey(origUrl);
result.setUrlKey(urlKey);
} catch (URIException e) {
String shortUrl =
(origUrl.length() < 100)
? origUrl
:origUrl.substring(0,100);
LOGGER.warning("FAILED canonicalize(" + shortUrl + "):" +
file + " " + offset);
result.setUrlKey(origUrl);
}
}
return result;
}
/**
* borrowed(copied) from org.archive.io.arc.ARCRecord...
*
* @param bytes Array of bytes to examine for an EOL.
* @return Count of end-of-line characters or zero if none.
*/
private int getEolCharsCount(byte [] bytes) {
int count = 0;
if (bytes != null && bytes.length >=1 &&
bytes[bytes.length - 1] == '\n') {
count++;
if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
count++;
}
}
return count;
}
private String transformWARCFilename(String readerIdentifier) {
String warcName = readerIdentifier;
int index = warcName.lastIndexOf(File.separator);
if (index > 0 && (index + 1) < warcName.length()) {
warcName = warcName.substring(index + 1);
}
return warcName;
}
private String transformWARCDigest(final Object o) {
if(o == null) {
return DEFAULT_VALUE;
}
String orig = o.toString();
if(orig.startsWith("sha1:")) {
return orig.substring(5);
}
return orig;
// return (o == null) ? DEFAULT_VALUE : o.toString();
}
/*
* Transform input date to 14-digit timestamp:
* 2007-08-29T18:00:26Z => 20070829180026
*/
private static String transformWARCDate(final String input) {
StringBuilder output = new StringBuilder(14);
output.append(input.substring(0,4));
output.append(input.substring(5,7));
output.append(input.substring(8,10));
output.append(input.substring(11,13));
output.append(input.substring(14,16));
output.append(input.substring(17,19));
return output.toString();
}
/*
* Currently the WARCReader doesn't parse HTTP headers. This method parses
* them then calls the common ARC/WARC shared record parsing code, which
* addresses HTTP headers, and possibly even parses HTML content to look
* for Robot Meta tags.
*/
private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result,
WARCRecord rec) throws IOException {
ArchiveRecordHeader header = rec.getHeader();
// need to parse the documents HTTP message and headers here: WARCReader
// does not implement this... yet..
byte [] statusBytes = LaxHttpParser.readRawLine(rec);
int eolCharCount = getEolCharsCount(statusBytes);
if (eolCharCount <= 0) {
throw new RecoverableIOException("Failed to read http status where one " +
" was expected: " +
((statusBytes == null) ? "(null)" : new String(statusBytes)));
}
String statusLine = EncodingUtil.getString(statusBytes, 0,
statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
if ((statusLine == null) ||
!StatusLine.startsWithHTTP(statusLine)) {
throw new RecoverableIOException("Failed parse of http status line.");
}
StatusLine status = new StatusLine(statusLine);
result.setHttpCode(String.valueOf(status.getStatusCode()));
Header[] headers = LaxHttpParser.parseHeaders(rec,
ARCConstants.DEFAULT_ENCODING);
annotater.annotateHTTPContent(result,rec,headers,header.getMimetype());
return result;
}
public UrlCanonicalizer getCanonicalizer() {
return canonicalizer;
}
public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
this.canonicalizer = canonicalizer;
}
public boolean isProcessAll() {
return processAll;
}
public void setProcessAll(boolean processAll) {
this.processAll = processAll;
}
/**
* @return the annotater
*/
public HTTPRecordAnnotater getAnnotater() {
return annotater;
}
/**
* @param annotater the annotater to set
*/
public void setAnnotater(HTTPRecordAnnotater annotater) {
this.annotater = annotater;
}
}