/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.core;
import java.util.Date;
import org.archive.wayback.ResourceIndex;
import org.archive.wayback.util.url.UrlOperations;
/**
*
*
* @author brad
*/
public class CaptureSearchResult extends SearchResult implements Capture {
protected long cachedOffset = -1;
protected long cachedCompressedLength = -1;
protected long cachedDate = -1;
// Keep track of the z matched result so that we can walk
// back if the current result is a self-redirect/otherwise unavailable
private CaptureSearchResult prevResult = null;
private CaptureSearchResult nextResult = null;
public static final String CAPTURE_ORIGINAL_URL = "url";
public static final String CAPTURE_ORIGINAL_HOST = "host";
/**
* Result: canonicalized(lookup key) form of URL of captured document
*/
public static final String CAPTURE_URL_KEY = "urlkey";
/**
* Result: 14-digit timestamp when document was captured
*/
public static final String CAPTURE_CAPTURE_TIMESTAMP = "capturedate";
/**
* Result: basename of ARC/WARC file containing this document.
*/
public static final String CAPTURE_FILE = "file";
/**
* Result: compressed byte offset within ARC/WARC file where this document's
* gzip envelope begins.
*/
public static final String CAPTURE_OFFSET = "compressedoffset";
/**
* Result: compressed byte offset within ARC/WARC file where this document's
* gzip envelope Ends.
*/
public static final String CAPTURE_COMPRESSED_LENGTH = "compressedendoffset";
/**
* Result: best-guess at mime-type of this document.
*/
public static final String CAPTURE_MIME_TYPE = "mimetype";
/**
* Result: 3-digit integer HTTP response code. may be '0' in some fringe
* conditions, old ARCs, bug in crawler, etc.
*/
public static final String CAPTURE_HTTP_CODE = "httpresponsecode";
/**
* Result: some form of document fingerprint. This should represent the HTTP
* payload only for HTTP captured resources. It may represent an MD5, a
* SHA1, and may be a fragment of the full representation of the digest.
*/
public static final String CAPTURE_DIGEST = "digest";
/**
* Result: URL that this document redirected to, or '-' if it does not
* redirect
*/
public static final String CAPTURE_REDIRECT_URL = "redirecturl";
/**
* Result: String flags which indicate robot instructions found in an HTML
* page. Currently one or more of:
* <ul>
* <li>"A" - noarchive</li>
* <li>"F" - nofollow</li>
* <li>"I" - noindex</li>
* </ul>
* @see "http://noarchive.net/"
*/
public static final String CAPTURE_ROBOT_FLAGS = "robotflags";
public static final String CAPTURE_ROBOT_NOARCHIVE = "A";
public static final String CAPTURE_ROBOT_NOFOLLOW = "F";
public static final String CAPTURE_ROBOT_NOINDEX = "I";
public static final String CAPTURE_ROBOT_IGNORE = "G";
/**
* non-standard robot-flag indicating the capture is <i>soft-blocked</i>
* (not available for direct replay, but available as the original for
* a revisits.)
*/
public static final char CAPTURE_ROBOT_BLOCKED = 'X';
/**
* Result: flag within a SearchResult that indicates this is the closest to
* a particular requested date.
*/
public static final String CAPTURE_CLOSEST_INDICATOR = "closest";
public static final String CAPTURE_CLOSEST_VALUE = "true";
/**
* Result: this key being present indicates that this particular capture was
* not actually stored, and that other values within this SearchResult are
* actually values from a different record which *should* be identical to
* this capture, had it been stored.
*/
public static final String CAPTURE_DUPLICATE_ANNOTATION = "duplicate";
/**
* Result: this key is present when the CAPTURE_DUPLICATE_ANNOTATION is also
* present, with the value indicating the last date that was actually stored
* for this duplicate.
*/
public static final String CAPTURE_DUPLICATE_STORED_TS = "duplicate-ts";
/**
* flag indicates that this document was downloaded and verified as
* identical to a previous capture by digest.
*/
public static final String CAPTURE_DUPLICATE_DIGEST = "digest";
/**
* For identical content digest revisit records, the file where the payload
* can be found, if known.
*/
public static final String CAPTURE_DUPLICATE_PAYLOAD_FILE = "payload-" +
CAPTURE_FILE;
/**
* For identical content digest revisit records, the offset in
* CAPTURE_DUPLICATE_PAYLOAD_FILE where the payload record can be found, if
* known.
*/
public static final String CAPTURE_DUPLICATE_PAYLOAD_OFFSET = "payload-" +
CAPTURE_OFFSET;
/**
* For identical content digest revisit records, the compressed length in
* CAPTURE_DUPLICATE_PAYLOAD_LENGTH where the payload record can be found,
* if known.
*/
public static final String CAPTURE_DUPLICATE_PAYLOAD_COMPRESSED_LENGTH = "payload-" +
CAPTURE_COMPRESSED_LENGTH;
/**
* flag indicates that this document was NOT downloaded, but that the origin
* server indicated that the document had not changed, based on If-Modified
* HTTP request headers.
*/
public static final String CAPTURE_DUPLICATE_HTTP = "http";
public static final String CAPTURE_ORACLE_POLICY = "oracle-policy";
public CaptureSearchResult() {
}
protected CaptureSearchResult(boolean autocreateMap) {
super(autocreateMap);
}
/* (non-Javadoc)
* @see org.archive.wayback.core.Capture#getOriginalUrl()
*/
@Override
public String getOriginalUrl() {
String url = get(CAPTURE_ORIGINAL_URL);
if (url == null) {
// convert from ORIG_HOST to ORIG_URL here:
url = getUrlKey();
String host = get(CAPTURE_ORIGINAL_HOST);
if (url != null && host != null) {
StringBuilder sb = new StringBuilder(url.length());
sb.append(UrlOperations.DEFAULT_SCHEME);
sb.append(host);
sb.append(UrlOperations.getURLPath(url));
url = sb.toString();
// cache it for next time...?
setOriginalUrl(url);
}
}
return url;
}
/**
* @param originalUrl as close to the original URL by which this Resource
* was captured as is possible
*/
public void setOriginalUrl(String originalUrl) {
put(CAPTURE_ORIGINAL_URL, originalUrl);
}
public String getOriginalHost() {
String host = get(CAPTURE_ORIGINAL_HOST);
if (host == null) {
host = UrlOperations.urlToHost(getOriginalUrl());
}
return host;
}
public void setOriginalHost(String originalHost) {
put(CAPTURE_ORIGINAL_HOST, originalHost);
}
public String getUrlKey() {
return get(CAPTURE_URL_KEY);
}
public void setUrlKey(String urlKey) {
put(CAPTURE_URL_KEY, urlKey);
}
public Date getCaptureDate() {
if (cachedDate == -1) {
cachedDate = tsToDate(getCaptureTimestamp()).getTime();
}
return new Date(cachedDate);
}
public void setCaptureDate(Date date) {
cachedDate = date.getTime();
setCaptureTimestamp(dateToTS(date));
}
/* (non-Javadoc)
* @see org.archive.wayback.core.Capture#getCaptureTimestamp()
*/
@Override
public String getCaptureTimestamp() {
return get(CAPTURE_CAPTURE_TIMESTAMP);
}
public void setCaptureTimestamp(String timestamp) {
put(CAPTURE_CAPTURE_TIMESTAMP, timestamp);
}
public String getFile() {
return get(CAPTURE_FILE);
}
public void setFile(String file) {
put(CAPTURE_FILE, file);
}
public long getOffset() {
if (cachedOffset == -1) {
cachedOffset = Long.parseLong(get(CAPTURE_OFFSET));
}
return cachedOffset;
}
public void setOffset(long offset) {
cachedOffset = offset;
put(CAPTURE_OFFSET, String.valueOf(offset));
}
public long getCompressedLength() {
if (cachedCompressedLength == -1) {
String tmp = get(CAPTURE_COMPRESSED_LENGTH);
cachedCompressedLength = tmp == null ? -1 : Long.parseLong(tmp);
}
return cachedCompressedLength;
}
public void setCompressedLength(long offset) {
cachedCompressedLength = offset;
put(CAPTURE_COMPRESSED_LENGTH, String.valueOf(offset));
}
public String getMimeType() {
return get(CAPTURE_MIME_TYPE);
}
public void setMimeType(String mimeType) {
put(CAPTURE_MIME_TYPE, mimeType);
}
public String getHttpCode() {
return get(CAPTURE_HTTP_CODE);
}
public void setHttpCode(String httpCode) {
put(CAPTURE_HTTP_CODE, httpCode);
}
public String getDigest() {
return get(CAPTURE_DIGEST);
}
public void setDigest(String digest) {
put(CAPTURE_DIGEST, digest);
}
public String getRedirectUrl() {
return get(CAPTURE_REDIRECT_URL);
}
public void setRedirectUrl(String url) {
put(CAPTURE_REDIRECT_URL, url);
}
public boolean isClosest() {
return getBoolean(CAPTURE_CLOSEST_INDICATOR);
}
public void setClosest(boolean value) {
putBoolean(CAPTURE_CLOSEST_INDICATOR, value);
}
/*
* Identical content digest revisits have a duplicateDigestStoredDate if the
* payload is found by WARCRevisitAnnotationFilter in an earlier capture of
* the same url. If isDuplicateDigest() and
* getDuplicateDigestStoredTimestamp()==null then it must be a url-agnostic
* HER-2022 revisit.
*/
public void flagDuplicateDigest() {
put(CAPTURE_DUPLICATE_ANNOTATION, CAPTURE_DUPLICATE_DIGEST);
}
/**
* Mark this capture as a revisit of previous capture {@code payload}, identified by content digest.
* <p>Record location information is copied from {@code payload} so that the content can be
* loaded from the record later.</p>
* <p>{@link ResourceIndex} implementations should call this method before returning
* {@code CaptureSearchResult}s to {@code AccessPoint}.</p>
* @param payload capture being revisited
* @see #getDuplicateDigestStoredTimestamp()
* @see #getDuplicateDigestStoredDate()
* @see #getDuplicatePayloadFile()
* @see #getDuplicatePayloadOffset()
* @see #getDuplicatePayloadCompressedLength()
*/
public void flagDuplicateDigest(CaptureSearchResult payload) {
flagDuplicateDigest();
put(CAPTURE_DUPLICATE_STORED_TS, payload.getCaptureTimestamp());
put(CAPTURE_DUPLICATE_PAYLOAD_FILE, payload.getFile());
put(CAPTURE_DUPLICATE_PAYLOAD_OFFSET,
String.valueOf(payload.getOffset()));
if (payload.getCompressedLength() > 0) {
put(CAPTURE_DUPLICATE_PAYLOAD_COMPRESSED_LENGTH,
String.valueOf(payload.getCompressedLength()));
}
}
// For use in FastCaptureSearchResult, which stores the payload
// CaptureSearchResult directly
public CaptureSearchResult getDuplicatePayload() {
return null;
}
public String getDuplicatePayloadFile() {
return get(CAPTURE_DUPLICATE_PAYLOAD_FILE);
}
public Long getDuplicatePayloadOffset() {
if (get(CAPTURE_DUPLICATE_PAYLOAD_OFFSET) != null) {
return Long.valueOf(get(CAPTURE_DUPLICATE_PAYLOAD_OFFSET));
} else {
return null;
}
}
public long getDuplicatePayloadCompressedLength() {
if (get(CAPTURE_DUPLICATE_PAYLOAD_COMPRESSED_LENGTH) != null) {
return Long
.valueOf(get(CAPTURE_DUPLICATE_PAYLOAD_COMPRESSED_LENGTH));
} else {
return -1;
}
}
/** @deprecated */
public void flagDuplicateDigest(Date storedDate) {
flagDuplicateDigest();
put(CAPTURE_DUPLICATE_STORED_TS, dateToTS(storedDate));
}
/** @deprecated */
public void flagDuplicateDigest(String storedTS) {
flagDuplicateDigest();
put(CAPTURE_DUPLICATE_STORED_TS, storedTS);
}
/**
* whether this capture is a re-fetch of previously archived capture
* (<i>revisit</i>), detected by content's digest, and replay of
* that previous capture is not blocked.
* <p>1.8.1 2014-10-02 behavior change. This method now returns
* {@code false} even for revisits, if the original capture
* is blocked. Use #isRevisitDigest() for old behavior.</p>
* @return {@code true} if revisit
*/
public boolean isDuplicateDigest() {
if (!isRevisitDigest()) return false;
CaptureSearchResult orig = getDuplicatePayload();
if (orig != null && orig.isRobotFlagSet(CaptureSearchResult.CAPTURE_ROBOT_BLOCKED))
return false;
return true;
}
/**
* whether this capture is a re-fetch of previously archived capture
* (<i>revisit</i>), detected by content's digest.
* <p>This method is meant for use by replay processing. For use in
* user interface / web API code, consider {@link #isDuplicateDigest()}
* is more appropriate.</p>
* @return {@code true} if revisit
*/
public boolean isRevisitDigest() {
String dupeType = get(CAPTURE_DUPLICATE_ANNOTATION);
return (dupeType != null && dupeType.equals(CAPTURE_DUPLICATE_DIGEST));
}
public Date getDuplicateDigestStoredDate() {
if (isRevisitDigest() && get(CAPTURE_DUPLICATE_STORED_TS) != null) {
return tsToDate(get(CAPTURE_DUPLICATE_STORED_TS));
}
return null;
}
/**
* same with {@link #getDuplicateDigestStoredDate()}, but
* returns raw timestamp value.
* @return string representing timestamp.
*/
public String getDuplicateDigestStoredTimestamp() {
if (isRevisitDigest()) {
return get(CAPTURE_DUPLICATE_STORED_TS);
}
return null;
}
public void flagDuplicateHTTP(Date storedDate) {
put(CAPTURE_DUPLICATE_ANNOTATION, CAPTURE_DUPLICATE_HTTP);
put(CAPTURE_DUPLICATE_STORED_TS, dateToTS(storedDate));
}
public void flagDuplicateHTTP(String storedTS) {
put(CAPTURE_DUPLICATE_ANNOTATION, CAPTURE_DUPLICATE_HTTP);
put(CAPTURE_DUPLICATE_STORED_TS, storedTS);
}
/**
* whether this capture is an archive of {@code 304 Not Modified} response
* from the server.
* @return
*/
public boolean isDuplicateHTTP() {
String dupeType = get(CAPTURE_DUPLICATE_ANNOTATION);
return (dupeType != null && dupeType.equals(CAPTURE_DUPLICATE_HTTP));
}
public Date getDuplicateHTTPStoredDate() {
if (isDuplicateHTTP()) {
return tsToDate(get(CAPTURE_DUPLICATE_STORED_TS));
}
return null;
}
public String getDuplicateHTTPStoredTimestamp() {
if (isDuplicateHTTP()) {
return get(CAPTURE_DUPLICATE_STORED_TS);
}
return null;
}
/**
* return <i>robot flags</i> field value.
* @return
*/
public String getRobotFlags() {
return get(CAPTURE_ROBOT_FLAGS);
}
/**
* Set <i>robot flags</i> field value as a whole.
* For adding a flag, use {@link #setRobotFlag(char)} or
* {@link #setRobotFlag(String)}.
* @param robotFlags new field value
*/
public void setRobotFlags(String robotFlags) {
put(CAPTURE_ROBOT_FLAGS, robotFlags);
}
/**
* Add a flag to {@code robotflags} field.
* If {@code flag} is already set, this is a no-op.
* @param flag a flag to add (don't put multiple flags).
*/
public void setRobotFlag(String flag) {
String flags = getRobotFlags();
if (flags == null) {
flags = "";
}
if (!flags.contains(flag)) {
flags = flags + flag;
}
setRobotFlags(flags);
}
/**
* Add a flag to {@code robotflags} field.
* If {@code flag} is already set, this is a no-op.
* @param flag a flag to add
*/
public void setRobotFlag(char flag) {
String flags = getRobotFlags();
if (flags == null) {
setRobotFlags(Character.toString(flag));
} else {
if (flags.indexOf(flag) < 0) {
setRobotFlags(flags + flag);
}
}
}
/**
* test if {@code robotflags} field has flag {@code flag} set.
* <p>
* Caveat: if {@code flag} has more than once character,
* {@code robotflags} must have {@code flag} as its substring
* for this method to return {@code true} (not really useful).
* </p>
* @param flag flag to test
* @return {@code true} if {@code flag} is set.
*/
public boolean isRobotFlagSet(String flag) {
String flags = getRobotFlags();
if (flags == null) {
return false;
}
return flags.contains(flag);
}
/**
* test if {@code robotflags} field has flag {@code flag} set.
* @param flag one flag to test
* @return {@code true} if {@code flag} is set.
*/
public boolean isRobotFlagSet(char flag) {
String flags = getRobotFlags();
return flags != null && flags.indexOf(flag) >= 0;
}
public boolean isRobotNoArchive() {
return isRobotFlagSet(CAPTURE_ROBOT_NOARCHIVE);
}
public boolean isRobotNoIndex() {
return isRobotFlagSet(CAPTURE_ROBOT_NOINDEX);
}
public boolean isRobotNoFollow() {
return isRobotFlagSet(CAPTURE_ROBOT_NOFOLLOW);
}
public boolean isRobotIgnore() {
return isRobotFlagSet(CAPTURE_ROBOT_IGNORE);
}
public void setRobotNoArchive() {
setRobotFlag(CAPTURE_ROBOT_NOARCHIVE);
}
public void setRobotNoIndex() {
setRobotFlag(CAPTURE_ROBOT_NOINDEX);
}
public void setRobotNoFollow() {
setRobotFlag(CAPTURE_ROBOT_NOFOLLOW);
}
public void setRobotIgnore() {
setRobotFlag(CAPTURE_ROBOT_IGNORE);
}
public String getOraclePolicy() {
return get(CAPTURE_ORACLE_POLICY);
}
public void setOraclePolicy(String policy) {
put(CAPTURE_ORACLE_POLICY, policy);
}
public void setPrevResult(CaptureSearchResult result) {
prevResult = result;
}
public CaptureSearchResult getPrevResult() {
return prevResult;
}
public void setNextResult(CaptureSearchResult result) {
nextResult = result;
}
public CaptureSearchResult getNextResult() {
return nextResult;
}
public void removeFromList() {
if (nextResult != null) {
nextResult.setPrevResult(prevResult);
}
if (prevResult != null) {
prevResult.setNextResult(nextResult);
}
prevResult = null;
nextResult = null;
}
public String toString() {
return getCaptureDate().toString() + " " + getOriginalUrl();
}
/**
* {@code true} if HTTP response code is either {@code 4xx} or {@code 5xx}.
* @return
*/
public boolean isHttpError() {
if (isRevisitDigest() && (getDuplicatePayload() != null)) {
return getDuplicatePayload().isHttpError();
}
String httpCode = getHttpCode();
return (httpCode.startsWith("4") || httpCode.startsWith("5"));
}
/**
* {@code true} if HTTP response code is {@code 3xx}.
* @return
*/
public boolean isHttpRedirect() {
if (isRevisitDigest() && (getDuplicatePayload() != null)) {
return getDuplicatePayload().isHttpRedirect();
}
String httpCode = getHttpCode();
return (httpCode.startsWith("3"));
}
/**
* {@code true} if HTTP response code is {@code 2xx}.
* @return
*/
public boolean isHttpSuccess() {
if (isRevisitDigest() && (getDuplicatePayload() != null)) {
return getDuplicatePayload().isHttpSuccess();
}
String httpCode = getHttpCode();
return (httpCode.startsWith("2"));
}
}