/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.core;

import java.util.Date;

import org.archive.wayback.util.url.UrlOperations;

/**
 * A single capture (one archived snapshot of one URL) within a set of search
 * results. All values are stored as Strings in the underlying
 * {@link SearchResult} key-value map; frequently-parsed numeric values
 * (offset, compressed length, capture date) are additionally cached in
 * primitive fields to avoid repeated parsing.
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class CaptureSearchResult extends SearchResult {

	// Parsed-value caches; -1 means "not yet parsed from the string map".
	private long cachedOffset = -1;
	private long cachedCompressedLength = -1;
	private long cachedDate = -1;

	public static final String CAPTURE_ORIGINAL_URL = "url";

	public static final String CAPTURE_ORIGINAL_HOST = "host";

	/**
	 * Result: canonicalized(lookup key) form of URL of captured document
	 */
	public static final String CAPTURE_URL_KEY = "urlkey";

	/**
	 * Result: 14-digit timestamp when document was captured
	 */
	public static final String CAPTURE_CAPTURE_TIMESTAMP = "capturedate";

	/**
	 * Result: basename of ARC/WARC file containing this document.
	 */
	public static final String CAPTURE_FILE = "file";

	/**
	 * Result: compressed byte offset within ARC/WARC file where this
	 * document's gzip envelope begins.
	 */
	public static final String CAPTURE_OFFSET = "compressedoffset";

	/**
	 * Result: compressed length in bytes of this document's gzip envelope
	 * within the ARC/WARC file. (The map key "compressedendoffset" is
	 * historical; the stored value is a length, as reflected by
	 * {@link #getCompressedLength()} / {@link #setCompressedLength(long)}.)
	 */
	public static final String CAPTURE_COMPRESSED_LENGTH = "compressedendoffset";

	/**
	 * Result: best-guess at mime-type of this document.
	 */
	public static final String CAPTURE_MIME_TYPE = "mimetype";

	/**
	 * Result: 3-digit integer HTTP response code. may be '0' in some
	 * fringe conditions, old ARCs, bug in crawler, etc.
	 */
	public static final String CAPTURE_HTTP_CODE = "httpresponsecode";

	/**
	 * Result: some form of document fingerprint. This should represent the
	 * HTTP payload only for HTTP captured resources. It may represent an MD5,
	 * a SHA1, and may be a fragment of the full representation of the digest.
	 */
	public static final String CAPTURE_DIGEST = "digest";

	/**
	 * Result: URL that this document redirected to, or '-' if it does
	 * not redirect
	 */
	public static final String CAPTURE_REDIRECT_URL = "redirecturl";

	/**
	 * Result: String flags which indicate robot instructions found in an HTML
	 * page. Currently one or more of:
	 * <li>"A" - noarchive</li>
	 * <li>"F" - nofollow</li>
	 * <li>"I" - noindex</li>
	 * @see "http://noarchive.net/"
	 */
	public static final String CAPTURE_ROBOT_FLAGS = "robotflags";

	public static final String CAPTURE_ROBOT_NOARCHIVE = "A";

	public static final String CAPTURE_ROBOT_NOFOLLOW = "F";

	public static final String CAPTURE_ROBOT_NOINDEX = "I";

	/**
	 * Result: flag within a SearchResult that indicates this is the closest to
	 * a particular requested date.
	 */
	public static final String CAPTURE_CLOSEST_INDICATOR = "closest";

	public static final String CAPTURE_CLOSEST_VALUE = "true";

	/**
	 * Result: this key being present indicates that this particular capture
	 * was not actually stored, and that other values within this SearchResult
	 * are actually values from a different record which *should* be identical
	 * to this capture, had it been stored.
	 */
	public static final String CAPTURE_DUPLICATE_ANNOTATION = "duplicate";

	/**
	 * Result: this key is present when the CAPTURE_DUPLICATE_ANNOTATION is
	 * also present, with the value indicating the last date that was actually
	 * stored for this duplicate.
	 */
	public static final String CAPTURE_DUPLICATE_STORED_TS = "duplicate-ts";

	/**
	 * flag indicates that this document was downloaded and verified as
	 * identical to a previous capture by digest.
	 */
	public static final String CAPTURE_DUPLICATE_DIGEST = "digest";

	/**
	 * flag indicates that this document was NOT downloaded, but that the
	 * origin server indicated that the document had not changed, based on
	 * If-Modified HTTP request headers.
	 */
	public static final String CAPTURE_DUPLICATE_HTTP = "http";

	/**
	 * @return the original URL which resulted in the capture. If it is not
	 * available, the urlKey and original Host will be used to reconstruct
	 * something possibly closer to the original URL than the urlKey
	 */
	public String getOriginalUrl() {
		String url = get(CAPTURE_ORIGINAL_URL);
		if (url == null) {
			// convert from ORIG_HOST to ORIG_URL here:
			url = getUrlKey();
			String host = get(CAPTURE_ORIGINAL_HOST);
			if (url != null && host != null) {
				StringBuilder sb = new StringBuilder(url.length());
				sb.append(UrlOperations.DEFAULT_SCHEME);
				sb.append(host);
				sb.append(UrlOperations.getURLPath(url));
				url = sb.toString();
				// cache the reconstructed value so subsequent calls are cheap
				setOriginalUrl(url);
			}
		}
		return url;
	}

	/**
	 * @param originalUrl as close to the original URL by which this Resource
	 * was captured as is possible
	 */
	public void setOriginalUrl(String originalUrl) {
		put(CAPTURE_ORIGINAL_URL, originalUrl);
	}

	/**
	 * @return the hostname of the captured URL; derived from
	 * {@link #getOriginalUrl()} if not stored explicitly.
	 */
	public String getOriginalHost() {
		String host = get(CAPTURE_ORIGINAL_HOST);
		if (host == null) {
			host = UrlOperations.urlToHost(getOriginalUrl());
		}
		return host;
	}

	public void setOriginalHost(String originalHost) {
		put(CAPTURE_ORIGINAL_HOST, originalHost);
	}

	public String getUrlKey() {
		return get(CAPTURE_URL_KEY);
	}

	public void setUrlKey(String urlKey) {
		put(CAPTURE_URL_KEY, urlKey);
	}

	/**
	 * @return the capture date, parsed (and cached) from the 14-digit
	 * capture timestamp. A defensive copy is returned on each call.
	 */
	public Date getCaptureDate() {
		if (cachedDate == -1) {
			cachedDate = tsToDate(getCaptureTimestamp()).getTime();
		}
		return new Date(cachedDate);
	}

	public void setCaptureDate(Date date) {
		cachedDate = date.getTime();
		put(CAPTURE_CAPTURE_TIMESTAMP, dateToTS(date));
	}

	public String getCaptureTimestamp() {
		return get(CAPTURE_CAPTURE_TIMESTAMP);
	}

	public void setCaptureTimestamp(String timestamp) {
		put(CAPTURE_CAPTURE_TIMESTAMP, timestamp);
	}

	public String getFile() {
		return get(CAPTURE_FILE);
	}

	public void setFile(String file) {
		put(CAPTURE_FILE, file);
	}

	/**
	 * @return compressed byte offset of this capture's record within the
	 * ARC/WARC file. NOTE(review): unlike {@link #getCompressedLength()},
	 * this throws NumberFormatException if the offset field is absent —
	 * callers appear to rely on the field always being set; confirm before
	 * adding a null guard.
	 */
	public long getOffset() {
		if (cachedOffset == -1) {
			cachedOffset = Long.parseLong(get(CAPTURE_OFFSET));
		}
		return cachedOffset;
	}

	public void setOffset(long offset) {
		cachedOffset = offset;
		put(CAPTURE_OFFSET, String.valueOf(offset));
	}

	/**
	 * @return compressed length in bytes of this capture's record, or -1 if
	 * the field is not present.
	 */
	public long getCompressedLength() {
		if (cachedCompressedLength == -1) {
			String tmp = get(CAPTURE_COMPRESSED_LENGTH);
			cachedCompressedLength = tmp == null ? -1 : Long.parseLong(tmp);
		}
		return cachedCompressedLength;
	}

	public void setCompressedLength(long offset) {
		cachedCompressedLength = offset;
		put(CAPTURE_COMPRESSED_LENGTH, String.valueOf(offset));
	}

	public String getMimeType() {
		return get(CAPTURE_MIME_TYPE);
	}

	public void setMimeType(String mimeType) {
		put(CAPTURE_MIME_TYPE, mimeType);
	}

	public String getHttpCode() {
		return get(CAPTURE_HTTP_CODE);
	}

	public void setHttpCode(String httpCode) {
		put(CAPTURE_HTTP_CODE, httpCode);
	}

	public String getDigest() {
		return get(CAPTURE_DIGEST);
	}

	public void setDigest(String digest) {
		put(CAPTURE_DIGEST, digest);
	}

	public String getRedirectUrl() {
		return get(CAPTURE_REDIRECT_URL);
	}

	public void setRedirectUrl(String url) {
		put(CAPTURE_REDIRECT_URL, url);
	}

	public boolean isClosest() {
		return getBoolean(CAPTURE_CLOSEST_INDICATOR);
	}

	public void setClosest(boolean value) {
		putBoolean(CAPTURE_CLOSEST_INDICATOR, value);
	}

	/**
	 * Mark this capture as a digest-verified duplicate of a capture stored
	 * on the given date.
	 */
	public void flagDuplicateDigest(Date storedDate) {
		put(CAPTURE_DUPLICATE_ANNOTATION, CAPTURE_DUPLICATE_DIGEST);
		put(CAPTURE_DUPLICATE_STORED_TS, dateToTS(storedDate));
	}

	/**
	 * Mark this capture as a digest-verified duplicate of a capture stored
	 * at the given 14-digit timestamp.
	 */
	public void flagDuplicateDigest(String storedTS) {
		put(CAPTURE_DUPLICATE_ANNOTATION, CAPTURE_DUPLICATE_DIGEST);
		put(CAPTURE_DUPLICATE_STORED_TS, storedTS);
	}

	public boolean isDuplicateDigest() {
		String dupeType = get(CAPTURE_DUPLICATE_ANNOTATION);
		return (dupeType != null && dupeType.equals(CAPTURE_DUPLICATE_DIGEST));
	}

	/**
	 * @return the stored date of the capture this one duplicates, or null if
	 * this capture is not flagged as a digest duplicate.
	 */
	public Date getDuplicateDigestStoredDate() {
		if (isDuplicateDigest()) {
			return tsToDate(get(CAPTURE_DUPLICATE_STORED_TS));
		}
		return null;
	}

	/**
	 * @return the stored 14-digit timestamp of the capture this one
	 * duplicates, or null if not flagged as a digest duplicate.
	 */
	public String getDuplicateDigestStoredTimestamp() {
		if (isDuplicateDigest()) {
			return get(CAPTURE_DUPLICATE_STORED_TS);
		}
		return null;
	}

	/**
	 * Mark this capture as an HTTP (If-Modified) duplicate of a capture
	 * stored on the given date.
	 */
	public void flagDuplicateHTTP(Date storedDate) {
		put(CAPTURE_DUPLICATE_ANNOTATION, CAPTURE_DUPLICATE_HTTP);
		put(CAPTURE_DUPLICATE_STORED_TS, dateToTS(storedDate));
	}

	/**
	 * Mark this capture as an HTTP (If-Modified) duplicate of a capture
	 * stored at the given 14-digit timestamp.
	 */
	public void flagDuplicateHTTP(String storedTS) {
		put(CAPTURE_DUPLICATE_ANNOTATION, CAPTURE_DUPLICATE_HTTP);
		put(CAPTURE_DUPLICATE_STORED_TS, storedTS);
	}

	public boolean isDuplicateHTTP() {
		String dupeType = get(CAPTURE_DUPLICATE_ANNOTATION);
		return (dupeType != null && dupeType.equals(CAPTURE_DUPLICATE_HTTP));
	}

	/**
	 * @return the stored date of the capture this one duplicates, or null if
	 * this capture is not flagged as an HTTP duplicate.
	 */
	public Date getDuplicateHTTPStoredDate() {
		if (isDuplicateHTTP()) {
			return tsToDate(get(CAPTURE_DUPLICATE_STORED_TS));
		}
		return null;
	}

	/**
	 * @return the stored 14-digit timestamp of the capture this one
	 * duplicates, or null if not flagged as an HTTP duplicate.
	 */
	public String getDuplicateHTTPStoredTimestamp() {
		if (isDuplicateHTTP()) {
			return get(CAPTURE_DUPLICATE_STORED_TS);
		}
		return null;
	}

	public String getRobotFlags() {
		return get(CAPTURE_ROBOT_FLAGS);
	}

	public void setRobotFlags(String robotFlags) {
		put(CAPTURE_ROBOT_FLAGS, robotFlags);
	}

	/**
	 * Add a single robot flag character to the flag set, if not already
	 * present.
	 */
	public void setRobotFlag(String flag) {
		String flags = get(CAPTURE_ROBOT_FLAGS);
		if (flags == null) {
			flags = "";
		}
		if (!flags.contains(flag)) {
			flags = flags + flag;
		}
		put(CAPTURE_ROBOT_FLAGS, flags);
	}

	public boolean isRobotFlagSet(String flag) {
		String flags = get(CAPTURE_ROBOT_FLAGS);
		if (flags == null) {
			return false;
		}
		return flags.contains(flag);
	}

	public boolean isRobotNoArchive() {
		return isRobotFlagSet(CAPTURE_ROBOT_NOARCHIVE);
	}

	public boolean isRobotNoIndex() {
		return isRobotFlagSet(CAPTURE_ROBOT_NOINDEX);
	}

	public boolean isRobotNoFollow() {
		return isRobotFlagSet(CAPTURE_ROBOT_NOFOLLOW);
	}

	public void setRobotNoArchive() {
		setRobotFlag(CAPTURE_ROBOT_NOARCHIVE);
	}

	public void setRobotNoIndex() {
		// BUGFIX: previously set the NOARCHIVE ("A") flag, so
		// setRobotNoIndex() followed by isRobotNoIndex() returned false.
		setRobotFlag(CAPTURE_ROBOT_NOINDEX);
	}

	public void setRobotNoFollow() {
		// BUGFIX: previously set the NOARCHIVE ("A") flag, so
		// setRobotNoFollow() followed by isRobotNoFollow() returned false.
		setRobotFlag(CAPTURE_ROBOT_NOFOLLOW);
	}
}