/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.core;

import java.util.Date;

import org.archive.wayback.util.url.UrlOperations;

/**
 * A single capture (one archived snapshot of one URL) within a set of search
 * results. All values are stored as Strings in the underlying
 * {@link SearchResult} key-value map; frequently-parsed numeric values
 * (offset, compressed length, capture date) are additionally cached in
 * primitive fields to avoid repeated parsing.
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class CaptureSearchResult extends SearchResult {

	// Parsed-value caches; -1 means "not yet parsed from the string map".
	private long cachedOffset = -1;
	private long cachedCompressedLength = -1;
	private long cachedDate = -1;

	public static final String CAPTURE_ORIGINAL_URL = "url";

	public static final String CAPTURE_ORIGINAL_HOST = "host";

	/**
	 * Result: canonicalized(lookup key) form of URL of captured document
	 */
	public static final String CAPTURE_URL_KEY = "urlkey";

	/**
	 * Result: 14-digit timestamp when document was captured
	 */
	public static final String CAPTURE_CAPTURE_TIMESTAMP = "capturedate";

	/**
	 * Result: basename of ARC/WARC file containing this document.
	 */
	public static final String CAPTURE_FILE = "file";

	/**
	 * Result: compressed byte offset within ARC/WARC file where this
	 * document's gzip envelope begins.
	 */
	public static final String CAPTURE_OFFSET = "compressedoffset";

	/**
	 * Result: compressed length in bytes of this document's gzip envelope
	 * within the ARC/WARC file. (The map key "compressedendoffset" is
	 * historical; the stored value is a length, as reflected by
	 * {@link #getCompressedLength()} / {@link #setCompressedLength(long)}.)
	 */
	public static final String CAPTURE_COMPRESSED_LENGTH = "compressedendoffset";

	/**
	 * Result: best-guess at mime-type of this document.
	 */
	public static final String CAPTURE_MIME_TYPE = "mimetype";

	/**
	 * Result: 3-digit integer HTTP response code. may be '0' in some
	 * fringe conditions, old ARCs, bug in crawler, etc.
	 */
	public static final String CAPTURE_HTTP_CODE = "httpresponsecode";

	/**
	 * Result: some form of document fingerprint. This should represent the
	 * HTTP payload only for HTTP captured resources. It may represent an MD5,
	 * a SHA1, and may be a fragment of the full representation of the digest.
	 */
	public static final String CAPTURE_DIGEST = "digest";

	/**
	 * Result: URL that this document redirected to, or '-' if it does
	 * not redirect
	 */
	public static final String CAPTURE_REDIRECT_URL = "redirecturl";

	/**
	 * Result: String flags which indicate robot instructions found in an HTML
	 * page. Currently one or more of:
	 * <li>"A" - noarchive</li>
	 * <li>"F" - nofollow</li>
	 * <li>"I" - noindex</li>
	 * @see "http://noarchive.net/"
	 */
	public static final String CAPTURE_ROBOT_FLAGS = "robotflags";

	public static final String CAPTURE_ROBOT_NOARCHIVE = "A";

	public static final String CAPTURE_ROBOT_NOFOLLOW = "F";

	public static final String CAPTURE_ROBOT_NOINDEX = "I";

	/**
	 * Result: flag within a SearchResult that indicates this is the closest to
	 * a particular requested date.
	 */
	public static final String CAPTURE_CLOSEST_INDICATOR = "closest";

	public static final String CAPTURE_CLOSEST_VALUE = "true";

	/**
	 * Result: this key being present indicates that this particular capture
	 * was not actually stored, and that other values within this SearchResult
	 * are actually values from a different record which *should* be identical
	 * to this capture, had it been stored.
	 */
	public static final String CAPTURE_DUPLICATE_ANNOTATION = "duplicate";

	/**
	 * Result: this key is present when the CAPTURE_DUPLICATE_ANNOTATION is
	 * also present, with the value indicating the last date that was actually
	 * stored for this duplicate.
	 */
	public static final String CAPTURE_DUPLICATE_STORED_TS = "duplicate-ts";

	/**
	 * flag indicates that this document was downloaded and verified as
	 * identical to a previous capture by digest.
	 */
	public static final String CAPTURE_DUPLICATE_DIGEST = "digest";

	/**
	 * flag indicates that this document was NOT downloaded, but that the
	 * origin server indicated that the document had not changed, based on
	 * If-Modified HTTP request headers.
	 */
	public static final String CAPTURE_DUPLICATE_HTTP = "http";

	/**
	 * @return the original URL which resulted in the capture. If it is not
	 * available, the urlKey and original Host will be used to reconstruct
	 * something possibly closer to the original URL than the urlKey
	 */
	public String getOriginalUrl() {
		String url = get(CAPTURE_ORIGINAL_URL);
		if (url == null) {
			// convert from ORIG_HOST to ORIG_URL here:
			url = getUrlKey();
			String host = get(CAPTURE_ORIGINAL_HOST);
			if (url != null && host != null) {
				StringBuilder sb = new StringBuilder(url.length());
				sb.append(UrlOperations.DEFAULT_SCHEME);
				sb.append(host);
				sb.append(UrlOperations.getURLPath(url));
				url = sb.toString();
				// cache the reconstructed value so subsequent calls are cheap
				setOriginalUrl(url);
			}
		}
		return url;
	}

	/**
	 * @param originalUrl as close to the original URL by which this Resource
	 * was captured as is possible
	 */
	public void setOriginalUrl(String originalUrl) {
		put(CAPTURE_ORIGINAL_URL, originalUrl);
	}

	/**
	 * @return the hostname of the captured URL; derived from
	 * {@link #getOriginalUrl()} if not stored explicitly.
	 */
	public String getOriginalHost() {
		String host = get(CAPTURE_ORIGINAL_HOST);
		if (host == null) {
			host = UrlOperations.urlToHost(getOriginalUrl());
		}
		return host;
	}

	public void setOriginalHost(String originalHost) {
		put(CAPTURE_ORIGINAL_HOST, originalHost);
	}

	public String getUrlKey() {
		return get(CAPTURE_URL_KEY);
	}

	public void setUrlKey(String urlKey) {
		put(CAPTURE_URL_KEY, urlKey);
	}

	/**
	 * @return the capture date, parsed (and cached) from the 14-digit
	 * capture timestamp. A defensive copy is returned on each call.
	 */
	public Date getCaptureDate() {
		if (cachedDate == -1) {
			cachedDate = tsToDate(getCaptureTimestamp()).getTime();
		}
		return new Date(cachedDate);
	}

	public void setCaptureDate(Date date) {
		cachedDate = date.getTime();
		put(CAPTURE_CAPTURE_TIMESTAMP, dateToTS(date));
	}

	public String getCaptureTimestamp() {
		return get(CAPTURE_CAPTURE_TIMESTAMP);
	}

	public void setCaptureTimestamp(String timestamp) {
		put(CAPTURE_CAPTURE_TIMESTAMP, timestamp);
	}

	public String getFile() {
		return get(CAPTURE_FILE);
	}

	public void setFile(String file) {
		put(CAPTURE_FILE, file);
	}

	/**
	 * @return compressed byte offset of this capture's record within the
	 * ARC/WARC file. NOTE(review): unlike {@link #getCompressedLength()},
	 * this throws NumberFormatException if the offset field is absent —
	 * callers appear to rely on the field always being set; confirm before
	 * adding a null guard.
	 */
	public long getOffset() {
		if (cachedOffset == -1) {
			cachedOffset = Long.parseLong(get(CAPTURE_OFFSET));
		}
		return cachedOffset;
	}

	public void setOffset(long offset) {
		cachedOffset = offset;
		put(CAPTURE_OFFSET, String.valueOf(offset));
	}

	/**
	 * @return compressed length in bytes of this capture's record, or -1 if
	 * the field is not present.
	 */
	public long getCompressedLength() {
		if (cachedCompressedLength == -1) {
			String tmp = get(CAPTURE_COMPRESSED_LENGTH);
			cachedCompressedLength = tmp == null ? -1 : Long.parseLong(tmp);
		}
		return cachedCompressedLength;
	}

	public void setCompressedLength(long offset) {
		cachedCompressedLength = offset;
		put(CAPTURE_COMPRESSED_LENGTH, String.valueOf(offset));
	}

	public String getMimeType() {
		return get(CAPTURE_MIME_TYPE);
	}

	public void setMimeType(String mimeType) {
		put(CAPTURE_MIME_TYPE, mimeType);
	}

	public String getHttpCode() {
		return get(CAPTURE_HTTP_CODE);
	}

	public void setHttpCode(String httpCode) {
		put(CAPTURE_HTTP_CODE, httpCode);
	}

	public String getDigest() {
		return get(CAPTURE_DIGEST);
	}

	public void setDigest(String digest) {
		put(CAPTURE_DIGEST, digest);
	}

	public String getRedirectUrl() {
		return get(CAPTURE_REDIRECT_URL);
	}

	public void setRedirectUrl(String url) {
		put(CAPTURE_REDIRECT_URL, url);
	}

	public boolean isClosest() {
		return getBoolean(CAPTURE_CLOSEST_INDICATOR);
	}

	public void setClosest(boolean value) {
		putBoolean(CAPTURE_CLOSEST_INDICATOR, value);
	}

	/**
	 * Mark this capture as a digest-verified duplicate of a capture stored
	 * on the given date.
	 */
	public void flagDuplicateDigest(Date storedDate) {
		put(CAPTURE_DUPLICATE_ANNOTATION, CAPTURE_DUPLICATE_DIGEST);
		put(CAPTURE_DUPLICATE_STORED_TS, dateToTS(storedDate));
	}

	/**
	 * Mark this capture as a digest-verified duplicate of a capture stored
	 * at the given 14-digit timestamp.
	 */
	public void flagDuplicateDigest(String storedTS) {
		put(CAPTURE_DUPLICATE_ANNOTATION, CAPTURE_DUPLICATE_DIGEST);
		put(CAPTURE_DUPLICATE_STORED_TS, storedTS);
	}

	public boolean isDuplicateDigest() {
		String dupeType = get(CAPTURE_DUPLICATE_ANNOTATION);
		return (dupeType != null && dupeType.equals(CAPTURE_DUPLICATE_DIGEST));
	}

	/**
	 * @return the stored date of the capture this one duplicates, or null if
	 * this capture is not flagged as a digest duplicate.
	 */
	public Date getDuplicateDigestStoredDate() {
		if (isDuplicateDigest()) {
			return tsToDate(get(CAPTURE_DUPLICATE_STORED_TS));
		}
		return null;
	}

	/**
	 * @return the stored 14-digit timestamp of the capture this one
	 * duplicates, or null if not flagged as a digest duplicate.
	 */
	public String getDuplicateDigestStoredTimestamp() {
		if (isDuplicateDigest()) {
			return get(CAPTURE_DUPLICATE_STORED_TS);
		}
		return null;
	}

	/**
	 * Mark this capture as an HTTP (If-Modified) duplicate of a capture
	 * stored on the given date.
	 */
	public void flagDuplicateHTTP(Date storedDate) {
		put(CAPTURE_DUPLICATE_ANNOTATION, CAPTURE_DUPLICATE_HTTP);
		put(CAPTURE_DUPLICATE_STORED_TS, dateToTS(storedDate));
	}

	/**
	 * Mark this capture as an HTTP (If-Modified) duplicate of a capture
	 * stored at the given 14-digit timestamp.
	 */
	public void flagDuplicateHTTP(String storedTS) {
		put(CAPTURE_DUPLICATE_ANNOTATION, CAPTURE_DUPLICATE_HTTP);
		put(CAPTURE_DUPLICATE_STORED_TS, storedTS);
	}

	public boolean isDuplicateHTTP() {
		String dupeType = get(CAPTURE_DUPLICATE_ANNOTATION);
		return (dupeType != null && dupeType.equals(CAPTURE_DUPLICATE_HTTP));
	}

	/**
	 * @return the stored date of the capture this one duplicates, or null if
	 * this capture is not flagged as an HTTP duplicate.
	 */
	public Date getDuplicateHTTPStoredDate() {
		if (isDuplicateHTTP()) {
			return tsToDate(get(CAPTURE_DUPLICATE_STORED_TS));
		}
		return null;
	}

	/**
	 * @return the stored 14-digit timestamp of the capture this one
	 * duplicates, or null if not flagged as an HTTP duplicate.
	 */
	public String getDuplicateHTTPStoredTimestamp() {
		if (isDuplicateHTTP()) {
			return get(CAPTURE_DUPLICATE_STORED_TS);
		}
		return null;
	}

	public String getRobotFlags() {
		return get(CAPTURE_ROBOT_FLAGS);
	}

	public void setRobotFlags(String robotFlags) {
		put(CAPTURE_ROBOT_FLAGS, robotFlags);
	}

	/**
	 * Add a single robot flag character to the flag set, if not already
	 * present.
	 */
	public void setRobotFlag(String flag) {
		String flags = get(CAPTURE_ROBOT_FLAGS);
		if (flags == null) {
			flags = "";
		}
		if (!flags.contains(flag)) {
			flags = flags + flag;
		}
		put(CAPTURE_ROBOT_FLAGS, flags);
	}

	public boolean isRobotFlagSet(String flag) {
		String flags = get(CAPTURE_ROBOT_FLAGS);
		if (flags == null) {
			return false;
		}
		return flags.contains(flag);
	}

	public boolean isRobotNoArchive() {
		return isRobotFlagSet(CAPTURE_ROBOT_NOARCHIVE);
	}

	public boolean isRobotNoIndex() {
		return isRobotFlagSet(CAPTURE_ROBOT_NOINDEX);
	}

	public boolean isRobotNoFollow() {
		return isRobotFlagSet(CAPTURE_ROBOT_NOFOLLOW);
	}

	public void setRobotNoArchive() {
		setRobotFlag(CAPTURE_ROBOT_NOARCHIVE);
	}

	public void setRobotNoIndex() {
		// BUGFIX: previously set the NOARCHIVE ("A") flag, so
		// setRobotNoIndex() followed by isRobotNoIndex() returned false.
		setRobotFlag(CAPTURE_ROBOT_NOINDEX);
	}

	public void setRobotNoFollow() {
		// BUGFIX: previously set the NOARCHIVE ("A") flag, so
		// setRobotNoFollow() followed by isRobotNoFollow() returned false.
		setRobotFlag(CAPTURE_ROBOT_NOFOLLOW);
	}
}