/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.modules.recrawl; /** * * @author pjack * */ public interface RecrawlAttributeConstants { /* Duplication-reduction / recrawl / history constants */ /** fetch history array */ public static final String A_FETCH_HISTORY = "fetch-history"; /** content digest */ public static final String A_CONTENT_DIGEST = "content-digest"; /** header name (and AList key) for last-modified timestamp */ public static final String A_LAST_MODIFIED_HEADER = "last-modified"; /** header name (and AList key) for ETag */ public static final String A_ETAG_HEADER = "etag"; /** key for status (when in history) */ public static final String A_STATUS = "status"; /** reference length (content length or virtual length */ public static final String A_REFERENCE_LENGTH = "reference-length"; // constants for uri-agnostic content digest based dedupe /** content digest history map */ public static final String A_CONTENT_DIGEST_HISTORY = "content-digest-history"; /** url that the content payload was written for */ public static final String A_ORIGINAL_URL = "original-url"; /** warc record id of warc record with the content payload */ public static final String A_WARC_RECORD_ID = "warc-record-id"; /** warc filename containing the content payload */ public static final String A_WARC_FILENAME = "warc-filename"; /** offset into warc file of warc record with content payload */ public static final String A_WARC_FILE_OFFSET = "warc-file-offset"; /** date content payload was written */ public static final String A_ORIGINAL_DATE = "content-written-date"; /** number of times we've seen this content digest (1 original + n duplicates) */ public static final String A_CONTENT_DIGEST_COUNT = "content-digest-count"; /** * Writer processors of all types are encouraged to put a 'writeTag' * (analogous to HTTP 'etag') in the CrawlURI state. Its contents are * opaque/private-to-the-writer, but might generally be a * WARC-name/offset/UUID/etc, and their mere presence means content is * written somewhere. A writer processor that decides not to write fresh * content at all, not even a revisit record, because it sees previous * sufficient writeTag in history, will usually copy that forward to latest * history record. {@link PersistLogProcessor}/{@link PersistStoreProcessor} * have an option {@link PersistProcessor#onlyStoreIfWriteTagPresent}, which * defaults to true. */ public static final String A_WRITE_TAG = "write-tag"; }