RecrawlDataSchemaBase.java example

Explorer
heritrix3-master
package org.archive.modules.recrawl.hbase;

import java.util.Map;
import java.util.logging.Logger;

import org.apache.hadoop.hbase.util.Bytes;
import org.archive.modules.CrawlURI;
import org.archive.modules.canonicalize.CanonicalizationRule;
import org.archive.modules.recrawl.FetchHistoryHelper;
import org.archive.modules.recrawl.FetchHistoryProcessor;
import org.archive.modules.recrawl.PersistProcessor;

/**
 * implements common utility methods for implementing {@link RecrawlDataSchema}.
 * <ul>
 * <li>configuring single column family name</li>
 * <li>formatting/parsing HTTP date text</li>
 * <li>constructing row key</li>
 * <li>preparing fetch-history array</li>
 * </ul>
 * @contributor kenji
 */
abstract public class RecrawlDataSchemaBase implements RecrawlDataSchema {
    private static final Logger logger = Logger.getLogger(RecrawlDataSchemaBase.class.getName());

    /**
     * default value for {@link #columnFamily}.
     */
    public static final byte[] DEFAULT_COLUMN_FAMILY = Bytes.toBytes("f");
    protected byte[] columnFamily = DEFAULT_COLUMN_FAMILY;

    public static final byte[] COLUMN_NOCRAWL = Bytes.toBytes("z");
    
    /**
     * default value for {@link #useCanonicalString}.
     */
    public static boolean DEFAULT_USE_CANONICAL_STRING = true;

    private boolean useCanonicalString = DEFAULT_USE_CANONICAL_STRING;
    private CanonicalizationRule keyRule = null;

    protected int historyLength = 2;

    public RecrawlDataSchemaBase() {
        super();
    }

    public void setColumnFamily(String colf) {
        columnFamily = Bytes.toBytes(colf);
    }

    public boolean isUseCanonicalString() {
        return useCanonicalString;
    }
    /**
     * if set to true, canonicalized string will be used as row key, rather than URI
     * @param useCanonicalString
     */
    public void setUseCanonicalString(boolean useCanonicalString) {
        this.useCanonicalString = useCanonicalString;
    }

    public String getColumnFamily() {
        return Bytes.toString(columnFamily);
    }


    public CanonicalizationRule getKeyRule() {
        return keyRule;
    }
    /**
     * alternative canonicalization rule for generating row key from URI.
     * TODO: currently unused.
     * @param keyRule
     */
    public void setKeyRule(CanonicalizationRule keyRule) {
        this.keyRule = keyRule;
    }

    public int getHistoryLength() {
        return historyLength;
    }

    /**
     * maximum number of crawl history entries to retain in {@link CrawlURI}.
     * when more than this number of crawl history entry is being added by
     * {@link #getFetchHistory(CrawlURI, long)}, oldest entry will be discarded.
     * {@code historyLength} should be the same number as
     * {@link FetchHistoryProcessor#setHistoryLength(int)}, or FetchHistoryProcessor will
     * reallocate the crawl history array.
     * @param historyLength
     * @see FetchHistoryProcessor#setHistoryLength(int)
     */
    public void setHistoryLength(int historyLength) {
        this.historyLength = historyLength;
    }

    /**
     * calls {@link FetchHistoryHelper#getFetchHistory(CrawlURI, long, int)} with {@link #historyLength}.
     * @param uri CrawlURI from which fetch history is obtained.
     * @return Map object for storing re-crawl data (never null).
     * @see FetchHistoryHelper#getFetchHistory(CrawlURI, long, int)
     * @see FetchHistoryProcessor
     */
    protected Map<String, Object> getFetchHistory(CrawlURI uri, long timestamp) {
        return FetchHistoryHelper.getFetchHistory(uri, timestamp, historyLength);
    }

    /**
     * return row key for {@code curi}.
     * TODO: move this to HBasePersistProcessor by redesigning {@link RecrawlDataSchema}.
     * @param curi {@link CrawlURI} for which a row is being fetched.
     * @return row key
     */
    public byte[] rowKeyForURI(CrawlURI curi) {
        if (useCanonicalString) {
            // TODO: use keyRule if specified.
            return Bytes.toBytes(PersistProcessor.persistKeyFor(curi));
        } else {
            return Bytes.toBytes(curi.toString());
        }
    }
}