package org.archive.modules.recrawl.hbase;
import java.util.Map;
import java.util.logging.Logger;
import org.apache.hadoop.hbase.util.Bytes;
import org.archive.modules.CrawlURI;
import org.archive.modules.canonicalize.CanonicalizationRule;
import org.archive.modules.recrawl.FetchHistoryHelper;
import org.archive.modules.recrawl.FetchHistoryProcessor;
import org.archive.modules.recrawl.PersistProcessor;
/**
 * Skeleton for {@link RecrawlDataSchema} implementations, gathering the
 * configuration and helpers that concrete schemas have in common:
 * <ul>
 * <li>a single configurable column family name</li>
 * <li>row key construction for a {@link CrawlURI}</li>
 * <li>preparation of the fetch-history entry via {@link FetchHistoryHelper}</li>
 * </ul>
 * @contributor kenji
 */
public abstract class RecrawlDataSchemaBase implements RecrawlDataSchema {
    private static final Logger logger = Logger.getLogger(RecrawlDataSchemaBase.class.getName());

    /**
     * default value for {@link #columnFamily}.
     */
    public static final byte[] DEFAULT_COLUMN_FAMILY = Bytes.toBytes("f");

    /** column family name, kept as bytes. */
    protected byte[] columnFamily = DEFAULT_COLUMN_FAMILY;

    /** byte form of the column name {@code "z"}. */
    public static final byte[] COLUMN_NOCRAWL = Bytes.toBytes("z");

    /**
     * default value for {@link #useCanonicalString}.
     * NOTE(review): declared non-final, so it can be reassigned at runtime.
     */
    public static boolean DEFAULT_USE_CANONICAL_STRING = true;

    /** selects which string {@link #rowKeyForURI(CrawlURI)} turns into the row key. */
    private boolean useCanonicalString = DEFAULT_USE_CANONICAL_STRING;

    /** optional alternative canonicalization rule; stored but not consulted by this class. */
    private CanonicalizationRule keyRule = null;

    /** history length handed to {@link FetchHistoryHelper}; defaults to 2. */
    protected int historyLength = 2;

    /**
     * creates a schema base with all settings at their default values.
     */
    public RecrawlDataSchemaBase() {
        // nothing to do beyond the field initializers above.
    }

    /**
     * sets the column family name.
     * @param colf column family name; stored in its byte form.
     */
    public void setColumnFamily(String colf) {
        this.columnFamily = Bytes.toBytes(colf);
    }

    /**
     * @return whether the canonicalized string is used as row key.
     */
    public boolean isUseCanonicalString() {
        return this.useCanonicalString;
    }

    /**
     * chooses the source of the row key: when {@code true}, the key produced by
     * {@link PersistProcessor#persistKeyFor(CrawlURI)} is used; when {@code false},
     * the string form of the {@link CrawlURI} is used instead.
     * @param useCanonicalString true to use the canonicalized string as row key.
     */
    public void setUseCanonicalString(boolean useCanonicalString) {
        this.useCanonicalString = useCanonicalString;
    }

    /**
     * @return column family name decoded back to a String.
     */
    public String getColumnFamily() {
        return Bytes.toString(this.columnFamily);
    }

    /**
     * @return the configured key rule, or null if none has been set.
     */
    public CanonicalizationRule getKeyRule() {
        return this.keyRule;
    }

    /**
     * alternative canonicalization rule for generating row key from URI.
     * TODO: currently unused.
     * @param keyRule rule to remember.
     */
    public void setKeyRule(CanonicalizationRule keyRule) {
        this.keyRule = keyRule;
    }

    /**
     * @return the configured history length.
     */
    public int getHistoryLength() {
        return this.historyLength;
    }

    /**
     * maximum number of crawl history entries to retain in {@link CrawlURI}.
     * when {@link #getFetchHistory(CrawlURI, long)} would add an entry beyond
     * this number, the oldest entry will be discarded.
     * {@code historyLength} should be the same number as
     * {@link FetchHistoryProcessor#setHistoryLength(int)}, or FetchHistoryProcessor
     * will reallocate the crawl history array.
     * @param historyLength number of history entries to keep.
     * @see FetchHistoryProcessor#setHistoryLength(int)
     */
    public void setHistoryLength(int historyLength) {
        this.historyLength = historyLength;
    }

    /**
     * delegates to {@link FetchHistoryHelper#getFetchHistory(CrawlURI, long, int)},
     * supplying {@link #historyLength} as the third argument.
     * @param uri CrawlURI from which fetch history is obtained.
     * @param timestamp value forwarded unchanged to the helper.
     * @return Map object for storing re-crawl data (never null).
     * @see FetchHistoryHelper#getFetchHistory(CrawlURI, long, int)
     * @see FetchHistoryProcessor
     */
    protected Map<String, Object> getFetchHistory(CrawlURI uri, long timestamp) {
        final Map<String, Object> history =
                FetchHistoryHelper.getFetchHistory(uri, timestamp, this.historyLength);
        return history;
    }

    /**
     * return row key for {@code curi}.
     * TODO: move this to HBasePersistProcessor by redesigning {@link RecrawlDataSchema}.
     * @param curi {@link CrawlURI} for which a row is being fetched.
     * @return row key
     */
    public byte[] rowKeyForURI(CrawlURI curi) {
        // TODO: use keyRule if specified (applies to the canonical-string case).
        final String keyText = this.useCanonicalString
                ? PersistProcessor.persistKeyFor(curi)
                : curi.toString();
        return Bytes.toBytes(keyText);
    }
}