package org.archive.modules.recrawl.hbase;

import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_CONTENT_DIGEST_COUNT;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_DATE;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_ORIGINAL_URL;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_FILENAME;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_FILE_OFFSET;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_WARC_RECORD_ID;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.RetriesExhaustedWithDetailsException;
import org.apache.hadoop.hbase.regionserver.NoSuchColumnFamilyException;
import org.apache.hadoop.hbase.util.Bytes;
import org.archive.modules.CrawlURI;
import org.archive.modules.recrawl.AbstractContentDigestHistory;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.context.Lifecycle;

import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;

/**
 * HBase content digest history store. Must be a top-level bean in
 * crawler-beans.cxml in order to receive {@link Lifecycle} events.
 *
 * @see AbstractContentDigestHistory
 * @contributor nlevitt
 */
public class HBaseContentDigestHistory extends AbstractContentDigestHistory implements Lifecycle {
    private static final Logger logger =
            Logger.getLogger(HBaseContentDigestHistory.class.getName());

    protected static final byte[] COLUMN_FAMILY = Bytes.toBytes("f");
    protected static final byte[] COLUMN = Bytes.toBytes("c");

    /** Maps verbose history-map keys to the short keys used in the stored JSON. */
    protected static final BiMap<String,String> JSON_KEYS_MAP = HashBiMap.create();
    static {
        JSON_KEYS_MAP.put(A_CONTENT_DIGEST_COUNT, "c");
        JSON_KEYS_MAP.put(A_ORIGINAL_URL, "u");
        JSON_KEYS_MAP.put(A_WARC_RECORD_ID, "i");
        JSON_KEYS_MAP.put(A_WARC_FILENAME, "f");
        JSON_KEYS_MAP.put(A_WARC_FILE_OFFSET, "o");
        JSON_KEYS_MAP.put(A_ORIGINAL_DATE, "d");
    }
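    /*
     * For illustration, a sketch of what one stored row looks like under this
     * scheme, as written by createHbasePut() below: a single cell in column
     * family "f", qualifier "c", holding the history map serialized under the
     * short JSON keys above. The digest, URL, and WARC details here are
     * hypothetical:
     *
     *   row key:  sha1:22SFHXERHNFOEY6WK7YOUN4PFIPZSB4D
     *   f:c   ->  {"c":3,"u":"http://example.org/page",
     *              "i":"<urn:uuid:00000000-0000-0000-0000-000000000000>",
     *              "f":"example-00000.warc.gz","o":8192,"d":"20130101000000"}
     */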
    protected HBaseTable table;
    public void setTable(HBaseTable table) {
        this.table = table;
    }

    protected boolean addColumnFamily = false;
    public boolean getAddColumnFamily() {
        return addColumnFamily;
    }
    /**
     * Add the expected column family {@link #COLUMN_FAMILY} to the HBase
     * table if the table doesn't already have it.
     */
    public void setAddColumnFamily(boolean addColumnFamily) {
        this.addColumnFamily = addColumnFamily;
    }

    protected int retryIntervalMs = 10*1000;
    public int getRetryIntervalMs() {
        return retryIntervalMs;
    }
    public void setRetryIntervalMs(int retryIntervalMs) {
        this.retryIntervalMs = retryIntervalMs;
    }

    protected int maxTries = 1;
    public int getMaxTries() {
        return maxTries;
    }
    public void setMaxTries(int maxTries) {
        this.maxTries = maxTries;
    }

    protected String keySuffix = null;
    public String getKeySuffix() {
        return keySuffix;
    }
    /**
     * If not null, keySuffix is appended to the lookup key when loading and
     * storing digest history, so that the key looks like {digest}{keySuffix},
     * e.g. "sha1:22SFHXERHNFOEY6WK7YOUN4PFIPZSB4D-1193". The purpose is to
     * support multiple namespaces in a single hbase table, to avoid a
     * proliferation of small tables. A suffix is used rather than a prefix to
     * leave open the possibility of deduplication across these different
     * namespaces at some point in the future.
     *
     * @param keySuffix
     */
    public void setKeySuffix(String keySuffix) {
        this.keySuffix = keySuffix;
    }
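    /*
     * A minimal wiring sketch for crawler-beans.cxml (per the class comment,
     * this must be a top-level bean to receive Lifecycle events). The bean
     * ids and the referenced HBaseTable bean are assumptions for
     * illustration, not part of this class:
     *
     *   <bean id="contentDigestHistory"
     *         class="org.archive.modules.recrawl.hbase.HBaseContentDigestHistory">
     *     <property name="table" ref="digestHistoryTable"/>
     *     <property name="addColumnFamily" value="true"/>
     *     <property name="keySuffix" value="-1193"/>
     *   </bean>
     */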
    @Override
    protected String persistKeyFor(CrawlURI curi) {
        if (keySuffix != null) {
            return super.persistKeyFor(curi) + keySuffix;
        } else {
            return super.persistKeyFor(curi);
        }
    }

    protected synchronized void addColumnFamily() {
        try {
            HTableDescriptor oldDesc = table.getHtableDescriptor();
            if (oldDesc.getFamily(COLUMN_FAMILY) == null) {
                HTableDescriptor newDesc = new HTableDescriptor(oldDesc);
                newDesc.addFamily(new HColumnDescriptor(COLUMN_FAMILY));
                logger.info("table does not yet have expected column family, modifying descriptor to " + newDesc);
                HBaseAdmin hbaseAdmin = table.getHbase().admin();
                hbaseAdmin.disableTable(table.getName());
                hbaseAdmin.modifyTable(Bytes.toBytes(table.getName()), newDesc);
                hbaseAdmin.enableTable(table.getName());
            }
        } catch (IOException e) {
            logger.warning("problem adding column family: " + e);
        }
    }

    private boolean isRunning;

    @Override
    public void start() {
        // add the column family here, at startup, to avoid disabling the
        // table while another ToeThread is trying to use it
        if (getAddColumnFamily()) {
            addColumnFamily();
        }
        this.isRunning = true;
    }

    @Override
    public void stop() {
        this.isRunning = false;
    }

    @Override
    public boolean isRunning() {
        return isRunning;
    }

    @Override
    public void load(CrawlURI curi) {
        // make this call in all cases so that the value is initialized and
        // WARCWriterProcessor knows it should put the info in there
        HashMap<String, Object> contentDigestHistory = curi.getContentDigestHistory();

        byte[] key = Bytes.toBytes(persistKeyFor(curi));
        Result hbaseResult = tryHbaseGet(curi, new Get(key));

        if (hbaseResult != null) {
            Map<String, Object> loadedHistory = parseHbaseResult(curi, hbaseResult);
            if (loadedHistory != null) {
                if (logger.isLoggable(Level.FINER)) {
                    logger.finer("loaded history by digest " + persistKeyFor(curi)
                            + " for uri " + curi + " - " + loadedHistory);
                }
                contentDigestHistory.putAll(loadedHistory);
            }
        }
    }

    protected Result tryHbaseGet(CrawlURI curi, Get hbaseGet) {
        try {
            return table.get(hbaseGet);
        } catch (IOException e) {
            logger.warning("problem retrieving persist data from hbase, proceeding without, for digest "
                    + persistKeyFor(curi) + " uri " + curi + " - " + e);
            return null;
        }
    }

    protected Map<String, Object> parseHbaseResult(CrawlURI curi, Result hbaseResult) {
        HashMap<String, Object> loadedHistory = null;

        // no data for the uri is indicated by an empty Result
        if (!hbaseResult.isEmpty()) {
            byte[] jsonBytes = hbaseResult.getValue(COLUMN_FAMILY, COLUMN);
            if (jsonBytes != null) {
                JSONObject json = null;
                try {
                    json = new JSONObject(Bytes.toString(jsonBytes));
                    loadedHistory = new HashMap<String,Object>();
                    @SuppressWarnings("unchecked")
                    Iterator<String> keyIter = json.keys();
                    while (keyIter.hasNext()) {
                        String jsonKey = keyIter.next();
                        Object jsonValue = json.get(jsonKey);
                        String historyMapKey = JSON_KEYS_MAP.inverse().get(jsonKey);
                        if (historyMapKey == null) {
                            logger.warning("unknown key \"" + jsonKey
                                    + "\" found in hbase json for digest " + persistKeyFor(curi));
                            historyMapKey = jsonKey;
                        }
                        loadedHistory.put(historyMapKey, jsonValue);
                    }
                } catch (JSONException e) {
                    logger.warning("problem parsing json for digest " + persistKeyFor(curi)
                            + " uri " + curi + " - " + e);
                }
            } else {
                // shouldn't happen? an empty Result is the normal no-data case
                logger.fine("[jsonBytes==null] no persist data for digest "
                        + persistKeyFor(curi) + " uri " + curi);
            }
        } else {
            logger.finest("[result.isEmpty()] no persist data for digest "
                    + persistKeyFor(curi) + " uri " + curi);
        }

        return loadedHistory;
    }

    @Override
    public void store(CrawlURI curi) {
        if (!curi.hasContentDigestHistory()
                || curi.getContentDigestHistory().isEmpty()) {
            return;
        }

        if (logger.isLoggable(Level.FINER)) {
            logger.finer("storing history by digest " + persistKeyFor(curi)
                    + " for uri " + curi + " - " + curi.getContentDigestHistory());
        }

        Put hbasePut = createHbasePut(curi);
        tryHbasePut(curi, hbasePut);
    }

    protected void tryHbasePut(CrawlURI curi, Put p) {
        int tryCount = 0;
        do {
            tryCount++;
            try {
                table.put(p);
                return;
            } catch (RetriesExhaustedWithDetailsException e) {
                if (e.getCause(0) instanceof NoSuchColumnFamilyException
                        && getAddColumnFamily()) {
                    // add the missing column family and retry immediately,
                    // without counting this attempt against maxTries
                    addColumnFamily();
                    tryCount--;
                } else {
                    logger.warning("put failed (try " + tryCount + " of "
                            + getMaxTries() + ") for " + curi + " - " + e);
                }
            } catch (IOException e) {
                logger.warning("put failed (try " + tryCount + " of "
                        + getMaxTries() + ") for " + curi + " - " + e);
            } catch (NullPointerException e) {
                // HTable.put() can throw NullPointerException when the
                // connection has been lost
                logger.warning("put failed (try " + tryCount + " of "
                        + getMaxTries() + ") for " + curi + " - " + e);
            }

            if (tryCount > 0 && tryCount < getMaxTries() && isRunning()) {
                try {
                    Thread.sleep(getRetryIntervalMs());
                } catch (InterruptedException ex) {
                    logger.warning("thread interrupted, aborting retry for " + curi);
                    return;
                }
            }
        } while (tryCount < getMaxTries() && isRunning());

        if (isRunning()) {
            logger.warning("giving up after " + tryCount + " tries on put for " + curi);
        }
    }

    protected Put createHbasePut(CrawlURI curi) {
        byte[] key = Bytes.toBytes(persistKeyFor(curi));
        Put hbasePut = new Put(key);
        try {
            JSONObject json = new JSONObject();
            for (Entry<String, Object> entry: curi.getContentDigestHistory().entrySet()) {
                String jsonKey = JSON_KEYS_MAP.get(entry.getKey());
                if (jsonKey == null) {
                    logger.warning("unknown key \"" + entry.getKey()
                            + "\" found in content digest history map for " + curi);
                    jsonKey = entry.getKey();
                }
                json.put(jsonKey, entry.getValue());
            }
            hbasePut.add(COLUMN_FAMILY, COLUMN, Bytes.toBytes(json.toString()));
        } catch (JSONException e) {
            // should not happen - all values are either primitive or String
            logger.log(Level.SEVERE, "problem creating json object for digest "
                    + persistKeyFor(curi) + " uri " + curi, e);
        }
        return hbasePut;
    }
}
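/*
 * A rough sketch of the calling sequence at crawl time, assuming this bean is
 * referenced from Heritrix's stock ContentDigestHistoryLoader and
 * ContentDigestHistoryStorer processors (the exact chain placement is
 * configuration-dependent and not defined by this class):
 *
 *   history.load(curi);   // once curi's digest is known: populates
 *                         // curi.getContentDigestHistory() from HBase
 *   ...                   // dedup logic consults and updates the history map
 *   history.store(curi);  // persists the updated map back to HBase
 */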