/*
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Licensed to the Internet Archive (IA) by one or more individual
 * contributors.
 *
 * The IA licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.archive.modules.recrawl.hbase;

import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.archive.modules.CrawlURI;
import org.archive.modules.fetcher.FetchStatusCodes;
import org.archive.modules.recrawl.FetchHistoryHelper;
import org.archive.modules.recrawl.RecrawlAttributeConstants;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * {@linkplain SingleColumnJsonRecrawlDataSchema} stores all re-crawl data
 * properties in a single column, in JSON format. As HBase stores each column
 * paired with the row key, keeping each re-crawl data property in its own
 * column takes substantially more space.
 * <ul>
 * <li>{@code r}: re-crawl data in JSON format</li>
 * <li>{@code z}: do-not-crawl flag - the loader discards the URL if this
 * column has a non-empty value.</li>
 * </ul>
 * @contributor Kenji Nagahashi
 */
public class SingleColumnJsonRecrawlDataSchema extends RecrawlDataSchemaBase implements RecrawlDataSchema {
    static final Logger logger = Logger.getLogger(SingleColumnJsonRecrawlDataSchema.class.getName());

    public static final byte[] DEFAULT_COLUMN = Bytes.toBytes("r");

    // JSON property names for re-crawl data properties
    public static final String PROPERTY_STATUS = "s";
    public static final String PROPERTY_CONTENT_DIGEST = "d";
    public static final String PROPERTY_ETAG = "e";
    public static final String PROPERTY_LAST_MODIFIED = "m";

    // SHA1 scheme is assumed.
    public static final String CONTENT_DIGEST_SCHEME = "sha1:";

    // single column for storing JSON of re-crawl data
    protected byte[] column = DEFAULT_COLUMN;

    public void setColumn(String column) {
        this.column = Bytes.toBytes(column);
    }
    public String getColumn() {
        return Bytes.toString(column);
    }

    /* (non-Javadoc)
     * @see org.archive.modules.hq.recrawl.RecrawlDataSchema#createPut(org.archive.modules.CrawlURI)
     */
    public Put createPut(CrawlURI uri) {
        byte[] key = rowKeyForURI(uri);
        Put p = new Put(key);
        JSONObject jo = new JSONObject();
        try {
            // TODO should we post a warning message when scheme != "sha1"?
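            // getContentDigestString() yields the digest value without its
            // scheme prefix; load() below re-attaches CONTENT_DIGEST_SCHEME
            // ("sha1:") unconditionally, so a digest computed under any other
            // scheme would be mislabeled when read back.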
            String digest = uri.getContentDigestString();
            if (digest != null) {
                jo.put(PROPERTY_CONTENT_DIGEST, digest);
            }
            jo.put(PROPERTY_STATUS, uri.getFetchStatus());
            if (uri.isHttpTransaction()) {
                String etag = uri.getHttpResponseHeader(RecrawlAttributeConstants.A_ETAG_HEADER);
                if (etag != null) {
                    // Etag is usually quoted
                    if (etag.length() >= 2 && etag.charAt(0) == '"' && etag.charAt(etag.length() - 1) == '"')
                        etag = etag.substring(1, etag.length() - 1);
                    jo.put(PROPERTY_ETAG, etag);
                }
                String lastmod = uri.getHttpResponseHeader(RecrawlAttributeConstants.A_LAST_MODIFIED_HEADER);
                if (lastmod != null) {
                    long lastmod_sec = FetchHistoryHelper.parseHttpDate(lastmod);
                    if (lastmod_sec == 0) {
                        try {
                            lastmod_sec = uri.getFetchCompletedTime();
                        } catch (NullPointerException ex) {
                            logger.warning("CrawlURI.getFetchCompletedTime():" + ex + " for " + uri.shortReportLine());
                        }
                    }
                    // the computed value was previously dropped; store it so
                    // load() can hand it back as A_LAST_MODIFIED_HEADER.
                    if (lastmod_sec != 0) {
                        jo.put(PROPERTY_LAST_MODIFIED, lastmod_sec);
                    }
                } else {
                    try {
                        long completed = uri.getFetchCompletedTime();
                        if (completed != 0)
                            jo.put(PROPERTY_LAST_MODIFIED, completed);
                    } catch (NullPointerException ex) {
                        logger.warning("CrawlURI.getFetchCompletedTime():" + ex + " for " + uri.shortReportLine());
                    }
                }
            }
        } catch (JSONException ex) {
            // should not happen - all values are either primitive or String.
            logger.log(Level.SEVERE, "JSON translation failed", ex);
        }
        p.add(columnFamily, column, Bytes.toBytes(jo.toString()));
        return p;
    }

    /* (non-Javadoc)
     * @see org.archive.modules.hq.recrawl.RecrawlDataSchema#load(org.apache.hadoop.hbase.client.Result)
     */
    public void load(Result result, CrawlURI curi) {
        // check for the "do-not-crawl" flag - any non-empty data tells us not
        // to crawl this URL.
        byte[] nocrawl = result.getValue(columnFamily, COLUMN_NOCRAWL);
        if (nocrawl != null && nocrawl.length > 0) {
            // fetch status is set to S_DEEMED_CHAFF, because this do-not-crawl
            // flag is primarily intended for preventing the crawler from
            // stepping on traps.
            curi.setFetchStatus(FetchStatusCodes.S_DEEMED_CHAFF);
            curi.getAnnotations().add("nocrawl");
            return;
        }
        KeyValue rkv = result.getColumnLatest(columnFamily, column);
        if (rkv == null) {
            // no re-crawl data column in this row.
            return;
        }
        long timestamp = rkv.getTimestamp();
        Map<String, Object> history = FetchHistoryHelper.getFetchHistory(curi, timestamp, historyLength);
        if (history == null) {
            // crawl history array is fully occupied by crawl history entries
            // newer than timestamp.
            return;
        }
        byte[] jsonBytes = rkv.getValue();
        if (jsonBytes != null) {
            JSONObject jo = null;
            try {
                jo = new JSONObject(Bytes.toString(jsonBytes));
            } catch (JSONException ex) {
                logger.warning(String.format("JSON parsing failed for key %1$s: %2$s",
                        Bytes.toString(result.getRow()), ex.getMessage()));
            }
            if (jo != null) {
                int status = jo.optInt(PROPERTY_STATUS, -1);
                if (status >= 0) {
                    history.put(RecrawlAttributeConstants.A_STATUS, status);
                }
                // use the two-argument optString(): the one-argument form
                // returns "" for a missing key, so a != null check alone
                // would always pass.
                String digest = jo.optString(PROPERTY_CONTENT_DIGEST, null);
                if (digest != null) {
                    history.put(RecrawlAttributeConstants.A_CONTENT_DIGEST, CONTENT_DIGEST_SCHEME + digest);
                }
                String etag = jo.optString(PROPERTY_ETAG, null);
                if (etag != null) {
                    history.put(RecrawlAttributeConstants.A_ETAG_HEADER, etag);
                }
                long lastmod = jo.optLong(PROPERTY_LAST_MODIFIED);
                if (lastmod > 0) {
                    history.put(RecrawlAttributeConstants.A_LAST_MODIFIED_HEADER, FetchHistoryHelper.formatHttpDate(lastmod));
                }
            }
        }
    }
}
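// Illustration of the on-disk layout (hypothetical values, not taken from the
// source): with the defaults above, a successfully fetched URL occupies a
// single cell in column "r" of the configured column family, holding JSON like
//
//   {"s":200,"d":"SHA1DIGESTINBASE32XXXXXXXXXXXXXX","e":"686897696a7c876b7e","m":1393419597000}
//
// where "s" is the fetch status, "d" the content digest (scheme stripped on
// store, "sha1:" re-attached on load), "e" the ETag with surrounding quotes
// removed, and "m" the last-modified timestamp as produced by
// FetchHistoryHelper.parseHttpDate() or CrawlURI.getFetchCompletedTime(). An
// additional non-empty cell in column "z" marks the row do-not-crawl.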