/*
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Licensed to the Internet Archive (IA) by one or more individual
 * contributors.
 *
 * The IA licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.archive.modules.recrawl.hbase;

import java.util.Map;
import java.util.logging.Logger;

import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.archive.modules.CrawlURI;
import org.archive.modules.fetcher.FetchStatusCodes;
import org.archive.modules.recrawl.FetchHistoryHelper;
import org.archive.modules.recrawl.RecrawlAttributeConstants;

/**
 * RecrawlDataSchema that stores each recrawl data property in a separate column
 * of a single column family, whose name may be configured with
 * {@link #setColumnFamily(String)} (default "f").
 * <ul>
 * <li>{@code s}: fetch status (as integer text)</li>
 * <li>{@code d}: content digest (with {@code sha1:} prefix, Base32 text)</li>
 * <li>{@code e}: ETag (enclosing quotes stripped)</li>
 * <li>{@code m}: last-modified date-time (as integer timestamp, binary format)</li>
 * <li>{@code z}: do-not-crawl flag - loader discards URL if this column has non-empty value.</li>
 * </ul>
 *
 * @contributor kenji
 */
public class MultiColumnRecrawlDataSchema extends RecrawlDataSchemaBase
        implements RecrawlDataSchema, RecrawlAttributeConstants {

    static final Logger logger =
            Logger.getLogger(MultiColumnRecrawlDataSchema.class.getName());

    public static final byte[] COLUMN_STATUS = Bytes.toBytes("s");
    public static final byte[] COLUMN_CONTENT_DIGEST = Bytes.toBytes("d");
    public static final byte[] COLUMN_ETAG = Bytes.toBytes("e");
    public static final byte[] COLUMN_LAST_MODIFIED = Bytes.toBytes("m");

    /**
     * Returns {@code uri}'s fetch-completed time, or 0 when it is unavailable.
     * {@code CrawlURI.getFetchCompletedTime()} is known to throw
     * {@link NullPointerException} in some states; this helper logs that case
     * and falls back to 0 so callers can simply test for a usable timestamp.
     *
     * @param uri the CrawlURI to query
     * @return fetch-completed time in the unit returned by
     *         {@code getFetchCompletedTime()}, or 0 if unavailable
     */
    private long fetchCompletedTimeOrZero(CrawlURI uri) {
        try {
            return uri.getFetchCompletedTime();
        } catch (NullPointerException ex) {
            logger.warning("CrawlURI.getFetchCompletedTime():" + ex + " for "
                    + uri.shortReportLine());
            return 0;
        }
    }

    /**
     * Builds an HBase {@link Put} recording {@code uri}'s fetch result:
     * status, content digest, and (for HTTP transactions) ETag and a
     * last-modified timestamp, each in its own column of
     * {@code columnFamily}.
     *
     * @param uri the fetched CrawlURI to persist
     * @return Put keyed by {@link #rowKeyForURI(CrawlURI)}
     */
    /* (non-Javadoc)
     * @see org.archive.modules.hq.recrawl.RecrawlDataSchema#createPut()
     */
    public Put createPut(CrawlURI uri) {
        byte[] key = rowKeyForURI(uri);
        Put p = new Put(key);

        String digest = uri.getContentDigestSchemeString();
        if (digest != null) {
            p.add(columnFamily, COLUMN_CONTENT_DIGEST, Bytes.toBytes(digest));
        }
        // status is stored as integer text, not binary.
        p.add(columnFamily, COLUMN_STATUS,
                Bytes.toBytes(Integer.toString(uri.getFetchStatus())));

        if (uri.isHttpTransaction()) {
            String etag = uri.getHttpResponseHeader(
                    RecrawlAttributeConstants.A_ETAG_HEADER);
            if (etag != null) {
                // ETag is usually quoted; strip the enclosing quotes.
                if (etag.length() >= 2 && etag.charAt(0) == '"'
                        && etag.charAt(etag.length() - 1) == '"') {
                    etag = etag.substring(1, etag.length() - 1);
                }
                p.add(columnFamily, COLUMN_ETAG, Bytes.toBytes(etag));
            }

            String lastmod = uri.getHttpResponseHeader(
                    RecrawlAttributeConstants.A_LAST_MODIFIED_HEADER);
            if (lastmod != null) {
                long lastmod_sec = FetchHistoryHelper.parseHttpDate(lastmod);
                if (lastmod_sec == 0) {
                    // unparsable Last-Modified header; fall back to the
                    // fetch-completed time.
                    lastmod_sec = fetchCompletedTimeOrZero(uri);
                }
                if (lastmod_sec != 0) {
                    p.add(columnFamily, COLUMN_LAST_MODIFIED,
                            Bytes.toBytes(lastmod_sec));
                }
            } else {
                // no Last-Modified header at all; record fetch-completed time.
                long completed = fetchCompletedTimeOrZero(uri);
                if (completed != 0) {
                    p.add(columnFamily, COLUMN_LAST_MODIFIED,
                            Bytes.toBytes(completed));
                }
            }
        }
        return p;
    }

    /**
     * Populates {@code curi}'s fetch history from a previously stored HBase
     * {@code result}. Honors the do-not-crawl flag column first; otherwise
     * copies status, ETag, last-modified and content digest into the history
     * map obtained via {@link FetchHistoryHelper#getFetchHistory}.
     *
     * @param result row previously written by {@link #createPut(CrawlURI)}
     * @param curi the CrawlURI being prepared for (re)crawl
     */
    /* (non-Javadoc)
     * @see org.archive.modules.hq.recrawl.RecrawlDataSchema#load(java.util.Map, org.apache.hadoop.hbase.client.Result)
     */
    public void load(Result result, CrawlURI curi) {
        // check for "do-not-crawl" flag - any non-empty data tells not to crawl this
        // URL.
        byte[] nocrawl = result.getValue(columnFamily, COLUMN_NOCRAWL);
        if (nocrawl != null && nocrawl.length > 0) {
            // fetch status set to S_DEEMED_CHAFF, because this do-not-crawl flag
            // is primarily intended for preventing crawler from stepping on traps.
            curi.setFetchStatus(FetchStatusCodes.S_DEEMED_CHAFF);
            curi.getAnnotations().add("nocrawl");
            return;
        }
        // all columns should have identical timestamp; use the status column's.
        KeyValue rkv = result.getColumnLatest(columnFamily, COLUMN_STATUS);
        if (rkv == null) {
            // no status column means no usable recrawl data for this URI
            // (previously this caused an unguarded NullPointerException).
            return;
        }
        long timestamp = rkv.getTimestamp();
        Map<String, Object> history =
                FetchHistoryHelper.getFetchHistory(curi, timestamp, historyLength);

        // FetchHTTP ignores history with status <= 0
        byte[] status = result.getValue(columnFamily, COLUMN_STATUS);
        if (status != null) {
            // Note that status is stored as integer text. It's typically three-chars
            // that is less than 4-byte integer bits.
            try {
                history.put(RecrawlAttributeConstants.A_STATUS,
                        Integer.parseInt(Bytes.toString(status)));
            } catch (NumberFormatException ex) {
                // corrupt status cell; skip this row rather than abort the load.
                logger.warning("unparsable status value for "
                        + curi.shortReportLine() + ": " + ex);
                return;
            }
            byte[] etag = result.getValue(columnFamily, COLUMN_ETAG);
            if (etag != null) {
                history.put(RecrawlAttributeConstants.A_ETAG_HEADER,
                        Bytes.toString(etag));
            }
            byte[] lastmod = result.getValue(columnFamily, COLUMN_LAST_MODIFIED);
            if (lastmod != null) {
                long lastmod_sec = Bytes.toLong(lastmod);
                history.put(RecrawlAttributeConstants.A_LAST_MODIFIED_HEADER,
                        FetchHistoryHelper.formatHttpDate(lastmod_sec));
            }
            byte[] digest = result.getValue(columnFamily, COLUMN_CONTENT_DIGEST);
            if (digest != null) {
                history.put(RecrawlAttributeConstants.A_CONTENT_DIGEST,
                        Bytes.toString(digest));
            }
        }
    }
}