/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.recrawl.hbase;

import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;

import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.recrawl.FetchHistoryProcessor;
/**
 * A {@link Processor} that retrieves recrawl info for each URI from an HBase
 * table. See {@link HBasePersistProcessor} for the table schema.
 * As with other fetch-history loaders, this processor must be combined with a
 * {@link FetchHistoryProcessor} (set up after FetchHTTP, before WarcWriter)
 * to take effect.
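 * <p>
 * A minimal crawler-beans wiring sketch. The bean ids, the {@code table}
 * property, and the chain order shown are illustrative assumptions, not a
 * shipped configuration:
 * <pre>{@code
 * <bean id="persistLoad"
 *       class="org.archive.modules.recrawl.hbase.HBasePersistLoadProcessor">
 *   <!-- hypothetical reference to a configured HBase table bean -->
 *   <property name="table" ref="crawlHistoryTable"/>
 * </bean>
 * <bean id="fetchHistory" class="org.archive.modules.recrawl.FetchHistoryProcessor"/>
 * <!-- fetch chain order: ... persistLoad, fetchHttp, ..., fetchHistory, warcWriter ... -->
 * }</pre>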
* @see HBasePersistStoreProcessor
* @contributor kenji
*/
public class HBasePersistLoadProcessor extends HBasePersistProcessor {

    private static final Logger logger =
            Logger.getLogger(HBasePersistLoadProcessor.class.getName());
    @Override
    protected ProcessResult innerProcessResult(CrawlURI uri) throws InterruptedException {
        byte[] key = rowKeyForURI(uri);
        Get g = new Get(key);
        try {
            Result r = table.get(g);
            // absence of data for this URI is signaled by an empty Result, not null
            if (r.isEmpty()) {
                if (logger.isLoggable(Level.FINE)) {
                    logger.fine(uri + ": <no crawlinfo>");
                }
                return ProcessResult.PROCEED;
            }
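            // Populate the CrawlURI's recrawl state (e.g. its fetch history)
            // from the stored row; exactly what gets restored is up to the
            // configured schema implementation.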
            schema.load(r, uri);
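            // The loaded info may mark this URI as finished (a negative fetch
            // status); FINISH ends processing of the URI here rather than
            // letting it continue down the fetch chain.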
            if (uri.getFetchStatus() < 0) {
                return ProcessResult.FINISH;
            }
        } catch (IOException e) {
            logger.log(Level.WARNING, "problem retrieving persist data from HBase for "
                    + uri + ", proceeding without it", e);
        } catch (Exception ex) {
            // HTable.get() can throw RuntimeException on ZooKeeper connection
            // failures. No crawl-history load failure should make the fetch of
            // the URL itself fail.
            logger.log(Level.WARNING, "Get failed for " + uri, ex);
        }
        return ProcessResult.PROCEED;
    }
    /**
     * Unused; all of the work happens in {@link #innerProcessResult(CrawlURI)}.
     */
    @Override
    protected void innerProcess(CrawlURI uri) throws InterruptedException {
    }
    @Override
    protected boolean shouldProcess(CrawlURI uri) {
        // TODO: we want to deduplicate robots.txt, too.
        //if (uri.isPrerequisite()) return false;
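        // Only schemes we actually fetch can have stored recrawl data.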
        String scheme = uri.getUURI().getScheme();
        return "http".equals(scheme) || "https".equals(scheme)
                || "ftp".equals(scheme);
    }
}