package org.apache.nutchbase.fetcher;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.io.BatchUpdate;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.RowResult;
import org.apache.hadoop.hbase.mapred.TableInputFormat;
import org.apache.hadoop.hbase.mapred.TableOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.protocol.RobotRules;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.URLUtil;
import org.apache.nutchbase.crawl.CrawlDatumHbase;
import org.apache.nutchbase.crawl.GeneratorHbase;
import org.apache.nutchbase.protocol.ProtocolFactoryHbase;
import org.apache.nutchbase.protocol.ProtocolHbase;
import org.apache.nutchbase.util.hbase.RowPart;
import org.apache.nutchbase.util.hbase.TableColumns;
import org.apache.nutchbase.util.hbase.TableUtil;
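/**
* Multi-threaded fetcher which reads generate-marked rows from an HBase
* webtable, fetches the corresponding pages politely (one queue per host
* or IP address), and writes the results back to the same table. The class
* acts as both the map runner and the reducer of a single MapReduce job.
*
* A minimal usage sketch, assuming a populated webtable named "webtable":
* <pre>
* ToolRunner.run(NutchConfiguration.create(), new FetcherHbase(),
*     new String[] { "webtable", "-threads", "10" });
* </pre>
*/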
public class FetcherHbase
extends Configured
implements MapRunnable<ImmutableBytesWritable, RowResult,
ImmutableBytesWritable, RowPart>,
Reducer<ImmutableBytesWritable, RowPart,
ImmutableBytesWritable, BatchUpdate>,
Tool {
public static final Log LOG = LogFactory.getLog(FetcherHbase.class);
public static final String REDIRECT_DISCOVERED = "__tmp_rdr_disc__";
public static final String TMP_PARSE_MARK = "__tmp_parse_mark__";
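// Columns read from the webtable for each candidate row: last fetch time,
// the representative URL, and the generator's temporary fetch mark.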
private static final Set<String> COLUMNS = new HashSet<String>();
static {
COLUMNS.add(TableColumns.FETCH_TIME_STR);
COLUMNS.add(TableColumns.REPR_URL_STR);
COLUMNS.add(TableColumns.METADATA_STR + GeneratorHbase.TMP_FETCH_MARK);
}
private OutputCollector<ImmutableBytesWritable, RowPart> output;
private Reporter reporter;
private final AtomicInteger activeThreads = new AtomicInteger(0);
private final AtomicInteger spinWaiting = new AtomicInteger(0);
private final long start = System.currentTimeMillis(); // start time of fetcher run
private final AtomicLong lastRequestStart = new AtomicLong(start);
private final AtomicLong bytes = new AtomicLong(0); // total bytes fetched
private final AtomicInteger pages = new AtomicInteger(0); // total pages fetched
private final AtomicInteger errors = new AtomicInteger(0); // total pages errored
private FetchItemQueues fetchQueues;
private QueueFeeder feeder;
/**
* This class describes an item to be fetched.
*/
private static class FetchItem {
ImmutableBytesWritable key;
RowPart row;
String queueID;
String url;
URL u;
public FetchItem(ImmutableBytesWritable key, RowPart row,
String url, URL u, String queueID) {
this.key = key;
this.row = row;
this.url = url;
this.u = u;
this.queueID = queueID;
}
/** Create an item. The queue ID is derived from the <code>byIP</code>
* argument, as either a protocol + hostname pair or a protocol + IP
* address pair.
*/
public static FetchItem create(ImmutableBytesWritable key, RowPart row,
String url, boolean byIP) {
String queueID;
URL u = null;
try {
u = new URL(url);
} catch (final Exception e) {
LOG.warn("Cannot parse url: " + url, e);
return null;
}
final String proto = u.getProtocol().toLowerCase();
String host;
if (byIP) {
try {
final InetAddress addr = InetAddress.getByName(u.getHost());
host = addr.getHostAddress();
} catch (final UnknownHostException e) {
// unable to resolve it, so don't fall back to host name
LOG.warn("Unable to resolve: " + u.getHost() + ", skipping.");
return null;
}
} else {
host = u.getHost();
if (host == null) {
LOG.warn("Unknown host for url: " + url + ", skipping.");
return null;
}
host = host.toLowerCase();
}
queueID = proto + "://" + host;
return new FetchItem(key, row, url, u, queueID);
}
}
/**
* This class handles FetchItems which come from the same host ID (be it
* a proto/hostname or proto/IP pair). It also keeps track of requests in
* progress and elapsed time between requests.
*/
private static class FetchItemQueue {
List<FetchItem> queue = Collections.synchronizedList(new LinkedList<FetchItem>());
Set<FetchItem> inProgress = Collections.synchronizedSet(new HashSet<FetchItem>());
AtomicLong nextFetchTime = new AtomicLong();
long crawlDelay;
long minCrawlDelay;
int maxThreads;
Configuration conf;
public FetchItemQueue(Configuration conf, int maxThreads, long crawlDelay, long minCrawlDelay) {
this.conf = conf;
this.maxThreads = maxThreads;
this.crawlDelay = crawlDelay;
this.minCrawlDelay = minCrawlDelay;
// ready to start
setEndTime(System.currentTimeMillis() - crawlDelay);
}
public int getQueueSize() {
return queue.size();
}
public int getInProgressSize() {
return inProgress.size();
}
public void finishFetchItem(FetchItem it, boolean asap) {
if (it != null) {
inProgress.remove(it);
setEndTime(System.currentTimeMillis(), asap);
}
}
public void addFetchItem(FetchItem it) {
if (it == null) return;
queue.add(it);
}
public void addInProgressFetchItem(FetchItem it) {
if (it == null) return;
inProgress.add(it);
}
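// Politeness gate: hand out an item only when fewer than maxThreads
// requests are in flight for this queue and nextFetchTime has passed.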
public FetchItem getFetchItem() {
if (inProgress.size() >= maxThreads) return null;
final long now = System.currentTimeMillis();
if (nextFetchTime.get() > now) return null;
FetchItem it = null;
if (queue.size() == 0) return null;
try {
it = queue.remove(0);
inProgress.add(it);
} catch (final Exception e) {
LOG.error("Cannot remove FetchItem from queue or cannot add it to inProgress queue", e);
}
return it;
}
public synchronized void dump() {
LOG.info(" maxThreads = " + maxThreads);
LOG.info(" inProgress = " + inProgress.size());
LOG.info(" crawlDelay = " + crawlDelay);
LOG.info(" minCrawlDelay = " + minCrawlDelay);
LOG.info(" nextFetchTime = " + nextFetchTime.get());
LOG.info(" now = " + System.currentTimeMillis());
for (int i = 0; i < queue.size(); i++) {
final FetchItem it = queue.get(i);
LOG.info(" " + i + ". " + it.url);
}
}
private void setEndTime(long endTime) {
setEndTime(endTime, false);
}
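// With asap set (e.g. after a robots.txt denial) the queue is unblocked
// immediately; otherwise the per-host delay is added first.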
private void setEndTime(long endTime, boolean asap) {
if (!asap)
nextFetchTime.set(endTime + (maxThreads > 1 ? minCrawlDelay : crawlDelay));
else
nextFetchTime.set(endTime);
}
}
/**
* Convenience class - a collection of queues that keeps track of the total
* number of items, and provides items eligible for fetching from any queue.
*/
private static class FetchItemQueues {
public static final String DEFAULT_ID = "default";
Map<String, FetchItemQueue> queues = new HashMap<String, FetchItemQueue>();
AtomicInteger totalSize = new AtomicInteger(0);
int maxThreads;
boolean byIP;
long crawlDelay;
long minCrawlDelay;
Configuration conf;
public FetchItemQueues(Configuration conf) {
this.conf = conf;
this.maxThreads = conf.getInt("fetcher.threads.per.host", 1);
// backward-compatible default setting
this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", false);
this.crawlDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
this.minCrawlDelay = (long) (conf.getFloat("fetcher.server.min.delay", 0.0f) * 1000);
}
public int getTotalSize() {
return totalSize.get();
}
public int getQueueCount() {
return queues.size();
}
public void addFetchItem(ImmutableBytesWritable key, RowPart row,
String url) {
final FetchItem it = FetchItem.create(key, row, url, byIP);
if (it != null) addFetchItem(it);
}
public void addFetchItem(FetchItem it) {
final FetchItemQueue fiq = getFetchItemQueue(it.queueID);
fiq.addFetchItem(it);
totalSize.incrementAndGet();
}
public void finishFetchItem(FetchItem it) {
finishFetchItem(it, false);
}
public void finishFetchItem(FetchItem it, boolean asap) {
final FetchItemQueue fiq = queues.get(it.queueID);
if (fiq == null) {
LOG.warn("Attempting to finish item from unknown queue: " + it);
return;
}
fiq.finishFetchItem(it, asap);
}
public synchronized FetchItemQueue getFetchItemQueue(String id) {
FetchItemQueue fiq = queues.get(id);
if (fiq == null) {
// initialize queue
fiq = new FetchItemQueue(conf, maxThreads, crawlDelay, minCrawlDelay);
queues.put(id, fiq);
}
return fiq;
}
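// Scan the per-host queues, reaping any that are empty with nothing in
// flight, and return the first eligible item found.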
public synchronized FetchItem getFetchItem() {
final Iterator<Map.Entry<String, FetchItemQueue>> it =
queues.entrySet().iterator();
while (it.hasNext()) {
final FetchItemQueue fiq = it.next().getValue();
// reap empty queues
if (fiq.getQueueSize() == 0 && fiq.getInProgressSize() == 0) {
it.remove();
continue;
}
final FetchItem fit = fiq.getFetchItem();
if (fit != null) {
totalSize.decrementAndGet();
return fit;
}
}
return null;
}
public synchronized void dump() {
for (final String id : queues.keySet()) {
final FetchItemQueue fiq = queues.get(id);
if (fiq.getQueueSize() == 0) continue;
LOG.info("* queue: " + id);
fiq.dump();
}
}
}
/**
* This class feeds the queues with input items, and re-fills them as
* items are consumed by FetcherThreads.
*/
private static class QueueFeeder extends Thread {
private final RecordReader<ImmutableBytesWritable, RowResult> reader;
private final FetchItemQueues queues;
private final int size;
public QueueFeeder(RecordReader<ImmutableBytesWritable, RowResult> reader,
FetchItemQueues queues, int size) {
this.reader = reader;
this.queues = queues;
this.size = size;
this.setDaemon(true);
this.setName("QueueFeeder");
}
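// Keep at most `size` items buffered across all queues, skipping rows
// that the generator did not mark for fetching.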
@Override
public void run() {
boolean hasMore = true;
int cnt = 0;
while (hasMore) {
int feed = size - queues.getTotalSize();
if (feed <= 0) {
// queues are full - spin-wait until they have some free space
try {
LOG.info("-feeder : spin-waiting while queues are full (size=" + size + ", queues.getTotalSize()=" + queues.getTotalSize() );
Thread.sleep(1000);
} catch (final Exception e) {};
continue;
} else {
LOG.info("-feeding " + feed + " input urls ...");
while (feed > 0 && hasMore) {
try {
final ImmutableBytesWritable key = new ImmutableBytesWritable();
final RowResult rowResult = new RowResult();
hasMore = reader.next(key, rowResult);
if (hasMore) {
final RowPart row = new RowPart(rowResult);
if (!row.hasMeta(GeneratorHbase.TMP_FETCH_MARK)) {
// not marked by generate for fetching
continue;
}
final String url = TableUtil.unreverseUrl(Bytes.toString(key.get()));
queues.addFetchItem(key, row, url);
cnt++;
feed--;
}
} catch (final IOException e) {
LOG.fatal("QueueFeeder error reading input, record " + cnt, e);
return;
}
}
}
}
LOG.info("QueueFeeder finished: total " + cnt + " records.");
}
}
/**
* This class picks items from queues and fetches the pages.
*/
private class FetcherThread extends Thread {
private final URLFilters urlFilters;
private final URLNormalizers normalizers;
private final ProtocolFactoryHbase protocolFactory;
private final long maxCrawlDelay;
private final boolean byIP;
private final int maxRedirect;
private String reprUrl;
private boolean redirecting;
private int redirectCount;
public FetcherThread(Configuration conf, int num) {
this.setDaemon(true); // don't hang JVM on exit
this.setName("FetcherThread" + num); // use an informative name
this.urlFilters = new URLFilters(conf);
this.protocolFactory = new ProtocolFactoryHbase(conf);
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
// backward-compatible default setting, kept consistent with FetchItemQueues
this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", false);
this.maxRedirect = conf.getInt("http.redirect.max", 3);
}
@Override
public void run() {
activeThreads.incrementAndGet(); // count threads
FetchItem fit = null;
try {
while (true) {
fit = fetchQueues.getFetchItem();
// LOG.info("getFetchItem : " + fit);
if (fit == null) {
if (feeder.isAlive() || fetchQueues.getTotalSize() > 0) {
LOG.debug(getName() + " fetchQueues.getFetchItem() was null, spin-waiting ...");
// spin-wait.
spinWaiting.incrementAndGet();
try {
Thread.sleep(500);
} catch (final Exception e) {}
spinWaiting.decrementAndGet();
continue;
} else {
// all done, finish this thread
return;
}
}
lastRequestStart.set(System.currentTimeMillis());
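// Use the stored representative URL when present, else the fetch URL.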
if (!fit.row.hasColumn(TableColumns.REPR_URL)) {
reprUrl = fit.url;
} else {
reprUrl = fit.row.getReprUrl();
}
try {
LOG.info("fetching " + fit.url);
// fetch the page
redirecting = false;
redirectCount = 0;
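// Note: unlike the segment-based Fetcher, redirects are not followed
// inside this loop. handleRedirect() queues the target as a new row for
// a later round, so redirecting stays false and the loop runs once.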
do {
if (LOG.isDebugEnabled()) {
LOG.debug("redirectCount=" + redirectCount);
}
redirecting = false;
final ProtocolHbase protocol = this.protocolFactory.getProtocol(fit.url);
final RobotRules rules = protocol.getRobotRules(fit.url, fit.row);
if (!rules.isAllowed(fit.u)) {
// unblock
fetchQueues.finishFetchItem(fit, true);
if (LOG.isDebugEnabled()) {
LOG.info("Denied by robots.txt: " + fit.url);
}
output(fit, null, ProtocolStatus.STATUS_ROBOTS_DENIED,
CrawlDatumHbase.STATUS_GONE);
continue;
}
if (rules.getCrawlDelay() > 0) {
if (rules.getCrawlDelay() > maxCrawlDelay) {
// unblock
fetchQueues.finishFetchItem(fit, true);
LOG.info("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
output(fit, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatumHbase.STATUS_GONE);
continue;
} else {
final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
fiq.crawlDelay = rules.getCrawlDelay();
}
}
final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.row);
final ProtocolStatus status = output.getStatus();
final Content content = output.getContent();
// unblock queue
fetchQueues.finishFetchItem(fit);
switch(status.getCode()) {
case ProtocolStatus.WOULDBLOCK:
// retry ?
fetchQueues.addFetchItem(fit);
break;
case ProtocolStatus.SUCCESS: // got a page
output(fit, content, status, CrawlDatumHbase.STATUS_FETCHED);
updateStatus(content.getContent().length);
break;
case ProtocolStatus.MOVED: // redirect
case ProtocolStatus.TEMP_MOVED:
byte code;
boolean temp;
if (status.getCode() == ProtocolStatus.MOVED) {
code = CrawlDatumHbase.STATUS_REDIR_PERM;
temp = false;
} else {
code = CrawlDatumHbase.STATUS_REDIR_TEMP;
temp = true;
}
output(fit, content, status, code);
final String newUrl = status.getMessage();
handleRedirect(fit.url, newUrl, temp, Fetcher.PROTOCOL_REDIR);
redirecting = false;
break;
case ProtocolStatus.EXCEPTION:
logError(fit.url, status.getMessage());
/* FALLTHROUGH */
case ProtocolStatus.RETRY: // retry
case ProtocolStatus.BLOCKED:
output(fit, null, status, CrawlDatumHbase.STATUS_RETRY);
break;
case ProtocolStatus.GONE: // gone
case ProtocolStatus.NOTFOUND:
case ProtocolStatus.ACCESS_DENIED:
case ProtocolStatus.ROBOTS_DENIED:
output(fit, null, status, CrawlDatumHbase.STATUS_GONE);
break;
case ProtocolStatus.NOTMODIFIED:
output(fit, null, status, CrawlDatumHbase.STATUS_NOTMODIFIED);
break;
default:
if (LOG.isWarnEnabled()) {
LOG.warn("Unknown ProtocolStatus: " + status.getCode());
}
output(fit, null, status, CrawlDatumHbase.STATUS_RETRY);
}
if (redirecting && redirectCount >= maxRedirect) {
fetchQueues.finishFetchItem(fit);
if (LOG.isInfoEnabled()) {
LOG.info(" - redirect count exceeded " + fit.url);
}
output(fit, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, CrawlDatumHbase.STATUS_GONE);
}
} while (redirecting && (redirectCount < maxRedirect));
} catch (final Throwable t) { // unexpected exception
// unblock
fetchQueues.finishFetchItem(fit);
t.printStackTrace();
logError(fit.url, t.toString());
output(fit, null, ProtocolStatus.STATUS_FAILED, CrawlDatumHbase.STATUS_RETRY);
}
}
} catch (final Throwable e) {
if (LOG.isFatalEnabled()) {
e.printStackTrace(LogUtil.getFatalStream(LOG));
LOG.fatal("fetcher caught:"+e.toString());
}
} finally {
if (fit != null) fetchQueues.finishFetchItem(fit);
activeThreads.decrementAndGet(); // count threads
LOG.info("-finishing thread " + getName() + ", activeThreads=" + activeThreads);
}
}
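/**
* Normalizes and filters the redirect target, updates the representative
* URL, and emits a new row marked REDIRECT_DISCOVERED so the target is
* fetched in a later round rather than followed immediately.
*/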
private void handleRedirect(String url, String newUrl,
boolean temp, String redirType)
throws URLFilterException, IOException {
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
newUrl = urlFilters.filter(newUrl);
if (newUrl == null || newUrl.equals(url)) {
return;
}
reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
final String reversedUrl = TableUtil.reverseUrl(reprUrl);
final ImmutableBytesWritable newKey =
new ImmutableBytesWritable(Bytes.toBytes(reversedUrl));
final RowPart newRow = new RowPart(newKey.get());
if (!reprUrl.equals(url)) {
newRow.setReprUrl(reprUrl);
}
newRow.putMeta(REDIRECT_DISCOVERED, TableUtil.YES_VAL);
output.collect(newKey, newRow);
if (LOG.isDebugEnabled()) {
LOG.debug(" - " + redirType + " redirect to " +
reprUrl + " (fetching later)");
}
}
private void logError(String url, String message) {
if (LOG.isInfoEnabled()) {
LOG.info("fetch of " + url + " failed with: " + message);
}
errors.incrementAndGet();
}
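/**
* Writes the fetch result into the row: crawl status, fetch timestamps,
* protocol status, and, on success, the content plus a temporary parse
* mark so a later parse step can find the row.
*/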
private void output(FetchItem fit, Content content,
ProtocolStatus pstatus, byte status) {
try {
fit.row.setStatus(status);
final long prevFetchTime = fit.row.getFetchTime();
fit.row.setPrevFetchTime(prevFetchTime);
fit.row.setFetchTime(System.currentTimeMillis());
if (pstatus != null) {
fit.row.setProtocolStatus(pstatus);
}
if (content != null) {
fit.row.setContent(content.getContent());
fit.row.setContentType(content.getContentType());
fit.row.setBaseUrl(content.getBaseUrl());
if (status == CrawlDatumHbase.STATUS_FETCHED)
fit.row.putMeta(TMP_PARSE_MARK, TableUtil.YES_VAL);
}
output.collect(fit.key, fit.row);
} catch (final IOException e) {
e.printStackTrace(LogUtil.getFatalStream(LOG));
LOG.fatal("fetcher caught:"+e.toString());
}
}
}
private void updateStatus(int bytesInPage) throws IOException {
pages.incrementAndGet();
bytes.addAndGet(bytesInPage);
}
private void reportStatus() throws IOException {
String status;
final long elapsed = (System.currentTimeMillis() - start)/1000;
status = activeThreads + " threads, " +
pages+" pages, "+errors+" errors, "
+ Math.round(((float)pages.get()*10)/elapsed)/10.0+" pages/s, "
+ Math.round(((((float)bytes.get())*8)/1024)/elapsed)+" kb/s, ";
reporter.setStatus(status);
}
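/**
* Map-runner entry point: starts the queue feeder and the fetcher
* threads, then polls until all threads finish, reporting progress and
* aborting if no request has started within half the task timeout.
*/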
public void run(RecordReader<ImmutableBytesWritable, RowResult> input,
OutputCollector<ImmutableBytesWritable, RowPart> output,
Reporter reporter) throws IOException {
this.output = output;
this.reporter = reporter;
this.fetchQueues = new FetchItemQueues(getConf());
final int threadCount = getConf().getInt("fetcher.threads.fetch", 10);
LOG.info("Fetcher: threads: " + threadCount);
feeder = new QueueFeeder(input, fetchQueues, threadCount * 50);
//feeder.setPriority((Thread.MAX_PRIORITY + Thread.NORM_PRIORITY) / 2);
feeder.start();
// set non-blocking & no-robots mode for HTTP protocol plugins.
getConf().setBoolean(Protocol.CHECK_BLOCKING, false);
getConf().setBoolean(Protocol.CHECK_ROBOTS, false);
for (int i = 0; i < threadCount; i++) { // spawn threads
new FetcherThread(getConf(), i).start();
}
// select a timeout that avoids a task timeout
final long timeout = getConf().getInt("mapred.task.timeout", 10*60*1000)/2;
do { // wait for threads to exit
try {
Thread.sleep(1000);
} catch (final InterruptedException e) {}
reportStatus();
LOG.info("-activeThreads=" + activeThreads + ", spinWaiting=" + spinWaiting.get()
+ ", fetchQueues= " + fetchQueues.getQueueCount() +", fetchQueues.totalSize=" + fetchQueues.getTotalSize());
if (/* !feeder.isAlive() && */ fetchQueues.getTotalSize() < 20) {
fetchQueues.dump();
}
// some requests seem to hang, despite all intentions
if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) {
if (LOG.isWarnEnabled()) {
LOG.warn("Aborting with "+activeThreads+" hung threads.");
}
return;
}
} while (activeThreads.get() > 0);
LOG.info("-activeThreads=" + activeThreads);
}
public void configure(JobConf job) {
}
public void close() throws IOException {
}
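/**
* Reduce side: strips the generator's temporary fetch mark and converts
* each row into a BatchUpdate against the webtable.
*/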
public void reduce(ImmutableBytesWritable key, Iterator<RowPart> values,
OutputCollector<ImmutableBytesWritable, BatchUpdate> output,
Reporter reporter) throws IOException {
while (values.hasNext()) {
final RowPart row = values.next();
// remove the fetch-mark
row.deleteMeta(GeneratorHbase.TMP_FETCH_MARK);
output.collect(key, row.makeBatchUpdate());
}
}
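/**
* Configures and runs the fetch job. The webtable serves as both input
* and output, this class acts as map runner and reducer, and speculative
* execution is disabled so no URL is fetched twice in parallel.
*/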
public void fetch(String table, int threads)
throws IOException {
LOG.info("FetcherHbase: starting");
LOG.info("FetcherHbase: table: " + table);
final JobConf job = new NutchJob(getConf());
job.setJobName("fetch " + table);
if (threads > 0) {
job.setInt("fetcher.threads.fetch", threads);
}
// for politeness, don't permit parallel execution of a single task
job.setSpeculativeExecution(false);
job.setInputFormat(TableInputFormat.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(RowPart.class);
job.setMapRunnerClass(FetcherHbase.class);
FileInputFormat.addInputPaths(job, table);
job.set(TableInputFormat.COLUMN_LIST, getColumnsList(job));
job.setOutputFormat(TableOutputFormat.class);
job.setReducerClass(FetcherHbase.class);
job.set(TableOutputFormat.OUTPUT_TABLE, table);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(BatchUpdate.class);
JobClient.runJob(job);
LOG.info("FetcherHbase: done");
}
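// Scanner column list: the base COLUMNS plus whatever columns the
// configured protocol plugins require.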
private static String getColumnsList(JobConf job) {
final Set<String> columnSet = new HashSet<String>(COLUMNS);
final ProtocolFactoryHbase protocolFactory = new ProtocolFactoryHbase(job);
columnSet.addAll(protocolFactory.getColumnSet());
return TableUtil.getColumns(columnSet);
}
public int run(String[] args) throws Exception {
final String usage = "Usage: FetcherHbase <webtable> [-threads n]";
if (args.length < 1) {
System.err.println(usage);
System.exit(-1);
}
final String table = args[0];
int threads = -1;
if (args.length == 3 && args[1].equals("-threads")) {
// found -threads option
threads = Integer.parseInt(args[2]);
}
fetch(table, threads); // run the Fetcher
return 0;
}
public static void main(String[] args) throws Exception {
final int res = ToolRunner.run(NutchConfiguration.create(), new FetcherHbase(), args);
System.exit(res);
}
}