package org.apache.nutchbase.crawl;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.io.BatchUpdate;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.RowResult;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.FetchSchedule;
import org.apache.nutch.crawl.Inlink;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureComparator;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutchbase.fetcher.FetcherHbase;
import org.apache.nutchbase.parse.ParseTable;
import org.apache.nutchbase.util.hbase.ImmutableRowPart;
import org.apache.nutchbase.util.hbase.RowPart;
import org.apache.nutchbase.util.hbase.TableColumns;
import org.apache.nutchbase.util.hbase.TableMapReduce;
import org.apache.nutchbase.util.hbase.TableUtil;
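/**
 * Updates the web table with the results of the latest fetch/parse round:
 * applies the fetch schedule to each fetched row, stores newly discovered
 * inlinks, creates rows for newly discovered outlink targets, and clears
 * the temporary per-round marker columns. This is the HBase-backed
 * counterpart of the classic CrawlDb update step.
 */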
public class UpdateTable
extends TableMapReduce<ImmutableBytesWritable, NutchWritable>
implements Tool {
public static final Log LOG = LogFactory.getLog(UpdateTable.class);
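/** Columns the update job needs to read from the web table. */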
private static final Set<String> COLUMNS = new HashSet<String>();
private static final String ALL = "db.update.all";
static {
COLUMNS.add(TableColumns.OUTLINKS_STR);
COLUMNS.add(TableColumns.INLINKS_STR);
COLUMNS.add(TableColumns.STATUS_STR);
COLUMNS.add(TableColumns.METADATA_STR + ParseTable.TMP_UPDATE_MARK);
COLUMNS.add(TableColumns.METADATA_STR + FetcherHbase.REDIRECT_DISCOVERED);
COLUMNS.add(TableColumns.RETRIES_STR);
COLUMNS.add(TableColumns.FETCH_TIME_STR);
COLUMNS.add(TableColumns.MODIFIED_TIME_STR);
COLUMNS.add(TableColumns.FETCH_INTERVAL_STR);
COLUMNS.add(TableColumns.PREV_FETCH_TIME_STR);
COLUMNS.add(TableColumns.PREV_SIGNATURE_STR);
// The reduce phase compares the current signature against the previous
// one, so the signature column must be scanned as well (assuming
// TableColumns defines SIGNATURE_STR alongside PREV_SIGNATURE_STR).
COLUMNS.add(TableColumns.SIGNATURE_STR);
}
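// Job parameters, initialized in configure().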
private int retryMax;
private boolean additionsAllowed;
private int maxInterval;
private float scoreInjected;
private FetchScheduleHbase schedule;
private List<Inlink> inlinks = new ArrayList<Inlink>();
private boolean updateAll;
@Override
public void configure(JobConf job) {
retryMax = job.getInt("db.fetch.retry.max", 3);
additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
scoreInjected = job.getFloat("db.score.injected", 1.0f);
maxInterval = job.getInt("db.fetch.interval.max", 0);
updateAll = job.getBoolean(ALL, false);
schedule = FetchScheduleFactoryHbase.getFetchSchedule(job);
}
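/**
 * Emits the row itself, then one {@link Inlink} per outlink, keyed by the
 * reversed target URL. Rows that are not marked for update are skipped
 * unless db.update.all is set.
 */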
@Override
public void map(ImmutableBytesWritable key, RowResult rowResult,
OutputCollector<ImmutableBytesWritable, NutchWritable> output,
Reporter reporter)
throws IOException {
ImmutableRowPart row = new ImmutableRowPart(rowResult);
if (!updateAll && !row.hasMeta(ParseTable.TMP_UPDATE_MARK)) {
return;
}
output.collect(key, new NutchWritable(rowResult));
Collection<Outlink> outlinks = row.getOutlinks();
if (outlinks.isEmpty()) {
return;
}
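// Row keys are reversed URLs; recover the original URL to use as the
// inlink source for all outlink targets.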
String url = TableUtil.unreverseUrl(Bytes.toString(key.get()));
for (Outlink outlink : outlinks) {
try {
String reversedOut = TableUtil.reverseUrl(outlink.getToUrl());
ImmutableBytesWritable outKey =
new ImmutableBytesWritable(Bytes.toBytes(reversedOut));
output.collect(outKey, new NutchWritable(new Inlink(url, outlink.getAnchor())));
} catch (Exception e) {
// Catching everything is normally bad practice, but one malformed
// outlink should be logged, not crash the whole job.
LOG.info("Failed to process outlink: " + outlink.getToUrl(), e);
}
}
}
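/**
 * Rebuilds a single row: separates the original row (if any) from the
 * inlinks collected in the map phase, updates status and fetch schedule
 * according to the fetch outcome, replaces the stored inlinks, clears the
 * temporary markers and writes the row back as a {@link BatchUpdate}.
 */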
@Override
public void reduce(ImmutableBytesWritable key,
Iterator<NutchWritable> values,
OutputCollector<ImmutableBytesWritable, BatchUpdate> output,
Reporter reporter) throws IOException {
RowResult rowResult = null;
inlinks.clear();
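// Separate the original row from the inlinks gathered in the map phase.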
while (values.hasNext()) {
Writable val = values.next().get();
if (val instanceof RowResult) {
rowResult = (RowResult) val;
} else {
Inlink inlink = (Inlink) val;
inlinks.add(inlink);
}
}
String url;
try {
url = TableUtil.unreverseUrl(Bytes.toString(key.get()));
} catch (Exception e) {
// A newly discovered malformed link may slip past the URL filters,
// producing a key that cannot be unreversed; drop it here.
// TODO: Find a better solution
return;
}
RowPart row;
if (rowResult == null) { // new row
if (!additionsAllowed) {
return;
}
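// Page discovered through an outlink: initialize schedule, status and score.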
row = new RowPart(key.get());
schedule.initializeSchedule(url, row);
row.setStatus(CrawlDatumHbase.STATUS_UNFETCHED);
row.setScore(scoreInjected);
} else {
row = new RowPart(rowResult);
if (row.hasMeta(FetcherHbase.REDIRECT_DISCOVERED) && !row.hasColumn(TableColumns.STATUS)) {
// this row is marked during fetch as the destination of a redirect
// but does not contain anything else, so we initialize it.
schedule.initializeSchedule(url, row);
row.setStatus(CrawlDatumHbase.STATUS_UNFETCHED);
row.setScore(scoreInjected);
} else if (row.hasMeta(ParseTable.TMP_UPDATE_MARK)) { // marked for update
byte status = row.getStatus();
switch (status) {
case CrawlDatumHbase.STATUS_FETCHED: // successful fetch
case CrawlDatumHbase.STATUS_REDIR_TEMP: // successful fetch, redirected, temporary
case CrawlDatumHbase.STATUS_REDIR_PERM: // successful fetch, redirected, permanent
case CrawlDatumHbase.STATUS_NOTMODIFIED: // successful fetch, not modified
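// Work out whether the page content changed; an explicit signature
// comparison below overrides the protocol-level not-modified status.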
int modified = FetchSchedule.STATUS_UNKNOWN;
if (status == CrawlDatumHbase.STATUS_NOTMODIFIED) {
modified = FetchSchedule.STATUS_NOTMODIFIED;
}
byte[] prevSig = row.getPrevSignature();
byte[] signature = row.getSignature();
if (prevSig != null && signature != null) {
if (SignatureComparator._compare(prevSig, signature) != 0) {
modified = FetchSchedule.STATUS_MODIFIED;
} else {
modified = FetchSchedule.STATUS_NOTMODIFIED;
}
}
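// Recompute the schedule; 0L is passed for prevModifiedTime, which
// the table does not track.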
long fetchTime = row.getFetchTime();
long prevFetchTime = row.getPrevFetchTime();
long modifiedTime = row.getModifiedTime();
schedule.setFetchSchedule(url, row, prevFetchTime, 0L,
fetchTime, modifiedTime, modified);
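// Clamp overly long intervals by forcing a refetch.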
if (maxInterval < row.getFetchInterval())
schedule.forceRefetch(url, row, false);
break;
case CrawlDatumHbase.STATUS_RETRY:
schedule.setPageRetrySchedule(url, row, 0L, 0L, row.getFetchTime());
if (row.getRetriesSinceFetch() < retryMax) {
row.setStatus(CrawlDatumHbase.STATUS_UNFETCHED);
} else {
row.setStatus(CrawlDatumHbase.STATUS_GONE);
}
break;
case CrawlDatumHbase.STATUS_GONE:
schedule.setPageGoneSchedule(url, row, 0L, 0L, row.getFetchTime());
break;
}
}
}
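// Replace the stored inlinks with the set collected in this round.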
row.deleteAllInlinks();
for (Inlink inlink : inlinks) {
row.addInlink(inlink);
}
// clear markers
row.deleteMeta(FetcherHbase.REDIRECT_DISCOVERED);
row.deleteMeta(GeneratorHbase.TMP_FETCH_MARK);
row.deleteMeta(FetcherHbase.TMP_PARSE_MARK);
row.deleteMeta(ParseTable.TMP_UPDATE_MARK);
output.collect(key, row.makeBatchUpdate());
}
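/** Sets up and runs the update job over the given web table. */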
private void updateTable(String table, boolean updateAll) throws IOException {
LOG.info("UpdateTable: starting");
LOG.info("UpdateTable: table: " + table);
if (updateAll && LOG.isWarnEnabled())
LOG.warn("Running update with reset enabled - whole table will be modified.");
JobConf job = new NutchJob(getConf());
job.setBoolean(ALL, updateAll);
job.setJobName("update-table " + table);
TableMapReduce.initJob(table, TableUtil.getColumns(COLUMNS),
UpdateTable.class, ImmutableBytesWritable.class,
NutchWritable.class, job);
JobClient.runJob(job);
LOG.info("UpdateTable: done");
}
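/** Parses command-line arguments and runs the update. */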
public int run(String[] args) throws Exception {
String usage = "Usage: UpdateTable <webtable> [-all]";
if (args.length < 1) {
System.err.println(usage);
return -1;
}
boolean updateAll = false;
for (int i = 1; i < args.length; i++) {
if ("-all".equals(args[i]))
updateAll = true;
}
updateTable(args[0], updateAll);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new UpdateTable(), args);
System.exit(res);
}
}