package org.apache.nutchbase.crawl;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.BatchUpdate;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.RowResult;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutchbase.util.hbase.ImmutableRowPart;
import org.apache.nutchbase.util.hbase.RowPart;
import org.apache.nutchbase.util.hbase.TableColumns;
import org.apache.nutchbase.util.hbase.TableMapReduce;
import org.apache.nutchbase.util.hbase.TableUtil;

/**
 * Injects a directory of seed URLs into the HBase-backed web table, in two
 * MapReduce passes:
 * <ol>
 *   <li><b>Pass 1</b> ({@link UrlMapperHbase}): reads URL text files,
 *       normalizes/filters each URL, and writes a temporary metadata marker
 *       ({@code __tmp_inject_key__}) directly into the table for each
 *       accepted URL (map-only, no reduce output).</li>
 *   <li><b>Pass 2</b> (this class as table map/reduce): scans marked rows,
 *       removes the marker, and initializes crawl state (status, fetch time,
 *       interval, score) for rows that did not previously exist — i.e. rows
 *       with no status column.</li>
 * </ol>
 */
public class InjectorHbase
extends TableMapReduce<ImmutableBytesWritable, BooleanWritable>
implements Tool {

  public static final Log LOG = LogFactory.getLog(InjectorHbase.class);

  /** Temporary metadata key marking rows touched by the current injection. */
  private static final String INJECT_KEY_STR = "__tmp_inject_key__";
  private static final String META_INJECT_KEY_STR =
      TableColumns.METADATA_STR + INJECT_KEY_STR;
  private static final byte[] META_INJECT_KEY =
      Bytes.toBytes(META_INJECT_KEY_STR);

  /** Columns fetched by the pass-2 table scan. */
  private static final Set<String> COLUMNS = new HashSet<String>();

  static {
    COLUMNS.add(META_INJECT_KEY_STR);
    COLUMNS.add(TableColumns.STATUS_STR);
  }

  private int interval;        // default refetch interval for new rows
  private float scoreInjected; // initial score for newly injected rows
  private long curTime;        // injection timestamp used as initial fetch time

  /**
   * Pass-1 mapper: one input line per URL. Accepted URLs are key-reversed
   * and marked in the web table via a direct {@link BatchUpdate}; nothing is
   * emitted to the (null) output.
   */
  public static class UrlMapperHbase
      implements Mapper<LongWritable, Text, Text, Text> {

    private URLNormalizers urlNormalizers;
    private URLFilters filters;
    private HTable table;

    /**
     * Normalizes and filters one URL; on success writes the temporary
     * inject marker for its reversed key. URLs rejected by the filter chain
     * or failing normalization are skipped with a warning.
     *
     * @throws IOException if the table connection was never established
     *         or the marker write fails
     */
    public void map(LongWritable key, Text value,
        OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
      if (table == null) {
        throw new IOException("Can not connect to hbase table");
      }
      String url = value.toString();
      String reversedUrl;
      try {
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
        url = filters.filter(url); // null when the URL is filtered out
        if (url == null) {
          return;
        }
        reversedUrl = TableUtil.reverseUrl(url);
      } catch (Exception e) {
        // Best-effort: a bad seed URL must not abort the whole job.
        LOG.warn("Skipping " + url + ":" + e);
        return;
      }
      BatchUpdate bu = new BatchUpdate(reversedUrl);
      bu.put(META_INJECT_KEY, TableUtil.YES_VAL);
      table.commit(bu);
    }

    /** Builds the normalizer/filter chains and opens the target table. */
    public void configure(JobConf job) {
      urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
      filters = new URLFilters(job);
      try {
        table = new HTable(new HBaseConfiguration(), job.get("input.table"));
      } catch (IOException e) {
        // Leave table == null; map() then fails fast per record.
        e.printStackTrace(LogUtil.getFatalStream(LOG));
      }
    }

    public void close() throws IOException {
      // NOTE(review): the HTable is never released here; if this HBase client
      // version exposes HTable.close(), it should be called — confirm against
      // the client API in use.
    }
  }

  /**
   * Pass-2 map: for each scanned row carrying the inject marker, emits
   * whether the row already had a status column (i.e. existed before this
   * injection). Unmarked rows are ignored.
   */
  @Override
  public void map(ImmutableBytesWritable key, RowResult rowResult,
      OutputCollector<ImmutableBytesWritable, BooleanWritable> output,
      Reporter reporter) throws IOException {
    ImmutableRowPart row = new ImmutableRowPart(rowResult);
    if (!row.hasMeta(INJECT_KEY_STR)) {
      return; // not part of this injection run
    }
    output.collect(key,
        new BooleanWritable(row.hasColumn(TableColumns.STATUS)));
  }

  /** Reads injection defaults (interval, score, timestamp) from the job. */
  public void configure(JobConf job) {
    interval = job.getInt("db.fetch.interval.default", 2592000);
    scoreInjected = job.getFloat("db.score.injected", 1.0f);
    curTime = job.getLong("injector.current.time", System.currentTimeMillis());
  }

  /**
   * Pass-2 reduce: always removes the temporary inject marker; additionally
   * initializes crawl state for rows that did not exist before injection.
   */
  @Override
  public void reduce(ImmutableBytesWritable key,
      Iterator<BooleanWritable> values,
      OutputCollector<ImmutableBytesWritable, BatchUpdate> output,
      Reporter reporter) throws IOException {
    // A row is "old" if any map output saw an existing status column.
    // Folding every value (rather than reading only the first) keeps this
    // correct even if more than one value arrives for a key.
    boolean isOld = false;
    while (values.hasNext()) {
      isOld |= values.next().get();
    }
    RowPart row = new RowPart(key.get());
    row.deleteMeta(INJECT_KEY_STR); // drop the temporary marker
    if (!isOld) {
      row.setStatus(CrawlDatumHbase.STATUS_UNFETCHED);
      row.setFetchTime(curTime);
      row.setFetchInterval(interval);
      row.setScore(scoreInjected);
      row.setPagerank(0.0f);
      row.setVotes(0.0f);
      row.setRetriesSinceFetch(0);
    }
    output.collect(key, row.makeBatchUpdate());
  }

  /**
   * Runs both injection passes against {@code table} for the URL files
   * under {@code urlDir}.
   *
   * @param table  name of the HBase web table
   * @param urlDir directory of text files, one URL per line
   * @throws IOException if either MapReduce job fails
   */
  public void inject(String table, Path urlDir) throws IOException {
    LOG.info("InjectorHbase: starting");
    LOG.info("InjectorHbase: urlDir: " + urlDir);

    // Single timestamp shared by the whole injection run.
    long injectTime = System.currentTimeMillis();

    JobConf job = new NutchJob(getConf());
    job.setJobName("inject-hbase-p1 " + urlDir);
    FileInputFormat.addInputPath(job, urlDir);
    job.setMapperClass(UrlMapperHbase.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.set("input.table", table);
    JobClient.runJob(job);

    job = new NutchJob(getConf());
    job.setJobName("inject-hbase-p2 " + urlDir);
    // BUGFIX: the timestamp must go on the pass-2 job — configure() reads
    // "injector.current.time" there; previously it was set on pass 1 only,
    // so pass 2 silently fell back to a different current time.
    job.setLong("injector.current.time", injectTime);
    TableMapReduce.initJob(table, TableUtil.getColumns(COLUMNS),
        InjectorHbase.class, ImmutableBytesWritable.class,
        BooleanWritable.class, job);
    JobClient.runJob(job);

    LOG.info("InjectorHbase: done");
  }

  /**
   * Command-line entry: {@code InjectorHbase <webtable> <url_dir>}.
   *
   * @return 0 on success, -1 on bad usage or failure
   */
  public int run(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("Usage: InjectorHbase <webtable> <url_dir>");
      return -1;
    }
    try {
      inject(args[0], new Path(args[1]));
      return 0;
    } catch (Exception e) {
      LOG.fatal("InjectorHbase: " + StringUtils.stringifyException(e));
      return -1;
    }
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(),
        new InjectorHbase(), args);
    System.exit(res);
  }
}