package org.apache.nutchbase.crawl; import java.io.DataOutputStream; import java.io.IOException; import java.net.URL; import java.util.Date; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Random; import java.util.Set; import java.util.TreeMap; // Commons Logging imports import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.io.BatchUpdate; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.io.RowResult; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.io.Closeable; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapFileOutputFormat; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.RecordWriter; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.TextOutputFormat; import org.apache.hadoop.mapred.lib.HashPartitioner; import org.apache.hadoop.mapred.lib.IdentityMapper; import org.apache.hadoop.mapred.lib.IdentityReducer; import org.apache.hadoop.util.Progressable; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.StringUtil; import org.apache.nutchbase.util.hbase.RowPart; import org.apache.nutchbase.util.hbase.TableColumns; import org.apache.nutchbase.util.hbase.TableMapReduce; import org.apache.nutchbase.util.hbase.TableUtil; public class WebtableStatisticsReader extends Configured implements Tool { public static final Log LOG = LogFactory.getLog(WebtableStatisticsReader.class); private static final Set<String> COLUMNS = new HashSet<String>(); static { COLUMNS.add(TableColumns.FETCH_TIME_STR); COLUMNS.add(TableColumns.SCORE_STR); COLUMNS.add(TableColumns.STATUS_STR); } public static class CrawlDbStatMapReduce extends TableMapReduce<ImmutableBytesWritable, LongWritable> // implements Mapper<Text, CrawlDatumHbase, Text, LongWritable> { LongWritable COUNT_1 = new LongWritable(1); private boolean sort = false; public void configure(JobConf job) { sort = job.getBoolean("db.reader.stats.sort", false ); } public void close() {} public void map(Text key, CrawlDatumHbase value, OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException { // output.collect(new Text("T"), COUNT_1); // output.collect(new Text("status " + value.getName()), COUNT_1); // output.collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1); // output.collect(new Text("s"), new LongWritable((long) (value.getScore() * 1000.0))); // if(sort){ // URL u = new URL(key.toString()); // String host = u.getHost(); // output.collect(new Text("status " + value.getName()() + " " + host), COUNT_1); // } } @Override public void map(ImmutableBytesWritable key, RowResult rowResult, OutputCollector<ImmutableBytesWritable, LongWritable> output, Reporter reporter) throws IOException { String reversedUrl = Bytes.toString(key.get()); String url = TableUtil.unreverseUrl(reversedUrl); RowPart row = new RowPart(rowResult); // System.out.println("url :" + url + " / " + row.getFetchTime()); output.collect(new ImmutableBytesWritable("T".getBytes()), COUNT_1); String status = "status " + CrawlDatumHbase.getName(row.getStatus()); output.collect(new ImmutableBytesWritable(status.getBytes()), COUNT_1); // output.collect(new Text("retry " + row.getRetriesSinceFetch()), COUNT_1); output.collect(new ImmutableBytesWritable("s".getBytes()), new LongWritable((long) (row.getScore() * 1000.0))); } @Override public void reduce(ImmutableBytesWritable key, Iterator<LongWritable> values, OutputCollector<ImmutableBytesWritable, BatchUpdate> output, Reporter reporter) throws IOException { System.out.println("Reducer::reduce"); } } // public static class CrawlDbStatCombiner implements Reducer<Text, LongWritable, Text, LongWritable> { // LongWritable val = new LongWritable(); // // public CrawlDbStatCombiner() { } // public void configure(JobConf job) { } // public void close() {} // public void reduce(Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> output, Reporter reporter) // throws IOException { // val.set(0L); // String k = ((Text)key).toString(); // if (!k.equals("s")) { // while (values.hasNext()) { // LongWritable cnt = (LongWritable)values.next(); // val.set(val.get() + cnt.get()); // } // output.collect(key, val); // } else { // long total = 0; // long min = Long.MAX_VALUE; // long max = Long.MIN_VALUE; // while (values.hasNext()) { // LongWritable cnt = (LongWritable)values.next(); // if (cnt.get() < min) min = cnt.get(); // if (cnt.get() > max) max = cnt.get(); // total += cnt.get(); // } // output.collect(new Text("scn"), new LongWritable(min)); // output.collect(new Text("scx"), new LongWritable(max)); // output.collect(new Text("sct"), new LongWritable(total)); // } // } // } public static class CrawlDbStatReducer implements Reducer<ImmutableBytesWritable, LongWritable, Text, LongWritable> { public void configure(JobConf job) {} public void close() {} public void reduce(ImmutableBytesWritable key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException { String k = Bytes.toString(((ImmutableBytesWritable) key).get()); // System.out.println("k :" + k); if (k.equals("T")) { // sum all values for this key long sum = 0; while (values.hasNext()) { sum += ((LongWritable) values.next()).get(); } // output sum // output.collect(new Text(k), new LongWritable(sum)); System.out.println(k + " --> " + sum); } else if (k.startsWith("status") || k.startsWith("retry")) { LongWritable cnt = new LongWritable(); while (values.hasNext()) { LongWritable val = (LongWritable)values.next(); cnt.set(cnt.get() + val.get()); } // output.collect(new Text(k), cnt); System.out.println(k + " --> " + cnt); } else if (k.equals("scx")) { LongWritable cnt = new LongWritable(Long.MIN_VALUE); while (values.hasNext()) { LongWritable val = (LongWritable)values.next(); if (cnt.get() < val.get()) cnt.set(val.get()); } // output.collect(new Text(k), cnt); System.out.println(k + " --> " + cnt); } else if (k.equals("scn")) { LongWritable cnt = new LongWritable(Long.MAX_VALUE); while (values.hasNext()) { LongWritable val = (LongWritable)values.next(); if (cnt.get() > val.get()) cnt.set(val.get()); } // output.collect(new Text(k), cnt); System.out.println(k + " --> " + cnt); } else if (k.equals("sct")) { LongWritable cnt = new LongWritable(); while (values.hasNext()) { LongWritable val = (LongWritable)values.next(); cnt.set(cnt.get() + val.get()); } // output.collect(new Text(k), cnt); System.out.println(k + " --> " + cnt); } } } public void processStatJob(String webtable, boolean sort) throws IOException { if (LOG.isInfoEnabled()) { LOG.info("Webtable statistics start: " + webtable); } Path tmpFolder = new Path("/tmp", "stat_tmp" + System.currentTimeMillis()); JobConf job = new NutchJob(getConf()); job.setJobName("stats " + webtable); job.setBoolean("db.reader.stats.sort", sort); TableMapReduce.initJob(webtable, TableUtil.getColumns(COLUMNS), CrawlDbStatMapReduce.class, ImmutableBytesWritable.class, LongWritable.class, job); // job.setCombinerClass(CrawlDbStatCombiner.class); job.setReducerClass(CrawlDbStatReducer.class); FileOutputFormat.setOutputPath(job, tmpFolder); job.setOutputFormat(SequenceFileOutputFormat.class); JobClient.runJob(job); // FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME)); // job.setInputFormat(SequenceFileInputFormat.class); // job.setMapperClass(CrawlDbStatMapper.class); // job.setCombinerClass(CrawlDbStatCombiner.class); // job.setReducerClass(CrawlDbStatReducer.class); // FileOutputFormat.setOutputPath(job, tmpFolder); // job.setOutputFormat(SequenceFileOutputFormat.class); // job.setOutputKeyClass(Text.class); // job.setOutputValueClass(LongWritable.class); // reading the result // FileSystem fileSystem = FileSystem.get(getConf()); // SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(), tmpFolder); // // Text key = new Text(); // LongWritable value = new LongWritable(); // // TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>(); // for (int i = 0; i < readers.length; i++) { // SequenceFile.Reader reader = readers[i]; // while (reader.next(key, value)) { // String k = key.toString(); // LongWritable val = stats.get(k); // if (val == null) { // val = new LongWritable(); // if (k.equals("scx")) val.set(Long.MIN_VALUE); // if (k.equals("scn")) val.set(Long.MAX_VALUE); // stats.put(k, val); // } // if (k.equals("scx")) { // if (val.get() < value.get()) val.set(value.get()); // } else if (k.equals("scn")) { // if (val.get() > value.get()) val.set(value.get()); // } else { // val.set(val.get() + value.get()); // } // } // reader.close(); // } // // if (LOG.isInfoEnabled()) { // LOG.info("Statistics for WebTable: " + webtable); // LongWritable totalCnt = stats.get("T"); // stats.remove("T"); // LOG.info("TOTAL urls:\t" + totalCnt.get()); // for (Map.Entry<String, LongWritable> entry : stats.entrySet()) { // String k = entry.getKey(); // LongWritable val = entry.getValue(); // if (k.equals("scn")) { // LOG.info("min score:\t" + (float) (val.get() / 1000.0f)); // } else if (k.equals("scx")) { // LOG.info("max score:\t" + (float) (val.get() / 1000.0f)); // } else if (k.equals("sct")) { // LOG.info("avg score:\t" + (float) ((((double)val.get()) / totalCnt.get()) / 1000.0)); // } else if (k.startsWith("status")) { // String[] st = k.split(" "); // int code = Integer.parseInt(st[1]); // if(st.length >2 ) LOG.info(" " + st[2] +" :\t" + val); // else LOG.info(st[0] +" " +code + " (" + CrawlDatumHbase.getName((byte) code) + "):\t" + val); // } else LOG.info(k + ":\t" + val); // } // } // // removing the tmp folder // fileSystem.delete(tmpFolder, true); if (LOG.isInfoEnabled()) { LOG.info("CrawlDb statistics: done"); } } public void readUrl(String webtable, String url) throws IOException { Text key = new Text(url); //CrawlDatum val = new CrawlDatum(); //CrawlDatum res = (CrawlDatum)MapFileOutputFormat.getEntry(readers, new HashPartitioner<Text, CrawlDatum>(), key, val); // System.out.println("URL: " + url); // if (res != null) { // System.out.println(res); // } else { // System.out.println("not found"); // } } public int run(String[] args) throws Exception { if (args.length < 1) { System.err.println("Usage: CrawlDbReader <webtable> -stats | -url <url>)"); System.err.println("\t<crawldb>\tdirectory name where crawldb is located"); System.err.println("\t-stats [-sort] \tprint overall statistics to System.out"); System.err.println("\t\t[-sort]\tlist status sorted by host"); System.err.println("\t-url <url>\tprint information on <url> to System.out"); System.err.println("\t\t[<min>]\tskip records with scores below this value."); System.err.println("\t\t\tThis can significantly improve performance."); return -1; } String table = args[0]; for (int i = 1; i < args.length; i++) { if (args[i].equals("-stats")) { boolean toSort = false; if(i < args.length - 1 && "-sort".equals(args[i+1])){ toSort = true; i++; } processStatJob(table, toSort); } else if (args[i].equals("-url")) { readUrl(table, args[++i]); } else { System.err.println("\nError: wrong argument " + args[i]); } } return 0; } public static void main(String args[]) throws Exception { int res = ToolRunner.run(NutchConfiguration.create(), new WebtableStatisticsReader(), args); System.exit(res); } }