/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.crawl;

import java.io.DataOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;
import java.util.TreeMap;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Closeable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.Progressable;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;

/**
 * Read utility for the CrawlDB.
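 * <p>
 * The command-line modes below are a summary of the options handled in
 * {@link #main(String[])}; see the usage message printed there for the exact syntax:
 * <ul>
 * <li>{@code -stats [-sort]}: print overall CrawlDb statistics, optionally broken down by host;</li>
 * <li>{@code -dump <out_dir> [-format normal|csv]}: dump the whole db to text or CSV files in {@code <out_dir>};</li>
 * <li>{@code -url <url>}: print the stored {@link CrawlDatum} for a single URL;</li>
 * <li>{@code -topN <nnnn> <out_dir> [<min>]}: dump the top {@code <nnnn>} urls sorted by score.</li>
 * </ul>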
 *
 * @author Andrzej Bialecki
 *
 */
public class CrawlDbReader implements Closeable {

  public static final Log LOG = LogFactory.getLog(CrawlDbReader.class);

  public static final int STD_FORMAT = 0;
  public static final int CSV_FORMAT = 1;

  private MapFile.Reader[] readers = null;

  private void openReaders(String crawlDb, Configuration config) throws IOException {
    if (readers != null) return;
    FileSystem fs = FileSystem.get(config);
    readers = MapFileOutputFormat.getReaders(fs, new Path(crawlDb, CrawlDb.CURRENT_NAME), config);
  }

  private void closeReaders() {
    if (readers == null) return;
    for (int i = 0; i < readers.length; i++) {
      try {
        readers[i].close();
      } catch (Exception e) {
      }
    }
  }

  /** Writes one semicolon-separated line per CrawlDatum, preceded by a header row. */
  public static class CrawlDatumCsvOutputFormat extends FileOutputFormat<Text, CrawlDatum> {
    protected static class LineRecordWriter implements RecordWriter<Text, CrawlDatum> {
      private DataOutputStream out;

      public LineRecordWriter(DataOutputStream out) {
        this.out = out;
        try {
          out.writeBytes("Url;Status code;Status name;Fetch Time;Modified Time;Retries since fetch;Retry interval seconds;Retry interval days;Score;Signature\n");
        } catch (IOException e) {
        }
      }

      public synchronized void write(Text key, CrawlDatum value) throws IOException {
        out.writeByte('"');
        out.writeBytes(key.toString());
        out.writeByte('"');
        out.writeByte(';');
        out.writeBytes(Integer.toString(value.getStatus()));
        out.writeByte(';');
        out.writeByte('"');
        out.writeBytes(CrawlDatum.getStatusName(value.getStatus()));
        out.writeByte('"');
        out.writeByte(';');
        out.writeBytes(new Date(value.getFetchTime()).toString());
        out.writeByte(';');
        out.writeBytes(new Date(value.getModifiedTime()).toString());
        out.writeByte(';');
        out.writeBytes(Integer.toString(value.getRetriesSinceFetch()));
        out.writeByte(';');
        out.writeBytes(Float.toString(value.getFetchInterval()));
        out.writeByte(';');
        out.writeBytes(Float.toString((value.getFetchInterval() / FetchSchedule.SECONDS_PER_DAY)));
        out.writeByte(';');
        out.writeBytes(Float.toString(value.getScore()));
        out.writeByte(';');
        out.writeByte('"');
        out.writeBytes(value.getSignature() != null ?
            StringUtil.toHexString(value.getSignature()) : "null");
        out.writeByte('"');
        out.writeByte('\n');
      }

      public synchronized void close(Reporter reporter) throws IOException {
        out.close();
      }
    }

    public RecordWriter<Text, CrawlDatum> getRecordWriter(FileSystem fs, JobConf job, String name,
        Progressable progress) throws IOException {
      Path dir = FileOutputFormat.getOutputPath(job);
      DataOutputStream fileOut = fs.create(new Path(dir, name), progress);
      return new LineRecordWriter(fileOut);
    }
  }

  /** Emits one counter per record: total count, per-status and per-retry counts, and the score scaled by 1000. */
  public static class CrawlDbStatMapper implements Mapper<Text, CrawlDatum, Text, LongWritable> {
    LongWritable COUNT_1 = new LongWritable(1);
    private boolean sort = false;

    public void configure(JobConf job) {
      sort = job.getBoolean("db.reader.stats.sort", false);
    }

    public void close() {}

    public void map(Text key, CrawlDatum value, OutputCollector<Text, LongWritable> output,
        Reporter reporter) throws IOException {
      output.collect(new Text("T"), COUNT_1);
      output.collect(new Text("status " + value.getStatus()), COUNT_1);
      output.collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1);
      output.collect(new Text("s"), new LongWritable((long) (value.getScore() * 1000.0)));
      if (sort) {
        URL u = new URL(key.toString());
        String host = u.getHost();
        output.collect(new Text("status " + value.getStatus() + " " + host), COUNT_1);
      }
    }
  }

  /** Pre-aggregates counters locally; for the score key "s" it emits min ("scn"), max ("scx") and total ("sct"). */
  public static class CrawlDbStatCombiner implements Reducer<Text, LongWritable, Text, LongWritable> {
    LongWritable val = new LongWritable();

    public CrawlDbStatCombiner() { }
    public void configure(JobConf job) { }
    public void close() {}

    public void reduce(Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> output,
        Reporter reporter) throws IOException {
      val.set(0L);
      String k = ((Text) key).toString();
      if (!k.equals("s")) {
        while (values.hasNext()) {
          LongWritable cnt = (LongWritable) values.next();
          val.set(val.get() + cnt.get());
        }
        output.collect(key, val);
      } else {
        long total = 0;
        long min = Long.MAX_VALUE;
        long max = Long.MIN_VALUE;
        while (values.hasNext()) {
          LongWritable cnt = (LongWritable) values.next();
          if (cnt.get() < min) min = cnt.get();
          if (cnt.get() > max) max = cnt.get();
          total += cnt.get();
        }
        output.collect(new Text("scn"), new LongWritable(min));
        output.collect(new Text("scx"), new LongWritable(max));
        output.collect(new Text("sct"), new LongWritable(total));
      }
    }
  }

  /** Merges the partial counts: sums for totals, statuses and retries, min/max/total for scores. */
  public static class CrawlDbStatReducer implements Reducer<Text, LongWritable, Text, LongWritable> {
    public void configure(JobConf job) {}
    public void close() {}

    public void reduce(Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> output,
        Reporter reporter) throws IOException {
      String k = ((Text) key).toString();
      if (k.equals("T")) {
        // sum all values for this key
        long sum = 0;
        while (values.hasNext()) {
          sum += ((LongWritable) values.next()).get();
        }
        // output sum
        output.collect(key, new LongWritable(sum));
      } else if (k.startsWith("status") || k.startsWith("retry")) {
        LongWritable cnt = new LongWritable();
        while (values.hasNext()) {
          LongWritable val = (LongWritable) values.next();
          cnt.set(cnt.get() + val.get());
        }
        output.collect(key, cnt);
      } else if (k.equals("scx")) {
        LongWritable cnt = new LongWritable(Long.MIN_VALUE);
        while (values.hasNext()) {
          LongWritable val = (LongWritable) values.next();
          if (cnt.get() < val.get()) cnt.set(val.get());
        }
        output.collect(key, cnt);
      } else if (k.equals("scn")) {
        LongWritable cnt = new LongWritable(Long.MAX_VALUE);
        while (values.hasNext()) {
          LongWritable val = (LongWritable) values.next();
          if (cnt.get() > val.get()) cnt.set(val.get());
        }
        output.collect(key, cnt);
      } else if (k.equals("sct")) {
        LongWritable cnt = new LongWritable();
        while (values.hasNext()) {
          LongWritable val = (LongWritable) values.next();
          cnt.set(cnt.get() + val.get());
        }
        output.collect(key, cnt);
      }
    }
  }

  /** Inverts (url, datum) to (-score, url) so records sort by decreasing score; drops records below the minimum score. */
  public static class CrawlDbTopNMapper implements Mapper<Text, CrawlDatum, FloatWritable, Text> {
    private static final FloatWritable fw = new FloatWritable();
    private float min = 0.0f;

    public void configure(JobConf job) {
      long lmin = job.getLong("db.reader.topn.min", 0);
      if (lmin != 0) {
        min = (float) lmin / 1000000.0f;
      }
    }

    public void close() {}

    public void map(Text key, CrawlDatum value, OutputCollector<FloatWritable, Text> output,
        Reporter reporter) throws IOException {
      if (value.getScore() < min) return; // don't collect low-scoring records
      fw.set(-value.getScore()); // reverse sorting order
      output.collect(fw, key); // invert mapping: score -> url
    }
  }

  /** Collects at most topN records per reducer, flipping the score sign back to its original value. */
  public static class CrawlDbTopNReducer implements Reducer<FloatWritable, Text, FloatWritable, Text> {
    private long topN;
    private long count = 0L;

    public void reduce(FloatWritable key, Iterator<Text> values, OutputCollector<FloatWritable, Text> output,
        Reporter reporter) throws IOException {
      while (values.hasNext() && count < topN) {
        key.set(-key.get());
        output.collect(key, values.next());
        count++;
      }
    }

    public void configure(JobConf job) {
      topN = job.getLong("db.reader.topn", 100) / job.getNumReduceTasks();
    }

    public void close() {}
  }

  public void close() {
    closeReaders();
  }

  /** Runs the statistics job over the CrawlDb and logs the aggregated results. */
  public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb statistics start: " + crawlDb);
    }

    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());

    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);
    job.setBoolean("db.reader.stats.sort", sort);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);

    FileOutputFormat.setOutputPath(job, tmpFolder);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    JobClient.runJob(job);

    // reading the result
    FileSystem fileSystem = FileSystem.get(config);
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);

    Text key = new Text();
    LongWritable value = new LongWritable();

    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
      SequenceFile.Reader reader = readers[i];
      while (reader.next(key, value)) {
        String k = key.toString();
        LongWritable val = stats.get(k);
        if (val == null) {
          val = new LongWritable();
          if (k.equals("scx")) val.set(Long.MIN_VALUE);
          if (k.equals("scn")) val.set(Long.MAX_VALUE);
          stats.put(k, val);
        }
        if (k.equals("scx")) {
          if (val.get() < value.get()) val.set(value.get());
        } else if (k.equals("scn")) {
          if (val.get() > value.get()) val.set(value.get());
        } else {
          val.set(val.get() + value.get());
        }
      }
      reader.close();
    }

    if (LOG.isInfoEnabled()) {
      LOG.info("Statistics for CrawlDb: " + crawlDb);
      LongWritable totalCnt = stats.get("T");
      stats.remove("T");
      LOG.info("TOTAL urls:\t" + totalCnt.get());
      for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
        String k = entry.getKey();
        LongWritable val = entry.getValue();
        if (k.equals("scn")) {
          LOG.info("min score:\t" + (float) (val.get() / 1000.0f));
        } else if (k.equals("scx")) {
          LOG.info("max score:\t" + (float) (val.get() / 1000.0f));
        } else if (k.equals("sct")) {
          LOG.info("avg score:\t" + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
        } else if (k.startsWith("status")) {
          String[] st = k.split(" ");
          int code = Integer.parseInt(st[1]);
          if (st.length > 2) LOG.info(" " + st[2] + " :\t" + val);
          else LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
        } else LOG.info(k + ":\t" + val);
      }
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb statistics: done");
    }
  }

  public CrawlDatum get(String crawlDb, String url, Configuration config) throws IOException {
    Text key = new Text(url);
    CrawlDatum val = new CrawlDatum();
    openReaders(crawlDb, config);
    CrawlDatum res = (CrawlDatum) MapFileOutputFormat.getEntry(readers,
        new HashPartitioner<Text, CrawlDatum>(), key, val);
    return res;
  }

  /** Looks up a single URL in the CrawlDb and prints its CrawlDatum, or "not found". */
  public void readUrl(String crawlDb, String url, Configuration config) throws IOException {
    CrawlDatum res = get(crawlDb, url, config);
    System.out.println("URL: " + url);
    if (res != null) {
      System.out.println(res);
    } else {
      System.out.println("not found");
    }
  }

  /** Dumps the whole CrawlDb to text or CSV files in the output directory. */
  public void processDumpJob(String crawlDb, String output, Configuration config, int format) throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb dump: starting");
      LOG.info("CrawlDb db: " + crawlDb);
    }

    Path outFolder = new Path(output);

    JobConf job = new NutchJob(config);
    job.setJobName("dump " + crawlDb);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, outFolder);
    if (format == CSV_FORMAT) job.setOutputFormat(CrawlDatumCsvOutputFormat.class);
    else job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb dump: done");
    }
  }

  /** Two-pass job: first sorts records by decreasing score, then collects the top N into a single output file. */
  public void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config)
      throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
      LOG.info("CrawlDb db: " + crawlDb);
    }

    Path outFolder = new Path(output);
    Path tempDir = new Path(config.get("mapred.temp.dir", ".") +
        "/readdb-topN-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("topN prepare " + crawlDb);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbTopNMapper.class);
    job.setReducerClass(IdentityReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    // XXX hmmm, no setFloat() in the API ... :(
    job.setLong("db.reader.topn.min", Math.round(1000000.0 * min));
    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb topN: collecting topN scores.");
    }
    job = new NutchJob(config);
    job.setJobName("topN collect " + crawlDb);
    job.setLong("db.reader.topn", topN);

    FileInputFormat.addInputPath(job, tempDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(CrawlDbTopNReducer.class);

    FileOutputFormat.setOutputPath(job, outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1); // create a single file.

    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(config);
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb topN: done");
    }
  }

  public static void main(String[] args) throws IOException {
    CrawlDbReader dbr = new CrawlDbReader();

    if (args.length < 1) {
      System.err.println("Usage: CrawlDbReader <crawldb> (-stats | -dump <out_dir> | -topN <nnnn> <out_dir> [<min>] | -url <url>)");
      System.err.println("\t<crawldb>\tdirectory name where crawldb is located");
      System.err.println("\t-stats [-sort] \tprint overall statistics to System.out");
      System.err.println("\t\t[-sort]\tlist status sorted by host");
      System.err.println("\t-dump <out_dir> [-format normal|csv ]\tdump the whole db to a text file in <out_dir>");
      System.err.println("\t\t[-format csv]\tdump in Csv format");
      System.err.println("\t\t[-format normal]\tdump in standard format (default option)");
      System.err.println("\t-url <url>\tprint information on <url> to System.out");
      System.err.println("\t-topN <nnnn> <out_dir> [<min>]\tdump top <nnnn> urls sorted by score to <out_dir>");
      System.err.println("\t\t[<min>]\tskip records with scores below this value.");
      System.err.println("\t\t\tThis can significantly improve performance.");
      return;
    }
    String param = null;
    String crawlDb = args[0];
    Configuration conf = NutchConfiguration.create();
    for (int i = 1; i < args.length; i++) {
      if (args[i].equals("-stats")) {
        boolean toSort = false;
        if (i < args.length - 1 && "-sort".equals(args[i + 1])) {
          toSort = true;
          i++;
        }
        dbr.processStatJob(crawlDb, conf, toSort);
      } else if (args[i].equals("-dump")) {
        param = args[++i];
        String format = "normal";
        if (i < args.length - 1 && "-format".equals(args[i + 1]))
          format = args[i = i + 2];
        dbr.processDumpJob(crawlDb, param, conf, "csv".equals(format) ? CSV_FORMAT : STD_FORMAT);
      } else if (args[i].equals("-url")) {
        param = args[++i];
        dbr.readUrl(crawlDb, param, conf);
      } else if (args[i].equals("-topN")) {
        param = args[++i];
        long topN = Long.parseLong(param);
        param = args[++i];
        float min = 0.0f;
        if (i < args.length - 1) {
          min = Float.parseFloat(args[++i]);
        }
        dbr.processTopNJob(crawlDb, topN, min, param, conf);
      } else {
        System.err.println("\nError: wrong argument " + args[i]);
      }
    }
    return;
  }
}