package org.commoncrawl.service.crawler.util;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.commoncrawl.protocol.CrawlURL;

/**
 * Utility that dumps the contents of a crawl log SequenceFile
 * (Text url key -> CrawlURL metadata value) to the logger.
 */
public class DumpCrawlLog {

  private static final Log LOG = LogFactory.getLog(DumpCrawlLog.class);

  public static void main(String[] args) throws IOException {

    Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("core-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("mapred-site.xml");

    if (args.length != 0) {

      FileSystem fs = FileSystem.get(conf);
      Path path = new Path(args[0]);

      LOG.info("Opening crawl log at:" + path);

      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);

      Text url = new Text();
      CrawlURL urlData = new CrawlURL();

      // walk every (url, metadata) record in the crawl log and log a summary line
      while (reader.next(url, urlData)) {
        LOG.info(
            "URL:" + url.toString()
            + " Result:" + urlData.getLastAttemptResult()
            + " Crawl Time:" + urlData.getLastCrawlTime()
            + " Headers:" + urlData.getHeaders());
      }
      reader.close();
    }
  }
}
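
// Usage sketch: the class takes a single argument, the HDFS path of the crawl log
// SequenceFile to dump. The jar name and the log path below are illustrative
// assumptions, not values taken from this repository:
//
//   hadoop jar commoncrawl.jar org.commoncrawl.service.crawler.util.DumpCrawlLog /crawl/logs/crawlLog_000001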