/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;
import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// SLF4J logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.JexlUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TimingUtil;
import org.apache.commons.jexl2.Expression;
/**
 * Read utility for the CrawlDb. Prints overall statistics, dumps all
 * entries (as text, CSV, or a new CrawlDb), extracts the top-N entries by
 * score, and looks up the record of an individual URL:
 *
 * <pre>{@code
 * CrawlDbReader <crawldb>
 *   (-stats | -dump <out_dir> | -topN <nnnn> <out_dir> [<min>] | -url <url>)
 * }</pre>
 *
 * @author Andrzej Bialecki
 */
public class CrawlDbReader extends Configured implements Closeable, Tool {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
private MapFile.Reader[] readers = null;
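  /**
   * Opens MapFile readers over the current CrawlDb, caching them until
   * {@link #close()} is called.
   */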
private void openReaders(String crawlDb, JobConf config)
throws IOException {
if (readers != null)
return;
Path crawlDbPath = new Path(crawlDb, CrawlDb.CURRENT_NAME);
FileSystem fs = crawlDbPath.getFileSystem(config);
readers = MapFileOutputFormat.getReaders(fs, crawlDbPath, config);
}
  private void closeReaders() {
    if (readers == null)
      return;
    for (int i = 0; i < readers.length; i++) {
      try {
        readers[i].close();
      } catch (Exception e) {
        LOG.debug("Failed to close MapFile reader", e);
      }
    }
    // reset so that openReaders() can reopen the CrawlDb later
    readers = null;
  }
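  /**
   * OutputFormat for the "csv" dump format: writes a header line followed
   * by one CSV record per CrawlDatum. Note that quoted fields (URL,
   * signature, metadata) are not escaped, so embedded quotes can break
   * strict CSV parsers.
   */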
public static class CrawlDatumCsvOutputFormat extends
FileOutputFormat<Text, CrawlDatum> {
protected static class LineRecordWriter implements
RecordWriter<Text, CrawlDatum> {
private DataOutputStream out;
      public LineRecordWriter(DataOutputStream out) {
        this.out = out;
        try {
          out.writeBytes("Url,Status code,Status name,Fetch Time,Modified Time,"
              + "Retries since fetch,Retry interval seconds,"
              + "Retry interval days,Score,Signature,Metadata\n");
        } catch (IOException e) {
          LOG.warn("Failed to write CSV header", e);
        }
      }
public synchronized void write(Text key, CrawlDatum value)
throws IOException {
out.writeByte('"');
out.writeBytes(key.toString());
out.writeByte('"');
out.writeByte(',');
out.writeBytes(Integer.toString(value.getStatus()));
out.writeByte(',');
out.writeByte('"');
out.writeBytes(CrawlDatum.getStatusName(value.getStatus()));
out.writeByte('"');
out.writeByte(',');
out.writeBytes(new Date(value.getFetchTime()).toString());
out.writeByte(',');
out.writeBytes(new Date(value.getModifiedTime()).toString());
out.writeByte(',');
out.writeBytes(Integer.toString(value.getRetriesSinceFetch()));
out.writeByte(',');
out.writeBytes(Float.toString(value.getFetchInterval()));
out.writeByte(',');
        // retry interval in days, as float to avoid integer truncation
        out.writeBytes(Float.toString((float) value.getFetchInterval()
            / FetchSchedule.SECONDS_PER_DAY));
out.writeByte(',');
out.writeBytes(Float.toString(value.getScore()));
out.writeByte(',');
out.writeByte('"');
out.writeBytes(value.getSignature() != null ? StringUtil
.toHexString(value.getSignature()) : "null");
out.writeByte('"');
out.writeByte(',');
out.writeByte('"');
if (value.getMetaData() != null) {
for (Entry<Writable, Writable> e : value.getMetaData().entrySet()) {
out.writeBytes(e.getKey().toString());
out.writeByte(':');
out.writeBytes(e.getValue().toString());
out.writeBytes("|||");
}
}
out.writeByte('"');
out.writeByte('\n');
}
public synchronized void close(Reporter reporter) throws IOException {
out.close();
}
}
public RecordWriter<Text, CrawlDatum> getRecordWriter(FileSystem fs,
JobConf job, String name, Progressable progress) throws IOException {
Path dir = FileOutputFormat.getOutputPath(job);
DataOutputStream fileOut = fs.create(new Path(dir, name), progress);
return new LineRecordWriter(fileOut);
}
}
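  /**
   * Maps each CrawlDatum to counters: "T" (total), per-status and per-retry
   * counts, "sc" (score scaled by 1000), "ft" (fetch time in minutes) and
   * "fi" (fetch interval in seconds). If db.reader.stats.sort is true, also
   * emits per-host status counts.
   */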
public static class CrawlDbStatMapper implements
Mapper<Text, CrawlDatum, Text, LongWritable> {
LongWritable COUNT_1 = new LongWritable(1);
private boolean sort = false;
public void configure(JobConf job) {
sort = job.getBoolean("db.reader.stats.sort", false);
}
public void close() {
}
public void map(Text key, CrawlDatum value,
OutputCollector<Text, LongWritable> output, Reporter reporter)
throws IOException {
output.collect(new Text("T"), COUNT_1);
output.collect(new Text("status " + value.getStatus()), COUNT_1);
      output.collect(new Text("retry " + value.getRetriesSinceFetch()),
          COUNT_1);
output.collect(new Text("sc"), new LongWritable(
(long) (value.getScore() * 1000.0)));
// fetch time (in minutes to prevent from overflows when summing up)
output.collect(new Text("ft"),
new LongWritable(value.getFetchTime() / (1000 * 60)));
// fetch interval (in seconds)
output.collect(new Text("fi"),
new LongWritable(value.getFetchInterval()));
if (sort) {
URL u = new URL(key.toString());
String host = u.getHost();
output.collect(new Text("status " + value.getStatus() + " " + host),
COUNT_1);
}
}
}
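  /**
   * Combiner: sums plain counters locally and folds "sc", "ft" and "fi"
   * values into partial min ("*n"), max ("*x") and total ("*t") aggregates.
   */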
public static class CrawlDbStatCombiner implements
Reducer<Text, LongWritable, Text, LongWritable> {
LongWritable val = new LongWritable();
public CrawlDbStatCombiner() {
}
public void configure(JobConf job) {
}
public void close() {
}
private void reduceMinMaxTotal(String keyPrefix, Iterator<LongWritable> values,
OutputCollector<Text, LongWritable> output, Reporter reporter)
throws IOException {
long total = 0;
long min = Long.MAX_VALUE;
long max = Long.MIN_VALUE;
while (values.hasNext()) {
LongWritable cnt = values.next();
if (cnt.get() < min)
min = cnt.get();
if (cnt.get() > max)
max = cnt.get();
total += cnt.get();
}
output.collect(new Text(keyPrefix+"n"), new LongWritable(min));
output.collect(new Text(keyPrefix+"x"), new LongWritable(max));
output.collect(new Text(keyPrefix+"t"), new LongWritable(total));
}
public void reduce(Text key, Iterator<LongWritable> values,
OutputCollector<Text, LongWritable> output, Reporter reporter)
throws IOException {
val.set(0L);
String k = key.toString();
if (k.equals("sc") || k.equals("ft") || k.equals("fi")) {
reduceMinMaxTotal(k, values, output, reporter);
} else {
while (values.hasNext()) {
LongWritable cnt = values.next();
val.set(val.get() + cnt.get());
}
output.collect(key, val);
}
}
}
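  /**
   * Reducer: merges the combiner output, summing counts and totals and
   * taking the global min for "*n" keys and max for "*x" keys.
   */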
public static class CrawlDbStatReducer implements
Reducer<Text, LongWritable, Text, LongWritable> {
public void configure(JobConf job) {
}
public void close() {
}
public void reduce(Text key, Iterator<LongWritable> values,
OutputCollector<Text, LongWritable> output, Reporter reporter)
throws IOException {
String k = key.toString();
if (k.equals("T")) {
// sum all values for this key
long sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
// output sum
output.collect(key, new LongWritable(sum));
} else if (k.startsWith("status") || k.startsWith("retry")) {
LongWritable cnt = new LongWritable();
while (values.hasNext()) {
LongWritable val = values.next();
cnt.set(cnt.get() + val.get());
}
output.collect(key, cnt);
} else if (k.equals("scx") || k.equals("ftx") || k.equals("fix")) {
LongWritable cnt = new LongWritable(Long.MIN_VALUE);
while (values.hasNext()) {
LongWritable val = values.next();
if (cnt.get() < val.get())
cnt.set(val.get());
}
output.collect(key, cnt);
} else if (k.equals("scn") || k.equals("ftn") || k.equals("fin")) {
LongWritable cnt = new LongWritable(Long.MAX_VALUE);
while (values.hasNext()) {
LongWritable val = values.next();
if (cnt.get() > val.get())
cnt.set(val.get());
}
output.collect(key, cnt);
} else if (k.equals("sct") || k.equals("ftt") || k.equals("fit")) {
LongWritable cnt = new LongWritable();
while (values.hasNext()) {
LongWritable val = values.next();
cnt.set(cnt.get() + val.get());
}
output.collect(key, cnt);
}
}
}
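  /**
   * Inverts (url, datum) into (-score, url) so that the shuffle sorts
   * records by decreasing score; records scoring below db.reader.topn.min
   * are skipped.
   */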
public static class CrawlDbTopNMapper implements
Mapper<Text, CrawlDatum, FloatWritable, Text> {
private static final FloatWritable fw = new FloatWritable();
private float min = 0.0f;
public void configure(JobConf job) {
min = job.getFloat("db.reader.topn.min", 0.0f);
}
public void close() {
}
public void map(Text key, CrawlDatum value,
OutputCollector<FloatWritable, Text> output, Reporter reporter)
throws IOException {
if (value.getScore() < min)
return; // don't collect low-scoring records
fw.set(-value.getScore()); // reverse sorting order
output.collect(fw, key); // invert mapping: score -> url
}
}
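  /**
   * Emits at most topN (score, url) pairs, restoring the sign of the scores
   * negated by {@link CrawlDbTopNMapper}.
   */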
public static class CrawlDbTopNReducer implements
Reducer<FloatWritable, Text, FloatWritable, Text> {
private long topN;
private long count = 0L;
    public void reduce(FloatWritable key, Iterator<Text> values,
        OutputCollector<FloatWritable, Text> output, Reporter reporter)
        throws IOException {
      // scores were negated in the mapper to sort in decreasing order;
      // negate the key once before iterating (negating per value would
      // flip the sign of every other URL sharing the same score)
      key.set(-key.get());
      while (values.hasNext() && count < topN) {
        output.collect(key, values.next());
        count++;
      }
    }
public void configure(JobConf job) {
topN = job.getLong("db.reader.topn", 100) / job.getNumReduceTasks();
}
public void close() {
}
}
public void close() {
closeReaders();
}
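  /**
   * Runs the statistics MapReduce job over the CrawlDb and folds the
   * per-partition output into a single sorted map of counters.
   */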
  private TreeMap<String, LongWritable> processStatJobHelper(String crawlDb,
      Configuration config, boolean sort) throws IOException {
Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
JobConf job = new NutchJob(config);
job.setJobName("stats " + crawlDb);
job.setBoolean("db.reader.stats.sort", sort);
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(CrawlDbStatMapper.class);
job.setCombinerClass(CrawlDbStatCombiner.class);
job.setReducerClass(CrawlDbStatReducer.class);
FileOutputFormat.setOutputPath(job, tmpFolder);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
// https://issues.apache.org/jira/browse/NUTCH-1029
job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
JobClient.runJob(job);
// reading the result
FileSystem fileSystem = tmpFolder.getFileSystem(config);
    SequenceFile.Reader[] statReaders = SequenceFileOutputFormat
        .getReaders(config, tmpFolder);
    Text key = new Text();
    LongWritable value = new LongWritable();
    TreeMap<String, LongWritable> stats = new TreeMap<>();
    for (int i = 0; i < statReaders.length; i++) {
      SequenceFile.Reader reader = statReaders[i];
while (reader.next(key, value)) {
String k = key.toString();
LongWritable val = stats.get(k);
if (val == null) {
val = new LongWritable();
if (k.equals("scx") || k.equals("ftx") || k.equals("fix"))
val.set(Long.MIN_VALUE);
if (k.equals("scn") || k.equals("ftn") || k.equals("fin"))
val.set(Long.MAX_VALUE);
stats.put(k, val);
}
if (k.equals("scx") || k.equals("ftx") || k.equals("fix")) {
if (val.get() < value.get())
val.set(value.get());
} else if (k.equals("scn") || k.equals("ftn") || k.equals("fin")) {
if (val.get() > value.get())
val.set(value.get());
} else {
val.set(val.get() + value.get());
}
}
reader.close();
}
// removing the tmp folder
fileSystem.delete(tmpFolder, true);
return stats;
}
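  /** Runs the statistics job and logs a human-readable summary. */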
public void processStatJob(String crawlDb, Configuration config, boolean sort)
throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb statistics start: " + crawlDb);
}
TreeMap<String, LongWritable> stats = processStatJobHelper(crawlDb, config, sort);
if (LOG.isInfoEnabled()) {
LOG.info("Statistics for CrawlDb: " + crawlDb);
LongWritable totalCnt = stats.get("T");
stats.remove("T");
LOG.info("TOTAL urls:\t" + totalCnt.get());
for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
String k = entry.getKey();
LongWritable val = entry.getValue();
if (k.equals("scn")) {
LOG.info("min score:\t" + (val.get() / 1000.0f));
} else if (k.equals("scx")) {
LOG.info("max score:\t" + (val.get() / 1000.0f));
} else if (k.equals("sct")) {
LOG.info("avg score:\t"
+ (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
} else if (k.equals("ftn")) {
LOG.info("earliest fetch time:\t" + new Date(1000 * 60 * val.get()));
} else if (k.equals("ftx")) {
LOG.info("latest fetch time:\t" + new Date(1000 * 60 * val.get()));
} else if (k.equals("ftt")) {
LOG.info("avg of fetch times:\t"
+ new Date(1000 * 60 * (val.get() / totalCnt.get())));
} else if (k.equals("fin")) {
LOG.info("shortest fetch interval:\t{}",
TimingUtil.secondsToDaysHMS(val.get()));
} else if (k.equals("fix")) {
LOG.info("longest fetch interval:\t{}",
TimingUtil.secondsToDaysHMS(val.get()));
} else if (k.equals("fit")) {
LOG.info("avg fetch interval:\t{}",
TimingUtil.secondsToDaysHMS(val.get() / totalCnt.get()));
} else if (k.startsWith("status")) {
String[] st = k.split(" ");
int code = Integer.parseInt(st[1]);
if (st.length > 2)
LOG.info(" " + st[2] + " :\t" + val);
else
LOG.info(st[0] + " " + code + " ("
+ CrawlDatum.getStatusName((byte) code) + "):\t" + val);
} else
LOG.info(k + ":\t" + val);
}
}
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb statistics: done");
}
}
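  /**
   * Looks up a single URL in the CrawlDb.
   *
   * @return the stored CrawlDatum, or null if the URL is not found
   */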
public CrawlDatum get(String crawlDb, String url, JobConf config)
throws IOException {
Text key = new Text(url);
CrawlDatum val = new CrawlDatum();
openReaders(crawlDb, config);
CrawlDatum res = (CrawlDatum) MapFileOutputFormat.getEntry(readers,
new HashPartitioner<>(), key, val);
return res;
}
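  /** Looks up a single URL and prints its CrawlDatum to System.out. */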
public void readUrl(String crawlDb, String url, JobConf config)
throws IOException {
CrawlDatum res = get(crawlDb, url, config);
System.out.println("URL: " + url);
if (res != null) {
System.out.println(res);
} else {
System.out.println("not found");
}
}
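  /**
   * Dumps the CrawlDb to the output directory in the given format
   * ("normal", "csv" or "crawldb"), optionally filtering by status, URL
   * regex, minimum retry count and/or a JEXL expression evaluated against
   * each CrawlDatum.
   */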
public void processDumpJob(String crawlDb, String output,
JobConf config, String format, String regex, String status,
Integer retry, String expr) throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb dump: starting");
LOG.info("CrawlDb db: " + crawlDb);
}
Path outFolder = new Path(output);
JobConf job = new NutchJob(config);
job.setJobName("dump " + crawlDb);
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
FileOutputFormat.setOutputPath(job, outFolder);
if (format.equals("csv")) {
job.setOutputFormat(CrawlDatumCsvOutputFormat.class);
} else if (format.equals("crawldb")) {
job.setOutputFormat(MapFileOutputFormat.class);
} else {
job.setOutputFormat(TextOutputFormat.class);
}
if (status != null)
job.set("status", status);
if (regex != null)
job.set("regex", regex);
if (retry != null)
job.setInt("retry", retry);
if (expr != null) {
job.set("expr", expr);
LOG.info("CrawlDb db: expr: " + expr);
}
job.setMapperClass(CrawlDbDumpMapper.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);
JobClient.runJob(job);
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb dump: done");
}
}
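  /**
   * Filter mapper for the dump job: passes through only records matching
   * the configured retry count, status, URL regex and JEXL expression.
   */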
public static class CrawlDbDumpMapper implements
Mapper<Text, CrawlDatum, Text, CrawlDatum> {
Pattern pattern = null;
Matcher matcher = null;
String status = null;
Integer retry = null;
Expression expr = null;
public void configure(JobConf job) {
if (job.get("regex", null) != null) {
pattern = Pattern.compile(job.get("regex"));
}
status = job.get("status", null);
retry = job.getInt("retry", -1);
if (job.get("expr", null) != null) {
expr = JexlUtil.parseExpression(job.get("expr", null));
}
}
public void close() {
}
public void map(Text key, CrawlDatum value,
OutputCollector<Text, CrawlDatum> output, Reporter reporter)
throws IOException {
// check retry
if (retry != -1) {
if (value.getRetriesSinceFetch() < retry) {
return;
}
}
// check status
if (status != null
&& !status.equalsIgnoreCase(CrawlDatum.getStatusName(value
.getStatus())))
return;
// check regex
if (pattern != null) {
matcher = pattern.matcher(key.toString());
if (!matcher.matches()) {
return;
}
}
// check expr
if (expr != null) {
if (!value.evaluate(expr)) {
return;
}
}
output.collect(key, value);
}
}
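  /**
   * Extracts the topN URLs by score in two jobs: the first inverts entries
   * to (-score, url) pairs sorted by decreasing score, the second collects
   * the first topN pairs with a single reducer.
   */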
public void processTopNJob(String crawlDb, long topN, float min,
String output, JobConf config) throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
LOG.info("CrawlDb db: " + crawlDb);
}
Path outFolder = new Path(output);
Path tempDir = new Path(config.get("mapred.temp.dir", ".")
+ "/readdb-topN-temp-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf job = new NutchJob(config);
job.setJobName("topN prepare " + crawlDb);
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(CrawlDbTopNMapper.class);
job.setReducerClass(IdentityReducer.class);
FileOutputFormat.setOutputPath(job, tempDir);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setOutputKeyClass(FloatWritable.class);
job.setOutputValueClass(Text.class);
job.setFloat("db.reader.topn.min", min);
JobClient.runJob(job);
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb topN: collecting topN scores.");
}
job = new NutchJob(config);
job.setJobName("topN collect " + crawlDb);
job.setLong("db.reader.topn", topN);
FileInputFormat.addInputPath(job, tempDir);
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(IdentityMapper.class);
job.setReducerClass(CrawlDbTopNReducer.class);
FileOutputFormat.setOutputPath(job, outFolder);
job.setOutputFormat(TextOutputFormat.class);
job.setOutputKeyClass(FloatWritable.class);
job.setOutputValueClass(Text.class);
job.setNumReduceTasks(1); // create a single file.
JobClient.runJob(job);
FileSystem fs = tempDir.getFileSystem(config);
fs.delete(tempDir, true);
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb topN: done");
}
}
public int run(String[] args) throws IOException {
@SuppressWarnings("resource")
CrawlDbReader dbr = new CrawlDbReader();
if (args.length < 2) {
System.err
.println("Usage: CrawlDbReader <crawldb> (-stats | -dump <out_dir> | -topN <nnnn> <out_dir> [<min>] | -url <url>)");
System.err
.println("\t<crawldb>\tdirectory name where crawldb is located");
System.err
.println("\t-stats [-sort] \tprint overall statistics to System.out");
System.err.println("\t\t[-sort]\tlist status sorted by host");
System.err
.println("\t-dump <out_dir> [-format normal|csv|crawldb]\tdump the whole db to a text file in <out_dir>");
System.err.println("\t\t[-format csv]\tdump in Csv format");
System.err
.println("\t\t[-format normal]\tdump in standard format (default option)");
System.err.println("\t\t[-format crawldb]\tdump as CrawlDB");
System.err.println("\t\t[-regex <expr>]\tfilter records with expression");
System.err.println("\t\t[-retry <num>]\tminimum retry count");
System.err
.println("\t\t[-status <status>]\tfilter records by CrawlDatum status");
System.err.println("\t\t[-expr <expr>]\tJexl expression to evaluate for this record");
System.err
.println("\t-url <url>\tprint information on <url> to System.out");
System.err
.println("\t-topN <nnnn> <out_dir> [<min>]\tdump top <nnnn> urls sorted by score to <out_dir>");
System.err
.println("\t\t[<min>]\tskip records with scores below this value.");
System.err.println("\t\t\tThis can significantly improve performance.");
return -1;
}
String param = null;
String crawlDb = args[0];
JobConf job = new NutchJob(getConf());
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-stats")) {
boolean toSort = false;
if (i < args.length - 1 && "-sort".equals(args[i + 1])) {
toSort = true;
i++;
}
dbr.processStatJob(crawlDb, job, toSort);
} else if (args[i].equals("-dump")) {
param = args[++i];
String format = "normal";
String regex = null;
Integer retry = null;
String status = null;
String expr = null;
for (int j = i + 1; j < args.length; j++) {
if (args[j].equals("-format")) {
format = args[++j];
i = i + 2;
}
if (args[j].equals("-regex")) {
regex = args[++j];
i = i + 2;
}
if (args[j].equals("-retry")) {
retry = Integer.parseInt(args[++j]);
i = i + 2;
}
if (args[j].equals("-status")) {
status = args[++j];
i = i + 2;
}
if (args[j].equals("-expr")) {
expr = args[++j];
i=i+2;
}
}
dbr.processDumpJob(crawlDb, param, job, format, regex, status, retry, expr);
} else if (args[i].equals("-url")) {
param = args[++i];
dbr.readUrl(crawlDb, param, job);
} else if (args[i].equals("-topN")) {
param = args[++i];
long topN = Long.parseLong(param);
param = args[++i];
float min = 0.0f;
if (i < args.length - 1) {
min = Float.parseFloat(args[++i]);
}
dbr.processTopNJob(crawlDb, topN, min, param, job);
} else {
System.err.println("\nError: wrong argument " + args[i]);
return -1;
}
}
return 0;
}
public static void main(String[] args) throws Exception {
int result = ToolRunner.run(NutchConfiguration.create(),
new CrawlDbReader(), args);
System.exit(result);
}
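  /**
   * Programmatic counterpart of {@link #run(String[])}: dispatches on type
   * ("stats", "dump", "topN" or "url"), reading parameters from args, and
   * returns either a map of results or a File pointing at the job output.
   *
   * <p>A minimal sketch of a stats query, assuming a crawl directory
   * {@code crawl/} that contains a {@code crawldb} subdirectory:</p>
   *
   * <pre>{@code
   * Map<String, String> args = new HashMap<>();
   * args.put("sort", "true");
   * Object stats = new CrawlDbReader().query(args,
   *     NutchConfiguration.create(), "stats", "crawl");
   * }</pre>
   */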
  public Object query(Map<String, String> args, Configuration conf,
      String type, String crawlId) throws Exception {
Map<String, Object> results = new HashMap<>();
String crawlDb = crawlId + "/crawldb";
if(type.equalsIgnoreCase("stats")){
boolean sort = false;
if(args.containsKey("sort")){
if(args.get("sort").equalsIgnoreCase("true"))
sort = true;
}
TreeMap<String , LongWritable> stats = processStatJobHelper(crawlDb, NutchConfiguration.create(), sort);
LongWritable totalCnt = stats.get("T");
stats.remove("T");
results.put("totalUrls", String.valueOf(totalCnt.get()));
Map<String, Object> statusMap = new HashMap<>();
for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
String k = entry.getKey();
LongWritable val = entry.getValue();
if (k.equals("scn")) {
results.put("minScore", String.valueOf((val.get() / 1000.0f)));
} else if (k.equals("scx")) {
results.put("maxScore", String.valueOf((val.get() / 1000.0f)));
} else if (k.equals("sct")) {
results.put("avgScore", String.valueOf((float) ((((double) val.get()) / totalCnt.get()) / 1000.0)));
} else if (k.startsWith("status")) {
String[] st = k.split(" ");
int code = Integer.parseInt(st[1]);
          if (st.length > 2) {
            // TreeMap iteration guarantees the plain "status <code>" key is
            // seen before any "status <code> <host>" key, so the per-status
            // map created in the else branch already exists here
            @SuppressWarnings("unchecked")
            Map<String, Object> individualStatusInfo = (Map<String, Object>) statusMap
                .get(String.valueOf(code));
            @SuppressWarnings("unchecked")
            Map<String, String> hostValues = (Map<String, String>) individualStatusInfo
                .get("hostValues");
            if (hostValues == null) {
              hostValues = new HashMap<>();
              individualStatusInfo.put("hostValues", hostValues);
            }
            hostValues.put(st[2], String.valueOf(val));
          } else {
            Map<String, Object> individualStatusInfo = new HashMap<>();
            individualStatusInfo.put("statusValue",
                CrawlDatum.getStatusName((byte) code));
            individualStatusInfo.put("count", String.valueOf(val));
            statusMap.put(String.valueOf(code), individualStatusInfo);
          }
} else
results.put(k, String.valueOf(val));
}
results.put("status", statusMap);
return results;
}
if(type.equalsIgnoreCase("dump")){
String output = args.get("out_dir");
String format = "normal";
String regex = null;
Integer retry = null;
String status = null;
String expr = null;
if (args.containsKey("format")) {
format = args.get("format");
}
if (args.containsKey("regex")) {
regex = args.get("regex");
}
if (args.containsKey("retry")) {
retry = Integer.parseInt(args.get("retry"));
}
if (args.containsKey("status")) {
status = args.get("status");
}
if (args.containsKey("expr")) {
expr = args.get("expr");
}
processDumpJob(crawlDb, output, new NutchJob(conf), format, regex, status, retry, expr);
File dumpFile = new File(output+"/part-00000");
return dumpFile;
}
if (type.equalsIgnoreCase("topN")) {
String output = args.get("out_dir");
long topN = Long.parseLong(args.get("nnn"));
float min = 0.0f;
if(args.containsKey("min")){
min = Float.parseFloat(args.get("min"));
}
processTopNJob(crawlDb, topN, min, output, new NutchJob(conf));
File dumpFile = new File(output+"/part-00000");
return dumpFile;
}
if(type.equalsIgnoreCase("url")){
String url = args.get("url");
CrawlDatum res = get(crawlDb, url, new NutchJob(conf));
results.put("status", res.getStatus());
results.put("fetchTime", new Date(res.getFetchTime()));
results.put("modifiedTime", new Date(res.getModifiedTime()));
results.put("retriesSinceFetch", res.getRetriesSinceFetch());
results.put("retryInterval", res.getFetchInterval());
results.put("score", res.getScore());
results.put("signature", StringUtil.toHexString(res.getSignature()));
Map<String, String> metadata = new HashMap<>();
if(res.getMetaData()!=null){
for (Entry<Writable, Writable> e : res.getMetaData().entrySet()) {
metadata.put(String.valueOf(e.getKey()), String.valueOf(e.getValue()));
}
}
results.put("metadata", metadata);
return results;
}
return results;
}
}