/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.scoring.webgraph;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.scoring.webgraph.Loops.LoopSet;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/**
* The LinkDumper tool creates a database of node-to-inlink information that
* can be read using the nested Reader class. This allows the inlink and
* scoring state of a single url to be reviewed quickly, to determine why a
* given url is ranking a certain way. This tool is meant to be used alongside
* the LinkRank analysis.
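* <p>
* A typical command-line invocation might look like the following (the
* {@code bin/nutch} wrapper and the path are illustrative):
*
* <pre>
* bin/nutch org.apache.nutch.scoring.webgraph.LinkDumper -webgraphdb crawl/webgraphdb
* </pre>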
*/
public class LinkDumper
extends Configured
implements Tool {
public static final Log LOG = LogFactory.getLog(LinkDumper.class);
public static final String DUMP_DIR = "linkdump";
/**
* Reader class which will print out the url and all of its inlinks to
* System.out. Each inlink will be displayed with its node information,
* including its score and its number of inlinks and outlinks.
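* <p>
* For example (the {@code bin/nutch} wrapper and the paths are illustrative):
*
* <pre>
* bin/nutch 'org.apache.nutch.scoring.webgraph.LinkDumper$Reader' crawl/webgraphdb http://example.com/
* </pre>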
*/
public static class Reader {
public static void main(String[] args)
throws Exception {
if (args == null || args.length < 2) {
System.out.println("LinkDumper$Reader usage: <webgraphdb> <url>");
return;
}
// open the readers for the linkdump directory
Configuration conf = NutchConfiguration.create();
FileSystem fs = FileSystem.get(conf);
Path webGraphDb = new Path(args[0]);
String url = args[1];
MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(
webGraphDb, DUMP_DIR), conf);
// get the link nodes for the url
Text key = new Text(url);
LinkNodes nodes = new LinkNodes();
// getEntry returns null when the url has no entry in the linkdump database
Writable entry = MapFileOutputFormat.getEntry(readers,
new HashPartitioner<Text, LinkNodes>(), key, nodes);
if (entry == null) {
System.out.println("No inlinks found for url: " + url);
FSUtils.closeReaders(readers);
return;
}
// print out the link nodes
LinkNode[] linkNodesAr = nodes.getLinks();
System.out.println(url + ":");
for (LinkNode node : linkNodesAr) {
System.out.println(" " + node.getUrl() + " - "
+ node.getNode().toString());
}
// close the readers
FSUtils.closeReaders(readers);
}
}
/**
* Bean class which holds url to node information.
*/
public static class LinkNode
implements Writable {
private String url = null;
private Node node = null;
public LinkNode() {
}
public LinkNode(String url, Node node) {
this.url = url;
this.node = node;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public Node getNode() {
return node;
}
public void setNode(Node node) {
this.node = node;
}
public void readFields(DataInput in)
throws IOException {
url = in.readUTF();
node = new Node();
node.readFields(in);
}
public void write(DataOutput out)
throws IOException {
out.writeUTF(url);
node.write(out);
}
}
/**
* Writable class which holds an array of LinkNode objects.
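* <p>
* A minimal serialization round-trip sketch, using Hadoop's in-memory
* {@code DataOutputBuffer} and {@code DataInputBuffer} (the url is
* illustrative):
*
* <pre>
* LinkNodes nodes = new LinkNodes(new LinkNode[] {
*   new LinkNode("http://example.com/", new Node()) });
* DataOutputBuffer out = new DataOutputBuffer();
* nodes.write(out);
* DataInputBuffer in = new DataInputBuffer();
* in.reset(out.getData(), out.getLength());
* LinkNodes copy = new LinkNodes();
* copy.readFields(in);
* </pre>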
*/
public static class LinkNodes
implements Writable {
private LinkNode[] links;
public LinkNodes() {
}
public LinkNodes(LinkNode[] links) {
this.links = links;
}
public LinkNode[] getLinks() {
return links;
}
public void setLinks(LinkNode[] links) {
this.links = links;
}
public void readFields(DataInput in)
throws IOException {
int numLinks = in.readInt();
// always allocate the array so that getLinks() never returns null after
// a read
links = new LinkNode[numLinks];
for (int i = 0; i < numLinks; i++) {
LinkNode node = new LinkNode();
node.readFields(in);
links[i] = node;
}
}
public void write(DataOutput out)
throws IOException {
// always write the count, even when zero, so that write and readFields
// stay symmetric
int numLinks = (links != null) ? links.length : 0;
out.writeInt(numLinks);
for (int i = 0; i < numLinks; i++) {
links[i].write(out);
}
}
}
/**
* Inverts outlinks from the WebGraph to inlinks and attaches node
* information.
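* <p>
* For example, an outlink record {@code (A, LinkDatum(B))} joined with the
* node record for A is emitted as the inlink record
* {@code (B, LinkNode(A, nodeA))}.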
*/
public static class Inverter
implements Mapper<Text, Writable, Text, ObjectWritable>,
Reducer<Text, ObjectWritable, Text, LinkNode> {
private JobConf conf;
public void configure(JobConf conf) {
this.conf = conf;
}
/**
* Wraps all values in ObjectWritables.
*/
public void map(Text key, Writable value,
OutputCollector<Text, ObjectWritable> output, Reporter reporter)
throws IOException {
ObjectWritable objWrite = new ObjectWritable();
objWrite.set(value);
output.collect(key, objWrite);
}
/**
* Inverts outlinks to inlinks while attaching node information to the
* outlink.
*/
public void reduce(Text key, Iterator<ObjectWritable> values,
OutputCollector<Text, LinkNode> output, Reporter reporter)
throws IOException {
String fromUrl = key.toString();
List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
Node node = null;
LoopSet loops = null;
// loop through all values aggregating outlinks, saving node and loopset
while (values.hasNext()) {
ObjectWritable write = values.next();
Object obj = write.get();
if (obj instanceof Node) {
node = (Node)obj;
}
else if (obj instanceof LinkDatum) {
outlinks.add((LinkDatum)WritableUtils.clone((LinkDatum)obj, conf));
}
else if (obj instanceof LoopSet) {
loops = (LoopSet)obj;
}
}
// only collect if a node was found for the key and there are outlinks
int numOutlinks = (node != null) ? node.getNumOutlinks() : 0;
if (numOutlinks > 0) {
Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null;
for (int i = 0; i < outlinks.size(); i++) {
LinkDatum outlink = outlinks.get(i);
String toUrl = outlink.getUrl();
// remove any url that is in the loopset, same as LinkRank
if (loopSet != null && loopSet.contains(toUrl)) {
continue;
}
// collect the outlink as an inlink with the node
output.collect(new Text(toUrl), new LinkNode(fromUrl, node));
}
}
}
public void close() {
}
}
/**
* Merges LinkNode objects into a single array value per url. This allows
* all values to be quickly retrieved and printed via the Reader tool.
*/
public static class Merger
implements Reducer<Text, LinkNode, Text, LinkNodes> {
private JobConf conf;
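// hard cap on the number of inlinks kept per url, so that a single
// LinkNodes value stays bounded in size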
private int maxInlinks = 50000;
public void configure(JobConf conf) {
this.conf = conf;
}
/**
* Aggregate all LinkNode objects for a given url.
*/
public void reduce(Text key, Iterator<LinkNode> values,
OutputCollector<Text, LinkNodes> output, Reporter reporter)
throws IOException {
List<LinkNode> nodeList = new ArrayList<LinkNode>();
int numNodes = 0;
while (values.hasNext()) {
LinkNode cur = values.next();
if (numNodes < maxInlinks) {
nodeList.add((LinkNode)WritableUtils.clone(cur, conf));
numNodes++;
}
else {
break;
}
}
LinkNode[] linkNodesAr = nodeList.toArray(new LinkNode[nodeList.size()]);
LinkNodes linkNodes = new LinkNodes(linkNodesAr);
output.collect(key, linkNodes);
}
public void close() {
}
}
/**
* Runs the inverter and merger jobs of the LinkDumper tool to create the
* url to inlink node database.
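* <p>
* A sketch of programmatic use (the webgraphdb path is an assumption):
*
* <pre>
* LinkDumper dumper = new LinkDumper();
* dumper.setConf(NutchConfiguration.create());
* dumper.dumpLinks(new Path("crawl/webgraphdb"));
* </pre>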
*/
public void dumpLinks(Path webGraphDb)
throws IOException {
LOG.info("NodeDumper: starting");
Configuration conf = getConf();
FileSystem fs = FileSystem.get(conf);
Path linkdump = new Path(webGraphDb, DUMP_DIR);
Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
Path loopSetDb = new Path(webGraphDb, Loops.LOOPS_DIR);
boolean loopsExists = fs.exists(loopSetDb);
Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
// run the inverter job
Path tempInverted = new Path(webGraphDb, "inverted-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf inverter = new NutchJob(conf);
inverter.setJobName("LinkDumper: inverter");
FileInputFormat.addInputPath(inverter, nodeDb);
if (loopsExists) {
FileInputFormat.addInputPath(inverter, loopSetDb);
}
FileInputFormat.addInputPath(inverter, outlinkDb);
inverter.setInputFormat(SequenceFileInputFormat.class);
inverter.setMapperClass(Inverter.class);
inverter.setReducerClass(Inverter.class);
inverter.setMapOutputKeyClass(Text.class);
inverter.setMapOutputValueClass(ObjectWritable.class);
inverter.setOutputKeyClass(Text.class);
inverter.setOutputValueClass(LinkNode.class);
FileOutputFormat.setOutputPath(inverter, tempInverted);
inverter.setOutputFormat(SequenceFileOutputFormat.class);
try {
LOG.info("LinkDumper: running inverter");
JobClient.runJob(inverter);
LOG.info("LinkDumper: finished inverter");
}
catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
// run the merger job
JobConf merger = new NutchJob(conf);
merger.setJobName("LinkDumper: merger");
FileInputFormat.addInputPath(merger, tempInverted);
merger.setInputFormat(SequenceFileInputFormat.class);
merger.setReducerClass(Merger.class);
merger.setMapOutputKeyClass(Text.class);
merger.setMapOutputValueClass(LinkNode.class);
merger.setOutputKeyClass(Text.class);
merger.setOutputValueClass(LinkNodes.class);
FileOutputFormat.setOutputPath(merger, linkdump);
merger.setOutputFormat(MapFileOutputFormat.class);
try {
LOG.info("LinkDumper: running merger");
JobClient.runJob(merger);
LOG.info("LinkDumper: finished merger");
}
catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
}
fs.delete(tempInverted, true);
}
public static void main(String[] args)
throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new LinkDumper(),
args);
System.exit(res);
}
/**
* Runs the LinkDumper tool. This simply creates the database; to read the
* values, the nested Reader class must be used.
*/
public int run(String[] args)
throws Exception {
Options options = new Options();
Option helpOpts = OptionBuilder.withArgName("help").withDescription(
"show this help message").create("help");
Option webGraphDbOpts = OptionBuilder.withArgName("webgraphdb").hasArg()
.withDescription("the web graph database to use").create("webgraphdb");
options.addOption(helpOpts);
options.addOption(webGraphDbOpts);
CommandLineParser parser = new GnuParser();
try {
CommandLine line = parser.parse(options, args);
if (line.hasOption("help") || !line.hasOption("webgraphdb")) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("LinkDumper", options);
return -1;
}
String webGraphDb = line.getOptionValue("webgraphdb");
dumpLinks(new Path(webGraphDb));
return 0;
}
catch (Exception e) {
LOG.fatal("LinkDumper: " + StringUtils.stringifyException(e));
return -2;
}
}
}