/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.scoring.webgraph; import java.io.IOException; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.MapFileOutputFormat; import org.apache.hadoop.mapred.lib.HashPartitioner; import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; /** * Reads and prints to system out information for a single node from the NodeDb * in the WebGraph. */ public class NodeReader { private Configuration conf; private FileSystem fs; private MapFile.Reader[] nodeReaders; /** * Prints the content of the Node represented by the url to system out. * * @param webGraphDb The webgraph from which to get the node. * @param url The url of the node. * * @throws IOException If an error occurs while getting the node. */ public void dumpUrl(Path webGraphDb, String url) throws IOException { conf = NutchConfiguration.create(); fs = FileSystem.get(conf); nodeReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb, WebGraph.NODE_DIR), conf); // open the readers, get the node, print out the info, and close the readers Text key = new Text(url); Node node = new Node(); MapFileOutputFormat.getEntry(nodeReaders, new HashPartitioner<Text, Node>(), key, node); System.out.println(url + ":"); System.out.println(" inlink score: " + node.getInlinkScore()); System.out.println(" outlink score: " + node.getOutlinkScore()); System.out.println(" num inlinks: " + node.getNumInlinks()); System.out.println(" num outlinks: " + node.getNumOutlinks()); FSUtils.closeReaders(nodeReaders); } /** * Runs the NodeReader tool. The command line arguments must contain a * webgraphdb path and a url. The url must match the normalized url that is * contained in the NodeDb of the WebGraph. */ public static void main(String[] args) throws Exception { Options options = new Options(); Option helpOpts = OptionBuilder.withArgName("help").withDescription( "show this help message").create("help"); Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg() .withDescription("the webgraphdb to use").create("webgraphdb"); Option urlOpts = OptionBuilder.withArgName("url").hasOptionalArg() .withDescription("the url to dump").create("url"); options.addOption(helpOpts); options.addOption(webGraphOpts); options.addOption(urlOpts); CommandLineParser parser = new GnuParser(); try { // command line must take a webgraphdb and a url CommandLine line = parser.parse(options, args); if (line.hasOption("help") || !line.hasOption("webgraphdb") || !line.hasOption("url")) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("WebGraphReader", options); return; } // dump the values to system out and return String webGraphDb = line.getOptionValue("webgraphdb"); String url = line.getOptionValue("url"); NodeReader reader = new NodeReader(); reader.dumpUrl(new Path(webGraphDb), url); return; } catch (Exception e) { e.printStackTrace(); return; } } }