/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.scoring.webgraph;

import java.io.IOException;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.nutch.scoring.webgraph.Loops.LoopSet;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;

/**
 * The LoopReader tool prints the loopset information for a single url.
 */
public class LoopReader {

  private Configuration conf;
  private FileSystem fs;
  private MapFile.Reader[] loopReaders;

  /**
   * Prints loopset for a single url. The loopset information will show any
   * outlink url that eventually forms a link cycle.
   *
   * The url is printed first, followed by one indented line per url in its
   * loopset. The MapFile readers opened on the loops directory are closed
   * before this method returns, whether it completes normally or throws.
   *
   * @param webGraphDb The WebGraph to check for loops
   * @param url The url to check.
   *
   * @throws IOException If an error occurs while printing loopset information.
   */
  public void dumpUrl(Path webGraphDb, String url) throws IOException {

    // open the readers
    conf = NutchConfiguration.create();
    fs = FileSystem.get(conf);
    loopReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb,
        Loops.LOOPS_DIR), conf);

    try {
      // get the loopset for a given url, if any
      Text key = new Text(url);
      LoopSet loop = new LoopSet();
      MapFileOutputFormat.getEntry(loopReaders,
          new HashPartitioner<Text, LoopSet>(), key, loop);

      // print out each loop url in the set
      System.out.println(url + ":");
      for (String loopUrl : loop.getLoopSet()) {
        System.out.println("  " + loopUrl);
      }
    } finally {
      // close the readers even if the lookup or the printing failed, so the
      // underlying files are not leaked
      FSUtils.closeReaders(loopReaders);
    }
  }

  /**
   * Runs the LoopReader tool. For this tool to work the loops job must have
   * already been run on the corresponding WebGraph.
   *
   * Recognized options are <code>-help</code>, <code>-webgraphdb &lt;path&gt;</code>
   * and <code>-url &lt;url&gt;</code>. The usage message is printed when
   * <code>-help</code> is given or when either of the other two options is
   * missing. Any exception raised while parsing or dumping is printed to
   * standard error as a stack trace.
   *
   * @param args The command line arguments.
   *
   * @throws Exception Declared for compatibility; failures are caught and
   * reported inside this method.
   */
  public static void main(String[] args) throws Exception {

    Options options = new Options();
    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
        "show this help message").create("help");
    Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg()
        .withDescription("the webgraphdb to use").create("webgraphdb");
    // the url value is mandatory: with an optional argument a bare "-url"
    // would pass the hasOption check below and hand a null url to dumpUrl
    Option urlOpts = OptionBuilder.withArgName("url").hasArg()
        .withDescription("the url to dump").create("url");
    options.addOption(helpOpts);
    options.addOption(webGraphOpts);
    options.addOption(urlOpts);

    CommandLineParser parser = new GnuParser();
    try {

      CommandLine line = parser.parse(options, args);
      if (line.hasOption("help") || !line.hasOption("webgraphdb")
          || !line.hasOption("url")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("LoopReader", options);
        return;
      }

      String webGraphDb = line.getOptionValue("webgraphdb");
      String url = line.getOptionValue("url");
      LoopReader reader = new LoopReader();
      reader.dumpUrl(new Path(webGraphDb), url);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}