/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.indexer.field;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.scoring.webgraph.LinkDatum;
import org.apache.nutch.scoring.webgraph.Node;
import org.apache.nutch.scoring.webgraph.WebGraph;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/**
 * Creates FieldWritable objects for inbound anchor text. These FieldWritable
 * objects are then included in the input to the FieldIndexer to be converted
 * to Lucene Field objects and indexed.
 *
 * Any empty or null anchor text is ignored. Anchors are sorted in descending
 * order by the score of their parent pages. There are settings for the
 * maximum number of anchors to index and for whether those anchors should be
 * stored and tokenized. Sorting by descending score and capping the number of
 * anchors indexed ensures that only the best anchors are indexed, on the
 * assumption that a higher link-analysis score indicates a better page and
 * better inbound anchor text.
 */
public class AnchorFields
  extends Configured
  implements Tool {

  public static final Log LOG = LogFactory.getLog(AnchorFields.class);

  /**
   * Comparator that orders links in descending order by score.
   */
  private static class DescendingLinkDatumScoreComparator
    implements Comparator<LinkDatum> {

    public int compare(LinkDatum one, LinkDatum two) {
      float scoreOne = one.getScore();
      float scoreTwo = two.getScore();
      return (scoreOne == scoreTwo ? 0 : (scoreOne > scoreTwo ? -1 : 1));
    }
  }
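  /*
   * Illustrative sketch, not part of the original job flow: demonstrates the
   * descending ordering the comparator above imposes. The URLs and scores are
   * made-up example values; only LinkDatum methods already used elsewhere in
   * this class (setUrl, setScore) are assumed.
   */
  static List<LinkDatum> comparatorExample() {
    LinkDatum low = new LinkDatum();
    low.setUrl("http://example.com/low");
    low.setScore(0.1f);
    LinkDatum high = new LinkDatum();
    high.setUrl("http://example.com/high");
    high.setScore(0.9f);
    List<LinkDatum> links = new ArrayList<LinkDatum>();
    links.add(low);
    links.add(high);
    Collections.sort(links, new DescendingLinkDatumScoreComparator());
    // links.get(0) is now the higher-scoring datum, http://example.com/high
    return links;
  }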
  /**
   * Runs the extractor job. Gets the outlinks to be converted while ignoring
   * empty and null anchors.
   *
   * @param webGraphDb The WebGraphDb to pull from.
   * @param output The extractor output.
   *
   * @throws IOException If an error occurs while running the extractor.
   */
  private void runExtractor(Path webGraphDb, Path output)
    throws IOException {

    JobConf extractor = new NutchJob(getConf());
    extractor.setJobName("AnchorFields Extractor");
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb,
      WebGraph.OUTLINK_DIR));
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb,
      WebGraph.NODE_DIR));
    FileOutputFormat.setOutputPath(extractor, output);
    extractor.setInputFormat(SequenceFileInputFormat.class);
    extractor.setMapperClass(Extractor.class);
    extractor.setReducerClass(Extractor.class);
    extractor.setMapOutputKeyClass(Text.class);
    extractor.setMapOutputValueClass(ObjectWritable.class);
    extractor.setOutputKeyClass(Text.class);
    extractor.setOutputValueClass(LinkDatum.class);
    extractor.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting extractor job");
    try {
      JobClient.runJob(extractor);
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
    LOG.info("Finished extractor job.");
  }

  /**
   * Runs the collector job. Aggregates the extracted inlinks, sorts them, and
   * converts the highest scoring into FieldWritable objects. Only inlinks for
   * which basic fields exist are collected, to avoid orphan fields.
   *
   * @param basicFields The BasicFields which must be present to collect
   * anchors, to avoid orphan fields.
   * @param links The outlinks path.
   * @param output The collector output.
   *
   * @throws IOException If an error occurs while running the collector.
   */
  private void runCollector(Path basicFields, Path links, Path output)
    throws IOException {

    JobConf collector = new NutchJob(getConf());
    collector.setJobName("AnchorFields Collector");
    FileInputFormat.addInputPath(collector, links);
    FileInputFormat.addInputPath(collector, basicFields);
    FileOutputFormat.setOutputPath(collector, output);
    collector.setInputFormat(SequenceFileInputFormat.class);
    collector.setMapOutputKeyClass(Text.class);
    collector.setMapOutputValueClass(ObjectWritable.class);
    collector.setMapperClass(Collector.class);
    collector.setReducerClass(Collector.class);
    collector.setOutputKeyClass(Text.class);
    collector.setOutputValueClass(FieldWritable.class);
    collector.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting collector job");
    try {
      JobClient.runJob(collector);
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
    LOG.info("Finished collector job.");
  }
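  /*
   * Illustrative sketch, not part of the original class: the configuration
   * keys read by the Extractor and Collector below, set here to the same
   * defaults this class uses. The values are examples only.
   */
  static Configuration exampleConfiguration() {
    Configuration conf = NutchConfiguration.create();
    conf.setBoolean("link.ignore.empty.anchors", true); // Extractor: skip links with empty anchor text
    conf.setInt("link.max.inlinks", 1000);              // Collector: maximum anchors indexed per URL
    conf.setBoolean("indexer.anchor.tokenize", true);   // Collector: tokenize anchor fields
    conf.setBoolean("indexer.anchor.stored", false);    // Collector: store anchor fields
    return conf;
  }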
  /**
   * Extracts outlinks to be inverted and converted into FieldWritable
   * objects. Ignores empty and null anchors.
   */
  public static class Extractor
    extends Configured
    implements Mapper<Text, Writable, Text, ObjectWritable>,
    Reducer<Text, ObjectWritable, Text, LinkDatum> {

    private boolean ignoreEmptyAnchors = true;
    private JobConf conf;

    /**
     * Default constructor.
     */
    public Extractor() {
    }

    /**
     * Configurable constructor.
     */
    public Extractor(Configuration conf) {
      setConf(conf);
    }

    /**
     * Configures the job, sets whether to ignore empty anchors.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
      ignoreEmptyAnchors = conf.getBoolean("link.ignore.empty.anchors", true);
    }

    /**
     * Wraps values in ObjectWritable.
     */
    public void map(Text key, Writable value,
      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
      throws IOException {

      ObjectWritable objWrite = new ObjectWritable();
      objWrite.set(value);
      output.collect(key, objWrite);
    }

    /**
     * Extracts and inverts outlinks, ignoring empty anchors.
     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, LinkDatum> output, Reporter reporter)
      throws IOException {

      List<LinkDatum> outlinkList = new ArrayList<LinkDatum>();
      Node node = null;

      // collect the outlinks while ignoring links with empty anchor text,
      // and pick up the node for the source url
      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object obj = objWrite.get();
        if (obj instanceof LinkDatum) {
          LinkDatum next = (LinkDatum)obj;
          String anchor = next.getAnchor();
          if (anchor != null) {
            anchor = anchor.trim();
          }
          if (ignoreEmptyAnchors && (anchor == null || anchor.length() == 0)) {
            continue;
          }
          outlinkList.add(next);
        }
        else if (obj instanceof Node) {
          node = (Node)obj;
        }
      }

      // the page must have outlinks with anchor text to be indexed
      if (node != null && outlinkList.size() > 0) {
        String fromUrl = key.toString();
        float outlinkScore = node.getInlinkScore();

        // invert each outlink: key it by its target url, point the datum
        // back at the source url, and carry the source page's score
        for (LinkDatum datum : outlinkList) {
          String toUrl = datum.getUrl();
          datum.setUrl(fromUrl);
          datum.setScore(outlinkScore);
          datum.setLinkType(LinkDatum.INLINK);
          output.collect(new Text(toUrl), datum);
        }
      }
    }

    public void close() {
    }
  }
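  /*
   * Illustrative sketch, not part of the job flow: the inversion performed by
   * Extractor.reduce above, for a single link. The URLs and score are made-up
   * example values, and Node.setInlinkScore is assumed to be the setter
   * matching the getInlinkScore call used above.
   */
  static void inversionExample(OutputCollector<Text, LinkDatum> output)
    throws IOException {

    // a source page with one outlink carrying anchor text
    Node source = new Node();
    source.setInlinkScore(0.75f); // assumed setter, example value
    LinkDatum outlink = new LinkDatum();
    outlink.setUrl("http://example.com/to");

    // the record is re-keyed under the target url, while the datum itself is
    // rewritten to point back at the source page with the source's score
    String toUrl = outlink.getUrl();
    outlink.setUrl("http://example.com/from");
    outlink.setScore(source.getInlinkScore());
    outlink.setLinkType(LinkDatum.INLINK);
    output.collect(new Text(toUrl), outlink);
  }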
  /**
   * Collects inlinks and creates FieldWritable objects from them. Inlinks are
   * first sorted by descending score before being collected.
   */
  public static class Collector
    extends Configured
    implements Mapper<Text, Writable, Text, ObjectWritable>,
    Reducer<Text, ObjectWritable, Text, FieldWritable> {

    private int maxInlinks = 1000;
    private boolean tokenize = true;
    private boolean stored = false;
    private Comparator<LinkDatum> descLinkComp =
      new DescendingLinkDatumScoreComparator();

    /**
     * Configures the job. Sets the maximum number of inlinks and whether to
     * tokenize and store the anchor fields.
     */
    public void configure(JobConf conf) {
      this.maxInlinks = conf.getInt("link.max.inlinks", 1000);
      this.tokenize = conf.getBoolean("indexer.anchor.tokenize", true);
      this.stored = conf.getBoolean("indexer.anchor.stored", false);
    }

    public void close() {
    }

    /**
     * Wraps values in ObjectWritable.
     */
    public void map(Text key, Writable value,
      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
      throws IOException {

      ObjectWritable objWrite = new ObjectWritable();
      objWrite.set(value);
      output.collect(key, objWrite);
    }

    /**
     * Aggregates and sorts the inlinks, then converts up to the maximum
     * number of them to FieldWritable objects.
     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, FieldWritable> output, Reporter reporter)
      throws IOException {

      List<LinkDatum> anchors = new ArrayList<LinkDatum>();
      FieldsWritable basicFields = null;

      // aggregate the inlinks and assign the basic fields
      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object obj = objWrite.get();
        if (obj instanceof LinkDatum) {
          anchors.add((LinkDatum)obj);
        }
        else if (obj instanceof FieldsWritable) {
          basicFields = (FieldsWritable)obj;
        }
      }

      // only collect anchors for urls that have basic fields, otherwise we
      // get orphan entries indexed only under anchor text
      if (basicFields != null && anchors.size() > 0) {

        // sort the anchors by descending score
        Collections.sort(anchors, descLinkComp);

        // collect up to the maximum number of inlinks
        int numToCollect = (maxInlinks > anchors.size() ? anchors.size()
          : maxInlinks);
        for (int i = 0; i < numToCollect; i++) {
          LinkDatum datum = anchors.get(i);
          FieldWritable anchorField = new FieldWritable(Fields.ANCHOR,
            datum.getAnchor(), FieldType.CONTENT, true, stored, tokenize);
          output.collect(key, anchorField);
        }
      }
    }
  }

  /**
   * Creates the anchor FieldWritable objects.
   *
   * @param webGraphDb The WebGraph from which to pull outlinks.
   * @param basicFields The BasicFields that must be present to avoid orphan
   * anchor fields.
   * @param output The AnchorFields output.
   *
   * @throws IOException If an error occurs while creating the fields.
   */
  public void createFields(Path webGraphDb, Path basicFields, Path output)
    throws IOException {

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path tempLinks = new Path(output + "-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    runExtractor(webGraphDb, tempLinks);
    runCollector(basicFields, tempLinks, output);
    fs.delete(tempLinks, true);
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new AnchorFields(),
      args);
    System.exit(res);
  }

  /**
   * Runs the AnchorFields job.
   */
  public int run(String[] args) throws Exception {

    Options options = new Options();
    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
      "show this help message").create("help");
    Option outputOpts = OptionBuilder.withArgName("output").hasArg()
      .withDescription("the output index directory").create("output");
    Option webGraphDbOpts = OptionBuilder.withArgName("webgraphdb").hasArg()
      .withDescription("the webgraphdb to use").create("webgraphdb");
    Option basicFieldOpts = OptionBuilder.withArgName("basicfields").hasArgs()
      .withDescription("the basicfields to use").create("basicfields");
    options.addOption(helpOpts);
    options.addOption(webGraphDbOpts);
    options.addOption(basicFieldOpts);
    options.addOption(outputOpts);

    CommandLineParser parser = new GnuParser();
    try {

      CommandLine line = parser.parse(options, args);
      if (line.hasOption("help") || !line.hasOption("webgraphdb")
        || !line.hasOption("output") || !line.hasOption("basicfields")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("AnchorFields", options);
        return -1;
      }

      String webGraphDb = line.getOptionValue("webgraphdb");
      String output = line.getOptionValue("output");
      String basicFields = line.getOptionValue("basicfields");
      createFields(new Path(webGraphDb), new Path(basicFields),
        new Path(output));
      return 0;
    }
    catch (Exception e) {
      LOG.fatal("AnchorFields: " + StringUtils.stringifyException(e));
      return -2;
    }
  }
}
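// Usage sketch (the crawl paths below are examples only): run after
// BasicFields so the collector has basic fields to join against, e.g.
//
//   bin/nutch org.apache.nutch.indexer.field.AnchorFields \
//     -webgraphdb crawl/webgraphdb \
//     -basicfields crawl/basicfields \
//     -output crawl/anchorfields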