/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.scoring.webgraph;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.URLUtil;

/**
 * Creates three databases, one for inlinks, one for outlinks, and a node
 * database that holds the number of in and outlinks to a url and the current
 * score for the url.
 *
 * The score is set by an analysis program such as LinkRank. The WebGraph is
 * an updateable database. Outlinks are stored by their fetch time or by the
 * current system time if no fetch time is available. Only the most recent
 * version of outlinks for a given url is stored. As more crawls are executed
 * and the WebGraph updated, newer Outlinks will replace older Outlinks. This
 * allows the WebGraph to adapt to changes in the link structure of the web.
 *
 * The Inlink database is created from the Outlink database and is regenerated
 * when the WebGraph is updated. The Node database is created from both the
 * Inlink and Outlink databases.
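 *
 * A typical command-line invocation looks like the following sketch; the
 * webgraphdb and segment paths are only illustrative:
 *
 * <pre>
 * bin/nutch org.apache.nutch.scoring.webgraph.WebGraph \
 *   -webgraphdb crawl/webgraphdb \
 *   -segment crawl/segments/20090214120000
 * </pre>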
 *
 * Because the Node database is overwritten when the WebGraph is updated, and
 * because the Node database holds the current scores for urls, it is
 * recommended that a crawl-cycle (one or more full crawls) fully complete
 * before the WebGraph is updated and some type of analysis, such as LinkRank,
 * is run to update scores in the Node database in a stable fashion.
 */
public class WebGraph extends Configured implements Tool {

  public static final Log LOG = LogFactory.getLog(WebGraph.class);
  public static final String LOCK_NAME = ".locked";
  public static final String INLINK_DIR = "inlinks";
  public static final String OUTLINK_DIR = "outlinks";
  public static final String NODE_DIR = "nodes";

  /**
   * The OutlinkDb creates a database of all outlinks. Outlinks to internal
   * urls by domain and host can be ignored. The number of Outlinks out to a
   * given page or domain can also be limited.
   */
  public static class OutlinkDb extends Configured implements
      Mapper<Text, Writable, Text, LinkDatum>,
      Reducer<Text, LinkDatum, Text, LinkDatum> {

    // ignoring internal domains, internal hosts
    private boolean ignoreDomain = true;
    private boolean ignoreHost = true;

    // limiting urls out to a page or to a domain
    private boolean limitPages = true;
    private boolean limitDomains = true;

    // url normalizers and job configuration
    private URLNormalizers urlNormalizers;
    private JobConf conf;

    /**
     * Normalizes and trims extra whitespace from the given url.
     *
     * @param url The url to normalize.
     *
     * @return The normalized url.
     */
    private String normalizeUrl(String url) {

      String normalized = null;
      if (urlNormalizers != null) {
        try {

          // normalize and trim the url
          normalized = urlNormalizers.normalize(url,
              URLNormalizers.SCOPE_DEFAULT);
          normalized = normalized.trim();
        } catch (Exception e) {
          LOG.warn("Skipping " + url + ": " + e);
          normalized = null;
        }
      }
      return normalized;
    }

    /**
     * Returns the fetch time from the parse data, or the current system time
     * if the fetch time doesn't exist.
     *
     * @param data The parse data.
     *
     * @return The fetch time as a long.
     */
    private long getFetchTime(ParseData data) {

      // default to the current system time
      long fetchTime = System.currentTimeMillis();
      String fetchTimeStr = data.getContentMeta().get(Nutch.FETCH_TIME_KEY);
      try {
        // get the fetch time from the parse data
        fetchTime = Long.parseLong(fetchTimeStr);
      } catch (Exception e) {
        fetchTime = System.currentTimeMillis();
      }
      return fetchTime;
    }

    /**
     * Default constructor.
     */
    public OutlinkDb() {
    }

    /**
     * Configurable constructor.
     */
    public OutlinkDb(Configuration conf) {
      setConf(conf);
    }

    /**
     * Configures the OutlinkDb job. Sets up internal links and link limiting.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
      ignoreHost = conf.getBoolean("link.ignore.internal.host", true);
      ignoreDomain = conf.getBoolean("link.ignore.internal.domain", true);
      limitPages = conf.getBoolean("link.ignore.limit.page", true);
      limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
      urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
    }

    /**
     * Passes through existing LinkDatum objects from an existing OutlinkDb
     * and maps out new LinkDatum objects from the ParseData of new crawls.
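     *
     * When the same outlink url appears more than once in a page's
     * ParseData, only a single entry is kept, and a stored null anchor is
     * replaced by the anchor of a later duplicate.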
     */
    public void map(Text key, Writable value,
        OutputCollector<Text, LinkDatum> output, Reporter reporter)
        throws IOException {

      // normalize the url, stop processing if null
      String url = normalizeUrl(key.toString());
      if (url == null) {
        return;
      }

      if (value instanceof ParseData) {

        // get the parse data and the outlinks from the parse data, along
        // with the fetch time for those links
        ParseData data = (ParseData)value;
        long fetchTime = getFetchTime(data);
        Outlink[] outlinkAr = data.getOutlinks();
        Map<String, String> outlinkMap = new LinkedHashMap<String, String>();

        // normalize the urls and put them into the map
        if (outlinkAr != null && outlinkAr.length > 0) {
          for (int i = 0; i < outlinkAr.length; i++) {
            Outlink outlink = outlinkAr[i];
            String toUrl = normalizeUrl(outlink.getToUrl());

            // only put the url into the map if it isn't there already, or if
            // it is but its anchor is null, in which case the anchor is
            // replaced
            boolean existingUrl = outlinkMap.containsKey(toUrl);
            if (toUrl != null
                && (!existingUrl || outlinkMap.get(toUrl) == null)) {
              outlinkMap.put(toUrl, outlink.getAnchor());
            }
          }
        }

        // collect the outlinks under the fetch time
        for (String outlinkUrl : outlinkMap.keySet()) {
          String anchor = outlinkMap.get(outlinkUrl);
          LinkDatum datum = new LinkDatum(outlinkUrl, anchor, fetchTime);
          output.collect(key, datum);
        }
      } else if (value instanceof LinkDatum) {

        // collect existing outlinks from an existing OutlinkDb
        output.collect(key, (LinkDatum)value);
      }
    }

    public void reduce(Text key, Iterator<LinkDatum> values,
        OutputCollector<Text, LinkDatum> output, Reporter reporter)
        throws IOException {

      // aggregate all outlinks and find the most recent fetch timestamp,
      // which should be the timestamp for all of the most recent outlinks
      long mostRecent = 0L;
      List<LinkDatum> outlinkList = new ArrayList<LinkDatum>();
      while (values.hasNext()) {

        // loop through, swapping in the most recent timestamp if needed
        LinkDatum next = values.next();
        long timestamp = next.getTimestamp();
        if (mostRecent == 0L || mostRecent < timestamp) {
          mostRecent = timestamp;
        }
        outlinkList.add((LinkDatum)WritableUtils.clone(next, conf));
      }

      // get the url, domain, and host for the url
      String url = key.toString();
      String domain = URLUtil.getDomainName(url);
      String host = URLUtil.getHost(url);

      // set up checking sets for domains and pages
      Set<String> domains = new HashSet<String>();
      Set<String> pages = new HashSet<String>();

      // loop through the link datums
      for (LinkDatum datum : outlinkList) {

        // get the url, host, domain, and page for each outlink
        String toUrl = datum.getUrl();
        String toDomain = URLUtil.getDomainName(toUrl);
        String toHost = URLUtil.getHost(toUrl);
        String toPage = URLUtil.getPage(toUrl);
        datum.setLinkType(LinkDatum.OUTLINK);

        // outlinks must be the most recent and must conform to the internal
        // url and limiting rules; if so, collect them
        if (datum.getTimestamp() == mostRecent
            && (!limitPages || !pages.contains(toPage))
            && (!limitDomains || !domains.contains(toDomain))
            && (!ignoreHost || !toHost.equalsIgnoreCase(host))
            && (!ignoreDomain || !toDomain.equalsIgnoreCase(domain))) {
          output.collect(key, datum);
          pages.add(toPage);
          domains.add(toDomain);
        }
      }
    }

    public void close() {
    }
  }

  /**
   * The InlinkDb creates a database of Inlinks. Inlinks are inverted from the
   * OutlinkDb LinkDatum objects and are regenerated each time the WebGraph is
   * updated.
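   *
   * All inverted LinkDatum objects share a single timestamp: the system time
   * recorded when the map task is configured.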
   */
  private static class InlinkDb extends Configured implements
      Mapper<Text, LinkDatum, Text, LinkDatum> {

    private JobConf conf;
    private long timestamp;

    /**
     * Default constructor.
     */
    public InlinkDb() {
    }

    /**
     * Configurable constructor.
     */
    public InlinkDb(Configuration conf) {
      setConf(conf);
    }

    /**
     * Configures the job. Sets the timestamp for all Inlink LinkDatum objects
     * to the current system time.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
      timestamp = System.currentTimeMillis();
    }

    public void close() {
    }

    /**
     * Inverts the Outlink LinkDatum objects into new LinkDatum objects with
     * a new system timestamp, the INLINK type, and the from and to urls
     * switched.
     */
    public void map(Text key, LinkDatum datum,
        OutputCollector<Text, LinkDatum> output, Reporter reporter)
        throws IOException {

      // get the from and to urls and the anchor
      String fromUrl = key.toString();
      String toUrl = datum.getUrl();
      String anchor = datum.getAnchor();

      // flip the from and to urls and set the new link type
      LinkDatum inlink = new LinkDatum(fromUrl, anchor, timestamp);
      inlink.setLinkType(LinkDatum.INLINK);
      output.collect(new Text(toUrl), inlink);
    }
  }

  /**
   * Creates the Node database, which consists of the number of in and
   * outlinks for each url and a score slot for analysis programs such as
   * LinkRank.
   */
  private static class NodeDb extends Configured implements
      Reducer<Text, LinkDatum, Text, Node> {

    private JobConf conf;

    /**
     * Default constructor.
     */
    public NodeDb() {
    }

    /**
     * Configurable constructor.
     */
    public NodeDb(Configuration conf) {
      setConf(conf);
    }

    /**
     * Configures the job.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
    }

    public void close() {
    }

    /**
     * Counts the number of inlinks and outlinks for each url and sets a
     * default score of 0.0 for each url (node) in the webgraph.
     */
    public void reduce(Text key, Iterator<LinkDatum> values,
        OutputCollector<Text, Node> output, Reporter reporter)
        throws IOException {

      Node node = new Node();
      int numInlinks = 0;
      int numOutlinks = 0;

      // loop through, counting the number of in and out links
      while (values.hasNext()) {
        LinkDatum next = values.next();
        if (next.getLinkType() == LinkDatum.INLINK) {
          numInlinks++;
        } else if (next.getLinkType() == LinkDatum.OUTLINK) {
          numOutlinks++;
        }
      }

      // set the in and outlinks and a default score of 0
      node.setNumInlinks(numInlinks);
      node.setNumOutlinks(numOutlinks);
      node.setInlinkScore(0.0f);
      output.collect(key, node);
    }
  }

  /**
   * Creates the three different WebGraph databases: Outlinks, Inlinks, and
   * Node. If a current WebGraph exists it is updated; if it doesn't exist, a
   * new WebGraph database is created.
   *
   * @param webGraphDb The WebGraph to create or update.
   * @param segments The array of segments used to update the WebGraph. Newer
   *          segments and fetch times will overwrite older segments.
   *
   * @throws IOException If an error occurs while processing the WebGraph.
   */
  public void createWebGraph(Path webGraphDb, Path[] segments)
      throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("WebGraphDb: starting");
      LOG.info("WebGraphDb: webgraphdb: " + webGraphDb);
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // lock an existing webgraphdb to prevent multiple simultaneous updates
    Path lock = new Path(webGraphDb, LOCK_NAME);
    boolean webGraphDbExists = fs.exists(webGraphDb);
    if (webGraphDbExists) {
      LockUtil.createLockFile(fs, lock, false);
    } else {
      // if the webgraph doesn't exist, create it
      fs.mkdirs(webGraphDb);
    }

    // outlink and temp outlink database paths
    Path outlinkDb = new Path(webGraphDb, OUTLINK_DIR);
    Path tempOutlinkDb = new Path(outlinkDb + "-"
        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf outlinkJob = new NutchJob(conf);
    outlinkJob.setJobName("Outlinkdb: " + outlinkDb);

    // get the parse data for all segments
    if (segments != null) {
      for (int i = 0; i < segments.length; i++) {
        Path parseData = new Path(segments[i], ParseData.DIR_NAME);
        if (fs.exists(parseData)) {
          LOG.info("OutlinkDb: adding input: " + parseData);
          FileInputFormat.addInputPath(outlinkJob, parseData);
        }
      }
    }

    // add the existing webgraph
    if (webGraphDbExists) {
      LOG.info("OutlinkDb: adding input: " + outlinkDb);
      FileInputFormat.addInputPath(outlinkJob, outlinkDb);
    }

    outlinkJob.setInputFormat(SequenceFileInputFormat.class);
    outlinkJob.setMapperClass(OutlinkDb.class);
    outlinkJob.setReducerClass(OutlinkDb.class);
    outlinkJob.setMapOutputKeyClass(Text.class);
    outlinkJob.setMapOutputValueClass(LinkDatum.class);
    outlinkJob.setOutputKeyClass(Text.class);
    outlinkJob.setOutputValueClass(LinkDatum.class);
    FileOutputFormat.setOutputPath(outlinkJob, tempOutlinkDb);
    outlinkJob.setOutputFormat(MapFileOutputFormat.class);

    // run the outlinkdb job and replace any old outlinkdb with the new one
    try {
      LOG.info("OutlinkDb: running");
      JobClient.runJob(outlinkJob);
      LOG.info("OutlinkDb: installing " + outlinkDb);
      FSUtils.replace(fs, outlinkDb, tempOutlinkDb, true);
      LOG.info("OutlinkDb: finished");
    } catch (IOException e) {

      // remove the lock file and the temporary directory if an error occurs
      LockUtil.removeLockFile(fs, lock);
      if (fs.exists(tempOutlinkDb)) {
        fs.delete(tempOutlinkDb, true);
      }
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

    // inlink and temp inlink database paths
    Path inlinkDb = new Path(webGraphDb, INLINK_DIR);
    Path tempInlinkDb = new Path(inlinkDb + "-"
        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf inlinkJob = new NutchJob(conf);
    inlinkJob.setJobName("Inlinkdb " + inlinkDb);
    LOG.info("InlinkDb: adding input: " + outlinkDb);
    FileInputFormat.addInputPath(inlinkJob, outlinkDb);
    inlinkJob.setInputFormat(SequenceFileInputFormat.class);
    inlinkJob.setMapperClass(InlinkDb.class);
    inlinkJob.setMapOutputKeyClass(Text.class);
    inlinkJob.setMapOutputValueClass(LinkDatum.class);
    inlinkJob.setOutputKeyClass(Text.class);
    inlinkJob.setOutputValueClass(LinkDatum.class);
    FileOutputFormat.setOutputPath(inlinkJob, tempInlinkDb);
    inlinkJob.setOutputFormat(MapFileOutputFormat.class);

    try {

      // run the inlink job and replace any old inlinkdb with the new one
      LOG.info("InlinkDb: running");
      JobClient.runJob(inlinkJob);
      LOG.info("InlinkDb: installing " + inlinkDb);
      FSUtils.replace(fs, inlinkDb, tempInlinkDb, true);
      LOG.info("InlinkDb: finished");
    } catch (IOException e) {

      // remove the lock file and the temporary directory if an error occurs
      LockUtil.removeLockFile(fs, lock);
      if (fs.exists(tempInlinkDb)) {
        fs.delete(tempInlinkDb, true);
      }
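
      // log the failure and rethrow it so the caller sees the original error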
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

    // node and temp node database paths
    Path nodeDb = new Path(webGraphDb, NODE_DIR);
    Path tempNodeDb = new Path(nodeDb + "-"
        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf nodeJob = new NutchJob(conf);
    nodeJob.setJobName("NodeDb " + nodeDb);
    LOG.info("NodeDb: adding input: " + outlinkDb);
    LOG.info("NodeDb: adding input: " + inlinkDb);
    FileInputFormat.addInputPath(nodeJob, outlinkDb);
    FileInputFormat.addInputPath(nodeJob, inlinkDb);
    nodeJob.setInputFormat(SequenceFileInputFormat.class);
    nodeJob.setReducerClass(NodeDb.class);
    nodeJob.setMapOutputKeyClass(Text.class);
    nodeJob.setMapOutputValueClass(LinkDatum.class);
    nodeJob.setOutputKeyClass(Text.class);
    nodeJob.setOutputValueClass(Node.class);
    FileOutputFormat.setOutputPath(nodeJob, tempNodeDb);
    nodeJob.setOutputFormat(MapFileOutputFormat.class);

    try {

      // run the node job and replace any old nodedb with the new one
      LOG.info("NodeDb: running");
      JobClient.runJob(nodeJob);
      LOG.info("NodeDb: installing " + nodeDb);
      FSUtils.replace(fs, nodeDb, tempNodeDb, true);
      LOG.info("NodeDb: finished");
    } catch (IOException e) {

      // remove the lock file and the temporary directory if an error occurs
      LockUtil.removeLockFile(fs, lock);
      if (fs.exists(tempNodeDb)) {
        fs.delete(tempNodeDb, true);
      }
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

    // remove the lock file for the webgraph
    LockUtil.removeLockFile(fs, lock);
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new WebGraph(),
        args);
    System.exit(res);
  }

  /**
   * Parses command line arguments and runs the WebGraph jobs.
   */
  public int run(String[] args) throws Exception {

    Options options = new Options();
    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
        "show this help message").create("help");
    Option webGraphDbOpts = OptionBuilder.withArgName("webgraphdb").hasArg()
        .withDescription("the web graph database to use").create("webgraphdb");
    Option segOpts = OptionBuilder.withArgName("segment").hasArgs()
        .withDescription("the segment(s) to use").create("segment");
    options.addOption(helpOpts);
    options.addOption(webGraphDbOpts);
    options.addOption(segOpts);

    CommandLineParser parser = new GnuParser();
    try {

      CommandLine line = parser.parse(options, args);
      if (line.hasOption("help") || !line.hasOption("webgraphdb")
          || !line.hasOption("segment")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("WebGraph", options);
        return -1;
      }

      String webGraphDb = line.getOptionValue("webgraphdb");
      String[] segments = line.getOptionValues("segment");
      Path[] segPaths = new Path[segments.length];
      for (int i = 0; i < segments.length; i++) {
        segPaths[i] = new Path(segments[i]);
      }

      createWebGraph(new Path(webGraphDb), segPaths);
      return 0;
    } catch (Exception e) {
      LOG.fatal("WebGraph: " + StringUtils.stringifyException(e));
      return -2;
    }
  }
}