/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.scoring.webgraph;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;

/**
 * Creates three databases, one for inlinks, one for outlinks, and a node
 * database that holds the number of in and outlinks to a url and the current
 * score for the url.
 *
 * The score is set by an analysis program such as LinkRank. The WebGraph is an
 * updatable database. Outlinks are stored by their fetch time or by the
 * current system time if no fetch time is available. Only the most recent
 * version of outlinks for a given url is stored. As more crawls are executed
 * and the WebGraph updated, newer Outlinks will replace older Outlinks. This
 * allows the WebGraph to adapt to changes in the link structure of the web.
 *
 * The Inlink database is created from the Outlink database and is regenerated
 * when the WebGraph is updated. The Node database is created from both the
 * Inlink and Outlink databases. Because the Node database is overwritten when
 * the WebGraph is updated, and because the Node database holds the current
 * scores for urls, it is recommended that a crawl cycle (one or more full
 * crawls) fully complete before the WebGraph is updated and some type of
 * analysis, such as LinkRank, is run to update scores in the Node database in
 * a stable fashion.
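 *
 * <p>
 * A typical command-line invocation, assuming a standard Nutch crawl layout
 * (the paths below are examples only; the options are those accepted by
 * {@link #run(String[])}):
 * </p>
 *
 * <pre>
 * bin/nutch org.apache.nutch.scoring.webgraph.WebGraph -webgraphdb crawl/webgraphdb \
 *     -segmentDir crawl/segments -normalize -filter
 * </pre>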
 */
public class WebGraph extends Configured implements Tool {

  public static final Logger LOG = LoggerFactory.getLogger(WebGraph.class);
  public static final String LOCK_NAME = ".locked";
  public static final String INLINK_DIR = "inlinks";
  public static final String OUTLINK_DIR = "outlinks/current";
  public static final String OLD_OUTLINK_DIR = "outlinks/old";
  public static final String NODE_DIR = "nodes";

  /**
   * The OutlinkDb creates a database of all outlinks. Outlinks to internal
   * urls by domain and host can be ignored. The number of Outlinks out to a
   * given page or domain can also be limited.
   */
  public static class OutlinkDb extends Configured implements
      Mapper<Text, Writable, Text, NutchWritable>,
      Reducer<Text, NutchWritable, Text, LinkDatum> {

    public static final String URL_NORMALIZING = "webgraph.url.normalizers";
    public static final String URL_FILTERING = "webgraph.url.filters";

    // ignoring internal domains, internal hosts
    private boolean ignoreDomain = true;
    private boolean ignoreHost = true;

    // limiting urls out to a page or to a domain
    private boolean limitPages = true;
    private boolean limitDomains = true;

    // using normalizers and/or filters
    private boolean normalize = false;
    private boolean filter = false;

    // url normalizers, filters and job configuration
    private URLNormalizers urlNormalizers;
    private URLFilters filters;
    private JobConf conf;

    /**
     * Normalizes and trims extra whitespace from the given url.
     *
     * @param url The url to normalize.
     *
     * @return The normalized url.
     */
    private String normalizeUrl(String url) {

      if (!normalize) {
        return url;
      }

      String normalized = null;
      if (urlNormalizers != null) {
        try {

          // normalize and trim the url
          normalized = urlNormalizers.normalize(url,
              URLNormalizers.SCOPE_DEFAULT);
          normalized = normalized.trim();
        } catch (Exception e) {
          LOG.warn("Skipping " + url + ":" + e);
          normalized = null;
        }
      }

      return normalized;
    }

    /**
     * Filters the given url.
     *
     * @param url The url to filter.
     *
     * @return The filtered url or null.
     */
    private String filterUrl(String url) {

      if (!filter) {
        return url;
      }

      try {
        url = filters.filter(url);
      } catch (Exception e) {
        url = null;
      }

      return url;
    }

    /**
     * Returns the fetch time from the parse data or the current system time
     * if the fetch time doesn't exist.
     *
     * @param data The parse data.
     *
     * @return The fetch time as a long.
     */
    private long getFetchTime(ParseData data) {

      // default to current system time
      long fetchTime = System.currentTimeMillis();
      String fetchTimeStr = data.getContentMeta().get(Nutch.FETCH_TIME_KEY);
      try {
        // get the fetch time from the parse data
        fetchTime = Long.parseLong(fetchTimeStr);
      } catch (Exception e) {
        fetchTime = System.currentTimeMillis();
      }
      return fetchTime;
    }

    /**
     * Default constructor.
     */
    public OutlinkDb() {
    }

    /**
     * Configurable constructor.
     */
    public OutlinkDb(Configuration conf) {
      setConf(conf);
    }

    /**
     * Configures the OutlinkDb job. Sets up internal links and link limiting.
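     *
     * <p>
     * Internal link handling and link limiting are driven by the properties
     * read below (link.ignore.internal.host, link.ignore.internal.domain,
     * link.ignore.limit.page, link.ignore.limit.domain, all true by default).
     * An illustrative nutch-site.xml override, values are examples only:
     * </p>
     *
     * <pre>
     * &lt;property&gt;
     *   &lt;name&gt;link.ignore.internal.host&lt;/name&gt;
     *   &lt;value&gt;false&lt;/value&gt;
     * &lt;/property&gt;
     * </pre>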
     */
    public void configure(JobConf conf) {
      this.conf = conf;
      ignoreHost = conf.getBoolean("link.ignore.internal.host", true);
      ignoreDomain = conf.getBoolean("link.ignore.internal.domain", true);
      limitPages = conf.getBoolean("link.ignore.limit.page", true);
      limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
      normalize = conf.getBoolean(URL_NORMALIZING, false);
      filter = conf.getBoolean(URL_FILTERING, false);
      if (normalize) {
        urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
      }
      if (filter) {
        filters = new URLFilters(conf);
      }
    }

    /**
     * Passes through existing LinkDatum objects from an existing OutlinkDb and
     * maps out new LinkDatum objects from a new crawl's ParseData.
     */
    public void map(Text key, Writable value,
        OutputCollector<Text, NutchWritable> output, Reporter reporter)
        throws IOException {

      // normalize url, stop processing if null
      String url = normalizeUrl(key.toString());
      if (url == null) {
        return;
      }

      // filter url
      if (filterUrl(url) == null) {
        return;
      }

      // Overwrite the key with the normalized URL
      key.set(url);

      if (value instanceof CrawlDatum) {
        CrawlDatum datum = (CrawlDatum) value;

        if (datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
            || datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM
            || datum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {

          // Tell the reducer to get rid of all instances of this key
          output.collect(key, new NutchWritable(new BooleanWritable(true)));
        }
      } else if (value instanceof ParseData) {
        // get the parse data and the outlinks from the parse data, along with
        // the fetch time for those links
        ParseData data = (ParseData) value;
        long fetchTime = getFetchTime(data);
        Outlink[] outlinkAr = data.getOutlinks();
        Map<String, String> outlinkMap = new LinkedHashMap<String, String>();

        // normalize urls and put into map
        if (outlinkAr != null && outlinkAr.length > 0) {
          for (int i = 0; i < outlinkAr.length; i++) {
            Outlink outlink = outlinkAr[i];
            String toUrl = normalizeUrl(outlink.getToUrl());

            if (filterUrl(toUrl) == null) {
              continue;
            }

            // only put into the map if the url doesn't already exist in the
            // map, or if it does exist but its anchor is null, in which case
            // the existing entry is replaced
            boolean existingUrl = outlinkMap.containsKey(toUrl);
            if (toUrl != null
                && (!existingUrl || (existingUrl && outlinkMap.get(toUrl) == null))) {
              outlinkMap.put(toUrl, outlink.getAnchor());
            }
          }
        }

        // collect the outlinks under the fetch time
        for (String outlinkUrl : outlinkMap.keySet()) {
          String anchor = outlinkMap.get(outlinkUrl);
          LinkDatum datum = new LinkDatum(outlinkUrl, anchor, fetchTime);
          output.collect(key, new NutchWritable(datum));
        }
      } else if (value instanceof LinkDatum) {
        LinkDatum datum = (LinkDatum) value;
        String linkDatumUrl = normalizeUrl(datum.getUrl());

        if (filterUrl(linkDatumUrl) != null) {
          datum.setUrl(linkDatumUrl);

          // collect existing outlinks from existing OutlinkDb
          output.collect(key, new NutchWritable(datum));
        }
      }
    }
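    /**
     * Aggregates the collected LinkDatum objects for a url, keeping only the
     * outlinks that carry the most recent fetch timestamp and that pass the
     * internal-link and limiting rules configured for this job. If a delete
     * flag was collected for the url, all of its outlinks are dropped.
     */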
    public void reduce(Text key, Iterator<NutchWritable> values,
        OutputCollector<Text, LinkDatum> output, Reporter reporter)
        throws IOException {

      // aggregate all outlinks, get the most recent timestamp for a fetch
      // which should be the timestamp for all of the most recent outlinks
      long mostRecent = 0L;
      List<LinkDatum> outlinkList = new ArrayList<LinkDatum>();
      while (values.hasNext()) {
        Writable value = values.next().get();

        if (value instanceof LinkDatum) {
          // loop through, change out most recent timestamp if needed
          LinkDatum next = (LinkDatum) value;
          long timestamp = next.getTimestamp();
          if (mostRecent == 0L || mostRecent < timestamp) {
            mostRecent = timestamp;
          }
          outlinkList.add((LinkDatum) WritableUtils.clone(next, conf));
          reporter.incrCounter("WebGraph.outlinks", "added links", 1);
        } else if (value instanceof BooleanWritable) {
          BooleanWritable delete = (BooleanWritable) value;
          // delete is always true, otherwise it is not emitted by the mapper
          // in the first place
          if (delete.get()) {
            // This page is gone, do not emit its outlinks
            reporter.incrCounter("WebGraph.outlinks", "removed links", 1);
            return;
          }
        }
      }

      // get the url, domain, and host for the url
      String url = key.toString();
      String domain = URLUtil.getDomainName(url);
      String host = URLUtil.getHost(url);

      // setup checking sets for domains and pages
      Set<String> domains = new HashSet<String>();
      Set<String> pages = new HashSet<String>();

      // loop through the link datums
      for (LinkDatum datum : outlinkList) {

        // get the url, host, domain, and page for each outlink
        String toUrl = datum.getUrl();
        String toDomain = URLUtil.getDomainName(toUrl);
        String toHost = URLUtil.getHost(toUrl);
        String toPage = URLUtil.getPage(toUrl);
        datum.setLinkType(LinkDatum.OUTLINK);

        // outlinks must be from the most recent fetch and conform to the
        // internal url and limiting rules; if they do, collect them
        if (datum.getTimestamp() == mostRecent
            && (!limitPages || (limitPages && !pages.contains(toPage)))
            && (!limitDomains || (limitDomains && !domains.contains(toDomain)))
            && (!ignoreHost || (ignoreHost && !toHost.equalsIgnoreCase(host)))
            && (!ignoreDomain || (ignoreDomain && !toDomain.equalsIgnoreCase(domain)))) {
          output.collect(key, datum);
          pages.add(toPage);
          domains.add(toDomain);
        }
      }
    }

    public void close() {
    }
  }

  /**
   * The InlinkDb creates a database of Inlinks. Inlinks are inverted from the
   * OutlinkDb LinkDatum objects and are regenerated each time the WebGraph is
   * updated.
   */
  private static class InlinkDb extends Configured implements
      Mapper<Text, LinkDatum, Text, LinkDatum> {

    private JobConf conf;
    private long timestamp;

    /**
     * Default constructor.
     */
    public InlinkDb() {
    }

    /**
     * Configurable constructor.
     */
    public InlinkDb(Configuration conf) {
      setConf(conf);
    }

    /**
     * Configures job. Sets timestamp for all Inlink LinkDatum objects to the
     * current system time.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
      timestamp = System.currentTimeMillis();
    }

    public void close() {
    }

    /**
     * Inverts the Outlink LinkDatum objects into new LinkDatum objects with a
     * new system timestamp and link type, and with the to and from urls
     * switched.
     */
    public void map(Text key, LinkDatum datum,
        OutputCollector<Text, LinkDatum> output, Reporter reporter)
        throws IOException {

      // get the to and from url and the anchor
      String fromUrl = key.toString();
      String toUrl = datum.getUrl();
      String anchor = datum.getAnchor();

      // flip the from and to url and set the new link type
      LinkDatum inlink = new LinkDatum(fromUrl, anchor, timestamp);
      inlink.setLinkType(LinkDatum.INLINK);
      output.collect(new Text(toUrl), inlink);
    }
  }

  /**
   * Creates the Node database which consists of the number of in and outlinks
   * for each url and a score slot for analysis programs such as LinkRank.
   */
  private static class NodeDb extends Configured implements
      Reducer<Text, LinkDatum, Text, Node> {

    private JobConf conf;

    /**
     * Default constructor.
     */
    public NodeDb() {
    }

    /**
     * Configurable constructor.
     */
    public NodeDb(Configuration conf) {
      setConf(conf);
    }

    /**
     * Configures job.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
    }

    public void close() {
    }

    /**
     * Counts the number of inlinks and outlinks for each url and sets a
     * default score of 0.0 for each url (node) in the webgraph.
     */
    public void reduce(Text key, Iterator<LinkDatum> values,
        OutputCollector<Text, Node> output, Reporter reporter)
        throws IOException {

      Node node = new Node();
      int numInlinks = 0;
      int numOutlinks = 0;

      // loop through counting number of in and out links
      while (values.hasNext()) {
        LinkDatum next = values.next();
        if (next.getLinkType() == LinkDatum.INLINK) {
          numInlinks++;
        } else if (next.getLinkType() == LinkDatum.OUTLINK) {
          numOutlinks++;
        }
      }

      // set the in and outlinks and a default score of 0
      node.setNumInlinks(numInlinks);
      node.setNumOutlinks(numOutlinks);
      node.setInlinkScore(0.0f);
      output.collect(key, node);
    }
  }

  /**
   * Creates the three different WebGraph databases, Outlinks, Inlinks, and
   * Node. If a current WebGraph exists then it is updated, if it doesn't exist
   * then a new WebGraph database is created.
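   *
   * <p>
   * A minimal programmatic sketch (the webgraphdb and segment paths below are
   * examples only):
   * </p>
   *
   * <pre>
   * WebGraph webGraph = new WebGraph();
   * webGraph.setConf(NutchConfiguration.create());
   * webGraph.createWebGraph(new Path("crawl/webgraphdb"),
   *     new Path[] { new Path("crawl/segments/20240101000000") }, true, true);
   * </pre>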
   *
   * @param webGraphDb The WebGraph to create or update.
   * @param segments The array of segments used to update the WebGraph. Newer
   *          segments and fetch times will overwrite older segments.
   * @param normalize whether to use URLNormalizers on URLs in the segment
   * @param filter whether to use URLFilters on URLs in the segment
   *
   * @throws IOException If an error occurs while processing the WebGraph.
   */
  public void createWebGraph(Path webGraphDb, Path[] segments,
      boolean normalize, boolean filter) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("WebGraphDb: starting at " + sdf.format(start));
      LOG.info("WebGraphDb: webgraphdb: " + webGraphDb);
      LOG.info("WebGraphDb: URL normalize: " + normalize);
      LOG.info("WebGraphDb: URL filter: " + filter);
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // lock an existing webgraphdb to prevent multiple simultaneous updates
    Path lock = new Path(webGraphDb, LOCK_NAME);
    if (!fs.exists(webGraphDb)) {
      fs.mkdirs(webGraphDb);
    }
    LockUtil.createLockFile(fs, lock, false);

    // outlink and temp outlink database paths
    Path outlinkDb = new Path(webGraphDb, OUTLINK_DIR);
    Path oldOutlinkDb = new Path(webGraphDb, OLD_OUTLINK_DIR);

    if (!fs.exists(outlinkDb)) {
      fs.mkdirs(outlinkDb);
    }

    Path tempOutlinkDb = new Path(outlinkDb + "-"
        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf outlinkJob = new NutchJob(conf);
    outlinkJob.setJobName("Outlinkdb: " + outlinkDb);

    boolean deleteGone = conf.getBoolean("link.delete.gone", false);
    boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);

    if (deleteGone) {
      LOG.info("OutlinkDb: deleting gone links");
    }

    // get the parse data and crawl fetch data for all segments
    if (segments != null) {
      for (int i = 0; i < segments.length; i++) {
        Path parseData = new Path(segments[i], ParseData.DIR_NAME);
        if (fs.exists(parseData)) {
          LOG.info("OutlinkDb: adding input: " + parseData);
          FileInputFormat.addInputPath(outlinkJob, parseData);
        }

        if (deleteGone) {
          Path crawlFetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
          if (fs.exists(crawlFetch)) {
            LOG.info("OutlinkDb: adding input: " + crawlFetch);
            FileInputFormat.addInputPath(outlinkJob, crawlFetch);
          }
        }
      }
    }

    // add the existing webgraph
    LOG.info("OutlinkDb: adding input: " + outlinkDb);
    FileInputFormat.addInputPath(outlinkJob, outlinkDb);

    outlinkJob.setBoolean(OutlinkDb.URL_NORMALIZING, normalize);
    outlinkJob.setBoolean(OutlinkDb.URL_FILTERING, filter);

    outlinkJob.setInputFormat(SequenceFileInputFormat.class);
    outlinkJob.setMapperClass(OutlinkDb.class);
    outlinkJob.setReducerClass(OutlinkDb.class);
    outlinkJob.setMapOutputKeyClass(Text.class);
    outlinkJob.setMapOutputValueClass(NutchWritable.class);
    outlinkJob.setOutputKeyClass(Text.class);
    outlinkJob.setOutputValueClass(LinkDatum.class);
    FileOutputFormat.setOutputPath(outlinkJob, tempOutlinkDb);
    outlinkJob.setOutputFormat(MapFileOutputFormat.class);
    outlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
        false);

    // run the outlinkdb job and replace any old outlinkdb with the new one
    try {
      LOG.info("OutlinkDb: running");
      JobClient.runJob(outlinkJob);
      LOG.info("OutlinkDb: installing " + outlinkDb);
      FSUtils.replace(fs, oldOutlinkDb, outlinkDb, true);
      FSUtils.replace(fs, outlinkDb, tempOutlinkDb, true);
      if (!preserveBackup && fs.exists(oldOutlinkDb)) {
        fs.delete(oldOutlinkDb, true);
      }
      LOG.info("OutlinkDb: finished");
    } catch (IOException e) {

      // remove lock file and temporary directory if an error occurs
      LockUtil.removeLockFile(fs, lock);
      if (fs.exists(tempOutlinkDb)) {
        fs.delete(tempOutlinkDb, true);
      }
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

    // inlink and temp link database paths
    Path inlinkDb = new Path(webGraphDb, INLINK_DIR);
    Path tempInlinkDb = new Path(inlinkDb + "-"
        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf inlinkJob = new NutchJob(conf);
    inlinkJob.setJobName("Inlinkdb " + inlinkDb);
    LOG.info("InlinkDb: adding input: " + outlinkDb);
    FileInputFormat.addInputPath(inlinkJob, outlinkDb);
    inlinkJob.setInputFormat(SequenceFileInputFormat.class);
    inlinkJob.setMapperClass(InlinkDb.class);
    inlinkJob.setMapOutputKeyClass(Text.class);
    inlinkJob.setMapOutputValueClass(LinkDatum.class);
    inlinkJob.setOutputKeyClass(Text.class);
    inlinkJob.setOutputValueClass(LinkDatum.class);
    FileOutputFormat.setOutputPath(inlinkJob, tempInlinkDb);
    inlinkJob.setOutputFormat(MapFileOutputFormat.class);
    inlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
        false);

    try {

      // run the inlink job and replace any old inlinkdb with the new one
      LOG.info("InlinkDb: running");
      JobClient.runJob(inlinkJob);
      LOG.info("InlinkDb: installing " + inlinkDb);
      FSUtils.replace(fs, inlinkDb, tempInlinkDb, true);
      LOG.info("InlinkDb: finished");
    } catch (IOException e) {

      // remove lock file and temporary directory if an error occurs
      LockUtil.removeLockFile(fs, lock);
      if (fs.exists(tempInlinkDb)) {
        fs.delete(tempInlinkDb, true);
      }
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

    // node and temp node database paths
    Path nodeDb = new Path(webGraphDb, NODE_DIR);
    Path tempNodeDb = new Path(nodeDb + "-"
        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf nodeJob = new NutchJob(conf);
    nodeJob.setJobName("NodeDb " + nodeDb);
    LOG.info("NodeDb: adding input: " + outlinkDb);
    LOG.info("NodeDb: adding input: " + inlinkDb);
    FileInputFormat.addInputPath(nodeJob, outlinkDb);
    FileInputFormat.addInputPath(nodeJob, inlinkDb);
    nodeJob.setInputFormat(SequenceFileInputFormat.class);
    nodeJob.setReducerClass(NodeDb.class);
    nodeJob.setMapOutputKeyClass(Text.class);
    nodeJob.setMapOutputValueClass(LinkDatum.class);
    nodeJob.setOutputKeyClass(Text.class);
    nodeJob.setOutputValueClass(Node.class);
    FileOutputFormat.setOutputPath(nodeJob, tempNodeDb);
    nodeJob.setOutputFormat(MapFileOutputFormat.class);
    nodeJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
        false);

    try {

      // run the node job and replace the old nodedb with the new one
      LOG.info("NodeDb: running");
      JobClient.runJob(nodeJob);
      LOG.info("NodeDb: installing " + nodeDb);
      FSUtils.replace(fs, nodeDb, tempNodeDb, true);
      LOG.info("NodeDb: finished");
    } catch (IOException e) {

      // remove lock file and temporary directory if an error occurs
      LockUtil.removeLockFile(fs, lock);
      if (fs.exists(tempNodeDb)) {
        fs.delete(tempNodeDb, true);
      }
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

    // remove the lock file for the webgraph
    LockUtil.removeLockFile(fs, lock);

    long end = System.currentTimeMillis();
    LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));
  }
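  /**
   * Command-line entry point. Runs the WebGraph tool through the Hadoop
   * {@link ToolRunner}; see {@link #run(String[])} for the supported options.
   */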
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new WebGraph(), args);
    System.exit(res);
  }

  /**
   * Parses command line arguments and runs the WebGraph jobs.
   */
  public int run(String[] args) throws Exception {

    Options options = new Options();
    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
        "show this help message").create("help");
    Option webGraphDbOpts = OptionBuilder.withArgName("webgraphdb").hasArg()
        .withDescription("the web graph database to use").create("webgraphdb");
    Option segOpts = OptionBuilder.withArgName("segment").hasArgs()
        .withDescription("the segment(s) to use").create("segment");
    Option segDirOpts = OptionBuilder.withArgName("segmentDir").hasArgs()
        .withDescription("the segment directory to use").create("segmentDir");
    Option normalizeOpts = OptionBuilder.withArgName("normalize")
        .withDescription("whether to use URLNormalizers on the URLs in the segment")
        .create("normalize");
    Option filterOpts = OptionBuilder.withArgName("filter").withDescription(
        "whether to use URLFilters on the URLs in the segment").create("filter");
    options.addOption(helpOpts);
    options.addOption(webGraphDbOpts);
    options.addOption(segOpts);
    options.addOption(segDirOpts);
    options.addOption(normalizeOpts);
    options.addOption(filterOpts);

    CommandLineParser parser = new GnuParser();
    try {

      CommandLine line = parser.parse(options, args);
      if (line.hasOption("help") || !line.hasOption("webgraphdb")
          || (!line.hasOption("segment") && !line.hasOption("segmentDir"))) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("WebGraph", options);
        return -1;
      }

      String webGraphDb = line.getOptionValue("webgraphdb");

      Path[] segPaths = null;

      // Handle segment option
      if (line.hasOption("segment")) {
        String[] segments = line.getOptionValues("segment");
        segPaths = new Path[segments.length];
        for (int i = 0; i < segments.length; i++) {
          segPaths[i] = new Path(segments[i]);
        }
      }

      // Handle segmentDir option
      if (line.hasOption("segmentDir")) {
        Path dir = new Path(line.getOptionValue("segmentDir"));
        FileSystem fs = dir.getFileSystem(getConf());
        FileStatus[] fstats = fs.listStatus(dir,
            HadoopFSUtil.getPassDirectoriesFilter(fs));
        segPaths = HadoopFSUtil.getPaths(fstats);
      }

      boolean normalize = false;
      if (line.hasOption("normalize")) {
        normalize = true;
      }

      boolean filter = false;
      if (line.hasOption("filter")) {
        filter = true;
      }

      createWebGraph(new Path(webGraphDb), segPaths, normalize, filter);
      return 0;
    } catch (Exception e) {
      LOG.error("WebGraph: " + StringUtils.stringifyException(e));
      return -2;
    }
  }
}