/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.tools.compat;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.URLUtil;

/**
 * <p>
 * The representative URL logic used for redirects changed significantly.
 * This tool fixes the representative URLs stored in existing segments and
 * crawl databases; any new fetches will use the new logic automatically.
 * </p>
 *
 * <p>
 * All crawl datums are assumed to be temporary URL redirects. While this may
 * cause some URLs to be removed incorrectly, the tool is a stopgap to be used
 * until the fetches can be rerun. The same reduce logic is applied to a
 * segment's crawl_fetch and crawl_parse directories as well as to existing
 * crawl databases.
 * </p>
 */
public class ReprUrlFixer
  extends Configured
  implements Tool, Reducer<Text, CrawlDatum, Text, CrawlDatum> {

  public static final Log LOG = LogFactory.getLog(ReprUrlFixer.class);
  private JobConf conf;

  public void configure(JobConf conf) {
    this.conf = conf;
  }

  /**
   * Runs the new ReprUrl logic on all CrawlDatum entries.
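   *
   * <p>
   * All CrawlDatum values for a URL key are first cloned. A datum is then
   * examined only if it carries a representative URL in its metadata and
   * either has a CrawlDb status or was successfully fetched. For those
   * datums {@code URLUtil.chooseRepr} is re-applied with the temp-redirect
   * flag set; if the new logic would now choose the URL itself, the stored
   * representative URL is obsolete and is dropped from the metadata. Every
   * datum, changed or not, is collected back out.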
   */
  public void reduce(Text key, Iterator<CrawlDatum> values,
    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
    throws IOException {

    String url = key.toString();
    List<CrawlDatum> datums = new ArrayList<CrawlDatum>();

    // get all crawl datums for a given url key; fetch, for instance, can have
    // more than one under a given key if there are multiple redirects to a
    // given url
    while (values.hasNext()) {
      CrawlDatum datum = values.next();
      datums.add((CrawlDatum)WritableUtils.clone(datum, conf));
    }

    // apply redirect repr url logic to each datum
    for (CrawlDatum datum : datums) {

      MapWritable metadata = datum.getMetaData();
      Text reprUrl = (Text)metadata.get(Nutch.WRITABLE_REPR_URL_KEY);
      byte status = datum.getStatus();
      boolean isCrawlDb = CrawlDatum.hasDbStatus(datum);
      boolean segFetched = (status == CrawlDatum.STATUS_FETCH_SUCCESS);

      // only if the crawl datum is from the crawldb or is a successfully
      // fetched page from the segments
      if ((isCrawlDb || segFetched) && reprUrl != null) {

        String src = reprUrl.toString();
        String dest = url;
        URL srcUrl = null;
        URL dstUrl = null;

        // both need to be well formed urls
        try {
          srcUrl = new URL(src);
          dstUrl = new URL(url);
        }
        catch (MalformedURLException e) {
          // ignore: a malformed url simply leaves the metadata untouched
        }

        // if the src and repr urls are the same after the new logic then
        // remove the repr url from the metadata as it is no longer needed
        if (srcUrl != null && dstUrl != null) {
          String reprOut = URLUtil.chooseRepr(src, dest, true);
          if (reprOut.equals(dest)) {
            LOG.info("Removing " + reprOut + " from " + dest);
            metadata.remove(Nutch.WRITABLE_REPR_URL_KEY);
          }
        }
      }

      // collect each datum
      output.collect(key, datum);
    }
  }

  public void close() {
  }

  /**
   * Runs the fixer on the crawl database and any segments specified.
   */
  public void update(Path crawlDb, Path[] segments)
    throws IOException {

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // run the crawl database through the repr fixer
    if (crawlDb != null) {

      LOG.info("Running ReprUrlFixer " + crawlDb);
      Path crawlDbCurrent = new Path(crawlDb, CrawlDb.CURRENT_NAME);
      Path newCrawlDb = new Path(crawlDb,
        Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

      JobConf updater = new NutchJob(conf);
      updater.setJobName("ReprUrlFixer: " + crawlDb.toString());
      FileInputFormat.addInputPath(updater, crawlDbCurrent);
      FileOutputFormat.setOutputPath(updater, newCrawlDb);
      updater.setInputFormat(SequenceFileInputFormat.class);
      updater.setReducerClass(ReprUrlFixer.class);
      updater.setOutputKeyClass(Text.class);
      updater.setOutputValueClass(CrawlDatum.class);
      updater.setOutputFormat(MapFileOutputFormat.class);

      try {
        JobClient.runJob(updater);
        LOG.info("Installing new crawldb " + crawlDb);
        CrawlDb.install(updater, crawlDb);
      }
      catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
      }
    }

    // run the segments through the repr fixer; the logic is run on both the
    // crawl_fetch and the crawl_parse directories of every segment specified
    if (segments != null) {

      for (int i = 0; i < segments.length; i++) {

        Path segment = segments[i];
        LOG.info("Running ReprUrlFixer " + segment + " fetch");
        Path segFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
        Path newSegFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME + "-"
          + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        JobConf fetch = new NutchJob(conf);
        fetch.setJobName("ReprUrlFixer: " + segment.toString());
        FileInputFormat.addInputPath(fetch, segFetch);
        FileOutputFormat.setOutputPath(fetch, newSegFetch);
        fetch.setInputFormat(SequenceFileInputFormat.class);
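
        // no map class is set on these jobs, so Hadoop's default
        // IdentityMapper forwards each <Text, CrawlDatum> record unchanged
        // into ReprUrlFixer.reduce(); MapFileOutputFormat then writes the
        // same MapFile layout the original segment directories use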
        fetch.setReducerClass(ReprUrlFixer.class);
        fetch.setOutputKeyClass(Text.class);
        fetch.setOutputValueClass(CrawlDatum.class);
        fetch.setOutputFormat(MapFileOutputFormat.class);

        try {
          JobClient.runJob(fetch);
          LOG.info("Installing new segment fetch directory " + newSegFetch);
          FSUtils.replace(fs, segFetch, newSegFetch, true);
          LOG.info("ReprUrlFixer: finished installing segment fetch directory");
        }
        catch (IOException e) {
          LOG.error(StringUtils.stringifyException(e));
          throw e;
        }

        LOG.info("Running ReprUrlFixer " + segment + " parse");
        Path segParse = new Path(segment, CrawlDatum.PARSE_DIR_NAME);
        Path newSegParse = new Path(segment, CrawlDatum.PARSE_DIR_NAME + "-"
          + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        JobConf parse = new NutchJob(conf);
        parse.setJobName("ReprUrlFixer: " + segment.toString());
        FileInputFormat.addInputPath(parse, segParse);
        FileOutputFormat.setOutputPath(parse, newSegParse);
        parse.setInputFormat(SequenceFileInputFormat.class);
        parse.setReducerClass(ReprUrlFixer.class);
        parse.setOutputKeyClass(Text.class);
        parse.setOutputValueClass(CrawlDatum.class);
        parse.setOutputFormat(MapFileOutputFormat.class);

        try {
          JobClient.runJob(parse);
          LOG.info("Installing new segment parse directory " + newSegParse);
          FSUtils.replace(fs, segParse, newSegParse, true);
          LOG.info("ReprUrlFixer: finished installing segment parse directory");
        }
        catch (IOException e) {
          LOG.error(StringUtils.stringifyException(e));
          throw e;
        }
      }
    }
  }

  /**
   * Runs the ReprUrlFixer tool.
   */
  public static void main(String[] args)
    throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new ReprUrlFixer(),
      args);
    System.exit(res);
  }

  /**
   * Parses command line options and executes the main update logic.
   */
  public int run(String[] args)
    throws Exception {

    Options options = new Options();
    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
      "show this help message").create("help");
    Option crawlDbOpts = OptionBuilder.withArgName("crawldb").hasArg()
      .withDescription("the crawldb to use").create("crawldb");
    Option segOpts = OptionBuilder.withArgName("segment").hasArgs()
      .withDescription("the segment(s) to use").create("segment");
    options.addOption(helpOpts);
    options.addOption(crawlDbOpts);
    options.addOption(segOpts);

    CommandLineParser parser = new GnuParser();
    try {

      // parse the command line arguments and make sure that either a crawldb
      // or a segment is specified
      CommandLine line = parser.parse(options, args);
      if (line.hasOption("help")
        || (!line.hasOption("crawldb") && !line.hasOption("segment"))) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("ReprUrlFixer", options);
        return -1;
      }

      // create paths for all of the segments specified; multiple segments may
      // be run at once
      String crawlDb = line.getOptionValue("crawldb");
      String[] segments = line.getOptionValues("segment");
      Path[] segPaths = new Path[segments != null ? segments.length : 0];
      if (segments != null) {
        for (int i = 0; i < segments.length; i++) {
          segPaths[i] = new Path(segments[i]);
        }
      }

      // guard against a null crawldb when only segments are specified;
      // constructing a Path from a null string would throw
      update(crawlDb != null ? new Path(crawlDb) : null, segPaths);
      return 0;
    }
    catch (Exception e) {
      LOG.fatal("ReprUrlFixer: " + StringUtils.stringifyException(e));
      return -1;
    }
  }
}
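
/*
 * Example invocation (a sketch: it assumes the standard bin/nutch launcher,
 * which can run an arbitrary class name, and the crawldb/segment paths below
 * are hypothetical placeholders for your own crawl layout):
 *
 *   bin/nutch org.apache.nutch.tools.compat.ReprUrlFixer \
 *     -crawldb crawl/crawldb \
 *     -segment crawl/segments/20090214120000 crawl/segments/20090215120000
 *
 * The -segment option accepts multiple values, so several segments can be
 * fixed in one run; omitting -crawldb fixes only the listed segments, and
 * omitting -segment fixes only the crawldb.
 */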