/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.fetcher;

import java.io.IOException;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.Map.Entry;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.*;

/** The fetcher. Most of the work is done by plugins. */
public class OldFetcher extends Configured
    implements Tool, MapRunnable<WritableComparable, Writable, Text, NutchWritable> {

  public static final Log LOG = LogFactory.getLog(OldFetcher.class);

  public static final int PERM_REFRESH_TIME = 5;

  public static final String CONTENT_REDIR = "content";
  public static final String PROTOCOL_REDIR = "protocol";

  public static class InputFormat
      extends SequenceFileInputFormat<WritableComparable, Writable> {
    /** Don't split inputs, to keep things polite. */
    public InputSplit[] getSplits(JobConf job, int nSplits)
      throws IOException {
      FileStatus[] files = listStatus(job);
      FileSystem fs = FileSystem.get(job);
      InputSplit[] splits = new InputSplit[files.length];
      for (int i = 0; i < files.length; i++) {
        FileStatus cur = files[i];
        splits[i] = new FileSplit(cur.getPath(), 0, cur.getLen(),
            (String[])null);
      }
      return splits;
    }
  }
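  // Per-task state. The record reader and output collector are supplied by
  // the MapReduce framework via run() and are shared by all FetcherThreads;
  // the progress counters below are guarded by synchronizing on this
  // OldFetcher instance (see updateStatus() and logError()).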
  private RecordReader<WritableComparable, Writable> input;
  private OutputCollector<Text, NutchWritable> output;
  private Reporter reporter;

  private String segmentName;
  private int activeThreads;
  private int maxRedirect;

  private long start = System.currentTimeMillis(); // start time of fetcher run
  private long lastRequestStart = start;

  private long bytes;                    // total bytes fetched
  private int pages;                     // total pages fetched
  private int errors;                    // total pages errored

  private boolean storingContent;
  private boolean parsing;

  private class FetcherThread extends Thread {
    private Configuration conf;
    private URLFilters urlFilters;
    private ScoringFilters scfilters;
    private ParseUtil parseUtil;
    private URLNormalizers normalizers;
    private ProtocolFactory protocolFactory;

    private boolean redirecting;
    private int redirectCount;
    private String reprUrl;

    public FetcherThread(Configuration conf) {
      this.setDaemon(true);                       // don't hang JVM on exit
      this.setName("FetcherThread");              // use an informative name
      this.conf = conf;
      this.urlFilters = new URLFilters(conf);
      this.scfilters = new ScoringFilters(conf);
      this.parseUtil = new ParseUtil(conf);
      this.protocolFactory = new ProtocolFactory(conf);
      this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
    }
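    // Main fetch loop: pull the next <url, CrawlDatum> entry from the shared
    // record reader, fetch it through the matching protocol plugin, and
    // follow redirects in-place up to maxRedirect times before the result is
    // written out via output().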
    public void run() {
      synchronized (OldFetcher.this) {activeThreads++;} // count threads

      try {
        Text key = new Text();
        CrawlDatum datum = new CrawlDatum();

        while (true) {
          // TODO : NUTCH-258 ...
          // If something bad happened, then exit
          // if (conf.getBoolean("fetcher.exit", false)) {
          //   break;
          // }

          try {                          // get next entry from input
            if (!input.next(key, datum)) {
              break;                     // at eof, exit
            }
          } catch (IOException e) {
            if (LOG.isFatalEnabled()) {
              e.printStackTrace(LogUtil.getFatalStream(LOG));
              LOG.fatal("fetcher caught:"+e.toString());
            }
            break;
          }

          synchronized (OldFetcher.this) {
            lastRequestStart = System.currentTimeMillis();
          }

          // url may be changed through redirects.
          Text url = new Text(key);

          Text reprUrlWritable =
            (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
          if (reprUrlWritable == null) {
            reprUrl = key.toString();
          } else {
            reprUrl = reprUrlWritable.toString();
          }

          try {
            if (LOG.isInfoEnabled()) { LOG.info("fetching " + url); }

            // fetch the page
            redirectCount = 0;
            do {
              if (LOG.isDebugEnabled()) {
                LOG.debug("redirectCount=" + redirectCount);
              }
              redirecting = false;
              Protocol protocol = this.protocolFactory.getProtocol(url.toString());
              ProtocolOutput output = protocol.getProtocolOutput(url, datum);
              ProtocolStatus status = output.getStatus();
              Content content = output.getContent();
              ParseStatus pstatus = null;

              String urlString = url.toString();
              if (reprUrl != null && !reprUrl.equals(urlString)) {
                datum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                    new Text(reprUrl));
              }

              switch(status.getCode()) {

              case ProtocolStatus.SUCCESS:        // got a page
                pstatus = output(url, datum, content, status,
                    CrawlDatum.STATUS_FETCH_SUCCESS);
                updateStatus(content.getContent().length);
                if (pstatus != null && pstatus.isSuccess() &&
                    pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                  String newUrl = pstatus.getMessage();
                  int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                  url = handleRedirect(url, datum, urlString, newUrl,
                                       refreshTime < PERM_REFRESH_TIME,
                                       CONTENT_REDIR);
                }
                break;

              case ProtocolStatus.MOVED:          // redirect
              case ProtocolStatus.TEMP_MOVED:
                int code;
                boolean temp;
                if (status.getCode() == ProtocolStatus.MOVED) {
                  code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                  temp = false;
                } else {
                  code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                  temp = true;
                }
                output(url, datum, content, status, code);
                String newUrl = status.getMessage();
                url = handleRedirect(url, datum, urlString, newUrl,
                                     temp, PROTOCOL_REDIR);
                break;

              // failures - increase the retry counter
              case ProtocolStatus.EXCEPTION:
                logError(url, status.getMessage());
                /* FALLTHROUGH */
              case ProtocolStatus.RETRY:          // retry
              case ProtocolStatus.WOULDBLOCK:
              case ProtocolStatus.BLOCKED:
                output(url, datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                break;

              // permanent failures
              case ProtocolStatus.GONE:           // gone
              case ProtocolStatus.NOTFOUND:
              case ProtocolStatus.ACCESS_DENIED:
              case ProtocolStatus.ROBOTS_DENIED:
                output(url, datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
                break;

              case ProtocolStatus.NOTMODIFIED:
                output(url, datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
                break;

              default:
                if (LOG.isWarnEnabled()) {
                  LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                }
                output(url, datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
              }

              if (redirecting && redirectCount >= maxRedirect) {
                if (LOG.isInfoEnabled()) {
                  LOG.info(" - redirect count exceeded " + url);
                }
                output(url, datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
              }

            } while (redirecting && (redirectCount < maxRedirect));

          } catch (Throwable t) {                 // unexpected exception
            logError(url, t.toString());
            output(url, datum, null, null, CrawlDatum.STATUS_FETCH_RETRY);
          }
        }

      } catch (Throwable e) {
        if (LOG.isFatalEnabled()) {
          e.printStackTrace(LogUtil.getFatalStream(LOG));
          LOG.fatal("fetcher caught:"+e.toString());
        }
      } finally {
        synchronized (OldFetcher.this) {activeThreads--;} // count threads
      }
    }
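    // Redirect handling: with http.redirect.max > 0 the target URL is
    // returned and refetched immediately in the same loop iteration; with 0
    // it is only emitted as a STATUS_LINKED entry, leaving the fetch to a
    // later generate/fetch cycle.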
    private Text handleRedirect(Text url, CrawlDatum datum,
                                String urlString, String newUrl,
                                boolean temp, String redirType)
      throws MalformedURLException, URLFilterException {
      newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
      newUrl = urlFilters.filter(newUrl);
      if (newUrl != null && !newUrl.equals(urlString)) {
        reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
        url = new Text(newUrl);
        if (maxRedirect > 0) {
          redirecting = true;
          redirectCount++;
          if (LOG.isDebugEnabled()) {
            LOG.debug(" - " + redirType + " redirect to " +
                      url + " (fetching now)");
          }
          return url;
        } else {
          CrawlDatum newDatum = new CrawlDatum();
          if (reprUrl != null) {
            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                new Text(reprUrl));
          }
          output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
          if (LOG.isDebugEnabled()) {
            LOG.debug(" - " + redirType + " redirect to " +
                      url + " (fetching later)");
          }
          return null;
        }
      } else {
        if (LOG.isDebugEnabled()) {
          LOG.debug(" - " + redirType + " redirect skipped: " +
                    (newUrl != null ? "to same url" : "filtered"));
        }
        return null;
      }
    }

    private void logError(Text url, String message) {
      if (LOG.isInfoEnabled()) {
        LOG.info("fetch of " + url + " failed with: " + message);
      }
      synchronized (OldFetcher.this) {              // record failure
        errors++;
      }
    }
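    // Write the fetch result: always the CrawlDatum, plus the raw Content
    // when storing is enabled, and one ParseImpl per parsed (sub-)document
    // when parsing in the fetcher. Returns the ParseStatus of the original
    // URL so the caller can follow meta redirects.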
    private ParseStatus output(Text key, CrawlDatum datum,
                               Content content, ProtocolStatus pstatus,
                               int status) {

      datum.setStatus(status);
      datum.setFetchTime(System.currentTimeMillis());
      if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

      ParseResult parseResult = null;
      if (content != null) {
        Metadata metadata = content.getMetadata();
        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
          scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {
            e.printStackTrace(LogUtil.getWarnStream(LOG));
            LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
          }
        }
        /* Note: Fetcher will only follow meta-redirects coming from the
         * original URL. */
        if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
          try {
            parseResult = this.parseUtil.parse(content);
          } catch (Exception e) {
            LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
          }

          if (parseResult == null) {
            byte[] signature =
              SignatureFactory.getSignature(getConf()).calculate(content,
                  new ParseStatus().getEmptyParse(conf));
            datum.setSignature(signature);
          }
        }

        /* Store status code in content metadata, so we can read this value
         * during parsing (as a separate job) and decide whether to parse or
         * not. */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
      }

      try {
        output.collect(key, new NutchWritable(datum));
        if (content != null && storingContent)
          output.collect(key, new NutchWritable(content));
        if (parseResult != null) {
          for (Entry<Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();

            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }

            // Calculate page signature. For non-parsing fetchers this will
            // be done in ParseSegment
            byte[] signature =
              SignatureFactory.getSignature(getConf()).calculate(content, parse);
            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
                StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
                Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            } catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                e.printStackTrace(LogUtil.getWarnStream(LOG));
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(
                    new ParseImpl(new ParseText(parse.getText()),
                                  parse.getData(), parse.isCanonical())));
          }
        }
      } catch (IOException e) {
        if (LOG.isFatalEnabled()) {
          e.printStackTrace(LogUtil.getFatalStream(LOG));
          LOG.fatal("fetcher caught:"+e.toString());
        }
      }

      // return parse status if it exists
      if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
          return p.getData().getStatus();
        }
      }
      return null;
    }

  }

  private synchronized void updateStatus(int bytesInPage) throws IOException {
    pages++;
    bytes += bytesInPage;
  }

  private void reportStatus() throws IOException {
    String status;
    synchronized (this) {
      long elapsed = (System.currentTimeMillis() - start)/1000;
      status =
        pages+" pages, "+errors+" errors, "
        + Math.round(((float)pages*10)/elapsed)/10.0+" pages/s, "
        + Math.round(((((float)bytes)*8)/1024)/elapsed)+" kb/s, ";
    }
    reporter.setStatus(status);
  }

  public OldFetcher() {

  }

  public OldFetcher(Configuration conf) {
    setConf(conf);
  }

  public void configure(JobConf job) {
    setConf(job);

    this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
    this.storingContent = isStoringContent(job);
    this.parsing = isParsing(job);

//    if (job.getBoolean("fetcher.verbose", false)) {
//      LOG.setLevel(Level.FINE);
//    }
  }

  public void close() {}

  public static boolean isParsing(Configuration conf) {
    return conf.getBoolean("fetcher.parse", true);
  }

  public static boolean isStoringContent(Configuration conf) {
    return conf.getBoolean("fetcher.store.content", true);
  }

  public void run(RecordReader<WritableComparable, Writable> input,
                  OutputCollector<Text, NutchWritable> output,
                  Reporter reporter) throws IOException {

    this.input = input;
    this.output = output;
    this.reporter = reporter;

    this.maxRedirect = getConf().getInt("http.redirect.max", 3);

    int threadCount = getConf().getInt("fetcher.threads.fetch", 10);
    if (LOG.isInfoEnabled()) { LOG.info("OldFetcher: threads: " + threadCount); }

    for (int i = 0; i < threadCount; i++) {       // spawn threads
      new FetcherThread(getConf()).start();
    }

    // select a timeout that avoids a task timeout
    long timeout = getConf().getInt("mapred.task.timeout", 10*60*1000)/2;

    do {                                          // wait for threads to exit
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {}

      reportStatus();

      // some requests seem to hang, despite all intentions
      synchronized (this) {
        if ((System.currentTimeMillis() - lastRequestStart) > timeout) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Aborting with "+activeThreads+" hung threads.");
          }
          return;
        }
      }
    } while (activeThreads > 0);

  }
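  // Example invocation via the stock bin/nutch launcher, which accepts a
  // fully qualified class name (the segment path below is illustrative; any
  // segment produced by Generator works):
  //
  //   bin/nutch org.apache.nutch.fetcher.OldFetcher \
  //       crawl/segments/20090401000000 -threads 20 -noParsing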
  public void fetch(Path segment, int threads)
    throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("OldFetcher: starting at " + sdf.format(start));
      LOG.info("OldFetcher: segment: " + segment);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(OldFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);
    long end = System.currentTimeMillis();
    LOG.info("OldFetcher: finished at " + sdf.format(end) +
             ", elapsed: " + TimingUtil.elapsedTime(start, end));
  }


  /** Run the fetcher. */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new OldFetcher(), args);
    System.exit(res);
  }

  public int run(String[] args) throws Exception {

    String usage = "Usage: OldFetcher <segment> [-threads n] [-noParsing]";

    if (args.length < 1) {
      System.err.println(usage);
      return -1;
    }

    Path segment = new Path(args[0]);

    int threads = getConf().getInt("fetcher.threads.fetch", 10);
    boolean parsing = true;

    for (int i = 1; i < args.length; i++) {       // parse command line
      if (args[i].equals("-threads")) {           // found -threads option
        threads = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-noParsing")) parsing = false;
    }

    getConf().setInt("fetcher.threads.fetch", threads);
    if (!parsing) {
      getConf().setBoolean("fetcher.parse", parsing);
    }
    try {
      fetch(segment, threads);                    // run the Fetcher
      return 0;
    } catch (Exception e) {
      LOG.fatal("OldFetcher: " + StringUtils.stringifyException(e));
      return -1;
    }
  }
}