/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.fetcher;

import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

// Slf4j Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.*;

/**
 * A queue-based fetcher.
 *
 * <p>This fetcher uses a well-known model of one producer (a QueueFeeder)
 * and many consumers (FetcherThread-s).
 *
 * <p>QueueFeeder reads input fetchlists and populates a set of
 * FetchItemQueue-s, which hold FetchItem-s that describe the items to be
 * fetched. There are as many queues as there are unique hosts, but at any
 * given time the total number of fetch items in all queues is less than a
 * fixed number (currently set to a multiple of the number of threads).
 *
 * <p>As items are consumed from the queues, the QueueFeeder continues to add
 * new input items, so that their total count stays fixed (FetcherThread-s may
 * also add new items to the queues, e.g. as a result of redirection) - until
 * all input items are exhausted, at which point the number of items in the
 * queues begins to decrease. When this number reaches 0 the fetcher will
 * finish.
 *
 * <p>This fetcher implementation handles per-host blocking itself, instead
 * of delegating this work to protocol-specific plugins. Each per-host queue
 * handles its own "politeness" settings, such as the maximum number of
 * concurrent requests and the crawl delay between consecutive requests - and
 * also a list of requests in progress, and the time the last request was
 * finished. As FetcherThread-s ask for new items to be fetched, queues may
 * return eligible items, or null if for "politeness" reasons this host's
 * queue is not yet ready.
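 *
 * <p>For illustration only (these property names and defaults are the ones
 * read by this class), the politeness behaviour described above can be tuned
 * like this:
 * <pre>{@code
 * Configuration conf = NutchConfiguration.create();
 * conf.set("fetcher.queue.mode", "byHost");        // or "byDomain" / "byIP"
 * conf.setInt("fetcher.threads.per.queue", 1);     // max concurrent requests per queue
 * conf.setFloat("fetcher.server.delay", 1.0f);     // delay (seconds) between requests
 * conf.setFloat("fetcher.server.min.delay", 0.0f); // delay used when threads.per.queue > 1
 * }</pre>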
 *
 * <p>If there are still unfetched items in the queues, but none of the items
 * are ready, FetcherThread-s will spin-wait until either some items become
 * available, or a timeout is reached (at which point the Fetcher will abort,
 * assuming the task is hung).
 *
 * @author Andrzej Bialecki
 */
public class Fetcher extends Configured implements Tool,
    MapRunnable<Text, CrawlDatum, Text, NutchWritable> {

  public static final int PERM_REFRESH_TIME = 5;

  public static final String CONTENT_REDIR = "content";

  public static final String PROTOCOL_REDIR = "protocol";

  public static final Logger LOG = LoggerFactory.getLogger(Fetcher.class);

  public static class InputFormat extends
      SequenceFileInputFormat<Text, CrawlDatum> {
    /** Don't split inputs, to keep things polite. */
    public InputSplit[] getSplits(JobConf job, int nSplits) throws IOException {
      FileStatus[] files = listStatus(job);
      FileSplit[] splits = new FileSplit[files.length];
      for (int i = 0; i < files.length; i++) {
        FileStatus cur = files[i];
        splits[i] = new FileSplit(cur.getPath(), 0, cur.getLen(),
            (String[]) null);
      }
      return splits;
    }
  }

  private OutputCollector<Text, NutchWritable> output;
  private Reporter reporter;

  private String segmentName;
  private AtomicInteger activeThreads = new AtomicInteger(0);
  private AtomicInteger spinWaiting = new AtomicInteger(0);

  private long start = System.currentTimeMillis(); // start time of fetcher run
  private AtomicLong lastRequestStart = new AtomicLong(start);

  private AtomicLong bytes = new AtomicLong(0);        // total bytes fetched
  private AtomicInteger pages = new AtomicInteger(0);  // total pages fetched
  private AtomicInteger errors = new AtomicInteger(0); // total pages errored

  private boolean storingContent;
  private boolean parsing;

  FetchItemQueues fetchQueues;
  QueueFeeder feeder;

  /**
   * This class describes the item to be fetched.
   */
  private static class FetchItem {
    int outlinkDepth = 0;
    String queueID;
    Text url;
    URL u;
    CrawlDatum datum;

    public FetchItem(Text url, URL u, CrawlDatum datum, String queueID) {
      this(url, u, datum, queueID, 0);
    }

    public FetchItem(Text url, URL u, CrawlDatum datum, String queueID,
        int outlinkDepth) {
      this.url = url;
      this.u = u;
      this.datum = datum;
      this.queueID = queueID;
      this.outlinkDepth = outlinkDepth;
    }

    /**
     * Create an item. Queue id will be created based on
     * <code>queueMode</code> argument, either as a protocol + hostname pair,
     * a protocol + IP address pair, or a protocol + domain pair.
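     *
     * <p>Illustrative examples only: in {@code byHost} mode the URL
     * {@code http://example.com/page.html} maps to the queue id
     * {@code http://example.com}; in {@code byIP} mode the id uses the
     * resolved address instead, e.g. {@code http://93.184.216.34}; in
     * {@code byDomain} mode it uses the domain name.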
     */
    public static FetchItem create(Text url, CrawlDatum datum, String queueMode) {
      return create(url, datum, queueMode, 0);
    }

    public static FetchItem create(Text url, CrawlDatum datum,
        String queueMode, int outlinkDepth) {
      String queueID;
      URL u = null;
      try {
        u = new URL(url.toString());
      } catch (Exception e) {
        LOG.warn("Cannot parse url: " + url, e);
        return null;
      }
      final String proto = u.getProtocol().toLowerCase();
      String key;
      if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
        try {
          final InetAddress addr = InetAddress.getByName(u.getHost());
          key = addr.getHostAddress();
        } catch (final UnknownHostException e) {
          // unable to resolve it, so don't fall back to host name
          LOG.warn("Unable to resolve: " + u.getHost() + ", skipping.");
          return null;
        }
      } else if (FetchItemQueues.QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) {
        key = URLUtil.getDomainName(u);
        if (key == null) {
          LOG.warn("Unknown domain for url: " + url
              + ", using URL string as key");
          key = u.toExternalForm();
        }
      } else {
        key = u.getHost();
        if (key == null) {
          LOG.warn("Unknown host for url: " + url
              + ", using URL string as key");
          key = u.toExternalForm();
        }
      }
      queueID = proto + "://" + key.toLowerCase();
      return new FetchItem(url, u, datum, queueID, outlinkDepth);
    }

    public CrawlDatum getDatum() {
      return datum;
    }

    public String getQueueID() {
      return queueID;
    }

    public Text getUrl() {
      return url;
    }

    public URL getURL2() {
      return u;
    }
  }

  /**
   * This class handles FetchItems which come from the same host ID (be it a
   * proto/hostname or proto/IP pair). It also keeps track of requests in
   * progress and elapsed time between requests.
   */
  private static class FetchItemQueue {
    List<FetchItem> queue =
        Collections.synchronizedList(new LinkedList<FetchItem>());
    Set<FetchItem> inProgress =
        Collections.synchronizedSet(new HashSet<FetchItem>());
    AtomicLong nextFetchTime = new AtomicLong();
    AtomicInteger exceptionCounter = new AtomicInteger();
    long crawlDelay;
    long minCrawlDelay;
    int maxThreads;
    Configuration conf;

    public FetchItemQueue(Configuration conf, int maxThreads, long crawlDelay,
        long minCrawlDelay) {
      this.conf = conf;
      this.maxThreads = maxThreads;
      this.crawlDelay = crawlDelay;
      this.minCrawlDelay = minCrawlDelay;
      // ready to start
      setEndTime(System.currentTimeMillis() - crawlDelay);
    }

    public synchronized int emptyQueue() {
      int presize = queue.size();
      queue.clear();
      return presize;
    }

    public int getQueueSize() {
      return queue.size();
    }

    public int getInProgressSize() {
      return inProgress.size();
    }

    public int incrementExceptionCounter() {
      return exceptionCounter.incrementAndGet();
    }

    public void finishFetchItem(FetchItem it, boolean asap) {
      if (it != null) {
        inProgress.remove(it);
        setEndTime(System.currentTimeMillis(), asap);
      }
    }

    public void addFetchItem(FetchItem it) {
      if (it == null) return;
      queue.add(it);
    }

    public void addInProgressFetchItem(FetchItem it) {
      if (it == null) return;
      inProgress.add(it);
    }

    public FetchItem getFetchItem() {
      if (inProgress.size() >= maxThreads) return null;
      long now = System.currentTimeMillis();
      if (nextFetchTime.get() > now) return null;
      FetchItem it = null;
      if (queue.size() == 0) return null;
      try {
        it = queue.remove(0);
        inProgress.add(it);
      } catch (Exception e) {
        LOG.error("Cannot remove FetchItem from queue or cannot add it to inProgress queue", e);
      }
      return it;
    }

    public synchronized void dump() {
      LOG.info("  maxThreads    = " + maxThreads);
      LOG.info("  inProgress    = " + inProgress.size());
      LOG.info("  crawlDelay    = " + crawlDelay);
      LOG.info("  minCrawlDelay = " + minCrawlDelay);
      LOG.info("  nextFetchTime = " +
          nextFetchTime.get());
      LOG.info("  now           = " + System.currentTimeMillis());
      for (int i = 0; i < queue.size(); i++) {
        FetchItem it = queue.get(i);
        LOG.info("  " + i + ". " + it.url);
      }
    }

    private void setEndTime(long endTime) {
      setEndTime(endTime, false);
    }

    private void setEndTime(long endTime, boolean asap) {
      if (!asap)
        nextFetchTime.set(endTime
            + (maxThreads > 1 ? minCrawlDelay : crawlDelay));
      else
        nextFetchTime.set(endTime);
    }
  }

  /**
   * Convenience class - a collection of queues that keeps track of the total
   * number of items, and provides items eligible for fetching from any queue.
   */
  private static class FetchItemQueues {
    public static final String DEFAULT_ID = "default";
    Map<String, FetchItemQueue> queues = new HashMap<String, FetchItemQueue>();
    AtomicInteger totalSize = new AtomicInteger(0);
    int maxThreads;
    long crawlDelay;
    long minCrawlDelay;
    long timelimit = -1;
    int maxExceptionsPerQueue = -1;
    Configuration conf;

    public static final String QUEUE_MODE_HOST = "byHost";
    public static final String QUEUE_MODE_DOMAIN = "byDomain";
    public static final String QUEUE_MODE_IP = "byIP";

    String queueMode;

    public FetchItemQueues(Configuration conf) {
      this.conf = conf;
      this.maxThreads = conf.getInt("fetcher.threads.per.queue", 1);
      queueMode = conf.get("fetcher.queue.mode", QUEUE_MODE_HOST);
      // check that the mode is known
      if (!queueMode.equals(QUEUE_MODE_IP)
          && !queueMode.equals(QUEUE_MODE_DOMAIN)
          && !queueMode.equals(QUEUE_MODE_HOST)) {
        LOG.error("Unknown partition mode : " + queueMode
            + " - forcing to byHost");
        queueMode = QUEUE_MODE_HOST;
      }
      LOG.info("Using queue mode : " + queueMode);

      this.crawlDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
      this.minCrawlDelay = (long) (conf.getFloat("fetcher.server.min.delay", 0.0f) * 1000);
      this.timelimit = conf.getLong("fetcher.timelimit", -1);
      this.maxExceptionsPerQueue = conf.getInt("fetcher.max.exceptions.per.queue", -1);
    }

    public int getTotalSize() {
      return totalSize.get();
    }

    public int getQueueCount() {
      return queues.size();
    }

    public void addFetchItem(Text url, CrawlDatum datum) {
      FetchItem it = FetchItem.create(url, datum, queueMode);
      if (it != null) addFetchItem(it);
    }

    public synchronized void addFetchItem(FetchItem it) {
      FetchItemQueue fiq = getFetchItemQueue(it.queueID);
      fiq.addFetchItem(it);
      totalSize.incrementAndGet();
    }

    public void finishFetchItem(FetchItem it) {
      finishFetchItem(it, false);
    }

    public void finishFetchItem(FetchItem it, boolean asap) {
      FetchItemQueue fiq = queues.get(it.queueID);
      if (fiq == null) {
        LOG.warn("Attempting to finish item from unknown queue: " + it);
        return;
      }
      fiq.finishFetchItem(it, asap);
    }

    public synchronized FetchItemQueue getFetchItemQueue(String id) {
      FetchItemQueue fiq = queues.get(id);
      if (fiq == null) {
        // initialize queue
        fiq = new FetchItemQueue(conf, maxThreads, crawlDelay, minCrawlDelay);
        queues.put(id, fiq);
      }
      return fiq;
    }

    public synchronized FetchItem getFetchItem() {
      Iterator<Map.Entry<String, FetchItemQueue>> it =
          queues.entrySet().iterator();
      while (it.hasNext()) {
        FetchItemQueue fiq = it.next().getValue();
        // reap empty queues
        if (fiq.getQueueSize() == 0 && fiq.getInProgressSize() == 0) {
          it.remove();
          continue;
        }
        FetchItem fit = fiq.getFetchItem();
        if (fit != null) {
          totalSize.decrementAndGet();
          return fit;
        }
      }
      return null;
    }

    // called only once the feeder has stopped
    public synchronized int checkTimelimit() {
      int count = 0;
      if (System.currentTimeMillis() >= timelimit && timelimit != -1) {
        // emptying the queues
        count = emptyQueues();
        // there might also be a case where totalsize != 0 but
        // number of queues == 0,
        // in which case we simply force it to 0 to avoid blocking
        if (totalSize.get() != 0 && queues.size() == 0)
          totalSize.set(0);
      }
      return count;
    }

    // empties the queues (used by timebomb and throughput threshold)
    public synchronized int emptyQueues() {
      int count = 0;
      for (String id : queues.keySet()) {
        FetchItemQueue fiq = queues.get(id);
        if (fiq.getQueueSize() == 0) continue;
        LOG.info("* queue: " + id + " >> dropping!");
        int deleted = fiq.emptyQueue();
        for (int i = 0; i < deleted; i++) {
          totalSize.decrementAndGet();
        }
        count += deleted;
      }
      return count;
    }

    /**
     * Increment the exception counter of a queue in case of an exception,
     * e.g. a timeout; when higher than a given threshold simply empty the
     * queue.
     *
     * @param queueid
     * @return number of purged items
     */
    public synchronized int checkExceptionThreshold(String queueid) {
      FetchItemQueue fiq = queues.get(queueid);
      if (fiq == null) {
        return 0;
      }
      if (fiq.getQueueSize() == 0) {
        return 0;
      }
      int excCount = fiq.incrementExceptionCounter();
      if (maxExceptionsPerQueue != -1 && excCount >= maxExceptionsPerQueue) {
        // too many exceptions for items in this queue - purge it
        int deleted = fiq.emptyQueue();
        LOG.info("* queue: " + queueid + " >> removed " + deleted
            + " URLs from queue because " + excCount + " exceptions occurred");
        for (int i = 0; i < deleted; i++) {
          totalSize.decrementAndGet();
        }
        return deleted;
      }
      return 0;
    }

    public synchronized void dump() {
      for (String id : queues.keySet()) {
        FetchItemQueue fiq = queues.get(id);
        if (fiq.getQueueSize() == 0) continue;
        LOG.info("* queue: " + id);
        fiq.dump();
      }
    }
  }

  /**
   * This class feeds the queues with input items, and re-fills them as items
   * are consumed by FetcherThread-s.
   */
  private static class QueueFeeder extends Thread {
    private RecordReader<Text, CrawlDatum> reader;
    private FetchItemQueues queues;
    private int size;
    private long timelimit = -1;

    public QueueFeeder(RecordReader<Text, CrawlDatum> reader,
        FetchItemQueues queues, int size) {
      this.reader = reader;
      this.queues = queues;
      this.size = size;
      this.setDaemon(true);
      this.setName("QueueFeeder");
    }

    public void setTimeLimit(long tl) {
      timelimit = tl;
    }

    public void run() {
      boolean hasMore = true;
      int cnt = 0;
      int timelimitcount = 0;
      while (hasMore) {
        if (System.currentTimeMillis() >= timelimit && timelimit != -1) {
          // enough... let's simply read all the remaining entries from the
          // input without processing them
          try {
            Text url = new Text();
            CrawlDatum datum = new CrawlDatum();
            hasMore = reader.next(url, datum);
            timelimitcount++;
          } catch (IOException e) {
            LOG.error("QueueFeeder error reading input, record " + cnt, e);
            return;
          }
          continue;
        }
        int feed = size - queues.getTotalSize();
        if (feed <= 0) {
          // queues are full - spin-wait until they have some free space
          try {
            Thread.sleep(1000);
          } catch (Exception e) {}
          continue;
        } else {
          LOG.debug("-feeding " + feed + " input urls ...");
          while (feed > 0 && hasMore) {
            try {
              Text url = new Text();
              CrawlDatum datum = new CrawlDatum();
              hasMore = reader.next(url, datum);
              if (hasMore) {
                queues.addFetchItem(url, datum);
                cnt++;
                feed--;
              }
            } catch (IOException e) {
              LOG.error("QueueFeeder error reading input, record " + cnt, e);
              return;
            }
          }
        }
      }
      LOG.info("QueueFeeder finished: total " + cnt
          + " records, hit by time limit: " + timelimitcount);
    }
  }

  /**
   * This class picks items from queues and fetches the pages.
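   * Each thread repeatedly asks the shared FetchItemQueues for an eligible
   * item, fetches it through the matching protocol plugin, and follows
   * redirects in-thread up to <code>http.redirect.max</code> hops before
   * marking the URL as gone.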
   */
  private class FetcherThread extends Thread {
    private Configuration conf;
    private URLFilters urlFilters;
    private ScoringFilters scfilters;
    private ParseUtil parseUtil;
    private URLNormalizers normalizers;
    private ProtocolFactory protocolFactory;
    private long maxCrawlDelay;
    private String queueMode;
    private int maxRedirect;
    private String reprUrl;
    private boolean redirecting;
    private int redirectCount;
    private boolean ignoreExternalLinks;

    // Used by fetcher.follow.outlinks.depth in parse
    private int maxOutlinksPerPage;
    private final int maxOutlinks;
    private final int interval;
    private int maxOutlinkDepth;
    private int maxOutlinkDepthNumLinks;
    private boolean outlinksIgnoreExternal;

    private int outlinksDepthDivisor;
    private boolean skipTruncated;

    public FetcherThread(Configuration conf) {
      this.setDaemon(true);          // don't hang JVM on exit
      this.setName("FetcherThread"); // use an informative name
      this.conf = conf;
      this.urlFilters = new URLFilters(conf);
      this.scfilters = new ScoringFilters(conf);
      this.parseUtil = new ParseUtil(conf);
      this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
      this.protocolFactory = new ProtocolFactory(conf);
      this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
      this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
      queueMode = conf.get("fetcher.queue.mode", FetchItemQueues.QUEUE_MODE_HOST);
      // check that the mode is known
      if (!queueMode.equals(FetchItemQueues.QUEUE_MODE_IP)
          && !queueMode.equals(FetchItemQueues.QUEUE_MODE_DOMAIN)
          && !queueMode.equals(FetchItemQueues.QUEUE_MODE_HOST)) {
        LOG.error("Unknown partition mode : " + queueMode
            + " - forcing to byHost");
        queueMode = FetchItemQueues.QUEUE_MODE_HOST;
      }
      LOG.info("Using queue mode : " + queueMode);
      this.maxRedirect = conf.getInt("http.redirect.max", 3);
      this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);

      maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
      maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
          : maxOutlinksPerPage;
      interval = conf.getInt("db.fetch.interval.default", 2592000);
      maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1);
      outlinksIgnoreExternal = conf.getBoolean("fetcher.follow.outlinks.ignore.external", false);
      maxOutlinkDepthNumLinks = conf.getInt("fetcher.follow.outlinks.num.links", 4);
      outlinksDepthDivisor = conf.getInt("fetcher.follow.outlinks.depth.divisor", 2);
    }

    public void run() {
      activeThreads.incrementAndGet(); // count threads

      FetchItem fit = null;
      try {
        while (true) {
          fit = fetchQueues.getFetchItem();
          if (fit == null) {
            if (feeder.isAlive() || fetchQueues.getTotalSize() > 0) {
              LOG.debug(getName() + " spin-waiting ...");
              // spin-wait.
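              // Note: the spinWaiting counter below only instruments this
              // back-off so the main status loop can report how many threads
              // are idle; it does not affect scheduling.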
              spinWaiting.incrementAndGet();
              try {
                Thread.sleep(500);
              } catch (Exception e) {}
              spinWaiting.decrementAndGet();
              continue;
            } else {
              // all done, finish this thread
              return;
            }
          }
          lastRequestStart.set(System.currentTimeMillis());
          Text reprUrlWritable =
              (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
          if (reprUrlWritable == null) {
            reprUrl = fit.url.toString();
          } else {
            reprUrl = reprUrlWritable.toString();
          }
          try {
            // fetch the page
            redirecting = false;
            redirectCount = 0;
            do {
              if (LOG.isInfoEnabled()) {
                LOG.info("fetching " + fit.url);
              }
              if (LOG.isDebugEnabled()) {
                LOG.debug("redirectCount=" + redirectCount);
              }
              redirecting = false;
              Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());
              RobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
              if (!rules.isAllowed(fit.u)) {
                // unblock
                fetchQueues.finishFetchItem(fit, true);
                if (LOG.isDebugEnabled()) {
                  LOG.debug("Denied by robots.txt: " + fit.url);
                }
                output(fit.url, fit.datum, null,
                    ProtocolStatus.STATUS_ROBOTS_DENIED,
                    CrawlDatum.STATUS_FETCH_GONE);
                reporter.incrCounter("FetcherStatus", "robots_denied", 1);
                continue;
              }
              if (rules.getCrawlDelay() > 0) {
                if (rules.getCrawlDelay() > maxCrawlDelay) {
                  // unblock
                  fetchQueues.finishFetchItem(fit, true);
                  LOG.debug("Crawl-Delay for " + fit.url + " too long ("
                      + rules.getCrawlDelay() + "), skipping");
                  output(fit.url, fit.datum, null,
                      ProtocolStatus.STATUS_ROBOTS_DENIED,
                      CrawlDatum.STATUS_FETCH_GONE);
                  reporter.incrCounter("FetcherStatus",
                      "robots_denied_maxcrawldelay", 1);
                  continue;
                } else {
                  FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                  fiq.crawlDelay = rules.getCrawlDelay();
                }
              }
              ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum);
              ProtocolStatus status = output.getStatus();
              Content content = output.getContent();
              ParseStatus pstatus = null;
              // unblock queue
              fetchQueues.finishFetchItem(fit);

              String urlString = fit.url.toString();

              reporter.incrCounter("FetcherStatus", status.getName(), 1);

              switch (status.getCode()) {

              case ProtocolStatus.WOULDBLOCK:
                // retry ?
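                // The protocol layer signalled it cannot serve the request
                // right now; put the item back on its queue to retry later.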
                fetchQueues.addFetchItem(fit);
                break;

              case ProtocolStatus.SUCCESS:        // got a page
                pstatus = output(fit.url, fit.datum, content, status,
                    CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
                updateStatus(content.getContent().length);

                if (pstatus != null && pstatus.isSuccess()
                    && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                  String newUrl = pstatus.getMessage();
                  int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                  Text redirUrl = handleRedirect(fit.url, fit.datum, urlString,
                      newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME,
                      Fetcher.CONTENT_REDIR);
                  if (redirUrl != null) {
                    CrawlDatum newDatum = new CrawlDatum(
                        CrawlDatum.STATUS_DB_UNFETCHED,
                        fit.datum.getFetchInterval(), fit.datum.getScore());
                    // transfer existing metadata to the redir
                    newDatum.getMetaData().putAll(fit.datum.getMetaData());
                    scfilters.initialScore(redirUrl, newDatum);
                    if (reprUrl != null) {
                      newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                          new Text(reprUrl));
                    }
                    fit = FetchItem.create(redirUrl, newDatum, queueMode);
                    if (fit != null) {
                      FetchItemQueue fiq =
                          fetchQueues.getFetchItemQueue(fit.queueID);
                      fiq.addInProgressFetchItem(fit);
                    } else {
                      // stop redirecting
                      redirecting = false;
                      reporter.incrCounter("FetcherStatus",
                          "FetchItem.notCreated.redirect", 1);
                    }
                  }
                }
                break;

              case ProtocolStatus.MOVED:          // redirect
              case ProtocolStatus.TEMP_MOVED:
                int code;
                boolean temp;
                if (status.getCode() == ProtocolStatus.MOVED) {
                  code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                  temp = false;
                } else {
                  code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                  temp = true;
                }
                output(fit.url, fit.datum, content, status, code);
                String newUrl = status.getMessage();
                Text redirUrl = handleRedirect(fit.url, fit.datum, urlString,
                    newUrl, temp, Fetcher.PROTOCOL_REDIR);
                if (redirUrl != null) {
                  CrawlDatum newDatum = new CrawlDatum(
                      CrawlDatum.STATUS_DB_UNFETCHED,
                      fit.datum.getFetchInterval(), fit.datum.getScore());
                  // transfer existing metadata
                  newDatum.getMetaData().putAll(fit.datum.getMetaData());
                  scfilters.initialScore(redirUrl, newDatum);
                  if (reprUrl != null) {
                    newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                        new Text(reprUrl));
                  }
                  fit = FetchItem.create(redirUrl, newDatum, queueMode);
                  if (fit != null) {
                    FetchItemQueue fiq =
                        fetchQueues.getFetchItemQueue(fit.queueID);
                    fiq.addInProgressFetchItem(fit);
                  } else {
                    // stop redirecting
                    redirecting = false;
                    reporter.incrCounter("FetcherStatus",
                        "FetchItem.notCreated.redirect", 1);
                  }
                } else {
                  // stop redirecting
                  redirecting = false;
                }
                break;

              case ProtocolStatus.EXCEPTION:
                logError(fit.url, status.getMessage());
                int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID());
                if (killedURLs != 0)
                  reporter.incrCounter("FetcherStatus",
                      "AboveExceptionThresholdInQueue", killedURLs);
                /* FALLTHROUGH */
              case ProtocolStatus.RETRY:          // retry
              case ProtocolStatus.BLOCKED:
                output(fit.url, fit.datum, null, status,
                    CrawlDatum.STATUS_FETCH_RETRY);
                break;

              case ProtocolStatus.GONE:           // gone
              case ProtocolStatus.NOTFOUND:
              case ProtocolStatus.ACCESS_DENIED:
              case ProtocolStatus.ROBOTS_DENIED:
                output(fit.url, fit.datum, null, status,
                    CrawlDatum.STATUS_FETCH_GONE);
                break;

              case ProtocolStatus.NOTMODIFIED:
                output(fit.url, fit.datum, null, status,
                    CrawlDatum.STATUS_FETCH_NOTMODIFIED);
                break;

              default:
                if (LOG.isWarnEnabled()) {
                  LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                }
                output(fit.url, fit.datum, null, status,
                    CrawlDatum.STATUS_FETCH_RETRY);
              }

              if (redirecting && redirectCount > maxRedirect) {
                fetchQueues.finishFetchItem(fit);
                if (LOG.isInfoEnabled()) {
                  LOG.info(" - redirect count exceeded " + fit.url);
                }
                output(fit.url,
                    fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED,
                    CrawlDatum.STATUS_FETCH_GONE);
              }

            } while (redirecting && (redirectCount <= maxRedirect));

          } catch (Throwable t) { // unexpected exception
            // unblock
            fetchQueues.finishFetchItem(fit);
            logError(fit.url, StringUtils.stringifyException(t));
            output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED,
                CrawlDatum.STATUS_FETCH_RETRY);
          }
        }

      } catch (Throwable e) {
        if (LOG.isErrorEnabled()) {
          LOG.error("fetcher caught: " + e.toString());
        }
      } finally {
        if (fit != null) fetchQueues.finishFetchItem(fit);
        activeThreads.decrementAndGet(); // count threads
        LOG.info("-finishing thread " + getName() + ", activeThreads="
            + activeThreads);
      }
    }

    private Text handleRedirect(Text url, CrawlDatum datum, String urlString,
        String newUrl, boolean temp, String redirType)
        throws MalformedURLException, URLFilterException {
      newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
      newUrl = urlFilters.filter(newUrl);

      if (ignoreExternalLinks) {
        try {
          String origHost = new URL(urlString).getHost().toLowerCase();
          String newHost = new URL(newUrl).getHost().toLowerCase();
          if (!origHost.equals(newHost)) {
            if (LOG.isDebugEnabled()) {
              LOG.debug(" - ignoring redirect " + redirType + " from "
                  + urlString + " to " + newUrl
                  + " because external links are ignored");
            }
            return null;
          }
        } catch (MalformedURLException e) {}
      }

      if (newUrl != null && !newUrl.equals(urlString)) {
        reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
        url = new Text(newUrl);
        if (maxRedirect > 0) {
          redirecting = true;
          redirectCount++;
          if (LOG.isDebugEnabled()) {
            LOG.debug(" - " + redirType + " redirect to " + url
                + " (fetching now)");
          }
          return url;
        } else {
          CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED,
              datum.getFetchInterval(), datum.getScore());
          // transfer existing metadata
          newDatum.getMetaData().putAll(datum.getMetaData());
          try {
            scfilters.initialScore(url, newDatum);
          } catch (ScoringFilterException e) {
            e.printStackTrace();
          }
          if (reprUrl != null) {
            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                new Text(reprUrl));
          }
          output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
          if (LOG.isDebugEnabled()) {
            LOG.debug(" - " + redirType + " redirect to " + url
                + " (fetching later)");
          }
          return null;
        }
      } else {
        if (LOG.isDebugEnabled()) {
          LOG.debug(" - " + redirType + " redirect skipped: "
              + (newUrl != null ? "to same url" : "filtered"));
        }
        return null;
      }
    }

    private void logError(Text url, String message) {
      if (LOG.isInfoEnabled()) {
        LOG.info("fetch of " + url + " failed with: " + message);
      }
      errors.incrementAndGet();
    }

    private ParseStatus output(Text key, CrawlDatum datum, Content content,
        ProtocolStatus pstatus, int status) {
      return output(key, datum, content, pstatus, status, 0);
    }

    private ParseStatus output(Text key, CrawlDatum datum, Content content,
        ProtocolStatus pstatus, int status, int outlinkDepth) {

      datum.setStatus(status);
      datum.setFetchTime(System.currentTimeMillis());
      if (pstatus != null)
        datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

      ParseResult parseResult = null;
      if (content != null) {
        Metadata metadata = content.getMetadata();

        // store the guessed content type in the crawldatum
        if (content.getContentType() != null)
          datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE),
              new Text(content.getContentType()));

        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
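        // (scoring plugins may stash values in the content metadata here;
        // ParseSegment can read them back when parsing runs as a separate
        // job)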
        try {
          scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
          }
        }
        /*
         * Note: Fetcher will only follow meta-redirects coming from the
         * original URL.
         */
        if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
          if (!skipTruncated || !ParseSegment.isTruncated(content)) {
            try {
              parseResult = this.parseUtil.parse(content);
            } catch (Exception e) {
              LOG.warn("Error parsing: " + key + ": "
                  + StringUtils.stringifyException(e));
            }
          }

          if (parseResult == null) {
            byte[] signature = SignatureFactory.getSignature(getConf())
                .calculate(content, new ParseStatus().getEmptyParse(conf));
            datum.setSignature(signature);
          }
        }

        /*
         * Store the status code in the content metadata, so we can read this
         * value during parsing (as a separate job) and decide whether to
         * parse or not.
         */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY,
            Integer.toString(status));
      }

      try {
        output.collect(key, new NutchWritable(datum));
        if (content != null && storingContent)
          output.collect(key, new NutchWritable(content));
        if (parseResult != null) {
          for (Entry<Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();
            ParseData parseData = parse.getData();

            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }

            // Calculate page signature. For non-parsing fetchers this will
            // be done in ParseSegment
            byte[] signature = SignatureFactory.getSignature(getConf())
                .calculate(content, parse);
            // Ensure segment name and score are in parseData metadata
            parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
            parseData.getContentMeta().set(Nutch.SIGNATURE_KEY,
                StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY,
                Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            } catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }

            String fromHost;

            // collect outlinks for subsequent db update
            Outlink[] links = parseData.getOutlinks();
            int outlinksToStore = Math.min(maxOutlinks, links.length);
            if (ignoreExternalLinks) {
              try {
                fromHost = new URL(url.toString()).getHost().toLowerCase();
              } catch (MalformedURLException e) {
                fromHost = null;
              }
            } else {
              fromHost = null;
            }

            int validCount = 0;

            // Process all outlinks, normalize, filter and deduplicate
            List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
            HashSet<String> outlinks = new HashSet<String>(outlinksToStore);
            for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
              String toUrl = links[i].getToUrl();
              toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl,
                  fromHost, ignoreExternalLinks, urlFilters, normalizers);
              if (toUrl == null) {
                continue;
              }
              validCount++;
              links[i].setUrl(toUrl);
              outlinkList.add(links[i]);
              outlinks.add(toUrl);
            }

            // Only process depth N outlinks
            if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
              reporter.incrCounter("FetcherOutlinks", "outlinks_detected",
                  outlinks.size());

              // Counter to limit num outlinks to follow per page
              int outlinkCounter = 0;

              // Calculate variable number of outlinks by depth using the
              // divisor (outlinks = Math.floor(divisor / depth * num.links))
              int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor
                  / (outlinkDepth + 1)
                  * maxOutlinkDepthNumLinks);

              String followUrl;

              // Walk over the outlinks and add as new FetchItem to the
              // queues, capped by the depth-scaled limit computed above
              Iterator<String> iter = outlinks.iterator();
              while (iter.hasNext() && outlinkCounter < maxOutlinksByDepth) {
                followUrl = iter.next();

                // Check whether we'll follow external outlinks
                if (outlinksIgnoreExternal) {
                  if (!URLUtil.getHost(url.toString()).equals(
                      URLUtil.getHost(followUrl))) {
                    continue;
                  }
                }

                reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);

                // Create new FetchItem with depth incremented
                FetchItem fit = FetchItem.create(new Text(followUrl),
                    new CrawlDatum(CrawlDatum.STATUS_LINKED, interval),
                    queueMode, outlinkDepth + 1);

                fetchQueues.addFetchItem(fit);

                outlinkCounter++;
              }
            }

            // Overwrite the outlinks in ParseData with the normalized and
            // filtered set
            parseData.setOutlinks((Outlink[]) outlinkList
                .toArray(new Outlink[outlinkList.size()]));

            output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
                parse.getText()), parseData, parse.isCanonical())));
          }
        }
      } catch (IOException e) {
        if (LOG.isErrorEnabled()) {
          LOG.error("fetcher caught: " + e.toString());
        }
      }

      // return parse status if it exists
      if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
          reporter.incrCounter("ParserStatus",
              ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1);
          return p.getData().getStatus();
        }
      }
      return null;
    }
  }

  public Fetcher() {
    super(null);
  }

  public Fetcher(Configuration conf) {
    super(conf);
  }

  private void updateStatus(int bytesInPage) throws IOException {
    pages.incrementAndGet();
    bytes.addAndGet(bytesInPage);
  }

  private void reportStatus(int pagesLastSec, int bytesLastSec)
      throws IOException {
    String status;
    long elapsed = (System.currentTimeMillis() - start) / 1000;

    // round to one decimal place; divide by 10.0f (not 10) so the result
    // is not truncated by integer division
    float avgPagesSec =
        Math.round(((float) pages.get() * 10) / elapsed) / 10.0f;
    float avgBytesSec =
        Math.round(((((float) bytes.get()) * 8) / 1000) / elapsed);

    status = activeThreads + " threads, " + fetchQueues.getQueueCount()
        + " queues, " + fetchQueues.getTotalSize() + " URLs queued, "
        + pages + " pages, " + errors + " errors, "
        + avgPagesSec + " (" + pagesLastSec + ") pages/s, "
        + avgBytesSec + " (" + bytesLastSec + ") kbits/s, ";

    reporter.setStatus(status);
  }

  public void configure(JobConf job) {
    setConf(job);

    this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
    this.storingContent = isStoringContent(job);
    this.parsing = isParsing(job);

    // if (job.getBoolean("fetcher.verbose", false)) {
    //   LOG.setLevel(Level.FINE);
    // }
  }

  public void close() {}

  public static boolean isParsing(Configuration conf) {
    return conf.getBoolean("fetcher.parse", true);
  }

  public static boolean isStoringContent(Configuration conf) {
    return conf.getBoolean("fetcher.store.content", true);
  }

  public void run(RecordReader<Text, CrawlDatum> input,
      OutputCollector<Text, NutchWritable> output, Reporter reporter)
      throws IOException {

    this.output = output;
    this.reporter = reporter;
    this.fetchQueues = new FetchItemQueues(getConf());

    int threadCount = getConf().getInt("fetcher.threads.fetch", 10);
    if (LOG.isInfoEnabled()) {
      LOG.info("Fetcher: threads: " + threadCount);
    }

    int timeoutDivisor = getConf().getInt("fetcher.threads.timeout.divisor", 2);
    if (LOG.isInfoEnabled()) {
      LOG.info("Fetcher: time-out divisor: " + timeoutDivisor);
    }

    int queueDepthMultiplier = getConf().getInt("fetcher.queue.depth.multiplier", 50);

    feeder = new QueueFeeder(input, fetchQueues, threadCount * queueDepthMultiplier);
    // feeder.setPriority((Thread.MAX_PRIORITY + Thread.NORM_PRIORITY) / 2);

    // the value of the time limit is either -1 or
    // the time when it should finish
    long timelimit = getConf().getLong("fetcher.timelimit", -1);
    if (timelimit != -1) feeder.setTimeLimit(timelimit);
    feeder.start();

    // set non-blocking & no-robots mode for HTTP protocol plugins.
    getConf().setBoolean(Protocol.CHECK_BLOCKING, false);
    getConf().setBoolean(Protocol.CHECK_ROBOTS, false);

    for (int i = 0; i < threadCount; i++) { // spawn threads
      new FetcherThread(getConf()).start();
    }

    // select a timeout that avoids a task timeout
    long timeout = getConf().getInt("mapred.task.timeout", 10 * 60 * 1000)
        / timeoutDivisor;

    // Used for threshold check, holds pages and bytes processed in the last
    // second
    int pagesLastSec;
    int bytesLastSec;

    int throughputThresholdNumRetries = 0;

    int throughputThresholdPages =
        getConf().getInt("fetcher.throughput.threshold.pages", -1);
    if (LOG.isInfoEnabled()) {
      LOG.info("Fetcher: throughput threshold: " + throughputThresholdPages);
    }
    int throughputThresholdMaxRetries =
        getConf().getInt("fetcher.throughput.threshold.retries", 5);
    if (LOG.isInfoEnabled()) {
      LOG.info("Fetcher: throughput threshold retries: "
          + throughputThresholdMaxRetries);
    }
    long throughputThresholdTimeLimit =
        getConf().getLong("fetcher.throughput.threshold.check.after", -1);

    do { // wait for threads to exit
      pagesLastSec = pages.get();
      bytesLastSec = (int) bytes.get();

      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {}

      pagesLastSec = pages.get() - pagesLastSec;
      bytesLastSec = (int) bytes.get() - bytesLastSec;

      reporter.incrCounter("FetcherStatus", "bytes_downloaded", bytesLastSec);

      reportStatus(pagesLastSec, bytesLastSec);

      LOG.info("-activeThreads=" + activeThreads + ", spinWaiting="
          + spinWaiting.get() + ", fetchQueues.totalSize="
          + fetchQueues.getTotalSize());

      if (!feeder.isAlive() && fetchQueues.getTotalSize() < 5) {
        fetchQueues.dump();
      }

      // if throughput threshold is enabled
      if (throughputThresholdTimeLimit < System.currentTimeMillis()
          && throughputThresholdPages != -1) {
        // Check if we're dropping below the threshold
        if (pagesLastSec < throughputThresholdPages) {
          throughputThresholdNumRetries++;
          LOG.warn(Integer.toString(throughputThresholdNumRetries)
              + ": dropping below configured threshold of "
              + Integer.toString(throughputThresholdPages)
              + " pages per second");

          // Quit if we dropped below threshold too many times
          if (throughputThresholdNumRetries == throughputThresholdMaxRetries) {
            LOG.warn("Dropped below threshold too many times, killing!");

            // Disable the threshold checker
            throughputThresholdPages = -1;

            // Empty the queues cleanly and get number of items that were
            // dropped
            int hitByThroughputThreshold = fetchQueues.emptyQueues();

            if (hitByThroughputThreshold != 0)
              reporter.incrCounter("FetcherStatus",
                  "hitByThroughputThreshold", hitByThroughputThreshold);
          }
        }
      }

      // check timelimit
      if (!feeder.isAlive()) {
        int hitByTimeLimit = fetchQueues.checkTimelimit();
        if (hitByTimeLimit != 0)
          reporter.incrCounter("FetcherStatus", "hitByTimeLimit",
              hitByTimeLimit);
      }

      // some requests seem to hang, despite all intentions
      if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Aborting with " + activeThreads + " hung threads.");
        }
        return;
      }

    } while (activeThreads.get() > 0);
    LOG.info("-activeThreads=" + activeThreads);
  }

  public void fetch(Path segment, int threads) throws IOException {
    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start =
        System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("Fetcher: starting at " + sdf.format(start));
      LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
      timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
      LOG.info("Fetcher Timelimit set for : " + timelimit);
      getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is
    // enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
      LOG.info("Fetcher: following outlinks up to depth: "
          + Integer.toString(maxOutlinkDepth));

      int maxOutlinkDepthNumLinks =
          getConf().getInt("fetcher.follow.outlinks.num.links", 4);
      int outlinksDepthDivisor =
          getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

      int totalOutlinksToFollow = 0;
      for (int i = 0; i < maxOutlinkDepth; i++) {
        totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor
            / (i + 1) * maxOutlinkDepthNumLinks);
      }

      LOG.info("Fetcher: maximum outlinks to follow: "
          + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment,
        CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(Fetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));
  }

  /** Run the fetcher.
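   *
   * <p>Command-line form (per the usage string in {@link #run(String[])};
   * the segment path is supplied by the caller):
   * <pre>{@code
   * Fetcher <segment> [-threads n]
   * }</pre>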
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new Fetcher(), args);
    System.exit(res);
  }

  public int run(String[] args) throws Exception {
    String usage = "Usage: Fetcher <segment> [-threads n]";

    if (args.length < 1) {
      System.err.println(usage);
      return -1;
    }

    Path segment = new Path(args[0]);

    int threads = getConf().getInt("fetcher.threads.fetch", 10);

    for (int i = 1; i < args.length; i++) { // parse command line
      if (args[i].equals("-threads")) {     // found -threads option
        threads = Integer.parseInt(args[++i]);
      }
    }

    getConf().setInt("fetcher.threads.fetch", threads);

    try {
      fetch(segment, threads);
      return 0;
    } catch (Exception e) {
      LOG.error("Fetcher: " + StringUtils.stringifyException(e));
      return -1;
    }
  }

  private void checkConfiguration() {
    // ensure that a value has been set for the agent name and that the
    // agent name is the first value in the agents we advertise for robot
    // rules parsing
    String agentName = getConf().get("http.agent.name");
    if (agentName == null || agentName.trim().length() == 0) {
      String message = "Fetcher: No agents listed in 'http.agent.name'"
          + " property.";
      if (LOG.isErrorEnabled()) {
        LOG.error(message);
      }
      throw new IllegalArgumentException(message);
    } else {
      // get all of the agents that we advertise
      String agentNames = getConf().get("http.robots.agents");
      StringTokenizer tok = new StringTokenizer(agentNames, ",");
      ArrayList<String> agents = new ArrayList<String>();
      while (tok.hasMoreTokens()) {
        agents.add(tok.nextToken().trim());
      }

      // if the first one is not equal to our agent name, log a warning
      // (but don't abort the fetch)
      if (!(agents.get(0)).equalsIgnoreCase(agentName)) {
        String message = "Fetcher: Your 'http.agent.name' value should be "
            + "listed first in 'http.robots.agents' property.";
        if (LOG.isWarnEnabled()) {
          LOG.warn(message);
        }
      }
    }
  }
}
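
/*
 * Minimal programmatic usage sketch (illustrative only; the segment path and
 * thread count are assumptions, and the segment must have been produced by
 * the Generator beforehand):
 *
 *   Configuration conf = NutchConfiguration.create();
 *   Fetcher fetcher = new Fetcher(conf);
 *   fetcher.fetch(new Path("crawl/segments/20240101000000"), 10);
 */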