/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.mapred.segmenter;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.SegmentGeneratorBundleKey;
import org.commoncrawl.mapred.SegmentGeneratorItemBundle;
import org.commoncrawl.util.MurmurHash;

/**
 * Partitioner that routes each item bundle to a reduce partition determined
 * by its crawler id and a per-crawler bucket derived from the bundle key's
 * domain fingerprint, so that all bundles for a given domain land in the
 * same bucket of the same crawler.
 *
 * @author rana
 */
public class BundleKeyPartitioner implements Partitioner<SegmentGeneratorBundleKey, SegmentGeneratorItemBundle> {

  static final Log LOG = LogFactory.getLog(BundleKeyPartitioner.class);

  /** number of buckets assigned to each crawler, read from the job config */
  int bucketsPerCrawler = -1;

  public int getPartition(SegmentGeneratorBundleKey key, SegmentGeneratorItemBundle value, int numPartitions) {
    int hashCode = MurmurHash.hashLong(key.getDomainFP(), 0);
    // derive the local bucket index from the domain fingerprint hash,
    // masking off the sign bit so the modulus is always non-negative
    int localIndex = (hashCode & Integer.MAX_VALUE) % bucketsPerCrawler;
    // final partition = (crawler id * buckets per crawler) + local bucket index
    return (key.getCrawlerId() * bucketsPerCrawler) + localIndex;
  }

  public void configure(JobConf job) {
    // read the number of buckets per crawler from the job configuration,
    // defaulting to 8 if the property is unset
    bucketsPerCrawler = job.getInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, 8);
  }
}
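
/*
 * Usage sketch (an illustration added here, not part of the original source):
 * wiring BundleKeyPartitioner into a segment-generator job. Note that
 * getPartition() ignores its numPartitions argument, so the job must be
 * configured with exactly numCrawlers * bucketsPerCrawler reduce tasks or
 * the computed partition index can fall out of range. The crawler and
 * bucket counts below are hypothetical values chosen for the example; only
 * the partitioner class and the buckets-per-crawler property come from the
 * code above.
 */
class BundleKeyPartitionerUsageSketch {

  static final int NUM_CRAWLERS = 4;        // hypothetical crawler count
  static final int BUCKETS_PER_CRAWLER = 8; // hypothetical buckets per crawler

  static JobConf buildJob() {
    JobConf job = new JobConf(BundleKeyPartitionerUsageSketch.class);
    job.setJobName("segment-generator-sketch");

    // the map phase is assumed to emit
    // (SegmentGeneratorBundleKey, SegmentGeneratorItemBundle) pairs
    job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
    job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);

    // route each bundle to partition (crawlerId * bucketsPerCrawler + localBucket)
    job.setPartitionerClass(BundleKeyPartitioner.class);
    job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, BUCKETS_PER_CRAWLER);

    // one reduce task per (crawler, bucket) pair keeps every partition index valid
    job.setNumReduceTasks(NUM_CRAWLERS * BUCKETS_PER_CRAWLER);
    return job;
  }
}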