/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.mapred.segmenter;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.SegmentGeneratorBundleKey;
import org.commoncrawl.mapred.SegmentGeneratorItemBundle;
import org.commoncrawl.util.MurmurHash;

/**
 * Partitioner that assigns segment generator bundles to reduce partitions.
 * Each crawler owns a contiguous block of bucketsPerCrawler partitions; within
 * that block, a bundle is placed according to a hash of its domain fingerprint.
 *
 * @author rana
 *
 */
public class BundleKeyPartitioner implements Partitioner<SegmentGeneratorBundleKey, SegmentGeneratorItemBundle> {

  static final Log LOG = LogFactory.getLog(BundleKeyPartitioner.class);

  /** number of partition buckets assigned to each crawler, set via configure() */
  int bucketsPerCrawler = -1;

  public int getPartition(SegmentGeneratorBundleKey key, SegmentGeneratorItemBundle value, int numPartitions) {
    int hashCode = MurmurHash.hashLong(key.getDomainFP(), 0);
    // compute the local bucket index: non-negative hash of the domain fingerprint modulo buckets per crawler
    int localIndex = (int) ((hashCode & Integer.MAX_VALUE) % bucketsPerCrawler);
    // final partition = crawler id * buckets per crawler + local bucket index,
    // so each crawler owns a contiguous range of partitions
    return (key.getCrawlerId() * bucketsPerCrawler) + localIndex;
  }

  public void configure(JobConf job) {
    // read the number of buckets per crawler from the job configuration (default 8)
    bucketsPerCrawler = job.getInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, 8);
  }
}
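
// A minimal usage sketch (not part of the original source): wiring this
// partitioner into an old-API mapred job. "numCrawlers" and "NUM_BUCKETS" are
// hypothetical names introduced here for illustration; the property key and
// the default of 8 mirror configure() above.
//
//   int numCrawlers = 4;   // hypothetical crawler count
//   int NUM_BUCKETS = 8;   // matches the default used in configure()
//   JobConf job = new JobConf();
//   job.setPartitionerClass(BundleKeyPartitioner.class);
//   job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS);
//   // total partitions must cover every crawler's bucket range
//   job.setNumReduceTasks(numCrawlers * NUM_BUCKETS);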