/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package com.aliyun.odps.mapred.lib; import static com.aliyun.odps.mapred.utils.UTF8ByteArrayUtils.unescapeSeparator; import java.io.UnsupportedEncodingException; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import com.aliyun.odps.data.Record; import com.aliyun.odps.mapred.Partitioner; import com.aliyun.odps.mapred.conf.JobConf; import com.aliyun.odps.mapred.lib.KeyFieldHelper.KeyDescription; /** * Defines a way to partition keys based on certain key fields (also see * {@link KeyFieldBasedComparator}. * The key specification supported is of the form -k pos1[,pos2], where, * pos is of the form f[.c][opts], where f is the number * of the key field to use, and c is the number of the first character from * the beginning of the field. Fields and character posns are numbered * starting with 1; a character position of zero in pos2 indicates the * field's last character. If '.c' is omitted from pos1, it defaults to 1 * (the beginning of the field); if omitted from pos2, it defaults to 0 * (the end of the field). */ public class KeyFieldBasedPartitioner extends Partitioner { private static final Log LOG = LogFactory.getLog( KeyFieldBasedPartitioner.class.getName()); //public static String PARTITIONER_OPTIONS = // "mapreduce.partition.keypartitioner.options"; private int numOfPartitionFields; private KeyFieldHelper keyFieldHelper = new KeyFieldHelper(); private JobConf conf; @Override public void configure(JobConf conf) { this.conf = conf; keyFieldHelper = new KeyFieldHelper(); String keyFieldSeparator = unescapeSeparator(conf.get("map.output.key.field.separator", "\t")); keyFieldHelper.setKeyFieldSeparator(keyFieldSeparator); //if (conf.get("num.key.fields.for.partition") != null) { // LOG.warn("Using deprecated num.key.fields.for.partition. " + // "Use mapreduce.partition.keypartitioner.options instead"); this.numOfPartitionFields = conf.getInt("num.key.fields.for.partition", 0); keyFieldHelper.setKeyFieldSpec(1, numOfPartitionFields); //} else { // String option = conf.get(PARTITIONER_OPTIONS); // keyFieldHelper.parseOption(option); //} } public int getPartition(Record keyRec, Record valueRec, int numReduceTasks) { Object key = keyRec.get(0); byte[] keyBytes; List<KeyDescription> allKeySpecs = keyFieldHelper.keySpecs(); if (allKeySpecs.size() == 0) { return getPartition(key.toString().hashCode(), numReduceTasks); } try { keyBytes = key.toString().getBytes("UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException("The current system does not " + "support UTF-8 encoding!", e); } // return 0 if the key is empty if (keyBytes.length == 0) { return 0; } int[] lengthIndicesFirst = keyFieldHelper.getWordLengths(keyBytes, 0, keyBytes.length); int currentHash = 0; for (KeyDescription keySpec : allKeySpecs) { int startChar = keyFieldHelper.getStartOffset(keyBytes, 0, keyBytes.length, lengthIndicesFirst, keySpec); // no key found! continue if (startChar < 0) { continue; } int endChar = keyFieldHelper.getEndOffset(keyBytes, 0, keyBytes.length, lengthIndicesFirst, keySpec); currentHash = hashCode(keyBytes, startChar, endChar, currentHash); } return getPartition(currentHash, numReduceTasks); } protected int hashCode(byte[] b, int start, int end, int currentHash) { for (int i = start; i <= end; i++) { currentHash = 31 * currentHash + b[i]; } return currentHash; } protected int getPartition(int hash, int numReduceTasks) { return (hash & Integer.MAX_VALUE) % numReduceTasks; } /** * Set the {@link KeyFieldBasedPartitioner} options used for * {@link Partitioner} * * @param keySpec the key specification of the form -k pos1[,pos2], where, * pos is of the form f[.c][opts], where f is the number * of the key field to use, and c is the number of the first character from * the beginning of the field. Fields and character posns are numbered * starting with 1; a character position of zero in pos2 indicates the * field's last character. If '.c' is omitted from pos1, it defaults to 1 * (the beginning of the field); if omitted from pos2, it defaults to 0 * (the end of the field). */ //public void setKeyFieldPartitionerOptions(JobConf job, String keySpec) { // job.set(PARTITIONER_OPTIONS, keySpec); //} /** * Get the {@link KeyFieldBasedPartitioner} options */ //public String getKeyFieldPartitionerOption(JobConf job) { // return job.get(PARTITIONER_OPTIONS); //} }