/*
 *  This file is part of the Wayback archival access software
 *  (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.hadoop;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partitioner which assigns each key to a reduce partition by binary-searching
 * a sorted array of boundary strings. The boundaries are loaded from a split
 * file in HDFS, one boundary per line, whose path is stored in the job
 * Configuration via {@link #setPartitionPath(Configuration, String)}.
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class AlphaPartitioner extends Partitioner<Text, Text> implements
		Configurable {

	/** Configuration key holding the HDFS path of the split file */
	private static final String CONFIG_SPLIT_PATH_NAME = "alphapartitioner.path";

	/** Sorted boundary strings; partition i receives keys at or above boundaries[i] */
	private String[] boundaries = new String[0];

	private Configuration conf;

	@Override
	public int getPartition(Text key, Text value, int numPartitions) {
		String keyS = key.toString();
		int loc = Arrays.binarySearch(boundaries, keyS);
		if (loc < 0) {
			// binarySearch returns -(insertionPoint) - 1 when the key is not
			// present; the partition is the boundary just below the key.
			loc = (-loc - 1) - 1;
			if (loc < 0) {
				// Key sorts before the first boundary: send it to partition 0.
				loc = 0;
			}
		}
		return loc;
	}

	public Configuration getConf() {
		return conf;
	}

	public void setConf(Configuration conf) {
		this.conf = conf;
		String partitionPath = getPartitionPath(conf);
		String numReduceTasks = conf.get("mapred.reduce.tasks");
		System.err.println("Num configured reduce tasks: " + numReduceTasks);
		try {
			URI uri = new URI(partitionPath);
			FileSystem fs = FileSystem.get(uri, conf);
			Path p = new Path(partitionPath);
			loadBoundaries(new BufferedReader(new InputStreamReader(fs.open(p))));
		} catch (IOException e) {
			// Fail loudly: with no boundaries loaded, every key would silently
			// land in partition 0.
			throw new RuntimeException("Unable to load partition boundaries from "
					+ partitionPath, e);
		} catch (URISyntaxException e) {
			throw new RuntimeException("Invalid partition path URI: "
					+ partitionPath, e);
		}
	}

	/**
	 * @param conf Configuration for the Job
	 * @param path hdfs:// URI pointing to the split file
	 */
	public static void setPartitionPath(Configuration conf, String path) {
		conf.set(CONFIG_SPLIT_PATH_NAME, path);
	}

	/**
	 * @param conf Configuration for the Job
	 * @return the hdfs:// URI for the split file configured for this job
	 */
	public static String getPartitionPath(Configuration conf) {
		return conf.get(CONFIG_SPLIT_PATH_NAME);
	}

	/**
	 * Read one boundary string per line from the split file, then sort the
	 * boundaries so they can be binary-searched in getPartition().
	 */
	private void loadBoundaries(BufferedReader bis) throws IOException {
		try {
			ArrayList<String> l = new ArrayList<String>();
			String line;
			while ((line = bis.readLine()) != null) {
				l.add(line);
			}
			boundaries = l.toArray(new String[l.size()]);
			Arrays.sort(boundaries);
		} finally {
			bis.close();
		}
	}
}
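
/*
 * Usage sketch (illustrative only, not part of the original class): a job
 * driver might wire this partitioner in roughly as follows. The job name,
 * split-file path, and reducer count below are assumptions for the example;
 * the split file is expected to contain one sorted boundary string per line,
 * one line per reducer.
 *
 *   Configuration conf = new Configuration();
 *   Job job = new Job(conf, "cdx-sort");                       // hypothetical job name
 *   job.setPartitionerClass(AlphaPartitioner.class);
 *   AlphaPartitioner.setPartitionPath(job.getConfiguration(),
 *       "hdfs://namenode/user/wayback/split.txt");             // hypothetical path
 *   job.setNumReduceTasks(numSplitLines);                      // one reducer per boundary line
 *   job.setMapOutputKeyClass(Text.class);
 *   job.setMapOutputValueClass(Text.class);
 */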