package org.archive.hadoop.mapreduce;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Partitioner;
import org.archive.util.binsearch.SortedTextFile;
import org.archive.util.iterator.CloseableIterator;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONTokener;
/**
 * Hadoop {@link Partitioner} that routes each key to a partition by comparing it
 * against an ordered list of split points.
 *
 * <p>The split points come from one of two places, selected in {@link #setConf(Configuration)}:
 * <ul>
 *   <li>{@link #ZIPNUM_PARTITIONER_CLUSTER}: a {@link SortedTextFile} is opened and the split
 *       points are computed lazily, on the first call to {@link #getPartition(String, int)}
 *       that needs them;</li>
 *   <li>{@link #ZIPNUM_PARTITIONER_JSON}: a splits file is read eagerly; a name ending in
 *       {@code .json} is parsed as JSON, anything else is read as one split per line.</li>
 * </ul>
 * The cluster summary setting takes precedence when both are present.
 *
 * @param <K> key type; only its {@code toString()} value is used
 * @param <V> value type; ignored when partitioning
 */
public class ZipNumPartitioner<K, V> extends Partitioner<K, V> implements Configurable {

    /** Configuration key whose value is passed to the {@link SortedTextFile} constructor. */
    public final static String ZIPNUM_PARTITIONER_CLUSTER = "conf.zipnum.partitioner.clusterSummary";

    /** Configuration key naming the splits file (JSON if it ends in {@code .json}, else text). */
    public final static String ZIPNUM_PARTITIONER_JSON = "conf.zipnum.partitioner.jsonSplits";

    protected final static String EMPTY_STRING = "";

    /** Character set used when reading splits files; named by string so no extra import is needed. */
    private final static String SPLITS_CHARSET = "UTF-8";

    /** Source for lazily computed split points; {@code null} unless the cluster key was set. */
    protected SortedTextFile summary = null;

    /** Path of the splits file as read from the configuration, or {@code null}. */
    protected String splitsFile = null;

    /** Split points in the order they were loaded; {@code null} until successfully loaded. */
    protected List<String> splitList = null;

    /** Configuration most recently passed to {@link #setConf(Configuration)}. */
    private Configuration conf = null;

    /** Last failure seen while loading split data in {@link #setConf(Configuration)}, if any. */
    private Exception loadFailure = null;

    public ZipNumPartitioner()
    {
    }

    /**
     * Partitions on the string form of the key; the value is not consulted.
     *
     * @param key       record key, must not be {@code null}
     * @param value     record value (unused)
     * @param numSplits total number of partitions
     * @return the partition index chosen by {@link #getPartition(String, int)}
     */
    @Override
    public int getPartition(K key, V value, int numSplits) {
        return getPartition(key.toString(), numSplits);
    }

    /**
     * Chooses a partition for the given key.
     *
     * <p>Only the portion of {@code searchKey} before its first space is compared against the
     * split points. When {@code numSplits <= 1} the result is always 0 and no split data is
     * needed. The returned index is not clamped to {@code numSplits}.
     *
     * @param searchKey key text to partition on
     * @param numSplits total number of partitions
     * @return index of the first split point that is {@code >=} the key, or the number of
     *         split points if the key is greater than all of them
     * @throws IllegalStateException if no split points are available (nothing was configured,
     *         or loading failed); the load failure, if recorded, is attached as the cause
     * @throws RuntimeException if split points computed from the cluster summary cannot be
     *         read or do not number exactly {@code numSplits - 1}
     */
    public int getPartition(String searchKey, int numSplits) {
        if (numSplits <= 1) {
            return 0;
        }

        if (splitList == null) {
            loadSplits(numSplits);
        }

        // Fail with a descriptive error instead of a bare NullPointerException in the search.
        if (splitList == null) {
            throw new IllegalStateException(
                    "No split points available: set " + ZIPNUM_PARTITIONER_CLUSTER + " or "
                    + ZIPNUM_PARTITIONER_JSON + " to a readable location", loadFailure);
        }

        int spaceIndex = searchKey.indexOf(' ');
        if (spaceIndex >= 0) {
            searchKey = searchKey.substring(0, spaceIndex);
        }

        return linSearchSplits(searchKey);
    }

    /**
     * Linear scan over {@link #splitList}.
     *
     * @param key key to locate
     * @return index of the first split point {@code s} with {@code key.compareTo(s) <= 0},
     *         or {@code splitList.size()} when there is none
     */
    protected int linSearchSplits(String key)
    {
        int index = 0;

        for (String split : splitList) {
            if (key.compareTo(split) <= 0) {
                return index;
            }
            index++;
        }

        return index;
    }

    /**
     * Binary search over {@link #splitList} using {@link Collections#binarySearch(List, Object)}.
     *
     * <p>Not called by {@link #getPartition(String, int)} in this class. Note that the result
     * differs from {@link #linSearchSplits(String)} for keys that are not themselves split
     * points: this method returns the insertion point minus one (floored at 0), whereas the
     * linear search returns the insertion point.
     *
     * @param key key to locate
     * @return the index of an exact match, otherwise the insertion point minus one, never
     *         less than 0
     */
    protected int binSearchSplits(String key)
    {
        int loc = Collections.binarySearch(splitList, key);

        if (loc < 0) {
            // binarySearch returns -(insertionPoint) - 1 on a miss.
            loc = (loc * -1) - 2;
            if (loc < 0) {
                loc = 0;
            }
        }

        return loc;
    }

    /**
     * Computes the split points from {@link #summary} and stores them in {@link #splitList}.
     *
     * <p>Does nothing when {@link #summary} is {@code null}. The first line produced by the
     * split iterator is discarded, and every remaining line is truncated at its first space.
     * {@link #splitList} is only assigned once the full list has been read and validated, so a
     * failed attempt never leaves a partial or wrongly sized list behind.
     *
     * @param numSplits total number of partitions; exactly {@code numSplits - 1} split points
     *                  are expected
     * @throws RuntimeException if the split iterator cannot be obtained, or if the number of
     *         split points read is not {@code numSplits - 1}
     */
    protected void loadSplits(int numSplits)
    {
        if (summary == null) {
            return;
        }

        CloseableIterator<String> splitIter = null;

        try {
            splitIter = summary.getSplitIterator(EMPTY_STRING, EMPTY_STRING, numSplits);

            List<String> loaded = new ArrayList<String>();

            // Skip first line, don't need the beginning line here
            if (splitIter.hasNext()) {
                splitIter.next();
            }

            while (splitIter.hasNext()) {
                String str = splitIter.next();
                int keyEndIndex = str.indexOf(' ');
                if (keyEndIndex >= 0) {
                    str = str.substring(0, keyEndIndex);
                }
                loaded.add(str);
            }

            if ((numSplits - 1) != loaded.size()) {
                throw new RuntimeException("splitList size != " + (numSplits - 1));
            }

            splitList = loaded;
        } catch (IOException e) {
            throw new RuntimeException("Unable to read split points from cluster summary", e);
        } finally {
            if (splitIter != null) {
                try {
                    splitIter.close();
                } catch (IOException e) {
                    // Closing is best-effort; the split data has already been consumed.
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * @return the configuration most recently passed to {@link #setConf(Configuration)}, or
     *         {@code null} if none has been set
     */
    @Override
    public Configuration getConf() {
        return conf;
    }

    /**
     * Stores the configuration and prepares the split source it describes.
     *
     * <p>If {@link #ZIPNUM_PARTITIONER_CLUSTER} is set, the summary file is opened and nothing
     * else is read. Otherwise, if {@link #ZIPNUM_PARTITIONER_JSON} is set and no split list has
     * been loaded yet, the splits file is read immediately. Load failures are printed and
     * remembered rather than thrown, so that a job with a single partition (which never needs
     * split data) is unaffected; they surface from {@link #getPartition(String, int)} instead.
     *
     * @param conf configuration to read; a {@code null} value is stored and otherwise ignored
     */
    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;

        if (conf == null) {
            return;
        }

        String clusterSummary = conf.get(ZIPNUM_PARTITIONER_CLUSTER);

        if (clusterSummary != null) {
            try {
                summary = new SortedTextFile(clusterSummary);
            } catch (IOException e) {
                loadFailure = e;
                e.printStackTrace();
            }
            return;
        }

        splitsFile = conf.get(ZIPNUM_PARTITIONER_JSON);

        // Nothing to load, or already loaded: don't reload multiple times.
        if (splitsFile == null || splitList != null) {
            return;
        }

        try {
            if (splitsFile.endsWith(".json")) {
                loadJsonSplits(splitsFile, conf);
            } else {
                loadTextSplits(splitsFile, conf);
            }
        } catch (Exception e) {
            // Broad catch kept on purpose: covers I/O errors and any JSON parsing failure.
            loadFailure = e;
            e.printStackTrace();
        }
    }

    /**
     * Reads split points from a text file, one per line, with lines taken verbatim.
     *
     * <p>{@link #splitList} is replaced only after the whole file has been read.
     *
     * @param splitsFile path of the file, resolved against the file system chosen by {@code conf}
     * @param conf       configuration used to obtain the file system
     * @throws IOException if the file cannot be opened or read
     */
    protected void loadTextSplits(String splitsFile, Configuration conf) throws IOException
    {
        Path splitsPath = new Path(splitsFile);
        FileSystem fs = splitsPath.getFileSystem(conf);

        FSDataInputStream inputStream = fs.open(splitsPath);

        try {
            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(inputStream, SPLITS_CHARSET));

            List<String> loaded = new ArrayList<String>();
            String line;

            while ((line = reader.readLine()) != null) {
                loaded.add(line);
            }

            splitList = loaded;
        } finally {
            // Closing the underlying stream releases the reader's only resource.
            inputStream.close();
        }
    }

    /**
     * Reads split points from a JSON file.
     *
     * <p>The document root must be a JSON array whose element at index 1 is the array of split
     * strings. The first and last entries of that array are skipped; an array with fewer than
     * three entries therefore yields an empty split list. {@link #splitList} is replaced only
     * after the whole array has been read.
     *
     * @param splitsFile path of the file, resolved against the file system chosen by {@code conf}
     * @param conf       configuration used to obtain the file system
     * @throws IOException   if the file cannot be opened or read
     * @throws JSONException if the content does not have the expected structure
     */
    protected void loadJsonSplits(String splitsFile, Configuration conf) throws IOException, JSONException
    {
        Path splitsPath = new Path(splitsFile);
        FileSystem fs = splitsPath.getFileSystem(conf);

        FSDataInputStream inputStream = fs.open(splitsPath);

        try {
            JSONTokener tokener = new JSONTokener(new InputStreamReader(inputStream, SPLITS_CHARSET));
            JSONArray root = new JSONArray(tokener);

            // 0th object is number of lines, actual split points are 1th index in the root array
            JSONArray splitsArray = root.getJSONArray(1);
            int count = splitsArray.length();

            // Assuming the first and last values of the array are empty lines.
            // Clamp the capacity so a short array does not raise IllegalArgumentException.
            List<String> loaded = new ArrayList<String>(Math.max(0, count - 2));

            for (int i = 1; i < count - 1; i++) {
                loaded.add(splitsArray.getString(i));
            }

            splitList = loaded;
        } finally {
            inputStream.close();
        }
    }
}