package org.archive.hadoop.mapreduce;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.archive.hadoop.util.PartitionName;
import org.archive.url.SURT;
import org.archive.util.io.BytesReadObserver;
import org.archive.util.io.MultiMemberOpenJDKGZIPInputStream;
import org.archive.util.io.NotifyingInputStream;
import org.archive.util.iterator.AbstractPeekableIterator;
import org.archive.util.iterator.CachingStringFilter;
import org.archive.util.iterator.FilterStringIterator;
import org.archive.util.iterator.SortedCompositeIterator;
import org.archive.util.iterator.StringFilter;
import org.archive.util.iterator.StringTransformer;
import org.archive.util.iterator.TransformingPrefixStringFilter;
public class SortMergeInputFormat extends InputFormat<Long, Text> {
private final static Logger LOG =
Logger.getLogger(SortMergeInputFormat.class.getName());
private final static Charset UTF8 = Charset.forName("UTF-8");
private static final String SORT_MERGE_INPUT_PATH_CONFIG =
"sort.merge.input.path";
private static final String SORT_MERGE_INPUT_COMPRESSED_CONFIG =
"sort.merge.input.compressed";
private static final String SORT_MERGE_INPUT_FILTER_FIELD =
"sort.merge.input.filter.field";
private static final String SORT_MERGE_INPUT_FILTER_PATH =
"sort.merge.input.filter.path";
// TODO: Make this configurable by class:
private static final String SORT_MERGE_INPUT_FILTER_SURT =
"sort.merge.input.filter.surt";
public static void setInputPath(Configuration conf, String path) throws IOException {
conf.set(SORT_MERGE_INPUT_PATH_CONFIG, path);
addPartitionOutputNames(conf, new Path(path));
}
public static Path getInputPath(Configuration conf) {
return new Path(conf.get(SORT_MERGE_INPUT_PATH_CONFIG));
}
// crazy hack - the Configuration doesn't get serialized after getSplits().
public static void addPartitionOutputNames(Configuration conf, Path path)
throws IOException {
Path inputPath = getInputPath(conf);
FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
FSDataInputStream fsdis = fs.open(inputPath);
InputStreamReader isr = new InputStreamReader(fsdis, Charset.forName("UTF-8"));
BufferedReader br = new BufferedReader(isr);
int i = 0;
while(true) {
String line = br.readLine();
if(line == null) {
break;
}
String parts[] = line.split("\\s");
PartitionName.setPartitionOutputName(conf, i, parts[0]);
i++;
}
br.close();
}
public static void setCompressedInput(Configuration conf,
boolean compressed) {
conf.setBoolean(SORT_MERGE_INPUT_COMPRESSED_CONFIG, compressed);
}
public static boolean getCompressedInput(Configuration conf) {
return conf.getBoolean(SORT_MERGE_INPUT_COMPRESSED_CONFIG,false);
}
public static void setFilterField(Configuration conf, int field) {
conf.setInt(SORT_MERGE_INPUT_FILTER_FIELD, field);
}
public static int getFilterField(Configuration conf) {
return conf.getInt(SORT_MERGE_INPUT_FILTER_FIELD,-1);
}
public static void setFilterPath(Configuration conf, String path) {
conf.set(SORT_MERGE_INPUT_FILTER_PATH, path);
}
public static Path getFilterPath(Configuration conf) {
return new Path(conf.get(SORT_MERGE_INPUT_FILTER_PATH));
}
public static void setFilterSURT(Configuration conf, boolean isSURT) {
conf.setBoolean(SORT_MERGE_INPUT_FILTER_SURT, isSURT);
}
public static boolean getFilterSURT(Configuration conf) {
return conf.getBoolean(SORT_MERGE_INPUT_FILTER_SURT,false);
}
@Override
public RecordReader<Long, Text> createRecordReader(InputSplit arg0,
TaskAttemptContext arg1) throws IOException, InterruptedException {
return new SortMergeRecordReader();
}
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException,
InterruptedException {
Configuration conf = context.getConfiguration();
Path inputPath = getInputPath(conf);
FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
FSDataInputStream fsdis = fs.open(inputPath);
InputStreamReader isr = new InputStreamReader(fsdis, Charset.forName("UTF-8"));
BufferedReader br = new BufferedReader(isr);
ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
long pos = Integer.MAX_VALUE - 1;
int i = 0;
while(true) {
String line = br.readLine();
if(line == null) {
break;
}
SortMergeInputSplit s = parseInputSplit(fs,pos,line);
PartitionName.setPartitionOutputName(conf, i, s.getOutputName());
i++;
pos--;
splits.add(s);
}
br.close();
return splits;
}
private SortMergeInputSplit parseInputSplit(FileSystem fs, long pos, String line) throws IOException {
String parts[] = line.split("\\s");
FileStatus biggest = null;
ArrayList<Path> inputPaths = new ArrayList<Path>();
String outputPath = null;
for(String part : parts) {
if(outputPath == null) {
outputPath = part;
continue;
}
Path path = new Path(part);
FileStatus status = fs.getFileStatus(path);
if(status.isDir()) {
throw new IOException(String.format("Part(%s) is a Directory!",
part));
}
if(biggest == null) {
biggest = status;
} else {
if(status.getLen() > biggest.getLen()) {
biggest = status;
}
}
inputPaths.add(path);
}
BlockLocation[] blockLocations =
fs.getFileBlockLocations(biggest, 0, biggest.getBlockSize());
LOG.warning(String.format("Created InputSplit(%d) Hosts(%s) paths(%s)",
biggest.getLen(),
StringUtils.join(blockLocations[0].getHosts(), ","),
StringUtils.join(inputPaths, ",")));
String[] inp = new String[inputPaths.size()];
for(int i = 0; i< inputPaths.size(); i++) {
inp[i] = inputPaths.get(i).toString();
}
return new SortMergeInputSplit(pos, outputPath,
blockLocations[0].getHosts(), inp);
}
public class SortMergeRecordReader extends RecordReader<Long, Text> implements BytesReadObserver {
float progress = 0.0f;
long totalBytes = 0;
long bytesRead = 0;
Long key = null;
Text value = null;
ArrayList<BufferedReader> readers;
Iterator<String> itr;
public void notifyBytesRead(int read) {
if(read > 0) {
bytesRead += read;
progress = (float) bytesRead / (float) totalBytes;
}
}
@Override
public void close() throws IOException {
for(BufferedReader reader : readers) {
reader.close();
}
}
@Override
public Long getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return progress;
}
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
SortMergeInputSplit smSplit;
if(split instanceof SortMergeInputSplit) {
smSplit = (SortMergeInputSplit) split;
} else {
throw new IOException("Split not right class!?");
}
Configuration conf = context.getConfiguration();
readers = new ArrayList<BufferedReader>();
SortedCompositeIterator<String> itrS =
new SortedCompositeIterator<String>(new Comparator<String>() {
public int compare(String o1, String o2) {
return o1.compareTo(o2);
}
});
LOG.warning(String.format("Split - components(%s)\n",
StringUtils.join(smSplit.getPaths(), ',')));
for(String pathStr : smSplit.getPaths()) {
Path path = new Path(pathStr);
FileSystem fs = path.getFileSystem(conf);
FSDataInputStream fsdis = fs.open(path);
FileStatus status = fs.getFileStatus(path);
totalBytes += status.getLen();
InputStream is = new NotifyingInputStream(fsdis,this);
if(path.toString().endsWith(".gz") ||
getCompressedInput(conf)) {
is = new MultiMemberOpenJDKGZIPInputStream(is);
LOG.warning(String.format("Opened(%s) as GZ",pathStr));
} else {
LOG.warning(String.format("Opened(%s) as RAW",pathStr));
}
InputStreamReader isr = new InputStreamReader(is, UTF8);
BufferedReader br = new BufferedReader(isr);
readers.add(br);
itrS.addIterator(AbstractPeekableIterator.wrapReader(br));
}
int filterField = getFilterField(conf);
if(filterField >= 0) {
Path filterPath = getFilterPath(conf);
boolean isSURT = getFilterSURT(conf);
StringTransformer stringTransformer = null;
if(isSURT) {
stringTransformer = new StringTransformer() {
public String transform(String input) {
return SURT.toSURT(input);
}
};
}
FileSystem fs = filterPath.getFileSystem(conf);
FSDataInputStream fsdis = fs.open(filterPath);
InputStreamReader isr = new InputStreamReader(fsdis, UTF8);
BufferedReader br = new BufferedReader(isr);
Iterator<String> i = AbstractPeekableIterator.wrapReader(br);
ArrayList<String> al = new ArrayList<String>();
while(i.hasNext()) {
al.add(i.next());
}
StringFilter stringFilter =
new TransformingPrefixStringFilter(al, stringTransformer);
// TODO: make this, and the number of cached items configurable:
CachingStringFilter cachingFilter =
new CachingStringFilter(stringFilter, 100);
FilterStringIterator itrF =
new FilterStringIterator(itrS, cachingFilter);
itrF.setField(filterField);
itr = itrF;
} else {
itr = itrS;
}
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if(key == null) {
key = new Long(0);
}
if(value == null) {
value = new Text();
}
if(itr.hasNext()) {
key = new Long(key.longValue() + 1);
value.set(itr.next());
return true;
}
return false;
}
}
}