package com.manning.hip.ch3.binary;
import com.manning.hip.common.HadoopCompat;
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.util.*;
/**
 * An input format for binary files in which each record is stored as a
 * 4-byte big-endian length prefix followed by that many bytes of payload.
 * The default splits computed by {@link FileInputFormat} are realigned so
 * that every split starts and ends exactly on a record boundary.
 */
public class CustomBinaryInputFormat extends
FileInputFormat<LongWritable, BytesWritable> {

  /** Size in bytes of the int length prefix written before each record. */
  private static final int LENGTH_PREFIX_BYTES = 4;

  @Override
  public RecordReader<LongWritable, BytesWritable>
  createRecordReader(InputSplit split,
                     TaskAttemptContext context) {
    return new CustomBinaryRecordReader();
  }

  /**
   * Scans the whole file and collects the byte offset at which each record
   * starts. (The method name retains its original misspelling for backward
   * compatibility with existing callers.)
   *
   * @param conf  job configuration used to resolve the file system
   * @param input path of the file to index
   * @return record start offsets in ascending order; empty for an empty file
   * @throws IOException if the file cannot be opened or read
   */
  public long[] getFileOffest(Configuration conf, Path input)
      throws IOException {
    List<Long> offsets = new ArrayList<Long>();
    DataInputStream is = new DataInputStream(
        input.getFileSystem(conf).open(input));
    try {
      long offset = 0;
      while (true) {
        int dataLen = is.readInt();
        // skipBytes may skip fewer than requested at end-of-file; a short
        // skip means the final record is truncated, so stop indexing rather
        // than recording an offset for an unreadable record.
        if (is.skipBytes(dataLen) < dataLen) {
          break;
        }
        offsets.add(offset);
        // Advance past the length prefix plus the record body.
        offset += dataLen + LENGTH_PREFIX_BYTES;
      }
    } catch (EOFException ignored) {
      // Expected: readInt reached the end of the file, the scan is complete.
    } finally {
      IOUtils.closeStream(is);
    }
    return ArrayUtils.toPrimitive(offsets.toArray(new Long[0]));
  }

  /**
   * Moves {@code start} forward to the first record boundary at or after it.
   * A start of 0 is always record-aligned and is returned unchanged.
   *
   * @param offsets record start offsets, ascending
   * @param start   proposed split start (inclusive)
   * @param end     proposed split end (exclusive)
   * @return the aligned start, or -1 if no record begins inside [start, end)
   */
  public long alignSliceStartToIndex(long[] offsets, long start, long end) {
    if (start != 0) {
      long newStart = findNextPosition(offsets, start);
      // No boundary inside this slice: it contains no complete record.
      if (newStart == -1 || newStart >= end) {
        return -1;
      }
      start = newStart;
    }
    return start;
  }

  /**
   * Moves {@code end} forward to the next record boundary, or to the end of
   * the file when no record starts at or after {@code end}.
   *
   * @param offsets  record start offsets, ascending
   * @param end      proposed split end
   * @param fileSize total length of the file in bytes
   * @return the aligned end position (never -1)
   */
  public long alignSliceEndToIndex(long[] offsets, long end, long fileSize) {
    long newEnd = findNextPosition(offsets, end);
    // No record starts at or after 'end', so the slice runs to EOF.
    return newEnd != -1 ? newEnd : fileSize;
  }

  /**
   * Returns the smallest offset that is greater than or equal to
   * {@code pos}, or -1 when every offset is smaller than {@code pos}.
   *
   * @param offsets record start offsets, ascending (required by binarySearch)
   * @param pos     position to search from
   */
  public long findNextPosition(long[] offsets, long pos) {
    int block = Arrays.binarySearch(offsets, pos);
    if (block >= 0) {
      // Direct hit on a record boundary.
      return offsets[block];
    }
    // On a miss, binarySearch returns -(insertionPoint) - 1; the insertion
    // point indexes the first offset strictly greater than pos.
    int insertion = -(block + 1);
    return insertion < offsets.length ? offsets[insertion] : -1;
  }

  /**
   * Computes the default splits, then snaps each one to record boundaries
   * using the per-file offset index. Splits that contain no complete record
   * are dropped.
   *
   * @throws IOException if a file system or file status lookup fails
   */
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> defaultSplits = super.getSplits(job);
    List<InputSplit> result = new ArrayList<InputSplit>();
    Path prevPath = null;
    long[] offsets = null;
    for (InputSplit genericSplit : defaultSplits) {
      FileSplit fileSplit = (FileSplit) genericSplit;
      Path file = fileSplit.getPath();
      // Splits for the same file arrive consecutively, so the offset index
      // only needs rebuilding when we move on to a different file.
      if (prevPath == null || !prevPath.equals(file)) {
        prevPath = file;
        offsets = getFileOffest(HadoopCompat.getConfiguration(job), file);
      }
      long start = fileSplit.getStart();
      long end = start + fileSplit.getLength();
      long newStart = alignSliceStartToIndex(offsets, start, end);
      long newEnd = alignSliceEndToIndex(offsets, end,
          file.getFileSystem(HadoopCompat.getConfiguration(job))
              .getFileStatus(file).getLen());
      // BUG FIX: the original test was 'newStart != -1 || newEnd != -1'.
      // Because alignSliceEndToIndex never returns -1, that condition was
      // always true, and a split whose newStart was -1 (no record begins
      // inside it) was emitted with start = -1 and a bogus length. Only emit
      // a split when it is properly aligned and non-empty.
      if (newStart != -1 && newEnd > newStart) {
        result.add(new FileSplit(file, newStart, newEnd - newStart,
            fileSplit.getLocations()));
        System.out.println(
            "Adding split start = " + newStart + " end = " + newEnd);
      }
    }
    return result;
  }

  /**
   * Delegates to the superclass: files are splittable, and {@link #getSplits}
   * realigns the resulting splits to record boundaries afterwards.
   */
  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    return super.isSplitable(context, file);
  }
}