package org.hipi.imagebundle.mapreduce;
import org.hipi.image.HipiImage;
import org.hipi.image.HipiImageHeader;
import org.hipi.imagebundle.HipiImageBundle;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* Extends {@link FileInputFormat} to process one or more {@link
* HipiImageBundle} (HIB) files as input, generating the {@link
* InputSplit} objects for a MapReduce job along with the
* corresponding {@link RecordReader}.
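*
* <p>A typical driver wires this class in roughly as follows (a minimal
* sketch; the job name and input path are illustrative):</p>
* <pre>{@code
* Job job = Job.getInstance(new Configuration(), "hib-example");
* job.setInputFormatClass(HibInputFormat.class);
* FileInputFormat.setInputPaths(job, new Path("sample.hib"));
* }</pre>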
*/
public class HibInputFormat extends FileInputFormat<HipiImageHeader, HipiImage> {
/**
* Creates a {@link HibRecordReader}
*/
@Override
public RecordReader<HipiImageHeader, HipiImage> createRecordReader(InputSplit split,
TaskAttemptContext context)
throws IOException, InterruptedException {
return new HibRecordReader();
}
/**
* Static replacement for the protected, non-static getBlockIndex method
* that is part of Hadoop's {@link FileInputFormat}; using that method
* would prevent computeSplits from being static.
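*
* <p>For example, with a 128 MB block size, an offset of 200 MB falls in
* block index 1, since that block covers the byte range [128 MB, 256 MB).</p>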
*/
protected static int staticGetBlockIndex(BlockLocation[] blkLocations,
long offset) {
for (int i = 0; i < blkLocations.length; i++) {
// is the offset inside this block?
if ((blkLocations[i].getOffset() <= offset) &&
(offset < blkLocations[i].getOffset() + blkLocations[i].getLength())) {
return i;
}
}
BlockLocation last = blkLocations[blkLocations.length - 1];
long fileLength = last.getOffset() + last.getLength() - 1;
throw new IllegalArgumentException("Offset " + offset +
" is outside of file (0.." +
fileLength + ")");
}
/**
* Static public method that does the heavy lifting of computing InputSplits for a list
* of HIB files. It is static to allow code reuse: one can imagine many extensions of
* HibInputFormat that produce different record types (raw bytes, UCharImage, FloatImage,
* OpenCV types, etc.). See, for example, JpegFromHibInputFormat.java.
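*
* <p>A hypothetical extension might reuse computeSplits while supplying its own
* record reader (the JpegRecordReader and BytesWritable value type here are
* illustrative, not part of this class):</p>
* <pre>{@code
* public class JpegFromHibInputFormat extends FileInputFormat<HipiImageHeader, BytesWritable> {
*   public RecordReader<HipiImageHeader, BytesWritable> createRecordReader(
*       InputSplit split, TaskAttemptContext context) {
*     return new JpegRecordReader(); // hypothetical reader emitting raw JPEG bytes
*   }
*   public List<InputSplit> getSplits(JobContext job) throws IOException {
*     return HibInputFormat.computeSplits(job, listStatus(job));
*   }
* }
* }</pre>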
*/
static public List<InputSplit> computeSplits(JobContext job, List<FileStatus> inputFiles)
throws IOException {
// Read number of requested map tasks from job configuration
Configuration conf = job.getConfiguration();
int numMapTasks = conf.getInt("hipi.map.tasks", 0);
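// (a value of 0, the default, lets this method choose automatically; a driver
// might request ten map tasks with conf.setInt("hipi.map.tasks", 10))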
// Initialize list of InputSplits
List<InputSplit> splits = new ArrayList<InputSplit>();
// Iterate over each input HIB file
for (FileStatus file : inputFiles) {
// Get path to file and file system object on HDFS
Path path = file.getPath();
FileSystem fs = path.getFileSystem(conf);
// Create a HIB object for the file and open it for reading
HipiImageBundle hib = new HipiImageBundle(path, conf);
hib.openForRead();
// Get image block offsets (should be in ascending order)
List<Long> offsets = hib.readAllOffsets();
BlockLocation[] blkLocations = fs.getFileBlockLocations(hib.getDataFileStatus(), 0, offsets.get(offsets.size() - 1));
if (numMapTasks == 0) {
// Determine number of map tasks automatically
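// Strategy: walk the HDFS blocks in order and, for each block, advance
// through the image offsets until the block boundary is crossed; each
// resulting byte range [lastOffset, currentOffset) becomes one InputSplit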
int i = 0, b = 0;
long lastOffset = 0, currentOffset = 0;
for (; (b < blkLocations.length) && (i < offsets.size()); b++) {
long next = blkLocations[b].getOffset() + blkLocations[b].getLength();
while (currentOffset < next && i < offsets.size()) {
currentOffset = offsets.get(i);
i++;
}
String[] hosts = null;
if (currentOffset > next) {
// The last image spills past this block's end, so gather hosts from
// every block the split touches (b through endIndex inclusive)
Set<String> hostSet = new HashSet<String>();
int endIndex = staticGetBlockIndex(blkLocations, currentOffset - 1);
for (int j = b; j <= endIndex; j++) {
String[] blkHosts = blkLocations[j].getHosts();
for (int k = 0; k < blkHosts.length; k++)
hostSet.add(blkHosts[k]);
}
hosts = hostSet.toArray(new String[hostSet.size()]);
} else { // currentOffset <= next: block boundary hit exactly, or offsets exhausted
hosts = blkLocations[b].getHosts();
}
splits.add(new FileSplit(hib.getDataFileStatus().getPath(), lastOffset, currentOffset - lastOffset, hosts));
lastOffset = currentOffset;
}
System.out.println("Spawned " + b + "map tasks");
} else {
// User specified number of map tasks
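// Strategy: divide the images as evenly as possible across the requested
// number of map tasks, recomputing ceil(imageRemaining / taskRemaining)
// images per split on each iteration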
int imageRemaining = offsets.size();
int i = 0, taskRemaining = numMapTasks;
long lastOffset = 0, currentOffset;
while (imageRemaining > 0) {
int numImages = imageRemaining / taskRemaining;
if (imageRemaining % taskRemaining > 0)
numImages++;
int next = Math.min(offsets.size() - i, numImages) - 1;
int startIndex = staticGetBlockIndex(blkLocations, lastOffset);
currentOffset = offsets.get(i + next);
int endIndex = staticGetBlockIndex(blkLocations, currentOffset - 1);
ArrayList<String> hosts = new ArrayList<String>();
// Gather hosts from every block spanned by this split
for (int j = startIndex; j <= endIndex; j++) {
String[] blkHosts = blkLocations[j].getHosts();
for (int k = 0; k < blkHosts.length; k++)
hosts.add(blkHosts[k]);
}
splits.add(new FileSplit(hib.getDataFileStatus().getPath(), lastOffset, currentOffset - lastOffset, hosts.toArray(new String[hosts.size()])));
lastOffset = currentOffset;
i += next + 1;
taskRemaining--;
imageRemaining -= numImages;
System.out.println("imageRemaining: " + imageRemaining + "\ttaskRemaining: " + taskRemaining + "\tlastOffset: " + lastOffset + "\ti: " + i);
}
}
// Close HIB
hib.close();
} // for (FileStatus file : inputFiles)
return splits;
}
/**
* Partitions the input HIB files across map tasks in a way that attempts to maximize
* compute and data co-locality. To this end, {@link InputSplit}s are created such that
* each map task processes the images within one Hadoop block location (a single block
* may contain multiple images). The behavior of this method is sensitive to the HDFS
* block size in the Hadoop configuration (smaller blocks yield more map tasks, but may
* also improve cluster utilization). Note that the real work is done in the static
* public method computeSplits().
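*
* <p>Setting {@code hipi.map.tasks} in the job configuration bypasses this
* block-aligned strategy and instead partitions the images evenly across the
* requested number of map tasks.</p>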
*/
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
return HibInputFormat.computeSplits(job, listStatus(job));
}
}