/**
* Copyright by Chinamobile
*
* BSPFileInputFormat.java
*/
package com.chinamobile.bcbsp.io;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.InvalidInputException;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;
import com.chinamobile.bcbsp.util.BSPJob;
import com.chinamobile.bcbsp.Constants;
/**
 * BSPFileInputFormat
 *
 * Base class for input formats that read their data from a Hadoop file
 * system, such as HDFS. It lists the input files configured for a job and
 * cuts them into {@link FileSplit}s; subclasses supply the record reader.
 *
 * @param <K>
 *            key type produced by the record reader
 * @param <V>
 *            value type produced by the record reader
 */
public abstract class BSPFileInputFormat<K, V> extends InputFormat<K, V> {

    private static final Log LOG = LogFactory.getLog(BSPFileInputFormat.class);

    /**
     * A remainder of up to 1.1 times the split size is kept as one trailing
     * split instead of being cut once more (10% slop).
     */
    private static final double SPLIT_SLOP = 1.1;

    /** Configuration key holding the user supplied {@link PathFilter} class. */
    private static final String INPUT_PATH_FILTER_CLASS = "mapred.input.pathFilter.class";

    /** Configuration key holding the minimum split size in bytes. */
    private static final String MIN_SPLIT_SIZE = "mapred.min.split.size";

    /** Configuration key holding the maximum split size in bytes. */
    private static final String MAX_SPLIT_SIZE = "mapred.max.split.size";

    /** Number of bytes in one megabyte, used for log output only. */
    private static final long BYTES_PER_MB = 1024L * 1024L;

    /** Rejects paths whose last component starts with "_" or ".". */
    private static final PathFilter HIDDEN_FILE_FILTER = new PathFilter() {
        @Override
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };

    /**
     * Proxy PathFilter that accepts a path only if all filters given in the
     * constructor do. Used by {@link #listStatus(BSPJob)} to apply the
     * built-in hidden-file filter together with a user provided one (if any).
     */
    private static class MultiPathFilter implements PathFilter {

        private final List<PathFilter> filters;

        public MultiPathFilter(List<PathFilter> filters) {
            this.filters = filters;
        }

        @Override
        public boolean accept(Path path) {
            for (PathFilter filter : filters) {
                if (!filter.accept(path)) {
                    return false;
                }
            }
            return true;
        }
    }

    /**
     * Get the lower bound on split size imposed by the format.
     *
     * @return the number of bytes of the minimal split for this format
     */
    protected long getFormatMinSplitSize() {
        return 1;
    }

    /**
     * Is the given filename splitable? This base implementation always
     * answers true; subclasses may override it, e.g. for stream compressed
     * files that cannot be split.
     *
     * @param job
     *            the job configuration
     * @param filename
     *            the file name to check
     * @return is this file splitable?
     */
    protected boolean isSplitable(BSPJob job, Path filename) {
        return true;
    }

    /**
     * Set a PathFilter to be applied to the input paths for the map-reduce
     * job.
     *
     * @param job
     *            the job to modify
     * @param filter
     *            the PathFilter class use for filtering the input paths.
     */
    public static void setInputPathFilter(Job job,
            Class<? extends PathFilter> filter) {
        job.getConfiguration().setClass(INPUT_PATH_FILTER_CLASS, filter,
                PathFilter.class);
    }

    /**
     * Set the minimum input split size.
     *
     * TODO: this setting is currently disabled; {@link #getSplits(BSPJob)}
     * does not read it.
     *
     * @param job
     *            the job to modify
     * @param size
     *            the minimum size
     */
    public static void setMinInputSplitSize(Job job, long size) {
        job.getConfiguration().setLong(MIN_SPLIT_SIZE, size);
    }

    /**
     * Get the minimum split size.
     *
     * TODO: this setting is currently disabled; {@link #getSplits(BSPJob)}
     * does not read it.
     *
     * @param job
     *            the job
     * @return the minimum number of bytes that can be in a split (1 if unset)
     */
    public static long getMinSplitSize(Job job) {
        return job.getConfiguration().getLong(MIN_SPLIT_SIZE, 1L);
    }

    /**
     * Set the maximum split size.
     *
     * TODO: this setting is currently disabled; {@link #getSplits(BSPJob)}
     * does not read it.
     *
     * @param job
     *            the job to modify
     * @param size
     *            the maximum split size
     */
    public static void setMaxInputSplitSize(Job job, long size) {
        job.getConfiguration().setLong(MAX_SPLIT_SIZE, size);
    }

    /**
     * Get the maximum split size.
     *
     * TODO: this setting is currently disabled; {@link #getSplits(BSPJob)}
     * does not read it.
     *
     * @param context
     *            the job to look at.
     * @return the maximum number of bytes a split can include
     *         ({@link Long#MAX_VALUE} if unset)
     */
    public static long getMaxSplitSize(Job context) {
        return context.getConfiguration().getLong(MAX_SPLIT_SIZE,
                Long.MAX_VALUE);
    }

    /**
     * Get a PathFilter instance of the filter set for the input paths.
     *
     * @param context
     *            the job whose configuration is inspected
     * @return the PathFilter instance set for the job, NULL if none has been
     *         set.
     */
    public static PathFilter getInputPathFilter(JobContext context) {
        return newInputPathFilter(context.getConfiguration());
    }

    /**
     * Instantiate the user supplied path filter named in the configuration.
     *
     * @param conf
     *            the configuration to read the filter class from
     * @return a new filter instance, or null if no filter class is configured
     */
    private static PathFilter newInputPathFilter(Configuration conf) {
        Class<?> filterClass = conf.getClass(INPUT_PATH_FILTER_CLASS, null,
                PathFilter.class);
        if (filterClass == null) {
            return null;
        }
        return (PathFilter) ReflectionUtils.newInstance(filterClass, conf);
    }

    /**
     * Generate the list of files and make them into FileSplits.
     *
     * Zero length files yield one empty split without hosts, non-splitable
     * files yield one split covering the whole file, and splitable files are
     * cut into pieces of the resolved split size with a trailing remainder
     * split.
     *
     * @param job
     *            the job to compute the splits for
     * @return the splits of all input files
     * @throws IOException
     *             if the input files cannot be listed or inspected
     */
    public List<InputSplit> getSplits(BSPJob job) throws IOException {
        List<InputSplit> splits = new ArrayList<InputSplit>();
        for (FileStatus file : listStatus(job)) {
            Path path = file.getPath();
            long length = file.getLen();
            if (length == 0) {
                // Create empty hosts array for zero length files
                splits.add(new FileSplit(path, 0, length, new String[0]));
                continue;
            }
            FileSystem fs = path.getFileSystem(job.getConf());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0,
                    length);
            if (!isSplitable(job, path)) {
                splits.add(new FileSplit(path, 0, length, blkLocations[0]
                        .getHosts()));
                continue;
            }
            long splitSize = resolveSplitSize(job, file.getBlockSize(), length);
            LOG.info("[Split Size] " + (splitSize / BYTES_PER_MB) + " MB");
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                long offset = length - bytesRemaining;
                int blkIndex = getBlockIndex(blkLocations, offset);
                splits.add(new FileSplit(path, offset, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                // Use the hosts of the block the remainder starts in, the same
                // rule the loop above applies, rather than always the last
                // block of the file.
                long offset = length - bytesRemaining;
                int blkIndex = getBlockIndex(blkLocations, offset);
                splits.add(new FileSplit(path, offset, bytesRemaining,
                        blkLocations[blkIndex].getHosts()));
            }
        }
        LOG.info("[Split Number] " + splits.size());
        return splits;
    }

    /**
     * Work out the split size for one file: the job's split size (or the
     * file's block size when the job's split size is 0) multiplied by the
     * job's split factor.
     *
     * A non-positive result would make the split loop spin forever (zero) or
     * silently disable splitting (negative), so it is replaced by the block
     * size, or by the file length if the block size is unusable too.
     *
     * @param job
     *            the job providing split size and split factor
     * @param blockSize
     *            block size of the file in bytes
     * @param fileLength
     *            length of the file in bytes, expected to be positive
     * @return a strictly positive split size in bytes
     */
    private long resolveSplitSize(BSPJob job, long blockSize, long fileLength) {
        int factor = job.getInt(Constants.USER_BC_BSP_JOB_SPLIT_FACTOR, 1);
        long configured = job.getSplitSize();
        long base = (configured == 0L) ? blockSize : configured;
        long splitSize = base * factor;
        if (splitSize <= 0L) {
            long fallback = (blockSize > 0L) ? blockSize : fileLength;
            LOG.warn("Invalid split size " + splitSize + " (split size "
                    + configured + ", split factor " + factor
                    + "), using " + fallback + " bytes instead");
            splitSize = fallback;
        }
        return splitSize;
    }

    /**
     * List input directories. Subclasses may override to, e.g., select only
     * files matching a regular expression.
     *
     * Every input path is expanded as a glob; matched directories are listed
     * one level deep. Hidden files are skipped, and so is anything rejected
     * by the path filter class configured for the job, if there is one.
     *
     * @param job
     *            the job to list input paths for
     * @return array of FileStatus objects
     * @throws IOException
     *             if no input path is configured, or if an input path does
     *             not exist or matches no files.
     */
    protected List<FileStatus> listStatus(BSPJob job) throws IOException {
        Path[] dirs = getInputPaths(job);
        if (dirs.length == 0) {
            throw new IOException("No input paths specified in job");
        }
        // creates a MultiPathFilter with the hiddenFileFilter and the
        // user provided one (if any).
        List<PathFilter> filters = new ArrayList<PathFilter>();
        filters.add(HIDDEN_FILE_FILTER);
        PathFilter jobFilter = newInputPathFilter(job.getConf());
        if (jobFilter != null) {
            filters.add(jobFilter);
        }
        PathFilter inputFilter = new MultiPathFilter(filters);

        List<FileStatus> result = new ArrayList<FileStatus>();
        // Problems are collected so that all bad input paths are reported
        // together instead of failing on the first one.
        List<IOException> errors = new ArrayList<IOException>();
        for (Path p : dirs) {
            FileSystem fs = p.getFileSystem(job.getConf());
            FileStatus[] matches = fs.globStatus(p, inputFilter);
            if (matches == null) {
                errors.add(new IOException("Input path does not exist: " + p));
            } else if (matches.length == 0) {
                errors.add(new IOException("Input Pattern " + p
                        + " matches 0 files"));
            } else {
                for (FileStatus globStat : matches) {
                    if (globStat.isDir()) {
                        for (FileStatus stat : fs.listStatus(
                                globStat.getPath(), inputFilter)) {
                            result.add(stat);
                        }
                    } else {
                        result.add(globStat);
                    }
                }
            }
        }
        if (!errors.isEmpty()) {
            throw new InvalidInputException(errors);
        }
        LOG.info("Total input paths to process : " + result.size());
        return result;
    }

    /**
     * Clamp the block size into the range given by the minimum and maximum
     * split size; the minimum wins if the two bounds contradict each other.
     *
     * @param blockSize
     *            the block size in bytes
     * @param minSize
     *            the minimum split size in bytes
     * @param maxSize
     *            the maximum split size in bytes
     * @return max(minSize, min(maxSize, blockSize))
     */
    protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }

    /**
     * Find the block that contains the given file offset.
     *
     * @param blkLocations
     *            the block locations of the file
     * @param offset
     *            the byte offset to look up
     * @return index into blkLocations of the block containing the offset
     * @throws IllegalArgumentException
     *             if no block contains the offset
     */
    protected int getBlockIndex(BlockLocation[] blkLocations, long offset) {
        for (int i = 0; i < blkLocations.length; i++) {
            // is the offset inside this block?
            if ((blkLocations[i].getOffset() <= offset)
                    && (offset < blkLocations[i].getOffset()
                            + blkLocations[i].getLength())) {
                return i;
            }
        }
        BlockLocation last = blkLocations[blkLocations.length - 1];
        long fileLength = last.getOffset() + last.getLength() - 1;
        throw new IllegalArgumentException("Offset " + offset
                + " is outside of file (0.." + fileLength + ")");
    }

    /**
     * Add a {@link Path} to the list of inputs for the BC_BSP job.
     *
     * The path is qualified against the file system it belongs to and
     * appended, escaped, to the comma separated input directory list.
     *
     * @param job
     *            The {@link BSPJob} to modify
     * @param path
     *            {@link Path} to be added to the list of inputs for the BC_BSP
     *            job.
     * @throws IOException
     *             if the file system of the path cannot be obtained
     */
    public static void addInputPath(BSPJob job, Path path) throws IOException {
        Configuration conf = job.getConf();
        // Qualify against the path's own file system; the default file system
        // would lend its authority to paths that carry a different scheme.
        FileSystem fs = path.getFileSystem(conf);
        path = path.makeQualified(fs);
        String dirStr = StringUtils.escapeString(path.toString());
        String dirs = conf.get(Constants.USER_BC_BSP_JOB_INPUT_DIR);
        conf.set(Constants.USER_BC_BSP_JOB_INPUT_DIR, dirs == null ? dirStr
                : dirs + "," + dirStr);
    }

    /**
     * Get the list of input {@link Path}s for the bsp job.
     *
     * @param job
     *            The job configuration
     * @return the list of input {@link Path}s for the bsp job.
     */
    public static Path[] getInputPaths(BSPJob job) {
        String dirs = job.getConf()
                .get(Constants.USER_BC_BSP_JOB_INPUT_DIR, "");
        String[] list = StringUtils.split(dirs);
        Path[] result = new Path[list.length];
        for (int i = 0; i < list.length; i++) {
            result[i] = new Path(StringUtils.unEscapeString(list[i]));
        }
        return result;
    }

    /**
     * Create the record reader that reads the records of the given split.
     *
     * @param split
     *            the split to read
     * @param job
     *            the job the split belongs to
     * @return a record reader for the split
     * @throws IOException
     *             declared for implementations that fail on I/O
     * @throws InterruptedException
     *             declared for implementations that can be interrupted
     */
    @Override
    public abstract RecordReader<K, V> createRecordReader(InputSplit split,
            BSPJob job) throws IOException, InterruptedException;
}