/**
*
*/
package org.archive.hadoop.mapreduce;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
/**
* TextInputFormat that reads text data from HTTP URLs.
* @author kenji
*
*/
public class HttpTextInputFormat extends TextInputFormat {
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
TaskAttemptContext context) {
return new HttpLineRecordReader();
}
// HTTP resources are not splittable for now.
// actually no need to override this method as getSplits(JobContext) is also
// overridden.
@Override
protected boolean isSplitable(JobContext context, Path file) {
return false;
}
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
List<InputSplit> splits = new ArrayList<InputSplit>();
Path[] uris = getInputPaths(job);
for (Path uri : uris) {
FileSplit split = new FileSplit(uri, 0, Long.MAX_VALUE, null);
splits.add(split);
}
return splits;
}
}