/*
* Copyright 2012 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.linkedin.whiteelephant.mapreduce.lib.input;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
/**
 * Input format that treats every input file as a single, unsplittable document.
 * Record readers are created through {@link CombineFileRecordReader}, which
 * delegates each file of a {@link CombineFileSplit} to a
 * {@link WholeFileRecordReader}. Each record's key is the file path and its
 * value is the file's bytes.
 */
public class CombineDocumentFileFormat extends CombineFileInputFormat<Text, BytesWritable> {

  /**
   * Reads one file of a {@link CombineFileSplit} and exposes it as exactly one
   * record: key = the file's path as text, value = the bytes of that file
   * covered by the split.
   */
  public static class WholeFileRecordReader extends RecordReader<Text, BytesWritable> {
    private final CombineFileSplit inputSplit;
    private final Integer idx;
    private final Configuration conf;
    private Text path;
    private BytesWritable document;
    private boolean read;

    /**
     * Creates a reader for the {@code idx}-th file of {@code inputSplit}.
     * The parameter list (including the boxed {@code Integer}) is the one
     * {@link CombineFileRecordReader} looks up reflectively, so it must not change.
     *
     * @param inputSplit the combined split containing the file to read
     * @param context    task context; only its configuration is retained
     * @param idx        index of the file within {@code inputSplit}
     */
    public WholeFileRecordReader(CombineFileSplit inputSplit, TaskAttemptContext context, Integer idx)
    {
      this.inputSplit = inputSplit;
      this.idx = idx;
      this.conf = context.getConfiguration();
      this.read = false;
    }

    /** Nothing to release: the input stream is closed as soon as the file has been read. */
    @Override
    public void close() throws IOException {
      // Don't need to do anything
    }

    /** @return the path of the file that was read, or {@code null} before the record is read */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
      return path;
    }

    /** @return the bytes of the file that was read, or {@code null} before the record is read */
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
      return document;
    }

    /** @return 0 until the single record has been read, 1 afterwards */
    @Override
    public float getProgress() throws IOException, InterruptedException {
      return read ? 1 : 0;
    }

    /** No-op: everything this reader needs is supplied through the constructor. */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    }

    /**
     * Loads the file into memory on the first call and reports that a record is
     * available; every later call reports that the input is exhausted.
     *
     * @return {@code true} on the first call, {@code false} afterwards
     * @throws IOException if the file cannot be opened or fully read, or if it is
     *                     too large to fit in a single byte array
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
      if (read) {
        return false;
      }

      Path file = inputSplit.getPath(idx);
      long offset = inputSplit.getOffset(idx);
      long length = inputSplit.getLength(idx);
      if (length > Integer.MAX_VALUE) {
        // A Java array cannot hold this many bytes; fail clearly instead of
        // letting a narrowing cast produce a negative or truncated size.
        throw new IOException("File " + file + " is too large to read as one record: "
            + length + " bytes");
      }

      byte[] bytes = new byte[(int) length];
      FileSystem fs = file.getFileSystem(conf);
      FSDataInputStream input = fs.open(file);
      try {
        // The split offset is a position in the FILE, not in the buffer: seek
        // the stream there and always fill the buffer from index 0.
        input.seek(offset);
        IOUtils.readFully(input, bytes, 0, bytes.length);
      } finally {
        // Always release the stream, even when the read fails.
        IOUtils.closeStream(input);
      }

      document = new BytesWritable();
      document.set(bytes, 0, bytes.length);
      path = new Text(file.toString());
      read = true;
      return true;
    }
  }

  /**
   * Files are never split so that each document is always read as a whole.
   *
   * @return always {@code false}
   */
  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    return false;
  }

  /**
   * Builds a {@link CombineFileRecordReader} that creates one
   * {@link WholeFileRecordReader} per file in the combined split.
   *
   * @param split   the split to read; must be a {@link CombineFileSplit}
   * @param context the task attempt context
   * @return a reader yielding one (path, bytes) record per file in the split
   * @throws IOException if the underlying reader cannot be created
   */
  @Override
  public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException {
    return new CombineFileRecordReader<Text, BytesWritable>(
        (CombineFileSplit) split, context, WholeFileRecordReader.class);
  }
}