package edu.isi.karma.mapreduce.inputformat;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
/**
 * Record reader that groups the lines produced by a {@link LineRecordReader}
 * into batches and prepends a header line to every batch.
 *
 * <p>The first line this reader obtains from the wrapped reader is stored as the
 * header; each later line is a data line. Every successful call to
 * {@link #nextKeyValue()} collects up to {@code BATCH_SIZE} data lines, and
 * {@link #getCurrentValue()} renders them as the header followed by the data
 * lines, one per line.
 *
 * <p>NOTE(review): the header is taken from the first line of whatever split this
 * reader is initialized with. For a split that does not begin at the start of
 * the file that line is presumably a data row, not the CSV header — confirm
 * that the owning input format does not split files.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class CSVBatchRecordReader
    extends RecordReader<Writable, Text> {

  /** Maximum number of data lines returned in a single batch. */
  private static final int BATCH_SIZE = 10000;

  /** Underlying reader that supplies the individual lines. */
  private final LineRecordReader recordReader;

  /** First line read from the wrapped reader; {@code null} until a line has been read. */
  protected String header = null;

  /** Key reported by the wrapped reader for the most recently read line. */
  private LongWritable key = null;

  /** Data lines of the current batch; cleared at the start of every {@link #nextKeyValue()}. */
  List<String> data = new LinkedList<>();

  /**
   * Creates a reader backed by a new {@link LineRecordReader}.
   *
   * @throws IOException declared for compatibility with existing callers
   */
  public CSVBatchRecordReader()
      throws IOException {
    recordReader = new LineRecordReader();
  }

  /**
   * Initializes the wrapped line reader with the given split and context.
   *
   * @param split the split to read
   * @param context the task attempt context
   * @throws IOException if the wrapped reader fails to initialize
   * @throws InterruptedException declared by the {@link RecordReader} contract
   */
  @Override
  public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    recordReader.initialize(split, context);
  }

  /**
   * Returns the key the wrapped reader reported for the last line that was read.
   *
   * @return the key, or {@code null} if no line has been read yet
   */
  @Override
  public LongWritable getCurrentKey()
      throws IOException, InterruptedException {
    return key;
  }

  /**
   * Renders the current batch as text: the header line followed by every data
   * line of the batch, each terminated by a newline.
   *
   * @return the batch text; contains no header line if no header has been read yet
   */
  @Override
  public Text getCurrentValue()
      throws IOException, InterruptedException {
    StringBuilder builder = new StringBuilder();
    // The header is null until nextKeyValue() has read at least one line;
    // skip it instead of emitting "null" and failing with a NullPointerException.
    if (header != null) {
      appendLine(builder, header);
    }
    for (String line : data) {
      appendLine(builder, line);
    }
    return new Text(builder.toString());
  }

  /**
   * Reads the next batch of at most {@code BATCH_SIZE} data lines. The very first
   * line ever read is consumed as the header and does not count towards the batch.
   *
   * @return {@code true} if the batch contains at least one data line
   * @throws IOException if the wrapped reader fails
   * @throws InterruptedException declared by the {@link RecordReader} contract
   */
  @Override
  public synchronized boolean nextKeyValue()
      throws IOException, InterruptedException {
    data.clear();
    while (recordReader.nextKeyValue()) {
      // Capture the key as soon as the line is read so it is also up to date
      // for the line that fills the batch and ends the loop.
      key = recordReader.getCurrentKey();
      String value = recordReader.getCurrentValue().toString();
      if (header == null) {
        header = value;
        continue;
      }
      data.add(value);
      if (data.size() == BATCH_SIZE) {
        break;
      }
    }
    return !data.isEmpty();
  }

  /**
   * Reports the progress of the wrapped line reader.
   *
   * @return the value returned by the wrapped reader's {@code getProgress()}
   */
  @Override
  public float getProgress() throws IOException, InterruptedException {
    return recordReader.getProgress();
  }

  /**
   * Closes the wrapped line reader.
   *
   * @throws IOException if closing the wrapped reader fails
   */
  @Override
  public synchronized void close() throws IOException {
    recordReader.close();
  }

  /** Appends {@code line} to {@code builder}, adding a newline unless the line already ends with one. */
  private static void appendLine(StringBuilder builder, String line) {
    builder.append(line);
    if (!line.endsWith("\n")) {
      builder.append("\n");
    }
  }
}