package edu.isi.karma.mapreduce.inputformat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
import org.json.JSONObject;
@InterfaceAudience.Public
@InterfaceStability.Stable
public class SequenceFileAsJSONRecordBatchReader
extends RecordReader<Text, Text> {
private final SequenceFileRecordReader<WritableComparable<?>, Writable>
sequenceFileRecordReader;
List<JSONObject> data = new LinkedList<>();
private static final int batchSize = 10000;
public SequenceFileAsJSONRecordBatchReader()
throws IOException {
sequenceFileRecordReader =
new SequenceFileRecordReader<>();
}
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
sequenceFileRecordReader.initialize(split, context);
}
@Override
public Text getCurrentKey()
throws IOException, InterruptedException {
return new Text("json");
}
@Override
public Text getCurrentValue()
throws IOException, InterruptedException {
StringBuilder builder = new StringBuilder();
builder.append("[");
boolean isFirst = true;
for (JSONObject obj : data) {
if (isFirst) {
builder.append(obj.toString());
isFirst = false;
}
else {
builder.append(",");
builder.append(obj.toString());
}
}
builder.append("]");
return new Text(builder.toString());
}
public synchronized boolean nextKeyValue()
throws IOException, InterruptedException {
int count = 0;
data.clear();
while (sequenceFileRecordReader.nextKeyValue()) {
JSONObject obj = new JSONObject(sequenceFileRecordReader.getCurrentValue().toString());
data.add(obj);
count++;
if (count == batchSize) {
break;
}
}
return (!data.isEmpty());
}
public float getProgress() throws IOException, InterruptedException {
return sequenceFileRecordReader.getProgress();
}
public synchronized void close() throws IOException {
sequenceFileRecordReader.close();
}
}