package edu.isi.karma.mapreduce.inputformat;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.mapreduce.AvroRecordReaderBase;
import org.apache.hadoop.io.Text;
import org.json.JSONObject;
public class AvroBatchRecordReader<T> extends AvroRecordReaderBase<Text, Text, T>{
private static final int batchSize = 10000;
protected AvroBatchRecordReader(Schema readerSchema) {
super(readerSchema);
}
List<JSONObject> data = new LinkedList<>();
@Override
public Text getCurrentKey() throws IOException, InterruptedException {
return new Text("json");
}
@Override
public Text getCurrentValue() throws IOException,
InterruptedException {
StringBuilder builder = new StringBuilder();
builder.append("[");
boolean isFirst = true;
for (JSONObject obj : data) {
if (isFirst) {
builder.append(obj.toString());
isFirst = false;
}
else {
builder.append(",");
builder.append(obj.toString());
}
}
builder.append("]");
return new Text(builder.toString());
}
@Override
public synchronized boolean nextKeyValue() throws IOException, InterruptedException {
data.clear();
int i = 0;
while (super.nextKeyValue()) {
T tmp = getCurrentRecord();
data.add(new JSONObject(tmp.toString()));
i++;
if (i == batchSize) {
break;
}
}
return (!data.isEmpty());
}
}