package com.twitter.elephantbird.mapreduce.input;
import com.twitter.elephantbird.pig.util.ThriftToPig;
import com.twitter.elephantbird.thrift.TStructDescriptor;
import com.twitter.elephantbird.util.ThriftUtils;
import com.twitter.elephantbird.util.TypeRef;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.thrift.TBase;
import org.apache.thrift.TException;
import java.io.IOException;
/**
 * A wrapper over {@link RCFileThriftInputFormat} whose record reader can
 * additionally build a Pig {@link Tuple} directly from the RCFile column
 * bytes, skipping construction of the top-level Thrift object.
 */
public class RCFileThriftTupleInputFormat extends RCFileThriftInputFormat {

  /** No-argument constructor, for MR. */
  public RCFileThriftTupleInputFormat() {}

  /**
   * Creates an input format for the given Thrift class.
   *
   * @param typeRef reference to the Thrift class; passed through to the
   *                superclass constructor
   */
  public RCFileThriftTupleInputFormat(TypeRef<TBase<?, ?>> typeRef) {
    super(typeRef);
  }

  /**
   * Wraps the reader returned by {@code createUnwrappedRecordReader()} in a
   * {@link TupleReader}.
   *
   * @param split       the input split to read
   * @param taskAttempt the context of the task attempt
   * @return a {@link TupleReader} over the unwrapped reader
   * @throws IOException          propagated from the unwrapped reader creation
   * @throws InterruptedException propagated from the unwrapped reader creation
   */
  @Override
  public RecordReader<LongWritable, Writable>
  createRecordReader(InputSplit split, TaskAttemptContext taskAttempt)
      throws IOException, InterruptedException {
    return new TupleReader(createUnwrappedRecordReader(split, taskAttempt));
  }

  /**
   * A {@code ThriftReader} that adds {@link #getCurrentTupleValue()}, which
   * converts the current row's columns straight into a Pig tuple.
   */
  public class TupleReader extends RCFileThriftInputFormat.ThriftReader {

    private final TupleFactory tf = TupleFactory.getInstance();

    /**
     * The reader is expected to be a
     * {@code RecordReader<LongWritable, BytesRefArrayWritable>}.
     *
     * @param reader the underlying reader, handed to the superclass
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public TupleReader(RecordReader reader) {
      super(reader);
    }

    /**
     * Returns a Tuple consisting of the required fields without creating
     * a Thrift message at the top level.
     *
     * <p>Position {@code i} of the tuple corresponds to
     * {@code knownRequiredFields.get(i)}. A column whose buffer is empty is
     * not written, so that position keeps the value the tuple factory
     * initialized it with (null according to Pig's {@code TupleFactory}
     * documentation).
     *
     * @return the tuple for the current row, or {@code null} when
     *         {@code getCurrentBytesRefArrayWritable()} returns {@code null}
     * @throws IOException if the reader is configured to read the unknown
     *         fields column ('readUnknownColumns'), which this method does
     *         not support, or on a failure while fetching the current row
     * @throws InterruptedException propagated from fetching the current row
     * @throws TException if a column cannot be decoded as its Thrift field
     */
    public Tuple getCurrentTupleValue() throws IOException, InterruptedException, TException {
      BytesRefArrayWritable byteRefs = getCurrentBytesRefArrayWritable();
      if (byteRefs == null) {
        return null;
      }

      // Fail fast: reject the unsupported mode before spending time on
      // deserialization, and so that a decoding error in the loop below
      // cannot hide the real reason the call is invalid.
      if (isReadingUnknonwsColumn()) {
        throw new IOException("getCurrentTupleValue() is not supported when 'readUnknownColumns' is set");
      }

      final int fieldCount = knownRequiredFields.size();
      Tuple tuple = tf.newTuple(fieldCount);

      for (int i = 0; i < fieldCount; i++) {
        // columnsBeingRead maps the i-th required field to its column index
        // within the row.
        BytesRefWritable buf = byteRefs.get(columnsBeingRead.get(i));
        if (buf.getLength() > 0) {
          // Point the shared transport at this column's bytes and decode a
          // single untagged field value from it.
          memTransport.reset(buf.getData(), buf.getStart(), buf.getLength());
          TStructDescriptor.Field field = knownRequiredFields.get(i);
          Object value = ThriftUtils.readFieldNoTag(tProto, field);
          tuple.set(i, ThriftToPig.toPigObject(field, value, false));
        }
      }

      return tuple;
    }
  }
}