package com.caseystella.util.pig.loader;
import com.caseystella.util.common.hadoop.input.whole.WholeFileInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.pig.*;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import java.io.IOException;
import java.util.List;
/**
* Created by cstella on 9/2/14.
*/
public class WholeFileLoader extends LoadFunc implements LoadMetadata, LoadPushDown {
Configuration jobConf;
RecordReader<Text, BytesWritable> reader;
@Override
public void setLocation(String location, Job job) throws IOException {
FileInputFormat.setInputPaths(job, location);
this.jobConf = job.getConfiguration();
}
@Override
public InputFormat getInputFormat() throws IOException {
return new WholeFileInputFormat();
}
@Override
public void prepareToRead(RecordReader recordReader, PigSplit pigSplit) throws IOException {
this.reader = recordReader;
}
@Override
public Tuple getNext() throws IOException {
Tuple t = null;
try {
boolean notDone = reader.nextKeyValue();
if (!notDone) {
return null;
}
Text key = reader.getCurrentKey();
BytesWritable value = reader.getCurrentValue();
t = TupleFactory.getInstance().newTuple(2);
t.set(0, key.toString());
t.set(1, new DataByteArray(value.copyBytes()));
}
catch (InterruptedException e) {
throw new IOException("Unable to read next value", e);
}
return t;
}
public ResourceSchema getSchema(String s, Job job) throws IOException {
ResourceSchema ret = new ResourceSchema();
ret.setFields(new ResourceSchema.ResourceFieldSchema[] {
new ResourceSchema.ResourceFieldSchema(new Schema.FieldSchema("location", DataType.CHARARRAY))
,new ResourceSchema.ResourceFieldSchema(new Schema.FieldSchema("data", DataType.BYTEARRAY))
});
return ret;
}
public ResourceStatistics getStatistics(String s, Job job) throws IOException {
return null;
}
public String[] getPartitionKeys(String s, Job job) throws IOException {
return new String[0];
}
public void setPartitionFilter(Expression expression) throws IOException {
}
public List<OperatorSet> getFeatures() {
return null;
}
public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList) throws FrontendException {
return null;
}
}