package com.caseystella.util.pig.loader;

import com.caseystella.util.common.interpret.fixed.Config;
import com.caseystella.util.common.interpret.fixed.Field;
import com.caseystella.util.common.hadoop.input.fixed.FixedWidthInputFormat;
import com.caseystella.util.pig.Helper;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.pig.*;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.EnumMap;
import java.util.List;

import static com.caseystella.util.common.interpret.fixed.Field.Type.*;

/**
 * A Pig LoadFunc that reads fixed-width binary records, slicing each record into fields
 * according to a Config (offsets, widths, types, and converters) and emitting one tuple
 * per record.
 *
 * Created by cstella on 9/3/14.
 */
public class FixedWidthLoader extends LoadFunc implements LoadMetadata, LoadPushDown {

    Config config;
    String configFile = null;
    RecordReader<LongWritable, BytesWritable> reader;

    // Maps the loader's field types onto Pig's type constants.
    static EnumMap<Field.Type, Byte> typeToPigType = new EnumMap<Field.Type, Byte>(Field.Type.class);
    static {
        typeToPigType.put(BYTES, DataType.BYTEARRAY);
        typeToPigType.put(STRING, DataType.CHARARRAY);
        typeToPigType.put(INT, DataType.INTEGER);
        typeToPigType.put(FLOAT, DataType.FLOAT);
        typeToPigType.put(DOUBLE, DataType.DOUBLE);
    }

    public FixedWidthLoader(String configFile) {
        this.configFile = configFile;
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        FileInputFormat.setInputPaths(job, location);
    }

    @Override
    public InputFormat getInputFormat() throws IOException {
        // Record width is fixed, so the input format can split on exact record boundaries.
        return new FixedWidthInputFormat(getConfig().computeWidth());
    }

    // Lazily loads the field layout from the config file.
    private Config getConfig() throws IOException {
        if (config == null) {
            config = Config.load(Helper.open(configFile, FixedWidthLoader.class));
        }
        return config;
    }

    @Override
    public void prepareToRead(RecordReader recordReader, PigSplit pigSplit) throws IOException {
        getConfig();
        reader = recordReader;
    }

    @Override
    public Tuple getNext() throws IOException {
        Tuple t = null;
        try {
            boolean hasNext = reader.nextKeyValue();
            if (!hasNext) {
                return null;
            }
            // copyBytes() yields a right-sized copy; the writable's backing array may be reused.
            byte[] value = reader.getCurrentValue().copyBytes();
            t = TupleFactory.getInstance().newTuple();
            for (Field f : getConfig().getFields()) {
                ByteBuffer b = ByteBuffer.wrap(value, f.getOffset(), f.getWidth());
                switch (f.getType()) {
                    case BYTES:
                        // Wrap the converted bytes in a DataByteArray so Pig sees a bytearray.
                        ByteBuffer buff = (ByteBuffer) f.getConverter().convert(b, f.getType(), f.getConfig());
                        t.append(new DataByteArray(buff.array(), 0, f.getWidth()));
                        break;
                    default:
                        t.append(f.getConverter().convert(b, f.getType(), f.getConfig()));
                        break;
                }
            }
        } catch (InterruptedException e) {
            throw new IOException("Unable to read next value", e);
        }
        return t;
    }

    @Override
    public ResourceSchema getSchema(String s, Job job) throws IOException {
        // Ship the config file with the job so it is resolvable on the backend.
        Helper.addFileToContext(configFile, FixedWidthLoader.class);
        getConfig().validate();
        ResourceSchema ret = new ResourceSchema();
        ResourceSchema.ResourceFieldSchema[] fields =
                new ResourceSchema.ResourceFieldSchema[getConfig().getFields().length];
        int i = 0;
        for (Field f : getConfig().getFields()) {
            fields[i] = new ResourceSchema.ResourceFieldSchema(
                    new Schema.FieldSchema(f.getName(), typeToPigType.get(f.getType())));
            i++;
        }
        ret.setFields(fields);
        return ret;
    }

    @Override
    public ResourceStatistics getStatistics(String s, Job job) throws IOException {
        return null;
    }

    @Override
    public String[] getPartitionKeys(String s, Job job) throws IOException {
        return new String[0];
    }

    @Override
    public void setPartitionFilter(Expression expression) throws IOException {
    }

    // Projection push-down is not implemented; these are stubs to satisfy LoadPushDown.
    @Override
    public List<OperatorSet> getFeatures() {
        return null;
    }

    @Override
    public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList) throws FrontendException {
        return null;
    }
}
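
/*
 * Usage sketch (not part of the original source): a minimal Pig Latin invocation of this
 * loader. The jar name, input path, and config file name below are illustrative
 * assumptions; the config argument points at the fixed-width field layout consumed by
 * Config.load(...) above.
 *
 *   REGISTER pig-utils.jar;
 *   records = LOAD '/data/fixed'
 *       USING com.caseystella.util.pig.loader.FixedWidthLoader('field_config.json');
 *   DESCRIBE records;
 */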