package org.apache.howl.pig.drivers;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.howl.data.DefaultHowlRecord;
import org.apache.howl.data.HowlRecord;
import org.apache.howl.data.schema.HowlSchema;
import org.apache.howl.mapreduce.HowlInputStorageDriver;
import org.apache.howl.pig.PigHowlUtil;
import org.apache.pig.LoadFunc;
import org.apache.pig.data.Tuple;

/**
 * Base class which wraps a {@link LoadFunc} in a HowlInputStorageDriver.
 * If you already have a LoadFunc, this class together with
 * {@link LoadFuncBasedInputFormat} does all the heavy lifting. To write a new
 * Howl input storage driver, extend this class and override
 * {@link #initialize(JobContext, Properties)}; {@link PigStorageInputDriver}
 * illustrates this well.
 */
public abstract class LoadFuncBasedInputDriver extends HowlInputStorageDriver {

  private LoadFuncBasedInputFormat inputFormat;
  private HowlSchema dataSchema;
  private Map<String, String> partVals;
  private List<String> desiredColNames;
  private String location;

  protected LoadFunc lf;

  @Override
  public HowlRecord convertToHowlRecord(WritableComparable baseKey, Writable baseValue)
      throws IOException {

    List<Object> data = ((Tuple) baseValue).getAll();
    List<Object> howlRecord = new ArrayList<Object>(desiredColNames.size());

    /* Iterate through the columns asked for in the output schema and look each
     * one up in the original data schema. If found, emit its value; otherwise
     * fall back to the partition columns. A column present in neither is a new
     * column, so null must be emitted; the lookup on the partition map already
     * returns null when the column is not found.
     */
    for (String colName : desiredColNames) {
      Integer idx = dataSchema.getPosition(colName);
      howlRecord.add(idx != null ? data.get(idx) : partVals.get(colName));
    }
    return new DefaultHowlRecord(howlRecord);
  }

  @Override
  public InputFormat<? extends WritableComparable, ? extends Writable> getInputFormat(
      Properties howlProperties) {
    return inputFormat;
  }

  @Override
  public void setOriginalSchema(JobContext jobContext, HowlSchema howlSchema)
      throws IOException {
    dataSchema = howlSchema;
  }

  @Override
  public void setOutputSchema(JobContext jobContext, HowlSchema howlSchema)
      throws IOException {
    desiredColNames = howlSchema.getFieldNames();
  }

  @Override
  public void setPartitionValues(JobContext jobContext, Map<String, String> partitionValues)
      throws IOException {
    partVals = partitionValues;
  }

  @Override
  public void initialize(JobContext context, Properties storageDriverArgs) throws IOException {
    // Subclasses are expected to have set lf before delegating here.
    lf.setLocation(location, new Job(context.getConfiguration()));
    inputFormat = new LoadFuncBasedInputFormat(lf, PigHowlUtil.getResourceSchema(dataSchema));
  }

  @Override
  public void setInputPath(JobContext jobContext, String location) throws IOException {
    this.location = location;
    super.setInputPath(jobContext, location);
  }
}
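
/*
 * A minimal sketch of a concrete driver, illustrating the pattern the class
 * Javadoc describes (PigStorageInputDriver follows the same shape): the
 * subclass instantiates the wrapped LoadFunc and then delegates to
 * initialize() above, which binds the input location and builds the
 * LoadFuncBasedInputFormat. The class name and the hard-coded tab delimiter
 * here are illustrative assumptions, not part of the Howl codebase; PigStorage
 * itself is Pig's builtin delimited-text LoadFunc.
 */
class TabDelimitedInputDriver extends LoadFuncBasedInputDriver {

  @Override
  public void initialize(JobContext context, Properties storageDriverArgs)
      throws IOException {
    // Wrap Pig's builtin PigStorage, reading tab-separated text.
    lf = new org.apache.pig.builtin.PigStorage("\t");
    // Let the base class set the location and create the input format.
    super.initialize(context, storageDriverArgs);
  }
}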