package org.apache.howl.pig.drivers;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.howl.data.DefaultHowlRecord;
import org.apache.howl.data.HowlRecord;
import org.apache.howl.data.schema.HowlSchema;
import org.apache.howl.mapreduce.HowlInputStorageDriver;
import org.apache.howl.pig.PigHowlUtil;
import org.apache.pig.LoadFunc;
import org.apache.pig.data.Tuple;

/**
 * Base class which wraps a {@link LoadFunc} in a HowlInputStorageDriver.
 * If you already have a LoadFunc, this class together with
 * {@link LoadFuncBasedInputFormat} does all the heavy lifting. To write a new
 * Howl input storage driver, extend this class and override
 * {@link #initialize(JobContext, Properties)}; {@link PigStorageInputDriver}
 * illustrates this well.
 */
public abstract class LoadFuncBasedInputDriver extends HowlInputStorageDriver {

  private LoadFuncBasedInputFormat inputFormat;
  private HowlSchema dataSchema;
  private Map<String, String> partVals;
  private List<String> desiredColNames;
  private String location;

  protected LoadFunc lf;

  @Override
  public HowlRecord convertToHowlRecord(WritableComparable baseKey, Writable baseValue)
      throws IOException {

    List<Object> data = ((Tuple) baseValue).getAll();
    List<Object> howlRecord = new ArrayList<Object>(desiredColNames.size());

    /* Iterate through the columns asked for in the output schema and look each
     * one up in the original data schema. If found, emit its value; otherwise
     * fall back to the partition columns. A column present in neither is a new
     * column, so null must be emitted; the lookup on the partition map already
     * returns null when the column is not found.
     */
    for (String colName : desiredColNames) {
      Integer idx = dataSchema.getPosition(colName);
      howlRecord.add(idx != null ? data.get(idx) : partVals.get(colName));
    }
    return new DefaultHowlRecord(howlRecord);
  }

  @Override
  public InputFormat<? extends WritableComparable, ? extends Writable> getInputFormat(
      Properties howlProperties) {
    return inputFormat;
  }

  @Override
  public void setOriginalSchema(JobContext jobContext, HowlSchema howlSchema)
      throws IOException {
    dataSchema = howlSchema;
  }

  @Override
  public void setOutputSchema(JobContext jobContext, HowlSchema howlSchema)
      throws IOException {
    desiredColNames = howlSchema.getFieldNames();
  }

  @Override
  public void setPartitionValues(JobContext jobContext, Map<String, String> partitionValues)
      throws IOException {
    partVals = partitionValues;
  }

  @Override
  public void initialize(JobContext context, Properties storageDriverArgs) throws IOException {
    // Subclasses are expected to have set lf before delegating here.
    lf.setLocation(location, new Job(context.getConfiguration()));
    inputFormat = new LoadFuncBasedInputFormat(lf, PigHowlUtil.getResourceSchema(dataSchema));
  }

  @Override
  public void setInputPath(JobContext jobContext, String location) throws IOException {
    this.location = location;
    super.setInputPath(jobContext, location);
  }
}
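
/*
 * A minimal sketch of a concrete driver, illustrating the pattern the class
 * Javadoc describes (PigStorageInputDriver follows the same shape): the
 * subclass instantiates the wrapped LoadFunc and then delegates to
 * initialize() above, which binds the input location and builds the
 * LoadFuncBasedInputFormat. The class name and the hard-coded tab delimiter
 * here are illustrative assumptions, not part of the Howl codebase; PigStorage
 * itself is Pig's builtin delimited-text LoadFunc.
 */
class TabDelimitedInputDriver extends LoadFuncBasedInputDriver {

  @Override
  public void initialize(JobContext context, Properties storageDriverArgs)
      throws IOException {
    // Wrap Pig's builtin PigStorage, reading tab-separated text.
    lf = new org.apache.pig.builtin.PigStorage("\t");
    // Let the base class set the location and create the input format.
    super.initialize(context, storageDriverArgs);
  }
}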