/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hive.hcatalog.mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hive.hcatalog.common.HCatConstants;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchema;

public abstract class HCatBaseInputFormat
  extends InputFormat<WritableComparable, HCatRecord> {

  private Class<? extends InputFormat> inputFileFormatClass;

  // TODO needs to go in InitializeInput? as part of InputJobInfo
  /**
   * Gets the schema for the HCatRecord data returned by HCatInputFormat:
   * the consolidated output schema if one has been set, otherwise the
   * full table schema.
   *
   * @param conf the job Configuration
   * @throws IOException if the schema cannot be read or deserialized
   */
  private static HCatSchema getOutputSchema(Configuration conf)
    throws IOException {
    String os = conf.get(HCatConstants.HCAT_KEY_OUTPUT_SCHEMA);
    if (os == null) {
      return getTableSchema(conf);
    } else {
      return (HCatSchema) HCatUtil.deserialize(os);
    }
  }
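  /*
   * Usage sketch: a driver program would typically configure the read before
   * submitting the job, along the lines below. The database name "default"
   * and table name "demo_table" are hypothetical; the calls themselves are
   * the public entry points of this package.
   *
   *   Job job = Job.getInstance(new Configuration());
   *   HCatInputFormat.setInput(job, "default", "demo_table");
   *   HCatSchema tableSchema = HCatBaseInputFormat.getTableSchema(job.getConfiguration());
   *   HCatBaseInputFormat.setOutputSchema(job, tableSchema);  // project all columns
   *   job.setInputFormatClass(HCatInputFormat.class);
   */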
  /**
   * Set the schema for the HCatRecord data returned by HCatInputFormat.
   *
   * @param job the job object
   * @param hcatSchema the schema to use as the consolidated schema
   */
  public static void setOutputSchema(Job job, HCatSchema hcatSchema)
    throws IOException {
    job.getConfiguration().set(HCatConstants.HCAT_KEY_OUTPUT_SCHEMA,
      HCatUtil.serialize(hcatSchema));
  }

  protected static org.apache.hadoop.mapred.InputFormat<WritableComparable, Writable>
    getMapRedInputFormat(JobConf job, Class inputFormatClass) throws IOException {
    return (org.apache.hadoop.mapred.InputFormat<WritableComparable, Writable>)
      ReflectionUtils.newInstance(inputFormatClass, job);
  }

  /**
   * Logically splits the set of input files for the job. Returns the
   * underlying InputFormat's splits.
   *
   * @param jobContext the job context object
   * @return the splits, a list of HCatSplit wrappers over the storage
   *         handler's InputSplits
   * @throws IOException
   * @throws InterruptedException
   */
  @Override
  public List<InputSplit> getSplits(JobContext jobContext)
    throws IOException, InterruptedException {
    Configuration conf = jobContext.getConfiguration();

    // Get the job info from the configuration;
    // throws an exception if not initialized.
    InputJobInfo inputJobInfo;
    try {
      inputJobInfo = getJobInfo(conf);
    } catch (Exception e) {
      throw new IOException(e);
    }

    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<PartInfo> partitionInfoList = inputJobInfo.getPartitions();
    if (partitionInfoList == null) {
      // No partitions match the specified partition filter.
      return splits;
    }

    HiveStorageHandler storageHandler;
    JobConf jobConf;
    // For each matching partition, call getSplits on the underlying InputFormat.
    for (PartInfo partitionInfo : partitionInfoList) {
      jobConf = HCatUtil.getJobConfFromContext(jobContext);
      List<String> setInputPath = setInputPath(jobConf, partitionInfo.getLocation());
      if (setInputPath.isEmpty()) {
        continue;
      }
      Map<String, String> jobProperties = partitionInfo.getJobProperties();
      HCatUtil.copyJobPropertiesToJobConf(jobProperties, jobConf);

      storageHandler = HCatUtil.getStorageHandler(
        jobConf, partitionInfo);

      // Get the input format.
      Class inputFormatClass = storageHandler.getInputFormatClass();
      org.apache.hadoop.mapred.InputFormat inputFormat =
        getMapRedInputFormat(jobConf, inputFormatClass);

      // Call getSplits on the InputFormat and create an HCatSplit for each
      // underlying split. When the desired number of input splits is missing,
      // use a default number (denoted by zero).
      // TODO(malewicz): Currently each partition is split independently into
      // a desired number. However, we want the union of all partitions to be
      // split into a desired number while maintaining balanced sizes of input
      // splits.
      int desiredNumSplits =
        conf.getInt(HCatConstants.HCAT_DESIRED_PARTITION_NUM_SPLITS, 0);
      org.apache.hadoop.mapred.InputSplit[] baseSplits =
        inputFormat.getSplits(jobConf, desiredNumSplits);

      for (org.apache.hadoop.mapred.InputSplit split : baseSplits) {
        splits.add(new HCatSplit(partitionInfo, split));
      }
    }

    return splits;
  }
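  /*
   * Tuning sketch: the number of splits requested from each partition's
   * underlying InputFormat can be hinted before getSplits() runs; 0 (the
   * default) leaves the decision entirely to that InputFormat. The value 16
   * below is an arbitrary example.
   *
   *   job.getConfiguration().setInt(HCatConstants.HCAT_DESIRED_PARTITION_NUM_SPLITS, 16);
   */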
  /**
   * Creates the RecordReader for the given InputSplit. The returned
   * HCatRecordReader wraps the underlying storage handler's RecordReader,
   * delegating the operations it already supports and implementing the
   * remaining ones in HCat.
   *
   * @param split the split
   * @param taskContext the task attempt context
   * @return the record reader instance, an HCatRecordReader wrapping the
   *         underlying storage handler's RecordReader
   * @throws IOException
   * @throws InterruptedException
   */
  @Override
  public RecordReader<WritableComparable, HCatRecord> createRecordReader(
    InputSplit split, TaskAttemptContext taskContext)
    throws IOException, InterruptedException {

    HCatSplit hcatSplit = InternalUtil.castToHCatSplit(split);
    PartInfo partitionInfo = hcatSplit.getPartitionInfo();
    // Ensure PartInfo's TableInfo is initialized.
    if (partitionInfo.getTableInfo() == null) {
      partitionInfo.setTableInfo(((InputJobInfo) HCatUtil.deserialize(
        taskContext.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO)
      )).getTableInfo());
    }
    JobContext jobContext = taskContext;
    Configuration conf = jobContext.getConfiguration();

    HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(
      conf, partitionInfo);

    JobConf jobConf = HCatUtil.getJobConfFromContext(jobContext);
    Map<String, String> jobProperties = partitionInfo.getJobProperties();
    HCatUtil.copyJobPropertiesToJobConf(jobProperties, jobConf);

    Map<String, Object> valuesNotInDataCols = getColValsNotInDataColumns(
      getOutputSchema(conf), partitionInfo
    );

    return new HCatRecordReader(storageHandler, valuesNotInDataCols);
  }

  /**
   * Gets values for fields requested by the output schema that will not be
   * present in the data (for example, partition key columns).
   */
  private static Map<String, Object> getColValsNotInDataColumns(HCatSchema outputSchema,
      PartInfo partInfo) throws HCatException {
    HCatSchema dataSchema = partInfo.getPartitionSchema();
    Map<String, Object> vals = new HashMap<String, Object>();
    for (String fieldName : outputSchema.getFieldNames()) {
      if (dataSchema.getPosition(fieldName) == null) {
        // This output field is not present in the partition's data schema,
        // so check whether it is a partition column.
        if (partInfo.getPartitionValues().containsKey(fieldName)) {
          // First, get the appropriate field schema for this field.
          HCatFieldSchema fschema = outputSchema.get(fieldName);
          // For a partition key type, this will be a primitive typeinfo.
          // Obtain the relevant object inspector for this typeinfo.
          ObjectInspector oi = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(fschema.getTypeInfo());
          // Get the appropriate object from the string representation of the value
          // in partInfo.getPartitionValues(). Partition values are stored as strings,
          // but we want the actual object type associated with the column.
          Object objVal = ObjectInspectorConverters
            .getConverter(PrimitiveObjectInspectorFactory.javaStringObjectInspector, oi)
            .convert(partInfo.getPartitionValues().get(fieldName));
          vals.put(fieldName, objVal);
        } else {
          vals.put(fieldName, null);
        }
      }
    }
    return vals;
  }
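  /*
   * Reader-side sketch: records produced by the HCatRecordReader arrive in a
   * mapper as HCatRecord values; columns requested in the output schema but
   * absent from the stored data (such as partition keys) are filled in from
   * the values computed above. The mapper below is hypothetical and simply
   * echoes the first column of each record; NullWritable and Text are the
   * usual org.apache.hadoop.io types.
   *
   *   public static class ReadMapper
   *       extends Mapper<WritableComparable, HCatRecord, NullWritable, Text> {
   *     @Override
   *     protected void map(WritableComparable key, HCatRecord value, Context ctx)
   *         throws IOException, InterruptedException {
   *       ctx.write(NullWritable.get(), new Text(String.valueOf(value.get(0))));
   *     }
   *   }
   */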
  /**
   * Gets the HCatTable schema for the table specified in the HCatInputFormat.setInput
   * call on the specified job context. This information is available only after
   * HCatInputFormat.setInput has been called for a JobContext.
   *
   * @param conf the Configuration object
   * @return the table schema
   * @throws IOException if HCatInputFormat.setInput has not been called
   *                     for the current context
   */
  public static HCatSchema getTableSchema(Configuration conf)
    throws IOException {
    InputJobInfo inputJobInfo = getJobInfo(conf);
    HCatSchema allCols = new HCatSchema(new LinkedList<HCatFieldSchema>());
    for (HCatFieldSchema field :
      inputJobInfo.getTableInfo().getDataColumns().getFields()) {
      allCols.append(field);
    }
    for (HCatFieldSchema field :
      inputJobInfo.getTableInfo().getPartitionColumns().getFields()) {
      allCols.append(field);
    }
    return allCols;
  }

  /**
   * Gets the InputJobInfo object by reading the Configuration and deserializing
   * the string. If InputJobInfo is not present in the configuration, throws an
   * exception since that means HCatInputFormat.setInput has not been called.
   *
   * @param conf the Configuration object
   * @return the InputJobInfo object
   * @throws IOException if the job information is not present in the configuration
   */
  private static InputJobInfo getJobInfo(Configuration conf)
    throws IOException {
    String jobString = conf.get(HCatConstants.HCAT_KEY_JOB_INFO);
    if (jobString == null) {
      throw new IOException("job information not found in JobContext."
        + " HCatInputFormat.setInput() not called?");
    }

    return (InputJobInfo) HCatUtil.deserialize(jobString);
  }

  private List<String> setInputPath(JobConf jobConf, String location)
    throws IOException {

    // Ideally we should just call FileInputFormat.setInputPaths() here - but
    // that won't work since FileInputFormat.setInputPaths() needs
    // a Job object instead of a JobContext which we are handed here.

    int length = location.length();
    int curlyOpen = 0;
    int pathStart = 0;
    boolean globPattern = false;
    List<String> pathStrings = new ArrayList<String>();

    // Split the location string on commas that are not inside a {...} glob.
    for (int i = 0; i < length; i++) {
      char ch = location.charAt(i);
      switch (ch) {
      case '{': {
        curlyOpen++;
        if (!globPattern) {
          globPattern = true;
        }
        break;
      }
      case '}': {
        curlyOpen--;
        if (curlyOpen == 0 && globPattern) {
          globPattern = false;
        }
        break;
      }
      case ',': {
        if (!globPattern) {
          pathStrings.add(location.substring(pathStart, i));
          pathStart = i + 1;
        }
        break;
      }
      }
    }
    pathStrings.add(location.substring(pathStart, length));

    String separator = "";
    StringBuilder str = new StringBuilder();

    boolean ignoreInvalidPath = jobConf.getBoolean(HCatConstants.HCAT_INPUT_IGNORE_INVALID_PATH_KEY,
      HCatConstants.HCAT_INPUT_IGNORE_INVALID_PATH_DEFAULT);
    Iterator<String> pathIterator = pathStrings.iterator();
    while (pathIterator.hasNext()) {
      String pathString = pathIterator.next();
      // Skip blank path entries when invalid paths are being ignored.
      if (ignoreInvalidPath && org.apache.commons.lang.StringUtils.isBlank(pathString)) {
        continue;
      }
      Path path = new Path(pathString);
      FileSystem fs = path.getFileSystem(jobConf);
      // Drop paths that do not exist when invalid paths are being ignored.
      if (ignoreInvalidPath && !fs.exists(path)) {
        pathIterator.remove();
        continue;
      }
      final String qualifiedPath = fs.makeQualified(path).toString();
      str.append(separator)
        .append(StringUtils.escapeString(qualifiedPath));
      separator = StringUtils.COMMA_STR;
    }

    if (!ignoreInvalidPath || !pathStrings.isEmpty()) {
      jobConf.set("mapred.input.dir", str.toString());
    }
    return pathStrings;
  }
}
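/*
 * Path-splitting sketch for setInputPath() above: a partition location such as
 * (hypothetical paths)
 *
 *   hdfs://nn:8020/warehouse/t/dt=2016-01-0{1,2},hdfs://nn:8020/warehouse/t/dt=2016-01-03
 *
 * is split only on top-level commas; the comma inside "{1,2}" is kept as part
 * of the glob, yielding the two entries
 *
 *   hdfs://nn:8020/warehouse/t/dt=2016-01-0{1,2}
 *   hdfs://nn:8020/warehouse/t/dt=2016-01-03
 *
 * which mirrors how FileInputFormat.setInputPaths() treats the same string.
 */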