/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.metadata;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.hadoop.hive.common.StringInternUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.OutputFormat;

/**
 * A Hive Table Partition: a fundamental storage unit within a Table.
 *
 * Please note that the ql code should always go through methods of this class to access the
 * metadata, instead of directly accessing org.apache.hadoop.hive.metastore.api.Partition.
 * This helps to isolate the metastore code and the ql code.
 */
public class Partition implements Serializable {

  @SuppressWarnings("nls")
  private static final Logger LOG = LoggerFactory
      .getLogger("hive.ql.metadata.Partition");

  private Table table;
  private org.apache.hadoop.hive.metastore.api.Partition tPartition;

  /**
   * These fields are cached. The information comes from tPartition.
   */
  private Deserializer deserializer;
  private Class<? extends OutputFormat> outputFormatClass;
  private Class<? extends InputFormat> inputFormatClass;

  /**
   * @return The values of the partition
   * @see org.apache.hadoop.hive.metastore.api.Partition#getValues()
   */
  public List<String> getValues() {
    return tPartition.getValues();
  }

  /**
   * Used only for serialization.
   */
  public Partition() {
  }

  /**
   * Create an empty partition.
   * SemanticAnalyzer code requires an empty partition when the table is not partitioned.
   */
  public Partition(Table tbl) throws HiveException {
    org.apache.hadoop.hive.metastore.api.Partition tPart =
        new org.apache.hadoop.hive.metastore.api.Partition();
    if (!tbl.isView()) {
      tPart.setSd(tbl.getTTable().getSd().deepCopy());
    }
    initialize(tbl, tPart);
  }
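  /**
   * Creates a Partition from a Thrift partition object, typically one already
   * fetched from the metastore.
   *
   * @param tbl
   *          Table the partition belongs to.
   * @param tp
   *          Thrift partition object.
   * @throws HiveException
   *           Thrown if the partition cannot be initialized.
   */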
  public Partition(Table tbl, org.apache.hadoop.hive.metastore.api.Partition tp)
      throws HiveException {
    initialize(tbl, tp);
  }

  /**
   * Create partition object with the given info.
   *
   * @param tbl
   *          Table the partition will be in.
   * @param partSpec
   *          Partition specifications.
   * @param location
   *          Location of the partition, relative to the table.
   * @throws HiveException
   *           Thrown if we could not create the partition.
   */
  public Partition(Table tbl, Map<String, String> partSpec, Path location)
      throws HiveException {
    initialize(tbl, createMetaPartitionObject(tbl, partSpec, location));
  }

  public static org.apache.hadoop.hive.metastore.api.Partition createMetaPartitionObject(
      Table tbl, Map<String, String> partSpec, Path location) throws HiveException {
    List<String> pvals = new ArrayList<String>();
    for (FieldSchema field : tbl.getPartCols()) {
      String val = partSpec.get(field.getName());
      if (val == null || val.isEmpty()) {
        throw new HiveException("partition spec is invalid; field "
            + field.getName() + " does not exist or is empty");
      }
      pvals.add(val);
    }

    org.apache.hadoop.hive.metastore.api.Partition tpart =
        new org.apache.hadoop.hive.metastore.api.Partition();
    tpart.setDbName(tbl.getDbName());
    tpart.setTableName(tbl.getTableName());
    tpart.setValues(pvals);

    if (!tbl.isView()) {
      tpart.setSd(tbl.getSd().deepCopy());
      tpart.getSd().setLocation((location != null) ? location.toString() : null);
    }
    return tpart;
  }

  /**
   * Initializes this object with the given variables
   *
   * @param table
   *          Table the partition belongs to
   * @param tPartition
   *          Thrift Partition object
   * @throws HiveException
   *           Thrown if we cannot initialize the partition
   */
  protected void initialize(Table table,
      org.apache.hadoop.hive.metastore.api.Partition tPartition) throws HiveException {

    this.table = table;
    setTPartition(tPartition);

    if (table.isView()) {
      return;
    }

    if (table.isPartitioned()) {
      try {
        if (tPartition.getSd().getLocation() == null) {
          // set default if location is not set and this is a physical
          // table partition (not a view partition)
          if (table.getDataLocation() != null) {
            Path partPath = new Path(table.getDataLocation(),
                Warehouse.makePartName(table.getPartCols(), tPartition.getValues()));
            tPartition.getSd().setLocation(partPath.toString());
          }
        }
        // set default if columns are not set
        if (tPartition.getSd().getCols() == null) {
          if (table.getCols() != null) {
            tPartition.getSd().setCols(table.getCols());
          }
        }
      } catch (MetaException e) {
        throw new HiveException("Invalid partition for table " + table.getTableName(), e);
      }
    }

    // Note that we do not set up fields like inputFormatClass, outputFormatClass
    // and deserializer because the Partition needs to be accessed from across
    // the metastore side as well, which will result in attempting to load
    // the class associated with them, which might not be available, and
    // the main reason to instantiate them would be to pre-cache them for
    // performance. Since those fields are null/cache-check by their accessors
    // anyway, that's not a concern.
  }
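  /**
   * @return the partition name built from the table's partition columns and this
   *         partition's values, e.g. {@code ds=2008-04-08/hr=12}.
   */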
  public String getName() {
    try {
      return Warehouse.makePartName(table.getPartCols(), tPartition.getValues());
    } catch (MetaException e) {
      throw new RuntimeException(e);
    }
  }

  public Path[] getPath() {
    Path[] ret = new Path[]{getDataLocation()};
    return ret;
  }

  public Path getPartitionPath() {
    return getDataLocation();
  }

  public Path getDataLocation() {
    if (table.isPartitioned()) {
      if (tPartition.getSd() == null)
        return null;
      else
        return new Path(tPartition.getSd().getLocation());
    } else {
      if (table.getTTable() == null || table.getTTable().getSd() == null)
        return null;
      else
        return new Path(table.getTTable().getSd().getLocation());
    }
  }

  final public Deserializer getDeserializer() {
    if (deserializer == null) {
      try {
        deserializer = MetaStoreUtils.getDeserializer(SessionState.getSessionConf(),
            tPartition, table.getTTable());
      } catch (MetaException e) {
        throw new RuntimeException(e);
      }
    }
    return deserializer;
  }

  public Properties getSchema() {
    return MetaStoreUtils.getSchema(tPartition, table.getTTable());
  }

  public Properties getMetadataFromPartitionSchema() {
    return MetaStoreUtils.getPartitionMetadata(tPartition, table.getTTable());
  }

  public Properties getSchemaFromTableSchema(Properties tblSchema) {
    return MetaStoreUtils.getPartSchemaFromTableSchema(tPartition.getSd(),
        table.getTTable().getSd(), tPartition.getParameters(), table.getDbName(),
        table.getTableName(), table.getPartitionKeys(), tblSchema);
  }

  /**
   * @param inputFormatClass
   */
  public void setInputFormatClass(Class<? extends InputFormat> inputFormatClass) {
    this.inputFormatClass = inputFormatClass;
    tPartition.getSd().setInputFormat(inputFormatClass.getName());
  }

  /**
   * @param outputFormatClass
   */
  public void setOutputFormatClass(Class<? extends HiveOutputFormat> outputFormatClass) {
    this.outputFormatClass = outputFormatClass;
    tPartition.getSd().setOutputFormat(HiveFileFormatUtils
        .getOutputFormatSubstitute(outputFormatClass).getName());
  }

  final public Class<? extends InputFormat> getInputFormatClass()
      throws HiveException {
    if (inputFormatClass == null) {
      // sd can be null for views
      String clsName = tPartition.getSd() == null ? null
          : tPartition.getSd().getInputFormat();
      if (clsName == null) {
        return inputFormatClass = table.getInputFormatClass();
      }
      try {
        inputFormatClass = ((Class<? extends InputFormat>) Class.forName(clsName, true,
            Utilities.getSessionSpecifiedClassLoader()));
      } catch (ClassNotFoundException e) {
        throw new HiveException("Class not found: " + clsName, e);
      }
    }
    return inputFormatClass;
  }
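  // Lazily resolves the OutputFormat class from the partition's storage descriptor,
  // falling back to the table's output format when the partition does not record one,
  // and substituting a Hive-compatible output format for legacy FileOutputFormat classes.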
  final public Class<? extends OutputFormat> getOutputFormatClass()
      throws HiveException {
    if (outputFormatClass == null) {
      // sd can be null for views
      String clsName = tPartition.getSd() == null ? null
          : tPartition.getSd().getOutputFormat();
      if (clsName == null) {
        return outputFormatClass = table.getOutputFormatClass();
      }
      try {
        Class<?> c = Class.forName(clsName, true, Utilities.getSessionSpecifiedClassLoader());
        // Replace FileOutputFormat for backward compatibility
        outputFormatClass = HiveFileFormatUtils.getOutputFormatSubstitute(c);
      } catch (ClassNotFoundException e) {
        throw new HiveException("Class not found: " + clsName, e);
      }
    }
    return outputFormatClass;
  }

  public int getBucketCount() {
    return tPartition.getSd().getNumBuckets();
    /*
     * TODO: Keeping this code around for later use when we will support
     * sampling on tables which are not created with CLUSTERED INTO clause
     *
     * // read from table meta data
     * int numBuckets = this.table.getNumBuckets();
     * if (numBuckets == -1) {
     *   // table meta data does not have bucket information
     *   // check if file system has multiple buckets(files) in this partition
     *   String pathPattern = this.partPath.toString() + "/*";
     *   try {
     *     FileSystem fs = FileSystem.get(this.table.getDataLocation(), Hive.get().getConf());
     *     FileStatus srcs[] = fs.globStatus(new Path(pathPattern), FileUtils.HIDDEN_FILES_PATH_FILTER);
     *     numBuckets = srcs.length;
     *   } catch (Exception e) {
     *     throw new RuntimeException("Cannot get bucket count for table "
     *         + this.table.getName(), e);
     *   }
     * }
     * return numBuckets;
     */
  }

  public void setBucketCount(int newBucketNum) {
    tPartition.getSd().setNumBuckets(newBucketNum);
  }

  public List<String> getBucketCols() {
    return tPartition.getSd().getBucketCols();
  }

  public List<Order> getSortCols() {
    return tPartition.getSd().getSortCols();
  }

  public List<String> getSortColNames() {
    return Utilities.getColumnNamesFromSortCols(getSortCols());
  }

  /**
   * Get all paths for this partition in a sorted manner.
   */
  @SuppressWarnings("nls")
  public FileStatus[] getSortedPaths() {
    try {
      // Previously, this got the filesystem of the Table, which could be
      // different from the filesystem of the partition.
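      // Resolving the FileSystem from the partition's own data location handles
      // partitions whose data lives on a different filesystem than the table's.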
      FileSystem fs = getDataLocation().getFileSystem(SessionState.getSessionConf());
      String pathPattern = getDataLocation().toString();
      if (getBucketCount() > 0) {
        pathPattern = pathPattern + "/*";
      }
      LOG.info("Path pattern = " + pathPattern);
      FileStatus srcs[] = fs.globStatus(new Path(pathPattern), FileUtils.HIDDEN_FILES_PATH_FILTER);
      Arrays.sort(srcs);
      for (FileStatus src : srcs) {
        LOG.info("Got file: " + src.getPath());
      }
      if (srcs.length == 0) {
        return null;
      }
      return srcs;
    } catch (Exception e) {
      throw new RuntimeException("Cannot get path", e);
    }
  }

  /**
   * Mapping from bucket number to bucket path.
   */
  // TODO: add test case and clean it up
  @SuppressWarnings("nls")
  public Path getBucketPath(int bucketNum) {
    FileStatus srcs[] = getSortedPaths();
    if (srcs == null) {
      return null;
    }
    return srcs[bucketNum].getPath();
  }

  @SuppressWarnings("nls")
  public Path[] getPath(Sample s) throws HiveException {
    if (s == null) {
      return getPath();
    } else {
      int bcount = getBucketCount();
      if (bcount == 0) {
        return getPath();
      }

      Dimension d = s.getSampleDimension();
      if (!d.getDimensionId().equals(table.getBucketingDimensionId())) {
        // if the bucket dimension is not the same as the sampling dimension
        // we must scan all the data
        return getPath();
      }

      int scount = s.getSampleFraction();
      ArrayList<Path> ret = new ArrayList<Path>();

      if (bcount == scount) {
        ret.add(getBucketPath(s.getSampleNum() - 1));
      } else if (bcount < scount) {
        if ((scount / bcount) * bcount != scount) {
          throw new HiveException("Sample count " + scount
              + " is not a multiple of bucket count " + bcount + " for table "
              + table.getTableName());
        }
        // undersampling a bucket
        ret.add(getBucketPath((s.getSampleNum() - 1) % bcount));
      } else if (bcount > scount) {
        if ((bcount / scount) * scount != bcount) {
          throw new HiveException("Sample count " + scount
              + " is not a divisor of bucket count " + bcount + " for table "
              + table.getTableName());
        }
        // sampling multiple buckets
        for (int i = 0; i < bcount / scount; i++) {
          ret.add(getBucketPath(i * scount + (s.getSampleNum() - 1)));
        }
      }
      return (ret.toArray(new Path[ret.size()]));
    }
  }

  public LinkedHashMap<String, String> getSpec() {
    return table.createSpec(tPartition);
  }

  @SuppressWarnings("nls")
  @Override
  public String toString() {
    String pn = "Invalid Partition";
    try {
      pn = Warehouse.makePartName(getSpec(), false);
    } catch (MetaException e) {
      // ignore as we are most probably in an exception path already, otherwise
      // this error wouldn't occur
    }
    return table.toString() + "(" + pn + ")";
  }

  public Table getTable() {
    return table;
  }

  /**
   * Should only be used by serialization.
   */
  public void setTable(Table table) {
    this.table = table;
  }

  /**
   * Should only be used by serialization.
   */
  public org.apache.hadoop.hive.metastore.api.Partition getTPartition() {
    return tPartition;
  }

  /**
   * Should only be used by serialization.
   */
  public void setTPartition(
      org.apache.hadoop.hive.metastore.api.Partition partition) {
    StringInternUtils.internStringsInList(partition.getValues());
    tPartition = partition;
  }

  public Map<String, String> getParameters() {
    return tPartition.getParameters();
  }

  public List<FieldSchema> getCols() {
    return getColsInternal(false);
  }

  public List<FieldSchema> getColsForMetastore() {
    return getColsInternal(true);
  }

  private List<FieldSchema> getColsInternal(boolean forMs) {
    try {
      String serializationLib = tPartition.getSd().getSerdeInfo().getSerializationLib();
      // Do the lightweight check for general case.
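      // Either the metastore owns the schema (native serdes), the serde owns the
      // schema but it should not be written back to the metastore, or the
      // deserializer has to be asked for the fields directly.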
      if (Table.hasMetastoreBasedSchema(SessionState.getSessionConf(), serializationLib)) {
        return tPartition.getSd().getCols();
      } else if (forMs && !Table.shouldStoreFieldsInMetastore(
          SessionState.getSessionConf(), serializationLib, table.getParameters())) {
        return Hive.getFieldsFromDeserializerForMsStorage(table, getDeserializer());
      }
      return MetaStoreUtils.getFieldsFromDeserializer(table.getTableName(), getDeserializer());
    } catch (Exception e) {
      LOG.error("Unable to get cols from serde: "
          + tPartition.getSd().getSerdeInfo().getSerializationLib(), e);
    }

    return new ArrayList<FieldSchema>();
  }

  public String getLocation() {
    if (tPartition.getSd() == null) {
      return null;
    } else {
      return tPartition.getSd().getLocation();
    }
  }

  public void setLocation(String location) {
    tPartition.getSd().setLocation(location);
  }

  /**
   * Set Partition's values
   *
   * @param partSpec
   *          Partition specifications.
   * @throws HiveException
   *           Thrown if the partition spec is invalid.
   */
  public void setValues(Map<String, String> partSpec)
      throws HiveException {
    List<String> pvals = new ArrayList<String>();
    for (FieldSchema field : table.getPartCols()) {
      String val = partSpec.get(field.getName());
      if (val == null) {
        throw new HiveException("partition spec is invalid; field "
            + field.getName() + " does not exist in input");
      }
      pvals.add(val.intern());
    }
    tPartition.setValues(pvals);
  }

  /**
   * @return the complete name of the partition, including the db name
   */
  public String getCompleteName() {
    return getTable().getCompleteName() + "@" + getName();
  }

  public int getLastAccessTime() {
    return tPartition.getLastAccessTime();
  }

  public void setLastAccessTime(int lastAccessTime) {
    tPartition.setLastAccessTime(lastAccessTime);
  }

  public boolean isStoredAsSubDirectories() {
    return tPartition.getSd().isStoredAsSubDirectories();
  }

  public List<List<String>> getSkewedColValues() {
    return tPartition.getSd().getSkewedInfo().getSkewedColValues();
  }

  public List<String> getSkewedColNames() {
    LOG.debug("sd is " + tPartition.getSd().getClass().getName());
    return tPartition.getSd().getSkewedInfo().getSkewedColNames();
  }

  public void setSkewedValueLocationMap(List<String> valList, String dirName)
      throws HiveException {
    Map<List<String>, String> mappings = tPartition.getSd().getSkewedInfo()
        .getSkewedColValueLocationMaps();
    if (null == mappings) {
      mappings = new HashMap<List<String>, String>();
      tPartition.getSd().getSkewedInfo().setSkewedColValueLocationMaps(mappings);
    }

    // Add or update new mapping
    mappings.put(valList, dirName);
  }

  public Map<List<String>, String> getSkewedColValueLocationMaps() {
    return tPartition.getSd().getSkewedInfo().getSkewedColValueLocationMaps();
  }

  public void checkValidity() throws HiveException {
    if (!tPartition.getSd().equals(table.getSd())) {
      Table.validateColumns(getCols(), table.getPartCols());
    }
  }
}