/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.IOException;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.FileMergeDesc;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Fast file merge operator for ORC and RCFile. This is an abstract class which
 * does not process any rows. Refer to {@link org.apache.hadoop.hive.ql.exec.OrcFileMergeOperator}
 * or {@link org.apache.hadoop.hive.ql.exec.RCFileMergeOperator} for more details.
 */
public abstract class AbstractFileMergeOperator<T extends FileMergeDesc>
    extends Operator<T> implements Serializable {

  public static final String BACKUP_PREFIX = "_backup.";
  public static final Logger LOG =
      LoggerFactory.getLogger(AbstractFileMergeOperator.class);

  protected JobConf jc;
  protected FileSystem fs;
  protected boolean autoDelete;
  protected boolean exception;
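  // Staging paths maintained by this operator (summary of the usage below):
  // tmpPath is the temp directory for the final output (_tmp.<dir>),
  // taskTmpPath is the scratch directory for in-flight task output
  // (_task_tmp.<dir>), outPath is the merged file this task writes under
  // taskTmpPath, and finalPath is the name outPath is renamed to under
  // tmpPath when the task commits in closeOp(). dpPath records the partition
  // directory first seen by this mapper so that later input paths can be
  // validated against it in checkPartitionsMatch().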
  protected Path outPath;
  protected Path finalPath;
  protected Path dpPath;
  protected Path tmpPath;
  protected Path taskTmpPath;
  protected int listBucketingDepth;
  protected boolean hasDynamicPartitions;
  protected boolean isListBucketingAlterTableConcatenate;
  protected boolean tmpPathFixedConcatenate;
  protected boolean tmpPathFixed;
  protected Set<Path> incompatFileSet;
  protected transient DynamicPartitionCtx dpCtx;

  /** Kryo ctor. */
  protected AbstractFileMergeOperator() {
    super();
  }

  public AbstractFileMergeOperator(CompilationOpContext ctx) {
    super(ctx);
  }

  @Override
  public void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    this.jc = new JobConf(hconf);
    incompatFileSet = new HashSet<Path>();
    autoDelete = false;
    exception = false;
    tmpPathFixed = false;
    tmpPathFixedConcatenate = false;
    outPath = null;
    finalPath = null;
    dpPath = null;
    tmpPath = null;
    taskTmpPath = null;
    dpCtx = conf.getDpCtx();
    hasDynamicPartitions = conf.hasDynamicPartitions();
    isListBucketingAlterTableConcatenate =
        conf.isListBucketingAlterTableConcatenate();
    listBucketingDepth = conf.getListBucketingDepth();
    Path specPath = conf.getOutputPath();
    updatePaths(Utilities.toTempPath(specPath),
        Utilities.toTaskTempPath(specPath));
    try {
      fs = specPath.getFileSystem(hconf);
      autoDelete = fs.deleteOnExit(outPath);
    } catch (IOException e) {
      this.exception = true;
      throw new HiveException("Failed to initialize AbstractFileMergeOperator", e);
    }
  }

  // sets up temp and task temp path
  private void updatePaths(Path tp, Path ttp) {
    String taskId = Utilities.getTaskId(jc);
    tmpPath = tp;
    taskTmpPath = ttp;
    finalPath = new Path(tp, taskId);
    outPath = new Path(ttp, Utilities.toTempPath(taskId));
  }
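  // Illustration (path and task id values are hypothetical): for an output
  // path /warehouse/t/-ext-10000 and task id 000000_0, and assuming
  // Utilities.toTempPath()/toTaskTempPath() prepend "_tmp."/"_task_tmp." to
  // the last path component as described in the fixTmpPath() javadoc below,
  // updatePaths() yields
  //   tmpPath     = /warehouse/t/_tmp.-ext-10000
  //   taskTmpPath = /warehouse/t/_task_tmp.-ext-10000
  //   finalPath   = /warehouse/t/_tmp.-ext-10000/000000_0
  //   outPath     = /warehouse/t/_task_tmp.-ext-10000/_tmp.000000_0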
  /**
   * Fixes tmpPath to point to the correct partition. initializeOp() will set
   * tmpPath and taskTmpPath based on the root table directory. So initially,
   * tmpPath will be <prefix>/_tmp.-ext-10000 and taskTmpPath will be
   * <prefix>/_task_tmp.-ext-10000. The depth of these two paths will be 0.
   * Now, in case of dynamic partitioning or list bucketing the inputPath will
   * have additional sub-directories under the root table directory. This
   * function updates tmpPath and taskTmpPath to reflect these additional
   * subdirectories. It updates tmpPath and taskTmpPath in the following way:
   * 1. finds out the difference in path based on the depthDiff provided
   *    and saves the path difference in newPath
   * 2. newPath is used to update the existing tmpPath and taskTmpPath similar
   *    to the way initializeOp() does.
   *
   * Note: The path difference between inputPath and tmpPath can be DP or
   * DP+LB. This method will automatically handle it.
   *
   * Continuing the example above, if inputPath is <prefix>/-ext-10000/hr=a1/,
   * newPath will be hr=a1/. Then, tmpPath and taskTmpPath will be updated to
   * <prefix>/-ext-10000/hr=a1/_tmp.-ext-10000 and
   * <prefix>/-ext-10000/hr=a1/_task_tmp.-ext-10000 respectively.
   * list_bucket_dml_6.q covers this case: DP + LB + multiple skewed values
   * + merge.
   *
   * @param inputPath - input path
   * @throws java.io.IOException
   */
  protected void fixTmpPath(Path inputPath, int depthDiff) throws IOException {
    // no need to update temp paths when there is no depth difference in paths
    if (depthDiff <= 0) {
      return;
    }
    dpPath = inputPath;
    Path newPath = new Path(".");
    // build the path difference from the bottom up
    while (inputPath != null && depthDiff > 0) {
      newPath = new Path(inputPath.getName(), newPath);
      depthDiff--;
      inputPath = inputPath.getParent();
    }

    Path newTmpPath = new Path(tmpPath, newPath);
    Path newTaskTmpPath = new Path(taskTmpPath, newPath);
    if (!fs.exists(newTmpPath)) {
      fs.mkdirs(newTmpPath);
    }
    updatePaths(newTmpPath, newTaskTmpPath);
  }

  /**
   * Validates that each input path belongs to the same partition, since each
   * mapper merges its input to a single output directory.
   *
   * @param inputPath - input path
   */
  protected void checkPartitionsMatch(Path inputPath) throws IOException {
    if (!dpPath.equals(inputPath)) {
      // the partition input path does not match the existing temp path
      String msg = "Multiple partitions for one merge mapper: " + dpPath +
          " NOT EQUAL TO " + inputPath;
      LOG.error(msg);
      throw new IOException(msg);
    }
  }

  protected void fixTmpPath(Path path) throws IOException {
    // Fix temp path for alter table ... concatenate
    if (isListBucketingAlterTableConcatenate) {
      if (this.tmpPathFixedConcatenate) {
        checkPartitionsMatch(path);
      } else {
        fixTmpPath(path, listBucketingDepth);
        tmpPathFixedConcatenate = true;
      }
    } else {
      if (hasDynamicPartitions || (listBucketingDepth > 0)) {
        if (tmpPathFixed) {
          checkPartitionsMatch(path);
        } else {
          // We haven't fixed the TMP path for this mapper yet
          int depthDiff = path.depth() - tmpPath.depth();
          fixTmpPath(path, depthDiff);
          tmpPathFixed = true;
        }
      }
    }
  }
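  // closeOp() commits this task's output: on success the merged file is
  // renamed from the task scratch directory into the temp output directory
  // (outPath -> finalPath) and any files that were skipped as incompatible
  // for merging are moved next to it; on abort the partial output is deleted
  // unless the filesystem has already been asked to delete it on exit
  // (autoDelete).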
  @Override
  public void closeOp(boolean abort) throws HiveException {
    try {
      if (!abort) {
        // If outPath does not exist, then it means all paths within the combine
        // split were skipped as they are incompatible for merge (for example:
        // files without stripe stats). Those files will have been added to
        // incompatFileSet.
        if (fs.exists(outPath)) {
          FileStatus fss = fs.getFileStatus(outPath);
          if (!fs.rename(outPath, finalPath)) {
            throw new IOException(
                "Unable to rename " + outPath + " to " + finalPath);
          }
          LOG.info("Renamed path " + outPath + " to " + finalPath +
              ". File size is " + fss.getLen());
        }

        // move any incompatible files to the final path
        if (incompatFileSet != null && !incompatFileSet.isEmpty()) {
          for (Path incompatFile : incompatFileSet) {
            Path destDir = finalPath.getParent();
            try {
              Utilities.renameOrMoveFiles(fs, incompatFile, destDir);
              LOG.info("Moved incompatible file " + incompatFile + " to " + destDir);
            } catch (HiveException e) {
              LOG.error("Unable to move " + incompatFile + " to " + destDir);
              throw new IOException(e);
            }
          }
        }
      } else {
        if (!autoDelete) {
          fs.delete(outPath, true);
        }
      }
    } catch (IOException e) {
      throw new HiveException("Failed to close AbstractFileMergeOperator", e);
    }
  }

  @Override
  public void jobCloseOp(Configuration hconf, boolean success) throws HiveException {
    try {
      Path outputDir = conf.getOutputPath();
      FileSystem fs = outputDir.getFileSystem(hconf);
      Path backupPath = backupOutputPath(fs, outputDir);
      Utilities.mvFileToFinalPath(outputDir, hconf, success, LOG,
          conf.getDpCtx(), null, reporter);
      if (success) {
        LOG.info("jobCloseOp moved merged files to output dir: " + outputDir);
      }
      if (backupPath != null) {
        fs.delete(backupPath, true);
      }
    } catch (IOException e) {
      throw new HiveException("Failed jobCloseOp for AbstractFileMergeOperator", e);
    }
    super.jobCloseOp(hconf, success);
  }

  private Path backupOutputPath(FileSystem fs, Path outpath)
      throws IOException, HiveException {
    if (fs.exists(outpath)) {
      Path backupPath = new Path(outpath.getParent(),
          BACKUP_PREFIX + outpath.getName());
      Utilities.rename(fs, outpath, backupPath);
      return backupPath;
    } else {
      return null;
    }
  }

  @Override
  public String getName() {
    return AbstractFileMergeOperator.getOperatorName();
  }

  public static String getOperatorName() {
    return "MERGE";
  }
}