/*
 * The Alluxio Open Foundation licenses this work under the Apache License, version 2.0
 * (the "License"). You may not use this work except in compliance with the License, which is
 * available at www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied, as more fully set forth in the License.
 *
 * See the NOTICE file distributed with this work for information regarding copyright ownership.
 */

package alluxio.hadoop.mapreduce;

import alluxio.annotation.PublicApi;
import alluxio.client.keyvalue.KeyValueSystem;
import alluxio.exception.AlluxioException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

import javax.annotation.concurrent.ThreadSafe;

/**
 * Extension of {@link FileOutputCommitter} that additionally creates, completes, or deletes
 * key-value stores via {@link KeyValueSystem} in the different phases of a job's or task's
 * lifecycle.
 * <p>
 * This committer must be used along with {@link KeyValueOutputFormat} to merge the key-value
 * stores created by each reducer into one key-value store under the MapReduce output directory.
 */
@PublicApi
@ThreadSafe
public final class KeyValueOutputCommitter extends FileOutputCommitter {
  private static final Logger LOG = LoggerFactory.getLogger(KeyValueOutputCommitter.class);

  private static final KeyValueSystem KEY_VALUE_SYSTEM = KeyValueSystem.Factory.create();

  /**
   * Constructor.
   *
   * @param outputPath the job's output path, or null if the output committer is a noop
   * @param taskContext the task's context
   */
  public KeyValueOutputCommitter(Path outputPath, TaskAttemptContext taskContext)
      throws IOException {
    super(outputPath, taskContext);
  }

  /**
   * @param taskContext the MapReduce task attempt context
   * @return true if the task output directory exists, otherwise false
   */
  @Override
  public boolean needsTaskCommit(TaskAttemptContext taskContext) throws IOException {
    Path taskOutputPath = new Path(KeyValueOutputFormat.getTaskOutputURI(taskContext).toString());
    FileSystem fs = taskOutputPath.getFileSystem(taskContext.getConfiguration());
    return fs.exists(taskOutputPath);
  }

  /**
   * {@inheritDoc}
   * <p>
   * Merges the completed key-value store under the task's temporary output directory into the
   * key-value store at the job's output directory, then calls
   * {@link FileOutputCommitter#commitTask(TaskAttemptContext)}.
   */
  @Override
  public void commitTask(TaskAttemptContext taskContext) throws IOException {
    try {
      KEY_VALUE_SYSTEM.mergeStore(KeyValueOutputFormat.getTaskOutputURI(taskContext),
          KeyValueOutputFormat.getJobOutputURI(taskContext));
    } catch (AlluxioException e) {
      throw new IOException(e);
    }
    super.commitTask(taskContext);
  }

  /**
   * {@inheritDoc}
   * <p>
   * Deletes the completed key-value store under the task's temporary output directory, and then
   * calls {@link FileOutputCommitter#abortTask(TaskAttemptContext)}.
   */
  @Override
  public void abortTask(TaskAttemptContext taskContext) {
    // TODO(binfan): in Hadoop 1.x, FileOutputCommitter#abortTask does not throw IOException. To
    // keep the code compiling with earlier Hadoop versions, we catch the IOException here.
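    // Delete the key-value store written under the task's temporary output directory, then let
    // the superclass clean up its temporary files; AlluxioException is wrapped in an IOException
    // so the single catch below covers failures from both steps.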
    try {
      try {
        KEY_VALUE_SYSTEM.deleteStore(KeyValueOutputFormat.getTaskOutputURI(taskContext));
      } catch (AlluxioException e) {
        throw new IOException(e);
      }
      super.abortTask(taskContext);
    } catch (IOException e) {
      LOG.error("Failed to abort task {}", taskContext.getTaskAttemptID(), e);
    }
  }

  /**
   * @return the name of the temporary directory used by the committer
   */
  public static String getPendingDirName() {
    // Due to Hadoop 1 support, we stick with the deprecated version. If we drop support for it,
    // FileOutputCommitter.PENDING_DIR_NAME will be the new one.
    return FileOutputCommitter.TEMP_DIR_NAME;
  }
}
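// A minimal usage sketch (an assumption, not part of this file): a MapReduce driver picks up this
// committer indirectly by setting KeyValueOutputFormat as the job's output format, assuming
// KeyValueOutputFormat accepts a standard FileOutputFormat output path. The job name and the
// Alluxio URI below are hypothetical.
//
//   Job job = Job.getInstance(new Configuration(), "key-value-store-job");
//   job.setOutputFormatClass(KeyValueOutputFormat.class);
//   FileOutputFormat.setOutputPath(job, new Path("alluxio://master:19998/output"));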