MRInput.java example

Explorer
incubator-tez-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tez.mapreduce.input;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex;
import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitMetaInfo;
import org.apache.hadoop.mapreduce.split.TezGroupedSplitsInputFormat;
import org.apache.tez.common.counters.TaskCounter;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.mapreduce.hadoop.MRHelpers;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;
import org.apache.tez.mapreduce.input.base.MRInputBase;
import org.apache.tez.mapreduce.lib.MRInputUtils;
import org.apache.tez.mapreduce.lib.MRReader;
import org.apache.tez.mapreduce.lib.MRReaderMapReduce;
import org.apache.tez.mapreduce.lib.MRReaderMapred;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.Input;
import org.apache.tez.runtime.api.events.RootInputDataInformationEvent;
import org.apache.tez.runtime.library.api.KeyValueReader;

import com.google.common.base.Preconditions;

/**
 * {@link MRInput} is an {@link Input} which provides key/values pairs
 * for the consumer.
 *
 * It is compatible with all standard Apache Hadoop MapReduce 
 * {@link InputFormat} implementations.
 * 
 * This class is not meant to be extended by external projects.
 */

public class MRInput extends MRInputBase {

  private static final Log LOG = LogFactory.getLog(MRInput.class);
  
  private final ReentrantLock rrLock = new ReentrantLock();
  private final Condition rrInited = rrLock.newCondition();
  
  private volatile boolean eventReceived = false;

  private boolean readerCreated = false;

  protected MRReader mrReader;

  protected TaskSplitIndex splitMetaInfo = new TaskSplitIndex();

  // Potential counters - #splits, #totalSize, #actualyBytesRead
  
  @Private
  volatile boolean splitInfoViaEvents;
  
  /**
   * Helper API to generate the user payload for the MRInput and
   * MRInputAMSplitGenerator (if used). The InputFormat will be invoked by Tez
   * at DAG runtime to generate the input splits.
   * 
   * @param conf
   *          Configuration for the InputFormat
   * @param inputFormatClassName
   *          Name of the class of the InputFormat
   * @param useNewApi
   *          use new mapreduce API or old mapred API
   * @param groupSplitsInAM
   *          do grouping of splits in the AM. If true then splits generated by
   *          the InputFormat will be grouped in the AM based on available
   *          resources, locality etc. This option may be set to true only when
   *          using MRInputAMSplitGenerator as the initializer class in
   *          {@link Vertex#addInput(String, org.apache.tez.dag.api.InputDescriptor, Class)}
   * @return returns the user payload to be set on the InputDescriptor of  MRInput
   * @throws IOException
   */
  public static byte[] createUserPayload(Configuration conf,
      String inputFormatClassName, boolean useNewApi, boolean groupSplitsInAM)
      throws IOException {
    Configuration inputConf = new JobConf(conf);
    String wrappedInputFormatClassName = null;
    String configInputFormatClassName = null;
    if (groupSplitsInAM) {
      wrappedInputFormatClassName = inputFormatClassName;
      configInputFormatClassName = TezGroupedSplitsInputFormat.class.getName();
    } else {
      wrappedInputFormatClassName = null;
      configInputFormatClassName = inputFormatClassName;
    }
    inputConf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR,
        configInputFormatClassName);
    inputConf.setBoolean("mapred.mapper.new-api", useNewApi);
    MRHelpers.translateVertexConfToTez(inputConf);
    MRHelpers.doJobClientMagic(inputConf);
    if (groupSplitsInAM) {
      return MRHelpers.createMRInputPayloadWithGrouping(inputConf,
          wrappedInputFormatClassName);
    } else {
      return MRHelpers.createMRInputPayload(inputConf, null);
    }
  }
  
  @Override
  public List<Event> initialize() throws IOException {
    super.initialize();
    getContext().inputIsReady();
    this.splitInfoViaEvents = jobConf.getBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS,
        MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS_DEFAULT);
    LOG.info("Using New mapreduce API: " + useNewApi
        + ", split information via event: " + splitInfoViaEvents);
    initializeInternal();
    return null;
  }

  @Override
  public void start() {
    Preconditions.checkState(getNumPhysicalInputs() == 1, "Expecting only 1 physical input for MRInput");
  }

  @Private
  void initializeInternal() throws IOException {
    // Primarily for visibility
    rrLock.lock();
    try {
      
      if (splitInfoViaEvents) {
        if (useNewApi) {
          mrReader = new MRReaderMapReduce(jobConf, getContext().getCounters(), inputRecordCounter,
              getContext().getApplicationId().getClusterTimestamp(), getContext()
                  .getTaskVertexIndex(), getContext().getApplicationId().getId(), getContext()
                  .getTaskIndex(), getContext().getTaskAttemptNumber());
        } else {
          mrReader = new MRReaderMapred(jobConf, getContext().getCounters(), inputRecordCounter);
        }
      } else {
        TaskSplitMetaInfo[] allMetaInfo = MRInputUtils.readSplits(jobConf);
        TaskSplitMetaInfo thisTaskMetaInfo = allMetaInfo[getContext().getTaskIndex()];
        TaskSplitIndex splitMetaInfo = new TaskSplitIndex(thisTaskMetaInfo.getSplitLocation(),
            thisTaskMetaInfo.getStartOffset());
        if (useNewApi) {
          org.apache.hadoop.mapreduce.InputSplit newInputSplit = MRInputUtils
              .getNewSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                  .findCounter(TaskCounter.SPLIT_RAW_BYTES));
          mrReader = new MRReaderMapReduce(jobConf, newInputSplit, getContext().getCounters(),
              inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
              getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
              getContext().getTaskIndex(), getContext().getTaskAttemptNumber());
        } else {
          org.apache.hadoop.mapred.InputSplit oldInputSplit = MRInputUtils
              .getOldSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                  .findCounter(TaskCounter.SPLIT_RAW_BYTES));
          mrReader = new MRReaderMapred(jobConf, oldInputSplit, getContext().getCounters(), inputRecordCounter);
        }
      }
    } finally {
      rrLock.unlock();
    }
    LOG.info("Initialzed MRInput: " + getContext().getSourceVertexName());
  }

  @Override
  public KeyValueReader getReader() throws IOException {
    Preconditions
        .checkState(readerCreated == false,
            "Only a single instance of record reader can be created for this input.");
    readerCreated = true;
    rrLock.lock();
    try {
      if (!mrReader.isSetup())
        checkAndAwaitRecordReaderInitialization();
    } finally {
      rrLock.unlock();
    }

    return mrReader;
  }

  @Override
  public void handleEvents(List<Event> inputEvents) throws Exception {
    if (eventReceived || inputEvents.size() != 1) {
      throw new IllegalStateException(
          "MRInput expects only a single input. Received: current eventListSize: "
              + inputEvents.size() + "Received previous input: "
              + eventReceived);
    }
    Event event = inputEvents.iterator().next();
    Preconditions.checkArgument(event instanceof RootInputDataInformationEvent,
        getClass().getSimpleName()
            + " can only handle a single event of type: "
            + RootInputDataInformationEvent.class.getSimpleName());

    processSplitEvent((RootInputDataInformationEvent)event);
  }

  @Override
  public List<Event> close() throws IOException {
    mrReader.close();
    return null;
  }

  /**
   * {@link MRInput} sets some additional parameters like split location when using
   * the new API. This methods returns the list of additional updates, and
   * should be used by Processors using the old MapReduce API with {@link MRInput}.
   * 
   * @return the additional fields set by {@link MRInput}
   */
  public Configuration getConfigUpdates() {
    if (!useNewApi) {
      return ((MRReaderMapred) mrReader).getConfigUpdates();
    } else {
      return null;
    }
  }

  public float getProgress() throws IOException, InterruptedException {
    return mrReader.getProgress();
  }

  void processSplitEvent(RootInputDataInformationEvent event)
      throws IOException {
    rrLock.lock();
    try {
      initFromEventInternal(event);
      LOG.info("Notifying on RecordReader Initialized");
      rrInited.signal();
    } finally {
      rrLock.unlock();
    }
  }
  
  void checkAndAwaitRecordReaderInitialization() throws IOException {
    assert rrLock.getHoldCount() == 1;
    rrLock.lock();
    try {
      LOG.info("Awaiting RecordReader initialization");
      rrInited.await();
    } catch (Exception e) {
      throw new IOException(
          "Interrupted waiting for RecordReader initiailization");
    } finally {
      rrLock.unlock();
    }
  }

  @Private
  void initFromEvent(RootInputDataInformationEvent initEvent)
      throws IOException {
    rrLock.lock();
    try {
      initFromEventInternal(initEvent);
    } finally {
      rrLock.unlock();
    }
  }
  
  private void initFromEventInternal(RootInputDataInformationEvent initEvent) throws IOException {
    LOG.info("Initializing RecordReader from event");
    Preconditions.checkState(initEvent != null, "InitEvent must be specified");
    MRSplitProto splitProto = MRSplitProto.parseFrom(initEvent.getUserPayload());
    Object split = null;
    if (useNewApi) {
      split = MRInputUtils.getNewSplitDetailsFromEvent(splitProto, jobConf);
      LOG.info("Split Details -> SplitClass: " + split.getClass().getName() + ", NewSplit: "
          + split);

    } else {
      split = MRInputUtils.getOldSplitDetailsFromEvent(splitProto, jobConf);
      LOG.info("Split Details -> SplitClass: " + split.getClass().getName() + ", OldSplit: "
          + split);
    }
    mrReader.setSplit(split);
    LOG.info("Initialized RecordReader from event");
  }
}