/*
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.beam.sdk.io.hadoop.inputformat;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import com.google.auto.value.AutoValue;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.AtomicDouble;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import javax.annotation.Nullable;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.coders.CannotProvideCoderException;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.CoderRegistry;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.hadoop.WritableCoder;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptor;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A {@link HadoopInputFormatIO} is a Transform for reading data from any source that implements
 * Hadoop {@link InputFormat}, for example Cassandra, Elasticsearch, HBase, Redis, Postgres, etc.
 * {@link HadoopInputFormatIO} has to make several performance trade-offs in connecting to an
 * {@link InputFormat}, so if there is another Beam IO Transform specifically for connecting to
 * your data source of choice, we recommend using that one; this IO Transform, however, allows you
 * to connect to many data sources that do not yet have a Beam IO Transform.
*
* <p>You will need to pass a Hadoop {@link Configuration} with parameters specifying how the read
* will occur. Many properties of the Configuration are optional, and some are required for certain
* {@link InputFormat} classes, but the following properties must be set for all InputFormats:
* <ul>
* <li>{@code mapreduce.job.inputformat.class}: The {@link InputFormat} class used to connect to
* your data source of choice.</li>
* <li>{@code key.class}: The key class returned by the {@link InputFormat} in
* {@code mapreduce.job.inputformat.class}.</li>
* <li>{@code value.class}: The value class returned by the {@link InputFormat} in
* {@code mapreduce.job.inputformat.class}.</li>
* </ul>
* For example:
*
 * <pre>
 * {@code
 * Configuration myHadoopConfiguration = new Configuration(false);
 * // Set Hadoop InputFormat, key and value class in configuration
 * myHadoopConfiguration.setClass("mapreduce.job.inputformat.class",
 *     MyDbInputFormatClass, InputFormat.class);
 * myHadoopConfiguration.setClass("key.class", MyDbInputFormatKeyClass, Object.class);
 * myHadoopConfiguration.setClass("value.class",
 *     MyDbInputFormatValueClass, Object.class);
 * }
 * </pre>
*
 * <p>You will need to check if the key and value classes output by the {@link InputFormat} have a
 * Beam {@link Coder} available. If not, you can use {@code withKeyTranslation} or
 * {@code withValueTranslation} to specify a function that transforms instances of those classes
 * into another class that is supported by a Beam {@link Coder}. These settings are optional and
 * you don't need to specify a translation for both key and value. If you do specify a translation,
 * make sure the {@code K} or {@code V} of the read transform matches the output type of the
 * translation.
*
 * <p>You will need to set the correct InputFormat key and value classes (i.e. "key.class" and
 * "value.class") in the Hadoop {@link Configuration}. If the configured key or value class differs
 * from the InputFormat's actual key or value class, decoding of the key/value objects may fail
 * with errors such as "unexpected extra bytes after decoding". Hence, it is important to set the
 * correct InputFormat key and value classes.
*
* <h3>Reading using {@link HadoopInputFormatIO}</h3>
*
 * <pre>
 * {@code
 * Pipeline p = ...; // Create pipeline.
 * // Read data only with Hadoop configuration.
 * p.apply("read",
 *     HadoopInputFormatIO.<InputFormatKeyClass, InputFormatValueClass>read()
 *         .withConfiguration(myHadoopConfiguration));
 * }
 * </pre>
 *
 * <p>Read data with configuration and key translation (example scenario: a Beam Coder is not
 * available for the key class, hence key translation is required):
 *
 * <pre>
 * {@code
 * SimpleFunction<InputFormatKeyClass, MyKeyClass> myOutputKeyType =
 *     new SimpleFunction<InputFormatKeyClass, MyKeyClass>() {
 *       public MyKeyClass apply(InputFormatKeyClass input) {
 *         // ...logic to transform InputFormatKeyClass to MyKeyClass
 *       }
 *     };
 * }
 * </pre>
 *
 * <pre>
 * {@code
 * p.apply("read",
 *     HadoopInputFormatIO.<MyKeyClass, InputFormatValueClass>read()
 *         .withConfiguration(myHadoopConfiguration)
 *         .withKeyTranslation(myOutputKeyType));
 * }
 * </pre>
 *
 * <p>Read data with configuration and value translation (example scenario: a Beam Coder is not
 * available for the value class, hence value translation is required):
 *
 * <pre>
 * {@code
 * SimpleFunction<InputFormatValueClass, MyValueClass> myOutputValueType =
 *     new SimpleFunction<InputFormatValueClass, MyValueClass>() {
 *       public MyValueClass apply(InputFormatValueClass input) {
 *         // ...logic to transform InputFormatValueClass to MyValueClass
 *       }
 *     };
 * }
 * </pre>
 *
 * <pre>
 * {@code
 * p.apply("read",
 *     HadoopInputFormatIO.<InputFormatKeyClass, MyValueClass>read()
 *         .withConfiguration(myHadoopConfiguration)
 *         .withValueTranslation(myOutputValueType));
 * }
 * </pre>
*/
@Experimental
public class HadoopInputFormatIO {
private static final Logger LOG = LoggerFactory.getLogger(HadoopInputFormatIO.class);
/**
 * Creates an uninitialized {@link HadoopInputFormatIO.Read}. Before use, the {@code Read} must
 * be initialized with a {@link HadoopInputFormatIO.Read#withConfiguration(Configuration)} call
 * that specifies the source. A key/value translation may also optionally be specified using
 * {@link HadoopInputFormatIO.Read#withKeyTranslation}/
 * {@link HadoopInputFormatIO.Read#withValueTranslation}.
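 *
 * <p>For example, a minimal sketch ({@code MyKeyClass}, {@code MyValueClass} and
 * {@code myHadoopConfiguration} are illustrative placeholders, not part of this API):
 *
 * <pre>
 * {@code
 * HadoopInputFormatIO.Read<MyKeyClass, MyValueClass> read =
 *     HadoopInputFormatIO.<MyKeyClass, MyValueClass>read()
 *         .withConfiguration(myHadoopConfiguration);
 * }
 * </pre>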
*/
public static <K, V> Read<K, V> read() {
return new AutoValue_HadoopInputFormatIO_Read.Builder<K, V>().build();
}
/**
 * A {@link PTransform} that reads from any data source which implements Hadoop
 * {@link InputFormat}, for example Cassandra, Elasticsearch, HBase, Redis, Postgres, etc. See the
 * class-level Javadoc on {@link HadoopInputFormatIO} for more information.
* @param <K> Type of keys to be read.
* @param <V> Type of values to be read.
* @see HadoopInputFormatIO
*/
@AutoValue
public abstract static class Read<K, V> extends PTransform<PBegin, PCollection<KV<K, V>>> {
// Returns the Hadoop Configuration which contains specification of source.
@Nullable
public abstract SerializableConfiguration getConfiguration();
@Nullable public abstract SimpleFunction<?, K> getKeyTranslationFunction();
@Nullable public abstract SimpleFunction<?, V> getValueTranslationFunction();
@Nullable public abstract TypeDescriptor<K> getKeyTypeDescriptor();
@Nullable public abstract TypeDescriptor<V> getValueTypeDescriptor();
@Nullable public abstract TypeDescriptor<?> getinputFormatClass();
@Nullable public abstract TypeDescriptor<?> getinputFormatKeyClass();
@Nullable public abstract TypeDescriptor<?> getinputFormatValueClass();
abstract Builder<K, V> toBuilder();
@AutoValue.Builder
abstract static class Builder<K, V> {
abstract Builder<K, V> setConfiguration(SerializableConfiguration configuration);
abstract Builder<K, V> setKeyTranslationFunction(SimpleFunction<?, K> function);
abstract Builder<K, V> setValueTranslationFunction(SimpleFunction<?, V> function);
abstract Builder<K, V> setKeyTypeDescriptor(TypeDescriptor<K> keyTypeDescriptor);
abstract Builder<K, V> setValueTypeDescriptor(TypeDescriptor<V> valueTypeDescriptor);
abstract Builder<K, V> setInputFormatClass(TypeDescriptor<?> inputFormatClass);
abstract Builder<K, V> setInputFormatKeyClass(TypeDescriptor<?> inputFormatKeyClass);
abstract Builder<K, V> setInputFormatValueClass(TypeDescriptor<?> inputFormatValueClass);
abstract Read<K, V> build();
}
/**
* Returns a new {@link HadoopInputFormatIO.Read} that will read from the source using the
* options provided by the given configuration.
*
* <p>Does not modify this object.
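 *
 * <p>A minimal sketch of the required configuration (names such as {@code MyDbInputFormat},
 * {@code MyDbKey} and {@code MyDbValue} are illustrative placeholders):
 *
 * <pre>
 * {@code
 * Configuration conf = new Configuration(false);
 * conf.setClass("mapreduce.job.inputformat.class", MyDbInputFormat.class, InputFormat.class);
 * conf.setClass("key.class", MyDbKey.class, Object.class);
 * conf.setClass("value.class", MyDbValue.class, Object.class);
 * HadoopInputFormatIO.Read<MyDbKey, MyDbValue> read =
 *     HadoopInputFormatIO.<MyDbKey, MyDbValue>read().withConfiguration(conf);
 * }
 * </pre>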
*/
public Read<K, V> withConfiguration(Configuration configuration) {
validateConfiguration(configuration);
TypeDescriptor<?> inputFormatClass =
TypeDescriptor.of(configuration.getClass("mapreduce.job.inputformat.class", null));
TypeDescriptor<?> inputFormatKeyClass =
TypeDescriptor.of(configuration.getClass("key.class", null));
TypeDescriptor<?> inputFormatValueClass =
TypeDescriptor.of(configuration.getClass("value.class", null));
Builder<K, V> builder =
toBuilder().setConfiguration(new SerializableConfiguration(configuration));
builder.setInputFormatClass(inputFormatClass);
builder.setInputFormatKeyClass(inputFormatKeyClass);
builder.setInputFormatValueClass(inputFormatValueClass);
/*
* Sets the output key class to InputFormat key class if withKeyTranslation() is not called
* yet.
*/
if (getKeyTranslationFunction() == null) {
builder.setKeyTypeDescriptor((TypeDescriptor<K>) inputFormatKeyClass);
}
/*
* Sets the output value class to InputFormat value class if withValueTranslation() is not
* called yet.
*/
if (getValueTranslationFunction() == null) {
builder.setValueTypeDescriptor((TypeDescriptor<V>) inputFormatValueClass);
}
return builder.build();
}
/**
* Returns a new {@link HadoopInputFormatIO.Read} that will transform the keys read from the
* source using the given key translation function.
*
* <p>Does not modify this object.
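 *
 * <p>For example, a sketch that translates a Hadoop {@link org.apache.hadoop.io.Text} key into a
 * {@link String} (assuming the InputFormat in use emits {@code Text} keys):
 *
 * <pre>
 * {@code
 * read = read.withKeyTranslation(
 *     new SimpleFunction<Text, String>() {
 *       @Override
 *       public String apply(Text input) {
 *         return input.toString();
 *       }
 *     });
 * }
 * </pre>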
*/
public Read<K, V> withKeyTranslation(SimpleFunction<?, K> function) {
checkNotNull(function, "function");
// Sets key class to key translation function's output class type.
return toBuilder().setKeyTranslationFunction(function)
.setKeyTypeDescriptor((TypeDescriptor<K>) function.getOutputTypeDescriptor()).build();
}
/**
* Returns a new {@link HadoopInputFormatIO.Read} that will transform the values read from the
* source using the given value translation function.
*
* <p>Does not modify this object.
*/
public Read<K, V> withValueTranslation(SimpleFunction<?, V> function) {
checkNotNull(function, "function");
// Sets value class to value translation function's output class type.
return toBuilder().setValueTranslationFunction(function)
.setValueTypeDescriptor((TypeDescriptor<V>) function.getOutputTypeDescriptor()).build();
}
@Override
public PCollection<KV<K, V>> expand(PBegin input) {
validateTransform();
// Get the key and value coders based on the key and value classes.
CoderRegistry coderRegistry = input.getPipeline().getCoderRegistry();
Coder<K> keyCoder = getDefaultCoder(getKeyTypeDescriptor(), coderRegistry);
Coder<V> valueCoder = getDefaultCoder(getValueTypeDescriptor(), coderRegistry);
HadoopInputFormatBoundedSource<K, V> source = new HadoopInputFormatBoundedSource<K, V>(
getConfiguration(),
keyCoder,
valueCoder,
getKeyTranslationFunction(),
getValueTranslationFunction());
return input.getPipeline().apply(org.apache.beam.sdk.io.Read.from(source));
}
/**
* Validates that the mandatory configuration properties such as InputFormat class, InputFormat
* key and value classes are provided in the Hadoop configuration.
*/
private void validateConfiguration(Configuration configuration) {
checkNotNull(configuration, "configuration");
checkNotNull(configuration.get("mapreduce.job.inputformat.class"),
"configuration.get(\"mapreduce.job.inputformat.class\")");
checkNotNull(configuration.get("key.class"), "configuration.get(\"key.class\")");
checkNotNull(configuration.get("value.class"),
"configuration.get(\"value.class\")");
}
/**
* Validates construction of this transform.
*/
@VisibleForTesting
void validateTransform() {
checkNotNull(getConfiguration(), "getConfiguration()");
// Validate that the key translation function's input type matches the InputFormat's key class.
validateTranslationFunction(getinputFormatKeyClass(), getKeyTranslationFunction(),
"Key translation's input type is not same as hadoop InputFormat : %s key class : %s");
// Validate that the value translation function's input type matches the InputFormat's value class.
validateTranslationFunction(getinputFormatValueClass(), getValueTranslationFunction(),
"Value translation's input type is not same as hadoop InputFormat : "
+ "%s value class : %s");
}
/**
* Validates translation function given for key/value translation.
*/
private void validateTranslationFunction(TypeDescriptor<?> inputType,
SimpleFunction<?, ?> simpleFunction, String errorMsg) {
if (simpleFunction != null) {
if (!simpleFunction.getInputTypeDescriptor().equals(inputType)) {
throw new IllegalArgumentException(
String.format(errorMsg, getinputFormatClass().getRawType(), inputType.getRawType()));
}
}
}
/**
 * Returns the default coder for a given type descriptor. The coder registry is queried first; if
 * no coder is registered there and the type descriptor is of type {@link Writable}, a
 * {@link WritableCoder} is returned; otherwise an exception ("Cannot find coder") is thrown.
*/
public <T> Coder<T> getDefaultCoder(TypeDescriptor<?> typeDesc, CoderRegistry coderRegistry) {
Class classType = typeDesc.getRawType();
try {
return (Coder<T>) coderRegistry.getCoder(typeDesc);
} catch (CannotProvideCoderException e) {
if (Writable.class.isAssignableFrom(classType)) {
return (Coder<T>) WritableCoder.of(classType);
}
throw new IllegalStateException(String.format("Cannot find coder for %s : ", typeDesc)
+ e.getMessage(), e);
}
}
}
/**
* Bounded source implementation for {@link HadoopInputFormatIO}.
* @param <K> Type of keys to be read.
* @param <V> Type of values to be read.
*/
public static class HadoopInputFormatBoundedSource<K, V> extends BoundedSource<KV<K, V>>
implements Serializable {
private final SerializableConfiguration conf;
private final Coder<K> keyCoder;
private final Coder<V> valueCoder;
@Nullable private final SimpleFunction<?, K> keyTranslationFunction;
@Nullable private final SimpleFunction<?, V> valueTranslationFunction;
private final SerializableSplit inputSplit;
private transient List<SerializableSplit> inputSplits;
private long boundedSourceEstimatedSize = 0;
private transient InputFormat<?, ?> inputFormatObj;
private transient TaskAttemptContext taskAttemptContext;
private static final Set<Class<?>> immutableTypes = new HashSet<Class<?>>(
Arrays.asList(
String.class,
Byte.class,
Short.class,
Integer.class,
Long.class,
Float.class,
Double.class,
Boolean.class,
BigInteger.class,
BigDecimal.class));
HadoopInputFormatBoundedSource(
SerializableConfiguration conf,
Coder<K> keyCoder,
Coder<V> valueCoder,
@Nullable SimpleFunction<?, K> keyTranslationFunction,
@Nullable SimpleFunction<?, V> valueTranslationFunction) {
this(conf,
keyCoder,
valueCoder,
keyTranslationFunction,
valueTranslationFunction,
null);
}
protected HadoopInputFormatBoundedSource(
SerializableConfiguration conf,
Coder<K> keyCoder,
Coder<V> valueCoder,
@Nullable SimpleFunction<?, K> keyTranslationFunction,
@Nullable SimpleFunction<?, V> valueTranslationFunction,
SerializableSplit inputSplit) {
this.conf = conf;
this.inputSplit = inputSplit;
this.keyCoder = keyCoder;
this.valueCoder = valueCoder;
this.keyTranslationFunction = keyTranslationFunction;
this.valueTranslationFunction = valueTranslationFunction;
}
public SerializableConfiguration getConfiguration() {
return conf;
}
@Override
public void validate() {
checkNotNull(conf, "conf");
checkNotNull(keyCoder, "keyCoder");
checkNotNull(valueCoder, "valueCoder");
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
Configuration hadoopConfig = getConfiguration().getHadoopConfiguration();
if (hadoopConfig != null) {
builder.addIfNotNull(DisplayData.item("mapreduce.job.inputformat.class",
hadoopConfig.get("mapreduce.job.inputformat.class"))
.withLabel("InputFormat Class"));
builder.addIfNotNull(DisplayData.item("key.class",
hadoopConfig.get("key.class"))
.withLabel("Key Class"));
builder.addIfNotNull(DisplayData.item("value.class",
hadoopConfig.get("value.class"))
.withLabel("Value Class"));
}
}
@Override
public List<BoundedSource<KV<K, V>>> split(long desiredBundleSizeBytes,
PipelineOptions options) throws Exception {
// desiredBundleSizeBytes is ignored because the InputFormat's getSplits() method does not
// support splitting based on a desired bundle size.
if (inputSplit != null) {
LOG.info("Not splitting source {} because source is already split.", this);
return ImmutableList.of((BoundedSource<KV<K, V>>) this);
}
computeSplitsIfNecessary();
LOG.info("Generated {} splits. Size of first split is {} ", inputSplits.size(), inputSplits
.get(0).getSplit().getLength());
return Lists.transform(inputSplits,
new Function<SerializableSplit, BoundedSource<KV<K, V>>>() {
@Override
public BoundedSource<KV<K, V>> apply(SerializableSplit serializableInputSplit) {
HadoopInputFormatBoundedSource<K, V> hifBoundedSource =
new HadoopInputFormatBoundedSource<K, V>(conf, keyCoder, valueCoder,
keyTranslationFunction, valueTranslationFunction, serializableInputSplit);
return hifBoundedSource;
}
});
}
@Override
public long getEstimatedSizeBytes(PipelineOptions po) throws Exception {
if (inputSplit == null) {
// If there are no splits computed yet, then retrieve the splits.
computeSplitsIfNecessary();
return boundedSourceEstimatedSize;
}
return inputSplit.getSplit().getLength();
}
/**
 * This is a helper function to compute splits. It also calculates the size of the data being
 * read. Note: This method is executed exactly once; the splits are retrieved and cached in this
 * source, and are then used by split() and getEstimatedSizeBytes().
*/
@VisibleForTesting
void computeSplitsIfNecessary() throws IOException, InterruptedException {
if (inputSplits != null) {
return;
}
createInputFormatInstance();
List<InputSplit> splits =
inputFormatObj.getSplits(Job.getInstance(conf.getHadoopConfiguration()));
if (splits == null) {
throw new IOException("Error in computing splits, getSplits() returns null.");
}
if (splits.isEmpty()) {
throw new IOException("Error in computing splits, getSplits() returns a empty list");
}
boundedSourceEstimatedSize = 0;
inputSplits = new ArrayList<SerializableSplit>();
for (InputSplit inputSplit : splits) {
if (inputSplit == null) {
throw new IOException("Error in computing splits, split is null in InputSplits list "
+ "populated by getSplits() : ");
}
boundedSourceEstimatedSize += inputSplit.getLength();
inputSplits.add(new SerializableSplit(inputSplit));
}
}
/**
 * Creates an instance of the {@link InputFormat} class. The InputFormat class name is specified
 * in the Hadoop configuration.
*/
protected void createInputFormatInstance() throws IOException {
if (inputFormatObj == null) {
try {
taskAttemptContext =
new TaskAttemptContextImpl(conf.getHadoopConfiguration(), new TaskAttemptID());
inputFormatObj =
(InputFormat<?, ?>) conf
.getHadoopConfiguration()
.getClassByName(
conf.getHadoopConfiguration().get("mapreduce.job.inputformat.class"))
.newInstance();
/*
* If InputFormat explicitly implements interface {@link Configurable}, then setConf()
* method of {@link Configurable} needs to be explicitly called to set all the
* configuration parameters. For example: InputFormat classes which implement Configurable
* are {@link org.apache.hadoop.mapreduce.lib.db.DBInputFormat DBInputFormat}, {@link
* org.apache.hadoop.hbase.mapreduce.TableInputFormat TableInputFormat}, etc.
*/
if (Configurable.class.isAssignableFrom(inputFormatObj.getClass())) {
((Configurable) inputFormatObj).setConf(conf.getHadoopConfiguration());
}
} catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
throw new IOException("Unable to create InputFormat object: ", e);
}
}
}
@VisibleForTesting
InputFormat<?, ?> getInputFormat() {
return inputFormatObj;
}
@VisibleForTesting
void setInputFormatObj(InputFormat<?, ?> inputFormatObj) {
this.inputFormatObj = inputFormatObj;
}
@Override
public Coder<KV<K, V>> getDefaultOutputCoder() {
return KvCoder.of(keyCoder, valueCoder);
}
@Override
public BoundedReader<KV<K, V>> createReader(PipelineOptions options) throws IOException {
this.validate();
if (inputSplit == null) {
throw new IOException("Cannot create reader as source is not split yet.");
} else {
createInputFormatInstance();
return new HadoopInputFormatReader<>(
this,
keyTranslationFunction,
valueTranslationFunction,
inputSplit,
inputFormatObj,
taskAttemptContext);
}
}
/**
* BoundedReader for Hadoop InputFormat source.
*
 * @param <T1> Type of keys RecordReader emits.
 * @param <T2> Type of values RecordReader emits.
*/
class HadoopInputFormatReader<T1, T2> extends BoundedSource.BoundedReader<KV<K, V>> {
private final HadoopInputFormatBoundedSource<K, V> source;
@Nullable private final SimpleFunction<T1, K> keyTranslationFunction;
@Nullable private final SimpleFunction<T2, V> valueTranslationFunction;
private final SerializableSplit split;
private RecordReader<T1, T2> recordReader;
private volatile boolean doneReading = false;
private AtomicLong recordsReturned = new AtomicLong();
// Tracks the progress of the RecordReader.
private AtomicDouble progressValue = new AtomicDouble();
private transient InputFormat<T1, T2> inputFormatObj;
private transient TaskAttemptContext taskAttemptContext;
private HadoopInputFormatReader(HadoopInputFormatBoundedSource<K, V> source,
@Nullable SimpleFunction keyTranslationFunction,
@Nullable SimpleFunction valueTranslationFunction,
SerializableSplit split,
InputFormat inputFormatObj,
TaskAttemptContext taskAttemptContext) {
this.source = source;
this.keyTranslationFunction = keyTranslationFunction;
this.valueTranslationFunction = valueTranslationFunction;
this.split = split;
this.inputFormatObj = inputFormatObj;
this.taskAttemptContext = taskAttemptContext;
}
@Override
public HadoopInputFormatBoundedSource<K, V> getCurrentSource() {
return source;
}
@Override
public boolean start() throws IOException {
try {
recordsReturned.set(0L);
recordReader =
(RecordReader<T1, T2>) inputFormatObj.createRecordReader(split.getSplit(),
taskAttemptContext);
if (recordReader != null) {
recordReader.initialize(split.getSplit(), taskAttemptContext);
progressValue.set(getProgress());
if (recordReader.nextKeyValue()) {
recordsReturned.incrementAndGet();
doneReading = false;
return true;
}
} else {
throw new IOException(String.format("Null RecordReader object returned by %s",
inputFormatObj.getClass()));
}
recordReader = null;
} catch (InterruptedException e) {
throw new IOException(
"Could not read because the thread got interrupted while "
+ "reading the records with an exception: ",
e);
}
doneReading = true;
return false;
}
@Override
public boolean advance() throws IOException {
try {
progressValue.set(getProgress());
if (recordReader.nextKeyValue()) {
recordsReturned.incrementAndGet();
return true;
}
doneReading = true;
} catch (InterruptedException e) {
throw new IOException("Unable to read data: ", e);
}
return false;
}
@Override
public KV<K, V> getCurrent() {
K key = null;
V value = null;
try {
// Transform key if translation function is provided.
key =
transformKeyOrValue((T1) recordReader.getCurrentKey(), keyTranslationFunction,
keyCoder);
// Transform value if translation function is provided.
value =
transformKeyOrValue((T2) recordReader.getCurrentValue(), valueTranslationFunction,
valueCoder);
} catch (IOException | InterruptedException e) {
LOG.error("Unable to read data: " + "{}", e);
throw new IllegalStateException("Unable to read data: " + "{}", e);
}
return KV.of(key, value);
}
/**
 * Returns the transformed key or value object, cloned if it is potentially mutable.
* @throws ClassCastException
* @throws CoderException
*/
private <T, T3> T3 transformKeyOrValue(T input,
@Nullable SimpleFunction<T, T3> simpleFunction, Coder<T3> coder) throws CoderException,
ClassCastException {
T3 output;
if (null != simpleFunction) {
output = simpleFunction.apply(input);
} else {
output = (T3) input;
}
return cloneIfPossiblyMutable((T3) output, coder);
}
/**
* Beam expects immutable objects, but the Hadoop InputFormats tend to re-use the same object
* when returning them. Hence, mutable objects returned by Hadoop InputFormats are cloned.
*/
private <T> T cloneIfPossiblyMutable(T input, Coder<T> coder) throws CoderException,
ClassCastException {
// If the input object is not of known immutable type, clone the object.
if (!isKnownImmutable(input)) {
input = CoderUtils.clone(coder, input);
}
return input;
}
/**
* Utility method to check if the passed object is of a known immutable type.
*/
private boolean isKnownImmutable(Object o) {
return immutableTypes.contains(o.getClass());
}
@Override
public void close() throws IOException {
LOG.info("Closing reader after reading {} records.", recordsReturned);
if (recordReader != null) {
recordReader.close();
recordReader = null;
}
}
@Override
public Double getFractionConsumed() {
if (doneReading) {
return 1.0;
} else if (recordReader == null || recordsReturned.get() == 0L) {
return 0.0;
}
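// A RecordReader progress of 0.0 after records have already been returned means the progress
// is not yet known, so return null to report the fraction consumed as unknown.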
if (progressValue.get() == 0.0) {
return null;
}
return progressValue.doubleValue();
}
/**
* Returns RecordReader's progress.
* @throws IOException
* @throws InterruptedException
*/
private Double getProgress() throws IOException, InterruptedException {
try {
float progress = recordReader.getProgress();
return (progress < 0 || progress > 1) ? 0.0 : (double) progress;
} catch (IOException e) {
LOG.error(
    "Error in computing the fractions consumed as RecordReader.getProgress() throws an "
        + "exception", e);
throw new IOException(
"Error in computing the fractions consumed as RecordReader.getProgress() throws an "
+ "exception : " + e.getMessage(), e);
}
}
@Override
public final long getSplitPointsRemaining() {
if (doneReading) {
return 0;
}
// This source does not currently support dynamic work rebalancing, so remaining parallelism
// is always 1.
return 1;
}
}
}
/**
* A wrapper to allow Hadoop {@link org.apache.hadoop.mapreduce.InputSplit} to be serialized using
* Java's standard serialization mechanisms.
*/
public static class SerializableSplit implements Serializable {
InputSplit inputSplit;
public SerializableSplit() {}
public SerializableSplit(InputSplit split) {
checkArgument(split instanceof Writable,
String.format("Split is not of type Writable: %s", split));
this.inputSplit = split;
}
public InputSplit getSplit() {
return inputSplit;
}
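// Java serialization is delegated to Hadoop's ObjectWritable below, which is why the wrapped
// split is required to be a Writable.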
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
ObjectWritable ow = new ObjectWritable();
ow.setConf(new Configuration(false));
ow.readFields(in);
this.inputSplit = (InputSplit) ow.get();
}
private void writeObject(ObjectOutputStream out) throws IOException {
new ObjectWritable(inputSplit).write(out);
}
}
/**
* A wrapper to allow Hadoop {@link org.apache.hadoop.conf.Configuration} to be serialized using
* Java's standard serialization mechanisms. Note that the org.apache.hadoop.conf.Configuration
* is Writable.
*/
public static class SerializableConfiguration implements Externalizable {
private Configuration conf;
public SerializableConfiguration() {}
public SerializableConfiguration(Configuration conf) {
this.conf = conf;
}
public Configuration getHadoopConfiguration() {
return conf;
}
@Override
public void writeExternal(ObjectOutput out) throws IOException {
out.writeUTF(conf.getClass().getCanonicalName());
((Writable) conf).write(out);
}
@Override
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
String className = in.readUTF();
try {
conf = (Configuration) Class.forName(className).newInstance();
conf.readFields(in);
} catch (InstantiationException | IllegalAccessException e) {
throw new IOException("Unable to create configuration: " + e);
}
}
}
}