/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.beam.sdk.io.hadoop.inputformat;

import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.BoundedSource.BoundedReader;
import org.apache.beam.sdk.io.hadoop.WritableCoder;
import org.apache.beam.sdk.io.hadoop.inputformat.EmployeeInputFormat.EmployeeRecordReader;
import org.apache.beam.sdk.io.hadoop.inputformat.EmployeeInputFormat.NewObjectsEmployeeInputSplit;
import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.HadoopInputFormatBoundedSource;
import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.SerializableConfiguration;
import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.SerializableSplit;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.SourceTestUtils;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.mockito.Mockito;

/**
 * Unit tests for {@link HadoopInputFormatIO}.
 */
@RunWith(JUnit4.class)
public class HadoopInputFormatIOTest {
  static SerializableConfiguration serConf;
  static SimpleFunction<Text, String> myKeyTranslate;
  static SimpleFunction<Employee, String> myValueTranslate;

  @Rule public final transient TestPipeline p = TestPipeline.create();
  @Rule public ExpectedException thrown = ExpectedException.none();

  private PBegin input = PBegin.in(p);

  @BeforeClass
  public static void setUp() throws IOException, InterruptedException {
    serConf = loadTestConfiguration(
        EmployeeInputFormat.class,
        Text.class,
        Employee.class);
    myKeyTranslate = new SimpleFunction<Text, String>() {
      @Override
      public String apply(Text input) {
        return input.toString();
      }
    };
    myValueTranslate = new SimpleFunction<Employee, String>() {
      @Override
      public String apply(Employee input) {
        return input.getEmpName() + "_" + input.getEmpAddress();
      }
    };
  }

  @Test
  public void testReadBuildsCorrectly() {
    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(myKeyTranslate)
        .withValueTranslation(myValueTranslate);
    assertEquals(serConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
    assertEquals(myValueTranslate, read.getValueTranslationFunction());
    assertEquals(myValueTranslate.getOutputTypeDescriptor(), read.getValueTypeDescriptor());
    assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
  }

  /**
   * This test validates that {@link HadoopInputFormatIO.Read Read} builds correctly when
   * configuration, key translation and value translation are set in a different order. It also
   * validates that the output PCollection key/value classes are set correctly even if the Hadoop
   * configuration is set after the key/value translation functions.
   */
  @Test
  public void testReadBuildsCorrectlyInDifferentOrder() {
    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
        .withValueTranslation(myValueTranslate)
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(myKeyTranslate);
    assertEquals(serConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
    assertEquals(myValueTranslate, read.getValueTranslationFunction());
    assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
    assertEquals(myValueTranslate.getOutputTypeDescriptor(), read.getValueTypeDescriptor());
  }

  /**
   * This test validates {@link HadoopInputFormatIO.Read Read} object creation if
   * {@link HadoopInputFormatIO.Read#withConfiguration() withConfiguration()} is called more than
   * once.
   * @throws InterruptedException
   * @throws IOException
   */
  @Test
  public void testReadBuildsCorrectlyIfWithConfigurationIsCalledMoreThanOneTime()
      throws IOException, InterruptedException {
    SerializableConfiguration diffConf = loadTestConfiguration(
        EmployeeInputFormat.class,
        Employee.class,
        Text.class);
    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(myKeyTranslate)
        .withConfiguration(diffConf.getHadoopConfiguration());
    assertEquals(diffConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
    assertEquals(null, read.getValueTranslationFunction());
    assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
    assertEquals(diffConf.getHadoopConfiguration().getClass("value.class", Object.class),
        read.getValueTypeDescriptor().getRawType());
  }

  /**
   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation fails with
   * null configuration. {@link HadoopInputFormatIO.Read#withConfiguration() withConfiguration()}
   * checks whether the configuration is null and throws an exception if it is.
   */
  @Test
  public void testReadObjectCreationFailsIfConfigurationIsNull() {
    thrown.expect(NullPointerException.class);
    HadoopInputFormatIO.<Text, Employee>read()
        .withConfiguration(null);
  }

  /**
   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with only
   * configuration.
   */
  @Test
  public void testReadObjectCreationWithConfiguration() {
    HadoopInputFormatIO.Read<Text, Employee> read = HadoopInputFormatIO.<Text, Employee>read()
        .withConfiguration(serConf.getHadoopConfiguration());
    assertEquals(serConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(null, read.getKeyTranslationFunction());
    assertEquals(null, read.getValueTranslationFunction());
    assertEquals(serConf.getHadoopConfiguration().getClass("key.class", Object.class),
        read.getKeyTypeDescriptor().getRawType());
    assertEquals(serConf.getHadoopConfiguration().getClass("value.class", Object.class),
        read.getValueTypeDescriptor().getRawType());
  }

  /**
   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation fails with
   * configuration and null key translation. {@link HadoopInputFormatIO.Read#withKeyTranslation()
   * withKeyTranslation()} checks whether keyTranslation is null and throws an exception if a null
   * value is passed.
   */
  @Test
  public void testReadObjectCreationFailsIfKeyTranslationFunctionIsNull() {
    thrown.expect(NullPointerException.class);
    HadoopInputFormatIO.<String, Employee>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(null);
  }

  /**
   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with
   * configuration and key translation.
   */
  @Test
  public void testReadObjectCreationWithConfigurationKeyTranslation() {
    HadoopInputFormatIO.Read<String, Employee> read = HadoopInputFormatIO.<String, Employee>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(myKeyTranslate);
    assertEquals(serConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
    assertEquals(null, read.getValueTranslationFunction());
    assertEquals(myKeyTranslate.getOutputTypeDescriptor().getRawType(),
        read.getKeyTypeDescriptor().getRawType());
    assertEquals(serConf.getHadoopConfiguration().getClass("value.class", Object.class),
        read.getValueTypeDescriptor().getRawType());
  }

  /**
   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation fails with
   * configuration and null value translation.
   * {@link HadoopInputFormatIO.Read#withValueTranslation() withValueTranslation()} checks
   * valueTranslation is null and throws exception if null value is passed.
   */
  @Test
  public void testReadObjectCreationFailsIfValueTranslationFunctionIsNull() {
    thrown.expect(NullPointerException.class);
    HadoopInputFormatIO.<Text, String>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withValueTranslation(null);
  }

  /**
   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with
   * configuration and value translation.
   */
  @Test
  public void testReadObjectCreationWithConfigurationValueTranslation() {
    HadoopInputFormatIO.Read<Text, String> read = HadoopInputFormatIO.<Text, String>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withValueTranslation(myValueTranslate);
    assertEquals(serConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(null, read.getKeyTranslationFunction());
    assertEquals(myValueTranslate, read.getValueTranslationFunction());
    assertEquals(serConf.getHadoopConfiguration().getClass("key.class", Object.class),
        read.getKeyTypeDescriptor().getRawType());
    assertEquals(myValueTranslate.getOutputTypeDescriptor().getRawType(),
        read.getValueTypeDescriptor().getRawType());
  }

  /**
   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with
   * configuration, key translation and value translation.
   */
  @Test
  public void testReadObjectCreationWithConfigurationKeyTranslationValueTranslation() {
    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(myKeyTranslate)
        .withValueTranslation(myValueTranslate);
    assertEquals(serConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
    assertEquals(myValueTranslate, read.getValueTranslationFunction());
    assertEquals(myKeyTranslate.getOutputTypeDescriptor().getRawType(),
        read.getKeyTypeDescriptor().getRawType());
    assertEquals(myValueTranslate.getOutputTypeDescriptor().getRawType(),
        read.getValueTypeDescriptor().getRawType());
  }

  /**
   * This test validates functionality of {@link HadoopInputFormatIO.Read#validateTransform()
   * Read.validateTransform()} function when Read transform is created without calling
   * {@link HadoopInputFormatIO.Read#withConfiguration() withConfiguration()}.
   */
  @Test
  public void testReadValidationFailsMissingConfiguration() {
    HadoopInputFormatIO.Read<String, String> read =
        HadoopInputFormatIO.<String, String>read();
    thrown.expect(NullPointerException.class);
    read.validateTransform();
  }

  /**
   * This test validates functionality of {@link HadoopInputFormatIO.Read#withConfiguration()
   * withConfiguration()} function when the Hadoop InputFormat class is not provided by the user in
   * the configuration.
   */
  @Test
  public void testReadValidationFailsMissingInputFormatInConf() {
    Configuration configuration = new Configuration();
    configuration.setClass("key.class", Text.class, Object.class);
    configuration.setClass("value.class", Employee.class, Object.class);
    thrown.expect(NullPointerException.class);
    HadoopInputFormatIO.<Text, Employee>read()
        .withConfiguration(configuration);
  }

  /**
   * This test validates functionality of {@link HadoopInputFormatIO.Read#withConfiguration()
   * withConfiguration()} function when the key class is not provided by the user in the
   * configuration.
   */
  @Test
  public void testReadValidationFailsMissingKeyClassInConf() {
    Configuration configuration = new Configuration();
    configuration.setClass("mapreduce.job.inputformat.class", EmployeeInputFormat.class,
        InputFormat.class);
    configuration.setClass("value.class", Employee.class, Object.class);
    thrown.expect(NullPointerException.class);
    HadoopInputFormatIO.<Text, Employee>read()
        .withConfiguration(configuration);
  }

  /**
   * This test validates functionality of {@link HadoopInputFormatIO.Read#withConfiguration()
   * withConfiguration()} function when the value class is not provided by the user in the
   * configuration.
   */
  @Test
  public void testReadValidationFailsMissingValueClassInConf() {
    Configuration configuration = new Configuration();
    configuration.setClass("mapreduce.job.inputformat.class", EmployeeInputFormat.class,
        InputFormat.class);
    configuration.setClass("key.class", Text.class, Object.class);
    thrown.expect(NullPointerException.class);
    HadoopInputFormatIO.<Text, Employee>read().withConfiguration(configuration);
  }

  /**
   * This test validates functionality of {@link HadoopInputFormatIO.Read#validateTransform()
   * Read.validateTransform()} function when myKeyTranslate's (simple function provided by user for
   * key translation) input type is not the same as the Hadoop InputFormat's keyClass (which is the
   * property set in the configuration as "key.class").
   */
  @Test
  public void testReadValidationFailsWithWrongInputTypeKeyTranslationFunction() {
    SimpleFunction<LongWritable, String> myKeyTranslateWithWrongInputType =
        new SimpleFunction<LongWritable, String>() {
          @Override
          public String apply(LongWritable input) {
            return input.toString();
          }
        };
    HadoopInputFormatIO.Read<String, Employee> read = HadoopInputFormatIO.<String, Employee>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(myKeyTranslateWithWrongInputType);
    thrown.expect(IllegalArgumentException.class);
    thrown.expectMessage(String.format(
        "Key translation's input type is not same as hadoop InputFormat : %s key "
            + "class : %s",
        serConf.getHadoopConfiguration().getClass("mapreduce.job.inputformat.class",
            InputFormat.class),
        serConf.getHadoopConfiguration().getClass("key.class", Object.class)));
    read.validateTransform();
  }

  /**
   * This test validates functionality of {@link HadoopInputFormatIO.Read#validateTransform()
   * Read.validateTransform()} function when myValueTranslate's (simple function provided by user
   * for value translation) input type is not the same as the Hadoop InputFormat's valueClass
   * (which is the property set in the configuration as "value.class").
   */
  @Test
  public void testReadValidationFailsWithWrongInputTypeValueTranslationFunction() {
    SimpleFunction<LongWritable, String> myValueTranslateWithWrongInputType =
        new SimpleFunction<LongWritable, String>() {
          @Override
          public String apply(LongWritable input) {
            return input.toString();
          }
        };
    HadoopInputFormatIO.Read<Text, String> read = HadoopInputFormatIO.<Text, String>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withValueTranslation(myValueTranslateWithWrongInputType);
    String expectedMessage = String.format(
        "Value translation's input type is not same as hadoop InputFormat : "
            + "%s value class : %s",
        serConf.getHadoopConfiguration().getClass("mapreduce.job.inputformat.class",
            InputFormat.class),
        serConf.getHadoopConfiguration().getClass("value.class", Object.class));
    thrown.expect(IllegalArgumentException.class);
    thrown.expectMessage(expectedMessage);
    read.validateTransform();
  }

  @Test
  public void testReadingData() throws Exception {
    HadoopInputFormatIO.Read<Text, Employee> read = HadoopInputFormatIO.<Text, Employee>read()
        .withConfiguration(serConf.getHadoopConfiguration());
    List<KV<Text, Employee>> expected = TestEmployeeDataSet.getEmployeeData();
    PCollection<KV<Text, Employee>> actual = p.apply("ReadTest", read);
    PAssert.that(actual).containsInAnyOrder(expected);
    p.run();
  }

  /**
   * This test validates functionality of
   * {@link HadoopInputFormatIO.HadoopInputFormatBoundedSource#populateDisplayData()
   * populateDisplayData()}.
   */
  @Test
  public void testReadDisplayData() {
    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            new SerializableSplit());
    DisplayData displayData = DisplayData.from(boundedSource);
    assertThat(displayData,
        hasDisplayItem("mapreduce.job.inputformat.class",
            serConf.getHadoopConfiguration().get("mapreduce.job.inputformat.class")));
    assertThat(displayData,
        hasDisplayItem("key.class", serConf.getHadoopConfiguration().get("key.class")));
    assertThat(displayData,
        hasDisplayItem("value.class", serConf.getHadoopConfiguration().get("value.class")));
  }

  /**
   * This test validates behavior of {@link HadoopInputFormatBoundedSource} if RecordReader object
   * creation fails.
   */
  @Test
  public void testReadIfCreateRecordReaderFails() throws Exception {
    thrown.expect(Exception.class);
    thrown.expectMessage("Exception in creating RecordReader");
    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    Mockito.when(
        mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
            Mockito.any(TaskAttemptContext.class)))
        .thenThrow(new IOException("Exception in creating RecordReader"));
    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            new SerializableSplit());
    boundedSource.setInputFormatObj(mockInputFormat);
    SourceTestUtils.readFromSource(boundedSource, p.getOptions());
  }

  /**
   * This test validates behavior of HadoopInputFormatSource if
   * {@link InputFormat#createRecordReader() createRecordReader()} of InputFormat returns null.
   */
  @Test
  public void testReadWithNullCreateRecordReader() throws Exception {
    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    thrown.expect(IOException.class);
    thrown.expectMessage(String.format("Null RecordReader object returned by %s",
        mockInputFormat.getClass()));
    Mockito.when(
        mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
            Mockito.any(TaskAttemptContext.class))).thenReturn(null);
    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            new SerializableSplit());
    boundedSource.setInputFormatObj(mockInputFormat);
    SourceTestUtils.readFromSource(boundedSource, p.getOptions());
  }

  /**
   * This test validates behavior of the
   * {@link HadoopInputFormatBoundedSource.HadoopInputFormatReader#start() start()} method if the
   * InputFormat's {@link InputFormat#getSplits() getSplits()} returns an InputSplit list having
   * zero records.
   */
  @Test
  public void testReadersStartWhenZeroRecords() throws Exception {
    InputFormat mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    EmployeeRecordReader mockReader = Mockito.mock(EmployeeRecordReader.class);
    Mockito.when(
        mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
            Mockito.any(TaskAttemptContext.class))).thenReturn(mockReader);
    Mockito.when(mockReader.nextKeyValue()).thenReturn(false);
    InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            new SerializableSplit(mockInputSplit));
    boundedSource.setInputFormatObj(mockInputFormat);
    BoundedReader<KV<Text, Employee>> reader = boundedSource.createReader(p.getOptions());
    assertEquals(false, reader.start());
    assertEquals(Double.valueOf(1), reader.getFractionConsumed());
    reader.close();
  }

  /**
   * This test validates the method getFractionConsumed(), which indicates the progress of the read
   * in the range of 0 to 1.
   */
  @Test
  public void testReadersGetFractionConsumed() throws Exception {
    List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
    HadoopInputFormatBoundedSource<Text, Employee> hifSource = getTestHIFSource(
        EmployeeInputFormat.class,
        Text.class,
        Employee.class,
        WritableCoder.of(Text.class),
        AvroCoder.of(Employee.class));
    long estimatedSize = hifSource.getEstimatedSizeBytes(p.getOptions());
    // Validate if estimated size is equal to the size of records.
    assertEquals(referenceRecords.size(), estimatedSize);
    List<BoundedSource<KV<Text, Employee>>> boundedSourceList = hifSource.split(0, p.getOptions());
    // Validate if split() has split correctly.
    assertEquals(TestEmployeeDataSet.NUMBER_OF_SPLITS, boundedSourceList.size());
    List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
      List<KV<Text, Employee>> elements = new ArrayList<KV<Text, Employee>>();
      BoundedReader<KV<Text, Employee>> reader = source.createReader(p.getOptions());
      float recordsRead = 0;
      // When start is not called, getFractionConsumed() should return 0.
      assertEquals(Double.valueOf(0), reader.getFractionConsumed());
      boolean start = reader.start();
      assertEquals(true, start);
      if (start) {
        elements.add(reader.getCurrent());
        boolean advance = reader.advance();
        // Validate if getFractionConsumed() returns the correct fraction based on
        // the number of records read in the split.
        assertEquals(
            Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
            reader.getFractionConsumed());
        assertEquals(true, advance);
        while (advance) {
          elements.add(reader.getCurrent());
          advance = reader.advance();
          assertEquals(
              Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
              reader.getFractionConsumed());
        }
        bundleRecords.addAll(elements);
      }
      // Validate if getFractionConsumed() returns 1 after reading is complete.
      assertEquals(Double.valueOf(1), reader.getFractionConsumed());
      reader.close();
    }
    assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
  }

  /**
   * This test validates the method getFractionConsumed() when a bad progress value is returned by
   * the InputFormat.
   */
  @Test
  public void testGetFractionConsumedForBadProgressValue() throws Exception {
    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    EmployeeRecordReader mockReader = Mockito.mock(EmployeeRecordReader.class);
    Mockito.when(
        mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
            Mockito.any(TaskAttemptContext.class))).thenReturn(mockReader);
    Mockito.when(mockReader.nextKeyValue()).thenReturn(true);
    // Set to a bad value, not in the range of 0 to 1.
    Mockito.when(mockReader.getProgress()).thenReturn(2.0F);
    InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            new SerializableSplit(mockInputSplit));
    boundedSource.setInputFormatObj(mockInputFormat);
    BoundedReader<KV<Text, Employee>> reader = boundedSource.createReader(p.getOptions());
    assertEquals(Double.valueOf(0), reader.getFractionConsumed());
    boolean start = reader.start();
    assertEquals(true, start);
    if (start) {
      boolean advance = reader.advance();
      assertEquals(null, reader.getFractionConsumed());
      assertEquals(true, advance);
      if (advance) {
        advance = reader.advance();
        assertEquals(null, reader.getFractionConsumed());
      }
    }
    // Validate that getFractionConsumed() returns null after a few reads, as getProgress()
    // returns the invalid value '2', which is not in the range of 0 to 1.
    assertEquals(null, reader.getFractionConsumed());
    reader.close();
  }

  /**
   * This test validates that the reader and its parent source read the same records.
   */
  @Test
  public void testReaderAndParentSourceReadsSameData() throws Exception {
    InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            new SerializableSplit(mockInputSplit));
    BoundedReader<KV<Text, Employee>> reader = boundedSource.createReader(p.getOptions());
    SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource(reader, p.getOptions());
  }

  /**
   * This test verifies that the method
   * {@link HadoopInputFormatBoundedSource.HadoopInputFormatReader#getCurrentSource()
   * getCurrentSource()} returns the correct source object.
   */
  @Test
  public void testGetCurrentSourceFunction() throws Exception {
    SerializableSplit split = new SerializableSplit();
    BoundedSource<KV<Text, Employee>> source =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            split);
    BoundedReader<KV<Text, Employee>> hifReader = source.createReader(p.getOptions());
    BoundedSource<KV<Text, Employee>> hifSource = hifReader.getCurrentSource();
    assertEquals(hifSource, source);
  }

  /**
   * This test validates behavior of the
   * {@link HadoopInputFormatBoundedSource#createReader(PipelineOptions) createReader()} method
   * when {@link HadoopInputFormatBoundedSource#split(long, PipelineOptions)} is not called.
   */
  @Test
  public void testCreateReaderIfSplitNotCalled() throws Exception {
    HadoopInputFormatBoundedSource<Text, Employee> hifSource = getTestHIFSource(
        EmployeeInputFormat.class,
        Text.class,
        Employee.class,
        WritableCoder.of(Text.class),
        AvroCoder.of(Employee.class));
    thrown.expect(IOException.class);
    thrown.expectMessage("Cannot create reader as source is not split yet.");
    hifSource.createReader(p.getOptions());
  }

  /**
   * This test validates behavior of
   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when Hadoop
   * InputFormat's {@link InputFormat#getSplits(JobContext)} returns an empty list.
   */
  @Test
  public void testComputeSplitsIfGetSplitsReturnsEmptyList() throws Exception {
    InputFormat<?, ?> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(
        new ArrayList<InputSplit>());
    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            mockInputSplit);
    thrown.expect(IOException.class);
    thrown.expectMessage("Error in computing splits, getSplits() returns a empty list");
    hifSource.setInputFormatObj(mockInputFormat);
    hifSource.computeSplitsIfNecessary();
  }

  /**
   * This test validates behavior of
   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when Hadoop
   * InputFormat's {@link InputFormat#getSplits() getSplits()} returns a null value.
   */
  @Test
  public void testComputeSplitsIfGetSplitsReturnsNullValue() throws Exception {
    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(null);
    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            mockInputSplit);
    thrown.expect(IOException.class);
    thrown.expectMessage("Error in computing splits, getSplits() returns null.");
    hifSource.setInputFormatObj(mockInputFormat);
    hifSource.computeSplitsIfNecessary();
  }

  /**
   * This test validates behavior of
   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} if Hadoop
   * InputFormat's {@link InputFormat#getSplits() getSplits()} returns an InputSplit list having
   * some null values.
   */
  @Test
  public void testComputeSplitsIfGetSplitsReturnsListHavingNullValues() throws Exception {
    // InputSplit list having a null value.
    InputSplit mockInputSplit =
        Mockito.mock(InputSplit.class, Mockito.withSettings().extraInterfaces(Writable.class));
    List<InputSplit> inputSplitList = new ArrayList<InputSplit>();
    inputSplitList.add(mockInputSplit);
    inputSplitList.add(null);
    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(
        inputSplitList);
    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            new SerializableSplit());
    thrown.expect(IOException.class);
    thrown.expectMessage("Error in computing splits, split is null in InputSplits list populated "
        + "by getSplits() : ");
    hifSource.setInputFormatObj(mockInputFormat);
    hifSource.computeSplitsIfNecessary();
  }

  /**
   * This test validates that records emitted in the PCollection are immutable if the InputFormat's
   * RecordReader returns the same objects (i.e. the same locations in memory) but with updated
   * values for each record.
   */
  @Test
  public void testImmutablityOfOutputOfReadIfRecordReaderObjectsAreMutable() throws Exception {
    List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
        ReuseObjectsEmployeeInputFormat.class,
        Text.class,
        Employee.class,
        WritableCoder.of(Text.class),
        AvroCoder.of(Employee.class));
    List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
      List<KV<Text, Employee>> elems = SourceTestUtils.readFromSource(source, p.getOptions());
      bundleRecords.addAll(elems);
    }
    List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
    assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
  }

  /**
   * Test reading if InputFormat implements {@link org.apache.hadoop.conf.Configurable
   * Configurable}.
   */
  @Test
  public void testReadingWithConfigurableInputFormat() throws Exception {
    List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
        ConfigurableEmployeeInputFormat.class,
        Text.class,
        Employee.class,
        WritableCoder.of(Text.class),
        AvroCoder.of(Employee.class));
    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
      // Cast to HadoopInputFormatBoundedSource to access getInputFormat().
      @SuppressWarnings("unchecked")
      HadoopInputFormatBoundedSource<Text, Employee> hifSource =
          (HadoopInputFormatBoundedSource<Text, Employee>) source;
      hifSource.createInputFormatInstance();
      ConfigurableEmployeeInputFormat inputFormatObj =
          (ConfigurableEmployeeInputFormat) hifSource.getInputFormat();
      assertEquals(true, inputFormatObj.isConfSet);
    }
  }

  /**
   * This test validates that records emitted in the PCollection are immutable if the InputFormat's
   * {@link org.apache.hadoop.mapreduce.RecordReader RecordReader} returns different objects (i.e.
   * different locations in memory).
   */
  @Test
  public void testImmutablityOfOutputOfReadIfRecordReaderObjectsAreImmutable() throws Exception {
    List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
        EmployeeInputFormat.class,
        Text.class,
        Employee.class,
        WritableCoder.of(Text.class),
        AvroCoder.of(Employee.class));
    List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
      List<KV<Text, Employee>> elems = SourceTestUtils.readFromSource(source, p.getOptions());
      bundleRecords.addAll(elems);
    }
    List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
    assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
  }

  /**
   * Builds a test Hadoop {@link Configuration} with the given InputFormat, key and value classes
   * set, wrapped in a {@link SerializableConfiguration}.
   */
  private static SerializableConfiguration loadTestConfiguration(Class<?> inputFormatClassName,
      Class<?> keyClass, Class<?> valueClass) {
    Configuration conf = new Configuration();
    conf.setClass("mapreduce.job.inputformat.class", inputFormatClassName, InputFormat.class);
    conf.setClass("key.class", keyClass, Object.class);
    conf.setClass("value.class", valueClass, Object.class);
    return new SerializableConfiguration(conf);
  }

  /**
   * Creates an unsplit {@link HadoopInputFormatBoundedSource} for the given InputFormat, key/value
   * classes and coders, without key or value translation.
   */
  private <K, V> HadoopInputFormatBoundedSource<K, V> getTestHIFSource(
      Class<?> inputFormatClass,
      Class<K> inputFormatKeyClass,
      Class<V> inputFormatValueClass,
      Coder<K> keyCoder,
      Coder<V> valueCoder) {
    SerializableConfiguration serConf = loadTestConfiguration(
        inputFormatClass,
        inputFormatKeyClass,
        inputFormatValueClass);
    return new HadoopInputFormatBoundedSource<K, V>(
        serConf,
        keyCoder,
        valueCoder,
        null, // No key translation required.
        null); // No value translation required.
  }

  /**
   * Splits a test source built via {@link #getTestHIFSource} and returns the resulting list of
   * bounded sources.
   */
  private <K, V> List<BoundedSource<KV<K, V>>> getBoundedSourceList(
      Class<?> inputFormatClass,
      Class<K> inputFormatKeyClass,
      Class<V> inputFormatValueClass,
      Coder<K> keyCoder,
      Coder<V> valueCoder) throws Exception {
    HadoopInputFormatBoundedSource<K, V> boundedSource = getTestHIFSource(
        inputFormatClass,
        inputFormatKeyClass,
        inputFormatValueClass,
        keyCoder,
        valueCoder);
    return boundedSource.split(0, p.getOptions());
  }
}