/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.beam.sdk.io.hadoop.inputformat;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.values.KV;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * A valid InputFormat for reading employee data, available as a {@code List<KV>} in
 * {@linkplain EmployeeRecordReader#employeeDataList employeeDataList}, which is populated using
 * {@linkplain TestEmployeeDataSet#populateEmployeeData()}. {@linkplain EmployeeInputFormat} is
 * used to test that the {@linkplain HadoopInputFormatIO} source returns immutable records in the
 * scenario where the RecordReader creates new key and value objects every time it reads data.
 */
public class EmployeeInputFormat extends InputFormat<Text, Employee> {

  public EmployeeInputFormat() {}

  @Override
  public RecordReader<Text, Employee> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException, InterruptedException {
    return new EmployeeRecordReader();
  }

  @Override
  public List<InputSplit> getSplits(JobContext arg0) throws IOException, InterruptedException {
    List<InputSplit> inputSplitList = new ArrayList<>();
    for (int i = 1; i <= TestEmployeeDataSet.NUMBER_OF_SPLITS; i++) {
      InputSplit inputSplitObj = new NewObjectsEmployeeInputSplit(
          (i - 1) * TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT,
          i * TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT - 1);
      inputSplitList.add(inputSplitObj);
    }
    return inputSplitList;
  }
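  /*
   * Illustration of the split arithmetic in getSplits() above, with assumed example values (the
   * actual constants live in TestEmployeeDataSet): if NUMBER_OF_SPLITS = 3 and
   * NUMBER_OF_RECORDS_IN_EACH_SPLIT = 5, the loop produces splits covering the contiguous,
   * non-overlapping index ranges [0, 4], [5, 9] and [10, 14], so every record index belongs to
   * exactly one split.
   */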
  /**
   * InputSplit implementation for EmployeeInputFormat.
   */
  public static class NewObjectsEmployeeInputSplit extends InputSplit implements Writable {
    // Start and end map index of each split of employeeData.
    private long startIndex;
    private long endIndex;

    public NewObjectsEmployeeInputSplit() {}

    public NewObjectsEmployeeInputSplit(long startIndex, long endIndex) {
      this.startIndex = startIndex;
      this.endIndex = endIndex;
    }

    /**
     * Returns the number of records in this split.
     */
    @Override
    public long getLength() throws IOException, InterruptedException {
      return this.endIndex - this.startIndex + 1;
    }

    @Override
    public String[] getLocations() throws IOException, InterruptedException {
      // The data is in memory, so there is no locality preference. An empty array is returned
      // rather than null, which callers of InputSplit#getLocations() may not handle.
      return new String[0];
    }

    public long getStartIndex() {
      return startIndex;
    }

    public long getEndIndex() {
      return endIndex;
    }

    @Override
    public void readFields(DataInput dataIn) throws IOException {
      startIndex = dataIn.readLong();
      endIndex = dataIn.readLong();
    }

    @Override
    public void write(DataOutput dataOut) throws IOException {
      dataOut.writeLong(startIndex);
      dataOut.writeLong(endIndex);
    }
  }

  /**
   * RecordReader for EmployeeInputFormat.
   */
  public class EmployeeRecordReader extends RecordReader<Text, Employee> {

    private NewObjectsEmployeeInputSplit split;
    private Text currentKey;
    private Employee currentValue;
    private long employeeListIndex = 0L;
    private long recordsRead = 0L;
    private List<KV<String, String>> employeeDataList;

    public EmployeeRecordReader() {}

    @Override
    public void close() throws IOException {}

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
      return currentKey;
    }

    @Override
    public Employee getCurrentValue() throws IOException, InterruptedException {
      return currentValue;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
      return (float) recordsRead / split.getLength();
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext arg1)
        throws IOException, InterruptedException {
      this.split = (NewObjectsEmployeeInputSplit) split;
      employeeListIndex = this.split.getStartIndex() - 1;
      recordsRead = 0;
      employeeDataList = TestEmployeeDataSet.populateEmployeeData();
      currentValue = new Employee(null, null);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
      if ((recordsRead++) >= split.getLength()) {
        return false;
      }
      employeeListIndex++;
      KV<String, String> employeeDetails = employeeDataList.get((int) employeeListIndex);
      String[] empData = employeeDetails.getValue().split("_");
      /*
       * New objects must be returned every time for key and value in order to test the scenario
       * discussed in the class javadoc.
       */
      currentKey = new Text(employeeDetails.getKey());
      currentValue = new Employee(empData[0], empData[1]);
      return true;
    }
  }
}
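/*
 * Usage sketch: how a test might wire this InputFormat into a HadoopInputFormatIO read. The
 * Pipeline variable "p" is assumed, and the configuration keys follow HadoopInputFormatIO's
 * documented contract; this is illustrative only, not a definitive test setup.
 *
 *   Configuration conf = new Configuration(false);
 *   conf.setClass("mapreduce.job.inputformat.class", EmployeeInputFormat.class, InputFormat.class);
 *   conf.setClass("key.class", Text.class, Object.class);
 *   conf.setClass("value.class", Employee.class, Object.class);
 *   PCollection<KV<Text, Employee>> employees =
 *       p.apply(HadoopInputFormatIO.<Text, Employee>read().withConfiguration(conf));
 */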