/*
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.beam.sdk.io.hadoop.inputformat;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import com.google.auto.value.AutoValue;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.AtomicDouble;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import javax.annotation.Nullable;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.coders.CannotProvideCoderException;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.CoderRegistry;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.hadoop.WritableCoder;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptor;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A {@link HadoopInputFormatIO} is a Transform for reading data from any source that implements
 * Hadoop {@link InputFormat}, for example Cassandra, Elasticsearch, HBase, Redis, Postgres, etc.
 * {@link HadoopInputFormatIO} has to make several performance trade-offs in connecting to an
 * {@link InputFormat}, so if there is another Beam IO Transform specifically for connecting to
 * your data source of choice, we recommend using that one; this IO Transform, however, allows you
 * to connect to many data sources that do not yet have a Beam IO Transform.
*
* <p>You will need to pass a Hadoop {@link Configuration} with parameters specifying how the read
* will occur. Many properties of the Configuration are optional, and some are required for certain
* {@link InputFormat} classes, but the following properties must be set for all InputFormats:
* <ul>
* <li>{@code mapreduce.job.inputformat.class}: The {@link InputFormat} class used to connect to
* your data source of choice.</li>
* <li>{@code key.class}: The key class returned by the {@link InputFormat} in
* {@code mapreduce.job.inputformat.class}.</li>
* <li>{@code value.class}: The value class returned by the {@link InputFormat} in
* {@code mapreduce.job.inputformat.class}.</li>
* </ul>
* For example:
*
 * <pre>
 * {@code
 * Configuration myHadoopConfiguration = new Configuration(false);
 * // Set Hadoop InputFormat, key and value class in configuration
 * myHadoopConfiguration.setClass("mapreduce.job.inputformat.class",
 *     MyDbInputFormatClass, InputFormat.class);
 * myHadoopConfiguration.setClass("key.class", MyDbInputFormatKeyClass, Object.class);
 * myHadoopConfiguration.setClass("value.class",
 *     MyDbInputFormatValueClass, Object.class);
 * }
 * </pre>
*
 * <p>You will need to check if the key and value classes output by the {@link InputFormat} have a
 * Beam {@link Coder} available. If not, you can use {@code withKeyTranslation} or
 * {@code withValueTranslation} to specify a function that transforms instances of those classes
 * into another class that is supported by a Beam {@link Coder}. These settings are optional and
 * you don't need to specify a translation for both key and value. If you do specify a translation,
 * make sure the {@code K} or {@code V} of the read transform matches the output type of the
 * translation.
*
 * <p>You will need to set the correct InputFormat key and value classes (i.e. "key.class" and
 * "value.class") in the Hadoop {@link Configuration}. If the configured key or value class differs
 * from the InputFormat's actual key or value class, decoding of the key/value objects may fail
 * with errors such as "unexpected extra bytes after decoding". Hence, it is important to set the
 * correct InputFormat key and value classes.
*
* <h3>Reading using {@link HadoopInputFormatIO}</h3>
*
 * <pre>
 * {@code
 * Pipeline p = ...; // Create pipeline.
 * // Read data only with Hadoop configuration.
 * p.apply("read",
 *     HadoopInputFormatIO.<InputFormatKeyClass, InputFormatValueClass>read()
 *         .withConfiguration(myHadoopConfiguration));
 * }
 * </pre>
 *
 * <p>Read data with configuration and key translation (example scenario: a Beam Coder is not
 * available for the key class, hence key translation is required):
 *
 * <pre>
 * {@code
 * SimpleFunction<InputFormatKeyClass, MyKeyClass> myOutputKeyType =
 *     new SimpleFunction<InputFormatKeyClass, MyKeyClass>() {
 *       public MyKeyClass apply(InputFormatKeyClass input) {
 *         // ...logic to transform InputFormatKeyClass to MyKeyClass
 *       }
 *     };
 * }
 * </pre>
 *
 * <pre>
 * {@code
 * p.apply("read",
 *     HadoopInputFormatIO.<MyKeyClass, InputFormatValueClass>read()
 *         .withConfiguration(myHadoopConfiguration)
 *         .withKeyTranslation(myOutputKeyType));
 * }
 * </pre>
 *
 * <p>Read data with configuration and value translation (example scenario: a Beam Coder is not
 * available for the value class, hence value translation is required):
 *
 * <pre>
 * {@code
 * SimpleFunction<InputFormatValueClass, MyValueClass> myOutputValueType =
 *     new SimpleFunction<InputFormatValueClass, MyValueClass>() {
 *       public MyValueClass apply(InputFormatValueClass input) {
 *         // ...logic to transform InputFormatValueClass to MyValueClass
 *       }
 *     };
 * }
 * </pre>
 *
 * <pre>
 * {@code
 * p.apply("read",
 *     HadoopInputFormatIO.<InputFormatKeyClass, MyValueClass>read()
 *         .withConfiguration(myHadoopConfiguration)
 *         .withValueTranslation(myOutputValueType));
 * }
 * </pre>
*/
@Experimental
public class HadoopInputFormatIO {
private static final Logger LOG = LoggerFactory.getLogger(HadoopInputFormatIO.class);
/**
 * Creates an uninitialized {@link HadoopInputFormatIO.Read}. Before use, the {@code Read} must
 * be initialized with a {@link HadoopInputFormatIO.Read#withConfiguration(Configuration)} call
 * that specifies the source. A key/value translation may also optionally be specified using
 * {@link HadoopInputFormatIO.Read#withKeyTranslation}/
 * {@link HadoopInputFormatIO.Read#withValueTranslation}.
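 *
 * <p>For example, a minimal sketch ({@code MyKeyClass}, {@code MyValueClass} and
 * {@code myHadoopConfiguration} are illustrative placeholders, not part of this API):
 *
 * <pre>
 * {@code
 * HadoopInputFormatIO.Read<MyKeyClass, MyValueClass> read =
 *     HadoopInputFormatIO.<MyKeyClass, MyValueClass>read()
 *         .withConfiguration(myHadoopConfiguration);
 * }
 * </pre>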
*/
public static <K, V> Read<K, V> read() {
return new AutoValue_HadoopInputFormatIO_Read.Builder<K, V>().build();
}
/**
 * A {@link PTransform} that reads from any data source which implements Hadoop
 * {@link InputFormat}, for example Cassandra, Elasticsearch, HBase, Redis, Postgres, etc. See the
 * class-level Javadoc on {@link HadoopInputFormatIO} for more information.
* @param <K> Type of keys to be read.
* @param <V> Type of values to be read.
* @see HadoopInputFormatIO
*/
@AutoValue
public abstract static class Read<K, V> extends PTransform<PBegin, PCollection<KV<K, V>>> {
// Returns the Hadoop Configuration which contains specification of source.
@Nullable
public abstract SerializableConfiguration getConfiguration();
@Nullable public abstract SimpleFunction<?, K> getKeyTranslationFunction();
@Nullable public abstract SimpleFunction<?, V> getValueTranslationFunction();
@Nullable public abstract TypeDescriptor<K> getKeyTypeDescriptor();
@Nullable public abstract TypeDescriptor<V> getValueTypeDescriptor();
@Nullable public abstract TypeDescriptor<?> getinputFormatClass();
@Nullable public abstract TypeDescriptor<?> getinputFormatKeyClass();
@Nullable public abstract TypeDescriptor<?> getinputFormatValueClass();
abstract Builder<K, V> toBuilder();
@AutoValue.Builder
abstract static class Builder<K, V> {
abstract Builder<K, V> setConfiguration(SerializableConfiguration configuration);
abstract Builder<K, V> setKeyTranslationFunction(SimpleFunction<?, K> function);
abstract Builder<K, V> setValueTranslationFunction(SimpleFunction<?, V> function);
abstract Builder<K, V> setKeyTypeDescriptor(TypeDescriptor<K> keyTypeDescriptor);
abstract Builder<K, V> setValueTypeDescriptor(TypeDescriptor<V> valueTypeDescriptor);
abstract Builder<K, V> setInputFormatClass(TypeDescriptor<?> inputFormatClass);
abstract Builder<K, V> setInputFormatKeyClass(TypeDescriptor<?> inputFormatKeyClass);
abstract Builder<K, V> setInputFormatValueClass(TypeDescriptor<?> inputFormatValueClass);
abstract Read<K, V> build();
}
/**
* Returns a new {@link HadoopInputFormatIO.Read} that will read from the source using the
* options provided by the given configuration.
*
* <p>Does not modify this object.
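 *
 * <p>A minimal sketch of the required configuration (names such as {@code MyDbInputFormat},
 * {@code MyDbKey} and {@code MyDbValue} are illustrative placeholders):
 *
 * <pre>
 * {@code
 * Configuration conf = new Configuration(false);
 * conf.setClass("mapreduce.job.inputformat.class", MyDbInputFormat.class, InputFormat.class);
 * conf.setClass("key.class", MyDbKey.class, Object.class);
 * conf.setClass("value.class", MyDbValue.class, Object.class);
 * HadoopInputFormatIO.Read<MyDbKey, MyDbValue> read =
 *     HadoopInputFormatIO.<MyDbKey, MyDbValue>read().withConfiguration(conf);
 * }
 * </pre>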
*/
public Read<K, V> withConfiguration(Configuration configuration) {
validateConfiguration(configuration);
TypeDescriptor<?> inputFormatClass =
TypeDescriptor.of(configuration.getClass("mapreduce.job.inputformat.class", null));
TypeDescriptor<?> inputFormatKeyClass =
TypeDescriptor.of(configuration.getClass("key.class", null));
TypeDescriptor<?> inputFormatValueClass =
TypeDescriptor.of(configuration.getClass("value.class", null));
Builder<K, V> builder =
toBuilder().setConfiguration(new SerializableConfiguration(configuration));
builder.setInputFormatClass(inputFormatClass);
builder.setInputFormatKeyClass(inputFormatKeyClass);
builder.setInputFormatValueClass(inputFormatValueClass);
/*
* Sets the output key class to InputFormat key class if withKeyTranslation() is not called
* yet.
*/
if (getKeyTranslationFunction() == null) {
builder.setKeyTypeDescriptor((TypeDescriptor<K>) inputFormatKeyClass);
}
/*
* Sets the output value class to InputFormat value class if withValueTranslation() is not
* called yet.
*/
if (getValueTranslationFunction() == null) {
builder.setValueTypeDescriptor((TypeDescriptor<V>) inputFormatValueClass);
}
return builder.build();
}
/**
* Returns a new {@link HadoopInputFormatIO.Read} that will transform the keys read from the
* source using the given key translation function.
*
* <p>Does not modify this object.
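 *
 * <p>For example, a sketch that translates a Hadoop {@link org.apache.hadoop.io.Text} key into a
 * {@link String} (assuming the InputFormat in use emits {@code Text} keys):
 *
 * <pre>
 * {@code
 * read = read.withKeyTranslation(
 *     new SimpleFunction<Text, String>() {
 *       @Override
 *       public String apply(Text input) {
 *         return input.toString();
 *       }
 *     });
 * }
 * </pre>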
*/
public Read<K, V> withKeyTranslation(SimpleFunction<?, K> function) {
checkNotNull(function, "function");
// Sets key class to key translation function's output class type.
return toBuilder().setKeyTranslationFunction(function)
.setKeyTypeDescriptor((TypeDescriptor<K>) function.getOutputTypeDescriptor()).build();
}
/**
* Returns a new {@link HadoopInputFormatIO.Read} that will transform the values read from the
* source using the given value translation function.
*
* <p>Does not modify this object.
*/
public Read<K, V> withValueTranslation(SimpleFunction<?, V> function) {
checkNotNull(function, "function");
// Sets value class to value translation function's output class type.
return toBuilder().setValueTranslationFunction(function)
.setValueTypeDescriptor((TypeDescriptor<V>) function.getOutputTypeDescriptor()).build();
}
@Override
public PCollection<KV<K, V>> expand(PBegin input) {
validateTransform();
// Get the key and value coders based on the key and value classes.
CoderRegistry coderRegistry = input.getPipeline().getCoderRegistry();
Coder<K> keyCoder = getDefaultCoder(getKeyTypeDescriptor(), coderRegistry);
Coder<V> valueCoder = getDefaultCoder(getValueTypeDescriptor(), coderRegistry);
HadoopInputFormatBoundedSource<K, V> source = new HadoopInputFormatBoundedSource<K, V>(
getConfiguration(),
keyCoder,
valueCoder,
getKeyTranslationFunction(),
getValueTranslationFunction());
return input.getPipeline().apply(org.apache.beam.sdk.io.Read.from(source));
}
/**
* Validates that the mandatory configuration properties such as InputFormat class, InputFormat
* key and value classes are provided in the Hadoop configuration.
*/
private void validateConfiguration(Configuration configuration) {
checkNotNull(configuration, "configuration");
checkNotNull(configuration.get("mapreduce.job.inputformat.class"),
"configuration.get(\"mapreduce.job.inputformat.class\")");
checkNotNull(configuration.get("key.class"), "configuration.get(\"key.class\")");
checkNotNull(configuration.get("value.class"),
"configuration.get(\"value.class\")");
}
/**
* Validates construction of this transform.
*/
@VisibleForTesting
void validateTransform() {
checkNotNull(getConfiguration(), "getConfiguration()");
// Validate that the key translation function's input type matches the InputFormat's key class.
validateTranslationFunction(getinputFormatKeyClass(), getKeyTranslationFunction(),
"Key translation's input type is not same as hadoop InputFormat : %s key class : %s");
// Validate that the value translation function's input type matches the InputFormat's value class.
validateTranslationFunction(getinputFormatValueClass(), getValueTranslationFunction(),
"Value translation's input type is not same as hadoop InputFormat : "
+ "%s value class : %s");
}
/**
* Validates translation function given for key/value translation.
*/
private void validateTranslationFunction(TypeDescriptor<?> inputType,
SimpleFunction<?, ?> simpleFunction, String errorMsg) {
if (simpleFunction != null) {
if (!simpleFunction.getInputTypeDescriptor().equals(inputType)) {
throw new IllegalArgumentException(
String.format(errorMsg, getinputFormatClass().getRawType(), inputType.getRawType()));
}
}
}
/**
 * Returns the default coder for a given type descriptor. The coder registry is queried first; if
 * no coder is registered there and the type descriptor is of type {@link Writable}, a
 * {@link WritableCoder} is returned; otherwise an exception ("Cannot find coder") is thrown.
*/
public <T> Coder<T> getDefaultCoder(TypeDescriptor<?> typeDesc, CoderRegistry coderRegistry) {
Class classType = typeDesc.getRawType();
try {
return (Coder<T>) coderRegistry.getCoder(typeDesc);
} catch (CannotProvideCoderException e) {
if (Writable.class.isAssignableFrom(classType)) {
return (Coder<T>) WritableCoder.of(classType);
}
throw new IllegalStateException(String.format("Cannot find coder for %s : ", typeDesc)
+ e.getMessage(), e);
}
}
}
/**
* Bounded source implementation for {@link HadoopInputFormatIO}.
* @param <K> Type of keys to be read.
* @param <V> Type of values to be read.
*/
public static class HadoopInputFormatBoundedSource<K, V> extends BoundedSource<KV<K, V>>
implements Serializable {
private final SerializableConfiguration conf;
private final Coder<K> keyCoder;
private final Coder<V> valueCoder;
@Nullable private final SimpleFunction<?, K> keyTranslationFunction;
@Nullable private final SimpleFunction<?, V> valueTranslationFunction;
private final SerializableSplit inputSplit;
private transient List<SerializableSplit> inputSplits;
private long boundedSourceEstimatedSize = 0;
private transient InputFormat<?, ?> inputFormatObj;
private transient TaskAttemptContext taskAttemptContext;
private static final Set<Class<?>> immutableTypes = new HashSet<Class<?>>(
Arrays.asList(
String.class,
Byte.class,
Short.class,
Integer.class,
Long.class,
Float.class,
Double.class,
Boolean.class,
BigInteger.class,
BigDecimal.class));
HadoopInputFormatBoundedSource(
SerializableConfiguration conf,
Coder<K> keyCoder,
Coder<V> valueCoder,
@Nullable SimpleFunction<?, K> keyTranslationFunction,
@Nullable SimpleFunction<?, V> valueTranslationFunction) {
this(conf,
keyCoder,
valueCoder,
keyTranslationFunction,
valueTranslationFunction,
null);
}
protected HadoopInputFormatBoundedSource(
SerializableConfiguration conf,
Coder<K> keyCoder,
Coder<V> valueCoder,
@Nullable SimpleFunction<?, K> keyTranslationFunction,
@Nullable SimpleFunction<?, V> valueTranslationFunction,
SerializableSplit inputSplit) {
this.conf = conf;
this.inputSplit = inputSplit;
this.keyCoder = keyCoder;
this.valueCoder = valueCoder;
this.keyTranslationFunction = keyTranslationFunction;
this.valueTranslationFunction = valueTranslationFunction;
}
public SerializableConfiguration getConfiguration() {
return conf;
}
@Override
public void validate() {
checkNotNull(conf, "conf");
checkNotNull(keyCoder, "keyCoder");
checkNotNull(valueCoder, "valueCoder");
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
Configuration hadoopConfig = getConfiguration().getHadoopConfiguration();
if (hadoopConfig != null) {
builder.addIfNotNull(DisplayData.item("mapreduce.job.inputformat.class",
hadoopConfig.get("mapreduce.job.inputformat.class"))
.withLabel("InputFormat Class"));
builder.addIfNotNull(DisplayData.item("key.class",
hadoopConfig.get("key.class"))
.withLabel("Key Class"));
builder.addIfNotNull(DisplayData.item("value.class",
hadoopConfig.get("value.class"))
.withLabel("Value Class"));
}
}
@Override
public List<BoundedSource<KV<K, V>>> split(long desiredBundleSizeBytes,
PipelineOptions options) throws Exception {
// desiredBundleSizeBytes is ignored because the InputFormat's getSplits() method does not
// support splitting based on a desired bundle size.
if (inputSplit != null) {
LOG.info("Not splitting source {} because source is already split.", this);
return ImmutableList.of((BoundedSource<KV<K, V>>) this);
}
computeSplitsIfNecessary();
LOG.info("Generated {} splits. Size of first split is {} ", inputSplits.size(), inputSplits
.get(0).getSplit().getLength());
return Lists.transform(inputSplits,
new Function<SerializableSplit, BoundedSource<KV<K, V>>>() {
@Override
public BoundedSource<KV<K, V>> apply(SerializableSplit serializableInputSplit) {
HadoopInputFormatBoundedSource<K, V> hifBoundedSource =
new HadoopInputFormatBoundedSource<K, V>(conf, keyCoder, valueCoder,
keyTranslationFunction, valueTranslationFunction, serializableInputSplit);
return hifBoundedSource;
}
});
}
@Override
public long getEstimatedSizeBytes(PipelineOptions po) throws Exception {
if (inputSplit == null) {
// If there are no splits computed yet, then retrieve the splits.
computeSplitsIfNecessary();
return boundedSourceEstimatedSize;
}
return inputSplit.getSplit().getLength();
}
/**
 * This is a helper function to compute splits. It also calculates the size of the data being
 * read. Note: This method is executed exactly once; the splits are retrieved and cached in this
 * source, and are then used by split() and getEstimatedSizeBytes().
*/
@VisibleForTesting
void computeSplitsIfNecessary() throws IOException, InterruptedException {
if (inputSplits != null) {
return;
}
createInputFormatInstance();
List<InputSplit> splits =
inputFormatObj.getSplits(Job.getInstance(conf.getHadoopConfiguration()));
if (splits == null) {
throw new IOException("Error in computing splits, getSplits() returns null.");
}
if (splits.isEmpty()) {
throw new IOException("Error in computing splits, getSplits() returns a empty list");
}
boundedSourceEstimatedSize = 0;
inputSplits = new ArrayList<SerializableSplit>();
for (InputSplit inputSplit : splits) {
if (inputSplit == null) {
throw new IOException("Error in computing splits, split is null in InputSplits list "
+ "populated by getSplits() : ");
}
boundedSourceEstimatedSize += inputSplit.getLength();
inputSplits.add(new SerializableSplit(inputSplit));
}
}
/**
 * Creates an instance of the {@link InputFormat} class. The InputFormat class name is specified
 * in the Hadoop configuration.
*/
protected void createInputFormatInstance() throws IOException {
if (inputFormatObj == null) {
try {
taskAttemptContext =
new TaskAttemptContextImpl(conf.getHadoopConfiguration(), new TaskAttemptID());
inputFormatObj =
(InputFormat<?, ?>) conf
.getHadoopConfiguration()
.getClassByName(
conf.getHadoopConfiguration().get("mapreduce.job.inputformat.class"))
.newInstance();
/*
* If InputFormat explicitly implements interface {@link Configurable}, then setConf()
* method of {@link Configurable} needs to be explicitly called to set all the
* configuration parameters. For example: InputFormat classes which implement Configurable
* are {@link org.apache.hadoop.mapreduce.lib.db.DBInputFormat DBInputFormat}, {@link
* org.apache.hadoop.hbase.mapreduce.TableInputFormat TableInputFormat}, etc.
*/
if (Configurable.class.isAssignableFrom(inputFormatObj.getClass())) {
((Configurable) inputFormatObj).setConf(conf.getHadoopConfiguration());
}
} catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
throw new IOException("Unable to create InputFormat object: ", e);
}
}
}
@VisibleForTesting
InputFormat<?, ?> getInputFormat() {
return inputFormatObj;
}
@VisibleForTesting
void setInputFormatObj(InputFormat<?, ?> inputFormatObj) {
this.inputFormatObj = inputFormatObj;
}
@Override
public Coder<KV<K, V>> getDefaultOutputCoder() {
return KvCoder.of(keyCoder, valueCoder);
}
@Override
public BoundedReader<KV<K, V>> createReader(PipelineOptions options) throws IOException {
this.validate();
if (inputSplit == null) {
throw new IOException("Cannot create reader as source is not split yet.");
} else {
createInputFormatInstance();
return new HadoopInputFormatReader<>(
this,
keyTranslationFunction,
valueTranslationFunction,
inputSplit,
inputFormatObj,
taskAttemptContext);
}
}
/**
* BoundedReader for Hadoop InputFormat source.
*
 * @param <T1> Type of keys RecordReader emits.
 * @param <T2> Type of values RecordReader emits.
*/
class HadoopInputFormatReader<T1, T2> extends BoundedSource.BoundedReader<KV<K, V>> {
private final HadoopInputFormatBoundedSource<K, V> source;
@Nullable private final SimpleFunction<T1, K> keyTranslationFunction;
@Nullable private final SimpleFunction<T2, V> valueTranslationFunction;
private final SerializableSplit split;
private RecordReader<T1, T2> recordReader;
private volatile boolean doneReading = false;
private AtomicLong recordsReturned = new AtomicLong();
// Tracks the progress of the RecordReader.
private AtomicDouble progressValue = new AtomicDouble();
private transient InputFormat<T1, T2> inputFormatObj;
private transient TaskAttemptContext taskAttemptContext;
private HadoopInputFormatReader(HadoopInputFormatBoundedSource<K, V> source,
@Nullable SimpleFunction keyTranslationFunction,
@Nullable SimpleFunction valueTranslationFunction,
SerializableSplit split,
InputFormat inputFormatObj,
TaskAttemptContext taskAttemptContext) {
this.source = source;
this.keyTranslationFunction = keyTranslationFunction;
this.valueTranslationFunction = valueTranslationFunction;
this.split = split;
this.inputFormatObj = inputFormatObj;
this.taskAttemptContext = taskAttemptContext;
}
@Override
public HadoopInputFormatBoundedSource<K, V> getCurrentSource() {
return source;
}
@Override
public boolean start() throws IOException {
try {
recordsReturned.set(0L);
recordReader =
(RecordReader<T1, T2>) inputFormatObj.createRecordReader(split.getSplit(),
taskAttemptContext);
if (recordReader != null) {
recordReader.initialize(split.getSplit(), taskAttemptContext);
progressValue.set(getProgress());
if (recordReader.nextKeyValue()) {
recordsReturned.incrementAndGet();
doneReading = false;
return true;
}
} else {
throw new IOException(String.format("Null RecordReader object returned by %s",
inputFormatObj.getClass()));
}
recordReader = null;
} catch (InterruptedException e) {
throw new IOException(
"Could not read because the thread got interrupted while "
+ "reading the records with an exception: ",
e);
}
doneReading = true;
return false;
}
@Override
public boolean advance() throws IOException {
try {
progressValue.set(getProgress());
if (recordReader.nextKeyValue()) {
recordsReturned.incrementAndGet();
return true;
}
doneReading = true;
} catch (InterruptedException e) {
throw new IOException("Unable to read data: ", e);
}
return false;
}
@Override
public KV<K, V> getCurrent() {
K key = null;
V value = null;
try {
// Transform key if translation function is provided.
key =
transformKeyOrValue((T1) recordReader.getCurrentKey(), keyTranslationFunction,
keyCoder);
// Transform value if translation function is provided.
value =
transformKeyOrValue((T2) recordReader.getCurrentValue(), valueTranslationFunction,
valueCoder);
} catch (IOException | InterruptedException e) {
LOG.error("Unable to read data: " + "{}", e);
throw new IllegalStateException("Unable to read data: " + "{}", e);
}
return KV.of(key, value);
}
/**
 * Returns the transformed key or value object, cloned if it is potentially mutable.
* @throws ClassCastException
* @throws CoderException
*/
private <T, T3> T3 transformKeyOrValue(T input,
@Nullable SimpleFunction<T, T3> simpleFunction, Coder<T3> coder) throws CoderException,
ClassCastException {
T3 output;
if (null != simpleFunction) {
output = simpleFunction.apply(input);
} else {
output = (T3) input;
}
return cloneIfPossiblyMutable((T3) output, coder);
}
/**
* Beam expects immutable objects, but the Hadoop InputFormats tend to re-use the same object
* when returning them. Hence, mutable objects returned by Hadoop InputFormats are cloned.
*/
private <T> T cloneIfPossiblyMutable(T input, Coder<T> coder) throws CoderException,
ClassCastException {
// If the input object is not of known immutable type, clone the object.
if (!isKnownImmutable(input)) {
input = CoderUtils.clone(coder, input);
}
return input;
}
/**
* Utility method to check if the passed object is of a known immutable type.
*/
private boolean isKnownImmutable(Object o) {
return immutableTypes.contains(o.getClass());
}
@Override
public void close() throws IOException {
LOG.info("Closing reader after reading {} records.", recordsReturned);
if (recordReader != null) {
recordReader.close();
recordReader = null;
}
}
@Override
public Double getFractionConsumed() {
if (doneReading) {
return 1.0;
} else if (recordReader == null || recordsReturned.get() == 0L) {
return 0.0;
}
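// A RecordReader progress of 0.0 after records have already been returned means the progress
// is not yet known, so return null to report the fraction consumed as unknown.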
if (progressValue.get() == 0.0) {
return null;
}
return progressValue.doubleValue();
}
/**
* Returns RecordReader's progress.
* @throws IOException
* @throws InterruptedException
*/
private Double getProgress() throws IOException, InterruptedException {
try {
float progress = recordReader.getProgress();
return (progress < 0 || progress > 1) ? 0.0 : (double) progress;
} catch (IOException e) {
LOG.error(
    "Error in computing the fractions consumed as RecordReader.getProgress() throws an "
        + "exception", e);
throw new IOException(
"Error in computing the fractions consumed as RecordReader.getProgress() throws an "
+ "exception : " + e.getMessage(), e);
}
}
@Override
public final long getSplitPointsRemaining() {
if (doneReading) {
return 0;
}
// This source does not currently support dynamic work rebalancing, so remaining parallelism
// is always 1.
return 1;
}
}
}
/**
* A wrapper to allow Hadoop {@link org.apache.hadoop.mapreduce.InputSplit} to be serialized using
* Java's standard serialization mechanisms.
*/
public static class SerializableSplit implements Serializable {
InputSplit inputSplit;
public SerializableSplit() {}
public SerializableSplit(InputSplit split) {
checkArgument(split instanceof Writable,
String.format("Split is not of type Writable: %s", split));
this.inputSplit = split;
}
public InputSplit getSplit() {
return inputSplit;
}
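// Java serialization is delegated to Hadoop's ObjectWritable below, which is why the wrapped
// split is required to be a Writable.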
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
ObjectWritable ow = new ObjectWritable();
ow.setConf(new Configuration(false));
ow.readFields(in);
this.inputSplit = (InputSplit) ow.get();
}
private void writeObject(ObjectOutputStream out) throws IOException {
new ObjectWritable(inputSplit).write(out);
}
}
/**
* A wrapper to allow Hadoop {@link org.apache.hadoop.conf.Configuration} to be serialized using
* Java's standard serialization mechanisms. Note that the org.apache.hadoop.conf.Configuration
* is Writable.
*/
public static class SerializableConfiguration implements Externalizable {
private Configuration conf;
public SerializableConfiguration() {}
public SerializableConfiguration(Configuration conf) {
this.conf = conf;
}
public Configuration getHadoopConfiguration() {
return conf;
}
@Override
public void writeExternal(ObjectOutput out) throws IOException {
out.writeUTF(conf.getClass().getCanonicalName());
((Writable) conf).write(out);
}
@Override
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
String className = in.readUTF();
try {
conf = (Configuration) Class.forName(className).newInstance();
conf.readFields(in);
} catch (InstantiationException | IllegalAccessException e) {
throw new IOException("Unable to create configuration: " + e);
}
}
}
}