/*
* Copyright [2013-2015] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.shifu.guagua;
import java.io.IOException;
import java.lang.reflect.Constructor;
import ml.shifu.guagua.GuaguaRuntimeException;
import ml.shifu.guagua.hadoop.io.GuaguaWritableAdapter;
import ml.shifu.guagua.io.GuaguaFileSplit;
import ml.shifu.guagua.io.GuaguaRecordReader;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.pig.data.Tuple;
import parquet.filter.UnboundRecordFilter;
import parquet.filter2.compat.FilterCompat;
import parquet.filter2.compat.FilterCompat.Filter;
import parquet.filter2.predicate.FilterPredicate;
import parquet.hadoop.BadConfigurationException;
import parquet.hadoop.ParquetInputSplit;
import parquet.hadoop.ParquetRecordReader;
import parquet.hadoop.api.ReadSupport;
import parquet.hadoop.util.ConfigurationUtil;
import parquet.hadoop.util.SerializationUtil;
import parquet.pig.TupleReadSupport;
/**
* {@link GuaguaParquetRecordReader} is a reader to read parquet format data to pig tuple format.
*
* <p>
* {@link GuaguaParquetRecordReader} is depending on pig and temperally to make it in shifu. In long term, this should
* be migrated to Guagua.
*
* @author Zhang David (pengzhang@paypal.com)
*/
public class GuaguaParquetRecordReader implements
GuaguaRecordReader<GuaguaWritableAdapter<LongWritable>, GuaguaWritableAdapter<Tuple>> {
/**
* key to configure the ReadSupport implementation
*/
public static final String READ_SUPPORT_CLASS = "parquet.read.support.class";
/**
* key to configure the filter predicate
*/
public static final String FILTER_PREDICATE = "parquet.private.read.filter.predicate";
/**
* key to configure the filter
*/
public static final String UNBOUND_RECORD_FILTER = "parquet.read.filter";
private ParquetRecordReader<Tuple> parquetRecordReader;
private Configuration conf;
public GuaguaParquetRecordReader() {
this.conf = new Configuration();
}
public GuaguaParquetRecordReader(GuaguaFileSplit split) throws IOException {
this(new Configuration(), split);
}
public GuaguaParquetRecordReader(Configuration conf, GuaguaFileSplit split) throws IOException {
this.conf = conf;
initialize(split);
}
private boolean isHadoop2() {
try {
Class.forName("org.apache.hadoop.mapreduce.task.MapContextImpl");
return true;
} catch (ClassNotFoundException e) {
return false;
}
}
/*
* (non-Javadoc)
*
* @see ml.shifu.guagua.io.GuaguaRecordReader#initialize(ml.shifu.guagua.io.GuaguaFileSplit)
*/
@Override
public void initialize(GuaguaFileSplit split) throws IOException {
ReadSupport<Tuple> readSupport = getReadSupportInstance(this.conf);
this.parquetRecordReader = new ParquetRecordReader<Tuple>(readSupport, getFilter(this.conf));
ParquetInputSplit parquetInputSplit = new ParquetInputSplit(new Path(split.getPath()), split.getOffset(),
split.getOffset() + split.getLength(), split.getLength(), null, null);
try {
this.parquetRecordReader.initialize(parquetInputSplit, buildContext());
} catch (InterruptedException e) {
throw new GuaguaRuntimeException(e);
}
}
/*
* Build context through reflection to make sure code compatible between hadoop 1 and hadoop 2
*/
private TaskAttemptContext buildContext() {
TaskAttemptID id = null;
TaskAttemptContext context = null;
try {
if(isHadoop2()) {
Class<?> taskTypeClass = Class.forName("org.apache.hadoop.mapreduce.TaskType");
Constructor<TaskAttemptID> constructor = TaskAttemptID.class.getDeclaredConstructor(String.class,
Integer.TYPE, taskTypeClass, Integer.TYPE, Integer.TYPE);
id = constructor.newInstance("mock", -1, fromEnumConstantName(taskTypeClass, "MAP"), -1, -1);
Constructor<?> contextConstructor = Class.forName(
"org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl").getDeclaredConstructor(
Configuration.class, TaskAttemptID.class);
context = (TaskAttemptContext) contextConstructor.newInstance(this.conf, id);
} else {
Constructor<TaskAttemptID> constructor = TaskAttemptID.class.getDeclaredConstructor(String.class,
Integer.TYPE, Boolean.TYPE, Integer.TYPE, Integer.TYPE);
constructor.setAccessible(true);
id = constructor.newInstance("mock", -1, false, -1, -1);
Constructor<?> contextConstructor = Class.forName("org.apache.hadoop.mapreduce.TaskAttemptContext")
.getDeclaredConstructor(Configuration.class, TaskAttemptID.class);
context = (TaskAttemptContext) contextConstructor.newInstance(this.conf, id);
}
} catch (Throwable e) {
throw new GuaguaRuntimeException(e);
}
return context;
}
private static FilterPredicate getFilterPredicate(Configuration configuration) {
try {
return SerializationUtil.readObjectFromConfAsBase64(FILTER_PREDICATE, configuration);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/*
* Returns a non-null Filter, which is a wrapper around either a
* FilterPredicate, an UnboundRecordFilter, or a no-op filter.
*/
public static Filter getFilter(Configuration conf) {
return FilterCompat.get(getFilterPredicate(conf), getUnboundRecordFilterInstance(conf));
}
private static UnboundRecordFilter getUnboundRecordFilterInstance(Configuration configuration) {
Class<?> clazz = ConfigurationUtil.getClassFromConfig(configuration, UNBOUND_RECORD_FILTER,
UnboundRecordFilter.class);
if(clazz == null) {
return null;
}
try {
UnboundRecordFilter unboundRecordFilter = (UnboundRecordFilter) clazz.newInstance();
if(unboundRecordFilter instanceof Configurable) {
((Configurable) unboundRecordFilter).setConf(configuration);
}
return unboundRecordFilter;
} catch (InstantiationException e) {
throw new BadConfigurationException("could not instantiate unbound record filter class", e);
} catch (IllegalAccessException e) {
throw new BadConfigurationException("could not instantiate unbound record filter class", e);
}
}
/*
* Return read support instance
*
* @param configuration
* to find the configuration for the read support
*
* @return the configured read support
*/
@SuppressWarnings("unchecked")
public static <T> ReadSupport<T> getReadSupportInstance(Configuration configuration) {
return getReadSupportInstance((Class<? extends ReadSupport<T>>) getReadSupportClass(configuration));
}
public static Class<?> getReadSupportClass(Configuration configuration) {
Class<?> clazz = ConfigurationUtil.getClassFromConfig(configuration, READ_SUPPORT_CLASS, ReadSupport.class);
if(clazz == null) {
clazz = TupleReadSupport.class;
}
return clazz;
}
/*
* Return read support instance
*
* @param readSupportClass
* to instantiate
*
* @return the configured read support
*/
static <T> ReadSupport<T> getReadSupportInstance(Class<? extends ReadSupport<T>> readSupportClass) {
try {
return readSupportClass.newInstance();
} catch (InstantiationException e) {
throw new BadConfigurationException("could not instantiate read support class", e);
} catch (IllegalAccessException e) {
throw new BadConfigurationException("could not instantiate read support class", e);
}
}
/*
* (non-Javadoc)
*
* @see ml.shifu.guagua.io.GuaguaRecordReader#nextKeyValue()
*/
@Override
public boolean nextKeyValue() throws IOException {
try {
return this.parquetRecordReader.nextKeyValue();
} catch (InterruptedException e) {
throw new GuaguaRuntimeException(e);
}
}
/*
* (non-Javadoc)
*
* @see ml.shifu.guagua.io.GuaguaRecordReader#getCurrentKey()
*/
@Override
public GuaguaWritableAdapter<LongWritable> getCurrentKey() {
return null;
}
/*
* (non-Javadoc)
*
* @see ml.shifu.guagua.io.GuaguaRecordReader#getCurrentValue()
*/
@Override
public GuaguaWritableAdapter<Tuple> getCurrentValue() {
try {
return new GuaguaWritableAdapter<Tuple>(this.parquetRecordReader.getCurrentValue());
} catch (IOException e) {
throw new GuaguaRuntimeException(e);
} catch (InterruptedException e) {
throw new GuaguaRuntimeException(e);
}
}
@SuppressWarnings("rawtypes")
private static Enum fromEnumConstantName(Class<?> enumClass, String constantName) {
Object[] enumConstants = enumClass.getEnumConstants();
for(Object t: enumConstants) {
if(((java.lang.Enum<?>) t).name().equals(constantName)) {
return (Enum) t;
}
}
return null;
}
/*
* (non-Javadoc)
*
* @see ml.shifu.guagua.io.GuaguaRecordReader#close()
*/
@Override
public void close() throws IOException {
this.parquetRecordReader.close();
}
}