/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.hcatalog;
import org.apache.flink.api.common.io.RichInputFormat;
import org.apache.flink.api.common.io.LocatableInputSplitAssigner;
import org.apache.flink.api.common.io.statistics.BaseStatistics;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.hadoop.mapreduce.utils.HadoopUtils;
import org.apache.flink.api.java.hadoop.mapreduce.wrapper.HadoopInputSplit;
import org.apache.flink.api.java.typeutils.GenericTypeInfo;
import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.api.java.typeutils.WritableTypeInfo;
import org.apache.flink.core.io.InputSplitAssigner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * An InputFormat to read from HCatalog tables.
* The InputFormat supports projection (selection and order of fields) and partition filters.
*
 * Data can be returned as {@link org.apache.hive.hcatalog.data.HCatRecord} or as Flink-native tuples.
*
* Note: Flink tuples might only support a limited number of fields (depending on the API).
*
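 * <p>A usage sketch (the concrete subclass {@code org.apache.flink.hcatalog.java.HCatInputFormat},
 * the {@code Tuple2} type, and the database, table, and field names are assumptions for
 * illustration; exception handling is omitted):
 * <pre>{@code
 * HCatInputFormatBase<Tuple2<Integer, String>> hCatFormat =
 *     new HCatInputFormat<Tuple2<Integer, String>>("myDb", "myTable")
 *         .getFields("id", "name")
 *         .asFlinkTuples();
 * }</pre>
 *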
 * @param <T> The type of the records returned by this InputFormat.
*/
public abstract class HCatInputFormatBase<T> extends RichInputFormat<T, HadoopInputSplit> implements ResultTypeQueryable<T> {
	private static final long serialVersionUID = 1L;

	// configuration key under which HCatalog's input format expects the serialized output schema
	private static final String HCAT_KEY_OUTPUT_SCHEMA = "mapreduce.lib.hcat.output.schema";
private Configuration configuration;
private org.apache.hive.hcatalog.mapreduce.HCatInputFormat hCatInputFormat;
private RecordReader<WritableComparable, HCatRecord> recordReader;
	// true if the record reader was already advanced to the next record; only then is hasNext valid
	private boolean fetched = false;
	private boolean hasNext;
	// names of the fields when records are returned as Flink tuples; empty if HCatRecords are returned
	protected String[] fieldNames = new String[0];
protected HCatSchema outputSchema;
private TypeInformation<T> resultType;
public HCatInputFormatBase() { }
/**
 * Creates an HCatInputFormat for the given database and table.
* By default, the InputFormat returns {@link org.apache.hive.hcatalog.data.HCatRecord}.
* The return type of the InputFormat can be changed to Flink-native tuples by calling
* {@link HCatInputFormatBase#asFlinkTuples()}.
*
* @param database The name of the database to read from.
* @param table The name of the table to read.
 * @throws java.io.IOException Thrown, if the InputFormat cannot be configured or the table schema cannot be read.
*/
public HCatInputFormatBase(String database, String table) throws IOException {
this(database, table, new Configuration());
}
/**
 * Creates an HCatInputFormat for the given database, table, and
* {@link org.apache.hadoop.conf.Configuration}.
* By default, the InputFormat returns {@link org.apache.hive.hcatalog.data.HCatRecord}.
* The return type of the InputFormat can be changed to Flink-native tuples by calling
* {@link HCatInputFormatBase#asFlinkTuples()}.
*
* @param database The name of the database to read from.
* @param table The name of the table to read.
* @param config The Configuration for the InputFormat.
 * @throws java.io.IOException Thrown, if the InputFormat cannot be configured or the table schema cannot be read.
*/
public HCatInputFormatBase(String database, String table, Configuration config) throws IOException {
super();
this.configuration = config;
HadoopUtils.mergeHadoopConf(this.configuration);
this.hCatInputFormat = org.apache.hive.hcatalog.mapreduce.HCatInputFormat.setInput(this.configuration, database, table);
this.outputSchema = org.apache.hive.hcatalog.mapreduce.HCatInputFormat.getTableSchema(this.configuration);
		// store the output schema in the configuration, where HCatalog's input format picks it up
		configuration.set(HCAT_KEY_OUTPUT_SCHEMA, HCatUtil.serialize(outputSchema));
		// set the default type information: records are returned as HCatRecords (Hadoop Writables)
		this.resultType = new WritableTypeInfo(DefaultHCatRecord.class);
}
/**
* Specifies the fields which are returned by the InputFormat and their order.
*
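	 * <p>For example, to project the table to the columns {@code id} and {@code name}, in that
	 * order (the column names are placeholders for illustration):
	 * <pre>{@code
	 * hCatFormat.getFields("id", "name");
	 * }</pre>
	 *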
* @param fields The fields and their order which are returned by the InputFormat.
* @return This InputFormat with specified return fields.
	 * @throws java.io.IOException Thrown, if the projected schema cannot be built or stored in the configuration.
*/
public HCatInputFormatBase<T> getFields(String... fields) throws IOException {
// build output schema
ArrayList<HCatFieldSchema> fieldSchemas = new ArrayList<HCatFieldSchema>(fields.length);
for(String field : fields) {
fieldSchemas.add(this.outputSchema.get(field));
}
this.outputSchema = new HCatSchema(fieldSchemas);
		// store the updated output schema in the configuration
		configuration.set(HCAT_KEY_OUTPUT_SCHEMA, HCatUtil.serialize(outputSchema));
return this;
}
/**
* Specifies a SQL-like filter condition on the table's partition columns.
* Filter conditions on non-partition columns are invalid.
* A partition filter can significantly reduce the amount of data to be read.
*
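	 * <p>For example, assuming a table partitioned by a {@code region} column (column name and
	 * value are placeholders for illustration):
	 * <pre>{@code
	 * hCatFormat.withFilter("region = \"eu\"");
	 * }</pre>
	 *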
* @param filter A SQL-like filter condition on the table's partition columns.
* @return This InputFormat with specified partition filter.
	 * @throws java.io.IOException Thrown, if the filter cannot be set.
*/
public HCatInputFormatBase<T> withFilter(String filter) throws IOException {
// set filter
this.hCatInputFormat.setFilter(filter);
return this;
}
/**
* Specifies that the InputFormat returns Flink tuples instead of
* {@link org.apache.hive.hcatalog.data.HCatRecord}.
*
* Note: Flink tuples might only support a limited number of fields (depending on the API).
*
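	 * <p>For example, a table projected to an INT column and a STRING column is returned as
	 * {@code Tuple2<Integer, String>}; the complete type mapping is implemented in the private
	 * {@code getFieldType(HCatFieldSchema)} method.
	 *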
* @return This InputFormat.
	 * @throws org.apache.hive.hcatalog.common.HCatException Thrown, if a field of the table schema cannot be accessed.
*/
	@SuppressWarnings("unchecked")
	public HCatInputFormatBase<T> asFlinkTuples() throws HCatException {
// build type information
int numFields = outputSchema.getFields().size();
if(numFields > this.getMaxFlinkTupleSize()) {
throw new IllegalArgumentException("Only up to "+this.getMaxFlinkTupleSize()+
" fields can be returned as Flink tuples.");
}
TypeInformation[] fieldTypes = new TypeInformation[numFields];
fieldNames = new String[numFields];
for (String fieldName : outputSchema.getFieldNames()) {
HCatFieldSchema field = outputSchema.get(fieldName);
int fieldPos = outputSchema.getPosition(fieldName);
TypeInformation fieldType = getFieldType(field);
fieldTypes[fieldPos] = fieldType;
fieldNames[fieldPos] = fieldName;
}
this.resultType = new TupleTypeInfo(fieldTypes);
return this;
}
protected abstract int getMaxFlinkTupleSize();
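	/**
	 * Maps an HCatalog field type to the corresponding Flink {@link TypeInformation}.
	 * Primitive types are mapped to their {@link BasicTypeInfo} counterparts, BINARY to a
	 * primitive byte array, and the complex types ARRAY, MAP, and STRUCT to a
	 * {@link GenericTypeInfo} of {@link List} or {@link Map}.
	 */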
private TypeInformation getFieldType(HCatFieldSchema fieldSchema) {
switch(fieldSchema.getType()) {
case INT:
return BasicTypeInfo.INT_TYPE_INFO;
case TINYINT:
return BasicTypeInfo.BYTE_TYPE_INFO;
case SMALLINT:
return BasicTypeInfo.SHORT_TYPE_INFO;
case BIGINT:
return BasicTypeInfo.LONG_TYPE_INFO;
case BOOLEAN:
return BasicTypeInfo.BOOLEAN_TYPE_INFO;
case FLOAT:
return BasicTypeInfo.FLOAT_TYPE_INFO;
case DOUBLE:
return BasicTypeInfo.DOUBLE_TYPE_INFO;
case STRING:
return BasicTypeInfo.STRING_TYPE_INFO;
case BINARY:
return PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO;
case ARRAY:
return new GenericTypeInfo(List.class);
case MAP:
return new GenericTypeInfo(Map.class);
case STRUCT:
return new GenericTypeInfo(List.class);
			default:
				throw new IllegalArgumentException("Unsupported data type \"" + fieldSchema.getType() + "\" encountered.");
}
}
/**
* Returns the {@link org.apache.hadoop.conf.Configuration} of the HCatInputFormat.
*
* @return The Configuration of the HCatInputFormat.
*/
public Configuration getConfiguration() {
return this.configuration;
}
/**
* Returns the {@link org.apache.hive.hcatalog.data.schema.HCatSchema} of the {@link org.apache.hive.hcatalog.data.HCatRecord}
* returned by this InputFormat.
*
* @return The HCatSchema of the HCatRecords returned by this InputFormat.
*/
public HCatSchema getOutputSchema() {
return this.outputSchema;
}
// --------------------------------------------------------------------------------------------
// InputFormat
// --------------------------------------------------------------------------------------------
@Override
public void configure(org.apache.flink.configuration.Configuration parameters) {
// nothing to do
}
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
// no statistics provided at the moment
return null;
}
@Override
public HadoopInputSplit[] createInputSplits(int minNumSplits)
throws IOException {
configuration.setInt("mapreduce.input.fileinputformat.split.minsize", minNumSplits);
JobContext jobContext = null;
try {
jobContext = HadoopUtils.instantiateJobContext(configuration, new JobID());
} catch (Exception e) {
throw new RuntimeException(e);
}
List<InputSplit> splits;
try {
splits = this.hCatInputFormat.getSplits(jobContext);
} catch (InterruptedException e) {
throw new IOException("Could not get Splits.", e);
}
HadoopInputSplit[] hadoopInputSplits = new HadoopInputSplit[splits.size()];
for(int i = 0; i < hadoopInputSplits.length; i++){
hadoopInputSplits[i] = new HadoopInputSplit(i, splits.get(i), jobContext);
}
return hadoopInputSplits;
}
@Override
public InputSplitAssigner getInputSplitAssigner(HadoopInputSplit[] inputSplits) {
return new LocatableInputSplitAssigner(inputSplits);
}
@Override
public void open(HadoopInputSplit split) throws IOException {
TaskAttemptContext context = null;
try {
context = HadoopUtils.instantiateTaskAttemptContext(configuration, new TaskAttemptID());
} catch(Exception e) {
throw new RuntimeException(e);
}
try {
this.recordReader = this.hCatInputFormat
.createRecordReader(split.getHadoopInputSplit(), context);
this.recordReader.initialize(split.getHadoopInputSplit(), context);
} catch (InterruptedException e) {
throw new IOException("Could not create RecordReader.", e);
} finally {
this.fetched = false;
}
}
@Override
public boolean reachedEnd() throws IOException {
if(!this.fetched) {
fetchNext();
}
return !this.hasNext;
}
private void fetchNext() throws IOException {
try {
this.hasNext = this.recordReader.nextKeyValue();
} catch (InterruptedException e) {
throw new IOException("Could not fetch next KeyValue pair.", e);
} finally {
this.fetched = true;
}
}
	@Override
	@SuppressWarnings("unchecked")
public T nextRecord(T record) throws IOException {
if(!this.fetched) {
// first record
fetchNext();
}
if(!this.hasNext) {
return null;
}
try {
// get next HCatRecord
HCatRecord v = this.recordReader.getCurrentValue();
this.fetched = false;
if(this.fieldNames.length > 0) {
// return as Flink tuple
return this.buildFlinkTuple(record, v);
} else {
// return as HCatRecord
return (T)v;
}
} catch (InterruptedException e) {
throw new IOException("Could not get next record.", e);
}
}
protected abstract T buildFlinkTuple(T t, HCatRecord record) throws HCatException;
@Override
	public void close() throws IOException {
		if (this.recordReader != null) {
			this.recordReader.close();
		}
	}
// --------------------------------------------------------------------------------------------
// Custom de/serialization methods
// --------------------------------------------------------------------------------------------
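	// A Hadoop Configuration is not java.io.Serializable. It is therefore written and read
	// through its own Writable methods; the HCatInputFormat, output schema, and result type
	// are rebuilt from it during deserialization.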
	private void writeObject(ObjectOutputStream out) throws IOException {
		// write the projected field names and the Hadoop configuration;
		// all remaining state is rebuilt in readObject()
out.writeInt(this.fieldNames.length);
for(String fieldName : this.fieldNames) {
out.writeUTF(fieldName);
}
this.configuration.write(out);
}
@SuppressWarnings("unchecked")
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
this.fieldNames = new String[in.readInt()];
for(int i=0; i<this.fieldNames.length; i++) {
this.fieldNames[i] = in.readUTF();
}
Configuration configuration = new Configuration();
configuration.readFields(in);
		if (this.configuration == null) {
			// null after default deserialization; use the configuration read from the stream
			this.configuration = configuration;
}
		// rebuild the HCatInputFormat and the output schema from the configuration
		this.hCatInputFormat = new org.apache.hive.hcatalog.mapreduce.HCatInputFormat();
		this.outputSchema = (HCatSchema) HCatUtil.deserialize(this.configuration.get(HCAT_KEY_OUTPUT_SCHEMA));
		// restore the result type, which is not serialized
		if (this.fieldNames.length > 0) {
			// records are returned as Flink tuples
			asFlinkTuples();
		} else {
			// records are returned as HCatRecords
			this.resultType = new WritableTypeInfo(DefaultHCatRecord.class);
		}
	}
// --------------------------------------------------------------------------------------------
// Result type business
// --------------------------------------------------------------------------------------------
@Override
public TypeInformation<T> getProducedType() {
return this.resultType;
}
}