/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.hcatalog;

import org.apache.flink.api.common.io.RichInputFormat;
import org.apache.flink.api.common.io.LocatableInputSplitAssigner;
import org.apache.flink.api.common.io.statistics.BaseStatistics;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.hadoop.mapreduce.utils.HadoopUtils;
import org.apache.flink.api.java.hadoop.mapreduce.wrapper.HadoopInputSplit;
import org.apache.flink.api.java.typeutils.GenericTypeInfo;
import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.api.java.typeutils.WritableTypeInfo;
import org.apache.flink.core.io.InputSplitAssigner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchema;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * An InputFormat to read from HCatalog tables.
 * The InputFormat supports projection (selection and order of fields) and partition filters.
 *
 * Data can be returned as {@link org.apache.hive.hcatalog.data.HCatRecord} or Flink-native tuple.
 *
 * Note: Flink tuples might only support a limited number of fields (depending on the API).
 *
 * @param <T> The type of the records produced by this InputFormat.
 */
public abstract class HCatInputFormatBase<T> extends RichInputFormat<T, HadoopInputSplit> implements ResultTypeQueryable<T> {

	private static final long serialVersionUID = 1L;

	private Configuration configuration;

	private org.apache.hive.hcatalog.mapreduce.HCatInputFormat hCatInputFormat;
	private RecordReader<WritableComparable, HCatRecord> recordReader;
	private boolean fetched = false;
	private boolean hasNext;

	protected String[] fieldNames = new String[0];
	protected HCatSchema outputSchema;

	private TypeInformation<T> resultType;

	public HCatInputFormatBase() { }

	/**
	 * Creates a HCatInputFormat for the given database and table.
	 * By default, the InputFormat returns {@link org.apache.hive.hcatalog.data.HCatRecord}.
	 * The return type of the InputFormat can be changed to Flink-native tuples by calling
	 * {@link HCatInputFormatBase#asFlinkTuples()}.
	 *
	 * @param database The name of the database to read from.
	 * @param table The name of the table to read.
	 * @throws java.io.IOException
	 */
	public HCatInputFormatBase(String database, String table) throws IOException {
		this(database, table, new Configuration());
	}

	/**
	 * Creates a HCatInputFormat for the given database, table, and
	 * {@link org.apache.hadoop.conf.Configuration}.
	 * By default, the InputFormat returns {@link org.apache.hive.hcatalog.data.HCatRecord}.
	 * The return type of the InputFormat can be changed to Flink-native tuples by calling
	 * {@link HCatInputFormatBase#asFlinkTuples()}.
	 *
	 * @param database The name of the database to read from.
	 * @param table The name of the table to read.
	 * @param config The Configuration for the InputFormat.
	 * @throws java.io.IOException
	 */
	public HCatInputFormatBase(String database, String table, Configuration config) throws IOException {
		super();
		this.configuration = config;
		HadoopUtils.mergeHadoopConf(this.configuration);

		this.hCatInputFormat = org.apache.hive.hcatalog.mapreduce.HCatInputFormat.setInput(this.configuration, database, table);
		this.outputSchema = org.apache.hive.hcatalog.mapreduce.HCatInputFormat.getTableSchema(this.configuration);

		// configure output schema of HCatFormat
		configuration.set("mapreduce.lib.hcat.output.schema", HCatUtil.serialize(outputSchema));
		// set type information
		this.resultType = new WritableTypeInfo(DefaultHCatRecord.class);
	}

	/**
	 * Specifies the fields which are returned by the InputFormat and their order.
	 *
	 * @param fields The fields and their order which are returned by the InputFormat.
	 * @return This InputFormat with specified return fields.
	 * @throws java.io.IOException
	 */
	public HCatInputFormatBase<T> getFields(String... fields) throws IOException {

		// build output schema
		ArrayList<HCatFieldSchema> fieldSchemas = new ArrayList<HCatFieldSchema>(fields.length);
		for(String field : fields) {
			fieldSchemas.add(this.outputSchema.get(field));
		}
		this.outputSchema = new HCatSchema(fieldSchemas);

		// update output schema configuration
		configuration.set("mapreduce.lib.hcat.output.schema", HCatUtil.serialize(outputSchema));

		return this;
	}

	/**
	 * Specifies a SQL-like filter condition on the table's partition columns.
	 * Filter conditions on non-partition columns are invalid.
	 * A partition filter can significantly reduce the amount of data to be read.
	 *
	 * @param filter A SQL-like filter condition on the table's partition columns.
	 * @return This InputFormat with specified partition filter.
	 * @throws java.io.IOException
	 */
	public HCatInputFormatBase<T> withFilter(String filter) throws IOException {

		// set filter
		this.hCatInputFormat.setFilter(filter);

		return this;
	}

	/**
	 * Specifies that the InputFormat returns Flink tuples instead of
	 * {@link org.apache.hive.hcatalog.data.HCatRecord}.
	 *
	 * Note: Flink tuples might only support a limited number of fields (depending on the API).
	 *
	 * @return This InputFormat.
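	 *
	 *         A hypothetical usage sketch (the concrete subclass name {@code HCatInputFormat},
	 *         the database/table names, and the field names below are assumptions, not part of
	 *         this class):
	 * <pre>{@code
	 * HCatInputFormatBase<Tuple2<Integer, String>> format =
	 *     new HCatInputFormat<Tuple2<Integer, String>>("mydb", "mytable") // assumed subclass and names
	 *         .getFields("id", "name")   // project and order the returned fields
	 *         .asFlinkTuples();          // produce Tuple2 instead of HCatRecord
	 * }</pre>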
	 * @throws org.apache.hive.hcatalog.common.HCatException
	 */
	public HCatInputFormatBase<T> asFlinkTuples() throws HCatException {

		// build type information
		int numFields = outputSchema.getFields().size();
		if(numFields > this.getMaxFlinkTupleSize()) {
			throw new IllegalArgumentException("Only up to "+this.getMaxFlinkTupleSize()+
					" fields can be returned as Flink tuples.");
		}

		TypeInformation[] fieldTypes = new TypeInformation[numFields];
		fieldNames = new String[numFields];
		for (String fieldName : outputSchema.getFieldNames()) {
			HCatFieldSchema field = outputSchema.get(fieldName);

			int fieldPos = outputSchema.getPosition(fieldName);
			TypeInformation fieldType = getFieldType(field);

			fieldTypes[fieldPos] = fieldType;
			fieldNames[fieldPos] = fieldName;
		}
		this.resultType = new TupleTypeInfo(fieldTypes);

		return this;
	}

	protected abstract int getMaxFlinkTupleSize();

	private TypeInformation getFieldType(HCatFieldSchema fieldSchema) {

		switch(fieldSchema.getType()) {
			case INT:
				return BasicTypeInfo.INT_TYPE_INFO;
			case TINYINT:
				return BasicTypeInfo.BYTE_TYPE_INFO;
			case SMALLINT:
				return BasicTypeInfo.SHORT_TYPE_INFO;
			case BIGINT:
				return BasicTypeInfo.LONG_TYPE_INFO;
			case BOOLEAN:
				return BasicTypeInfo.BOOLEAN_TYPE_INFO;
			case FLOAT:
				return BasicTypeInfo.FLOAT_TYPE_INFO;
			case DOUBLE:
				return BasicTypeInfo.DOUBLE_TYPE_INFO;
			case STRING:
				return BasicTypeInfo.STRING_TYPE_INFO;
			case BINARY:
				return PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO;
			case ARRAY:
				return new GenericTypeInfo(List.class);
			case MAP:
				return new GenericTypeInfo(Map.class);
			case STRUCT:
				return new GenericTypeInfo(List.class);
			default:
				throw new IllegalArgumentException("Unknown data type \""+fieldSchema.getType()+"\" encountered.");
		}
	}

	/**
	 * Returns the {@link org.apache.hadoop.conf.Configuration} of the HCatInputFormat.
	 *
	 * @return The Configuration of the HCatInputFormat.
	 */
	public Configuration getConfiguration() {
		return this.configuration;
	}

	/**
	 * Returns the {@link org.apache.hive.hcatalog.data.schema.HCatSchema} of the {@link org.apache.hive.hcatalog.data.HCatRecord}
	 * returned by this InputFormat.
	 *
	 * @return The HCatSchema of the HCatRecords returned by this InputFormat.
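	 *
	 *         A minimal sketch of inspecting the returned schema (the variable names are
	 *         assumptions; {@code getFieldNames()}, {@code get(..)} and {@code getType()}
	 *         are the HCatSchema/HCatFieldSchema accessors already used by this class):
	 * <pre>{@code
	 * HCatSchema schema = format.getOutputSchema();
	 * for (String name : schema.getFieldNames()) {
	 *     // print each field's name together with its HCatalog type
	 *     System.out.println(name + ": " + schema.get(name).getType());
	 * }
	 * }</pre>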
	 */
	public HCatSchema getOutputSchema() {
		return this.outputSchema;
	}

	// --------------------------------------------------------------------------------------------
	//  InputFormat
	// --------------------------------------------------------------------------------------------

	@Override
	public void configure(org.apache.flink.configuration.Configuration parameters) {
		// nothing to do
	}

	@Override
	public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
		// no statistics provided at the moment
		return null;
	}

	@Override
	public HadoopInputSplit[] createInputSplits(int minNumSplits) throws IOException {
		configuration.setInt("mapreduce.input.fileinputformat.split.minsize", minNumSplits);

		JobContext jobContext = null;
		try {
			jobContext = HadoopUtils.instantiateJobContext(configuration, new JobID());
		} catch (Exception e) {
			throw new RuntimeException(e);
		}

		List<InputSplit> splits;
		try {
			splits = this.hCatInputFormat.getSplits(jobContext);
		} catch (InterruptedException e) {
			throw new IOException("Could not get Splits.", e);
		}
		HadoopInputSplit[] hadoopInputSplits = new HadoopInputSplit[splits.size()];

		for(int i = 0; i < hadoopInputSplits.length; i++){
			hadoopInputSplits[i] = new HadoopInputSplit(i, splits.get(i), jobContext);
		}
		return hadoopInputSplits;
	}

	@Override
	public InputSplitAssigner getInputSplitAssigner(HadoopInputSplit[] inputSplits) {
		return new LocatableInputSplitAssigner(inputSplits);
	}

	@Override
	public void open(HadoopInputSplit split) throws IOException {
		TaskAttemptContext context = null;
		try {
			context = HadoopUtils.instantiateTaskAttemptContext(configuration, new TaskAttemptID());
		} catch(Exception e) {
			throw new RuntimeException(e);
		}

		try {
			this.recordReader = this.hCatInputFormat
					.createRecordReader(split.getHadoopInputSplit(), context);
			this.recordReader.initialize(split.getHadoopInputSplit(), context);
		} catch (InterruptedException e) {
			throw new IOException("Could not create RecordReader.", e);
		} finally {
			this.fetched = false;
		}
	}

	@Override
	public boolean reachedEnd() throws IOException {
		if(!this.fetched) {
			fetchNext();
		}
		return !this.hasNext;
	}

	private void fetchNext() throws IOException {
		try {
			this.hasNext = this.recordReader.nextKeyValue();
		} catch (InterruptedException e) {
			throw new IOException("Could not fetch next KeyValue pair.", e);
		} finally {
			this.fetched = true;
		}
	}

	@Override
	public T nextRecord(T record) throws IOException {
		if(!this.fetched) {
			// first record
			fetchNext();
		}
		if(!this.hasNext) {
			return null;
		}
		try {

			// get next HCatRecord
			HCatRecord v = this.recordReader.getCurrentValue();
			this.fetched = false;

			if(this.fieldNames.length > 0) {
				// return as Flink tuple
				return this.buildFlinkTuple(record, v);

			} else {
				// return as HCatRecord
				return (T)v;
			}

		} catch (InterruptedException e) {
			throw new IOException("Could not get next record.", e);
		}
	}

	protected abstract T buildFlinkTuple(T t, HCatRecord record) throws HCatException;

	@Override
	public void close() throws IOException {
		this.recordReader.close();
	}

	// --------------------------------------------------------------------------------------------
	//  Custom de/serialization methods
	// --------------------------------------------------------------------------------------------

	private void writeObject(ObjectOutputStream out) throws IOException {
		out.writeInt(this.fieldNames.length);
		for(String fieldName : this.fieldNames) {
			out.writeUTF(fieldName);
		}
		this.configuration.write(out);
	}

	@SuppressWarnings("unchecked")
	private void readObject(ObjectInputStream in) throws IOException,
			ClassNotFoundException {
		this.fieldNames = new String[in.readInt()];
		for(int i=0; i<this.fieldNames.length; i++) {
			this.fieldNames[i] = in.readUTF();
		}

		Configuration configuration = new Configuration();
		configuration.readFields(in);

		if(this.configuration == null) {
			// only use the deserialized Configuration if none has been set on this instance
			this.configuration = configuration;
		}

		this.hCatInputFormat = new org.apache.hive.hcatalog.mapreduce.HCatInputFormat();
		this.outputSchema = (HCatSchema)HCatUtil.deserialize(this.configuration.get("mapreduce.lib.hcat.output.schema"));
	}

	// --------------------------------------------------------------------------------------------
	//  Result type business
	// --------------------------------------------------------------------------------------------

	@Override
	public TypeInformation<T> getProducedType() {
		return this.resultType;
	}

}