/*******************************************************************************
 * Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 *******************************************************************************/
package hydrograph.engine.cascading.scheme.avro;

import cascading.avro.serialization.AvroSpecificRecordSerialization;
import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.CompositeTap;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.mapred.*;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;

public class CustomAvroScheme extends
		Scheme<JobConf, RecordReader, OutputCollector, Object[], Object[]> {

	private static final String DEFAULT_RECORD_NAME = "CascadingAvroRecord";

	private static final PathFilter filter = new PathFilter() {
		@Override
		public boolean accept(Path path) {
			return !path.getName().startsWith("_");
		}
	};

	Schema schema;

	/**
	 * Constructor to read from an Avro source or write to an Avro sink without
	 * specifying the schema. If used as a sink, the sink Fields must have type
	 * information; Map and List are currently not supported.
	 */
	// public CustomAvroScheme() {
	// this(null);
	// }

	/**
	 * Create a new Cascading 2.0 scheme suitable for reading and writing data
	 * using the Avro serialization format. The Avro schema is generated from
	 * the field names, data types, precision and scale carried by the supplied
	 * {@link AvroDescriptor}.
	 *
	 * @param avroDescriptor
	 *            descriptor holding the field metadata used to build the Avro
	 *            schema
	 */
	public CustomAvroScheme(AvroDescriptor avroDescriptor) {
		this(CustomCascadingToAvro.generateAvroSchemaFromFieldsAndTypes(
				DEFAULT_RECORD_NAME, avroDescriptor.getInputFields(),
				avroDescriptor.getFieldDataTypes(),
				avroDescriptor.getFieldPrecision(),
				avroDescriptor.getFieldScale()));
	}
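	/*
	 * Illustrative sketch (not part of the scheme): the constructor above
	 * delegates to CustomCascadingToAvro to build an Avro schema from the
	 * descriptor's field names, types, precision and scale. The exact JSON it
	 * produces is specific to that generator; the snippet below only shows the
	 * general shape of such a record schema, built with the standard
	 * org.apache.avro.Schema.Parser. Field names and types are made up for
	 * illustration.
	 *
	 *   Schema example = new Schema.Parser().parse(
	 *       "{\"type\":\"record\",\"name\":\"CascadingAvroRecord\",\"fields\":["
	 *           + "{\"name\":\"id\",\"type\":[\"null\",\"long\"]},"
	 *           + "{\"name\":\"name\",\"type\":[\"null\",\"string\"]}]}");
	 */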
	/**
	 * Create a new Cascading 2.0 scheme suitable for reading and writing data
	 * using the Avro serialization format. Note that if the schema is null, the
	 * Avro schema will be inferred from one of the source files (if this scheme
	 * is being used as a source). At the moment we are unable to infer a schema
	 * for a sink (this will change with a newer version of Cascading).
	 *
	 * @param schema
	 *            Avro schema, or null if it is to be inferred from a source
	 *            file. Note that a runtime exception will be thrown if the
	 *            scheme is used as a sink and no schema is supplied.
	 */
	public CustomAvroScheme(Schema schema) {
		this.schema = schema;
		if (schema == null) {
			setSinkFields(Fields.ALL);
			setSourceFields(Fields.UNKNOWN);
		} else {
			Fields cascadingFields = new Fields();
			for (Field avroField : schema.getFields()) {
				cascadingFields = cascadingFields.append(new Fields(avroField
						.name()));
			}
			setSinkFields(cascadingFields);
			setSourceFields(cascadingFields);
		}
	}

	/**
	 * Helper method to read in a schema when de-serializing the object.
	 *
	 * @param in
	 *            The ObjectInputStream containing the serialized object
	 * @return Schema The parsed schema.
	 */
	protected static Schema readSchema(java.io.ObjectInputStream in)
			throws IOException {
		final Schema.Parser parser = new Schema.Parser();
		try {
			return parser.parse(in.readObject().toString());
		} catch (ClassNotFoundException cce) {
			throw new RuntimeException(
					"Unable to read schema which is expected to be written as a java string",
					cce);
		}
	}

	/**
	 * Return the schema which has been set, as a string.
	 *
	 * @return String representing the schema
	 */
	String getJsonSchema() {
		if (schema == null) {
			return "";
		} else {
			return schema.toString();
		}
	}

	/**
	 * Sink method to take an outgoing tuple and write it to Avro.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object. Should be passed in by
	 *            cascading automatically.
	 * @param sinkCall
	 *            The cascading SinkCall object. Should be passed in by
	 *            cascading automatically.
	 * @throws IOException
	 */
	@Override
	public void sink(FlowProcess<? extends JobConf> flowProcess,
			SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
		TupleEntry tupleEntry = sinkCall.getOutgoingEntry();
		IndexedRecord record = new Record((Schema) sinkCall.getContext()[0]);
		Object[] objectArray = CustomCascadingToAvro.parseTupleEntry(
				tupleEntry, (Schema) sinkCall.getContext()[0]);
		for (int i = 0; i < objectArray.length; i++) {
			record.put(i, objectArray[i]);
		}
		// noinspection unchecked
		sinkCall.getOutput().collect(new AvroWrapper<IndexedRecord>(record),
				NullWritable.get());
	}

	/**
	 * Sink prepare method called by cascading once on each reducer. This method
	 * stuffs the schema into a context for easy access by the sink method.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object. Should be passed in by
	 *            cascading automatically.
	 * @param sinkCall
	 *            The cascading SinkCall object. Should be passed in by
	 *            cascading automatically.
	 * @throws IOException
	 */
	@Override
	public void sinkPrepare(FlowProcess<? extends JobConf> flowProcess,
			SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
		sinkCall.setContext(new Object[] { schema });
	}
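	/*
	 * Illustrative sketch (not part of the scheme): a typical way to use this
	 * scheme on the sink side is to attach it to a Cascading Hfs tap with an
	 * explicit Avro schema, since sinkConfInit() rejects a null schema. The
	 * output path and schema variable below are assumptions for illustration;
	 * Hfs comes from cascading.tap.hadoop.Hfs.
	 *
	 *   Schema outputSchema = ...;                        // explicit Avro schema
	 *   Tap sink = new Hfs(new CustomAvroScheme(outputSchema),
	 *           "hdfs:///tmp/avro-output");               // hypothetical path
	 *
	 * On the source side the schema argument may be null; retrieveSourceFields()
	 * will then peek at the input files to infer it.
	 */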
	/**
	 * sinkConfInit is called by cascading to set up the sinks. This happens on
	 * the client side before the job is distributed. There is a check for the
	 * presence of a schema and an exception is thrown if none has been
	 * provided. After the schema check the conf object is given the options
	 * that Avro needs.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object. Should be passed in by
	 *            cascading automatically.
	 * @param tap
	 *            The cascading Tap object. Should be passed in by cascading
	 *            automatically.
	 * @param conf
	 *            The Hadoop JobConf object. This is passed in by cascading
	 *            automatically.
	 * @throws RuntimeException
	 *             If no schema is present this halts the entire process.
	 */
	@Override
	public void sinkConfInit(FlowProcess<? extends JobConf> flowProcess,
			Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {

		if (schema == null) {
			throw new RuntimeException("Must provide sink schema");
		}
		// Set the output schema and output format class
		conf.set(AvroJob.OUTPUT_SCHEMA, schema.toString());
		conf.setOutputFormat(AvroOutputFormat.class);

		// add AvroSerialization to io.serializations
		addAvroSerializations(conf);
	}

	/**
	 * This method is called by cascading to set up the incoming fields. If a
	 * schema isn't present then it will go and peek at the input data to
	 * retrieve one. The field names from the schema are used to name the
	 * cascading fields.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object. Should be passed in by
	 *            cascading automatically.
	 * @param tap
	 *            The cascading Tap object. Should be passed in by cascading
	 *            automatically.
	 * @return Fields The source cascading fields.
	 */
	@Override
	public Fields retrieveSourceFields(
			FlowProcess<? extends JobConf> flowProcess, Tap tap) {
		if (schema == null) {
			try {
				schema = getSourceSchema(flowProcess, tap);
			} catch (IOException e) {
				throw new RuntimeException("Can't get schema from data source");
			}
		}
		Fields cascadingFields = new Fields();
		if (schema.getType().equals(Schema.Type.NULL)) {
			cascadingFields = Fields.NONE;
		} else {
			for (Field avroField : schema.getFields())
				cascadingFields = cascadingFields.append(new Fields(avroField
						.name()));
		}
		setSourceFields(cascadingFields);
		return getSourceFields();
	}

	/**
	 * Source method to take an incoming Avro record and make it a Tuple.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object. Should be passed in by
	 *            cascading automatically.
	 * @param sourceCall
	 *            The cascading SourceCall object. Should be passed in by
	 *            cascading automatically.
	 * @return boolean true on successful parsing and collection, false on
	 *         failure.
	 * @throws IOException
	 */
	@Override
	public boolean source(FlowProcess<? extends JobConf> flowProcess,
			SourceCall<Object[], RecordReader> sourceCall) throws IOException {

		@SuppressWarnings("unchecked")
		RecordReader<AvroWrapper<IndexedRecord>, Writable> input = sourceCall
				.getInput();
		AvroWrapper<IndexedRecord> wrapper = input.createKey();
		if (!input.next(wrapper, input.createValue())) {
			return false;
		}
		IndexedRecord record = wrapper.datum();
		Tuple tuple = sourceCall.getIncomingEntry().getTuple();
		tuple.clear();

		Object[] split = CustomAvroToCascading.parseRecord(record, schema);
		tuple.addAll(split);

		return true;
	}
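	/*
	 * Illustrative sketch (not part of the scheme): source() above maps the
	 * fields of an incoming IndexedRecord positionally onto a Cascading Tuple
	 * via CustomAvroToCascading. Stripped of the type conversion that helper
	 * performs, the positional copy looks roughly like this; the record
	 * variable is an assumption for illustration.
	 *
	 *   Tuple tuple = new Tuple();
	 *   for (Field f : record.getSchema().getFields()) {
	 *       tuple.add(record.get(f.pos()));   // raw Avro value, before conversion
	 *   }
	 */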
	/**
	 * sourceConfInit is called by cascading to set up the sources. This happens
	 * on the client side before the job is distributed. There is a check for
	 * the presence of a schema and if none has been provided the data is peeked
	 * at to get a schema. After the schema check the conf object is given the
	 * options that Avro needs.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object. Should be passed in by
	 *            cascading automatically.
	 * @param tap
	 *            The cascading Tap object. Should be passed in by cascading
	 *            automatically.
	 * @param conf
	 *            The Hadoop JobConf object. This is passed in by cascading
	 *            automatically.
	 * @throws RuntimeException
	 *             If no schema is present this halts the entire process.
	 */
	@Override
	public void sourceConfInit(FlowProcess<? extends JobConf> flowProcess,
			Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {

		retrieveSourceFields(flowProcess, tap);
		// Set the input schema and input class
		conf.set(AvroJob.INPUT_SCHEMA, schema.toString());
		conf.setInputFormat(AvroInputFormat.class);

		// add AvroSerialization to io.serializations
		addAvroSerializations(conf);
	}

	/**
	 * This method peeks at the source data to get a schema when none has been
	 * provided.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object for this flow.
	 * @param tap
	 *            The cascading Tap object.
	 * @return Schema The schema of the peeked-at data, or Schema.NULL if none
	 *         exists.
	 */
	private Schema getSourceSchema(FlowProcess<? extends JobConf> flowProcess,
			Tap tap) throws IOException {

		if (tap instanceof CompositeTap) {
			tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
		}
		final String path = tap.getIdentifier();
		Path p = new Path(path);
		final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy());
		// Get all the input dirs
		List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(fs
				.globStatus(p, filter)));
		// Now get all the things that are one level down
		for (FileStatus status : new LinkedList<FileStatus>(statuses)) {
			if (status.isDir())
				for (FileStatus child : Arrays.asList(fs.listStatus(
						status.getPath(), filter))) {
					if (child.isDir()) {
						statuses.addAll(Arrays.asList(fs.listStatus(
								child.getPath(), filter)));
					} else if (fs.isFile(child.getPath())) {
						statuses.add(child);
					}
				}
		}
		for (FileStatus status : statuses) {
			Path statusPath = status.getPath();
			if (fs.isFile(statusPath)) {
				// no need to open them all
				InputStream stream = null;
				DataFileStream reader = null;
				try {
					stream = new BufferedInputStream(fs.open(statusPath));
					reader = new DataFileStream(stream,
							new GenericDatumReader());
					return reader.getSchema();
				} finally {
					if (reader == null) {
						if (stream != null) {
							stream.close();
						}
					} else {
						reader.close();
					}
				}
			}
		}
		// couldn't find any Avro files, return null schema
		return Schema.create(Schema.Type.NULL);
	}

	private void addAvroSerializations(JobConf conf) {
		Collection<String> serializations = conf
				.getStringCollection("io.serializations");
		if (!serializations.contains(AvroSerialization.class.getName())) {
			serializations.add(AvroSerialization.class.getName());
			serializations
					.add(AvroSpecificRecordSerialization.class.getName());
		}

		conf.setStrings("io.serializations",
				serializations.toArray(new String[serializations.size()]));
	}

	private void writeObject(java.io.ObjectOutputStream out) throws IOException {
		out.writeObject(this.schema.toString());
	}

	private void readObject(java.io.ObjectInputStream in) throws IOException {
		this.schema = readSchema(in);
	}
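	/*
	 * Illustrative sketch (not part of the scheme): getSourceSchema() above
	 * reads the schema out of the first Avro container file it finds. Outside
	 * Hadoop the same peek can be done with the plain Avro file API; the local
	 * path below is an assumption for illustration.
	 *
	 *   try (InputStream in = new BufferedInputStream(
	 *           new FileInputStream("/tmp/sample.avro"));      // hypothetical file
	 *        DataFileStream<Object> reader = new DataFileStream<Object>(
	 *           in, new GenericDatumReader<Object>())) {
	 *       Schema peeked = reader.getSchema();                // schema from file header
	 *   }
	 */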
	@Override
	public boolean equals(Object o) {
		if (this == o)
			return true;
		if (o == null || getClass() != o.getClass())
			return false;
		if (!super.equals(o))
			return false;

		CustomAvroScheme that = (CustomAvroScheme) o;

		return !(schema != null ? !schema.equals(that.schema)
				: that.schema != null);
	}

	@Override
	public String toString() {
		return "AvroScheme{" + "schema=" + schema + '}';
	}

	@Override
	public int hashCode() {
		return 31 * getSinkFields().hashCode()
				+ (schema == null ? 0 : schema.hashCode());
	}
}
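/*
 * Illustrative end-to-end sketch (not part of this class): copying Avro data
 * from one location to another with this scheme. Paths, the schema variable
 * and the pipe name are assumptions; Hfs, Pipe and HadoopFlowConnector are
 * standard Cascading 2.x classes.
 *
 *   Tap source = new Hfs(new CustomAvroScheme((Schema) null), "input/avro");
 *   Tap sink = new Hfs(new CustomAvroScheme(outputSchema), "output/avro");
 *   Pipe copy = new Pipe("copy-avro");
 *   new HadoopFlowConnector(new Properties())
 *           .connect(source, sink, copy)
 *           .complete();
 */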