/*******************************************************************************
 * Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 *******************************************************************************/
package hydrograph.engine.cascading.scheme.avro;

import cascading.avro.serialization.AvroSpecificRecordSerialization;
import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.CompositeTap;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.mapred.*;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;

public class CustomAvroScheme extends
		Scheme<JobConf, RecordReader, OutputCollector, Object[], Object[]> {

	private static final String DEFAULT_RECORD_NAME = "CascadingAvroRecord";

	private static final PathFilter filter = new PathFilter() {
		@Override
		public boolean accept(Path path) {
			return !path.getName().startsWith("_");
		}
	};

	Schema schema;

	/**
	 * Constructor to read from an Avro source or write to an Avro sink without
	 * specifying the schema. If used as a sink, the sink Fields must have type
	 * information; Map and List are currently not supported.
	 */
	// public CustomAvroScheme() {
	// this(null);
	// }

	/**
	 * Create a new Cascading 2.0 scheme suitable for reading and writing data
	 * using the Avro serialization format. The Avro schema is generated from
	 * the field names, data types, precision and scale carried by the supplied
	 * {@link AvroDescriptor}.
	 *
	 * @param avroDescriptor
	 *            descriptor holding the field metadata used to build the Avro
	 *            schema
	 */
	public CustomAvroScheme(AvroDescriptor avroDescriptor) {
		this(CustomCascadingToAvro.generateAvroSchemaFromFieldsAndTypes(
				DEFAULT_RECORD_NAME, avroDescriptor.getInputFields(),
				avroDescriptor.getFieldDataTypes(),
				avroDescriptor.getFieldPrecision(),
				avroDescriptor.getFieldScale()));
	}
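	/*
	 * Illustrative sketch (not part of the scheme): the constructor above
	 * delegates to CustomCascadingToAvro to build an Avro schema from the
	 * descriptor's field names, types, precision and scale. The exact JSON it
	 * produces is specific to that generator; the snippet below only shows the
	 * general shape of such a record schema, built with the standard
	 * org.apache.avro.Schema.Parser. Field names and types are made up for
	 * illustration.
	 *
	 *   Schema example = new Schema.Parser().parse(
	 *       "{\"type\":\"record\",\"name\":\"CascadingAvroRecord\",\"fields\":["
	 *           + "{\"name\":\"id\",\"type\":[\"null\",\"long\"]},"
	 *           + "{\"name\":\"name\",\"type\":[\"null\",\"string\"]}]}");
	 */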
	/**
	 * Create a new Cascading 2.0 scheme suitable for reading and writing data
	 * using the Avro serialization format. Note that if the schema is null, the
	 * Avro schema will be inferred from one of the source files (if this scheme
	 * is being used as a source). At the moment we are unable to infer a schema
	 * for a sink (this will change with a newer version of Cascading).
	 *
	 * @param schema
	 *            Avro schema, or null if it is to be inferred from a source
	 *            file. Note that a runtime exception will be thrown if the
	 *            scheme is used as a sink and no schema is supplied.
	 */
	public CustomAvroScheme(Schema schema) {
		this.schema = schema;
		if (schema == null) {
			setSinkFields(Fields.ALL);
			setSourceFields(Fields.UNKNOWN);
		} else {
			Fields cascadingFields = new Fields();
			for (Field avroField : schema.getFields()) {
				cascadingFields = cascadingFields.append(new Fields(avroField
						.name()));
			}
			setSinkFields(cascadingFields);
			setSourceFields(cascadingFields);
		}
	}

	/**
	 * Helper method to read in a schema when de-serializing the object.
	 *
	 * @param in
	 *            The ObjectInputStream containing the serialized object
	 * @return Schema The parsed schema.
	 */
	protected static Schema readSchema(java.io.ObjectInputStream in)
			throws IOException {
		final Schema.Parser parser = new Schema.Parser();
		try {
			return parser.parse(in.readObject().toString());
		} catch (ClassNotFoundException cce) {
			throw new RuntimeException(
					"Unable to read schema which is expected to be written as a java string",
					cce);
		}
	}

	/**
	 * Return the schema which has been set, as a string.
	 *
	 * @return String representing the schema
	 */
	String getJsonSchema() {
		if (schema == null) {
			return "";
		} else {
			return schema.toString();
		}
	}

	/**
	 * Sink method to take an outgoing tuple and write it to Avro.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object. Should be passed in by
	 *            cascading automatically.
	 * @param sinkCall
	 *            The cascading SinkCall object. Should be passed in by
	 *            cascading automatically.
	 * @throws IOException
	 */
	@Override
	public void sink(FlowProcess<? extends JobConf> flowProcess,
			SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
		TupleEntry tupleEntry = sinkCall.getOutgoingEntry();
		IndexedRecord record = new Record((Schema) sinkCall.getContext()[0]);
		Object[] objectArray = CustomCascadingToAvro.parseTupleEntry(
				tupleEntry, (Schema) sinkCall.getContext()[0]);
		for (int i = 0; i < objectArray.length; i++) {
			record.put(i, objectArray[i]);
		}
		// noinspection unchecked
		sinkCall.getOutput().collect(new AvroWrapper<IndexedRecord>(record),
				NullWritable.get());
	}

	/**
	 * Sink prepare method called by cascading once on each reducer. This method
	 * stuffs the schema into a context for easy access by the sink method.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object. Should be passed in by
	 *            cascading automatically.
	 * @param sinkCall
	 *            The cascading SinkCall object. Should be passed in by
	 *            cascading automatically.
	 * @throws IOException
	 */
	@Override
	public void sinkPrepare(FlowProcess<? extends JobConf> flowProcess,
			SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
		sinkCall.setContext(new Object[] { schema });
	}
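	/*
	 * Illustrative sketch (not part of the scheme): a typical way to use this
	 * scheme on the sink side is to attach it to a Cascading Hfs tap with an
	 * explicit Avro schema, since sinkConfInit() rejects a null schema. The
	 * output path and schema variable below are assumptions for illustration;
	 * Hfs comes from cascading.tap.hadoop.Hfs.
	 *
	 *   Schema outputSchema = ...;                        // explicit Avro schema
	 *   Tap sink = new Hfs(new CustomAvroScheme(outputSchema),
	 *           "hdfs:///tmp/avro-output");               // hypothetical path
	 *
	 * On the source side the schema argument may be null; retrieveSourceFields()
	 * will then peek at the input files to infer it.
	 */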
	/**
	 * sinkConfInit is called by cascading to set up the sinks. This happens on
	 * the client side before the job is distributed. There is a check for the
	 * presence of a schema and an exception is thrown if none has been
	 * provided. After the schema check the conf object is given the options
	 * that Avro needs.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object. Should be passed in by
	 *            cascading automatically.
	 * @param tap
	 *            The cascading Tap object. Should be passed in by cascading
	 *            automatically.
	 * @param conf
	 *            The Hadoop JobConf object. This is passed in by cascading
	 *            automatically.
	 * @throws RuntimeException
	 *             If no schema is present this halts the entire process.
	 */
	@Override
	public void sinkConfInit(FlowProcess<? extends JobConf> flowProcess,
			Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {

		if (schema == null) {
			throw new RuntimeException("Must provide sink schema");
		}
		// Set the output schema and output format class
		conf.set(AvroJob.OUTPUT_SCHEMA, schema.toString());
		conf.setOutputFormat(AvroOutputFormat.class);

		// add AvroSerialization to io.serializations
		addAvroSerializations(conf);
	}

	/**
	 * This method is called by cascading to set up the incoming fields. If a
	 * schema isn't present then it will go and peek at the input data to
	 * retrieve one. The field names from the schema are used to name the
	 * cascading fields.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object. Should be passed in by
	 *            cascading automatically.
	 * @param tap
	 *            The cascading Tap object. Should be passed in by cascading
	 *            automatically.
	 * @return Fields The source cascading fields.
	 */
	@Override
	public Fields retrieveSourceFields(
			FlowProcess<? extends JobConf> flowProcess, Tap tap) {
		if (schema == null) {
			try {
				schema = getSourceSchema(flowProcess, tap);
			} catch (IOException e) {
				throw new RuntimeException("Can't get schema from data source");
			}
		}
		Fields cascadingFields = new Fields();
		if (schema.getType().equals(Schema.Type.NULL)) {
			cascadingFields = Fields.NONE;
		} else {
			for (Field avroField : schema.getFields())
				cascadingFields = cascadingFields.append(new Fields(avroField
						.name()));
		}
		setSourceFields(cascadingFields);
		return getSourceFields();
	}

	/**
	 * Source method to take an incoming Avro record and make it a Tuple.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object. Should be passed in by
	 *            cascading automatically.
	 * @param sourceCall
	 *            The cascading SourceCall object. Should be passed in by
	 *            cascading automatically.
	 * @return boolean true on successful parsing and collection, false on
	 *         failure.
	 * @throws IOException
	 */
	@Override
	public boolean source(FlowProcess<? extends JobConf> flowProcess,
			SourceCall<Object[], RecordReader> sourceCall) throws IOException {

		@SuppressWarnings("unchecked")
		RecordReader<AvroWrapper<IndexedRecord>, Writable> input = sourceCall
				.getInput();
		AvroWrapper<IndexedRecord> wrapper = input.createKey();
		if (!input.next(wrapper, input.createValue())) {
			return false;
		}
		IndexedRecord record = wrapper.datum();
		Tuple tuple = sourceCall.getIncomingEntry().getTuple();
		tuple.clear();

		Object[] split = CustomAvroToCascading.parseRecord(record, schema);
		tuple.addAll(split);

		return true;
	}
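	/*
	 * Illustrative sketch (not part of the scheme): source() above maps the
	 * fields of an incoming IndexedRecord positionally onto a Cascading Tuple
	 * via CustomAvroToCascading. Stripped of the type conversion that helper
	 * performs, the positional copy looks roughly like this; the record
	 * variable is an assumption for illustration.
	 *
	 *   Tuple tuple = new Tuple();
	 *   for (Field f : record.getSchema().getFields()) {
	 *       tuple.add(record.get(f.pos()));   // raw Avro value, before conversion
	 *   }
	 */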
	/**
	 * sourceConfInit is called by cascading to set up the sources. This happens
	 * on the client side before the job is distributed. There is a check for
	 * the presence of a schema and if none has been provided the data is peeked
	 * at to get a schema. After the schema check the conf object is given the
	 * options that Avro needs.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object. Should be passed in by
	 *            cascading automatically.
	 * @param tap
	 *            The cascading Tap object. Should be passed in by cascading
	 *            automatically.
	 * @param conf
	 *            The Hadoop JobConf object. This is passed in by cascading
	 *            automatically.
	 * @throws RuntimeException
	 *             If no schema is present this halts the entire process.
	 */
	@Override
	public void sourceConfInit(FlowProcess<? extends JobConf> flowProcess,
			Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {

		retrieveSourceFields(flowProcess, tap);
		// Set the input schema and input class
		conf.set(AvroJob.INPUT_SCHEMA, schema.toString());
		conf.setInputFormat(AvroInputFormat.class);

		// add AvroSerialization to io.serializations
		addAvroSerializations(conf);
	}

	/**
	 * This method peeks at the source data to get a schema when none has been
	 * provided.
	 *
	 * @param flowProcess
	 *            The cascading FlowProcess object for this flow.
	 * @param tap
	 *            The cascading Tap object.
	 * @return Schema The schema of the peeked-at data, or Schema.NULL if none
	 *         exists.
	 */
	private Schema getSourceSchema(FlowProcess<? extends JobConf> flowProcess,
			Tap tap) throws IOException {

		if (tap instanceof CompositeTap) {
			tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
		}
		final String path = tap.getIdentifier();
		Path p = new Path(path);
		final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy());
		// Get all the input dirs
		List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(fs
				.globStatus(p, filter)));
		// Now get all the things that are one level down
		for (FileStatus status : new LinkedList<FileStatus>(statuses)) {
			if (status.isDir())
				for (FileStatus child : Arrays.asList(fs.listStatus(
						status.getPath(), filter))) {
					if (child.isDir()) {
						statuses.addAll(Arrays.asList(fs.listStatus(
								child.getPath(), filter)));
					} else if (fs.isFile(child.getPath())) {
						statuses.add(child);
					}
				}
		}
		for (FileStatus status : statuses) {
			Path statusPath = status.getPath();
			if (fs.isFile(statusPath)) {
				// no need to open them all
				InputStream stream = null;
				DataFileStream reader = null;
				try {
					stream = new BufferedInputStream(fs.open(statusPath));
					reader = new DataFileStream(stream,
							new GenericDatumReader());
					return reader.getSchema();
				} finally {
					if (reader == null) {
						if (stream != null) {
							stream.close();
						}
					} else {
						reader.close();
					}
				}
			}
		}
		// couldn't find any Avro files, return null schema
		return Schema.create(Schema.Type.NULL);
	}

	private void addAvroSerializations(JobConf conf) {
		Collection<String> serializations = conf
				.getStringCollection("io.serializations");
		if (!serializations.contains(AvroSerialization.class.getName())) {
			serializations.add(AvroSerialization.class.getName());
			serializations
					.add(AvroSpecificRecordSerialization.class.getName());
		}

		conf.setStrings("io.serializations",
				serializations.toArray(new String[serializations.size()]));
	}

	private void writeObject(java.io.ObjectOutputStream out) throws IOException {
		out.writeObject(this.schema.toString());
	}

	private void readObject(java.io.ObjectInputStream in) throws IOException {
		this.schema = readSchema(in);
	}
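	/*
	 * Illustrative sketch (not part of the scheme): getSourceSchema() above
	 * reads the schema out of the first Avro container file it finds. Outside
	 * Hadoop the same peek can be done with the plain Avro file API; the local
	 * path below is an assumption for illustration.
	 *
	 *   try (InputStream in = new BufferedInputStream(
	 *           new FileInputStream("/tmp/sample.avro"));      // hypothetical file
	 *        DataFileStream<Object> reader = new DataFileStream<Object>(
	 *           in, new GenericDatumReader<Object>())) {
	 *       Schema peeked = reader.getSchema();                // schema from file header
	 *   }
	 */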
	@Override
	public boolean equals(Object o) {
		if (this == o)
			return true;
		if (o == null || getClass() != o.getClass())
			return false;
		if (!super.equals(o))
			return false;

		CustomAvroScheme that = (CustomAvroScheme) o;

		return !(schema != null ? !schema.equals(that.schema)
				: that.schema != null);
	}

	@Override
	public String toString() {
		return "AvroScheme{" + "schema=" + schema + '}';
	}

	@Override
	public int hashCode() {
		return 31 * getSinkFields().hashCode()
				+ (schema == null ? 0 : schema.hashCode());
	}
}
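/*
 * Illustrative end-to-end sketch (not part of this class): copying Avro data
 * from one location to another with this scheme. Paths, the schema variable
 * and the pipe name are assumptions; Hfs, Pipe and HadoopFlowConnector are
 * standard Cascading 2.x classes.
 *
 *   Tap source = new Hfs(new CustomAvroScheme((Schema) null), "input/avro");
 *   Tap sink = new Hfs(new CustomAvroScheme(outputSchema), "output/avro");
 *   Pipe copy = new Pipe("copy-avro");
 *   new HadoopFlowConnector(new Properties())
 *           .connect(source, sink, copy)
 *           .complete();
 */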