/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.util; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import org.apache.avro.AvroRuntimeException; import org.apache.avro.Schema; import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Logger; import org.codehaus.jackson.JsonNode; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; /*** * This class provides methods to flatten an Avro Schema to make it more optimal for ORC * (Hive does not support predicate pushdown for ORC with nested fields: ETL-7214) * * The behavior of Avro Schema un-nesting is listed below: * * 1. Record within Record (and so on recursively) are flattened into the parent Record * Record R1 { * fields: {[ * { * Record R2 { * fields: {[ * { * Record R3 { * fields: {[ * { * String S2 * } * ]} * }, { * String S3 * } * } * * ]} * } * }, { * String S1 * } * ]} * } * will be flattened to: * Record R1 { * fields: {[ * { * String S1 * }, { * String S2 * }, { * String S3 * } * ]} * } * * 2. All fields un-nested from a Record within an Option (ie. Union of the type [null, Record] or [Record, null]) * within a Record are moved to parent Record as a list of Option fields * Record R1 { * fields : {[ * { * Union : [ * null, * Record R2 { * fields : {[ * { * String S1 * }, { * String S2 * } * ]} * } * } * ]} * } * will be flattened to: * Record R1 { * fields : {[ * { * Union : [ null, String S1] * }, { * Union : [ null, String S2] * } * ]} * } * * 3. Array or Map will not be un-nested, however Records within it will be un-nested as described above * * 4. All un-nested fields are decorated with a new property "flatten_source" which is a dot separated string * concatenation of parent fields name, similarly un-nested fields are renamed to double-underscore string * concatenation of parent fields name * * 5. Primitive Types are not un-nested */ public class AvroFlattener { private static final Logger LOG = Logger.getLogger(AvroFlattener.class); private static final String FLATTENED_NAME_JOINER = "__"; private static final String FLATTENED_SOURCE_JOINER = "."; private static final String FLATTENED_SOURCE_KEY = "flatten_source"; private String flattenedNameJoiner; private String flattenedSourceJoiner; /*** * Flatten the Schema to un-nest recursive Records (to make it optimal for ORC) * @param schema Avro Schema to flatten * @param flattenComplexTypes Flatten complex types recursively other than Record and Option * @return Flattened Avro Schema */ public Schema flatten(Schema schema, boolean flattenComplexTypes) { Preconditions.checkNotNull(schema); // To help make it configurable later this.flattenedNameJoiner = FLATTENED_NAME_JOINER; this.flattenedSourceJoiner = FLATTENED_SOURCE_JOINER; Schema flattenedSchema = flatten(schema, false, flattenComplexTypes); LOG.debug("Original Schema : " + schema); LOG.debug("Flattened Schema: " + flattenedSchema); return flattenedSchema; } /*** * Flatten the Schema to un-nest recursive Records (to make it optimal for ORC) * @param schema Schema to flatten * @param shouldPopulateLineage is set to true if the field is going to be flattened and moved up the hierarchy - * so that lineage information can be tagged to it; which happens when there is a * Record within a Record OR Record within Option within Record and so on, * however not when there is a Record within Map or Array * @param flattenComplexTypes Flatten complex types recursively other than Record and Option * @return Flattened Avro Schema */ private Schema flatten(Schema schema, boolean shouldPopulateLineage, boolean flattenComplexTypes) { Schema flattenedSchema; // Process all Schema Types // (Primitives are simply cloned) switch (schema.getType()) { case ARRAY: // Array might be an array of recursive Records, flatten them if (flattenComplexTypes) { flattenedSchema = Schema.createArray(flatten(schema.getElementType(), false)); } else { flattenedSchema = Schema.createArray(schema.getElementType()); } break; case BOOLEAN: flattenedSchema = Schema.create(schema.getType()); break; case BYTES: flattenedSchema = Schema.create(schema.getType()); break; case DOUBLE: flattenedSchema = Schema.create(schema.getType()); break; case ENUM: flattenedSchema = Schema.createEnum(schema.getName(), schema.getDoc(), schema.getNamespace(), schema.getEnumSymbols()); break; case FIXED: flattenedSchema = Schema.createFixed(schema.getName(), schema.getDoc(), schema.getNamespace(), schema.getFixedSize()); break; case FLOAT: flattenedSchema = Schema.create(schema.getType()); break; case INT: flattenedSchema = Schema.create(schema.getType()); break; case LONG: flattenedSchema = Schema.create(schema.getType()); break; case MAP: if (flattenComplexTypes) { flattenedSchema = Schema.createMap(flatten(schema.getValueType(), false)); } else { flattenedSchema = Schema.createMap(schema.getValueType()); } break; case NULL: flattenedSchema = Schema.create(schema.getType()); break; case RECORD: flattenedSchema = flattenRecord(schema, shouldPopulateLineage, flattenComplexTypes); break; case STRING: flattenedSchema = Schema.create(schema.getType()); break; case UNION: flattenedSchema = flattenUnion(schema, shouldPopulateLineage, flattenComplexTypes); break; default: String exceptionMessage = String.format("Schema flattening failed for \"%s\" ", schema); LOG.error(exceptionMessage); throw new AvroRuntimeException(exceptionMessage); } // Copy schema metadata copyProperties(schema, flattenedSchema); return flattenedSchema; } /*** * Flatten Record schema * @param schema Record Schema to flatten * @param shouldPopulateLineage If lineage information should be tagged in the field, this is true when we are * un-nesting fields * @param flattenComplexTypes Flatten complex types recursively other than Record and Option * @return Flattened Record Schema */ private Schema flattenRecord(Schema schema, boolean shouldPopulateLineage, boolean flattenComplexTypes) { Preconditions.checkNotNull(schema); Preconditions.checkArgument(Schema.Type.RECORD.equals(schema.getType())); Schema flattenedSchema; List<Schema.Field> flattenedFields = new ArrayList<>(); if (schema.getFields().size() > 0) { for (Schema.Field oldField : schema.getFields()) { List<Schema.Field> newFields = flattenField(oldField, ImmutableList.<String>of(), shouldPopulateLineage, flattenComplexTypes, Optional.<Schema>absent()); if (null != newFields && newFields.size() > 0) { flattenedFields.addAll(newFields); } } } flattenedSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError()); flattenedSchema.setFields(flattenedFields); return flattenedSchema; } /*** * Flatten Union Schema * @param schema Union Schema to flatten * @param shouldPopulateLineage If lineage information should be tagged in the field, this is true when we are * un-nesting fields * @param flattenComplexTypes Flatten complex types recursively other than Record and Option * @return Flattened Union Schema */ private Schema flattenUnion(Schema schema, boolean shouldPopulateLineage, boolean flattenComplexTypes) { Preconditions.checkNotNull(schema); Preconditions.checkArgument(Schema.Type.UNION.equals(schema.getType())); Schema flattenedSchema; List<Schema> flattenedUnionMembers = new ArrayList<>(); if (null != schema.getTypes() && schema.getTypes().size() > 0) { for (Schema oldUnionMember : schema.getTypes()) { if (flattenComplexTypes) { // It's member might still recursively contain records flattenedUnionMembers.add(flatten(oldUnionMember, shouldPopulateLineage, flattenComplexTypes)); } else { flattenedUnionMembers.add(oldUnionMember); } } } flattenedSchema = Schema.createUnion(flattenedUnionMembers); return flattenedSchema; } /*** * Flatten Record field, and compute a list of flattened fields * * Note: Lineage represents the source path from root for the flattened field. For. eg. If the original schema is: * { * "type" : "record", * "name" : "parentRecordName", * "fields" : [ { * "name" : "parentFieldRecord", * "type" : { * "type" : "record", * "name" : "nestedRecordName", * "fields" : [ { * "name" : "nestedFieldString", * "type" : "string" * }, { * "name" : "nestedFieldInt", * "type" : "int" * } ] * } * }] * } * The expected output schema is: * { * "type" : "record", * "name" : "parentRecordName", * "fields" : [ { * "name" : "parentFieldRecord__nestedFieldString", * "type" : "string", * "flatten_source" : "parentFieldRecord.nestedFieldString" * }, { * "name" : "parentFieldRecord__nestedFieldInt", * "type" : "int", * "flatten_source" : "parentFieldRecord.nestedFieldInt" * }, { * "name" : "parentFieldInt", * "type" : "int" * } ] * } * Here, 'flatten_source' and field 'name' has also been modified to represent their origination from nested schema * lineage helps to determine that * * @param f Field to flatten * @param parentLineage Parent's lineage represented as a List of Strings * @param shouldPopulateLineage If lineage information should be tagged in the field, this is true when we are * un-nesting fields * @param flattenComplexTypes Flatten complex types recursively other than Record and Option * @param shouldWrapInOption If the field should be wrapped as an OPTION, if we un-nest fields within an OPTION * we make all the unnested fields as OPTIONs * @return List of flattened Record fields */ private List<Schema.Field> flattenField(Schema.Field f, ImmutableList<String> parentLineage, boolean shouldPopulateLineage, boolean flattenComplexTypes, Optional<Schema> shouldWrapInOption) { Preconditions.checkNotNull(f); Preconditions.checkNotNull(f.schema()); Preconditions.checkNotNull(f.name()); List<Schema.Field> flattenedFields = new ArrayList<>(); ImmutableList<String> lineage = ImmutableList.<String>builder() .addAll(parentLineage.iterator()).add(f.name()).build(); // If field.Type = RECORD, un-nest its fields and return them if (Schema.Type.RECORD.equals(f.schema().getType())) { if (null != f.schema().getFields() && f.schema().getFields().size() > 0) { for (Schema.Field field : f.schema().getFields()) { flattenedFields.addAll(flattenField(field, lineage, true, flattenComplexTypes, Optional.<Schema>absent())); } } } // If field.Type = OPTION, un-nest its fields and return them else { Optional<Schema> optionalRecord = isOfOptionType(f.schema()); if (optionalRecord.isPresent()) { Schema record = optionalRecord.get(); if (record.getFields().size() > 0) { for (Schema.Field field : record.getFields()) { flattenedFields.addAll(flattenField(field, lineage, true, flattenComplexTypes, Optional.of(f.schema()))); } } } // If field.Type = any-other, copy and return it else { // Compute name and source using lineage String flattenName = f.name(); String flattenSource = StringUtils.EMPTY; if (shouldPopulateLineage) { flattenName = StringUtils.join(lineage, flattenedNameJoiner); flattenSource = StringUtils.join(lineage, flattenedSourceJoiner); } // Copy field Schema flattenedFieldSchema = flatten(f.schema(), shouldPopulateLineage, flattenComplexTypes); if (shouldWrapInOption.isPresent()) { boolean isNullFirstMember = Schema.Type.NULL.equals(shouldWrapInOption.get().getTypes().get(0).getType()); // If already Union, just copy it instead of wrapping (Union within Union is not supported) if (Schema.Type.UNION.equals(flattenedFieldSchema.getType())) { List<Schema> newUnionMembers = new ArrayList<>(); if (isNullFirstMember) { newUnionMembers.add(Schema.create(Schema.Type.NULL)); } for (Schema type : flattenedFieldSchema.getTypes()) { if (Schema.Type.NULL.equals(type.getType())) { continue; } newUnionMembers.add(type); } if (!isNullFirstMember) { newUnionMembers.add(Schema.create(Schema.Type.NULL)); } flattenedFieldSchema = Schema.createUnion(newUnionMembers); } // Wrap the Union, since parent Union is an option else { if (isNullFirstMember) { flattenedFieldSchema = Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.NULL), flattenedFieldSchema)); } else { flattenedFieldSchema = Schema.createUnion(Arrays.asList(flattenedFieldSchema, Schema.create(Schema.Type.NULL))); } } } Schema.Field field = new Schema.Field(flattenName, flattenedFieldSchema, f.doc(), f.defaultValue(), f.order()); if (StringUtils.isNotBlank(flattenSource)) { field.addProp(FLATTENED_SOURCE_KEY, flattenSource); } for (Map.Entry<String, JsonNode> entry : f.getJsonProps().entrySet()) { field.addProp(entry.getKey(), entry.getValue()); } flattenedFields.add(field); } } return flattenedFields; } /*** * Check if the Avro Schema is of type OPTION * ie. [null, RECORD] or [RECORD, null] * @param schema Avro Schema to check * @return Optional Avro Record if schema is of type OPTION */ private static Optional<Schema> isOfOptionType(Schema schema) { Preconditions.checkNotNull(schema); // If not of type UNION, cant be an OPTION if (!Schema.Type.UNION.equals(schema.getType())) { return Optional.<Schema>absent(); } // If has more than two members, can't be an OPTION List<Schema> types = schema.getTypes(); if (null != types && types.size() == 2) { Schema first = types.get(0); Schema second = types.get(1); // One member should be of type NULL and other of type RECORD if (Schema.Type.NULL.equals(first.getType()) && Schema.Type.RECORD.equals(second.getType())) { return Optional.of(second); } else if (Schema.Type.RECORD.equals(first.getType()) && Schema.Type.NULL.equals(second.getType())) { return Optional.of(first); } } return Optional.<Schema>absent(); } /*** * Copy properties from old Avro Schema to new Avro Schema * @param oldSchema Old Avro Schema to copy properties from * @param newSchema New Avro Schema to copy properties to */ private static void copyProperties(Schema oldSchema, Schema newSchema) { Preconditions.checkNotNull(oldSchema); Preconditions.checkNotNull(newSchema); Map<String, JsonNode> props = oldSchema.getJsonProps(); copyProperties(props, newSchema); } /*** * Copy properties to an Avro Schema * @param props Properties to copy to Avro Schema * @param schema Avro Schema to copy properties to */ private static void copyProperties(Map<String, JsonNode> props, Schema schema) { Preconditions.checkNotNull(schema); // (if null, don't copy but do not throw exception) if (null != props) { for (Map.Entry<String, JsonNode> prop : props.entrySet()) { schema.addProp(prop.getKey(), prop.getValue()); } } } }