/*
* Copyright (c) 2013, Cloudera, Inc. All Rights Reserved.
*
* Cloudera, Inc. licenses this file to you under the Apache License,
* Version 2.0 (the "License"). You may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for
* the specific language governing permissions and limitations under the
* License.
*/
package com.cloudera.recordbreaker.analyzer;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericRecord;
import java.util.List;
import java.util.Arrays;
import java.util.Random;
import java.util.TreeMap;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.Collections;
/*******************************************************
* The extracted schemas obtained by LearnStructure are
* not always very user-friendly. For example, they often
* contain a lot of hard-to-understand unions; they also
* support all the lines in an input file, even rare ones.
*
* <code>SchemaUtils</code> exists to massage and edit these
* Schemas after they've been emitted by LearnStructure. The
* functions here are often useful when presenting the schema
* to the user, or when building import code for other tools (e.g., Hive).
*
* @author "Michael Cafarella" <mjc@lofie.local>
* @version 1.0
* @since 1.0
********************************************************/
public class SchemaUtils {
  static Random r = new Random();

  /**
   * Associates a schema's index with the number of times that schema was observed.
   * The natural ordering is ascending by <code>count</code>; ties are broken by
   * ascending <code>schemaId</code>.
   */
  static class SchemaPair implements Comparable<SchemaPair> {
    int schemaId;
    int count;

    public SchemaPair(int schemaId, int count) {
      this.schemaId = schemaId;
      this.count = count;
    }

    /**
     * Orders by count first, then by schemaId.
     *
     * @param other the pair to compare against
     * @return a negative, zero, or positive value as this pair sorts before,
     *         equal to, or after <code>other</code>
     */
    public int compareTo(SchemaPair other) {
      // Explicit comparisons instead of subtraction, which can overflow an int.
      if (count != other.count) {
        return count < other.count ? -1 : 1;
      }
      if (schemaId != other.schemaId) {
        return schemaId < other.schemaId ? -1 : 1;
      }
      return 0;
    }
  }

  /**
   * Takes a schema that potentially contains unions and converts it into
   * a list of union-free schemas observed with the given data object.
   *
   * @param schema       the schema to unroll
   * @param grObj        the datum the schema is matched against
   * @param topLevelOnly if true, only a union at the top level is expanded;
   *                     unions found deeper in the schema are kept as they are
   * @return the unrolled schemas, or <code>null</code> if the datum does not fit the schema
   */
  public static List<Schema> unrollUnionsWithData(Schema schema, Object grObj, boolean topLevelOnly) {
    return unrollUnionsWithData(schema, grObj, true, topLevelOnly);
  }

  /**
   * Recursive worker behind the public <code>unrollUnionsWithData</code>.
   * Dispatches on the schema type: records, unions and arrays are handled by
   * dedicated helpers, everything else is treated as a base type.
   *
   * @param schema       the schema to unroll
   * @param grObj        the datum the schema is matched against
   * @param isTopLevel   true only for the outermost call
   * @param topLevelOnly if true, unions below the top level are not expanded
   * @return the unrolled schemas, or <code>null</code> if the datum does not fit the schema
   */
  static List<Schema> unrollUnionsWithData(Schema schema, Object grObj, boolean isTopLevel, boolean topLevelOnly) {
    Schema.Type type = schema.getType();
    if (type == Schema.Type.RECORD && grObj instanceof GenericRecord) {
      return unrollRecord(schema, (GenericRecord) grObj, topLevelOnly);
    } else if (type == Schema.Type.UNION) {
      return unrollUnion(schema, grObj, isTopLevel, topLevelOnly);
    } else if (type == Schema.Type.ARRAY) {
      return unrollArray(schema, grObj, topLevelOnly);
    }

    // Base type. Note that a RECORD schema whose datum is not a GenericRecord
    // also ends up here. Structured data cannot match a base-type schema.
    if (grObj instanceof GenericData.Record || grObj instanceof GenericData.Array) {
      return null;
    }
    List<Schema> retList = new ArrayList<Schema>();
    retList.add(schema);
    return retList;
  }

  /**
   * Unrolls every field of a record and emits one record schema per combination
   * of the fields' alternatives. Returns <code>null</code> if any field has no
   * value in the datum or cannot be unrolled.
   */
  private static List<Schema> unrollRecord(Schema schema, GenericRecord gr, boolean topLevelOnly) {
    List<Schema.Field> fields = schema.getFields();
    List<List<Schema>> fieldSchemaLists = new ArrayList<List<Schema>>(fields.size());
    int targetTotal = 1;
    for (Schema.Field sf : fields) {
      Object fieldValue = gr.get(sf.name());
      if (fieldValue == null) {
        return null;
      }
      List<Schema> fieldSchemaList = unrollUnionsWithData(sf.schema(), fieldValue, false, topLevelOnly);
      if (fieldSchemaList == null) {
        return null;
      }
      fieldSchemaLists.add(fieldSchemaList);
      targetTotal *= fieldSchemaList.size();
    }

    List<Schema> outputSchemas = new ArrayList<Schema>();
    for (int combo = 0; combo < targetTotal; combo++) {
      // Decode 'combo' as a mixed-radix number with one digit per field, so that
      // every combination of alternatives is produced exactly once. Indexing each
      // field with (combo % size) would repeat some combinations and skip others
      // whenever two fields' list sizes share a common factor.
      List<Schema.Field> newFields = new ArrayList<Schema.Field>(fields.size());
      int remaining = combo;
      int fieldIdx = 0;
      for (Schema.Field oldField : fields) {
        List<Schema> choices = fieldSchemaLists.get(fieldIdx);
        Schema choice = choices.get(remaining % choices.size());
        remaining /= choices.size();
        newFields.add(new Schema.Field(oldField.name(), choice, oldField.doc(), null));
        fieldIdx++;
      }
      Schema s = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false);
      s.setFields(newFields);
      outputSchemas.add(s);
    }
    return outputSchemas;
  }

  /**
   * Expands a union into the unrolled forms of those member types that accept the
   * datum. When expansion is restricted to the top level and this union is not
   * at the top level, the union itself is returned untouched.
   */
  private static List<Schema> unrollUnion(Schema schema, Object grObj, boolean isTopLevel, boolean topLevelOnly) {
    List<Schema> unrolledSchemas = new ArrayList<Schema>();
    if ((! topLevelOnly) || isTopLevel) {
      for (Schema member : schema.getTypes()) {
        List<Schema> subschemas = unrollUnionsWithData(member, grObj, false, topLevelOnly);
        // A null result means this member type did not match the datum.
        if (subschemas != null) {
          unrolledSchemas.addAll(subschemas);
        }
      }
    } else {
      unrolledSchemas.add(schema);
    }
    return unrolledSchemas;
  }

  /**
   * Unrolls the element schema against every element of the array datum,
   * deduplicates the results, and wraps each distinct one in an array schema.
   * Returns <code>null</code> if the datum is not a GenericArray.
   */
  private static List<Schema> unrollArray(Schema schema, Object grObj, boolean topLevelOnly) {
    // A union tries each of its member types against the same datum, so an ARRAY
    // schema may be offered non-array data. Report "no match" instead of failing
    // with a ClassCastException.
    if (!(grObj instanceof GenericArray)) {
      return null;
    }
    GenericArray<?> gra = (GenericArray<?>) grObj;

    // Keyed by the schema's string form so that equal schemas collapse.
    TreeMap<String, Schema> seenSchemas = new TreeMap<String, Schema>();
    for (int i = 0; i < gra.size(); i++) {
      List<Schema> result = unrollUnionsWithData(schema.getElementType(), gra.get(i), false, topLevelOnly);
      if (result != null) {
        for (Schema subS : result) {
          String key = subS.toString();
          if (seenSchemas.get(key) == null) {
            seenSchemas.put(key, subS);
          }
        }
      }
    }

    // Xform the tree into a list, and return.
    List<Schema> newSchemas = new ArrayList<Schema>();
    for (Schema s : seenSchemas.values()) {
      newSchemas.add(Schema.createArray(s));
    }
    return newSchemas;
  }

  /**
   * <code>getUnionFreeSchemasByFrequency</code> will transform the schema of
   * a given SchemaDescriptor into a set of union-free schemas. It will then
   * rank them by popularity in a data sample of at most 'maxRows' tuples from
   * the file.
   *
   * The resulting schema list is returned in descending order of frequency, and
   * only includes schemas that appeared at least once in the sample.
   *
   * @param sd           supplies the schema and an iterator over the data
   * @param maxRows      maximum number of tuples to sample; no tuples are read
   *                     when this is zero or negative
   * @param topLevelOnly passed through to <code>unrollUnionsWithData</code>
   * @return the observed union-free schemas, most frequent first
   */
  public static List<Schema> getUnionFreeSchemasByFrequency(SchemaDescriptor sd, int maxRows, boolean topLevelOnly) {
    Schema schema = sd.getSchema();

    // 1. Enumerate all the non-union schemas that we observe in the sample.
    //    Both maps are keyed by the schema's string form.
    TreeMap<String, Integer> schemaCounts = new TreeMap<String, Integer>();
    TreeMap<String, Schema> uniqueUnrolledSchemas = new TreeMap<String, Schema>();
    int numRows = 0;
    // The row limit is tested before a tuple is fetched, so that exactly
    // 'maxRows' tuples (not maxRows + 1) are sampled.
    for (Iterator<?> it = sd.getIterator(); numRows < maxRows && it.hasNext(); numRows++) {
      GenericData.Record gr = (GenericData.Record) it.next();
      List<Schema> grSchemas = SchemaUtils.unrollUnionsWithData(schema, gr, topLevelOnly);
      if (grSchemas == null) {
        continue;
      }
      for (Schema grs : grSchemas) {
        String key = grs.toString();
        if (uniqueUnrolledSchemas.get(key) == null) {
          uniqueUnrolledSchemas.put(key, grs);
        }
        Integer oldCount = schemaCounts.get(key);
        int updated = (oldCount == null ? 0 : oldCount.intValue()) + 1;
        schemaCounts.put(key, Integer.valueOf(updated));
      }
    }

    // 2. Pair every distinct schema (by its position in key order) with its count.
    List<Schema> allSchemas = new ArrayList<Schema>(uniqueUnrolledSchemas.size());
    List<SchemaPair> schemaFrequency = new ArrayList<SchemaPair>(uniqueUnrolledSchemas.size());
    for (String key : uniqueUnrolledSchemas.keySet()) {
      schemaFrequency.add(new SchemaPair(allSchemas.size(), schemaCounts.get(key).intValue()));
      allSchemas.add(uniqueUnrolledSchemas.get(key));
    }

    // 3. Rank: highest count first (ties: highest index first).
    Collections.sort(schemaFrequency, Collections.<SchemaPair>reverseOrder());
    List<Schema> schemasRankedByFreq = new ArrayList<Schema>(schemaFrequency.size());
    for (SchemaPair sp : schemaFrequency) {
      if (sp.count > 0) {
        schemasRankedByFreq.add(allSchemas.get(sp.schemaId));
      }
    }
    return schemasRankedByFreq;
  }

  /**
   * Takes an Avro record Schema and creates dot-separated names for each
   * leaf-level field. The input Schema should *not* have any unions.
   *
   * Arrays are transparent: the names of an array are the names of its
   * element type.
   *
   * @param schema the union-free schema to flatten
   * @return the flattened names for a record (or an array of records), or
   *         <code>null</code> if the schema has no named sub-fields
   * @throws UnsupportedOperationException if a union is encountered
   */
  public static List<String> flattenNames(Schema schema) {
    Schema.Type type = schema.getType();
    if (type == Schema.Type.RECORD) {
      List<String> schemaLabels = new ArrayList<String>();
      for (Schema.Field field : schema.getFields()) {
        List<String> subnames = SchemaUtils.flattenNames(field.schema());
        if (subnames == null) {
          // Leaf field: its own name is the label.
          schemaLabels.add(field.name());
        } else {
          for (String s : subnames) {
            schemaLabels.add(field.name() + "." + s);
          }
        }
      }
      return schemaLabels;
    } else if (type == Schema.Type.UNION) {
      throw new UnsupportedOperationException("Cannot process UNION");
    } else if (type == Schema.Type.ARRAY) {
      return flattenNames(schema.getElementType());
    } else {
      return null;
    }
  }

  /**
   * Grab a value from a record that is potentially deeply-nested, using
   * a dot-notation field label.
   *
   * @param gr        the record to read from
   * @param fieldname a field name, or a dot-separated path of field names
   * @return the value found at the path, or the empty string if the value is
   *         missing or an intermediate path component is not a GenericRecord
   */
  public static Object getNestedValues(GenericRecord gr, String fieldname) {
    int dotIndex = fieldname.indexOf('.');
    if (dotIndex < 0) {
      Object result = gr.get(fieldname);
      return (result == null ? "" : result);
    }

    String firstComponent = fieldname.substring(0, dotIndex);
    String remainder = fieldname.substring(dotIndex + 1);
    Object child = gr.get(firstComponent);
    // instanceof is false for null, so this also covers a missing value.
    if (!(child instanceof GenericRecord)) {
      return "";
    }
    return getNestedValues((GenericRecord) child, remainder);
  }
}