// SchemaUtils.java example
//
// Explorer
// RecordBreaker-master
// - src
/*
 * Copyright (c) 2013, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.analyzer;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericRecord;

import java.util.List;
import java.util.Arrays;
import java.util.Random;
import java.util.TreeMap;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.Collections;

/*******************************************************
 * The extracted schemas obtained by LearnStructure are
 * not always very user-friendly.  For example, they often
 * contain a lot of hard-to-understand unions; they also
 * support all the lines in an input file, even rare ones.
 * 
 * <code>SchemaUtils</code> exists to massage and edit these
 * Schemas after they've been emitted by LearnStructure.  The
 * functions here are often useful when presenting the schema
 * to the user, or building import code for other tools (e.g., Hive).
 * 
 * @author "Michael Cafarella" <mjc@lofie.local>
 * @version 1.0
 * @since 1.0
 ********************************************************/
public class SchemaUtils {
  static Random r = new Random();
  static class SchemaPair implements Comparable {
    int schemaId;
    int count;
    public SchemaPair(int schemaId, int count) {
      this.schemaId = schemaId;
      this.count = count;
    }
    public int compareTo(Object o) {
      SchemaPair sp = (SchemaPair) o;
      int result = count - sp.count;
      if (result == 0) {
        result = schemaId - sp.schemaId;
      }
      return result;
    }
  }

  /**
   * Takes a schema that potentially contains unions and converts it into
   * a list of union-free schemas observed with the given data object.
   */
  public static List<Schema> unrollUnionsWithData(Schema schema, Object grObj, boolean topLevelOnly) {
    return unrollUnionsWithData(schema, grObj, true, topLevelOnly);
  }

  static List<Schema> unrollUnionsWithData(Schema schema, Object grObj, boolean isTopLevel, boolean topLevelOnly) {
    if (schema.getType() == Schema.Type.RECORD && grObj instanceof GenericRecord) {
      GenericRecord gr = (GenericRecord) grObj;
      List<List<Schema>> fieldSchemaLists = new ArrayList<List<Schema>>();
      int targetTotal = 1;
      for (Schema.Field sf: schema.getFields()) {
        if (gr.get(sf.name()) == null) {
          return null;
        }
        List<Schema> fieldSchemaList = unrollUnionsWithData(sf.schema(), gr.get(sf.name()), false, topLevelOnly);
        if (fieldSchemaList == null) {
          return null;
        }
        fieldSchemaLists.add(fieldSchemaList);
        targetTotal *= fieldSchemaList.size();
      }

      List<Schema> outputSchemas = new ArrayList<Schema>();
      for (int i = 0; i < targetTotal; i++) {
        List<Schema.Field> newFields = new ArrayList<Schema.Field>();

        int j = 0;
        for (Schema.Field oldField: schema.getFields()) {
          List<Schema> curFieldSchemaList = fieldSchemaLists.get(j);
          newFields.add(new Schema.Field(oldField.name(), curFieldSchemaList.get(i % curFieldSchemaList.size()), oldField.doc(), null));
          j++;
        }
        Schema s = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), false);
        s.setFields(newFields);
        outputSchemas.add(s);
      }
      return outputSchemas;
    } else if (schema.getType() == Schema.Type.UNION) {
      List<Schema> unrolledSchemas = new ArrayList<Schema>();

      if ((! topLevelOnly) || isTopLevel) {
        for (Schema s: schema.getTypes()) {
          List<Schema> subschemas = SchemaUtils.unrollUnionsWithData(s, grObj, false, topLevelOnly);
          if (subschemas != null) {
            unrolledSchemas.addAll(subschemas);
          }
        }
      } else {
        unrolledSchemas.add(schema);
      }
      return unrolledSchemas;
    } else if (schema.getType() == Schema.Type.ARRAY) {
      // Iterate through all elements of array; call unrollUnionsWithData() on each one.
      // Then deduplicate the resulting schemas
      TreeMap<String, Schema> seenSchemas = new TreeMap<String, Schema>();
      GenericArray gra = (GenericArray) grObj;
      for (int i = 0; i < gra.size(); i++) {
        List<Schema> result = unrollUnionsWithData(schema.getElementType(), gra.get(i), false, topLevelOnly);
        if (result != null) {
          for (Schema subS: result) {
            if (seenSchemas.get(subS.toString()) == null) {
              seenSchemas.put(subS.toString(), subS);
            }
          }
        }
      }
      
      // Xform the tree into a list, and return.
      List<Schema> newSchemas = new ArrayList<Schema>();
      for (Schema s: seenSchemas.values()) {
        newSchemas.add(Schema.createArray(s));
      }
      return newSchemas;
    } else {
      // Base type
      if (grObj instanceof GenericData.Record
          || grObj instanceof GenericData.Array) {
        return null;
      }
      List<Schema> retList = new ArrayList<Schema>();
      retList.add(schema);
      return retList;
    }
  }

  /**
   * <code>getUnionFreeSchemasByFrequency</code> will transform the schema of
   * a given SchemaDescriptor into a set of union-free schemas.  It will then
   * rank them by popularity in a data sample of 'maxRows' tuples from the file.
   * 
   * The resulting schema list is returned in descending order of frequency, and
   * only includes schemas that appeared at least once in the sample.
   */
  public static List<Schema> getUnionFreeSchemasByFrequency(SchemaDescriptor sd, int maxRows, boolean topLevelOnly) {
    Schema schema = sd.getSchema();

    // 1. Enumerate all the non-union schemas that we observe in the sample
    TreeMap<String, Integer> schemaCounts = new TreeMap<String, Integer>();
    int numRows = 0;
    TreeMap<String, Schema> uniqueUnrolledSchemas = new TreeMap<String, Schema>();    
    for (Iterator it = sd.getIterator(); it.hasNext(); ) {
      GenericData.Record gr = (GenericData.Record) it.next();
      List<Schema> grSchemas = SchemaUtils.unrollUnionsWithData(schema, gr, topLevelOnly);
      if (grSchemas != null) {
        for (Schema grs: grSchemas) {
          if (uniqueUnrolledSchemas.get(grs.toString()) == null) {
            uniqueUnrolledSchemas.put(grs.toString(), grs);
          }
          Integer oldCount = schemaCounts.get(grs.toString());
          if (oldCount == null) {
            oldCount = new Integer(0);
          }
          schemaCounts.put(grs.toString(), new Integer(oldCount.intValue() + 1));
        }
      }
      if (numRows >= maxRows) {
        break;
      }
      numRows++;
    }

    List<Schema> allSchemas = new ArrayList(uniqueUnrolledSchemas.values());
    List<SchemaPair> schemaFrequency = new ArrayList<SchemaPair>();
    for (int i = 0; i < allSchemas.size(); i++) {
      Schema s1 = allSchemas.get(i);
      Integer sCount = schemaCounts.get(s1.toString());
      schemaFrequency.add(new SchemaPair(i, sCount.intValue()));
    }

    SchemaPair sortedByFreq[] = schemaFrequency.toArray(new SchemaPair[schemaFrequency.size()]);
    Arrays.sort(sortedByFreq, Collections.reverseOrder());
    List<Schema> schemasRankedByFreq = new ArrayList<Schema>();
    for (int i = 0; i < sortedByFreq.length; i++) {
      if (sortedByFreq[i].count > 0) {
        schemasRankedByFreq.add(allSchemas.get(sortedByFreq[i].schemaId));
      }
    }
    return schemasRankedByFreq;
  }

  /**
   * Takes an Avro record Schema and creates dot-separated names for each
   * leaf-level field.  The input Schema should *not* have any unions.
   */
  public static List<String> flattenNames(Schema schema) {
    if (schema.getType() == Schema.Type.RECORD) {
      List<String> schemaLabels = new ArrayList<String>();
      for (Schema.Field field: schema.getFields()) {
        Schema fieldSchema = field.schema();
        Schema.Type fieldSchemaType = fieldSchema.getType();
        List<String> subnames = SchemaUtils.flattenNames(fieldSchema);
        if (subnames == null) {
          schemaLabels.add(field.name());
        } else {
          for (String s: subnames) {
            schemaLabels.add(field.name() + "." + s);
          }
        }
      }
      return schemaLabels;
    } else if (schema.getType() == Schema.Type.UNION) {
      List<Schema> unionTypes = schema.getTypes();
      throw new UnsupportedOperationException("Cannot process UNION");
    } else if (schema.getType() == Schema.Type.ARRAY) {
      return flattenNames(schema.getElementType());
    } else {
      return null;
    }
  }

  /**
   * Grab a value from a record that is potentially deeply-nested, using
   * a dot-notation field label.
   */
  public static Object getNestedValues(GenericRecord gr, String fieldname) {
    int dotIndex = fieldname.indexOf(".");
    if (dotIndex >= 0) {
      String firstComponent = fieldname.substring(0, dotIndex);
      String remainder = fieldname.substring(dotIndex+1);
      Object oobj2 = gr.get(firstComponent);
      return (oobj2 == null || (! (oobj2 instanceof GenericRecord))) ? "" : getNestedValues((GenericRecord) oobj2, remainder);
    } else {
      Object result = gr.get(fieldname);
      return (result == null ? "" : result);
    }
  }
}