SchemaSuggest.java example

Explorer
RecordBreaker-master
- src
/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.schemadict;

import java.io.*;
import java.util.*;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.reflect.ReflectDatumReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericFixed;
import org.apache.avro.generic.GenericRecord;

/****************************************************************
 * SchemaSuggest generates labels for an Avro file whose schema elements are not
 * usefully named.  It uses a dictionary of schemas/data with high-quality labels.
 * It compares the candidate Avro data to entries in the dictionary, finding the
 * k most-similar ones.  It then computes a mapping between the candidate schema
 * and each of the k best ones.  The user can then use the resulting schema to replace
 * the candidate's badly-labeled one.
 *
 * This class is particularly useful when operating on an Avro schema that was
 * algorithmically generated (i.e., through learn-avro).
 *
 * @author mjc
 ****************************************************************/
public class SchemaSuggest {
  /**
   * Number of size buckets in {@link #dictBySize}.  Bucket index i holds
   * dictionary schemas with i+1 fields; the last bucket additionally holds
   * every schema with more fields than that.
   */
  int NUM_BUCKETS = 20;
  /** The dictionary of known, well-labeled schemas. */
  SchemaDictionary dict;
  /** Whether attribute labels are handed to the summary used for matching. */
  boolean useAttributeLabels = true;
  /** Dictionary entries grouped by field count; see {@link #NUM_BUCKETS}. */
  List<List<SchemaDictionaryEntry>> dictBySize;

  /**
   * Load in the Schema Dictionary from the indicated location and index its
   * entries by field count.
   *
   * @param dataDir location handed to the SchemaDictionary constructor
   * @throws IOException propagated from loading the dictionary
   */
  public SchemaSuggest(File dataDir) throws IOException {
    this.dict = new SchemaDictionary(dataDir);

    // The 'dictBySize' structure allows us to perform schema inference
    // more quickly, by avoiding examination of schemas that can't possibly
    // be returned by inferSchemaMapping().
    this.dictBySize = new ArrayList<List<SchemaDictionaryEntry>>(NUM_BUCKETS);
    for (int i = 0; i < NUM_BUCKETS; i++) {
      dictBySize.add(new ArrayList<SchemaDictionaryEntry>());
    }

    for (SchemaDictionaryEntry elt: dict.contents()) {
      int fieldCount = elt.getSchema().getFields().size();
      dictBySize.get(bucketIndexFor(fieldCount)).add(elt);
    }
  }

  /**
   * Maps a schema's field count to its bucket index in dictBySize.  A schema with
   * n fields lives at index n-1, which is exactly where inferSchemaMapping() looks
   * for size n.  Counts below 1 are clamped into the first bucket and counts beyond
   * the bucket range are clamped into the last (overflow) bucket.
   */
  private int bucketIndexFor(int fieldCount) {
    int lastBucket = dictBySize.size() - 1;
    return Math.max(0, Math.min(fieldCount - 1, lastBucket));
  }

  /**
   * Should SchemaSuggest examine attribute labels when trying to find a match?
   * Typically this should be set to true.  However, it can be useful to turn off
   * this feature for testing purposes.
   */
  public void setUseAttributeLabels(boolean useAttributeLabels) {
    this.useAttributeLabels = useAttributeLabels;
  }

  /**
   * Finds the dictionary entries that best match the data in the given Avro file.
   *
   * The search starts with dictionary schemas that have the same number of fields
   * as the input and then widens, one field count at a time in both directions.
   * It stops once every bucket has been examined, or once the k-th best distance
   * found so far is below the minimum mapping cost reported for the nearest
   * unexamined sizes.
   *
   * @param avroFile the Avro data file to find labels for
   * @param k the number of best matches wanted; must be at least 1
   * @return the best mappings in the sorter's order: the first k, followed by any
   *         further entries whose distance equals the smallest distance.  Empty if
   *         no dictionary entry was compared.
   * @throws IOException propagated from summarizing the input file
   * @throws IllegalArgumentException if k is less than 1
   */
  public List<DictionaryMapping> inferSchemaMapping(File avroFile, int k) throws IOException {
    if (k < 1) {
      throw new IllegalArgumentException("k must be at least 1, but was " + k);
    }

    SchemaStatisticalSummary srcSummary = new SchemaStatisticalSummary("input");
    Schema srcSchema = srcSummary.createSummaryFromData(avroFile);
    srcSummary.setUseAttributeLabels(useAttributeLabels);

    //
    // Compare the statistics to the database of schema statistics.  Find the closest matches, both
    // on a per-attribute basis and structurally.
    //
    // We start testing the input database against known schemas that have an identical
    // number of attributes, which should allow for the best matches.  This gives us an
    // initial set of distances.  We then expand the search to schemas of greater or fewer
    // attributes, as long as a given bucket of size-k schemas has a min-distance of less
    // than the current top-k matches.
    //
    TreeSet<DictionaryMapping> sorter = new TreeSet<DictionaryMapping>();
    Set<Integer> seenIndexes = new HashSet<Integer>();
    int searchRadius = 0;
    boolean seenAllCandidates = false;
    int srcSchemaSize = srcSchema.getFields().size();
    int totalSchemasExamined = 0;

    while (! seenAllCandidates) {
      // Examine the relevant schema buckets, compute all matches to those schemas
      int firstSize = Math.max(1, srcSchemaSize - searchRadius);
      int lastSize = Math.min(NUM_BUCKETS, srcSchemaSize + searchRadius);
      for (int j = firstSize; j <= lastSize; j++) {
        // add() returns false when this bucket was already covered at a smaller radius
        if (! seenIndexes.add(j-1)) {
          continue;
        }
        for (SchemaDictionaryEntry elt: dictBySize.get(j-1)) {
          /////////////////////////////
          // This is where we instrument the mapping stuff.
          // If the pair is an interesting one, then emit the mapping that
          // we discover.  Why are good matches going undiscovered?
          /////////////////////////////
          SchemaMapping mapping = srcSummary.getBestMapping(elt.getSummary());
          if (srcSchema.getName().equals(elt.getSchema().getName())) {
            System.err.println("Comparing " + srcSchema.getName() + " with " + elt.getSchema().getName());
            System.err.println("Obtained mapping: " + mapping.toString());
          }

          totalSchemasExamined++;
          sorter.add(new DictionaryMapping(mapping, elt));
        }
      }

      // Have we examined the entire corpus of known schemas?
      if ((srcSchemaSize - searchRadius) <= 1 && (srcSchemaSize + searchRadius) >= NUM_BUCKETS) {
        seenAllCandidates = true;
      } else {
        // Test to see if the best matches are good enough that we can stop looking.
        // We compare the k-th lowest known match distance to the minimum distance for
        // matches in the closest non-examined buckets.
        int lowestSize = srcSchemaSize - searchRadius - 1;
        int highestSize = srcSchemaSize + searchRadius + 1;
        double minNearbyDistance = Double.MAX_VALUE;
        if (lowestSize >= 1) {
          minNearbyDistance = Math.min(minNearbyDistance,
                                       SchemaStatisticalSummary.getMinimumMappingCost(srcSchemaSize, lowestSize));
        }
        if (highestSize <= NUM_BUCKETS) {
          minNearbyDistance = Math.min(minNearbyDistance,
                                       SchemaStatisticalSummary.getMinimumMappingCost(srcSchemaSize, highestSize));
        }
        if (sorter.size() >= k) {
          DictionaryMapping kthMapping = kthBest(sorter, k);
          if (kthMapping.getMapping().getDist() < minNearbyDistance) {
            seenAllCandidates = true;
          }
        }
      }
      searchRadius++;
    }

    // Return the k best schema mappings, plus any further ones tied with the very best.
    // An empty sorter (nothing was compared) yields an empty list.
    List<DictionaryMapping> dsts = new ArrayList<DictionaryMapping>();
    if (! sorter.isEmpty()) {
      double smallestDistance = sorter.first().getMapping().getDist();
      for (DictionaryMapping dp: sorter) {
        if (dsts.size() >= k && dp.getMapping().getDist() > smallestDistance) {
          break;
        }
        dsts.add(dp);
      }
    }
    double pct = totalSchemasExamined / (1.0 * dict.contents().size());
    System.err.println("Final search radius of " + searchRadius + " yielded a search over " + pct + " of all known databases.");
    return dsts;
  }

  /**
   * Returns the k-th element (1-based) in the set's iteration order, or null if the
   * set holds fewer than k elements.
   */
  private static DictionaryMapping kthBest(TreeSet<DictionaryMapping> sorted, int k) {
    int idx = 0;
    for (DictionaryMapping cur: sorted) {
      idx++;
      if (idx == k) {
        return cur;
      }
    }
    return null;
  }

  /**
   * SchemaSuggest takes an avro file where schema elements may be anonymous.  It then attempts to
   * compute good labels for the anonymous elts and prints a ranking of the closest known
   * data types together with the suggested labels.
   *
   * The "-f" option is accepted by the parser, but no rewriting of the input is
   * implemented in this method; when it is given, no ranking is printed.
   *
   * schemaSuggest [-d] [-k n] schemadictionary avroFile
   *
   */
  public static void main(String argv[]) throws IOException {
    Options options = new Options();
    options.addOption("?", false, "Help for command-line");
    options.addOption("f", true, "Accept suggestions and rewrite input to a new Avro file");
    options.addOption("d", false, "Debug mode");
    options.addOption("k", true, "How many matches to emit.");

    CommandLine cmd = null;
    try {
      CommandLineParser parser = new PosixParser();
      cmd = parser.parse(options, argv);
    } catch (ParseException e) {
      printUsage(options);
      System.exit(-1);
    }

    if (cmd.hasOption("?")) {
      printUsage(options);
      System.exit(0);
    }

    boolean debug = cmd.hasOption("d");

    int k = 1;
    if (cmd.hasOption("k")) {
      String rawK = cmd.getOptionValue("k");
      try {
        int parsedK = Integer.parseInt(rawK);
        if (parsedK >= 1) {
          k = parsedK;
        } else {
          System.err.println("Ignoring -k value '" + rawK + "' (must be at least 1); using " + k);
        }
      } catch (NumberFormatException nfe) {
        // Best effort: keep the default rather than abort, but tell the user.
        System.err.println("Ignoring -k value '" + rawK + "' (not an integer); using " + k);
      }
    }

    String[] argArray = cmd.getArgs();
    if (argArray.length < 2) {
      // Missing required inputs is an error, so report a failing exit status.
      printUsage(options);
      System.exit(-1);
    }

    File dataDir = new File(argArray[0]).getCanonicalFile();
    File inputData = new File(argArray[1]).getCanonicalFile();
    SchemaSuggest ss = new SchemaSuggest(dataDir);
    List<DictionaryMapping> mappings = ss.inferSchemaMapping(inputData, k);

    if (cmd.hasOption("f")) {
      System.err.println("Option -f was given, but rewriting the input is not implemented here; nothing was written.");
      return;
    }

    System.out.println("Ranking of closest known data types, with match-distance (smaller is better):");
    int counter = 1;
    for (DictionaryMapping mapping: mappings) {
      printRanking(counter, mapping, debug);
      counter++;
    }
  }

  /** Prints the command-line help followed by the list of required positional inputs. */
  private static void printUsage(Options options) {
    HelpFormatter fmt = new HelpFormatter();
    fmt.printHelp("SchemaSuggest", options, true);
    System.err.println("Required inputs: <schemadictionary> <anonymousAvro>");
  }

  /**
   * Prints one ranked match: its header line on stdout, then the discovered labels
   * and the unmatched items of either side on stderr.
   */
  private static void printRanking(int rank, DictionaryMapping mapping, boolean debug) {
    SchemaMapping sm = mapping.getMapping();

    System.err.println();
    System.err.println();
    System.err.println("-------------------------------------------------------------");
    System.out.println(rank + ".  '" + mapping.getDictEntry().getInfo() + "', with distance: " + sm.getDist());

    List<SchemaMappingOp> renames = new ArrayList<SchemaMappingOp>();
    List<SchemaMappingOp> extraInTarget = new ArrayList<SchemaMappingOp>();
    List<SchemaMappingOp> extraInSource = new ArrayList<SchemaMappingOp>();

    for (SchemaMappingOp op: sm.getMapping()) {
      if (op.opcode == SchemaMappingOp.CREATE_OP) {
        extraInTarget.add(op);
      } else if (op.opcode == SchemaMappingOp.DELETE_OP) {
        // "input" is the dataset label given to the source summary in inferSchemaMapping()
        if ("input".equals(op.getS1DatasetLabel())) {
          extraInSource.add(op);
        } else {
          extraInTarget.add(op);
        }
      } else if (op.opcode == SchemaMappingOp.TRANSFORM_OP) {
        renames.add(op);
      }
    }

    System.err.println();
    System.err.println(" DISCOVERED LABELS");
    if (renames.size() == 0) {
      System.err.println("  (None)");
    } else {
      int counterIn = 1;
      for (SchemaMappingOp op: renames) {
        System.err.println("  " + counterIn + ".  " + "In '" + op.getS1DatasetLabel() + "', label '" + op.getS1FieldLabel() + "' AS " + op.getS2FieldLabel());
        if (debug && hasS1DocStr(op)) {
          System.err.println("         '" + op.getS1DocStr() + "'  ==> '" + op.getS2DocStr() + "'");
        }
        counterIn++;
      }
    }

    printUnmatched(" UNMATCHED ITEMS IN TARGET DATA TYPE", extraInTarget, debug);
    printUnmatched(" UNMATCHED ITEMS IN SOURCE DATA", extraInSource, debug);
  }

  /** Prints a heading and a numbered list of the ops' first-schema field labels to stderr. */
  private static void printUnmatched(String heading, List<SchemaMappingOp> ops, boolean debug) {
    System.err.println();
    System.err.println(heading);
    if (ops.size() == 0) {
      System.err.println("  (None)");
      return;
    }
    int counterIn = 1;
    for (SchemaMappingOp op: ops) {
      System.err.println("  " + counterIn + ".  " + op.getS1FieldLabel());
      if (debug && hasS1DocStr(op)) {
        System.err.println("         " + op.getS1DocStr());
      }
      counterIn++;
    }
  }

  /** True when the op carries a non-empty doc string for its first schema. */
  private static boolean hasS1DocStr(SchemaMappingOp op) {
    return op.getS1DocStr() != null && op.getS1DocStr().length() > 0;
  }
}