/*
* Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
*
* Cloudera, Inc. licenses this file to you under the Apache License,
* Version 2.0 (the "License"). You may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for
* the specific language governing permissions and limitations under the
* License.
*/
package com.cloudera.recordbreaker.schemadict;
import java.io.*;
import java.util.*;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.reflect.ReflectDatumReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericFixed;
import org.apache.avro.generic.GenericRecord;
/****************************************************************
* SchemaSuggest generates labels for an Avro file with schema elts that are not
* usefully named. It uses a dictionary of schemas/data with high-quality labels.
* It compares the candidate avro data to everything in the dictionary, finding the
* k most-similar entries. It then computes a mapping between the candidate schema
* and each of the k best ones. The user can then use the resulting schema to replace
* the candidate's badly-labeled one.
*
* This class is particularly useful when operating on an Avro schema that was
* algorithmically generated (i.e., through learn-avro).
*
* @author mjc
****************************************************************/
public class SchemaSuggest {
int NUM_BUCKETS = 20;
SchemaDictionary dict;
boolean useAttributeLabels = true;
List<List<SchemaDictionaryEntry>> dictBySize;
/**
* Load in the Schema Dictionary from the indicated file.
*/
public SchemaSuggest(File dataDir) throws IOException {
this.dict = new SchemaDictionary(dataDir);
// The 'dictBySize' structure allows us to perform schema inference
// more quickly, by avoiding examination of schemas that can't possibly
// be returned by inferSchemaMapping().
this.dictBySize = new ArrayList<List<SchemaDictionaryEntry>>();
for (int i = 0; i < NUM_BUCKETS; i++) {
dictBySize.add(new ArrayList<SchemaDictionaryEntry>());
}
for (SchemaDictionaryEntry elt: dict.contents()) {
Schema comparisonSchema = elt.getSchema();
int comparisonSchemaSize = comparisonSchema.getFields().size();
if (comparisonSchemaSize < dictBySize.size()-1) {
dictBySize.get(comparisonSchemaSize-1).add(elt);
} else {
dictBySize.get(dictBySize.size()-1).add(elt);
}
}
}
/**
* Should SchemaSuggest examine attribute labels when trying to find a match?
* Typically this should be set to true. However, it can be useful to turn off
* this feature for testing purposes.
*/
public void setUseAttributeLabels(boolean useAttributeLabels) {
this.useAttributeLabels = useAttributeLabels;
}
/**
* This method infers new schema labels for each element in the input. It returns a Schema object that
* has the identical format as the input file's Schema object, but the labels may be changed.
*/
public List<DictionaryMapping> inferSchemaMapping(File avroFile, int k) throws IOException {
SchemaStatisticalSummary srcSummary = new SchemaStatisticalSummary("input");
Schema srcSchema = srcSummary.createSummaryFromData(avroFile);
srcSummary.setUseAttributeLabels(useAttributeLabels);
//
// Compare the statistics to the database of schema statistics. Find the closest matches, both
// on a per-attribute basis and structurally.
//
int schemaSize = srcSchema.getFields().size();
//
// We start testing the input database against known schemas that have an identical
// number of attributes, which should allow for the best matches. This gives us an
// initial set of distances. We then expand the search to schemas of greater or fewer
// attributes, as long as a given bucket of size-k schemas has a min-distance of less
// than the current top-k matches.
//
//
TreeSet<DictionaryMapping> sorter = new TreeSet<DictionaryMapping>();
int numMatches = 0;
List<Integer> seenIndexes = new ArrayList<Integer>();
int searchRadius = 0;
boolean seenAllCandidates = false;
int srcSchemaSize = srcSchema.getFields().size();
int totalSchemasExamined = 0;
while (! seenAllCandidates) {
// Examine the relevant schema buckets, compute all matches to those schemas
for (int j = Math.max(1, srcSchemaSize - searchRadius);
j <= Math.min(NUM_BUCKETS, srcSchemaSize + searchRadius); j++) {
if (seenIndexes.contains(j-1)) {
continue;
}
for (SchemaDictionaryEntry elt: dictBySize.get(j-1)) {
/////////////////////////////
// This is where we instrument the mapping stuff.
// If the pair is an interesting one, then emit the mapping that
// we discover. Why are good matches going undiscovered?
/////////////////////////////
SchemaMapping mapping = srcSummary.getBestMapping(elt.getSummary());
if (srcSchema.getName().equals(elt.getSchema().getName())) {
System.err.println("Comparing " + srcSchema.getName() + " with " + elt.getSchema().getName());
System.err.println("Obtained mapping: " + mapping.toString());
}
totalSchemasExamined++;
sorter.add(new DictionaryMapping(mapping, elt));
numMatches++;
}
seenIndexes.add(j-1);
}
// Have we examined the entire corpus of known schemas?
if ((srcSchemaSize - searchRadius) <= 1 && (srcSchemaSize + searchRadius) >= NUM_BUCKETS) {
seenAllCandidates = true;
} else {
// Test to see if the best matches are good enough that we can stop looking.
// We compare the lowest known match distance to the minimum distance for matches
// in the closest non-examined buckets.
int lowestSize = srcSchemaSize - searchRadius - 1;
int highestSize = srcSchemaSize + searchRadius + 1;
double minNearbyDistance = Double.MAX_VALUE;
if (lowestSize >= 1) {
minNearbyDistance = Math.min(minNearbyDistance,
SchemaStatisticalSummary.getMinimumMappingCost(srcSchemaSize, lowestSize));
}
if (highestSize <= NUM_BUCKETS) {
minNearbyDistance = Math.min(minNearbyDistance,
SchemaStatisticalSummary.getMinimumMappingCost(srcSchemaSize, highestSize));
}
// Grab from the Sorter the elt that is MIN_ELTS_SUGGESTED into the sorted list
if (sorter.size() >= k) {
DictionaryMapping testDictMapping = null;
int idx = 0;
for (DictionaryMapping cur: sorter) {
idx++;
if (idx == k) {
testDictMapping = cur;
break;
}
}
if (testDictMapping.getMapping().getDist() < minNearbyDistance) {
seenAllCandidates = true;
}
}
}
searchRadius++;
}
// Return the k best schema mappings
double smallestDistance = sorter.first().getMapping().getDist();
List<DictionaryMapping> dsts = new ArrayList<DictionaryMapping>();
for (DictionaryMapping dp: sorter) {
if (dsts.size() > k && dp.getMapping().getDist() > smallestDistance) {
break;
}
dsts.add(dp);
}
double pct = totalSchemasExamined / (1.0 * dict.contents().size());
System.err.println("Final search radius of " + searchRadius + " yielded a search over " + pct + " of all known databases.");
return dsts;
}
/**
* SchemaSuggest takes an avro file where schema elements may be anonymous. It then attempts to
* compute good labels for the anonymous elts. By default, this tool simply prints out the
* suggested labels, if any. The user may include a flag to rewrite the input data using
* the new labels.
*
* schemaSuggest avroFile
*
*/
public static void main(String argv[]) throws IOException {
CommandLine cmd = null;
boolean debug = false;
Options options = new Options();
options.addOption("?", false, "Help for command-line");
options.addOption("f", true, "Accept suggestions and rewrite input to a new Avro file");
options.addOption("d", false, "Debug mode");
options.addOption("k", true, "How many matches to emit.");
try {
CommandLineParser parser = new PosixParser();
cmd = parser.parse(options, argv);
} catch (ParseException e) {
HelpFormatter fmt = new HelpFormatter();
fmt.printHelp("SchemaSuggest", options, true);
System.err.println("Required inputs: <schemadictionary> <anonymousAvro>");
System.exit(-1);
}
if (cmd.hasOption("?")) {
HelpFormatter fmt = new HelpFormatter();
fmt.printHelp("SchemaSuggest", options, true);
System.err.println("Required inputs: <schemadictionary> <anonymousAvro>");
System.exit(0);
}
if (cmd.hasOption("d")) {
debug = true;
}
int k = 1;
if (cmd.hasOption("k")) {
try {
k = Integer.parseInt(cmd.getOptionValue("k"));
} catch (NumberFormatException nfe) {
}
}
String[] argArray = cmd.getArgs();
if (argArray.length < 2) {
HelpFormatter fmt = new HelpFormatter();
fmt.printHelp("SchemaSuggest", options, true);
System.err.println("Required inputs: <schemadictionary> <anonymousAvro>");
System.exit(0);
}
File dataDir = new File(argArray[0]).getCanonicalFile();
File inputData = new File(argArray[1]).getCanonicalFile();
SchemaSuggest ss = new SchemaSuggest(dataDir);
List<DictionaryMapping> mappings = ss.inferSchemaMapping(inputData, k);
if (! cmd.hasOption("f")) {
System.out.println("Ranking of closest known data types, with match-distance (smaller is better):");
int counter = 1;
for (DictionaryMapping mapping: mappings) {
SchemaMapping sm = mapping.getMapping();
List<SchemaMappingOp> bestOps = sm.getMapping();
System.err.println();
System.err.println();
System.err.println("-------------------------------------------------------------");
System.out.println(counter + ". '" + mapping.getDictEntry().getInfo() + "', with distance: " + sm.getDist());
List<SchemaMappingOp> renames = new ArrayList<SchemaMappingOp>();
List<SchemaMappingOp> extraInTarget = new ArrayList<SchemaMappingOp>();
List<SchemaMappingOp> extraInSource = new ArrayList<SchemaMappingOp>();
for (SchemaMappingOp op: bestOps) {
if (op.opcode == SchemaMappingOp.CREATE_OP) {
extraInTarget.add(op);
} else if (op.opcode == SchemaMappingOp.DELETE_OP) {
if (op.getS1DatasetLabel().compareTo("input") == 0) {
extraInSource.add(op);
} else {
extraInTarget.add(op);
}
} else if (op.opcode == SchemaMappingOp.TRANSFORM_OP) {
renames.add(op);
}
}
System.err.println();
System.err.println(" DISCOVERED LABELS");
int counterIn = 1;
if (renames.size() == 0) {
System.err.println(" (None)");
} else {
for (SchemaMappingOp op: renames) {
System.err.println(" " + counterIn + ". " + "In '" + op.getS1DatasetLabel() + "', label '" + op.getS1FieldLabel() + "' AS " + op.getS2FieldLabel());
if (debug) {
if (op.getS1DocStr() != null && op.getS1DocStr().length() > 0) {
System.err.println(" '" + op.getS1DocStr() + "' ==> '" + op.getS2DocStr() + "'");
}
}
counterIn++;
}
}
System.err.println();
System.err.println(" UNMATCHED ITEMS IN TARGET DATA TYPE");
counterIn = 1;
if (extraInTarget.size() == 0) {
System.err.println(" (None)");
} else {
for (SchemaMappingOp op: extraInTarget) {
System.err.println(" " + counterIn + ". " + op.getS1FieldLabel());
if (debug) {
if (op.getS1DocStr() != null && op.getS1DocStr().length() > 0) {
System.err.println(" " + op.getS1DocStr());
}
}
counterIn++;
}
}
System.err.println();
System.err.println(" UNMATCHED ITEMS IN SOURCE DATA");
counterIn = 1;
if (extraInSource.size() == 0) {
System.err.println(" (None)");
} else {
for (SchemaMappingOp op: extraInSource) {
System.err.println(" " + counterIn + ". " + op.getS1FieldLabel());
if (debug) {
if (op.getS1DocStr() != null && op.getS1DocStr().length() > 0) {
System.err.println(" " + op.getS1DocStr());
}
}
counterIn++;
}
}
counter++;
}
}
}
}