/*
* Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
*
* Cloudera, Inc. licenses this file to you under the Apache License,
* Version 2.0 (the "License"). You may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for
* the specific language governing permissions and limitations under the
* License.
*/
package com.cloudera.recordbreaker.schemadict;
import java.io.File;
import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;
import java.util.Iterator;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.HashSet;
import java.util.TreeSet;
import java.util.Set;
import java.util.SortedSet;
import java.util.Hashtable;
import java.util.Map;
import java.lang.Math;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
import java.nio.ByteBuffer;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.reflect.ReflectDatumReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericFixed;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.UTF8;
import org.apache.hadoop.io.Text;
/********************************************
* The SchemaStatisticalSummary object is designed to mirror the structure of an input Schema.
* In addition to the name and type information associated with a Schema object, it keeps statistical data
* about the actual data values observed for each Schema element.
*
* This class is intended to be used in the following way:
* 1) Instantiate a SchemaStatisticalSummary object, optionally with a human-readable dataset label.
* 2) Call createSummaryFromData() on an Avro data file whose top-level schema is a record. This builds a
* SummaryNode hierarchy that mirrors the schema and accumulates statistics over every record in the file.
* 3) The resulting summary can then be compared to other SchemaStatisticalSummary objects with getBestMapping(),
* or serialized and deserialized with write() and readFields().
*
********************************************/
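//
// A sketch of typical usage (the file names below are placeholders, not part of this library):
//
//   SchemaStatisticalSummary s1 = new SchemaStatisticalSummary("dataset-A");
//   Schema schema1 = s1.createSummaryFromData(new File("a.avro"));
//   SchemaStatisticalSummary s2 = new SchemaStatisticalSummary("dataset-B");
//   Schema schema2 = s2.createSummaryFromData(new File("b.avro"));
//   SchemaMapping mapping = s1.getBestMapping(s2);
//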
public class SchemaStatisticalSummary implements Writable {
final static byte MAGIC = (byte) 0xa1;
final static byte VERSION = (byte) 1;
final static int MAX_SUMMARY_SAMPLES = 50;
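// Edit costs used by the schema-mapping code below: matching two fields of different types is far more
// expensive than simply creating or deleting a field.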
final static double MATCHCOST_TYPE_CLASH = 1 * 10 * 1000;
final static double MATCHCOST_CREATE = 1 * 1000;
final static double MATCHCOST_DELETE = 1 * 1000;
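// Per-type tags written by each SummaryNode subclass's write() method and switched on by readAndCreate()
// during deserialization.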
final static short ARRAY_NODE = 1;
final static short BOOLEAN_NODE = 2;
final static short BYTES_NODE = 3;
final static short DOUBLE_NODE = 4;
final static short ENUM_NODE = 5;
final static short FIXED_NODE = 6;
final static short FLOAT_NODE = 7;
final static short INT_NODE = 8;
final static short LONG_NODE = 9;
final static short MAP_NODE = 10;
final static short NULL_NODE = 11;
final static short RECORD_NODE = 12;
final static short STRING_NODE = 13;
final static short UNION_NODE = 14;
/////////////////////////////////////////////////
// Inner classes
/////////////////////////////////////////////////
/*****************************************************
* SummaryNode is a generic statistical summary object for a given elt in the
* hierarchy. A single tuple in the source data may yield a number of nested
* SummaryNodes, all rooted at a GenericRecord.
*
* The hierarchy is instantiated by examining the schema. Each new data item
* results in a call to SummaryNode.addData(), in which the data item is passed in.
******************************************************/
abstract class SummaryNode implements Cloneable {
SummaryNode parent = null;
int preorderIdx;
int numData;
String docStr = "";
public SummaryNode() {
}
public SummaryNode(String docStr) {
this.docStr = docStr;
}
//////////////////////////////////////////
// Methods for constructing the summary-node tree
//////////////////////////////////////////
public void addData(Object obj) {
if (obj instanceof Boolean) {
this.addData((Boolean) obj);
} else if (obj instanceof GenericArray) {
this.addData((GenericArray) obj);
} else if (obj instanceof Double) {
this.addData((Double) obj);
} else if (obj instanceof Float) {
this.addData((Float) obj);
} else if (obj instanceof GenericFixed) {
this.addData((GenericFixed) obj);
} else if (obj instanceof Integer) {
this.addData((Integer) obj);
} else if (obj instanceof Long) {
this.addData((Long) obj);
} else if (obj instanceof Map) {
this.addData((Map) obj);
} else if (obj instanceof ByteBuffer) {
this.addData((ByteBuffer) obj);
} else if (obj instanceof GenericRecord) {
this.addData((GenericRecord) obj);
} else if (obj instanceof Utf8) {
this.addData((Utf8) obj);
} else if (obj instanceof String) {
this.addData((String) obj);
}
}
// Overridden on per-subclass basis.
public void addData(Boolean b) {};
public void addData(GenericArray g) {};
public void addData(Double d) {};
public void addData(Float f) {};
public void addData(Integer i) {};
public void addData(Long l) {};
public void addData(Map m) {};
public void addData(ByteBuffer bb) {};
public void addData(GenericRecord g) {};
public void addData(Utf8 u) {};
public void addData(String s) {};
///////////////////////////////////////////////
// Tree-manipulation and info methods
///////////////////////////////////////////////
/**
* How many nodes in this subtree?
*/
public int size() {
int total = 0;
for (SummaryNode child: children()) {
total += child.size();
}
return total + 1;
}
/**
* Setters/getters
*/
SummaryNode getParent() {
return parent;
}
void setParent(SummaryNode parent) {
this.parent = parent;
}
public List<SummaryNode> children() {
return new ArrayList<SummaryNode>();
}
public int preorderCount() {
return preorderIdx;
}
public SummaryNode parent() {
return parent;
}
/**
* Dealing with paths and node orderings
*/
public int computePreorder(int lastIdx) {
lastIdx++;
this.preorderIdx = lastIdx;
for (SummaryNode child: children()) {
lastIdx = child.computePreorder(lastIdx);
child.setParent(this);
}
return lastIdx;
}
void preorder(List<SummaryNode> soFar) {
soFar.add(this);
for (SummaryNode child: children()) {
child.preorder(soFar);
}
}
public List<SummaryNode> preorder() {
List<SummaryNode> l = new ArrayList<SummaryNode>();
preorder(l);
return l;
}
public List<SummaryNode> pathToRoot() {
List<SummaryNode> path = new ArrayList<SummaryNode>();
SummaryNode cur = this;
while (cur != null) {
path.add(cur);
cur = cur.getParent();
}
return path;
}
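/**
* Currently identical to pathToRoot(): returns the full path from this node up to the root.
*/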
public List<SummaryNode> getLastNodeOnPath() {
List<SummaryNode> path = new ArrayList<SummaryNode>();
SummaryNode cur = this;
while (cur != null) {
path.add(cur);
cur = cur.getParent();
}
return path;
}
/**
* Useful in testing whether two fields are referring to the same thing.
* Levenshtein edit distance is great, but we would like a value that ranges 0..1.
*
* To compute this, note that the LD is at least abs(len(s1)-len(s2)). It is also at
* most max(len(s1), len(s2)). So we normalize LD by that range.
* For example, "color" vs "colour" has LD 1, lower bound 1, and upper bound 6, so the
* normalized distance is (1 - 1) / (6 - 1) = 0.
*/
double normalizedLevenshteinDistance(String s1, String s2) {
int rawLD = computeLevenshteinDistance(s1, s2);
int lowerBound = Math.abs(s1.length() - s2.length());
int upperBound = Math.max(s1.length(), s2.length());
int range = upperBound - lowerBound;
if (range == 0) {
// The bounds coincide (e.g., one string is empty), so the ratio is undefined;
// treat identical strings as distance 0 and anything else as distance 1.
return (rawLD == 0) ? 0 : 1;
}
return (rawLD - lowerBound) / (1.0 * range);
}
/**
* The classic string edit distance algorithm rides again.
*/
int computeLevenshteinDistance(String s1, String s2) {
int s1Length = s1.length();
int s2Length = s2.length();
if (s1Length == 0) {
return s2Length;
}
if (s2Length == 0) {
return s1Length;
}
int[][] d = new int[s1Length + 1][s2Length + 1];
for (int i = 0; i <= s1Length; i++) {
d[i][0] = i;
}
for (int j = 0; j <= s2Length; j++) {
d[0][j] = j;
}
for (int i = 1; i <= s1Length; i++) {
char s1Char = s1.charAt(i-1);
for (int j = 1; j <= s2Length; j++) {
char s2Char = s2.charAt(j-1);
int cost = 0;
if (s1Char != s2Char) {
cost = 1;
}
d[i][j] = Math.min(d[i-1][j]+1,
Math.min(d[i][j-1]+1, d[i-1][j-1] + cost));
}
}
return d[s1Length][s2Length];
}
///////////////////////////////////////////////
// Methods for string representation
///////////////////////////////////////////////
/**
* Helper method for rendering a string version of the data
*/
String prefixString(int prefix) {
StringBuffer buf = new StringBuffer();
for (int i = 0; i < prefix; i++) {
buf.append(" ");
}
return buf.toString();
}
/**
* Render a string version of the data
*/
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + "\n";
}
public abstract String getTypeDesc();
/**
* Find the right node and obtain a description of it.
*/
public abstract String getDesc(boolean verbose);
public String getDesc(int nodeid) {
if (nodeid == preorderIdx) {
return getDesc(false);
} else {
for (SummaryNode child: children()) {
String desc = child.getDesc(nodeid);
if (desc != null) {
return desc;
}
}
}
return null;
}
public String getLabel(int nodeid) {
if (nodeid == preorderIdx) {
return getLabel();
} else {
for (SummaryNode child: children()) {
String label = child.getLabel(nodeid);
if (label != null) {
return label;
}
}
}
return null;
}
public String getTypeDesc(int nodeid) {
if (nodeid == preorderIdx) {
return getTypeDesc();
} else {
for (SummaryNode child: children()) {
String typedesc = child.getTypeDesc(nodeid);
if (typedesc != null) {
return typedesc;
}
}
}
return null;
}
public String getDocStr(int nodeid) {
if (nodeid == preorderIdx) {
return docStr;
} else {
for (SummaryNode child: children()) {
String docstr = child.getDocStr(nodeid);
if (docstr != null) {
return docstr;
}
}
}
return null;
}
/**
* Find the "label" for the current node. Since the top-level element in the
* NodeSummary hierarchy is a record, we know that every element has a label.
* The getLabel() function goes up the tree to the root, constructing the
* dotted label sequence all the way.
*/
public String getLabel() {
if (parent != null) {
return parent.getLabel("", this);
} else {
return "<root>";
}
}
public String getLabel(String labelSoFar, SummaryNode src) {
if (parent != null) {
return parent.getLabel(labelSoFar, this);
} else {
return labelSoFar;
}
}
///////////////////////////////////////////////
// Cost functions for schema matching
///////////////////////////////////////////////
/**
* Figure out basic normalized string edit distance to
* see if the schema labels match. If 'useAttributeLabels'
* is set to false, then this distance is always zero.
*/
double computeSchemaLabelDistance(String l1, String l2) {
if (! useAttributeLabels) {
return 0;
} else {
if (l1.indexOf(".") >= 0) {
l1 = l1.substring(l1.lastIndexOf(".")+1);
}
if (l2.indexOf(".") >= 0) {
l2 = l2.substring(l2.lastIndexOf(".")+1);
}
return normalizedLevenshteinDistance(l1, l2);
}
}
/**
* The default non-type-specific way of performing schema matching is to
* just compare the attribute labels. We can also examine data distributions,
* but this is only possible in the subclasses' overriding transformCost() methods.
*/
public double transformCost(SummaryNode other) {
if (this.getClass() == other.getClass()) {
// Examine the field name for a schema-label distance
return computeSchemaLabelDistance(this.getLabel(), other.getLabel());
} else {
return MATCHCOST_TYPE_CLASH;
}
}
public double deleteCost() {
return MATCHCOST_DELETE;
}
public double createCost() {
return MATCHCOST_CREATE;
}
///////////////////////////////////////////////
// Serialization/deserialization
///////////////////////////////////////////////
public abstract void write(DataOutput out) throws IOException;
public abstract void readFields(DataInput in) throws IOException;
}
/*****************************************************
* Store statistical summary of observed arrays. Basically, store length information and # times seen.
****************************************************/
class ArraySummaryNode extends SummaryNode {
int totalSize;
SummaryNode eltSummary;
public ArraySummaryNode() {
}
public ArraySummaryNode(SummaryNode eltSummary, String docStr) {
super(docStr);
this.eltSummary = eltSummary;
}
/**
* Add one observed array: bump the counts and hand each element to the element summary.
*/
public void addData(GenericArray data) {
numData++;
totalSize += data.size();
for (Iterator it = data.iterator(); it.hasNext(); ) {
eltSummary.addData(it.next());
}
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", avgSize: " + (totalSize / (1.0 * numData)) + "\n" + eltSummary.dumpSummary(prefix+2);
}
public String getTypeDesc() {
return "ARRAY";
}
public String getDesc(boolean verbose) {
String desc = "ARRAY";
if (verbose) {
desc += "(numData: " + numData + ", avgSize: " + (totalSize / (1.0 * numData)) + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(ARRAY_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(totalSize);
eltSummary.write(out);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.totalSize = in.readInt();
this.eltSummary = readAndCreate(in);
}
}
/*****************************************************
* Store statistical summary of observed boolean field. Store # times seen and distribution true vs false
****************************************************/
class BooleanSummaryNode extends SummaryNode {
int numTrue;
int numFalse;
public BooleanSummaryNode() {
}
public BooleanSummaryNode(String docStr) {
super(docStr);
}
public void addData(Boolean b) {
numData++;
if (b.booleanValue()) {
numTrue++;
} else {
numFalse++;
}
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", numTrue: " + numTrue + ", numFalse: " + numFalse + "\n";
}
public String getTypeDesc() {
return "BOOLEAN";
}
public String getDesc(boolean verbose) {
String desc = "BOOLEAN";
if (verbose) {
desc += "(numData: " + numData + ", numTrue: " + numTrue + ", numFalse: " + numFalse + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(BOOLEAN_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(numTrue);
out.writeInt(numFalse);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.numTrue = in.readInt();
this.numFalse = in.readInt();
}
}
/*****************************************************
* Store statistical summary of observed Bytes field. Store # times seen and # bytes seen.
****************************************************/
class BytesSummaryNode extends SummaryNode {
int totalSize = 0;
public BytesSummaryNode() {
}
public BytesSummaryNode(String docStr) {
super(docStr);
}
public void addData(ByteBuffer bb) {
numData++;
totalSize += bb.remaining();
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", totalSize: " + totalSize + "\n";
}
public String getTypeDesc() {
return "BYTES";
}
public String getDesc(boolean verbose) {
String desc = "BYTES";
if (verbose) {
desc += "(numData: " + numData + ", totalSize: " + totalSize + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(BYTES_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(totalSize);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.totalSize = in.readInt();
}
}
/*****************************************************
* Store statistical summary of observed Double field. Store # times seen and total value
****************************************************/
class DoubleSummaryNode extends SummaryNode {
double total;
public DoubleSummaryNode() {
}
public DoubleSummaryNode(String docStr) {
super(docStr);
}
public void addData(Double d) {
numData++;
total += d.doubleValue();
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", avg: " + (total / (1.0 * numData)) + "\n";
}
public String getTypeDesc() {
return "DOUBLE";
}
public String getDesc(boolean verbose) {
String desc = "DOUBLE";
if (verbose) {
desc += "(numData: " + numData + ", avg: " + (total / (1.0 * numData)) + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(DOUBLE_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeDouble(total);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.total = in.readDouble();
}
}
/*****************************************************
* Store statistical summary of observed Enumerated Type field. Store # times seen and statistics on how often
* each enum-value is seen.
****************************************************/
class EnumSummaryNode extends SummaryNode {
String name;
Map<String, Integer> symbolCounts = new HashMap<String, Integer>();
public EnumSummaryNode() {
}
public EnumSummaryNode(String name, List<String> symbols, String docStr) {
super(docStr);
this.name = name;
for (String symbol: symbols) {
this.symbolCounts.put(symbol, 1);
}
}
public void addData(String s) {
numData++;
Integer count = symbolCounts.get(s);
symbolCounts.put(s, (count == null) ? 1 : count + 1);
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
StringBuffer buf = new StringBuffer();
buf.append(prefixString(prefix) + "numData: " + numData + " =>\n");
for (String symbol: symbolCounts.keySet()) {
buf.append(prefixString(prefix+2) + symbol + ": " + symbolCounts.get(symbol) + "\n");
}
return buf.toString();
}
public String getTypeDesc() {
return "ENUM";
}
public String getDesc(boolean verbose) {
String desc = "ENUM";
if (verbose) {
desc += "(numData: " + numData + ", numSymbols: " + symbolCounts.size() + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(ENUM_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(symbolCounts.size());
for (String symbol: symbolCounts.keySet()) {
new Text(symbol).write(out);
out.writeInt(symbolCounts.get(symbol));
}
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
symbolCounts = new HashMap<String, Integer>();
int numElts = in.readInt();
for (int i = 0; i < numElts; i++) {
Text symbol = new Text();
symbol.readFields(in);
Integer count = in.readInt();
symbolCounts.put(symbol.toString(), count);
}
}
}
/*****************************************************
* Store statistical summary of observed GenericFixed field. Store # times seen and byte length information. Eventually,
* store info on the byte content, too.
****************************************************/
class FixedSummaryNode extends SummaryNode {
String name;
int size;
int total;
public FixedSummaryNode() {
}
public FixedSummaryNode(String name, int size, String docStr) {
super(docStr);
this.name = name;
this.size = size;
this.total = 0;
}
public void addData(GenericFixed data) {
byte d[] = data.bytes();
total += d.length;
numData++;
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "size: " + size + ", total: " + total + ", numData: " + numData;
}
public String getTypeDesc() {
return "FIXED";
}
public String getDesc(boolean verbose) {
String desc = "FIXED";
if (verbose) {
desc += "(numData: " + numData + ", size: " + size + ", total: " + total + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(FIXED_NODE);
out.writeInt(numData);
new Text(name).write(out);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(size);
out.writeInt(total);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.name = Text.readString(in);
this.docStr = UTF8.readString(in);
this.size = in.readInt();
this.total = in.readInt();
}
}
/*****************************************************
* Store statistical summary of observed Float field. Store # times seen and total value
****************************************************/
class FloatSummaryNode extends SummaryNode {
float total;
public FloatSummaryNode() {
}
public FloatSummaryNode(String docStr) {
super(docStr);
}
public void addData(Float f) {
numData++;
total += f.floatValue();
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", avg: " + (total / (1.0 * numData)) + "\n";
}
public String getTypeDesc() {
return "FLOAT";
}
public String getDesc(boolean verbose) {
String desc = "FLOAT";
if (verbose) {
desc += "(numData: " + numData + ", avg: " + (total / (1.0 * numData)) + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(FLOAT_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeFloat(total);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.total = in.readFloat();
}
}
/*****************************************************
* Store statistical summary of observed Integer field.
* Store total value, num data elements, and a sample of actual data elts
****************************************************/
class IntegerSummaryNode extends SummaryNode {
int total;
List<Integer> samples = new ArrayList<Integer>();
public IntegerSummaryNode() {
}
public IntegerSummaryNode(String docStr) {
super(docStr);
}
public void addData(Integer i) {
numData++;
total += i.intValue();
if (samples.size() < MAX_SUMMARY_SAMPLES) {
samples.add(i);
}
}
///////////////////////////////////////////////
// Cost functions for schema matching
///////////////////////////////////////////////
public double transformCost(SummaryNode other) {
if (this.getClass() == other.getClass()) {
double schemaLabelDistance = computeSchemaLabelDistance(this.getLabel(), other.getLabel());
double klDivergence = computeSampleKLDivergence((IntegerSummaryNode) other);
return schemaLabelDistance + klDivergence;
} else {
return MATCHCOST_TYPE_CLASH;
}
}
/**
* This computes the Kullback-Leibler divergence between two int distributions. It
* measures how much the two integer distributions differ. Useful for testing whether
* they should be matched.
*
* Assumes the two distributions are Gaussians, for which
* KL(p1 || p2) = log(stddev2 / stddev1) + (variance1 + (mean1 - mean2)^2) / (2 * variance2) - 1/2.
*/
public double computeSampleKLDivergence(IntegerSummaryNode other) {
double mean1 = total / (1.0 * numData);
double mean2 = other.total / (1.0 * other.numData);
double stddev1 = computeStddev();
double stddev2 = other.computeStddev();
double variance1 = Math.pow(stddev1, 2);
double variance2 = Math.pow(stddev2, 2);
return Math.log(stddev2 / stddev1) + ((variance1 + Math.pow(mean1 - mean2, 2)) / (2 * variance2)) - 0.5;
}
/**
* Compute the standard deviation of the distribution of integers in this summary node.
* Note that if the sample is smaller than the genuine data, we take the
* "sample standard deviation", not the true stddev.
*/
public double computeStddev() {
double mean = total / (1.0 * numData);
double sumSquaredDev = 0;
for (Integer sample: samples) {
sumSquaredDev += Math.pow(sample.intValue() - mean, 2);
}
int numSamples = samples.size();
double normalizer = 1 / (1.0 * numSamples);
if (numSamples < numData && numSamples > 1) {
// We only retained a sample of the data, so compute the "sample standard deviation":
// divide by (n-1), where n is the number of retained samples, not the full data count.
normalizer = 1 / (1.0 * (numSamples - 1));
}
double variance = normalizer * sumSquaredDev;
return Math.sqrt(variance);
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", avg: " + (total / (1.0 * numData)) + "\n";
}
public String getTypeDesc() {
return "INT";
}
public String getDesc(boolean verbose) {
String desc = "INT";
if (verbose) {
desc += "(numData: " + numData + ", avg: " + (total / (1.0 * numData)) + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(INT_NODE);
out.writeInt(numData);
UTF8.writeString(out, (docStr == null) ? "" : docStr);
out.writeInt(total);
out.writeInt(samples.size());
for (Integer sample: samples) {
out.writeInt(sample.intValue());
}
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.total = in.readInt();
this.samples.clear();
int numSamples = in.readInt();
for (int i = 0; i < numSamples; i++) {
this.samples.add(in.readInt());
}
}
}
/*****************************************************
* Store statistical summary of observed Long field. Store # times seen and total value
****************************************************/
class LongSummaryNode extends SummaryNode {
long total;
public LongSummaryNode() {
}
public LongSummaryNode(String docStr) {
super(docStr);
}
public void addData(Long l) {
numData++;
total += l.longValue();
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", avg: " + (total / (1.0 * numData)) + "\n";
}
public String getTypeDesc() {
return "LONG";
}
public String getDesc(boolean verbose) {
String desc = "LONG";
if (verbose) {
desc += "(numData: " + numData + ", avg: " + (total / (1.0 * numData)) + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(LONG_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeLong(total);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.total = in.readLong();
}
}
/*****************************************************
* Store statistical summary of observed Map field. Store # times seen and track data for each observed key.
****************************************************/
class MapSummaryNode extends SummaryNode {
Schema modelS;
HashMap<Utf8, SummaryNode> stats = new HashMap<Utf8, SummaryNode>();
public MapSummaryNode() {
}
public MapSummaryNode(Schema modelS, String docStr) {
super(docStr);
this.modelS = modelS;
}
public void addData(Map m) {
numData++;
Iterator it = m.keySet().iterator();
while (it.hasNext()) {
Utf8 key = (Utf8) it.next();
SummaryNode s = stats.get(key);
if (s == null) {
s = buildStructure(modelS, modelS.getDoc());
stats.put(key, s);
}
s.addData(m.get(key));
}
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
StringBuffer buf = new StringBuffer();
buf.append(prefixString(prefix) + "+------------------------------------------+\n");
buf.append(prefixString(prefix) + "numData: " + numData + "\n");
for (Utf8 key: stats.keySet()) {
SummaryNode s = stats.get(key);
buf.append(prefixString(prefix) + key + " =>\n" + s.dumpSummary(prefix+2));
}
buf.append(prefixString(prefix) + "+------------------------------------------+\n");
return buf.toString();
}
public String getTypeDesc() {
return "MAP";
}
public String getDesc(boolean verbose) {
String desc = "MAP";
if (verbose) {
desc += "(numData: " + numData + ", numSymbols: " + stats.size() + ")";
}
return getLabel() + ": " + desc;
}
public String getLabel(String labelSoFar, SummaryNode src) {
for (Utf8 fname: stats.keySet()) {
SummaryNode candidate = stats.get(fname);
if (src == candidate) {
if (parent != null) {
labelSoFar = (labelSoFar.length() > 0) ? fname.toString() + "." + labelSoFar : fname.toString();
return parent.getLabel(labelSoFar, this);
}
}
}
return labelSoFar;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(MAP_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(stats.size());
for (Utf8 key: stats.keySet()) {
new Text(key.toString()).write(out);
stats.get(key).write(out);
}
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
int numElts = in.readInt();
for (int i = 0; i < numElts; i++) {
Text key = new Text();
key.readFields(in);
SummaryNode sn = readAndCreate(in);
stats.put(new Utf8(key.toString()), sn);
}
}
}
/*****************************************************
* Store statistical summary of observed Null field. Just store # times seen.
****************************************************/
class NullSummaryNode extends SummaryNode {
public NullSummaryNode() {
}
public NullSummaryNode(String docStr) {
super(docStr);
}
public void addData() {
numData++;
}
public String getDesc(boolean verbose) {
String desc = "NULL";
if (verbose) {
desc += "(numData: " + numData + ")";
}
return getLabel() + ": " + desc;
}
public String getTypeDesc() {
return "NULL";
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(NULL_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
}
}
/*****************************************************
* Store statistical summary of observed Record field. Store # times seen and then data about sub-elements.
****************************************************/
class RecordSummaryNode extends SummaryNode {
String name;
Map<String, SummaryNode> recordSummary = new HashMap<String, SummaryNode>();
public RecordSummaryNode() {
}
public RecordSummaryNode(String name, String docStr) {
super(docStr);
this.name = name;
}
public List<SummaryNode> children() {
List<SummaryNode> l = new ArrayList<SummaryNode>();
for (String key: recordSummary.keySet()) {
l.add(recordSummary.get(key));
}
return l;
}
public void addField(String fname, SummaryNode fn) {
recordSummary.put(fname, fn);
}
public void addData(GenericRecord data) {
numData++;
for (String fname: recordSummary.keySet()) {
recordSummary.get(fname).addData(data.get(fname));
}
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
StringBuffer buf = new StringBuffer();
buf.append(prefixString(prefix) + "+------------------------------------------+\n");
buf.append(prefixString(prefix) + "numData: " + numData + "\n");
for (String fname: recordSummary.keySet()) {
buf.append(prefixString(prefix) + fname + " =>\n" + recordSummary.get(fname).dumpSummary(prefix+2));
}
buf.append(prefixString(prefix) + "+------------------------------------------+\n");
return buf.toString();
}
public String getTypeDesc() {
return "RECORD";
}
public String getDesc(boolean verbose) {
String desc = "RECORD";
if (verbose) {
desc += "(numData: " + numData + ", fields: " + recordSummary.size() + ")";
}
return getLabel() + ": " + desc;
}
public String getLabel(String labelSoFar, SummaryNode src) {
for (String fname: recordSummary.keySet()) {
SummaryNode candidate = recordSummary.get(fname);
if (src == candidate) {
labelSoFar = (labelSoFar.length() > 0) ? fname + "." + labelSoFar : fname;
if (parent != null) {
return parent.getLabel(labelSoFar, this);
} else {
return "<root>" + "." + labelSoFar;
}
}
}
return "<root>" + "." + labelSoFar;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(RECORD_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(recordSummary.size());
for (String fname: recordSummary.keySet()) {
new Text(fname).write(out);
recordSummary.get(fname).write(out);
}
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
int numRecs = in.readInt();
for (int i = 0; i < numRecs; i++) {
Text fname = new Text();
fname.readFields(in);
SummaryNode sn = readAndCreate(in);
recordSummary.put(fname.toString(), sn);
}
}
}
/*****************************************************
* Store statistical summary of observed String field. Store # times seen and total length of the strings (for now).
* Eventually, store info on the String content, too.
****************************************************/
class StringSummaryNode extends SummaryNode {
int totalLength;
Set<Utf8> observedStrings = new TreeSet<Utf8>();
public StringSummaryNode() {
}
public StringSummaryNode(String docStr) {
super(docStr);
}
public void addData(Utf8 s) {
numData++;
totalLength += s.getLength();
observedStrings.add(s);
}
///////////////////////////////////////////////
// Cost functions for schema matching
///////////////////////////////////////////////
public double transformCost(SummaryNode other) {
if (this.getClass() == other.getClass()) {
double schemaLabelDistance = computeSchemaLabelDistance(this.getLabel(), other.getLabel());
double jaccardSimilarity = computeJaccardSimilarity((StringSummaryNode) other);
double jaccardDistance = 1 - jaccardSimilarity;
return schemaLabelDistance + jaccardDistance;
} else {
return MATCHCOST_TYPE_CLASH;
}
}
/**
* Computes the Jaccard similarity (intersection size over union size) of the two sets of observed
* strings, ignoring empty strings. A useful score for determining whether two sets of values are similar.
*/
public double computeJaccardSimilarity(StringSummaryNode other) {
Set<Utf8> larger = (this.numData >= other.numData ? this.observedStrings : other.observedStrings);
Set<Utf8> smaller = (this.numData < other.numData ? this.observedStrings : other.observedStrings);
int unionSize = larger.size();
if (larger.contains(new Utf8(""))) {
unionSize -= 1;
}
int intersectionSize = 0;
for (Utf8 smallElt: smaller) {
if (smallElt.length() == 0) {
continue;
}
if (larger.contains(smallElt)) {
intersectionSize++;
} else {
unionSize++;
}
}
if (unionSize == 0) {
return 0;
} else {
return intersectionSize / (1.0 * unionSize);
}
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", avg-len: " + (totalLength / (1.0 * numData)) + "\n";
}
public String getTypeDesc() {
return "STRING";
}
public String getDesc(boolean verbose) {
String desc = "STRING";
if (verbose) {
desc += "(numData: " + numData + ", avglen: " + (totalLength / (1.0 * numData)) + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(STRING_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(totalLength);
out.writeInt(observedStrings.size());
for (Utf8 s: observedStrings) {
UTF8.writeString(out, s.toString());
}
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.totalLength = in.readInt();
observedStrings.clear();
int numInts = in.readInt();
for (int i = 0; i < numInts; i++) {
observedStrings.add(new Utf8(UTF8.readString(in)));
}
}
}
/*****************************************************
* Store statistical summary of observed Union field. Actually, a Union is not observed directly - we just know
* it's a union from the schema. Store # times seen, data on the particular type observed, and statistics on how
* often each subtype is seen.
****************************************************/
class UnionSummaryNode extends SummaryNode {
Map<Schema.Type, SummaryNode> unionTypes = new HashMap<Schema.Type, SummaryNode>();
Map<Schema.Type, Integer> unionTypeCounts = new HashMap<Schema.Type, Integer>();
public UnionSummaryNode() {
}
public UnionSummaryNode(String docStr) {
super(docStr);
}
public void addType(Schema.Type t, SummaryNode sn) {
if (unionTypes.get(t) == null) {
unionTypes.put(t, sn);
unionTypeCounts.put(t, 0);
}
}
/**
* We need to dispatch the object to the right element stored in 'unionTypes'
*/
public void addData(Object obj) {
numData++;
// Default to NULL: this covers obj == null (from unions with a null branch) as well as any
// unrecognized runtime type.
Schema.Type t = Schema.Type.NULL;
if (obj instanceof GenericArray) {
t = Schema.Type.ARRAY;
} else if (obj instanceof Boolean) {
t = Schema.Type.BOOLEAN;
} else if (obj instanceof ByteBuffer) {
t = Schema.Type.BYTES;
} else if (obj instanceof Double) {
t = Schema.Type.DOUBLE;
} else if (obj instanceof String) {
t = Schema.Type.ENUM;
} else if (obj instanceof GenericFixed) {
t = Schema.Type.FIXED;
} else if (obj instanceof Float) {
t = Schema.Type.FLOAT;
} else if (obj instanceof Integer) {
t = Schema.Type.INT;
} else if (obj instanceof Long) {
t = Schema.Type.LONG;
} else if (obj instanceof Map) {
t = Schema.Type.MAP;
} else if (obj instanceof GenericRecord) {
t = Schema.Type.RECORD;
} else if (obj instanceof Utf8) {
t = Schema.Type.STRING;
}
SummaryNode branch = unionTypes.get(t);
if (branch != null) {
branch.addData(obj);
}
Integer c = unionTypeCounts.get(t);
if (c == null) {
unionTypeCounts.put(t, 1);
} else {
unionTypeCounts.put(t, c.intValue() + 1);
}
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
StringBuffer buf = new StringBuffer();
for (Schema.Type t: unionTypes.keySet()) {
buf.append(prefixString(prefix) + "unionType: " + t + " =>\n");
buf.append(unionTypes.get(t).dumpSummary(prefix+2));
}
return buf.toString();
}
public String getTypeDesc() {
return "UNION";
}
public String getDesc(boolean verbose) {
String desc = "UNION";
if (verbose) {
desc += "(numData: " + numData + ", numtypes: " + unionTypes.size() + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(UNION_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(unionTypes.size());
for (Schema.Type t: unionTypes.keySet()) {
new Text(t.toString()).write(out);
out.writeInt(unionTypeCounts.get(t));
unionTypes.get(t).write(out);
}
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
int numTypes = in.readInt();
for (int i = 0; i < numTypes; i++) {
Text tLabel = new Text();
tLabel.readFields(in);
Schema.Type t = Schema.Type.valueOf(tLabel.toString());
int typeCount = in.readInt();
SummaryNode sn = readAndCreate(in);
unionTypes.put(t, sn);
unionTypeCounts.put(t, typeCount);
}
}
}
/***************************************
* PreviousChoice is a SchemaMappingOp that simply points back at a list of ops already computed for an
* earlier subproblem, identified by the label built from the given indices.
***************************************/
class PreviousChoice extends SchemaMappingOp {
Hashtable<String, List<SchemaMappingOp>> h;
String label;
public PreviousChoice(Hashtable<String, List<SchemaMappingOp>> h, int i, int j) {
this.h = h;
this.label = "" + i + "-" + j;
}
public PreviousChoice(Hashtable<String, List<SchemaMappingOp>> h, int p1, int p2, int p3, int p4, int p5, int p6) {
this.h = h;
this.label = "" + p1 + "-" + p2 + "-" + p3 + "-" + p4 + "-" + p5 + "-" + p6;
}
public List<SchemaMappingOp> getOps() {
List<SchemaMappingOp> ops = h.get(label);
if (ops == null) {
ops = new ArrayList<SchemaMappingOp>();
}
return ops;
}
public String toString() {
return "Previous! " + label;
}
}
/////////////////////////////////////////////////
// Members
/////////////////////////////////////////////////
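// 'root' mirrors the top-level record of the summarized schema. 'useAttributeLabels' controls whether
// attribute names contribute to transform costs (see computeSchemaLabelDistance()). 'datasetLabel' is a
// human-readable tag for the summarized dataset.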
SummaryNode root = null;
boolean useAttributeLabels = true;
String datasetLabel = "";
/////////////////////////////////////////////////
// Constructors, initializers
/////////////////////////////////////////////////
public SchemaStatisticalSummary() throws IOException {
}
public SchemaStatisticalSummary(String datasetLabel) throws IOException {
this.datasetLabel = datasetLabel;
}
public void setUseAttributeLabels(boolean useAttributeLabels) {
this.useAttributeLabels = useAttributeLabels;
}
/**
* Create the statistical summary object from data.
*/
public Schema createSummaryFromData(File f) throws IOException {
DataFileReader in = new DataFileReader(f, new GenericDatumReader());
try {
Schema s = in.getSchema();
//
// There has to be at least one data element for us to infer anything meaningful
//
Iterator it = in.iterator();
if (! it.hasNext()) {
throw new IOException("No contents");
}
//
// We can only infer schemas from top-level records, not Fixeds or Arrays.
//
Object firstRecord = it.next();
if (firstRecord instanceof GenericFixed ||
firstRecord instanceof GenericArray) {
throw new IOException("Not a top-level record");
}
// We assume the file's top-level Schema always represents a Record.
if (s.getType() != Schema.Type.RECORD) {
throw new IOException("Top-level Schema in the file must be of type Schema.Type.RECORD");
}
this.root = buildStructure(s, "ROOT");
//
// Iterate through all records and collect statistics on each Schema field.
//
GenericRecord cur = (GenericRecord) firstRecord;
int counter = 0;
do {
this.root.addData(cur);
counter++;
if (it.hasNext()) {
cur = (GenericRecord) it.next();
} else {
cur = null;
}
} while (cur != null);
this.root.computePreorder(-1);
return s;
} finally {
in.close();
}
}
/**
* This function reads in data and instantiates the SummaryNode hierarchy.
*/
public SummaryNode readAndCreate(DataInput in) throws IOException {
short nodeType = in.readShort();
SummaryNode sn = null;
switch (nodeType) {
case ARRAY_NODE: {
sn = new ArraySummaryNode();
break;
}
case BOOLEAN_NODE: {
sn = new BooleanSummaryNode();
break;
}
case BYTES_NODE: {
sn = new BytesSummaryNode();
break;
}
case DOUBLE_NODE: {
sn = new DoubleSummaryNode();
break;
}
case ENUM_NODE: {
sn = new EnumSummaryNode();
break;
}
case FIXED_NODE: {
sn = new FixedSummaryNode();
break;
}
case FLOAT_NODE: {
sn = new FloatSummaryNode();
break;
}
case INT_NODE: {
sn = new IntegerSummaryNode();
break;
}
case LONG_NODE: {
sn = new LongSummaryNode();
break;
}
case MAP_NODE: {
sn = new MapSummaryNode();
break;
}
case NULL_NODE: {
sn = new NullSummaryNode();
break;
}
case RECORD_NODE: {
sn = new RecordSummaryNode();
break;
}
case STRING_NODE: {
sn = new StringSummaryNode();
break;
}
case UNION_NODE: {
sn = new UnionSummaryNode();
break;
}
default:
throw new IOException("Unknown node type: " + nodeType);
}
sn.readFields(in);
return sn;
}
/**
* Build a Summary structure out of the given schema. Helper method.
*/
SummaryNode buildStructure(Schema s, String docStr) {
Schema.Type stype = s.getType();
if (stype == Schema.Type.ARRAY) {
return new ArraySummaryNode(buildStructure(s.getElementType(), s.getDoc()), docStr);
} else if (stype == Schema.Type.BOOLEAN) {
return new BooleanSummaryNode(docStr);
} else if (stype == Schema.Type.BYTES) {
return new BytesSummaryNode(docStr);
} else if (stype == Schema.Type.DOUBLE) {
return new DoubleSummaryNode(docStr);
} else if (stype == Schema.Type.ENUM) {
return new EnumSummaryNode(s.getFullName(), s.getEnumSymbols(), docStr);
} else if (stype == Schema.Type.FIXED) {
return new FixedSummaryNode(s.getFullName(), s.getFixedSize(), docStr);
} else if (stype == Schema.Type.FLOAT) {
return new FloatSummaryNode(docStr);
} else if (stype == Schema.Type.INT) {
return new IntegerSummaryNode(docStr);
} else if (stype == Schema.Type.LONG) {
return new LongSummaryNode(docStr);
} else if (stype == Schema.Type.MAP) {
return new MapSummaryNode(s.getValueType(), docStr);
} else if (stype == Schema.Type.NULL) {
return new NullSummaryNode(docStr);
} else if (stype == Schema.Type.RECORD) {
RecordSummaryNode rsn = new RecordSummaryNode(s.getFullName(), docStr);
for (Field f: s.getFields()) {
rsn.addField(f.name(), buildStructure(f.schema(), f.doc()));
}
return rsn;
} else if (stype == Schema.Type.STRING) {
return new StringSummaryNode(docStr);
} else if (stype == Schema.Type.UNION) {
UnionSummaryNode usn = new UnionSummaryNode(docStr);
for (Schema subschema: s.getTypes()) {
usn.addType(subschema.getType(), buildStructure(subschema, subschema.getDoc()));
}
return usn;
}
return null;
}
/////////////////////////////////////////////////////////
// Schema distance computation
/////////////////////////////////////////////////////////
/**
* Get the minimum mapping cost from a schema of size k to one of size m.
* This helps us avoid mapping computations that couldn't possibly produce
* a low-distance mapping.
*/
public static double getMinimumMappingCost(int k, int m) {
return Math.abs(k - m) * Math.min(MATCHCOST_CREATE, MATCHCOST_DELETE);
}
/**
* Find the best mapping between the current schema summary and the one provided
* by the parameter.
*/
public SchemaMapping getBestMapping(SchemaStatisticalSummary other) {
SummaryNode t1 = root;
SummaryNode t2 = other.root;
TreeMap<Integer, SummaryNode> t1NonLeafs = new TreeMap<Integer, SummaryNode>();
TreeMap<Integer, SummaryNode> t1Leafs = new TreeMap<Integer, SummaryNode>();
TreeMap<Integer, SummaryNode> t2NonLeafs = new TreeMap<Integer, SummaryNode>();
TreeMap<Integer, SummaryNode> t2Leafs = new TreeMap<Integer, SummaryNode>();
//
// Find all the non-leaf nodes
//
for (SummaryNode iNode: t1.preorder()) {
if (iNode.children().size() > 0) {
t1NonLeafs.put(iNode.preorderCount(), iNode);
} else {
t1Leafs.put(iNode.preorderCount(), iNode);
}
}
for (SummaryNode jNode: t2.preorder()) {
if (jNode.children().size() > 0) {
t2NonLeafs.put(jNode.preorderCount(), jNode);
} else {
t2Leafs.put(jNode.preorderCount(), jNode);
}
}
//
// Start by computing all the potential 1:1 leaf-level match costs.
//
List<DistancePair[]> allCosts = new ArrayList<DistancePair[]>();
Set<DistancePair> allKnownCostPairs = new TreeSet<DistancePair>();
for (SummaryNode iNode: t1.preorder()) {
int iIdx = iNode.preorderCount();
DistancePair fromI[] = null;
if (t1NonLeafs.get(iIdx) == null) {
List<DistancePair> costs = new ArrayList<DistancePair>();
for (SummaryNode jNode: t2.preorder()) {
int jIdx = jNode.preorderCount();
if (t2NonLeafs.get(jIdx) == null) {
DistancePair dp = new DistancePair(iNode.transformCost(jNode), iNode, jNode);
costs.add(dp);
allKnownCostPairs.add(dp);
}
}
costs.add(new DistancePair(iNode.deleteCost(), iNode, null));
fromI = costs.toArray(new DistancePair[costs.size()]);
Arrays.sort(fromI);
}
allCosts.add(fromI);
}
//
// Now pass those costs to the mapping algorithm.
// Select which mapping algorithm we want to use. For now, it's 'greedy'.
//
return findGreedyMapping(other, t1, t2, t1Leafs, t2Leafs, t1NonLeafs, t2NonLeafs, allKnownCostPairs);
/**
boolean performTraditionalMapping = false;
if (performTraditionalMapping) {
return findTraditionalMapping(other, t1, t2, t1Leafs, t2Leafs, t1NonLeafs, t2NonLeafs, allCosts);
} else {
return findGreedyMapping(other, t1, t2, t1Leafs, t2Leafs, t1NonLeafs, t2NonLeafs, allKnownCostPairs);
}
**/
}
/**
* findTraditionalMapping() tries the best k permutations of matches and returns the best one.
* The number of permutations can grow rapidly as the sizes of the two schemas grow, so this method
* can be very time-consuming.
*/
/**
SchemaMapping findTraditionalMapping(SchemaStatisticalSummary other, SummaryNode t1, SummaryNode t2, Map<Integer, SummaryNode> t1Leafs, Map<Integer, SummaryNode> t2Leafs, Map<Integer, SummaryNode> t1NonLeafs, Map<Integer, SummaryNode> t2NonLeafs, List<DistancePair[]> allCosts) {
//
// Figure out how far down into each attr's match-list we can go while only evaluating the
// estimated top-k-scoring schema matches. (Estimated by combining independent 1:1 match scores;
// no enforcement of the pigeonhole constraint.)
//
int MAX_CANDIDATES = 100000;
int numToPeek[] = new int[allCosts.size()];
for (int i = 0; i < numToPeek.length; i++) {
if (allCosts.get(i) == null) {
numToPeek[i] = 0;
} else {
numToPeek[i] = 1;
}
}
int numCandidates = 1;
System.err.println("Num elts: " + numToPeek.length);
do {
int peekIndex = -1;
double cheapestPeek = Double.MAX_VALUE;
for (int i = 0; i < numToPeek.length; i++) {
if (allCosts.get(i) != null &&
numToPeek[i] < allCosts.get(i).length) {
double candidatePeekValue = allCosts.get(i)[numToPeek[i]].getCost();
if (candidatePeekValue < cheapestPeek) {
cheapestPeek = candidatePeekValue;
peekIndex = i;
}
}
}
if (peekIndex >= 0) {
numToPeek[peekIndex]++;
} else {
break;
}
numCandidates = 1;
for (int i = 0; i < numToPeek.length; i++) {
if (numToPeek[i] >= 1) {
numCandidates *= numToPeek[i];
}
}
} while (numCandidates < MAX_CANDIDATES);
System.err.println("All cost size: " + allCosts.size() + ", number of candidates examined: " + numCandidates);
System.err.println();
numCandidates = Math.max(MAX_CANDIDATES, numCandidates);
//
// Now the numToPeek vector tells us how many steps down to go in each attr's
// ranked list of preferred matches. The product of all of these determines the # of candidates.
//
int curPeek[] = new int[numToPeek.length];
for (int i = 0; i < curPeek.length; i++) {
if (numToPeek[i] == 0) {
curPeek[i] = 0;
} else {
curPeek[i] = 1;
}
}
//
// Now go through all the possible configurations of top-k mappings.
//
// We optimize for the common case in which we have two near-flat hierarchies
//
DistancePair bestMatchConfig[] = new DistancePair[curPeek.length];
DistancePair matchConfig[] = new DistancePair[curPeek.length];
double bestCost = Double.MAX_VALUE;
boolean peeksRemain = numCandidates > 0;
long startTime = System.currentTimeMillis();
int numIters = 0;
while (peeksRemain) {
numIters++;
////////////////////////////////////////
// Evaluate this configuration ("peek")
////////////////////////////////////////
//
// 1. Build a proper 'match configuration' out of the leaf-level 1:1 'curPeek'.
// That means we generate record-level correspondences when justified by full
// child-correspondences
//
for (SummaryNode iNode: t1.preorder()) {
int iNodeIdx = iNode.preorderCount();
matchConfig[iNodeIdx] = null;
DistancePair[] allINodeMatches = allCosts.get(iNodeIdx);
if (allINodeMatches != null) {
matchConfig[iNodeIdx] = allINodeMatches[curPeek[iNodeIdx]-1];
}
}
//
// 2. Modify the current matchConfig s.t. if ALL of a non-leaf's children match ALL of
// the children of a non-leaf, then the two non-leafs also match.
// Because of the potential record hierarchy, this procedure needs to be repeated until
// there is an iteration in which no new matches are found (or until the roots are matched).
//
for (Map.Entry<Integer, SummaryNode> elt: t1NonLeafs.entrySet()) {
SummaryNode iNode = elt.getValue();
if (matchConfig[iNode.preorderCount()] != null) {
continue;
}
// For each child of this t1 internal node, place the matching node's parent into a set
TreeMap<Integer, SummaryNode> observedMatchParents = new TreeMap<Integer, SummaryNode>();
for (SummaryNode iChild: iNode.children()) {
int iChildIdx = iChild.preorderCount();
DistancePair jMatch = matchConfig[iChildIdx];
if (jMatch != null) {
if (jMatch.getNode() == null) {
observedMatchParents.put(-1, iChild);
} else {
SummaryNode jMatchParent = jMatch.getNode().getParent();
observedMatchParents.put(jMatchParent.preorderCount(), jMatchParent);
}
}
}
// If the parent-set has just one element, then internal node iNode
// should be matched to the singleton elt in the parent-set.
if (observedMatchParents.size() == 1) {
int matchIdx = observedMatchParents.firstKey().intValue();
if (matchIdx >= 0) {
SummaryNode jMatchParent = observedMatchParents.get(matchIdx);
matchConfig[iNode.preorderCount()] = new DistancePair(0, iNode, jMatchParent);
}
}
}
//
// 3. Compute the total match costs.
// a. The first component is the TRANSFORM costs of the discovered 1:1 leaf matches.
// (Valid matches among non-leafs are free.)
//
double total = 0;
for (int iNodeIdx = 0; iNodeIdx < matchConfig.length; iNodeIdx++) {
if (matchConfig[iNodeIdx] != null) {
// Get the transform cost
total += matchConfig[iNodeIdx].getCost();
}
}
//
// 3b. Compute DELETE penalties. These are elts in t1 that are NOT MATCHED to anything
// in t2. Non-leaf nodes that are unmatched DO incur penalties.
//
// While we're there, compute the set of items in t2 that DO have a matched elt.
//
int numDuplicates = 0;
HashSet<Integer> observedT2Nodes = new HashSet<Integer>();
for (SummaryNode iNode: t1.preorder()) {
int iNodeIdx = iNode.preorderCount();
if (matchConfig[iNodeIdx] == null || matchConfig[iNodeIdx].getNode() == null) {
total += iNode.deleteCost();
} else {
int jIdx = matchConfig[iNodeIdx].getNode().preorderCount();
if (observedT2Nodes.contains(jIdx)) {
numDuplicates++;
} else {
observedT2Nodes.add(jIdx);
}
}
}
//
// 3c. Compute CREATE penalties. These count for any items in the target schema t2
// that have gone unmapped.
//
for (SummaryNode jNode: t2.preorder()) {
int jIdx = jNode.preorderCount();
if (! observedT2Nodes.contains(jIdx)) {
total += jNode.createCost();
}
}
//
// 4. Impose a penalty for duplicate mappings in t2.
//
//
// Is it the best mapping so far?
//
if (total < bestCost) {
bestCost = total;
System.arraycopy(matchConfig, 0, bestMatchConfig, 0, bestMatchConfig.length);
}
/////////////////////////////////////////////
// Find the next configuration to evaluate (leaf-level "peek").
// We try to do a 'breadth-first search' rather than go deep on
// a single peeklist. This makes it easier to find the best match sooner,
// and thus abort the process early.
/////////////////////////////////////////////
peeksRemain = false;
int minSeen = Integer.MAX_VALUE;
int minIndex = -1;
for (int i = 0; i < curPeek.length; i++) {
if (curPeek[i] == 0) {
continue;
} else {
if (curPeek[i] < numToPeek[i]) {
curPeek[i]++;
for (int j = i-1; j >= 0; j--) {
if (curPeek[j] > 0) {
curPeek[j] = 1;
}
}
peeksRemain = true;
break;
}
}
}
}
long endTime = System.currentTimeMillis();
System.err.println("Evaluting peeks: " + ((endTime - startTime) / 1000.0) + " over " + numIters + " iterations.");
//
// ALMOST DONE: We have the best match. Now we translate it into a series of SchemaMappingOps
//
List<SchemaMappingOp> bestOps = new ArrayList<SchemaMappingOp>();
HashSet<Integer> bestMapTargets = new HashSet<Integer>();
for (int i = 0; i < bestMatchConfig.length; i++) {
if (bestMatchConfig[i] != null && bestMatchConfig[i].getNode() != null) {
int dstIdx = bestMatchConfig[i].getNode().preorderCount();
bestOps.add(new SchemaMappingOp(SchemaMappingOp.TRANSFORM_OP, this, i, other, dstIdx));
bestMapTargets.add(dstIdx);
} else {
bestOps.add(new SchemaMappingOp(SchemaMappingOp.DELETE_OP, this, i));
}
}
for (SummaryNode jNode: t2.preorder()) {
int jIdx = jNode.preorderCount();
if (jNode.children().size() == 0 && ! bestMapTargets.contains(jIdx)) {
bestOps.add(new SchemaMappingOp(SchemaMappingOp.CREATE_OP, other, jIdx));
}
}
//
// All done!
//
return new SchemaMapping(this, other, bestCost, bestOps);
}
**/
/**
* Greedy Mapping is sloppy, but very fast. It repeatedly accepts the best-looking pairwise
* match, until there is nothing left to match. Seems to work well so far, but needs to be
* tested more.
*/
SchemaMapping findGreedyMapping(SchemaStatisticalSummary other, SummaryNode t1, SummaryNode t2, Map<Integer, SummaryNode> t1Leafs, Map<Integer, SummaryNode> t2Leafs, Map<Integer, SummaryNode> t1NonLeafs, Map<Integer, SummaryNode> t2NonLeafs, Set<DistancePair> allKnownCostPairs) {
int totalSrcs = t1Leafs.size();
int totalDsts = t2Leafs.size();
Set<Integer> observedSrcs = new TreeSet<Integer>();
Set<Integer> observedDsts = new TreeSet<Integer>();
List<DistancePair> matching = new ArrayList<DistancePair>();
List<SchemaMappingOp> outputOps = new ArrayList<SchemaMappingOp>();
double totalCost = 0;
//
// Find all the leaf-level matches
//
Map<Integer, SummaryNode> transformMap = new TreeMap<Integer, SummaryNode>();
for (DistancePair dp: allKnownCostPairs) {
int srcId = dp.getSrc().preorderCount();
int dstId = dp.getNode().preorderCount();
if ((! observedSrcs.contains(srcId)) && (! observedDsts.contains(dstId))) {
matching.add(dp);
observedSrcs.add(srcId);
observedDsts.add(dstId);
outputOps.add(new SchemaMappingOp(SchemaMappingOp.TRANSFORM_OP, this, srcId, other, dstId, dp.getCost()));
transformMap.put(srcId, dp.getNode());
totalCost += dp.getCost();
if (matching.size() >= Math.min(totalSrcs, totalDsts)) {
break;
}
}
}
//
// Look for internal nodes that should be matched. If ALL of an internal node's children
// match ALL of another internal node's children, then the two internal nodes also match.
//
for (Map.Entry<Integer, SummaryNode> elt: t1NonLeafs.entrySet()) {
SummaryNode iNode = elt.getValue();
SortedSet<Integer> knownDstParents = new TreeSet<Integer>();
for (SummaryNode iChild: iNode.children()) {
int iChildIdx = iChild.preorderCount();
SummaryNode dstNode = transformMap.get(iChildIdx);
if (dstNode != null) {
knownDstParents.add(dstNode.getParent().preorderCount());
}
}
// There's just one parent of the destination nodes, so we have found an internal node match.
if (knownDstParents.size() == 1) {
Integer dstIdx = knownDstParents.first();
SummaryNode dstNode = t2NonLeafs.get(dstIdx);
outputOps.add(new SchemaMappingOp(SchemaMappingOp.TRANSFORM_OP, this, iNode.preorderCount(), other, dstIdx, 0));
observedSrcs.add(iNode.preorderCount());
observedDsts.add(dstIdx);
}
}
//
// If a node is in the source, but not the dest, then we need to DELETE it.
// Compute the DELETE costs here.
//
for (SummaryNode iNode: t1.preorder()) {
int iNodeIdx = iNode.preorderCount();
if (! observedSrcs.contains(iNodeIdx)) {
totalCost += iNode.deleteCost();
outputOps.add(new SchemaMappingOp(SchemaMappingOp.DELETE_OP, this, iNodeIdx, iNode.deleteCost()));
}
}
//
// If a node is in the dest, but not the source, then we need to CREATE it.
// Compute the CREATE costs here.
//
for (SummaryNode jNode: t2.preorder()) {
int jNodeIdx = jNode.preorderCount();
if (! observedDsts.contains(jNodeIdx)) {
totalCost += jNode.createCost();
outputOps.add(new SchemaMappingOp(SchemaMappingOp.CREATE_OP, other, jNodeIdx, jNode.createCost()));
}
}
return new SchemaMapping(this, other, totalCost, outputOps);
}
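/***************************************
* DistancePair records the cost of matching a source SummaryNode to a target SummaryNode (the target is
* null for a deletion). Pairs sort by ascending cost, with ties broken by preorder position.
***************************************/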
class DistancePair implements Comparable {
double cost;
SummaryNode src;
SummaryNode target;
public DistancePair(double cost, SummaryNode src, SummaryNode target) {
this.cost = cost;
this.src = src;
this.target = target;
}
public int compareTo(Object o) {
DistancePair other = (DistancePair) o;
if (cost < other.cost) {
return -1;
} else if (cost > other.cost) {
return 1;
} else {
int cmp = src.preorderCount() - other.src.preorderCount();
if (cmp == 0) {
// The target may be null (a delete-cost pair), so compare defensively.
int thisTarget = (target == null) ? -1 : target.preorderCount();
int otherTarget = (other.target == null) ? -1 : other.target.preorderCount();
cmp = thisTarget - otherTarget;
}
return cmp;
}
}
public double getCost() {
return cost;
}
public SummaryNode getSrc() {
return src;
}
public SummaryNode getNode() {
return target;
}
public int getIdx() {
return target.preorderCount();
}
public String toString() {
if (target != null) {
return "" + target.getDesc(false) + " cost=" + cost;
} else {
return " DELETE cost=" + cost;
}
}
}
////////////////////////////////////////////////
// String representation of the overall summary object
////////////////////////////////////////////////
public String getDatasetLabel() {
return datasetLabel;
}
public String dumpSummary() {
return this.root.dumpSummary(0);
}
public String getDesc(int nodeid) {
return root.getDesc(nodeid);
}
public String getLabel(int nodeid) {
return root.getLabel(nodeid);
}
public String getTypeDesc(int nodeid) {
return root.getTypeDesc(nodeid);
}
public String getDocStr(int nodeid) {
return root.getDocStr(nodeid);
}
////////////////////////////////////////////////
// Serialization/deserialization
////////////////////////////////////////////////
public void write(DataOutput out) throws IOException {
out.write(MAGIC);
out.write(VERSION);
root.write(out);
UTF8.writeString(out, datasetLabel);
}
public void readFields(DataInput in) throws IOException {
byte magic = in.readByte();
if (magic != MAGIC) {
throw new IOException("Unexpected magic byte in serialized summary: " + magic);
}
byte version = in.readByte();
if (version != VERSION) {
throw new IOException("Unsupported summary version: " + version);
}
this.root = readAndCreate(in);
this.root.computePreorder(-1);
this.datasetLabel = UTF8.readString(in);
}
}