/*
* Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
*
* Cloudera, Inc. licenses this file to you under the Apache License,
* Version 2.0 (the "License"). You may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for
* the specific language governing permissions and limitations under the
* License.
*/
package com.cloudera.recordbreaker.schemadict;
import java.io.File;
import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;
import java.util.Iterator;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.HashSet;
import java.util.TreeSet;
import java.util.Set;
import java.util.SortedSet;
import java.util.Hashtable;
import java.util.Map;
import java.lang.Math;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
import java.nio.ByteBuffer;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.reflect.ReflectDatumReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericFixed;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.UTF8;
import org.apache.hadoop.io.Text;
/********************************************
* The SchemaStatisticalSummary object is designed to mirror the structure of an input Schema.
* In addition to the name and type information associated with a Schema object, it keeps statistical data
* about the actual data values observed for each Schema element.
*
* This class is intended to be used in the following way:
* 1) Instantiate a SchemaStatisticalSummary object, optionally with a human-readable dataset label.
* 2) Call createSummaryFromData() on an Avro data file whose top-level schema is a record. This builds a
* SummaryNode hierarchy that mirrors the schema and accumulates statistics over every record in the file.
* 3) The resulting summary can then be compared to other SchemaStatisticalSummary objects with getBestMapping(),
* or serialized and deserialized with write() and readFields().
*
********************************************/
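//
// A sketch of typical usage (the file names below are placeholders, not part of this library):
//
//   SchemaStatisticalSummary s1 = new SchemaStatisticalSummary("dataset-A");
//   Schema schema1 = s1.createSummaryFromData(new File("a.avro"));
//   SchemaStatisticalSummary s2 = new SchemaStatisticalSummary("dataset-B");
//   Schema schema2 = s2.createSummaryFromData(new File("b.avro"));
//   SchemaMapping mapping = s1.getBestMapping(s2);
//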
public class SchemaStatisticalSummary implements Writable {
final static byte MAGIC = (byte) 0xa1;
final static byte VERSION = (byte) 1;
final static int MAX_SUMMARY_SAMPLES = 50;
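// Edit costs used by the schema-mapping code below: matching two fields of different types is far more
// expensive than simply creating or deleting a field.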
final static double MATCHCOST_TYPE_CLASH = 1 * 10 * 1000;
final static double MATCHCOST_CREATE = 1 * 1000;
final static double MATCHCOST_DELETE = 1 * 1000;
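// Per-type tags written by each SummaryNode subclass's write() method and switched on by readAndCreate()
// during deserialization.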
final static short ARRAY_NODE = 1;
final static short BOOLEAN_NODE = 2;
final static short BYTES_NODE = 3;
final static short DOUBLE_NODE = 4;
final static short ENUM_NODE = 5;
final static short FIXED_NODE = 6;
final static short FLOAT_NODE = 7;
final static short INT_NODE = 8;
final static short LONG_NODE = 9;
final static short MAP_NODE = 10;
final static short NULL_NODE = 11;
final static short RECORD_NODE = 12;
final static short STRING_NODE = 13;
final static short UNION_NODE = 14;
/////////////////////////////////////////////////
// Inner classes
/////////////////////////////////////////////////
/*****************************************************
* SummaryNode is a generic statistical summary object for a given elt in the
* hierarchy. A single tuple in the source data may yield a number of nested
* SummaryNodes, all rooted at a GenericRecord.
*
* The hierarchy is instantiated by examining the schema. Each new data item
* results in a call to SummaryNode.addData(), in which the data item is passed in.
******************************************************/
abstract class SummaryNode implements Cloneable {
SummaryNode parent = null;
int preorderIdx;
int numData;
String docStr = "";
public SummaryNode() {
}
public SummaryNode(String docStr) {
this.docStr = docStr;
}
//////////////////////////////////////////
// Methods for constructing the summary-node tree
//////////////////////////////////////////
public void addData(Object obj) {
if (obj instanceof Boolean) {
this.addData((Boolean) obj);
} else if (obj instanceof GenericArray) {
this.addData((GenericArray) obj);
} else if (obj instanceof Double) {
this.addData((Double) obj);
} else if (obj instanceof Float) {
this.addData((Float) obj);
} else if (obj instanceof GenericFixed) {
this.addData((GenericFixed) obj);
} else if (obj instanceof Integer) {
this.addData((Integer) obj);
} else if (obj instanceof Long) {
this.addData((Long) obj);
} else if (obj instanceof Map) {
this.addData((Map) obj);
} else if (obj instanceof ByteBuffer) {
this.addData((ByteBuffer) obj);
} else if (obj instanceof GenericRecord) {
this.addData((GenericRecord) obj);
} else if (obj instanceof Utf8) {
this.addData((Utf8) obj);
} else if (obj instanceof String) {
this.addData((String) obj);
}
}
// Overridden on per-subclass basis.
public void addData(Boolean b) {};
public void addData(GenericArray g) {};
public void addData(Double d) {};
public void addData(Float f) {};
public void addData(Integer i) {};
public void addData(Long l) {};
public void addData(Map m) {};
public void addData(ByteBuffer bb) {};
public void addData(GenericRecord g) {};
public void addData(Utf8 u) {};
public void addData(String s) {};
///////////////////////////////////////////////
// Tree-manipulation and info methods
///////////////////////////////////////////////
/**
* How many nodes in this subtree?
*/
public int size() {
int total = 0;
for (SummaryNode child: children()) {
total += child.size();
}
return total + 1;
}
/**
* Setters/getters
*/
SummaryNode getParent() {
return parent;
}
void setParent(SummaryNode parent) {
this.parent = parent;
}
public List<SummaryNode> children() {
return new ArrayList<SummaryNode>();
}
public int preorderCount() {
return preorderIdx;
}
public SummaryNode parent() {
return parent;
}
/**
* Dealing with paths and node orderings
*/
public int computePreorder(int lastIdx) {
lastIdx++;
this.preorderIdx = lastIdx;
for (SummaryNode child: children()) {
lastIdx = child.computePreorder(lastIdx);
child.setParent(this);
}
return lastIdx;
}
void preorder(List<SummaryNode> soFar) {
soFar.add(this);
for (SummaryNode child: children()) {
child.preorder(soFar);
}
}
public List<SummaryNode> preorder() {
List<SummaryNode> l = new ArrayList<SummaryNode>();
preorder(l);
return l;
}
public List<SummaryNode> pathToRoot() {
List<SummaryNode> path = new ArrayList<SummaryNode>();
SummaryNode cur = this;
while (cur != null) {
path.add(cur);
cur = cur.getParent();
}
return path;
}
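/**
* Currently identical to pathToRoot(): returns the full path from this node up to the root.
*/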
public List<SummaryNode> getLastNodeOnPath() {
List<SummaryNode> path = new ArrayList<SummaryNode>();
SummaryNode cur = this;
while (cur != null) {
path.add(cur);
cur = cur.getParent();
}
return path;
}
/**
* Useful in testing whether two fields are referring to the same thing.
* Levenshtein edit distance is great, but we would like a value that ranges 0..1.
*
* To compute this, note that the LD is at least abs(len(s1)-len(s2)). It is also at
* most max(len(s1), len(s2)). So we normalize LD by that range.
* For example, "color" vs "colour" has LD 1, lower bound 1, and upper bound 6, so the
* normalized distance is (1 - 1) / (6 - 1) = 0.
*/
double normalizedLevenshteinDistance(String s1, String s2) {
int rawLD = computeLevenshteinDistance(s1, s2);
int lowerBound = Math.abs(s1.length() - s2.length());
int upperBound = Math.max(s1.length(), s2.length());
int range = upperBound - lowerBound;
if (range == 0) {
// The bounds coincide (e.g., one string is empty), so the ratio is undefined;
// treat identical strings as distance 0 and anything else as distance 1.
return (rawLD == 0) ? 0 : 1;
}
return (rawLD - lowerBound) / (1.0 * range);
}
/**
* The classic string edit distance algorithm rides again.
*/
int computeLevenshteinDistance(String s1, String s2) {
int s1Length = s1.length();
int s2Length = s2.length();
if (s1Length == 0) {
return s2Length;
}
if (s2Length == 0) {
return s1Length;
}
int[][] d = new int[s1Length + 1][s2Length + 1];
for (int i = 0; i <= s1Length; i++) {
d[i][0] = i;
}
for (int j = 0; j <= s2Length; j++) {
d[0][j] = j;
}
for (int i = 1; i <= s1Length; i++) {
char s1Char = s1.charAt(i-1);
for (int j = 1; j <= s2Length; j++) {
char s2Char = s2.charAt(j-1);
int cost = 0;
if (s1Char != s2Char) {
cost = 1;
}
d[i][j] = Math.min(d[i-1][j]+1,
Math.min(d[i][j-1]+1, d[i-1][j-1] + cost));
}
}
return d[s1Length][s2Length];
}
///////////////////////////////////////////////
// Methods for string representation
///////////////////////////////////////////////
/**
* Helper method for rendering a string version of the data
*/
String prefixString(int prefix) {
StringBuffer buf = new StringBuffer();
for (int i = 0; i < prefix; i++) {
buf.append(" ");
}
return buf.toString();
}
/**
* Render a string version of the data
*/
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + "\n";
}
public abstract String getTypeDesc();
/**
* Find the right node and obtain a description of it.
*/
public abstract String getDesc(boolean verbose);
public String getDesc(int nodeid) {
if (nodeid == preorderIdx) {
return getDesc(false);
} else {
for (SummaryNode child: children()) {
String desc = child.getDesc(nodeid);
if (desc != null) {
return desc;
}
}
}
return null;
}
public String getLabel(int nodeid) {
if (nodeid == preorderIdx) {
return getLabel();
} else {
for (SummaryNode child: children()) {
String label = child.getLabel(nodeid);
if (label != null) {
return label;
}
}
}
return null;
}
public String getTypeDesc(int nodeid) {
if (nodeid == preorderIdx) {
return getTypeDesc();
} else {
for (SummaryNode child: children()) {
String typedesc = child.getTypeDesc(nodeid);
if (typedesc != null) {
return typedesc;
}
}
}
return null;
}
public String getDocStr(int nodeid) {
if (nodeid == preorderIdx) {
return docStr;
} else {
for (SummaryNode child: children()) {
String docstr = child.getDocStr(nodeid);
if (docstr != null) {
return docstr;
}
}
}
return null;
}
/**
* Find the "label" for the current node. Since the top-level element in the
* NodeSummary hierarchy is a record, we know that every element has a label.
* The getLabel() function goes up the tree to the root, constructing the
* dotted label sequence all the way.
*/
public String getLabel() {
if (parent != null) {
return parent.getLabel("", this);
} else {
return "<root>";
}
}
public String getLabel(String labelSoFar, SummaryNode src) {
if (parent != null) {
return parent.getLabel(labelSoFar, this);
} else {
return labelSoFar;
}
}
///////////////////////////////////////////////
// Cost functions for schema matching
///////////////////////////////////////////////
/**
* Figure out basic normalized string edit distance to
* see if the schema labels match. If 'useAttributeLabels'
* is set to false, then this distance is always zero.
*/
double computeSchemaLabelDistance(String l1, String l2) {
if (! useAttributeLabels) {
return 0;
} else {
if (l1.indexOf(".") >= 0) {
l1 = l1.substring(l1.lastIndexOf(".")+1);
}
if (l2.indexOf(".") >= 0) {
l2 = l2.substring(l2.lastIndexOf(".")+1);
}
return normalizedLevenshteinDistance(l1, l2);
}
}
/**
* The default non-type-specific way of performing schema matching is to
* just compare the attribute labels. We can also examine data distributions,
* but this is only possible in the subclasses' overriding transformCost() methods.
*/
public double transformCost(SummaryNode other) {
if (this.getClass() == other.getClass()) {
// Examine the field name for a schema-label distance
return computeSchemaLabelDistance(this.getLabel(), other.getLabel());
} else {
return MATCHCOST_TYPE_CLASH;
}
}
public double deleteCost() {
return MATCHCOST_DELETE;
}
public double createCost() {
return MATCHCOST_CREATE;
}
///////////////////////////////////////////////
// Serialization/deserialization
///////////////////////////////////////////////
public abstract void write(DataOutput out) throws IOException;
public abstract void readFields(DataInput in) throws IOException;
}
/*****************************************************
* Store statistical summary of observed arrays. Basically, store length information and # times seen.
****************************************************/
class ArraySummaryNode extends SummaryNode {
int totalSize;
SummaryNode eltSummary;
public ArraySummaryNode() {
}
public ArraySummaryNode(SummaryNode eltSummary, String docStr) {
super(docStr);
this.eltSummary = eltSummary;
}
/**
* Add one observed array: bump the counts and hand each element to the element summary.
*/
public void addData(GenericArray data) {
numData++;
totalSize += data.size();
for (Iterator it = data.iterator(); it.hasNext(); ) {
eltSummary.addData(it.next());
}
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", avgSize: " + (totalSize / (1.0 * numData)) + "\n" + eltSummary.dumpSummary(prefix+2);
}
public String getTypeDesc() {
return "ARRAY";
}
public String getDesc(boolean verbose) {
String desc = "ARRAY";
if (verbose) {
desc += "(numData: " + numData + ", avgSize: " + (totalSize / (1.0 * numData)) + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(ARRAY_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(totalSize);
eltSummary.write(out);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.totalSize = in.readInt();
this.eltSummary = readAndCreate(in);
}
}
/*****************************************************
* Store statistical summary of observed boolean field. Store # times seen and distribution true vs false
****************************************************/
class BooleanSummaryNode extends SummaryNode {
int numTrue;
int numFalse;
public BooleanSummaryNode() {
}
public BooleanSummaryNode(String docStr) {
super(docStr);
}
public void addData(Boolean b) {
numData++;
if (b.booleanValue()) {
numTrue++;
} else {
numFalse++;
}
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", numTrue: " + numTrue + ", numFalse: " + numFalse + "\n";
}
public String getTypeDesc() {
return "BOOLEAN";
}
public String getDesc(boolean verbose) {
String desc = "BOOLEAN";
if (verbose) {
desc += "(numData: " + numData + ", numTrue: " + numTrue + ", numFalse: " + numFalse + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(BOOLEAN_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(numTrue);
out.writeInt(numFalse);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.numTrue = in.readInt();
this.numFalse = in.readInt();
}
}
/*****************************************************
* Store statistical summary of observed Bytes field. Store # times seen and # bytes seen.
****************************************************/
class BytesSummaryNode extends SummaryNode {
int totalSize = 0;
public BytesSummaryNode() {
}
public BytesSummaryNode(String docStr) {
super(docStr);
}
public void addData(ByteBuffer bb) {
numData++;
totalSize += bb.remaining();
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", totalSize: " + totalSize + "\n";
}
public String getTypeDesc() {
return "BYTES";
}
public String getDesc(boolean verbose) {
String desc = "BYTES";
if (verbose) {
desc += "(numData: " + numData + ", totalSize: " + totalSize + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(BYTES_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(totalSize);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.totalSize = in.readInt();
}
}
/*****************************************************
* Store statistical summary of observed Double field. Store # times seen and total value
****************************************************/
class DoubleSummaryNode extends SummaryNode {
double total;
public DoubleSummaryNode() {
}
public DoubleSummaryNode(String docStr) {
super(docStr);
}
public void addData(Double d) {
numData++;
total += d.doubleValue();
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", avg: " + (total / (1.0 * numData)) + "\n";
}
public String getTypeDesc() {
return "DOUBLE";
}
public String getDesc(boolean verbose) {
String desc = "DOUBLE";
if (verbose) {
desc += "(numData: " + numData + ", avg: " + (total / (1.0 * numData)) + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(DOUBLE_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeDouble(total);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.total = in.readDouble();
}
}
/*****************************************************
* Store statistical summary of observed Enumerated Type field. Store # times seen and statistics on how often
* each enum-value is seen.
****************************************************/
class EnumSummaryNode extends SummaryNode {
String name;
Map<String, Integer> symbolCounts = new HashMap<String, Integer>();
public EnumSummaryNode() {
}
public EnumSummaryNode(String name, List<String> symbols, String docStr) {
super(docStr);
this.name = name;
for (String symbol: symbols) {
this.symbolCounts.put(symbol, 1);
}
}
public void addData(String s) {
numData++;
Integer count = symbolCounts.get(s);
symbolCounts.put(s, (count == null) ? 1 : count + 1);
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
StringBuffer buf = new StringBuffer();
buf.append(prefixString(prefix) + "numData: " + numData + " =>\n");
for (String symbol: symbolCounts.keySet()) {
buf.append(prefixString(prefix+2) + symbol + ": " + symbolCounts.get(symbol) + "\n");
}
return buf.toString();
}
public String getTypeDesc() {
return "ENUM";
}
public String getDesc(boolean verbose) {
String desc = "ENUM";
if (verbose) {
desc += "(numData: " + numData + ", numSymbols: " + symbolCounts.size() + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(ENUM_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(symbolCounts.size());
for (String symbol: symbolCounts.keySet()) {
new Text(symbol).write(out);
out.writeInt(symbolCounts.get(symbol));
}
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
symbolCounts = new HashMap<String, Integer>();
int numElts = in.readInt();
for (int i = 0; i < numElts; i++) {
Text symbol = new Text();
symbol.readFields(in);
Integer count = in.readInt();
symbolCounts.put(symbol.toString(), count);
}
}
}
/*****************************************************
* Store statistical summary of observed GenericFixed field. Store # times seen and byte length information. Eventually,
* store info on the byte content, too.
****************************************************/
class FixedSummaryNode extends SummaryNode {
String name;
int size;
int total;
public FixedSummaryNode() {
}
public FixedSummaryNode(String name, int size, String docStr) {
super(docStr);
this.name = name;
this.size = size;
this.total = 0;
}
public void addData(GenericFixed data) {
byte d[] = data.bytes();
total += d.length;
numData++;
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "size: " + size + ", total: " + total + ", numData: " + numData;
}
public String getTypeDesc() {
return "FIXED";
}
public String getDesc(boolean verbose) {
String desc = "FIXED";
if (verbose) {
desc += "(numData: " + numData + ", size: " + size + ", total: " + total + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(FIXED_NODE);
out.writeInt(numData);
new Text(name).write(out);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(size);
out.writeInt(total);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.name = Text.readString(in);
this.docStr = UTF8.readString(in);
this.size = in.readInt();
this.total = in.readInt();
}
}
/*****************************************************
* Store statistical summary of observed Float field. Store # times seen and total value
****************************************************/
class FloatSummaryNode extends SummaryNode {
float total;
public FloatSummaryNode() {
}
public FloatSummaryNode(String docStr) {
super(docStr);
}
public void addData(Float f) {
numData++;
total += f.floatValue();
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", avg: " + (total / (1.0 * numData)) + "\n";
}
public String getTypeDesc() {
return "FLOAT";
}
public String getDesc(boolean verbose) {
String desc = "FLOAT";
if (verbose) {
desc += "(numData: " + numData + ", avg: " + (total / (1.0 * numData)) + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(FLOAT_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeFloat(total);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.total = in.readFloat();
}
}
/*****************************************************
* Store statistical summary of observed Integer field.
* Store total value, num data elements, and a sample of actual data elts
****************************************************/
class IntegerSummaryNode extends SummaryNode {
int total;
List<Integer> samples = new ArrayList<Integer>();
public IntegerSummaryNode() {
}
public IntegerSummaryNode(String docStr) {
super(docStr);
}
public void addData(Integer i) {
numData++;
total += i.intValue();
if (samples.size() < MAX_SUMMARY_SAMPLES) {
samples.add(i);
}
}
///////////////////////////////////////////////
// Cost functions for schema matching
///////////////////////////////////////////////
public double transformCost(SummaryNode other) {
if (this.getClass() == other.getClass()) {
double schemaLabelDistance = computeSchemaLabelDistance(this.getLabel(), other.getLabel());
double klDivergence = computeSampleKLDivergence((IntegerSummaryNode) other);
return schemaLabelDistance + klDivergence;
} else {
return MATCHCOST_TYPE_CLASH;
}
}
/**
* This computes the Kullback-Leibler divergence between two int distributions. It
* measures how much the two integer distributions differ. Useful for testing whether
* they should be matched.
*
* Assumes the two distributions are Gaussians, for which
* KL(p1 || p2) = log(stddev2 / stddev1) + (variance1 + (mean1 - mean2)^2) / (2 * variance2) - 1/2.
*/
public double computeSampleKLDivergence(IntegerSummaryNode other) {
double mean1 = total / (1.0 * numData);
double mean2 = other.total / (1.0 * other.numData);
double stddev1 = computeStddev();
double stddev2 = other.computeStddev();
double variance1 = Math.pow(stddev1, 2);
double variance2 = Math.pow(stddev2, 2);
return Math.log(stddev2 / stddev1) + ((variance1 + Math.pow(mean1 - mean2, 2)) / (2 * variance2)) - 0.5;
}
/**
* Compute the standard deviation of the distribution of integers in this summary node.
* Note that if the sample is smaller than the genuine data, we take the
* "sample standard deviation", not the true stddev.
*/
public double computeStddev() {
double mean = total / (1.0 * numData);
double sumSquaredDev = 0;
for (Integer sample: samples) {
sumSquaredDev += Math.pow(sample.intValue() - mean, 2);
}
int numSamples = samples.size();
double normalizer = 1 / (1.0 * numSamples);
if (numSamples < numData && numSamples > 1) {
// We only retained a sample of the data, so compute the "sample standard deviation":
// divide by (n-1), where n is the number of retained samples, not the full data count.
normalizer = 1 / (1.0 * (numSamples - 1));
}
double variance = normalizer * sumSquaredDev;
return Math.sqrt(variance);
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", avg: " + (total / (1.0 * numData)) + "\n";
}
public String getTypeDesc() {
return "INT";
}
public String getDesc(boolean verbose) {
String desc = "INT";
if (verbose) {
desc += "(numData: " + numData + ", avg: " + (total / (1.0 * numData)) + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(INT_NODE);
out.writeInt(numData);
UTF8.writeString(out, (docStr == null) ? "" : docStr);
out.writeInt(total);
out.writeInt(samples.size());
for (Integer sample: samples) {
out.writeInt(sample.intValue());
}
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.total = in.readInt();
this.samples.clear();
int numSamples = in.readInt();
for (int i = 0; i < numSamples; i++) {
this.samples.add(in.readInt());
}
}
}
/*****************************************************
* Store statistical summary of observed Long field. Store # times seen and total value
****************************************************/
class LongSummaryNode extends SummaryNode {
long total;
public LongSummaryNode() {
}
public LongSummaryNode(String docStr) {
super(docStr);
}
public void addData(Long l) {
numData++;
total += l.longValue();
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", avg: " + (total / (1.0 * numData)) + "\n";
}
public String getTypeDesc() {
return "LONG";
}
public String getDesc(boolean verbose) {
String desc = "LONG";
if (verbose) {
desc += "(numData: " + numData + ", avg: " + (total / (1.0 * numData)) + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(LONG_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeLong(total);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.total = in.readLong();
}
}
/*****************************************************
* Store statistical summary of observed Map field. Store # times seen and track data for each observed key.
****************************************************/
class MapSummaryNode extends SummaryNode {
Schema modelS;
HashMap<Utf8, SummaryNode> stats = new HashMap<Utf8, SummaryNode>();
public MapSummaryNode() {
}
public MapSummaryNode(Schema modelS, String docStr) {
super(docStr);
this.modelS = modelS;
}
public void addData(Map m) {
numData++;
Iterator it = m.keySet().iterator();
while (it.hasNext()) {
Utf8 key = (Utf8) it.next();
SummaryNode s = stats.get(key);
if (s == null) {
s = buildStructure(modelS, modelS.getDoc());
stats.put(key, s);
}
s.addData(m.get(key));
}
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
StringBuffer buf = new StringBuffer();
buf.append(prefixString(prefix) + "+------------------------------------------+\n");
buf.append(prefixString(prefix) + "numData: " + numData + "\n");
for (Utf8 key: stats.keySet()) {
SummaryNode s = stats.get(key);
buf.append(prefixString(prefix) + key + " =>\n" + s.dumpSummary(prefix+2));
}
buf.append(prefixString(prefix) + "+------------------------------------------+\n");
return buf.toString();
}
public String getTypeDesc() {
return "MAP";
}
public String getDesc(boolean verbose) {
String desc = "MAP";
if (verbose) {
desc += "(numData: " + numData + ", numSymbols: " + stats.size() + ")";
}
return getLabel() + ": " + desc;
}
public String getLabel(String labelSoFar, SummaryNode src) {
for (Utf8 fname: stats.keySet()) {
SummaryNode candidate = stats.get(fname);
if (src == candidate) {
if (parent != null) {
labelSoFar = (labelSoFar.length() > 0) ? fname.toString() + "." + labelSoFar : fname.toString();
return parent.getLabel(labelSoFar, this);
}
}
}
return labelSoFar;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(MAP_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(stats.size());
for (Utf8 key: stats.keySet()) {
new Text(key.toString()).write(out);
stats.get(key).write(out);
}
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
int numElts = in.readInt();
for (int i = 0; i < numElts; i++) {
Text key = new Text();
key.readFields(in);
SummaryNode sn = readAndCreate(in);
stats.put(new Utf8(key.toString()), sn);
}
}
}
/*****************************************************
* Store statistical summary of observed Null field. Just store # times seen.
****************************************************/
class NullSummaryNode extends SummaryNode {
public NullSummaryNode() {
}
public NullSummaryNode(String docStr) {
super(docStr);
}
public void addData() {
numData++;
}
public String getDesc(boolean verbose) {
String desc = "NULL";
if (verbose) {
desc += "(numData: " + numData + ")";
}
return getLabel() + ": " + desc;
}
public String getTypeDesc() {
return "NULL";
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(NULL_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
}
}
/*****************************************************
* Store statistical summary of observed Record field. Store # times seen and then data about sub-elements.
****************************************************/
class RecordSummaryNode extends SummaryNode {
String name;
Map<String, SummaryNode> recordSummary = new HashMap<String, SummaryNode>();
public RecordSummaryNode() {
}
public RecordSummaryNode(String name, String docStr) {
super(docStr);
this.name = name;
}
public List<SummaryNode> children() {
List<SummaryNode> l = new ArrayList<SummaryNode>();
for (String key: recordSummary.keySet()) {
l.add(recordSummary.get(key));
}
return l;
}
public void addField(String fname, SummaryNode fn) {
recordSummary.put(fname, fn);
}
public void addData(GenericRecord data) {
numData++;
for (String fname: recordSummary.keySet()) {
recordSummary.get(fname).addData(data.get(fname));
}
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
StringBuffer buf = new StringBuffer();
buf.append(prefixString(prefix) + "+------------------------------------------+\n");
buf.append(prefixString(prefix) + "numData: " + numData + "\n");
for (String fname: recordSummary.keySet()) {
buf.append(prefixString(prefix) + fname + " =>\n" + recordSummary.get(fname).dumpSummary(prefix+2));
}
buf.append(prefixString(prefix) + "+------------------------------------------+\n");
return buf.toString();
}
public String getTypeDesc() {
return "RECORD";
}
public String getDesc(boolean verbose) {
String desc = "RECORD";
if (verbose) {
desc += "(numData: " + numData + ", fields: " + recordSummary.size() + ")";
}
return getLabel() + ": " + desc;
}
public String getLabel(String labelSoFar, SummaryNode src) {
for (String fname: recordSummary.keySet()) {
SummaryNode candidate = recordSummary.get(fname);
if (src == candidate) {
labelSoFar = (labelSoFar.length() > 0) ? fname + "." + labelSoFar : fname;
if (parent != null) {
return parent.getLabel(labelSoFar, this);
} else {
return "<root>" + "." + labelSoFar;
}
}
}
return "<root>" + "." + labelSoFar;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(RECORD_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(recordSummary.size());
for (String fname: recordSummary.keySet()) {
new Text(fname).write(out);
recordSummary.get(fname).write(out);
}
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
int numRecs = in.readInt();
for (int i = 0; i < numRecs; i++) {
Text fname = new Text();
fname.readFields(in);
SummaryNode sn = readAndCreate(in);
recordSummary.put(fname.toString(), sn);
}
}
}
/*****************************************************
* Store statistical summary of observed String field. Store # times seen and total length of the strings (for now).
* Eventually, store info on the String content, too.
****************************************************/
class StringSummaryNode extends SummaryNode {
int totalLength;
Set<Utf8> observedStrings = new TreeSet<Utf8>();
public StringSummaryNode() {
}
public StringSummaryNode(String docStr) {
super(docStr);
}
public void addData(Utf8 s) {
numData++;
totalLength += s.getLength();
observedStrings.add(s);
}
///////////////////////////////////////////////
// Cost functions for schema matching
///////////////////////////////////////////////
public double transformCost(SummaryNode other) {
if (this.getClass() == other.getClass()) {
double schemaLabelDistance = computeSchemaLabelDistance(this.getLabel(), other.getLabel());
double jaccardSimilarity = computeJaccardSimilarity((StringSummaryNode) other);
double jaccardDistance = 1 - jaccardSimilarity;
return schemaLabelDistance + jaccardDistance;
} else {
return MATCHCOST_TYPE_CLASH;
}
}
/**
* Computes the Jaccard similarity (intersection size over union size) of the two sets of observed
* strings, ignoring empty strings. A useful score for determining whether two sets of values are similar.
*/
public double computeJaccardSimilarity(StringSummaryNode other) {
Set<Utf8> larger = (this.numData >= other.numData ? this.observedStrings : other.observedStrings);
Set<Utf8> smaller = (this.numData < other.numData ? this.observedStrings : other.observedStrings);
int unionSize = larger.size();
if (larger.contains(new Utf8(""))) {
unionSize -= 1;
}
int intersectionSize = 0;
for (Utf8 smallElt: smaller) {
if (smallElt.length() == 0) {
continue;
}
if (larger.contains(smallElt)) {
intersectionSize++;
} else {
unionSize++;
}
}
if (unionSize == 0) {
return 0;
} else {
return intersectionSize / (1.0 * unionSize);
}
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
return prefixString(prefix) + "numData: " + numData + ", avg-len: " + (totalLength / (1.0 * numData)) + "\n";
}
public String getTypeDesc() {
return "STRING";
}
public String getDesc(boolean verbose) {
String desc = "STRING";
if (verbose) {
desc += "(numData: " + numData + ", avglen: " + (totalLength / (1.0 * numData)) + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(STRING_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(totalLength);
out.writeInt(observedStrings.size());
for (Utf8 s: observedStrings) {
UTF8.writeString(out, s.toString());
}
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
this.totalLength = in.readInt();
observedStrings.clear();
int numInts = in.readInt();
for (int i = 0; i < numInts; i++) {
observedStrings.add(new Utf8(UTF8.readString(in)));
}
}
}
/*****************************************************
* Store statistical summary of observed Union field. Actually, a Union is not observed directly - we just know
* it's a union from the schema. Store # times seen, data on the particular type observed, and statistics on how
* often each subtype is seen.
****************************************************/
class UnionSummaryNode extends SummaryNode {
Map<Schema.Type, SummaryNode> unionTypes = new HashMap<Schema.Type, SummaryNode>();
Map<Schema.Type, Integer> unionTypeCounts = new HashMap<Schema.Type, Integer>();
public UnionSummaryNode() {
}
public UnionSummaryNode(String docStr) {
super(docStr);
}
public void addType(Schema.Type t, SummaryNode sn) {
if (unionTypes.get(t) == null) {
unionTypes.put(t, sn);
unionTypeCounts.put(t, 0);
}
}
/**
* We need to dispatch the object to the right element stored in 'unionTypes'
*/
public void addData(Object obj) {
numData++;
// Default to NULL: this covers obj == null (from unions with a null branch) as well as any
// unrecognized runtime type.
Schema.Type t = Schema.Type.NULL;
if (obj instanceof GenericArray) {
t = Schema.Type.ARRAY;
} else if (obj instanceof Boolean) {
t = Schema.Type.BOOLEAN;
} else if (obj instanceof ByteBuffer) {
t = Schema.Type.BYTES;
} else if (obj instanceof Double) {
t = Schema.Type.DOUBLE;
} else if (obj instanceof String) {
t = Schema.Type.ENUM;
} else if (obj instanceof GenericFixed) {
t = Schema.Type.FIXED;
} else if (obj instanceof Float) {
t = Schema.Type.FLOAT;
} else if (obj instanceof Integer) {
t = Schema.Type.INT;
} else if (obj instanceof Long) {
t = Schema.Type.LONG;
} else if (obj instanceof Map) {
t = Schema.Type.MAP;
} else if (obj instanceof GenericRecord) {
t = Schema.Type.RECORD;
} else if (obj instanceof Utf8) {
t = Schema.Type.STRING;
}
SummaryNode branch = unionTypes.get(t);
if (branch != null) {
branch.addData(obj);
}
Integer c = unionTypeCounts.get(t);
if (c == null) {
unionTypeCounts.put(t, 1);
} else {
unionTypeCounts.put(t, c.intValue() + 1);
}
}
/////////////////////////////
// String representation
/////////////////////////////
public String dumpSummary(int prefix) {
StringBuffer buf = new StringBuffer();
for (Schema.Type t: unionTypes.keySet()) {
buf.append(prefixString(prefix) + "unionType: " + t + " =>\n");
buf.append(unionTypes.get(t).dumpSummary(prefix+2));
}
return buf.toString();
}
public String getTypeDesc() {
return "UNION";
}
public String getDesc(boolean verbose) {
String desc = "UNION";
if (verbose) {
desc += "(numData: " + numData + ", numtypes: " + unionTypes.size() + ")";
}
return getLabel() + ": " + desc;
}
/////////////////////////////
// Serialize/deserialize
/////////////////////////////
public void write(DataOutput out) throws IOException {
out.writeShort(UNION_NODE);
out.writeInt(numData);
UTF8.writeString(out, docStr == null ? "" : docStr);
out.writeInt(unionTypes.size());
for (Schema.Type t: unionTypes.keySet()) {
new Text(t.toString()).write(out);
out.writeInt(unionTypeCounts.get(t));
unionTypes.get(t).write(out);
}
}
public void readFields(DataInput in) throws IOException {
this.numData = in.readInt();
this.docStr = UTF8.readString(in);
int numTypes = in.readInt();
for (int i = 0; i < numTypes; i++) {
Text tLabel = new Text();
tLabel.readFields(in);
Schema.Type t = Schema.Type.valueOf(tLabel.toString());
int typeCount = in.readInt();
SummaryNode sn = readAndCreate(in);
unionTypes.put(t, sn);
unionTypeCounts.put(t, typeCount);
}
}
}
/***************************************
* PreviousChoice is a SchemaMappingOp that simply points back at a list of ops already computed for an
* earlier subproblem, identified by the label built from the given indices.
***************************************/
class PreviousChoice extends SchemaMappingOp {
Hashtable<String, List<SchemaMappingOp>> h;
String label;
public PreviousChoice(Hashtable<String, List<SchemaMappingOp>> h, int i, int j) {
this.h = h;
this.label = "" + i + "-" + j;
}
public PreviousChoice(Hashtable<String, List<SchemaMappingOp>> h, int p1, int p2, int p3, int p4, int p5, int p6) {
this.h = h;
this.label = "" + p1 + "-" + p2 + "-" + p3 + "-" + p4 + "-" + p5 + "-" + p6;
}
public List<SchemaMappingOp> getOps() {
List<SchemaMappingOp> ops = h.get(label);
if (ops == null) {
ops = new ArrayList<SchemaMappingOp>();
}
return ops;
}
public String toString() {
return "Previous! " + label;
}
}
/////////////////////////////////////////////////
// Members
/////////////////////////////////////////////////
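// 'root' mirrors the top-level record of the summarized schema. 'useAttributeLabels' controls whether
// attribute names contribute to transform costs (see computeSchemaLabelDistance()). 'datasetLabel' is a
// human-readable tag for the summarized dataset.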
SummaryNode root = null;
boolean useAttributeLabels = true;
String datasetLabel = "";
/////////////////////////////////////////////////
// Constructors, initializers
/////////////////////////////////////////////////
public SchemaStatisticalSummary() throws IOException {
}
public SchemaStatisticalSummary(String datasetLabel) throws IOException {
this.datasetLabel = datasetLabel;
}
public void setUseAttributeLabels(boolean useAttributeLabels) {
this.useAttributeLabels = useAttributeLabels;
}
/**
* Create the statistical summary object from data.
*/
public Schema createSummaryFromData(File f) throws IOException {
DataFileReader in = new DataFileReader(f, new GenericDatumReader());
try {
Schema s = in.getSchema();
//
// There has to be at least one data element for us to infer anything meaningful
//
Iterator it = in.iterator();
if (! it.hasNext()) {
throw new IOException("No contents");
}
//
// We can only infer schemas from top-level records, not Fixeds or Arrays.
//
Object firstRecord = it.next();
if (firstRecord instanceof GenericFixed ||
firstRecord instanceof GenericArray) {
throw new IOException("Not a top-level record");
}
// We assume the file's top-level Schema always represents a Record.
if (s.getType() != Schema.Type.RECORD) {
throw new IOException("Top-level Schema in the file must be of type Schema.Type.RECORD");
}
this.root = buildStructure(s, "ROOT");
//
// Iterate through all records and collect statistics on each Schema field.
//
GenericRecord cur = (GenericRecord) firstRecord;
int counter = 0;
do {
this.root.addData(cur);
counter++;
if (it.hasNext()) {
cur = (GenericRecord) it.next();
} else {
cur = null;
}
} while (cur != null);
this.root.computePreorder(-1);
return s;
} finally {
in.close();
}
}
/**
* This function reads in data and instantiates the SummaryNode hierarchy.
*/
public SummaryNode readAndCreate(DataInput in) throws IOException {
short nodeType = in.readShort();
SummaryNode sn = null;
switch (nodeType) {
case ARRAY_NODE: {
sn = new ArraySummaryNode();
break;
}
case BOOLEAN_NODE: {
sn = new BooleanSummaryNode();
break;
}
case BYTES_NODE: {
sn = new BytesSummaryNode();
break;
}
case DOUBLE_NODE: {
sn = new DoubleSummaryNode();
break;
}
case ENUM_NODE: {
sn = new EnumSummaryNode();
break;
}
case FIXED_NODE: {
sn = new FixedSummaryNode();
break;
}
case FLOAT_NODE: {
sn = new FloatSummaryNode();
break;
}
case INT_NODE: {
sn = new IntegerSummaryNode();
break;
}
case LONG_NODE: {
sn = new LongSummaryNode();
break;
}
case MAP_NODE: {
sn = new MapSummaryNode();
break;
}
case NULL_NODE: {
sn = new NullSummaryNode();
break;
}
case RECORD_NODE: {
sn = new RecordSummaryNode();
break;
}
case STRING_NODE: {
sn = new StringSummaryNode();
break;
}
case UNION_NODE: {
sn = new UnionSummaryNode();
break;
}
default:
throw new IOException("Unknown node type: " + nodeType);
}
sn.readFields(in);
return sn;
}
/**
* Build a Summary structure out of the given schema. Helper method.
*/
SummaryNode buildStructure(Schema s, String docStr) {
Schema.Type stype = s.getType();
if (stype == Schema.Type.ARRAY) {
return new ArraySummaryNode(buildStructure(s.getElementType(), s.getDoc()), docStr);
} else if (stype == Schema.Type.BOOLEAN) {
return new BooleanSummaryNode(docStr);
} else if (stype == Schema.Type.BYTES) {
return new BytesSummaryNode(docStr);
} else if (stype == Schema.Type.DOUBLE) {
return new DoubleSummaryNode(docStr);
} else if (stype == Schema.Type.ENUM) {
return new EnumSummaryNode(s.getFullName(), s.getEnumSymbols(), docStr);
} else if (stype == Schema.Type.FIXED) {
return new FixedSummaryNode(s.getFullName(), s.getFixedSize(), docStr);
} else if (stype == Schema.Type.FLOAT) {
return new FloatSummaryNode(docStr);
} else if (stype == Schema.Type.INT) {
return new IntegerSummaryNode(docStr);
} else if (stype == Schema.Type.LONG) {
return new LongSummaryNode(docStr);
} else if (stype == Schema.Type.MAP) {
return new MapSummaryNode(s.getValueType(), docStr);
} else if (stype == Schema.Type.NULL) {
return new NullSummaryNode(docStr);
} else if (stype == Schema.Type.RECORD) {
RecordSummaryNode rsn = new RecordSummaryNode(s.getFullName(), docStr);
for (Field f: s.getFields()) {
rsn.addField(f.name(), buildStructure(f.schema(), f.doc()));
}
return rsn;
} else if (stype == Schema.Type.STRING) {
return new StringSummaryNode(docStr);
} else if (stype == Schema.Type.UNION) {
UnionSummaryNode usn = new UnionSummaryNode(docStr);
for (Schema subschema: s.getTypes()) {
usn.addType(subschema.getType(), buildStructure(subschema, subschema.getDoc()));
}
return usn;
}
return null;
}
/////////////////////////////////////////////////////////
// Schema distance computation
/////////////////////////////////////////////////////////
/**
* Get the minimum mapping cost from a schema of size k to one of size m.
* This helps us avoid mapping computations that couldn't possibly produce
* a low-distance mapping.
*/
public static double getMinimumMappingCost(int k, int m) {
return Math.abs(k - m) * Math.min(MATCHCOST_CREATE, MATCHCOST_DELETE);
}
/**
* Find the best mapping between the current schema summary and the one provided
* by the parameter.
*/
public SchemaMapping getBestMapping(SchemaStatisticalSummary other) {
SummaryNode t1 = root;
SummaryNode t2 = other.root;
TreeMap<Integer, SummaryNode> t1NonLeafs = new TreeMap<Integer, SummaryNode>();
TreeMap<Integer, SummaryNode> t1Leafs = new TreeMap<Integer, SummaryNode>();
TreeMap<Integer, SummaryNode> t2NonLeafs = new TreeMap<Integer, SummaryNode>();
TreeMap<Integer, SummaryNode> t2Leafs = new TreeMap<Integer, SummaryNode>();
//
// Find all the non-leaf nodes
//
for (SummaryNode iNode: t1.preorder()) {
if (iNode.children().size() > 0) {
t1NonLeafs.put(iNode.preorderCount(), iNode);
} else {
t1Leafs.put(iNode.preorderCount(), iNode);
}
}
for (SummaryNode jNode: t2.preorder()) {
if (jNode.children().size() > 0) {
t2NonLeafs.put(jNode.preorderCount(), jNode);
} else {
t2Leafs.put(jNode.preorderCount(), jNode);
}
}
//
// Start by computing all the potential 1:1 leaf-level match costs.
//
List<DistancePair[]> allCosts = new ArrayList<DistancePair[]>();
Set<DistancePair> allKnownCostPairs = new TreeSet<DistancePair>();
for (SummaryNode iNode: t1.preorder()) {
int iIdx = iNode.preorderCount();
DistancePair fromI[] = null;
if (t1NonLeafs.get(iIdx) == null) {
List<DistancePair> costs = new ArrayList<DistancePair>();
for (SummaryNode jNode: t2.preorder()) {
int jIdx = jNode.preorderCount();
if (t2NonLeafs.get(jIdx) == null) {
DistancePair dp = new DistancePair(iNode.transformCost(jNode), iNode, jNode);
costs.add(dp);
allKnownCostPairs.add(dp);
}
}
costs.add(new DistancePair(iNode.deleteCost(), iNode, null));
fromI = costs.toArray(new DistancePair[costs.size()]);
Arrays.sort(fromI);
}
allCosts.add(fromI);
}
//
// Now pass those costs to the mapping algorithm.
// Select which mapping algorithm we want to use. For now, it's 'greedy'.
//
return findGreedyMapping(other, t1, t2, t1Leafs, t2Leafs, t1NonLeafs, t2NonLeafs, allKnownCostPairs);
/**
boolean performTraditionalMapping = false;
if (performTraditionalMapping) {
return findTraditionalMapping(other, t1, t2, t1Leafs, t2Leafs, t1NonLeafs, t2NonLeafs, allCosts);
} else {
return findGreedyMapping(other, t1, t2, t1Leafs, t2Leafs, t1NonLeafs, t2NonLeafs, allKnownCostPairs);
}
**/
}
/**
* findTraditionalMapping() tries the best k permutations of matches and returns the best one.
* The number of permutations can grow rapidly as the sizes of the two schemas grow, so this method
* can be very time-consuming.
*/
/**
SchemaMapping findTraditionalMapping(SchemaStatisticalSummary other, SummaryNode t1, SummaryNode t2, Map<Integer, SummaryNode> t1Leafs, Map<Integer, SummaryNode> t2Leafs, Map<Integer, SummaryNode> t1NonLeafs, Map<Integer, SummaryNode> t2NonLeafs, List<DistancePair[]> allCosts) {
//
// Figure out how far down into each attr's match-list we can go while only evaluating the
// estimated top-k-scoring schema matches. (Estimated by combining independent 1:1 match scores;
// no enforcement of the pigeonhole constraint.)
//
int MAX_CANDIDATES = 100000;
int numToPeek[] = new int[allCosts.size()];
for (int i = 0; i < numToPeek.length; i++) {
if (allCosts.get(i) == null) {
numToPeek[i] = 0;
} else {
numToPeek[i] = 1;
}
}
int numCandidates = 1;
System.err.println("Num elts: " + numToPeek.length);
do {
int peekIndex = -1;
double cheapestPeek = Double.MAX_VALUE;
for (int i = 0; i < numToPeek.length; i++) {
if (allCosts.get(i) != null &&
numToPeek[i] < allCosts.get(i).length) {
double candidatePeekValue = allCosts.get(i)[numToPeek[i]].getCost();
if (candidatePeekValue < cheapestPeek) {
cheapestPeek = candidatePeekValue;
peekIndex = i;
}
}
}
if (peekIndex >= 0) {
numToPeek[peekIndex]++;
} else {
break;
}
numCandidates = 1;
for (int i = 0; i < numToPeek.length; i++) {
if (numToPeek[i] >= 1) {
numCandidates *= numToPeek[i];
}
}
} while (numCandidates < MAX_CANDIDATES);
System.err.println("All cost size: " + allCosts.size() + ", number of candidates examined: " + numCandidates);
System.err.println();
numCandidates = Math.max(MAX_CANDIDATES, numCandidates);
//
// Now the numToPeek vector tells us how many steps down to go in each attr's
// ranked list of preferred matches. The product of all of these determines the # of candidates.
//
int curPeek[] = new int[numToPeek.length];
for (int i = 0; i < curPeek.length; i++) {
if (numToPeek[i] == 0) {
curPeek[i] = 0;
} else {
curPeek[i] = 1;
}
}
//
// Now go through all the possible configurations of top-k mappings.
//
// We optimize for the common case in which we have two near-flat hierarchies
//
DistancePair bestMatchConfig[] = new DistancePair[curPeek.length];
DistancePair matchConfig[] = new DistancePair[curPeek.length];
double bestCost = Double.MAX_VALUE;
boolean peeksRemain = numCandidates > 0;
long startTime = System.currentTimeMillis();
int numIters = 0;
while (peeksRemain) {
numIters++;
////////////////////////////////////////
// Evaluate this configuration ("peek")
////////////////////////////////////////
//
// 1. Build a proper 'match configuration' out of the leaf-level 1:1 'curPeek'.
// That means we generate record-level correspondences when justified by full
// child-correspondences
//
for (SummaryNode iNode: t1.preorder()) {
int iNodeIdx = iNode.preorderCount();
matchConfig[iNodeIdx] = null;
DistancePair[] allINodeMatches = allCosts.get(iNodeIdx);
if (allINodeMatches != null) {
matchConfig[iNodeIdx] = allINodeMatches[curPeek[iNodeIdx]-1];
}
}
//
// 2. Modify the current matchConfig s.t. if ALL of a non-leaf's children match ALL of
// the children of a non-leaf, then the two non-leafs also match.
// Because of the potential record hierarchy, this procedure needs to be repeated until
// there is an iteration in which no new matches are found (or until the roots are matched).
//
for (Map.Entry<Integer, SummaryNode> elt: t1NonLeafs.entrySet()) {
SummaryNode iNode = elt.getValue();
if (matchConfig[iNode.preorderCount()] != null) {
continue;
}
// For each child of this t1 internal node, place the matching node's parent into a set
TreeMap<Integer, SummaryNode> observedMatchParents = new TreeMap<Integer, SummaryNode>();
for (SummaryNode iChild: iNode.children()) {
int iChildIdx = iChild.preorderCount();
DistancePair jMatch = matchConfig[iChildIdx];
if (jMatch != null) {
if (jMatch.getNode() == null) {
observedMatchParents.put(-1, iChild);
} else {
SummaryNode jMatchParent = jMatch.getNode().getParent();
observedMatchParents.put(jMatchParent.preorderCount(), jMatchParent);
}
}
}
// If the parent-set has just one element, then internal node iNode
// should be matched to the singleton elt in the parent-set.
if (observedMatchParents.size() == 1) {
int matchIdx = observedMatchParents.firstKey().intValue();
if (matchIdx >= 0) {
SummaryNode jMatchParent = observedMatchParents.get(matchIdx);
matchConfig[iNode.preorderCount()] = new DistancePair(0, iNode, jMatchParent);
}
}
}
//
// 3. Compute the total match costs.
// a. The first component is the TRANSFORM costs of the discovered 1:1 leaf matches.
// (Valid matches among non-leafs are free.)
//
double total = 0;
for (int iNodeIdx = 0; iNodeIdx < matchConfig.length; iNodeIdx++) {
if (matchConfig[iNodeIdx] != null) {
// Get the transform cost
total += matchConfig[iNodeIdx].getCost();
}
}
//
// 3b. Compute DELETE penalties. These are elts in t1 that are NOT MATCHED to anything
// in t2. Non-leaf nodes that are unmatched DO incur penalties.
//
// While we're there, compute the set of items in t2 that DO have a matched elt.
//
int numDuplicates = 0;
HashSet<Integer> observedT2Nodes = new HashSet<Integer>();
for (SummaryNode iNode: t1.preorder()) {
int iNodeIdx = iNode.preorderCount();
if (matchConfig[iNodeIdx] == null || matchConfig[iNodeIdx].getNode() == null) {
total += iNode.deleteCost();
} else {
int jIdx = matchConfig[iNodeIdx].getNode().preorderCount();
if (observedT2Nodes.contains(jIdx)) {
numDuplicates++;
} else {
observedT2Nodes.add(jIdx);
}
}
}
//
// 3c. Compute CREATE penalties. These count for any items in the target schema t2
// that have gone unmapped.
//
for (SummaryNode jNode: t2.preorder()) {
int jIdx = jNode.preorderCount();
if (! observedT2Nodes.contains(jIdx)) {
total += jNode.createCost();
}
}
//
// 4. Impose a penalty for duplicate mappings in t2.
//
//
// Is it the best mapping so far?
//
if (total < bestCost) {
bestCost = total;
System.arraycopy(matchConfig, 0, bestMatchConfig, 0, bestMatchConfig.length);
}
/////////////////////////////////////////////
// Find the next configuration to evaluate (leaf-level "peek").
// We try to do a 'breadth-first search' rather than go deep on
// a single peeklist. This makes it easier to find the best match sooner,
// and thus abort the process early.
/////////////////////////////////////////////
peeksRemain = false;
int minSeen = Integer.MAX_VALUE;
int minIndex = -1;
for (int i = 0; i < curPeek.length; i++) {
if (curPeek[i] == 0) {
continue;
} else {
if (curPeek[i] < numToPeek[i]) {
curPeek[i]++;
for (int j = i-1; j >= 0; j--) {
if (curPeek[j] > 0) {
curPeek[j] = 1;
}
}
peeksRemain = true;
break;
}
}
}
}
long endTime = System.currentTimeMillis();
System.err.println("Evaluting peeks: " + ((endTime - startTime) / 1000.0) + " over " + numIters + " iterations.");
//
// ALMOST DONE: We have the best match. Now we translate it into a series of SchemaMappingOps
//
List<SchemaMappingOp> bestOps = new ArrayList<SchemaMappingOp>();
HashSet<Integer> bestMapTargets = new HashSet<Integer>();
for (int i = 0; i < bestMatchConfig.length; i++) {
if (bestMatchConfig[i] != null && bestMatchConfig[i].getNode() != null) {
int dstIdx = bestMatchConfig[i].getNode().preorderCount();
bestOps.add(new SchemaMappingOp(SchemaMappingOp.TRANSFORM_OP, this, i, other, dstIdx));
bestMapTargets.add(dstIdx);
} else {
bestOps.add(new SchemaMappingOp(SchemaMappingOp.DELETE_OP, this, i));
}
}
for (SummaryNode jNode: t2.preorder()) {
int jIdx = jNode.preorderCount();
if (jNode.children().size() == 0 && ! bestMapTargets.contains(jIdx)) {
bestOps.add(new SchemaMappingOp(SchemaMappingOp.CREATE_OP, other, jIdx));
}
}
//
// All done!
//
return new SchemaMapping(this, other, bestCost, bestOps);
}
**/
/**
* Greedy Mapping is sloppy, but very fast. It repeatedly accepts the best-looking pairwise
* match, until there is nothing left to match. Seems to work well so far, but needs to be
* tested more.
*/
SchemaMapping findGreedyMapping(SchemaStatisticalSummary other, SummaryNode t1, SummaryNode t2, Map<Integer, SummaryNode> t1Leafs, Map<Integer, SummaryNode> t2Leafs, Map<Integer, SummaryNode> t1NonLeafs, Map<Integer, SummaryNode> t2NonLeafs, Set<DistancePair> allKnownCostPairs) {
int totalSrcs = t1Leafs.size();
int totalDsts = t2Leafs.size();
Set<Integer> observedSrcs = new TreeSet<Integer>();
Set<Integer> observedDsts = new TreeSet<Integer>();
List<DistancePair> matching = new ArrayList<DistancePair>();
List<SchemaMappingOp> outputOps = new ArrayList<SchemaMappingOp>();
double totalCost = 0;
//
// Find all the leaf-level matches
//
Map<Integer, SummaryNode> transformMap = new TreeMap<Integer, SummaryNode>();
for (DistancePair dp: allKnownCostPairs) {
int srcId = dp.getSrc().preorderCount();
int dstId = dp.getNode().preorderCount();
if ((! observedSrcs.contains(srcId)) && (! observedDsts.contains(dstId))) {
matching.add(dp);
observedSrcs.add(srcId);
observedDsts.add(dstId);
outputOps.add(new SchemaMappingOp(SchemaMappingOp.TRANSFORM_OP, this, srcId, other, dstId, dp.getCost()));
transformMap.put(srcId, dp.getNode());
totalCost += dp.getCost();
if (matching.size() >= Math.min(totalSrcs, totalDsts)) {
break;
}
}
}
//
// Look for internal nodes that should be matched. If ALL of an internal node's children
// match ALL of another internal node's children, then the two internal nodes also match.
//
for (Map.Entry<Integer, SummaryNode> elt: t1NonLeafs.entrySet()) {
SummaryNode iNode = elt.getValue();
SortedSet<Integer> knownDstParents = new TreeSet<Integer>();
for (SummaryNode iChild: iNode.children()) {
int iChildIdx = iChild.preorderCount();
SummaryNode dstNode = transformMap.get(iChildIdx);
if (dstNode != null) {
knownDstParents.add(dstNode.getParent().preorderCount());
}
}
// There's just one parent of the destination nodes, so we have found an internal node match.
if (knownDstParents.size() == 1) {
Integer dstIdx = knownDstParents.first();
SummaryNode dstNode = t2NonLeafs.get(dstIdx);
outputOps.add(new SchemaMappingOp(SchemaMappingOp.TRANSFORM_OP, this, iNode.preorderCount(), other, dstIdx, 0));
observedSrcs.add(iNode.preorderCount());
observedDsts.add(dstIdx);
}
}
//
// If a node is in the source, but not the dest, then we need to DELETE it.
// Compute the DELETE costs here.
//
for (SummaryNode iNode: t1.preorder()) {
int iNodeIdx = iNode.preorderCount();
if (! observedSrcs.contains(iNodeIdx)) {
totalCost += iNode.deleteCost();
outputOps.add(new SchemaMappingOp(SchemaMappingOp.DELETE_OP, this, iNodeIdx, iNode.deleteCost()));
}
}
//
// If a node is in the dest, but not the source, then we need to CREATE it.
// Compute the CREATE costs here.
//
for (SummaryNode jNode: t2.preorder()) {
int jNodeIdx = jNode.preorderCount();
if (! observedDsts.contains(jNodeIdx)) {
totalCost += jNode.createCost();
outputOps.add(new SchemaMappingOp(SchemaMappingOp.CREATE_OP, other, jNodeIdx, jNode.createCost()));
}
}
return new SchemaMapping(this, other, totalCost, outputOps);
}
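/***************************************
* DistancePair records the cost of matching a source SummaryNode to a target SummaryNode (the target is
* null for a deletion). Pairs sort by ascending cost, with ties broken by preorder position.
***************************************/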
class DistancePair implements Comparable {
double cost;
SummaryNode src;
SummaryNode target;
public DistancePair(double cost, SummaryNode src, SummaryNode target) {
this.cost = cost;
this.src = src;
this.target = target;
}
public int compareTo(Object o) {
DistancePair other = (DistancePair) o;
if (cost < other.cost) {
return -1;
} else if (cost > other.cost) {
return 1;
} else {
int cmp = src.preorderCount() - other.src.preorderCount();
if (cmp == 0) {
// The target may be null (a delete-cost pair), so compare defensively.
int thisTarget = (target == null) ? -1 : target.preorderCount();
int otherTarget = (other.target == null) ? -1 : other.target.preorderCount();
cmp = thisTarget - otherTarget;
}
return cmp;
}
}
public double getCost() {
return cost;
}
public SummaryNode getSrc() {
return src;
}
public SummaryNode getNode() {
return target;
}
public int getIdx() {
return target.preorderCount();
}
public String toString() {
if (target != null) {
return "" + target.getDesc(false) + " cost=" + cost;
} else {
return " DELETE cost=" + cost;
}
}
}
////////////////////////////////////////////////
// String representation of the overall summary object
////////////////////////////////////////////////
public String getDatasetLabel() {
return datasetLabel;
}
public String dumpSummary() {
return this.root.dumpSummary(0);
}
public String getDesc(int nodeid) {
return root.getDesc(nodeid);
}
public String getLabel(int nodeid) {
return root.getLabel(nodeid);
}
public String getTypeDesc(int nodeid) {
return root.getTypeDesc(nodeid);
}
public String getDocStr(int nodeid) {
return root.getDocStr(nodeid);
}
////////////////////////////////////////////////
// Serialization/deserialization
////////////////////////////////////////////////
public void write(DataOutput out) throws IOException {
out.write(MAGIC);
out.write(VERSION);
root.write(out);
UTF8.writeString(out, datasetLabel);
}
public void readFields(DataInput in) throws IOException {
byte magic = in.readByte();
if (magic != MAGIC) {
throw new IOException("Unexpected magic byte in serialized summary: " + magic);
}
byte version = in.readByte();
if (version != VERSION) {
throw new IOException("Unsupported summary version: " + version);
}
this.root = readAndCreate(in);
this.root.computePreorder(-1);
this.datasetLabel = UTF8.readString(in);
}
}