/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.learnstructure;

import java.util.*;

/*********************************************************
 * Histogram statistically summarizes the contents of a set of chunks.
 * It's used to find similar, but not identical, chunks in the input data.
 *********************************************************/
public class Histogram {
  ///////////////////////////////////////////////////
  // static members
  ///////////////////////////////////////////////////
  static double CLUSTER_TOLERANCE = 0.01;
  static double MAX_RESIDUAL_MASS = 0.1;
  static double MIN_COVERAGE_FACTOR = 0.2;

  ///////////////////////////////////////////////////
  // static classes
  ///////////////////////////////////////////////////
  /**
   * A (per-chunk frequency, number-of-chunks) pair. Pairs sort by descending
   * chunk count, breaking ties by ascending per-chunk frequency.
   */
  static class FrequencyPair implements Comparable<FrequencyPair> {
    int perChunkFrequency;
    int overallChunkCount;

    public FrequencyPair(int perChunkFrequency, int overallChunkCount) {
      this.perChunkFrequency = perChunkFrequency;
      this.overallChunkCount = overallChunkCount;
    }
    public int compareTo(FrequencyPair other) {
      // Descending by overall chunk count...
      int cmp = overallChunkCount - other.overallChunkCount;
      if (cmp != 0) {
        return -1 * cmp;
      } else {
        // ...then ascending by per-chunk frequency.
        cmp = perChunkFrequency - other.perChunkFrequency;
        return cmp;
      }
    }
    public int getPerChunkFrequency() {
      return perChunkFrequency;
    }
    public int getCount() {
      return overallChunkCount;
    }
  }

  //////////////////////////////////////////////////////////////
  // Static methods: for computing histograms from data,
  // and for clustering the resulting histograms.
  //////////////////////////////////////////////////////////////
  /**
   * Compute a statistical summary of the data. This will return a histogram
   * for each token type, indicating the relative proportion and distribution of
   * the token type in the chunkset.
   *
   * Thus, the size of the output of this function is relatively small: no larger
   * than the number of potential token types. However, it can take a while to
   * compute if the input data size is huge.
   *
   * REMIND mjc - This fn may be a good candidate for runtime-perf optimization
   */
  public static List<Histogram> computeNormalizedHistograms(List<List<Token.AbstractToken>> chunks) {
    Map<String, Map<Integer, Integer>> allHistograms = new TreeMap<String, Map<Integer, Integer>>();
    List<Histogram> normalizedHistograms = new ArrayList<Histogram>();
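
    // Illustrative example (made-up values, not from any particular dataset):
    // if three chunks contain the hypothetical token type "INT" 2, 2, and 5
    // times respectively, step 1 below builds allHistograms["INT"] = {2=2, 5=1},
    // i.e., a per-chunk frequency of 2 was observed in two chunks, and a
    // per-chunk frequency of 5 in one chunk. Step 2 then adds the entry
    // 0=(number of chunks that contain no "INT" tokens at all).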
    //
    // 1. Compute some per-chunk statistics
    //
    for (List<Token.AbstractToken> chunk: chunks) {
      // Compute frequencies of each token type within this chunk
      HashMap<String, Integer> localFreq = new HashMap<String, Integer>();
      for (Token.AbstractToken tok: chunk) {
        Integer count = localFreq.get(tok.getId());
        if (count == null) {
          localFreq.put(tok.getId(), 1);
        } else {
          localFreq.put(tok.getId(), count.intValue() + 1);
        }
      }

      // Now adjust the "histogram of frequencies" associated with each token type
      for (String tokenId: localFreq.keySet()) {
        Map<Integer, Integer> perTokenTypeHistogram = allHistograms.get(tokenId);
        if (perTokenTypeHistogram == null) {
          perTokenTypeHistogram = new HashMap<Integer, Integer>();
          allHistograms.put(tokenId, perTokenTypeHistogram);
        }
        Integer currentTokenTypeCount = localFreq.get(tokenId);
        Integer countSoFar = perTokenTypeHistogram.get(currentTokenTypeCount);
        if (countSoFar == null) {
          perTokenTypeHistogram.put(currentTokenTypeCount, 1);
        } else {
          perTokenTypeHistogram.put(currentTokenTypeCount, countSoFar.intValue() + 1);
        }
      }
    }

    //
    // 2. Now, for each per-token-type histogram, record the number of chunks
    // in which the token type was not observed at all.
    //
    for (String tokenId: allHistograms.keySet()) {
      Map<Integer, Integer> perTokenTypeHistogram = allHistograms.get(tokenId);
      int numberOfChunksForObservedTokenTypeCount = 0;
      for (Integer currentTokenTypeCount: perTokenTypeHistogram.keySet()) {
        numberOfChunksForObservedTokenTypeCount += perTokenTypeHistogram.get(currentTokenTypeCount);
      }
      perTokenTypeHistogram.put(0, chunks.size() - numberOfChunksForObservedTokenTypeCount);
    }

    //
    // 3. Normalize the per-token-type histograms
    //
    for (Map.Entry<String, Map<Integer, Integer>> e1: allHistograms.entrySet()) {
      String tokenId = e1.getKey();
      Map<Integer, Integer> perTokenTypeHistogram = e1.getValue();
      double coverage = 0;
      double totalMass = 0;

      // 3.1. Compute the histogram's normal form: all the counts in descending
      // order of prevalence in the chunk set. Also, compute some metainfo
      // stats along the way.
      SortedSet<FrequencyPair> sorter = new TreeSet<FrequencyPair>();
      for (Map.Entry<Integer, Integer> e2: perTokenTypeHistogram.entrySet()) {
        Integer perChunkFrequency = e2.getKey();
        Integer overallChunkCount = e2.getValue();
        if (perChunkFrequency.intValue() != 0) {
          coverage += overallChunkCount.intValue();
          sorter.add(new FrequencyPair(perChunkFrequency, overallChunkCount));
        }
        totalMass += overallChunkCount.intValue();
      }
      List<FrequencyPair> normalForm = new ArrayList<FrequencyPair>();
      for (FrequencyPair p: sorter) {
        normalForm.add(p);
      }
      // The zero-frequency bucket always occupies position 0 of the normal form.
      normalForm.add(0, new FrequencyPair(0, perTokenTypeHistogram.get(0)));

      // 3.2. Compute metainfo: the width (number of distinct nonzero per-chunk
      // frequencies) and the residual mass (the fraction of mass outside the
      // largest nonzero-frequency bucket).
      double width = perTokenTypeHistogram.size() - 1;
      double residualMass = (totalMass - normalForm.get(1).getCount()) / totalMass;

      // 3.3. Done with the histogram!
      normalizedHistograms.add(new Histogram(tokenId, normalForm, width, residualMass, coverage));
    }
    return normalizedHistograms;
  }
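
  // A hypothetical usage sketch (illustrative; not part of the original file).
  // Given tokenized chunks, summarize each token type and then group the
  // summaries that have statistically similar frequency profiles:
  //
  //   List<List<Token.AbstractToken>> chunks = ...;  // one token list per parsed line
  //   List<Histogram> summaries = Histogram.computeNormalizedHistograms(chunks);
  //   List<List<Histogram>> groups = Histogram.clusterHistograms(summaries);
  //
  // Histograms end up in the same group when they are linked by pairs whose
  // symmetric relative entropy falls below CLUSTER_TOLERANCE.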
  /**
   * Cluster together histograms that appear to be related.
   *
   * We currently employ agglomerative single-link clustering. That means:
   * a) We can imagine that each data element starts as its own cluster
   * b) We merge clusters whenever the distance between clusters is less than CLUSTER_TOLERANCE
   * c) The distance between two clusters is determined by the *minimum* distance between
   *    any two members of the two clusters. This is sometimes called "single link"
   *    clustering. The resulting cluster quality is not as good as computing distance
   *    based on the average of the members of a cluster, but it is more efficient.
   */
  public static List<List<Histogram>> clusterHistograms(List<Histogram> inputHistograms) {
    // 1. Handle the degenerate case of size(input) == 1
    if (inputHistograms.size() == 1) {
      List<List<Histogram>> clusters = new ArrayList<List<Histogram>>();
      clusters.add(inputHistograms);
      return clusters;
    }

    // 2. Otherwise, compute pairwise symmetric relative entropy among histograms
    class Score implements Comparable<Score> {
      double s;
      int i;
      int j;
      public Score(double s, int i, int j) {
        this.s = s;
        this.i = i;
        this.j = j;
      }
      public int getIndex1() {
        return i;
      }
      public int getIndex2() {
        return j;
      }
      public int compareTo(Score other) {
        // Ascending by score, breaking ties by index pair.
        if (this.s < other.s) {
          return -1;
        } else if (this.s > other.s) {
          return 1;
        } else {
          int cmp = this.i - other.i;
          if (cmp == 0) {
            cmp = this.j - other.j;
          }
          return cmp;
        }
      }
    }

    SortedSet<Score> scores = new TreeSet<Score>();
    for (int i = 0; i < inputHistograms.size(); i++) {
      for (int j = i+1; j < inputHistograms.size(); j++) {
        Histogram h1 = inputHistograms.get(i);
        Histogram h2 = inputHistograms.get(j);
        double sre = h1.computeSymmetricRelativeEntropy(h2);
        if (sre < CLUSTER_TOLERANCE) {
          scores.add(new Score(sre, i, j));
        }
      }
    }

    // Initialize clusters: every histogram starts in its own singleton cluster.
    Map<Integer, Integer> histogramToCluster = new TreeMap<Integer, Integer>();
    Map<Integer, Set<Integer>> clusterToHistograms = new TreeMap<Integer, Set<Integer>>();
    for (int i = 0; i < inputHistograms.size(); i++) {
      histogramToCluster.put(i, i);
      Set<Integer> containedHistograms = new HashSet<Integer>();
      containedHistograms.add(i);
      clusterToHistograms.put(i, containedHistograms);
    }

    // Start merging clusters, processing candidate pairs in ascending score order.
    for (Score s: scores) {
      int idx1 = s.getIndex1();
      int idx2 = s.getIndex2();
      int cluster1 = histogramToCluster.get(idx1);
      int cluster2 = histogramToCluster.get(idx2);
      if (cluster1 == cluster2) {
        continue;
      }
      // Fold cluster2's members into cluster1.
      for (Integer histogramId: clusterToHistograms.get(cluster2)) {
        histogramToCluster.put(histogramId, cluster1);
      }
      clusterToHistograms.get(cluster1).addAll(clusterToHistograms.get(cluster2));
      clusterToHistograms.remove(cluster2);
    }
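
    // Illustrative note (made-up indexes, not from any particular run): because
    // this is single-link merging, qualifying pairs can chain clusters together.
    // If both (0,1) and (1,2) scored under CLUSTER_TOLERANCE, histograms 0, 1,
    // and 2 all land in one cluster, even if the pair (0,2) scored above the
    // tolerance and was never added to "scores".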
    // Build the clustered histogram list.
    List<List<Histogram>> clusters = new ArrayList<List<Histogram>>();
    for (Map.Entry<Integer, Set<Integer>> entry: clusterToHistograms.entrySet()) {
      Integer clusterId = entry.getKey();
      Set<Integer> histograms = entry.getValue();
      List<Histogram> curCluster = new ArrayList<Histogram>();
      for (Integer histogramIndex: histograms) {
        curCluster.add(inputHistograms.get(histogramIndex));
      }
      clusters.add(curCluster);
    }
    return clusters;
  }

  //////////////////////////////////////////////////////////////
  // Members
  //////////////////////////////////////////////////////////////
  String histogramType;
  List<FrequencyPair> normalForm;
  double width;
  double residualMass;
  double coverage;

  //////////////////////////////////////////////////////////////
  // Methods
  //////////////////////////////////////////////////////////////
  public Histogram(String histogramType, List<FrequencyPair> normalForm, double width, double residualMass, double coverage) {
    this.histogramType = histogramType;
    this.normalForm = normalForm;
    this.width = width;
    this.residualMass = residualMass;
    this.coverage = coverage;
  }

  public boolean passStructStatisticalTest(int numChunks) {
    return residualMass < MAX_RESIDUAL_MASS && coverage > MIN_COVERAGE_FACTOR * numChunks;
  }
  public boolean passArrayStatisticalTest(int numChunks) {
    return width > 3 && coverage > MIN_COVERAGE_FACTOR * numChunks;
  }
  public String getHistogramType() {
    return histogramType;
  }
  public double getWidth() {
    return width;
  }
  public double getResidualMass() {
    return residualMass;
  }
  public double getCoverage() {
    return coverage;
  }
  public String toString() {
    StringBuffer buf = new StringBuffer();
    buf.append("Histogram: type=" + histogramType + ", width=" + width + ", residualMass=" + residualMass + ", coverage=" + coverage + ", normalForm=[");
    for (FrequencyPair fp: normalForm) {
      buf.append("(" + fp.getPerChunkFrequency() + ", " + fp.getCount() + ") ");
    }
    buf.append("]");
    return buf.toString();
  }

  /**
   * The relative entropy score is used for clustering. However, we can't compute
   * it directly, as histograms do not always contain the same components. Instead,
   * we preprocess the data with computeSymmetricRelativeEntropy(), then pass the resulting
   * averaged values into this pretty generic method.
   */
  double computeRelativeEntropy(List<Double> avgCounts) {
    double total = 0;
    for (int i = 1; i < normalForm.size(); i++) {
      double selfOverallChunkCount = normalForm.get(i).getCount();
      double otherOverallChunkCount = avgCounts.get(i);
      total += selfOverallChunkCount * Math.log(selfOverallChunkCount / otherOverallChunkCount);
    }
    return total;
  }

  /**
   * The point of this method is to preprocess the data from two input Histograms,
   * getting it ready for the relative entropy computation. Without this preprocessing,
   * the rel-entropy computation would be sensitive to varying numbers of components
   * in each histogram.
   */
  double computeSymmetricRelativeEntropy(Histogram other) {
    List<Double> avgCounts = new ArrayList<Double>();
    for (int i = 0; i < Math.max(normalForm.size(), other.normalForm.size()); i++) {
      if ((i < normalForm.size()) && (i < other.normalForm.size())) {
        avgCounts.add((normalForm.get(i).getCount() + other.normalForm.get(i).getCount()) / 2.0);
      } else if (i < normalForm.size()) {
        avgCounts.add(normalForm.get(i).getCount() * 0.5);
      } else {
        avgCounts.add(other.normalForm.get(i).getCount() * 0.5);
      }
    }
    return 0.5 * this.computeRelativeEntropy(avgCounts) + 0.5 * other.computeRelativeEntropy(avgCounts);
  }
}
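
/*
 * Worked example of the distance computation (illustrative counts, not taken
 * from real data). Suppose two histograms have normal forms whose entries at
 * index 1 have counts 10 and 6 (index 0, the zero-frequency bucket, is skipped
 * by computeRelativeEntropy). Then avgCounts[1] = (10 + 6) / 2 = 8, and
 *
 *   SRE = 0.5 * 10 * ln(10/8) + 0.5 * 6 * ln(6/8) ~= 0.25
 *
 * which is far above CLUSTER_TOLERANCE (0.01), so clusterHistograms would not
 * merge the two. Identical normal forms always yield an SRE of exactly 0.
 */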