/*
* Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
*
* Cloudera, Inc. licenses this file to you under the Apache License,
* Version 2.0 (the "License"). You may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for
* the specific language governing permissions and limitations under the
* License.
*/
package com.cloudera.recordbreaker.learnstructure;
import java.util.*;
/*********************************************************
* Histogram statistically summarizes the contents of a set of chunks.
* It's used to find similar, but not identical, chunks in the input data.
*********************************************************/
public class Histogram {
///////////////////////////////////////////////////
// static members
///////////////////////////////////////////////////
static final double CLUSTER_TOLERANCE = 0.01;    // max symmetric relative entropy for two histograms to cluster together
static final double MAX_RESIDUAL_MASS = 0.1;     // struct test: max fraction of mass outside the dominant frequency
static final double MIN_COVERAGE_FACTOR = 0.2;   // both tests: min fraction of chunks in which a token must appear
///////////////////////////////////////////////////
// static classes
///////////////////////////////////////////////////
static class FrequencyPair implements Comparable<FrequencyPair> {
int perChunkFrequency;
int overallChunkCount;
public FrequencyPair(int perChunkFrequency, int overallChunkCount) {
this.perChunkFrequency = perChunkFrequency;
this.overallChunkCount = overallChunkCount;
}
public int compareTo(FrequencyPair other) {
// Sort by descending overall chunk count; break ties by ascending per-chunk frequency.
int cmp = other.overallChunkCount - overallChunkCount;
if (cmp == 0) {
cmp = perChunkFrequency - other.perChunkFrequency;
}
return cmp;
}
public int getPerChunkFrequency() {
return perChunkFrequency;
}
public int getCount() {
return overallChunkCount;
}
}
//////////////////////////////////////////////////////////////
// Static methods: for computing histograms from data,
// and for clustering the resulting histograms.
//////////////////////////////////////////////////////////////
/**
* Compute a statistical summary of the data. This will return a histogram
* for each token type, indicating the relative proportion and distribution of
* the token type in the chunkset.
*
* Thus, the size of the output of this function is relatively small: no larger
* than the number of potential token types. However, it can take a while to compute
* if the input data is huge.
*
* REMIND mjc - This fn may be a good candidate for runtime-perf optimization
*/
public static List<Histogram> computeNormalizedHistograms(List<List<Token.AbstractToken>> chunks) {
Map<String, Map<Integer, Integer>> allHistograms = new TreeMap<String, Map<Integer, Integer>>();
List<Histogram> normalizedHistograms = new ArrayList<Histogram>();
//
// 1. Compute some per-chunk statistics
//
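// Illustrative sketch (example data, not from the source): given three chunks
//   A = [INT, STR, STR],  B = [INT, STR],  C = [INT]
// this step yields localFreq(A) = {INT:1, STR:2}, localFreq(B) = {INT:1, STR:1},
// and localFreq(C) = {INT:1}, so allHistograms becomes
//   {INT: {1:3}, STR: {1:1, 2:1}}
// i.e., STR appears exactly once in one chunk and exactly twice in another.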
for (List<Token.AbstractToken> chunk: chunks) {
// Compute frequencies of token-type within this chunk
HashMap<String, Integer> localFreq = new HashMap<String, Integer>();
for (Token.AbstractToken tok: chunk) {
Integer count = localFreq.get(tok.getId());
if (count == null) {
localFreq.put(tok.getId(), 1);
} else {
localFreq.put(tok.getId(), count.intValue() + 1);
}
}
// Now adjust the "histogram of frequencies" associated with each token type
for (String tokenId: localFreq.keySet()) {
Map<Integer, Integer> perTokenTypeHistogram = allHistograms.get(tokenId);
if (perTokenTypeHistogram == null) {
perTokenTypeHistogram = new HashMap<Integer, Integer>();
allHistograms.put(tokenId, perTokenTypeHistogram);
}
Integer currentTokenTypeCount = localFreq.get(tokenId);
Integer countSoFar = perTokenTypeHistogram.get(currentTokenTypeCount);
if (countSoFar == null) {
perTokenTypeHistogram.put(currentTokenTypeCount, 1);
} else {
perTokenTypeHistogram.put(currentTokenTypeCount, countSoFar.intValue() + 1);
}
}
}
//
// 2. Now, for each per-token-type histogram, record the number of chunks
// that contained *no instance* of the token at all (the zero-frequency bucket).
//
for (String tokenId: allHistograms.keySet()) {
Map<Integer, Integer> perTokenTypeHistogram = allHistograms.get(tokenId);
int numberOfChunksForObservedTokenTypeCount = 0;
for (Integer currentTokenTypeCount: perTokenTypeHistogram.keySet()) {
numberOfChunksForObservedTokenTypeCount += perTokenTypeHistogram.get(currentTokenTypeCount);
}
perTokenTypeHistogram.put(0, chunks.size() - numberOfChunksForObservedTokenTypeCount);
}
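// Continuing the sketch above: with three chunks total, INT's histogram gains
// a zero bucket {0:0} (it appears in every chunk), while STR's gains {0:1}
// (it is absent from chunk C).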
//
// 3. Normalize the per-token-type histograms
//
for (Map.Entry<String, Map<Integer, Integer>> e1: allHistograms.entrySet()) {
String tokenId = e1.getKey();
Map<Integer, Integer> perTokenTypeHistogram = e1.getValue();
double coverage = 0;
double totalMass = 0;
// 3.1. Compute the histogram's normal form: all the counts in descending order of prevalence in the chunk set.
// Also, compute some metainfo stats along the way
SortedSet<FrequencyPair> sorter = new TreeSet<FrequencyPair>();
for (Map.Entry<Integer, Integer> e2: perTokenTypeHistogram.entrySet()) {
Integer perChunkFrequency = e2.getKey();
Integer overallChunkCount = e2.getValue();
if (perChunkFrequency.intValue() != 0) {
coverage += overallChunkCount.intValue();
sorter.add(new FrequencyPair(perChunkFrequency, overallChunkCount));
}
totalMass += overallChunkCount.intValue();
}
List<FrequencyPair> normalForm = new ArrayList<FrequencyPair>();
for (FrequencyPair p: sorter) {
normalForm.add(p);
}
normalForm.add(0, new FrequencyPair(0, perTokenTypeHistogram.get(0)));
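// Continuing the sketch: STR's pairs (freq=1, count=1) and (freq=2, count=1)
// tie on chunk count and sort by ascending frequency, so after the zero bucket
// is prepended its normal form is [(0,1), (1,1), (2,1)].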
// 3.2. Compute metainfo. "Width" is the number of distinct nonzero per-chunk
// frequencies (the -1 drops the zero bucket added in step 2). "Residual mass"
// is the fraction of chunks whose frequency is not the single most common
// nonzero frequency (normal-form position 1).
double width = perTokenTypeHistogram.size() - 1;
double residualMass = (totalMass - normalForm.get(1).getCount()) / totalMass;
// 3.3 Done with the histogram!
normalizedHistograms.add(new Histogram(tokenId, normalForm, width, residualMass, coverage));
}
return normalizedHistograms;
}
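/* A minimal usage sketch (hypothetical variable names; assumes the tokenizer
* elsewhere in this package has already split the input into per-line chunks):
*
*   List<List<Token.AbstractToken>> chunks = ...; // one token list per input line
*   List<Histogram> histograms = Histogram.computeNormalizedHistograms(chunks);
*   List<List<Histogram>> clusters = Histogram.clusterHistograms(histograms);
*/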
/**
* Cluster together histograms that appear to be related.
*
* We currently employ agglomerative single-link clustering. That means:
* a) Each data element starts as its own cluster.
* b) We merge two clusters whenever the distance between them is less than CLUSTER_TOLERANCE.
* c) The distance between two clusters is the *minimum distance between any two of their members*.
* This is called "single link" clustering. The resulting cluster quality is not as good as
* computing the distance from the average of each cluster's members, but it is more efficient.
*/
public static List<List<Histogram>> clusterHistograms(List<Histogram> inputHistograms) {
// 1. Handle degenerate case of size(input) == 1
if (inputHistograms.size() == 1) {
List<List<Histogram>> clusters = new ArrayList<List<Histogram>>();
clusters.add(inputHistograms);
return clusters;
}
// 2. Otherwise, compute pairwise symmetric relative entropy among histograms
class Score implements Comparable<Score> {
double s;
int i;
int j;
public Score(double s, int i, int j) {
this.s = s;
this.i = i;
this.j = j;
}
public int getIndex1() {
return i;
}
public int getIndex2() {
return j;
}
public int compareTo(Score other) {
// Ascending by score; ties broken on the index pair so that distinct
// pairs with equal scores are not collapsed by the TreeSet.
int cmp = Double.compare(this.s, other.s);
if (cmp == 0) {
cmp = this.i - other.i;
}
if (cmp == 0) {
cmp = this.j - other.j;
}
return cmp;
}
}
SortedSet<Score> scores = new TreeSet<Score>();
for (int i = 0; i < inputHistograms.size(); i++) {
for (int j = i+1; j < inputHistograms.size(); j++) {
Histogram h1 = inputHistograms.get(i);
Histogram h2 = inputHistograms.get(j);
double sre = h1.computeSymmetricRelativeEntropy(h2);
if (sre < CLUSTER_TOLERANCE) {
scores.add(new Score(sre, i, j));
}
}
}
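// Only pairs closer than CLUSTER_TOLERANCE are retained, so iterating over
// "scores" in ascending order below performs the single-link merging. With
// illustrative numbers: if sre(0,1)=0.002 and sre(1,2)=0.008 but sre(0,2)=0.05,
// histograms 0, 1, and 2 still land in one cluster via the 0-1 and 1-2 links.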
// Initialize clusters
Map<Integer, Integer> histogramToCluster = new TreeMap<Integer, Integer>();
Map<Integer, Set<Integer>> clusterToHistograms = new TreeMap<Integer, Set<Integer>>();
for (int i = 0; i < inputHistograms.size(); i++) {
histogramToCluster.put(i, i);
Set<Integer> containedHistograms = new HashSet<Integer>();
containedHistograms.add(i);
clusterToHistograms.put(i, containedHistograms);
}
// Start merging clusters
for (Score s: scores) {
int idx1 = s.getIndex1();
int idx2 = s.getIndex2();
int cluster1 = histogramToCluster.get(idx1);
int cluster2 = histogramToCluster.get(idx2);
if (cluster1 == cluster2) {
continue;
}
for (Integer histogramId: clusterToHistograms.get(cluster2)) {
histogramToCluster.put(histogramId, cluster1);
}
clusterToHistograms.get(cluster1).addAll(clusterToHistograms.get(cluster2));
clusterToHistograms.remove(cluster2);
}
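// Note that histogramToCluster is updated eagerly for every member of the
// absorbed cluster, so the lookups above always resolve to a live cluster id.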
// Build the clustered histogram list.
List<List<Histogram>> clusters = new ArrayList<List<Histogram>>();
for (Map.Entry<Integer, Set<Integer>> entry: clusterToHistograms.entrySet()) {
Integer clusterId = entry.getKey();
Set<Integer> histograms = entry.getValue();
List<Histogram> curCluster = new ArrayList<Histogram>();
for (Integer histogramIndex: histograms) {
curCluster.add(inputHistograms.get(histogramIndex));
}
clusters.add(curCluster);
}
return clusters;
}
//////////////////////////////////////////////////////////////
// Members
//////////////////////////////////////////////////////////////
String histogramType;
List<FrequencyPair> normalForm;
double width;
double residualMass;
double coverage;
//////////////////////////////////////////////////////////////
// Methods
//////////////////////////////////////////////////////////////
public Histogram(String histogramType, List<FrequencyPair> normalForm, double width, double residualMass, double coverage) {
this.histogramType = histogramType;
this.normalForm = normalForm;
this.width = width;
this.residualMass = residualMass;
this.coverage = coverage;
}
/**
* A token type passes the struct test when its per-chunk frequency is nearly
* constant (low residual mass) and it appears in a large enough fraction of
* the chunks (coverage).
*/
public boolean passStructStatisticalTest(int numChunks) {
return residualMass < MAX_RESIDUAL_MASS && coverage > MIN_COVERAGE_FACTOR * numChunks;
}
/**
* A token type passes the array test when its per-chunk frequency takes many
* distinct values (width > 3) and it appears in a large enough fraction of
* the chunks (coverage).
*/
public boolean passArrayStatisticalTest(int numChunks) {
return width > 3 && coverage > MIN_COVERAGE_FACTOR * numChunks;
}
public String getHistogramType() {
return histogramType;
}
public double getWidth() {
return width;
}
public double getResidualMass() {
return residualMass;
}
public double getCoverage() {
return coverage;
}
@Override
public String toString() {
StringBuilder buf = new StringBuilder();
buf.append("Histogram: type=" + histogramType + ", width=" + width + ", residualMass=" + residualMass + ", coverage=" + coverage + ", normalForm=[");
for (FrequencyPair fp: normalForm) {
buf.append("(" + fp.getPerChunkFrequency() + ", " + fp.getCount() + ") ");
}
buf.append("]");
return buf.toString();
}
/**
* The relative entropy score is used for clustering. We cannot compute it
* directly on two raw histograms, because they do not always contain the same
* components. Instead, computeSymmetricRelativeEntropy() first builds the
* averaged counts and then passes them to this fairly generic method, which
* computes an (unnormalized) relative entropy: sum_i p_i * ln(p_i / q_i),
* where p_i is this histogram's count at position i of the normal form and
* q_i is the averaged count. The zero-frequency entry at position 0 is skipped.
*/
double computeRelativeEntropy(List<Double> avgCounts) {
double total = 0;
for (int i = 1; i < normalForm.size(); i++) {
double selfOverallChunkCount = normalForm.get(i).getCount();
double otherOverallChunkCount = avgCounts.get(i);
total += selfOverallChunkCount * Math.log(selfOverallChunkCount / otherOverallChunkCount);
}
return total;
}
/**
* Preprocess the data from two input Histograms to get it ready for the
* relative entropy computation. Without this preprocessing, the rel-entropy
* computation would be sensitive to the histograms having different numbers
* of components; here, a position present in only one histogram is averaged
* against an implicit count of zero in the other.
*/
double computeSymmetricRelativeEntropy(Histogram other) {
List<Double> avgCounts = new ArrayList<Double>();
for (int i = 0; i < Math.max(normalForm.size(), other.normalForm.size()); i++) {
if ((i < normalForm.size()) && (i < other.normalForm.size())) {
avgCounts.add((normalForm.get(i).getCount() + other.normalForm.get(i).getCount()) / 2.0);
} else if (i < normalForm.size()) {
avgCounts.add(normalForm.get(i).getCount() * 0.5);
} else {
avgCounts.add(other.normalForm.get(i).getCount() * 0.5);
}
}
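// avgCounts plays the role of M = (P + Q) / 2, so the value returned below is
// 0.5*KL(P||M) + 0.5*KL(Q||M) over the raw counts. Up to normalizing the counts
// into probabilities (and excluding the zero-frequency bucket), this mirrors
// the Jensen-Shannon divergence of the two histograms.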
return 0.5 * this.computeRelativeEntropy(avgCounts) + 0.5 * other.computeRelativeEntropy(avgCounts);
}
}