/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.toolbox.proteomics;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.ApplicationContext;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithRPerspectivePlot;
import org.fhcrc.cpl.toolbox.datastructure.Pair;
import org.apache.log4j.Logger;
import java.util.*;
/**
* _Highly_ generic 2-dimensional clusterer. Adapted from FeatureGrouper, which was specific
* to clustering Features by mass and scan.
*
* Arrays of Clusterables are added to a tree structure, which divides them up first by
* dimension1 and then by dimension2. Each Clusterable is aware of its value in both dimensions
* and (as far as Clusterer2D is concerned) nothing else.
*
* Any calling class will have to implement its own Clusterer class in order for clustering to have
* any meaning.
*
* So far so good. Here's the weird bit: we need to be able to cluster Features by mass, and
* the max bucket size may be in terms of Daltons or in terms of PPM. If it's in terms of Daltons,
* great. If ppm, then the canSplit() method has some very funky behavior. By default it behaves as
* you might expect, though.
*/
public class Clusterer2D
{
private static Logger _log = Logger.getLogger(Clusterer2D.class);
//one Clusterable[] per "set" added via addSet(); TreeEntry.iSet indexes into this list
private List<Clusterable[]> _clusterableArrays = new ArrayList<Clusterable[]>();
//root of the dimension-1 clustering tree; null until the first addSet() call
public Node _root;
//cached result of summarize(); invalidated (set to null) by addSet() and split2D()
BucketSummary[] summaries;
//determines how to determine whether to split buckets in each dimension
protected ClusterDimensionSplitCalculator _dimensionSplitCalculator =
new DefaultClusterDimensionSplitCalculator();
//keeps track of whether each dimension should really be treated as int values (not doubles).
// For formatting of output only!
protected boolean _dimension1IsInt = false;
protected boolean _dimension2IsInt = false;
/** Creates an empty clusterer; Clusterable sets are added later via addSet(). */
public Clusterer2D()
{
}
/**
 * Convenience constructor: creates the clusterer and immediately adds
 * the supplied array as the first Clusterable set.
 * @param clusterableArray the initial set of Clusterables
 */
public Clusterer2D(Clusterable[] clusterableArray)
{
    this();
    addSet(clusterableArray);
}
/**
 * Evaluates every pairwise combination of the candidate dimension-1 and
 * dimension-2 bucket sizes, scoring each combination by the number of
 * "perfect" buckets (exactly one clusterable from each set), and returns the
 * best-scoring pair of bucket sizes.
 * NOTE(review): this leaves the tree split by the last-evaluated combination,
 * not necessarily the best one; callers must re-split if they need the tree.
 * @param dimension1Buckets candidate bucket sizes for dimension 1 (e.g., mass)
 * @param dimension2Buckets candidate bucket sizes for dimension 2 (e.g., elution)
 * @param showCharts if true, try to display a 3D perspective plot of the scores (requires R)
 * @return the (dimension1, dimension2) bucket-size pair producing the most perfect buckets
 */
public Pair<Double, Double> calculateBestBuckets(double[] dimension1Buckets,
                                                 double[] dimension2Buckets, boolean showCharts)
{
    double bestDimension1Bucket = 0;
    double bestDimension2Bucket = 0;
    //maximum number of perfect matches we could possibly have (size of the smallest
    //set). Used in order to stop searching if we find that many. Not likely to
    //encounter this in real scenarios, maybe should remove?
    int maxPossiblePerfectMatches = Integer.MAX_VALUE;
    for (Clusterable[] clusterableArray : _clusterableArrays)
        if (clusterableArray.length < maxPossiblePerfectMatches)
            maxPossiblePerfectMatches = clusterableArray.length;
    double[][] perfectMatches = new double[dimension1Buckets.length][dimension2Buckets.length];
    int bestNumPerfectMatches = -1;
    //evaluate all combinations of mass and hydrophobicity buckets
    bucketSearch:
    for (int iMass = 0; iMass < dimension1Buckets.length; iMass++)
    {
        for (int iElution = 0; iElution < dimension2Buckets.length; iElution++)
        {
            double dimension2Bucketsize = dimension2Buckets[iElution];
            double dimension1Bucketsize = dimension1Buckets[iMass];
            //skip nonsensical (non-positive) bucket sizes
            if (dimension2Bucketsize <= 0 || dimension1Bucketsize <= 0)
                continue;
            split2D(dimension1Bucketsize, dimension2Bucketsize);
            perfectMatches[iMass][iElution] = rowsWithOneFromEach();
            _log.debug("Evaluating bucket size: mass " + dimension1Bucketsize + ", elution " + dimension2Bucketsize);
            _log.debug(" Bucket score: " + perfectMatches[iMass][iElution]);
            if (perfectMatches[iMass][iElution] > bestNumPerfectMatches)
            {
                bestDimension1Bucket = dimension1Bucketsize;
                bestDimension2Bucket = dimension2Bucketsize;
                bestNumPerfectMatches = (int) perfectMatches[iMass][iElution];
                //if we've got as many perfect matches as we can get, stop evaluating buckets
                if (bestNumPerfectMatches == maxPossiblePerfectMatches)
                {
                    _log.debug("Found best matches early, saved " + (dimension1Buckets.length-iMass-1) + " mass evals");
                    //bugfix: must be a labeled break. A plain break only exited the
                    //inner (elution) loop, so all remaining mass evaluations still
                    //ran, contradicting the log message above.
                    break bucketSearch;
                }
            }
        }
    }
    if (showCharts)
    {
        //don't want to require R, so catching exceptions
        try
        {
            PanelWithRPerspectivePlot perspPlot = new PanelWithRPerspectivePlot();
            perspPlot.setName("Perfect Buckets");
            perspPlot.setAxisRVariableNames("dim1","dim2","perfect_buckets");
            perspPlot.plot(dimension1Buckets, dimension2Buckets, perfectMatches);
            perspPlot.displayInTab();
        }
        catch (Exception e)
        {
            //include the exception so chart failures are diagnosable
            _log.debug("Error showing plot", e);
        }
    }
    _log.debug("Best Mass/Elution buckets: " + bestDimension1Bucket + ", " + bestDimension2Bucket);
    ApplicationContext.setMessage("Num perfect buckets: " + bestNumPerfectMatches);
    return new Pair<Double, Double>(bestDimension1Bucket, bestDimension2Bucket);
}
/**
 * A single entry in the clustering tree, aware of its value in only one dimension.
 * Natural ordering is by {@code value}, ascending.
 */
public static class TreeEntry implements Comparable
{
    public Clusterable parent; //the Clusterable this entry represents
    public int iSet;           //index of the set (in _clusterableArrays) the parent came from
    public double value;       //this entry's value in the dimension being split

    public int compareTo(Object o)
    {
        TreeEntry treeEntry = (TreeEntry) o;
        //bugfix: the less-than case used to return 0, which violates the
        //Comparable contract (a<b compared "equal" while b>a compared greater)
        //and makes Arrays.sort() produce unreliable orderings.
        if (value < treeEntry.value)
            return -1;
        else if (value > treeEntry.value)
            return 1;
        return 0;
    }
}
/**
 * Replaces the default split-width calculator used by Node.canSplit().
 * @param dimensionSplitCalculator the calculator to use from now on
 */
public void setDimensionSplitCalculator(ClusterDimensionSplitCalculator dimensionSplitCalculator)
{
    _dimensionSplitCalculator = dimensionSplitCalculator;
}
/**
 * abstract class for determining whether to split in each dimension. See usage in canSplit().
 * Given a node's minimum value in a dimension and the user-specified maximum bucket
 * size, implementations return the effective maximum width allowed for that node
 * (e.g., converting a PPM tolerance to Daltons based on the reference value).
 */
public abstract static class ClusterDimensionSplitCalculator
{
//effective max bucket width for dimension 1, given the node's min value
public abstract double calculateDimension1ForSplit(double referenceValue, double dimensionValue);
//effective max bucket width for dimension 2, given the node's min value
public abstract double calculateDimension2ForSplit(double referenceValue, double dimensionValue);
}
/**
 * Default split calculator is very simple! The requested bucket size is used
 * as-is in both dimensions; the reference value is ignored.
 */
public static class DefaultClusterDimensionSplitCalculator
extends ClusterDimensionSplitCalculator
{
//identity: the allowed width is exactly the requested bucket size
public double calculateDimension1ForSplit(double referenceValue, double dimensionValue)
{
return dimensionValue;
}
//identity: the allowed width is exactly the requested bucket size
public double calculateDimension2ForSplit(double referenceValue, double dimensionValue)
{
return dimensionValue;
}
}
/**
 * Defines values in two dimensions. Individual implementations will likely wrap
 * some other object (e.g., a Feature) and pull these values from it.
 */
public interface Clusterable
{
    double getDimension1Value();
    double getDimension2Value();
}
/**
 * A node in the clustering tree, aware of its children. Each node owns a
 * contiguous, value-sorted slice [start, start+length) of a shared TreeEntry
 * array; splitting partitions that slice at the largest gap between adjacent
 * values.
 */
private class Node
{
//shared, value-sorted backing array; this node covers only [start, start+length)
TreeEntry[] entries;
int start;
int length;
//children produced by reSplit(); both null when this node is a leaf
Node left, right;
Node dimension2; //used to divide up in a different dimension
Node(TreeEntry[] entries, int start, int length)
{
assert length > 0;
this.entries = entries;
this.start = start;
this.length = length;
}
//smallest value in this node's slice (entries are sorted ascending)
double getMin()
{
return entries[start].value;
}
//largest value in this node's slice
double getMax()
{
return entries[start + length - 1].value;
}
//returns a fresh copy of just this node's slice
TreeEntry[] getEntries()
{
TreeEntry[] newArray = new TreeEntry[length];
System.arraycopy(entries, start, newArray, 0, length);
return newArray;
}
//merges additional entries into this node's slice and re-sorts; afterwards
//this node owns the entire (new) backing array, starting at 0
void addEntries(TreeEntry[] addEntries)
{
TreeEntry[] newEntries = new TreeEntry[length + addEntries.length];
if (null != this.entries)
System.arraycopy(this.entries, start, newEntries, 0, length);
System.arraycopy(addEntries, 0, newEntries, length, addEntries.length);
this.entries = newEntries;
start = 0;
length = this.entries.length;
Arrays.sort(this.entries);
}
/**
 * In the case of splitting Features by mass, we might have to convert
 * from PPM to Da before figuring out whether we can split. This means we need to
 * take into account getMin() as well as maxBucket. So I've abstracted the calculation of
 * the allowed bucket width.
 * @param maxBucket the user-specified maximum bucket width
 * @param dimension the dimension we're considering -- can have different behavior for both
 * @return true if this node has more than one entry and spans more than the allowed width
 */
boolean canSplit(double maxBucket, int dimension)
{
return (length > 1) &&
(getMax() - getMin() >
(dimension == 2 ? _dimensionSplitCalculator.calculateDimension2ForSplit(getMin(), maxBucket)
: _dimensionSplitCalculator.calculateDimension1ForSplit(getMin(), maxBucket)));
}
//recursively splits this node at the largest gap between adjacent sorted
//values, until every leaf fits within the allowed bucket width
void reSplit(double maxBucket, int dimension)
{
if (canSplit(maxBucket, dimension))
{
double maxDistance = 0;
int bestIndex = -1;
//TODO: Can pre-compute all of this...
//find the index just before the largest gap in this node's slice
for (int i = start; i < start + length - 1; i++)
{
double distance = entries[i + 1].value - entries[i].value;
if (distance > maxDistance)
{
maxDistance = distance;
bestIndex = i;
}
}
assert bestIndex >= start;
left = new Node(entries, start, bestIndex + 1 - start);
assert start + length - bestIndex - 1 > 0;
right = new Node(entries, bestIndex + 1, start + length - bestIndex - 1);
left.reSplit(maxBucket, dimension);
right.reSplit(maxBucket, dimension);
}
else
left = right = null;
}
//collects the leaves of this subtree into the given list (created if null)
List<Node> appendLeafNodes(List<Node> nodes)
{
if (nodes == null)
nodes = new ArrayList<Node>();
if (null == left)
nodes.add(this);
else
{
left.appendLeafNodes(nodes);
right.appendLeafNodes(nodes);
}
return nodes;
}
//convenience: leaves of this subtree in a new list
List<Node> getLeafNodes()
{
return appendLeafNodes(null);
}
}
/**
 * Adds one set of Clusterables to the clusterer. Entries are keyed by their
 * dimension-1 value here; dimension-2 sub-trees are built later by split2D().
 * Invalidates any cached bucket summaries.
 * @param clusterableArray the set to add; its set index is its position in the add order
 */
public void addSet(Clusterable[] clusterableArray)
{
    summaries = null;
    int setIndex = _clusterableArrays.size();
    _clusterableArrays.add(clusterableArray);

    TreeEntry[] newEntries = new TreeEntry[clusterableArray.length];
    for (int i = 0; i < newEntries.length; i++)
    {
        TreeEntry entry = new TreeEntry();
        entry.parent = clusterableArray[i];
        entry.iSet = setIndex;
        entry.value = clusterableArray[i].getDimension1Value();
        newEntries[i] = entry;
    }
    Arrays.sort(newEntries);

    if (_root == null)
        _root = new Node(newEntries, 0, newEntries.length);
    else
        _root.addEntries(newEntries);
}
/**
 * Clusters all entries in two passes: first splits the tree by dimension 1
 * using maxDimension1Bucket, then, beneath each resulting dimension-1 leaf,
 * builds a dimension-2 sub-tree and splits it using maxDimension2Bucket.
 * Invalidates any cached bucket summaries.
 * @param maxDimension1Bucket maximum bucket width in dimension 1
 * @param maxDimension2Bucket maximum bucket width in dimension 2
 */
public void split2D(double maxDimension1Bucket, double maxDimension2Bucket)
{
    summaries = null;
    _root.reSplit(maxDimension1Bucket, 1);
    for (Node dim1Leaf : _root.getLeafNodes())
    {
        TreeEntry[] dim1Entries = dim1Leaf.getEntries();
        TreeEntry[] dim2Entries = new TreeEntry[dim1Entries.length];
        //re-key this leaf's entries by their dimension-2 value
        for (int i = 0; i < dim1Entries.length; i++)
        {
            TreeEntry source = dim1Entries[i];
            TreeEntry rekeyed = new TreeEntry();
            rekeyed.parent = source.parent;
            rekeyed.iSet = source.iSet;
            rekeyed.value = source.parent.getDimension2Value();
            dim2Entries[i] = rekeyed;
        }
        Arrays.sort(dim2Entries);
        dim1Leaf.dimension2 = new Node(dim2Entries, 0, dim2Entries.length);
        dim1Leaf.dimension2.reSplit(maxDimension2Bucket, 2);
    }
}
/**
 * Builds (or returns the cached) array of one BucketSummary per 2D bucket,
 * i.e., per dimension-2 leaf under each dimension-1 leaf. split2D() must have
 * been called first so every dimension-1 leaf has a dimension-2 sub-tree.
 * @return one summary per 2D bucket
 */
public BucketSummary[] summarize()
{
    if (summaries == null)
    {
        List<BucketSummary> summaryList = new ArrayList<BucketSummary>();
        for (Node dim1Leaf : _root.getLeafNodes())
            for (Node dim2Leaf : dim1Leaf.dimension2.getLeafNodes())
                summaryList.add(new BucketSummary(dim1Leaf, dim2Leaf));
        summaries = summaryList.toArray(new BucketSummary[summaryList.size()]);
    }
    return summaries;
}
/**
 * Histogram of bucket occupancy: element i holds the number of buckets
 * containing exactly i clusterables.
 * @return histogram indexed by per-bucket clusterable count
 */
public int[] histogramBucketCounts()
{
    BucketSummary[] bucketSummaries = summarize();
    int maxCount = 0;
    for (BucketSummary summary : bucketSummaries)
        maxCount = Math.max(maxCount, summary.featureCount);
    int[] histogram = new int[maxCount + 1];
    for (BucketSummary summary : bucketSummaries)
        histogram[summary.featureCount]++;
    return histogram;
}
/**
 * Histogram of set representation: element i holds the number of buckets
 * containing clusterables from exactly i distinct sets.
 * @return histogram indexed by per-bucket set count
 */
public int[] histogramSetCounts()
{
    BucketSummary[] bucketSummaries = summarize();
    int maxCount = 0;
    for (BucketSummary summary : bucketSummaries)
        maxCount = Math.max(maxCount, summary.setCount);
    int[] histogram = new int[maxCount + 1];
    for (BucketSummary summary : bucketSummaries)
        histogram[summary.setCount]++;
    return histogram;
}
/**
 * @return the number of 2D buckets produced by the last split2D() call
 */
public int numBuckets()
{
    return summarize().length;
}
/**
 * Convenience overload: summarizes the current tree and counts the buckets
 * containing exactly one clusterable from every set.
 * @return number of "perfect" buckets
 */
public int rowsWithOneFromEach()
{
    return rowsWithOneFromEach(summarize());
}
/**
 * Calculates how many buckets have exactly 1 feature from each feature set.
 * Useful for testing alignment.
 * @param bucketSummaries the bucket summaries to scan
 * @return the number of "perfect" buckets
 */
public int rowsWithOneFromEach(BucketSummary[] bucketSummaries)
{
    int numSets = _clusterableArrays.size();
    int perfectBuckets = 0;
    //perfect == every set represented, and no set represented more than once
    for (BucketSummary summary : bucketSummaries)
    {
        if (summary.featureCount == numSets && summary.setCount == numSets)
            perfectBuckets++;
    }
    //TODO: It would be nice to make the minimum number of feature sets with
    //TODO: one-from-each configurable, rather than requiring all sets.
    return perfectBuckets;
}
/**
 * Summary of one 2D bucket (a dimension-2 leaf within a dimension-1 leaf):
 * bounds in both dimensions, occupancy counts, and the bucket's entries
 * grouped by originating set.
 */
public class BucketSummary
{
private Node _dimension2Leaf; //Innermost node
public double minDimension1;
public double maxDimension1;
public double minDimension2;
public double maxDimension2;
//total number of clusterables in this bucket
public int featureCount = 0;
//number of distinct sets represented in this bucket
public int setCount = 0;
//maps set index -> this bucket's entries originating from that set
public Map<Integer, List<TreeEntry>> setIndexTreeEntryListMap =
new HashMap<Integer, List<TreeEntry>>();
//builds the summary from a dimension-1 leaf and one of its dimension-2 leaves
public BucketSummary(Node dimension1Leaf, Node dimension2Leaf)
{
this._dimension2Leaf = dimension2Leaf;
this.minDimension1 = dimension1Leaf.getMin();
this.maxDimension1 = dimension1Leaf.getMax();
this.minDimension2 = dimension2Leaf.getMin();
this.maxDimension2 = dimension2Leaf.getMax();
this.featureCount = dimension2Leaf.length;
//tally entries per set while bucketing them into setIndexTreeEntryListMap
int[] setCounts = new int[_clusterableArrays.size()];
for (int i = dimension2Leaf.start; i < dimension2Leaf.start + dimension2Leaf.length; i++)
{
TreeEntry treeEntry = dimension2Leaf.entries[i];
setCounts[treeEntry.iSet]++;
getTreeEntryListForSetIndex(treeEntry.iSet).add(treeEntry);
}
//setCount = number of sets with at least one entry in this bucket
for (int i = 0; i < _clusterableArrays.size(); i++)
if (setCounts[i] > 0)
{
setCount++;
}
}
/**
 * Returns the entry from treeEntryList whose value in the given dimension is
 * closest to valueToMatch. No time is wasted here: if there's only one entry,
 * it is returned immediately.
 * @param valueToMatch the target value (e.g., the bucket mean)
 * @param dimension 1 or 2 -- selects which dimension's value to compare
 * @param treeEntryList non-empty list of candidate entries
 * @return the closest entry (ties keep the earlier entry)
 */
protected TreeEntry pickClosestTreeEntry(double valueToMatch, int dimension,
                                         List<TreeEntry> treeEntryList)
{
    TreeEntry bestTreeEntry = treeEntryList.get(0);
    if (treeEntryList.size() > 1)
    {
        double bestTreeEntryDifference =
                Math.abs((dimension == 1 ? bestTreeEntry.parent.getDimension1Value() :
                        bestTreeEntry.parent.getDimension2Value()) -
                        valueToMatch);
        for (int i = 1; i < treeEntryList.size(); i++)
        {
            TreeEntry treeEntry = treeEntryList.get(i);
            //bugfix: this used to measure bestTreeEntry's distance rather than
            //the candidate treeEntry's, so the comparison below never succeeded
            //and the first entry was always returned
            double treeEntryDifference =
                    Math.abs((dimension == 1 ? treeEntry.parent.getDimension1Value() :
                            treeEntry.parent.getDimension2Value()) -
                            valueToMatch);
            if (treeEntryDifference < bestTreeEntryDifference)
            {
                bestTreeEntry = treeEntry;
                bestTreeEntryDifference = treeEntryDifference;
            }
        }
    }
    return bestTreeEntry;
}
/**
 * Resolve pairing conflicts for features in the same bucket. If there's more than one
 * feature from a set in a bucket, it gets hairy.
 * Time is wasted if there's already just one for each set, because I don't want to
 * waste time checking for that here. If that's the case, don't call this method.
 *
 * The way one feature from each set is picked is as follows:
 * -Along the interestingDimension, find the average value in the bucket
 * -For each set, find each feature's deviance from that value and pick the feature
 *  with the smallest
 *
 * TODO: The picking in this method represents the most questionable choices in all
 * the clustering code. We need to revisit:
 * -is it reasonable to do this picking based on just one dimension?
 * -is it reasonable to use the center of the bucket in order to pick the "closest" feature?
 * -if so, when calculating the center of the bucket, should we exclude features from the
 *  current set (this would take more time)
 *
 * @param interestingDimension 1 or 2 -- the dimension used to compute the bucket mean
 * @return A list of Clusterables, one representing each set (null for unrepresented
 *         sets), in order of increasing set index
 */
public Clusterable[] pickOneFromEachSet(int interestingDimension)
{
    //bugfix: iterate only this leaf's slice (getEntries()), not the raw shared
    //entries array, which can cover entries belonging to other buckets -- the
    //same entries-vs-getEntries() bug fixed in getParentList() on 20100512
    TreeEntry[] bucketEntries = _dimension2Leaf.getEntries();
    double interestingDimensionSum = 0;
    for (TreeEntry treeEntry : bucketEntries)
    {
        interestingDimensionSum +=
                (interestingDimension == 1 ? treeEntry.parent.getDimension1Value() :
                        treeEntry.parent.getDimension2Value());
    }
    double interestingDimensionMean =
            interestingDimensionSum / (double) bucketEntries.length;

    //the per-set entry lists hold the same parents regardless of dimension, so
    //setIndexTreeEntryListMap serves for either value of interestingDimension
    //(this replaces a former ternary whose branches were identical)
    Clusterable[] result = new Clusterable[_clusterableArrays.size()];
    for (int i = 0; i < _clusterableArrays.size(); i++)
    {
        List<TreeEntry> setTreeEntryList = setIndexTreeEntryListMap.get(i);
        if (setTreeEntryList == null || setTreeEntryList.isEmpty())
            result[i] = null;
        else
        {
            result[i] = pickClosestTreeEntry(interestingDimensionMean,
                    interestingDimension, setTreeEntryList).parent;
        }
    }
    return result;
}
/**
 * @return the parents of every entry in this bucket, in dimension-2 order
 */
public List<Clusterable> getParentList()
{
    //dhmay fixing 20100512. Must use getEntries() (this leaf's slice), not the
    //raw entries array, which is <sigh> completely different
    List<Clusterable> parents = new ArrayList<Clusterable>();
    for (TreeEntry entry : _dimension2Leaf.getEntries())
        parents.add(entry.parent);
    return parents;
}
/**
 * @param setIndex index of the set of interest
 * @return the parents of this bucket's entries that came from the given set
 */
public List<Clusterable> getParentListForSetIndex(int setIndex)
{
    //Dimension doesn't matter, because both tree lists share the same parents
    List<TreeEntry> setEntries = getTreeEntryListForSetIndex(setIndex);
    List<Clusterable> parents = new ArrayList<Clusterable>(setEntries.size());
    for (TreeEntry entry : setEntries)
        parents.add(entry.parent);
    return parents;
}
/**
 * Lazily creates and returns the per-set entry list for the given set index.
 * @param setIndex index of the set of interest
 * @return the (possibly empty, never null) list of this bucket's entries from that set
 */
public List<TreeEntry> getTreeEntryListForSetIndex(int setIndex)
{
    if (!setIndexTreeEntryListMap.containsKey(setIndex))
        setIndexTreeEntryListMap.put(setIndex, new ArrayList<TreeEntry>());
    return setIndexTreeEntryListMap.get(setIndex);
}
/**
 * @return the parents of every entry in this bucket, as an Object[]
 */
public Object[] objects()
{
    //bugfix: this used to allocate a Feature[] -- a leftover from the
    //Feature-specific FeatureGrouper ancestor of this class -- which throws
    //ArrayStoreException for any Clusterable parent that isn't a Feature.
    //NOTE(review): if any caller downcasts the result to Feature[], it must be
    //updated to cast elements individually instead -- verify callers.
    Object[] objects = new Clusterable[_dimension2Leaf.length];
    for (int i = 0; i < _dimension2Leaf.length; i++)
    {
        TreeEntry treeEntry = _dimension2Leaf.entries[i + _dimension2Leaf.start];
        objects[i] = treeEntry.parent;
    }
    return objects;
}
/**
 * @return a copy of this bucket's entries, in dimension-2 order
 */
public TreeEntry[] entries()
{
    TreeEntry[] copy = new TreeEntry[_dimension2Leaf.length];
    System.arraycopy(_dimension2Leaf.entries, _dimension2Leaf.start,
            copy, 0, _dimension2Leaf.length);
    return copy;
}
/**
 * Like arrayRow(), but also appends, for each entry in this bucket, its index
 * in the backing array and its parent's toString(), tab-separated.
 * @return the detailed tab-separated row
 */
public String arrayRowDetail()
{
    StringBuffer sb = new StringBuffer(arrayRow());
    int end = _dimension2Leaf.start + _dimension2Leaf.length;
    for (int i = _dimension2Leaf.start; i < end; i++)
    {
        sb.append('\t').append(i)
          .append('\t').append(_dimension2Leaf.entries[i].parent.toString());
    }
    return sb.toString();
}
/**
 * One tab-separated row describing this bucket; currently identical to toString().
 * @return the row text
 */
public String arrayRow()
{
    return toString();
}
/**
 * Tab-separated summary: dim1 min/max, dim2 min/max, feature count, set count.
 * Dimension bounds are rendered as ints when the owning clusterer says so.
 */
public String toString()
{
    String dimension1Text = dimension1IsInt()
            ? (int) minDimension1 + "\t" + (int) maxDimension1
            : minDimension1 + "\t" + maxDimension1;
    String dimension2Text = dimension2IsInt()
            ? (int) minDimension2 + "\t" + (int) maxDimension2
            : minDimension2 + "\t" + maxDimension2;
    return dimension1Text + "\t" + dimension2Text + "\t" + featureCount + "\t" + setCount;
}
}
//getters and setters
/** @return whether dimension 1 values are formatted as ints in text output */
public boolean dimension1IsInt()
{
return _dimension1IsInt;
}
/** Sets whether dimension 1 values are formatted as ints in text output. */
public void setDimension1IsInt(boolean _dimension1IsInt)
{
this._dimension1IsInt = _dimension1IsInt;
}
/** @return whether dimension 2 values are formatted as ints in text output */
public boolean dimension2IsInt()
{
return _dimension2IsInt;
}
/** Sets whether dimension 2 values are formatted as ints in text output. */
public void setDimension2IsInt(boolean _dimension2IsInt)
{
this._dimension2IsInt = _dimension2IsInt;
}
/** @return the live list of Clusterable sets (not a copy) */
public List<Clusterable[]> getClusterableArrays()
{
return _clusterableArrays;
}
//NOTE(review): replaces the set list without rebuilding the tree; the tree and
//iSet indices can go stale if sets were already added -- confirm intended usage
public void setClusterableArrays(List<Clusterable[]> clusterableArrays)
{
_clusterableArrays = clusterableArrays;
}
/**
 * @return the total number of entries across all added sets, or 0 if none
 */
public int countAllEntries()
{
    return (_root == null) ? 0 : _root.entries.length;
}
}