/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.toolbox.proteomics.feature.matching; import org.fhcrc.cpl.toolbox.proteomics.feature.Feature; import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet; import org.fhcrc.cpl.toolbox.proteomics.Clusterer2D; import org.fhcrc.cpl.toolbox.proteomics.MassUtilities; import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureClusterer; import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureGrouper; import org.fhcrc.cpl.toolbox.proteomics.feature.matching.FeatureSetMatcher; import org.fhcrc.cpl.toolbox.proteomics.feature.matching.BaseFeatureSetMatcherImpl; import org.apache.log4j.Logger; import java.util.*; /** * Attempts to match one feature set against another. * This version is the closest to what CS types would call hierarchical clustering: * we don't attempt to guess the global "best" values for every bucket. Instead, we * start loose, make what matches we can, and drill down into the smaller buckets, breaking * them up until either everything's cool or we hit a defined minimum in both dimensions. * * Strictly speaking, though, this isn't exactly hierarchical clustering, either. Instead of * breaking clusters in half each time, I'm decreasing the cluster size slightly and re-clustering. * * In order to do true hierarchical clustering appropriately, it would be necessary to rewrite * Clusterer2D. I think this is a close enough approximation that that's not a high priority. * Since it would affect array creation, too, I'm reluctant to do it. */ public class RecursiveFeatureSetMatcher extends BaseFeatureSetMatcherImpl implements FeatureSetMatcher { private static Logger _log = Logger.getLogger(RecursiveFeatureSetMatcher.class); public static final double DEFAULT_HYDRO_ELUTION_BUCKET_INCREMENT = 0.01; public static final double DEFAULT_SCAN_ELUTION_BUCKET_INCREMENT = 5; public static final double DEFAULT_TIME_ELUTION_BUCKET_INCREMENT = 10; protected static final double ABSOLUTE_MASS_BUCKET_INCREMENT = 0.05; protected static final double PPM_MASS_BUCKET_INCREMENT = 2; protected double massBucketIncrement = ABSOLUTE_MASS_BUCKET_INCREMENT; protected double elutionBucketIncrement = DEFAULT_HYDRO_ELUTION_BUCKET_INCREMENT; public static final double DEFAULT_MIN_DELTA_MASS_PPM = 1; public static final double DEFAULT_MIN_DELTA_MASS_DA = .05; public static final double DEFAULT_MIN_DELTA_SCAN = 5; public static final double DEFAULT_MIN_DELTA_TIME = 10; public static final double DEFAULT_MIN_DELTA_H = .005; protected double minDeltaMass = DEFAULT_MIN_DELTA_MASS_PPM; protected double minDeltaElution = DEFAULT_MIN_DELTA_SCAN; protected int[] matchedAtDepth; protected boolean useMassInsteadOfMz = true; public RecursiveFeatureSetMatcher() { } public RecursiveFeatureSetMatcher(float deltaMass, int deltaMassType, float deltaElution) { super(); init(deltaMass, deltaMassType, deltaElution); } public void init(float deltaMass, int deltaMassType, float deltaElution) { super.init(deltaMass, deltaMassType, deltaElution); if (deltaMassType == FeatureSetMatcher.DELTA_MASS_TYPE_PPM) massBucketIncrement = PPM_MASS_BUCKET_INCREMENT; } public String toString() { return "RecursiveFeatureSetMatcher: deltaMass=" + deltaMass + ", deltaElution=" + deltaElution; } /** * * // In the PPM case, we have to convert from PPM to Da before figuring out whether we can split. // This is done by converting maxBucket based on this bucket's getMin() value. This will // ensure that a bucket will be split if any subregion of the bucket would be split for the // given PPM value. // This should be reasonable behavior, perhaps splitting slightly more than necessary. * @param clusterer */ protected void setDimensionSplitCalculator(FeatureClusterer clusterer) { if (deltaMassType == FeatureSetMatcher.DELTA_MASS_TYPE_PPM) { clusterer.setDimensionSplitCalculator(new Clusterer2D.ClusterDimensionSplitCalculator() { public double calculateDimension1ForSplit(double referenceValue, double dimensionValue) { return MassUtilities.calculateAbsoluteDeltaMass((float) referenceValue, (float) dimensionValue, FeatureSetMatcher.DELTA_MASS_TYPE_PPM); } public double calculateDimension2ForSplit(double referenceValue, double dimensionValue) { return dimensionValue; } }); } } /** * Match two feature sets using clustering * @param featureSet1 * @param featureSet2 * @return */ public FeatureMatchingResult matchFeatures(FeatureSet featureSet1, FeatureSet featureSet2) { _log.debug("matchFeatures 1"); List<Feature> set1Features = new ArrayList<Feature>(featureSet1.getFeatures().length); List<Feature> set2Features = new ArrayList<Feature>(featureSet2.getFeatures().length); for (Feature feature : featureSet1.getFeatures()) set1Features.add(feature); for (Feature feature : featureSet2.getFeatures()) set2Features.add(feature); matchedAtDepth = new int[100]; FeatureMatchingResult result = recursivelyMatch(set1Features,set2Features, deltaMass, deltaElution, 1); _log.debug("Done. Features matched at depth..."); for (int i=1; i<matchedAtDepth.length; i++) { if (matchedAtDepth[i] == 0) break; _log.debug("\t" + i + " : " + matchedAtDepth[i]); } return result; } protected FeatureMatchingResult recursivelyMatch(List<Feature> set1Features, List<Feature> set2Features, double currentDeltaMass, double currentDeltaElution, int depth) { FeatureMatchingResult result = new FeatureMatchingResult(); boolean adjustedMass=false; boolean adjustedElution=false; if (currentDeltaMass < minDeltaMass) { currentDeltaMass = minDeltaMass; adjustedMass=true; } if (currentDeltaElution < minDeltaElution) { currentDeltaElution = minDeltaElution; adjustedElution=true; } if (adjustedMass && adjustedElution) return result; FeatureSet currentFeatureSet1 = new FeatureSet(set1Features.toArray( new Feature[set1Features.size()])); FeatureSet currentFeatureSet2 = new FeatureSet(set2Features.toArray( new Feature[set2Features.size()])); FeatureClusterer clusterer = new FeatureClusterer(FeatureClusterer.MASS_MZ_MODE_MASS, elutionMode, currentFeatureSet1); clusterer.addSet(currentFeatureSet2); setDimensionSplitCalculator(clusterer); clusterer.split2D(currentDeltaMass, currentDeltaElution); Clusterer2D.BucketSummary[] bucketSummaries = clusterer.summarize(); int numConflictBuckets = 0; int matchedThisIteration = 0; for (Clusterer2D.BucketSummary bucketSummary : bucketSummaries) { //for each bucket, if it has at least one pair, do something with it if (bucketSummary.setCount == 2) { if (bucketSummary.entries().length == 2) { //exactly one from each, no-brainer Feature masterSetFeature = ((FeatureClusterer.FeatureClusterable) bucketSummary.getParentListForSetIndex(0).get(0)).getParentFeature(); Feature slaveSetFeature = ((FeatureClusterer.FeatureClusterable) bucketSummary.getParentListForSetIndex(1).get(0)).getParentFeature(); result.add(masterSetFeature, slaveSetFeature); matchedAtDepth[depth]++; } else { //Conflict bucket. Try to resolve recursively. If failure, add as is List<Feature> set2FeaturesThisBucket = FeatureGrouper.getFeaturesFromSet(1, bucketSummary); List<Feature> set1FeaturesThisBucket = FeatureGrouper.getFeaturesFromSet(0, bucketSummary); double nextDeltaMass = currentDeltaMass; double nextDeltaElution = currentDeltaElution; nextDeltaMass -= massBucketIncrement; nextDeltaElution -= elutionBucketIncrement; FeatureMatchingResult subResult = recursivelyMatch(set1FeaturesThisBucket, set2FeaturesThisBucket, nextDeltaMass, nextDeltaElution, depth+1); if (subResult.size() == 0) { //if we got nothing from the next level, make the matches at this level for (Feature masterSetFeature : set1FeaturesThisBucket) { //order slave set features in the order determined by //variable settings at the superclass level List<Feature> orderedSlaveSetFeatures = orderSlaveSetFeatures(set2FeaturesThisBucket, masterSetFeature); result.put(masterSetFeature, orderedSlaveSetFeatures); matchedAtDepth[depth]++; } } else { //if we got some stuff at the next level, add that stuff, then //go back and pick up the spares at this level Set<Feature> subResultMasterSetFeatures = subResult.getMasterSetFeatures(); Set<Feature> subResultSlaveSetFeatures = subResult.getSlaveSetFeatures(); for (Feature feature : subResultMasterSetFeatures) { result.put(feature, subResult.get(feature)); } Set<Feature> remainingMasterSetFeatures = new HashSet<Feature>(); for (Feature feature : set1FeaturesThisBucket) { if (!subResultMasterSetFeatures.contains(feature)) remainingMasterSetFeatures.add(feature); } if (!remainingMasterSetFeatures.isEmpty()) { List<Feature> remainingSlaveSetFeatures = new ArrayList<Feature>(); for (Feature feature : set2FeaturesThisBucket) if (!subResultSlaveSetFeatures.contains(feature)) remainingSlaveSetFeatures.add(feature); if (!remainingSlaveSetFeatures.isEmpty()) { for (Feature masterSetFeature : remainingMasterSetFeatures) { List<Feature> orderedSlaveSetFeatures = orderSlaveSetFeatures(set2FeaturesThisBucket, masterSetFeature); result.put(masterSetFeature, orderedSlaveSetFeatures); matchedAtDepth[depth]++; } } } } numConflictBuckets++; } } else { //if bucket contains only features from one set, those features are //unmatchable. } } // _log.debug("\t\tDone. matched: " + matchedThisIteration + // ", Conflict buckets: " + numConflictBuckets); return result; } /** * Convert a list of Clusterables, assumed to be FeatureMassHydroClusterables, to a list of * Features * @param clusterableList * @return */ protected List<Feature> getFeatureListForClusterableList(List<Clusterer2D.Clusterable> clusterableList) { List<Feature> result = new ArrayList<Feature>(clusterableList.size()); for (Clusterer2D.Clusterable clusterable : clusterableList) { result.add(((FeatureClusterer.FeatureClusterable) clusterable).getParentFeature()); } return result; } public double getMassBucketIncrement() { return massBucketIncrement; } public void setMassBucketIncrement(double massBucketIncrement) { this.massBucketIncrement = massBucketIncrement; } public double getElutionBucketIncrement() { return elutionBucketIncrement; } public void setElutionBucketIncrement(double elutionBucketIncrement) { this.elutionBucketIncrement = elutionBucketIncrement; } public void setUseMassInsteadOfMz(boolean value) { useMassInsteadOfMz = value; } public double getMinDeltaMass() { return minDeltaMass; } public void setMinDeltaMass(double minDeltaMass) { this.minDeltaMass = minDeltaMass; } public double getMinDeltaElution() { return minDeltaElution; } public void setMinDeltaElution(double minDeltaElution) { this.minDeltaElution = minDeltaElution; } public void setElutionMode(int newElutionMode) { super.setElutionMode(newElutionMode); switch (newElutionMode) { case FeatureClusterer.ELUTION_MODE_SCAN: elutionBucketIncrement = (float) DEFAULT_SCAN_ELUTION_BUCKET_INCREMENT; minDeltaElution = DEFAULT_MIN_DELTA_SCAN; break; case FeatureClusterer.ELUTION_MODE_TIME: elutionBucketIncrement = (float) DEFAULT_TIME_ELUTION_BUCKET_INCREMENT; minDeltaElution = DEFAULT_MIN_DELTA_TIME; break; } } }