ClusteringFeatureSetMatcher.java example

Explorer
msInspect-master
/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fhcrc.cpl.toolbox.proteomics.feature.matching;

import org.fhcrc.cpl.toolbox.datastructure.Pair;
import org.fhcrc.cpl.toolbox.proteomics.Clusterer2D;
import org.fhcrc.cpl.toolbox.proteomics.MassUtilities;

import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureClusterer;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureGrouper;
import org.fhcrc.cpl.toolbox.proteomics.feature.*;
import org.apache.log4j.Logger;

import java.util.*;

/**
 * Attempts to match one feature set against another using clustering
 */
public class ClusteringFeatureSetMatcher extends BaseFeatureSetMatcherImpl
        implements FeatureSetMatcher
{
    private static Logger _log = Logger.getLogger(ClusteringFeatureSetMatcher.class);

    public static final double DEFAULT_HYDRO_ELUTION_BUCKET_INCREMENT = 0.05;

    protected static final double ABSOLUTE_MASS_BUCKET_INCREMENT = 0.05;
    protected static final double PPM_MASS_BUCKET_INCREMENT = 1;
    protected double massBucketIncrement = ABSOLUTE_MASS_BUCKET_INCREMENT;
    protected double elutionBucketIncrement = DEFAULT_HYDRO_ELUTION_BUCKET_INCREMENT;

    //Up these values only with caution!  The complexity of this feature-matching grows with
    //numMassBuckets * numElutionBuckets
    protected int numMassBuckets = 4;
    protected int numElutionBuckets = 4;

    protected double bestMassBucketSize = 0;
    protected double bestElutionBucketSize = 0;

   protected boolean useMassInsteadOfMz = true;



    public ClusteringFeatureSetMatcher()
    {
    }

    public ClusteringFeatureSetMatcher(float deltaMass, int deltaMassType,
                                       float deltaElution)
    {
        super();
        init(deltaMass, deltaMassType, deltaElution);
    }

    public void init(float deltaMass, int deltaMassType, float deltaElution)
    {
        super.init(deltaMass, deltaMassType, deltaElution);
        if (deltaMassType == FeatureSetMatcher.DELTA_MASS_TYPE_PPM)
            massBucketIncrement = PPM_MASS_BUCKET_INCREMENT;
    }

    public String toString()
    {
        return "ClusteringFeatureSetMatcher: deltaMass=" + deltaMass +
                ", deltaElution=" + deltaElution;
    }

    /**
     * Calculate the buckets to use for either mass or hydro clustering
     * @param numBuckets
     * @param bucketIncrement
     * @param maxBucketSize
     * @return
     */
    protected double[] calculateBuckets(int numBuckets, double bucketIncrement,
                                        double maxBucketSize)
    {
        _log.debug("Buckets:");
        ArrayList<Double> resultList = new ArrayList<Double>();
        double lowBucket = maxBucketSize - (bucketIncrement * (numBuckets - 1));

        for (int i=0; i<numBuckets; i++)
        {
            double bucketSize = lowBucket + (i * bucketIncrement);
            if (bucketSize > 0)
            {
                resultList.add(bucketSize);
                _log.debug("  " + resultList.get(resultList.size()-1));
            }
        }
        double[] result = new double[resultList.size()];
        for (int i=0; i<resultList.size(); i++)
            result[i] = resultList.get(i);
        return result;
    }


    /**
     * Match two feature sets using clustering
     * @param featureSet1
     * @param featureSet2
     * @return
     */
    public FeatureMatchingResult
            matchFeatures(FeatureSet featureSet1, FeatureSet featureSet2)
    {
//for (Feature feature : featureSet1.getFeatures())
//{
//    System.err.println(AmtExtraInfoDef.getObservedHydrophobicity(feature));
//}
        FeatureClusterer clusterer =
                new FeatureClusterer(useMassInsteadOfMz ? FeatureClusterer.MASS_MZ_MODE_MASS :
                                                          FeatureClusterer.MASS_MZ_MODE_MZ,
                                     elutionMode, featureSet1);
        clusterer.addSet(featureSet2);
//System.err.println("matcF 1, ms1Features: " + featureSet1.getFeatures().length + ", ms2Features: " + featureSet2.getFeatures().length);
//System.err.println("mass? " + useMassInsteadOfMz + ", elut: " + elutionMode);
//System.err.println("deltaMass: " + deltaMass + ", deltaElut: " + deltaElution);

//for (Feature feature : featureSet1.getFeatures()) System.err.println(" H: " + AmtExtraInfoDef.getObservedHydrophobicity(feature));
//System.err.println("set 1 features: " + featureSet1.getFeatures().length);

//      In the PPM case, we have to convert from PPM to Da before figuring out whether we can split.
//      This is done by converting maxBucket based on this bucket's getMin() value.  This will
//      ensure that a bucket will be split if any subregion of the bucket would be split for the
//      given PPM value.
//      This should be reasonable behavior, perhaps splitting slightly more than necessary.
        if (deltaMassType == FeatureSetMatcher.DELTA_MASS_TYPE_PPM)
        {
            clusterer.setDimensionSplitCalculator(new Clusterer2D.ClusterDimensionSplitCalculator()
            {
                public double calculateDimension1ForSplit(double referenceValue, double dimensionValue)
                {
                    return MassUtilities.calculateAbsoluteDeltaMass((float) referenceValue,
                            (float) dimensionValue,
                            FeatureSetMatcher.DELTA_MASS_TYPE_PPM);
                }

                public double calculateDimension2ForSplit(double referenceValue, double dimensionValue)
                {
                    return dimensionValue;
                }
            });
        }

        FeatureMatchingResult result = new FeatureMatchingResult();

        //maximum number of perfect matches we could possibly have
//        int maxPossiblePerfectMatches = Math.min(featureSet1.getFeatures().length,
//                                                 featureSet2.getFeatures().length);



        double[] massBuckets = calculateBuckets(numMassBuckets, massBucketIncrement, getDeltaMass());
        double[] elutionBuckets = calculateBuckets(numElutionBuckets,
                                                          elutionBucketIncrement,
                                                          getDeltaElution());

        setBestElutionBucketSize(0);
        setBestMassBucketSize(0);

        Pair<Double,Double> bestBucketSizes = clusterer.calculateBestBuckets(massBuckets, elutionBuckets, false);
        clusterer.split2D(bestBucketSizes.first, bestBucketSizes.second);
        _log.debug("Splitting on buckets " + bestBucketSizes.first + " and " +
                   bestBucketSizes.second);
        Clusterer2D.BucketSummary[] bucketSummaries = clusterer.summarize();

int numConflictBuckets = 0;
        for (Clusterer2D.BucketSummary bucketSummary : bucketSummaries)
        {
            //for each bucket, if it has at least one pair, do something with it
            if (bucketSummary.setCount == 2)
            {
                if (bucketSummary.entries().length > 2)
                {
                    List<Feature> slaveSetFeatures =
                            FeatureGrouper.getFeaturesFromSet(1, bucketSummary);
                    for (Feature masterSetFeature :
                            FeatureGrouper.getFeaturesFromSet(0, bucketSummary))
                    {
                        //order slave set features in the order determined by
                        //variable settings at the superclass level
                        List<Feature> orderedSlaveSetFeatures =
                                orderSlaveSetFeatures(slaveSetFeatures, masterSetFeature);
                        result.put(masterSetFeature, orderedSlaveSetFeatures);
                    }
                    numConflictBuckets++;
//                    Feature[] oneFromEachSet =
//                            FeatureGrouper.resolveClusterConflicts(bucketSummary,
//                                    new FeatureSet[] {featureSet1, featureSet2},
//                                    FeatureGrouper.CONFLICT_RESOLVER_BEST);
                }
                else
                {
                    //exactly one from each, no-brainer
                    Feature masterSetFeature = ((FeatureClusterer.FeatureClusterable)
                            bucketSummary.getParentListForSetIndex(0).get(0)).getParentFeature();
                    Feature slaveSetFeature = ((FeatureClusterer.FeatureClusterable)
                            bucketSummary.getParentListForSetIndex(1).get(0)).getParentFeature();
                    result.add(masterSetFeature, slaveSetFeature);
//System.err.println("first scan: " + firstOne.getParentFeature().scan + ", second scan: " + secondOne.getParentFeature().scan);                    
                }
/*
                List<Clusterer2D.Clusterable> set2Clusterables = bucketSummary.getParentListForSetIndex(1);
                List<Clusterer2D.Clusterable> set1Clusterables = bucketSummary.getParentListForSetIndex(0);
                List<Feature> set2Features = getFeatureListForClusterableList(set2Clusterables);
                List<Feature> set1Features = getFeatureListForClusterableList(set1Clusterables);
                //if there are only two features in the bucket, then obviously we've got one from each
                if (bucketSummary.featureCount == 2)
                {
                    result.add(new Pair<Feature, Feature>(set2Features.get(0),
                                                          set1Features.get(0)));
                }
                else if (bucketSummary.featureCount > 2)
                {
                    numConflictBuckets++;
                    result.add(resolvePairingConflicts(set2Features, set1Features));
                }
*/
            }
        }
//System.err.println("Perfect buckets: " + clusterer.rowsWithOneFromEach(bucketSummaries) + ", conflicts: " + numConflictBuckets);

        return result;
    }




    /**
     * Convert a list of Clusterables, assumed to be FeatureMassHydroClusterables, to a list of
     * Features
     * @param clusterableList
     * @return
     */
    protected List<Feature>
            getFeatureListForClusterableList(List<Clusterer2D.Clusterable> clusterableList)
    {
        List<Feature> result = new ArrayList<Feature>(clusterableList.size());
        for (Clusterer2D.Clusterable clusterable : clusterableList)
        {
            result.add(((FeatureClusterer.FeatureClusterable) clusterable).getParentFeature());
        }
        return result;
    }



    /**
     * Pick the closest feature to a given feature by mass, from a list.
     * @param baseFeature
     * @param featuresToMatch
     * @return
     */
    protected Feature pickClosestByMass(Feature baseFeature, List<Feature> featuresToMatch)
    {
        Feature result = null;
        //worst value possible
        double bestMassDistance = Double.MAX_VALUE;
        for (Feature featureToMatch : featuresToMatch)
        {
            double currentMassDistance = Math.abs(baseFeature.getMass() - featureToMatch.getMass());
            if (currentMassDistance < bestMassDistance)
            {
                bestMassDistance = currentMassDistance;
                result = featureToMatch;
            }
        }
        return result;
    }


    public double getBestMassBucketSize()
    {
        return bestMassBucketSize;
    }

    public void setBestMassBucketSize(double bestMassBucketSize)
    {
        this.bestMassBucketSize = bestMassBucketSize;
    }

    public double getBestElutionBucketSize()
    {
        return bestElutionBucketSize;
    }

    public void setBestElutionBucketSize(double bestElutionBucketSize)
    {
        this.bestElutionBucketSize = bestElutionBucketSize;
    }


    public int getNumMassBuckets()
    {
        return numMassBuckets;
    }

    public void setNumMassBuckets(int numMassBuckets)
    {
        this.numMassBuckets = numMassBuckets;
    }

    public int getNumElutionBuckets()
    {
        return numElutionBuckets;
    }

    public void setNumElutionBuckets(int numElutionBuckets)
    {
        this.numElutionBuckets = numElutionBuckets;
    }

    public double getMassBucketIncrement()
    {
        return massBucketIncrement;
    }

    public void setMassBucketIncrement(double massBucketIncrement)
    {
        this.massBucketIncrement = massBucketIncrement;
    }

    public double getElutionBucketIncrement()
    {
        return elutionBucketIncrement;
    }

    public void setElutionBucketIncrement(double elutionBucketIncrement)
    {
        this.elutionBucketIncrement = elutionBucketIncrement;
    }

    public void setUseMassInsteadOfMz(boolean value)
    {
        useMassInsteadOfMz = value;
    }
}