/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.viewer.align; import org.fhcrc.cpl.toolbox.ApplicationContext; import org.fhcrc.cpl.toolbox.gui.chart.PanelWithScatterPlot; import org.fhcrc.cpl.toolbox.statistics.BasicStatistics; import org.fhcrc.cpl.toolbox.datastructure.Pair; import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet; import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureGrouper; import org.fhcrc.cpl.toolbox.proteomics.feature.Feature; import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureClusterer; import org.fhcrc.cpl.toolbox.proteomics.feature.matching.FeatureSetMatcher; import org.fhcrc.cpl.toolbox.proteomics.Clusterer2D; import org.fhcrc.cpl.toolbox.proteomics.MassUtilities; import org.fhcrc.cpl.viewer.amt.AmtMatchProbabilityAssigner; import java.awt.*; import java.io.File; import java.io.FileOutputStream; import java.io.PrintWriter; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.log4j.Logger; /** * User: migra * Date: Mar 9, 2005 * Time: 11:06:52 AM * * dhmay changing 20100104. Big changes: separating features by charge before clustering. Alignment is still done based * on a deconvoluted featureset, and normalization is done on a deconvoluted featureset in which all feature charges * are assigned 1. But clustering is done separately for each charge state. 
That way, feature intensity comparison * actually means something. * * dhmay changing 20100311. More big changes. Two kinds of optimization now: * 1. "perfectbuckets". This is the old-school optimization in which we try a bunch of combinations of parameters * and pick the one that gives us the most 'perfect buckets', i.e., rows with one feature from each run. This * generally works pretty well, but the downside is that it can only evaluate the bucket sizes you give it. * 2. "em". This is a mixed-model approach, pretty much identical to what we do in calculating AMT match probability. * We perform a loose clustering and then calculate the distances, in both dimensions, between all pairs of features * that are clustered together (up to some maximum). Then we assume that this 2D error distribution is a mixed * distribution of true matches (bivariate normal) and false (uniform), and calculate the parameters of the distributions * accordingly. Cluster dimensions are then figured out based on the highest possible error for a match with probability * greater than some threshold of being in the true distribution. The bounding box set by these values (note: not * equivalent to the area actually containing matches with those probabilities, which is not rectangular) is used * for clustering. This isn't as good at maximizing "perfect buckets" as the "perfectbuckets" strategy is if you give * "perfectbuckets" a good set of tolerances, but it can be better if you're not sure about your mass accuracy / RT * reproducibility. 
*/ public class BucketedPeptideArray implements Runnable { private static Logger _log = Logger.getLogger(BucketedPeptideArray.class); private java.util.List _sets; private FeatureSet.FeatureSelector _sel; boolean _align = true; boolean _normalize = false; double _elutionBucket = 50; double _massBucket = .2; protected Aligner _aligner = null; //different bucket sizes used for optimization protected double[] _elutionBuckets = {20, 30, 50, 75, 100, 150, 200, 300, 400}; public static final double[] DEFAULT_MASS_BUCKETS_DA = {.025, .05, .1, .15, .2}; public static final double[] DEFAULT_MASS_BUCKETS_PPM = {3, 5, 7, 10, 20}; protected double[] _massBuckets = DEFAULT_MASS_BUCKETS_DA; protected FeatureGrouper _featureGrouper; protected Aligner.FeaturePairSelector _featurePairSelector = Aligner.DEFAULT_FEATURE_PAIR_SELECTOR; private String _outFileName; protected int _conflictResolver = FeatureGrouper.DEFAULT_CONFLICT_RESOLVER; //parameters related to (optional) deconvolution public static final int DEFAULT_DECONVOLUTE_SCAN_WINDOW = 6; public static final double DEFAULT_DECONVOLUTE_MASS_WINDOW = 0.2; protected int _deconvoluteScanWindow = DEFAULT_DECONVOLUTE_SCAN_WINDOW; protected double _deconvoluteMassWindow = DEFAULT_DECONVOLUTE_MASS_WINDOW; protected boolean _shouldDeconvolute = false; //should we optimize based on the distribution of mass and rt error? 
public static final int OPTIMIZE_MODE_ERRORDIST = 0; public static final int OPTIMIZE_MODE_PERFECTBUCKETS = 1; public static final int DEFAULT_OPTIMIZATION_MODE = OPTIMIZE_MODE_PERFECTBUCKETS; protected int optimizationMode = DEFAULT_OPTIMIZATION_MODE; //For calculating the initial, wide tolerances for generating the dataset that's fed to the EM algorithm //during distribution optimization public static final float DEFAULT_MASS_WIDE_TOLERANCE_PPM = 40; public static final float DEFAULT_MASS_WIDE_TOLERANCE_DA = 0.2f; public static final float DEFAULT_RT_WIDE_TOLERANCE = 1000; protected float sdMultipleForWideToleranceCalc = 4f; protected float optimizationRTWideTolerance = DEFAULT_RT_WIDE_TOLERANCE; protected float optimizationMassWideTolerance = DEFAULT_MASS_WIDE_TOLERANCE_PPM; //The EM algorithm performance bogs down hugely if you give it too many datapoints. Max datapoints to use. protected int maxDatapointsForEMAlgorithm = 10000; //In calculating a bounding box for optimized tolerances, minimum match probability for locating features //to be included in the box public static final float DEFAULT_MAX_MATCH_FDR_EM_OPT = 0.05f; protected float maxMatchFDRForToleranceBoxCalc = DEFAULT_MAX_MATCH_FDR_EM_OPT; //Initial assumption about the proportion of matches (with wide tolerances) that are true matches. This is //likely to be an extremely high proportion if there's any real signal here. 
If not, the EM algorithm will //correct this protected float initialDistAssumptionTrue = 0.95f; public BucketedPeptideArray(List<?> sets) { _sets = sets; _featureGrouper = new FeatureGrouper(); _aligner = new SplineAligner(); } public BucketedPeptideArray(List<?> sets, FeatureSet.FeatureSelector sel) { this(sets); _sel = sel; } public BucketedPeptideArray(List<?> sets, FeatureSet.FeatureSelector sel, double elutionBucket, double massBucket, String outFileName, boolean align) { this(sets,sel); _elutionBucket = elutionBucket; _massBucket = massBucket; _outFileName = outFileName; _align = align; } public void run() { run(false); } public void run(boolean optimize) { run(optimize, FeatureGrouper.BUCKET_EVALUATION_MODE_ONE_FROM_EACH); } public void run(boolean optimize, int optimizationMode) { run(optimize, optimizationMode, false); } public void run(boolean optimize, int optimizationPerfectBucketMode, boolean showCharts) { String detailsFileName = null; //System.err.println(_massBucket); System.err.println(_featureGrouper.getMassType()); PrintWriter out = null; if (_sel == null) { _log.debug("*************CREATING DUMMY SELECTOR"); _sel = new FeatureSet.FeatureSelector(); } try { ApplicationContext.setMessage("Loading"); java.util.List<FeatureSet> sourceFeatureSets = new ArrayList<FeatureSet>(); for (int i = 0; i < _sets.size(); i++) { FeatureSet fs; if (_sets.get(i) instanceof File) fs = new FeatureSet((File) _sets.get(i), Color.RED); else if (_sets.get(i) instanceof FeatureSet) fs = (FeatureSet) _sets.get(i); else if (_sets.get(i) instanceof String) fs = new FeatureSet(new File((String) _sets.get(i)), Color.RED); else { ApplicationContext.errorMessage("Couldn't load feature set due to bad object: " + _sets.get(i).toString(), null); return; } sourceFeatureSets.add(fs); } //Filter ApplicationContext.setMessage("Filtering"); _log.debug("sel: " + _sel); List<FeatureSet> featureSets = new ArrayList<FeatureSet>(); for (int i = 0; i < sourceFeatureSets.size(); i++) { 
FeatureSet fs = (sourceFeatureSets.get(i)).filter(_sel); _log.debug("\tbefore: " + sourceFeatureSets.get(i).getFeatures().length + ", after: " + fs.getFeatures().length); featureSets.add(fs); } //Align if (_align) { ApplicationContext.setMessage("Aligning"); _aligner.setFeaturePairSelector(_featurePairSelector); featureSets = _aligner.alignFeatureSets(featureSets, showCharts); } //Deconvolute if (_shouldDeconvolute) { ApplicationContext.setMessage("Deconvoluting"); for (int i=0; i<featureSets.size(); i++) { FeatureSet origSet = featureSets.get(i); FeatureSet deconvolutedSet = origSet.deconvolute(_deconvoluteScanWindow, _deconvoluteMassWindow, true); ApplicationContext.setMessage("\tCollapsed " + origSet.getFeatures().length + " features into " + deconvolutedSet.getFeatures().length); featureSets.set(i, deconvolutedSet); } } _featureGrouper.setShowCharts(showCharts); _featureGrouper.setGroupByMass(true); _featureGrouper.setGroupByCharge(true); _featureGrouper.setConflictResolver(_conflictResolver); if (optimize) { switch (optimizationMode) { case OPTIMIZE_MODE_PERFECTBUCKETS: optimizePerfectBuckets(featureSets, optimizationPerfectBucketMode, showCharts); break; case OPTIMIZE_MODE_ERRORDIST: optimizeDist(featureSets, showCharts); break; } } for (FeatureSet fs : featureSets) { _featureGrouper.addSet(fs); } _featureGrouper.split2D(_massBucket, _elutionBucket); ApplicationContext.setMessage("Perfect buckets: " + _featureGrouper.rowsWithOneFromEach()); if (_outFileName != null) out = new PrintWriter(new FileOutputStream(_outFileName)); else out = new PrintWriter(System.out); writeHeader(out); float[][] intensitiesAfterProcessing = _featureGrouper.writePeptideArray(out, _normalize); // ???? check for successful write? 
out.flush(); //Write details file if outfilename != null if (_outFileName != null) { out.close(); detailsFileName = calcDetailsFilepath(_outFileName); out = new PrintWriter(new FileOutputStream(detailsFileName)); writeHeader(out); //todo: figure out dynamically if we need to write out MS2 extrainfo ApplicationContext.setMessage("Writing details array"); _featureGrouper.writeArrayDetails(out, true); out.flush(); } if (showCharts && featureSets.size() == 2) { List<Float> logInt1 = new ArrayList<Float>(); List<Float> logInt2 = new ArrayList<Float>(); for (Clusterer2D.BucketSummary summary :_featureGrouper.summarize()) { if ((summary.featureCount == summary.setCount) && (summary.setCount == 2)) { logInt1.add((float)Math.log( ((FeatureClusterer.FeatureClusterable) summary.getParentListForSetIndex(0).get(0)).getParentFeature().getTotalIntensity())); logInt2.add((float)Math.log( ((FeatureClusterer.FeatureClusterable) summary.getParentListForSetIndex(1).get(0)).getParentFeature().getTotalIntensity())); } } if (logInt1.size() > 1) { PanelWithScatterPlot pwsp = new PanelWithScatterPlot(logInt1, logInt2, "PerfectBucket_logint_nonorm"); pwsp.setPointSize(2); pwsp.displayInTab(); ApplicationContext.setMessage("Perfect Bucket log-intensity correlation (without normalization): " + BasicStatistics.correlationCoefficient(logInt1, logInt2)); } if (_normalize) { logInt1 = new ArrayList<Float>(); logInt2 = new ArrayList<Float>(); for (int i=0; i<intensitiesAfterProcessing.length; i++) { if (intensitiesAfterProcessing[i][0] > 0 && intensitiesAfterProcessing[i][1] > 0) { logInt1.add((float)Math.log(intensitiesAfterProcessing[i][0])); logInt2.add((float)Math.log(intensitiesAfterProcessing[i][1])); } } if (logInt1.size() > 1) { PanelWithScatterPlot pwsp = new PanelWithScatterPlot(logInt1, logInt2, "AllBucket_logint_norm"); pwsp.setPointSize(2); pwsp.displayInTab(); ApplicationContext.setMessage("All Bucket log-intensity correlation (with normalization): " + 
BasicStatistics.correlationCoefficient(logInt1, logInt2)); } } } } catch (Exception x) { ApplicationContext.errorMessage("Error building peptide array", x); } finally { ApplicationContext.setMessage("Peptide array complete." + (_outFileName == null ? "" : " See " + _outFileName + " and " + detailsFileName)); if (out != null && _outFileName == null) { out.close(); //System.err.println("Closing out"); } } } protected void optimizePerfectBuckets(List<FeatureSet> featureSets, int optimizationPerfectBucketMode, boolean showCharts) { ApplicationContext.setMessage("Optimizing"); FeatureGrouper optimizeFeatureGrouper = new FeatureGrouper(); optimizeFeatureGrouper.setGroupByCharge(false); optimizeFeatureGrouper.setGroupByMass(true); optimizeFeatureGrouper.setMassType(_featureGrouper.getMassType()); optimizeFeatureGrouper.setConflictResolver(_conflictResolver); for (FeatureSet featureSet : featureSets) { optimizeFeatureGrouper.addSet(featureSet); } _log.debug("optimizing... added all sets"); StringBuffer massBucketsToPrint = new StringBuffer(); for (double massBucket : _massBuckets) massBucketsToPrint.append(massBucket + ", "); StringBuffer scanBucketsToPrint = new StringBuffer(); for (double elutionBucket : _elutionBuckets) scanBucketsToPrint.append(elutionBucket + ", "); ApplicationContext.setMessage("Optimizing: mass buckets " + massBucketsToPrint + " elution buckets " + scanBucketsToPrint); Pair<Double, Double> bestBuckets = optimizeFeatureGrouper.calculateBestBuckets(_massBuckets, _elutionBuckets, optimizationPerfectBucketMode, showCharts); _massBucket = bestBuckets.first; _elutionBucket = bestBuckets.second; ApplicationContext.setMessage("Using mass and elution buckets " + _massBucket + " and " + _elutionBucket); } /** * * @param featureSets * @param showCharts * @throws IOException */ protected void optimizeDist(List<FeatureSet> featureSets, boolean showCharts) throws IOException { ApplicationContext.setMessage("Beginning distribution optimization...."); 
FeatureGrouper optimizeFeatureGrouper = new FeatureGrouper(); optimizeFeatureGrouper.setGroupByCharge(true); FeatureSet.FeatureSelector accMassSel = new FeatureSet.FeatureSelector(); accMassSel.setAccurateMzOnly(true); for (FeatureSet featureSet : featureSets) optimizeFeatureGrouper.addSet(featureSet); optimizeFeatureGrouper.setGroupByMass(true); optimizeFeatureGrouper.setMassType(_featureGrouper.getMassType()); optimizeFeatureGrouper.setConflictResolver(_conflictResolver); optimizeFeatureGrouper.split2D(optimizationMassWideTolerance, optimizationRTWideTolerance); Clusterer2D.BucketSummary[] summaries = optimizeFeatureGrouper.summarize(); float[] clusterRTDiameters = new float[summaries.length]; float[] clusterMassDiameters = new float[summaries.length]; for (int i=0; i<summaries.length; i++) { Clusterer2D.BucketSummary summary = summaries[i]; if ((summary.featureCount < 2) || (summary.setCount != summary.featureCount)) continue; clusterMassDiameters[i] = (float) (summary.maxDimension1 - summary.minDimension1); if (_featureGrouper.getMassType() == FeatureClusterer.DELTA_MASS_TYPE_PPM) clusterMassDiameters[i] = MassUtilities.calculatePPMDeltaMass( (float) ((summary.maxDimension1 + summary.minDimension1) / 2), clusterMassDiameters[i], FeatureSetMatcher.DELTA_MASS_TYPE_ABSOLUTE); clusterRTDiameters[i] = (float) (summary.maxDimension2 - summary.minDimension2); } float massLimit = sdMultipleForWideToleranceCalc * BasicStatistics.standardDeviation(clusterMassDiameters); float rtLimit = sdMultipleForWideToleranceCalc * BasicStatistics.standardDeviation(clusterRTDiameters); ApplicationContext.setMessage(sdMultipleForWideToleranceCalc + "*SD mass diameter: " + massLimit + ", RT diameter = " + rtLimit); optimizeFeatureGrouper.split2D(massLimit, rtLimit); summaries = optimizeFeatureGrouper.summarize(); int numPotentialDatapoints = 0; for (Clusterer2D.BucketSummary summary : summaries) { int numBucketPoints = summary.entries().length; if (summary.setCount >1 && 
(summary.featureCount == summary.setCount)) { numPotentialDatapoints += numBucketPoints * (numBucketPoints-1); } } float fractionDatapointsToKeep = 1f; if (numPotentialDatapoints > maxDatapointsForEMAlgorithm) fractionDatapointsToKeep = (float) maxDatapointsForEMAlgorithm / (float) numPotentialDatapoints; ApplicationContext.setMessage("Potential Datapoints: " + numPotentialDatapoints + ". Keeping " + (fractionDatapointsToKeep * 100f) + "%"); List<Float> massDistances = new ArrayList<Float>(); List<Float> massesOfBuckets = new ArrayList<Float>(); List<Float> rtDistances = new ArrayList<Float>(); ApplicationContext.infoMessage("Calculating distances for " + summaries.length + " buckets"); for (Clusterer2D.BucketSummary summary : summaries) { Feature[] bucketFeatures = optimizeFeatureGrouper.getFeatures(summary); if ((summary.setCount > 1) && (summary.featureCount == summary.setCount)) { double[] masses = new double[bucketFeatures.length]; double[] mzs = new double[bucketFeatures.length]; double[] rts = new double[bucketFeatures.length]; for (int i=0; i<bucketFeatures.length; i++) { masses[i] = bucketFeatures[i].mass; mzs[i] = bucketFeatures[i].mz; rts[i] = bucketFeatures[i].time; } double medianMass = BasicStatistics.median(masses); List<Float> massDistancesThisBucket = new ArrayList<Float>(); List<Float> rtDistancesThisBucket = new ArrayList<Float>(); for (int i=0; i<bucketFeatures.length-1; i++) { for (int j=i+1; j<bucketFeatures.length; j++) { if (fractionDatapointsToKeep == 1 || Math.random() < fractionDatapointsToKeep) { massesOfBuckets.add((float) medianMass); float deltaMass = bucketFeatures[i].getMass() - bucketFeatures[j].getMass(); if (_featureGrouper.getMassType() == FeatureClusterer.DELTA_MASS_TYPE_PPM) deltaMass = MassUtilities.calculatePPMDeltaMass((float)medianMass, deltaMass, FeatureSetMatcher.DELTA_MASS_TYPE_ABSOLUTE); float deltaTime = _featureGrouper.getElution(bucketFeatures[i]) - _featureGrouper.getElution(bucketFeatures[j]); //these features are 
ordered by time, secondarily by mass. Need to randomize //the ordering so that we don't end up with all negative or all pos values boolean flipMassAndRT = Math.random() < 0.5f; if (flipMassAndRT) { deltaMass = -deltaMass; deltaTime = -deltaTime; } massDistancesThisBucket.add(deltaMass); rtDistancesThisBucket.add(deltaTime); } } } massDistances.addAll(massDistancesThisBucket); rtDistances.addAll(rtDistancesThisBucket); //if (rtDistancesThisBucket.get(0) < 10) // System.err.println(massDistancesThisBucket.get(0) + "\t" + masses[0] + "\t" + masses[1] + "\t" + mzs[0] + "\t" + mzs[1] + "\t" + rts[0] + "\t" + rts[1]); } } ApplicationContext.infoMessage("Calculating mixed model distribution for probability inference..."); float minMassD = (float) BasicStatistics.min(massDistances); float maxMassD = (float) BasicStatistics.max(massDistances); float minRTD = (float) BasicStatistics.min(rtDistances); float maxRTD = (float) BasicStatistics.max(rtDistances); //System.err.println("****" + minMassD + ", " + maxMassD + ", " + minRTD + ", " + maxRTD); AmtMatchProbabilityAssigner probabilityAssigner = new AmtMatchProbabilityAssigner( minMassD, maxMassD, minRTD, maxRTD, 0.001f, maxMatchFDRForToleranceBoxCalc); float[] probabilities = probabilityAssigner.calculateProbabilitiesEM(massDistances, rtDistances, initialDistAssumptionTrue, showCharts); float highestMassDiffWithHighProb = Float.MIN_VALUE; float highestRTDiffWithHighProb = Float.MIN_VALUE; for (int i=0; i<probabilities.length; i++) { if (probabilities[i] > maxMatchFDRForToleranceBoxCalc) { highestMassDiffWithHighProb = Math.max(highestMassDiffWithHighProb, Math.abs(massDistances.get(i))); highestRTDiffWithHighProb = Math.max(highestRTDiffWithHighProb, Math.abs(rtDistances.get(i))); } } ApplicationContext.infoMessage("Bounding box with passing features: mass=+-" + highestMassDiffWithHighProb + ", RT=+-" + highestRTDiffWithHighProb); _massBucket = highestMassDiffWithHighProb; _elutionBucket = highestRTDiffWithHighProb; 
ApplicationContext.setMessage("Using mass and elution buckets " + _massBucket + " and " + _elutionBucket); ApplicationContext.infoMessage("Distance dist datapoints: " + massDistances.size()); if (showCharts) { PanelWithScatterPlot pwspDist = new PanelWithScatterPlot(rtDistances, massDistances, "Error distances"); pwspDist.setPointSize(2); pwspDist.addVerticalLine(-highestRTDiffWithHighProb, -highestMassDiffWithHighProb, highestMassDiffWithHighProb); pwspDist.addVerticalLine(highestRTDiffWithHighProb, -highestMassDiffWithHighProb, highestMassDiffWithHighProb); pwspDist.addHorizontalLine(-highestMassDiffWithHighProb, -highestRTDiffWithHighProb, highestRTDiffWithHighProb); pwspDist.addHorizontalLine(highestMassDiffWithHighProb, -highestRTDiffWithHighProb, highestRTDiffWithHighProb); pwspDist.setSeriesColor(0, Color.RED); for (int i=1; i<6; i++) pwspDist.setSeriesColor(i, Color.BLUE); pwspDist.displayInTab(); new PanelWithScatterPlot(massesOfBuckets, massDistances, "mass vs deltaMass").displayInTab(); } } /** * Figure out the filepath for the details file related to an array file * @param arrayFilepath * @return */ public static String calcDetailsFilepath(String arrayFilepath) { int dotPos = arrayFilepath.lastIndexOf('.'); String detailsFileName = null; if (dotPos < 0) detailsFileName = arrayFilepath + ".details.tsv"; else detailsFileName = arrayFilepath.substring(0, dotPos) + ".details" + arrayFilepath.substring(dotPos); return detailsFileName; } /** * Look up the tag for the given feature set. Tags a are supplied as key/value pairs, * where the key is the prefix of the last element of the file path (everything from * the first "." on is stripped off). */ private static String lookupTag(FeatureSet fs, Map<String, String> tags) { if (null == fs || null == fs.getSourceFile()) return null; String filename = fs.getSourceFile().getName(); if (tags.containsKey(filename)) return tags.get(filename); return null; } /** * Group feature sets by tag. 
* Align within group to generate a representative FeatureSet for each. * Then align the representatives. */ public void alignByGroup(Map<String,String> tags, boolean strict) { String detailsFileName = null; PrintWriter out = null; if (_sel == null) _sel = new FeatureSet.FeatureSelector(); try { ApplicationContext.setMessage("Loading"); List<FeatureSet> sourceFeatureSets = new ArrayList<FeatureSet>(); for (int i = 0; i < _sets.size(); i++) { FeatureSet fs; if (_sets.get(i) instanceof File) fs = new FeatureSet((File) _sets.get(i), Color.RED); else if (_sets.get(i) instanceof FeatureSet) fs = (FeatureSet) _sets.get(i); else if (_sets.get(i) instanceof String) fs = new FeatureSet(new File((String) _sets.get(i)), Color.RED); else { ApplicationContext.errorMessage("Couldn't load feature set due to bad object: " + _sets.get(i).toString(), null); return; } sourceFeatureSets.add(fs); } // ???? At this point, if there is only one tag, we could revert to regular alignment // Filter all source feature sets and add each to the appropriate tagged list ApplicationContext.setMessage("Filtering"); HashMap<String, List<FeatureSet>> taggedFeatureSets = new HashMap<String, List<FeatureSet>>(); ArrayList<String> tagList = new ArrayList<String>(); // Try to keep order of big alignment consistent for (int i = 0; i < sourceFeatureSets.size(); i++) { FeatureSet fs = ((FeatureSet) sourceFeatureSets.get(i)).filter(_sel); fs.setTag(lookupTag(fs, tags)); if (!taggedFeatureSets.containsKey(fs.getTag())) taggedFeatureSets.put(fs.getTag(), new ArrayList<FeatureSet>()); taggedFeatureSets.get(fs.getTag()).add(fs); if (!tagList.contains(fs.getTag())) tagList.add(fs.getTag()); } // Loop through the tags and align all associated FeatureSets ArrayList<FeatureSet> filteredFeatureSets = new ArrayList<FeatureSet>(); for (String tag : tagList) { List<FeatureSet> featureSets = taggedFeatureSets.get(tag); // If only one FeatureSet in this group just add it to the list for the next round if 
(featureSets.size() == 1) { filteredFeatureSets.add(featureSets.get(0)); } else { //Align ApplicationContext.setMessage("Aligning group " + tag); _aligner.setFeaturePairSelector(_featurePairSelector); featureSets = _aligner.alignFeatureSets(featureSets, false); //Deconvolve if (_shouldDeconvolute) { for (int i = 0; i < featureSets.size(); i++) { featureSets.set(i, featureSets.get(i).deconvolute( _deconvoluteScanWindow, _deconvoluteMassWindow, true)); } } FeatureGrouper grouper = new FeatureGrouper(); grouper.setGroupByMass(true); grouper.setConflictResolver(_conflictResolver); for (FeatureSet fs : featureSets) { grouper.addSet(fs); } ApplicationContext.setMessage("Creating array " + tag); grouper.split2D(_massBucket, _elutionBucket); // If strict, a feature must match across all runs to be included. // Otherwise ~75% of the runs is good enough int minAligned = strict ? featureSets.size() :(int) (featureSets.size() * 3.0/4.0 + 0.5); for (FeatureSet fs : grouper.filterByGroupedAlignment(minAligned)) filteredFeatureSets.add(fs); } } // Now create a new BucketedPeptideArray to align the filtered feature sets. // Note that we've already applied any other filters during the within group // alignment, so there should be no need for a selector here. 
BucketedPeptideArray arr = new BucketedPeptideArray(filteredFeatureSets); arr.setElutionBucket(_elutionBucket); arr.setMassBucket(_massBucket); arr.setOutFileName(_outFileName); arr.setAlign(true); arr.setNormalize(_normalize); arr.setConflictResolver(_conflictResolver); arr.run(); } catch (Exception x) { ApplicationContext.errorMessage("Error building peptide array", x); } } private void writeHeader(PrintWriter out) { String revision = (String) ApplicationContext.getProperty("REVISION"); out.println("# Revision: " + revision); out.print("# Files:"); for (int i = 0; i < _sets.size(); i++) out.print(" " + (i + 1) + ": " + getSetName(_sets.get(i))); out.println(); out.print("# Params: " + _sel.toString()); out.println(" --massWindow=" + _massBucket + " --scanWindow=" + _elutionBucket + (_normalize ? " --normalize" : "")); // ???? If normalize is true, it would be good to write the individual scales here } private String getSetName(Object set) { if (set instanceof File) return ((File) set).getName(); else if (set instanceof FeatureSet) return ((FeatureSet) set).getSourceFile() != null ? 
((FeatureSet) set).getSourceFile().getName() : set.toString(); else return set.toString(); } public java.util.List<?> getFiles() { return _sets; } public void setFiles(java.util.List<?> files) { this._sets = files; } public FeatureSet.FeatureSelector get_sel() { return _sel; } public void set_sel(FeatureSet.FeatureSelector _sel) { this._sel = _sel; } public boolean isAlign() { return _align; } public void setAlign(boolean align) { this._align = align; } public boolean getNormalize() { return _normalize; } public void setNormalize(boolean normalize) { this._normalize = normalize; } public int getDeconvoluteScanWindow() { return _deconvoluteScanWindow; } public void setDeconvoluteScanWindow(int deconvoluteScanWindow) { this._deconvoluteScanWindow = deconvoluteScanWindow; } public double getDeconvoluteMassWindow() { return _deconvoluteMassWindow; } public void setDeconvoluteMassWindow(double deconvoluteMassWindow) { this._deconvoluteMassWindow = deconvoluteMassWindow; } public double getMassBucket() { return _massBucket; } public void setMassBucket(double massBucket) { this._massBucket = massBucket; } public double getElutionBucket() { return _elutionBucket; } public void setElutionBucket(double scanBucket) { this._elutionBucket = scanBucket; } public String getOutFileName() { return _outFileName; } public void setOutFileName(String outFileName) { this._outFileName = outFileName; } public double[] getElutionBuckets() { return _elutionBuckets; } public void setScanBuckets(double[] _scanBuckets) { this._elutionBuckets = _scanBuckets; } public double[] getMassBuckets() { return _massBuckets; } public void setMassBuckets(double[] _massBuckets) { this._massBuckets = _massBuckets; } public int getConflictResolver() { return _conflictResolver; } public void setConflictResolver(int conflictResolver) { this._conflictResolver = conflictResolver; } public FeatureGrouper getFeatureGrouper() { return _featureGrouper; } public void setFeatureGrouper(FeatureGrouper _featureGrouper) 
{ this._featureGrouper = _featureGrouper; } public Aligner.FeaturePairSelector getFeaturePairSelector() { return _featurePairSelector; } public void setFeaturePairSelector(Aligner.FeaturePairSelector featurePairSelector) { this._featurePairSelector = featurePairSelector; } public Aligner getAligner() { return _aligner; } public void setAligner(Aligner _aligner) { this._aligner = _aligner; } public boolean isShouldDeconvolute() { return _shouldDeconvolute; } public void setShouldDeconvolute(boolean _shouldDeconvolute) { this._shouldDeconvolute = _shouldDeconvolute; } public void setElutionMode(int _elutionMode) { _featureGrouper.setElutionMode(_elutionMode); } public int getOptimizationMode() { return optimizationMode; } public void setOptimizationMode(int optimizationMode) { this.optimizationMode = optimizationMode; } /** * Also sets the wide mass tolerance for EM dist optimization to a default. So if you're overriding, call this, * then call setOptimizationMassWideTolerance * @param massType */ public void setMassType(int massType) { _featureGrouper.setMassType(massType); switch (massType) { case FeatureClusterer.DELTA_MASS_TYPE_ABSOLUTE: optimizationMassWideTolerance = DEFAULT_MASS_WIDE_TOLERANCE_DA; _massBuckets = DEFAULT_MASS_BUCKETS_DA; break; case FeatureClusterer.DELTA_MASS_TYPE_PPM: optimizationMassWideTolerance = DEFAULT_MASS_WIDE_TOLERANCE_PPM; _massBuckets = DEFAULT_MASS_BUCKETS_PPM; break; } } public float getOptimizationRTWideTolerance() { return optimizationRTWideTolerance; } public void setOptimizationRTWideTolerance(float optimizationRTWideTolerance) { this.optimizationRTWideTolerance = optimizationRTWideTolerance; } public float getOptimizationMassWideTolerance() { return optimizationMassWideTolerance; } public void setOptimizationMassWideTolerance(float optimizationMassWideTolerance) { this.optimizationMassWideTolerance = optimizationMassWideTolerance; } public int getMaxDatapointsForEMAlgorithm() { return maxDatapointsForEMAlgorithm; } public 
void setMaxDatapointsForEMAlgorithm(int maxDatapointsForEMAlgorithm) { this.maxDatapointsForEMAlgorithm = maxDatapointsForEMAlgorithm; } public float getMaxMatchFDRForToleranceBoxCalc() { return maxMatchFDRForToleranceBoxCalc; } public void setMaxMatchFDRForToleranceBoxCalc(float minMatchProbForToleranceBoxCalc) { this.maxMatchFDRForToleranceBoxCalc = minMatchProbForToleranceBoxCalc; } }