/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.toolbox.proteomics.feature; import org.fhcrc.cpl.toolbox.datastructure.Pair; import org.fhcrc.cpl.toolbox.datastructure.Tree2D; import org.fhcrc.cpl.toolbox.ApplicationContext; import org.fhcrc.cpl.toolbox.statistics.BasicStatistics; import org.fhcrc.cpl.toolbox.normalize.Normalizer; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.FeatureExtraInformationDef; import org.fhcrc.cpl.toolbox.proteomics.Clusterer2D; import org.apache.log4j.Logger; import java.io.PrintWriter; import java.util.*; /** * Uses clustering (Clusterer2D) to align multiple feature sets * * dhmay changing 20100104, for peptide array changes. * Big changes: Clustering may optionally be done separately per charge state */ public class FeatureGrouper { private static Logger _log = Logger.getLogger(FeatureGrouper.class); private List<FeatureSet> _featureSets = new ArrayList<FeatureSet>(); //handles the actual clustering protected FeatureClusterer _featureClusterer; protected Map<Integer, FeatureClusterer> _chargeClustererMap = new HashMap<Integer, FeatureClusterer>(); protected boolean _useMassInsteadOfMz = true; protected boolean _shouldGroupByCharge = false; protected Set<Integer> _allObservedCharges = new HashSet<Integer>(); protected int _conflictResolver = CONFLICT_RESOLVER_SUM; //modes for determining which bucket settings are the best public static final int BUCKET_EVALUATION_MODE_ONE_FROM_EACH = 0; public static final int BUCKET_EVALUATION_MODE_PEPTIDE_AGREEMENT = 1; //These parameters determine the scoring used to decide what the best cluster //diameters are, when factoring in peptide agreement //minimum number of features in a bucket -- one per set -- to consider //the bucket a hit protected double bucketEvaluationPeptideAgreementMinAlignedNumSetsMultiple = 1; //minimum number of matching peptide IDs in the bucket to consider it a hit protected double bucketEvaluationPeptideAgreementMinPeptideIdsNumSetsMultiple = 0.5; //score for a matched bucket protected int bucketEvaluationPeptideAgreementMatchScore = DEFAULT_PEPTIDE_AGREEMENT_MATCH_SCORE; //penalty for a mismatched bucket protected int bucketEvaluationPeptideAgreementMismatchPenalty = DEFAULT_PEPTIDE_AGREEMENT_MISMATCH_PENALTY; public static final int DEFAULT_PEPTIDE_AGREEMENT_MATCH_SCORE=1; public static final int DEFAULT_PEPTIDE_AGREEMENT_MISMATCH_PENALTY=1; //how to resolve conflicts? I.e., features from the same set in the same //cluster public static final int CONFLICT_RESOLVER_SUM=0; public static final int CONFLICT_RESOLVER_BEST=1; public static final int CONFLICT_RESOLVER_MAX=2; //by default, sum the intensities of conflicting features public static final int DEFAULT_CONFLICT_RESOLVER=CONFLICT_RESOLVER_SUM; protected boolean showCharts = false; /** * Initialize the clusterer */ public FeatureGrouper() { _featureClusterer = createFeatureClusterer(); } protected FeatureClusterer createFeatureClusterer() { FeatureClusterer clusterer = new FeatureClusterer(_useMassInsteadOfMz ? FeatureClusterer.MASS_MZ_MODE_MASS : FeatureClusterer.MASS_MZ_MODE_MZ, FeatureClusterer.ELUTION_MODE_TIME); if (_featureClusterer != null) clusterer.setMassType(_featureClusterer.getMassType()); return clusterer; } /** * By default, simply evaluate the bucket settings based on the number of rows with * exactly one feature from each bucket * @param massOrMzBuckets * @param elutionBuckets, * @return */ public Pair<Double, Double> calculateBestBuckets(double[] massOrMzBuckets, double[] elutionBuckets) { return calculateBestBuckets(massOrMzBuckets, elutionBuckets, BUCKET_EVALUATION_MODE_ONE_FROM_EACH, false); } public Pair<Double, Double> calculateBestBuckets(double[] massOrMzBuckets, double[] elutionBuckets, int bucketEvaluationMode, boolean showCharts) { Pair<Double, Double> bestBuckets = null; switch (bucketEvaluationMode) { case BUCKET_EVALUATION_MODE_ONE_FROM_EACH: bestBuckets = _featureClusterer.calculateBestBuckets(massOrMzBuckets, elutionBuckets, showCharts); break; case BUCKET_EVALUATION_MODE_PEPTIDE_AGREEMENT: //evaluate bucket breakdowns based on the number of rows with minAligned //features, one from each represented set, minPeptideIds of which match //to the same peptide ID. Each passing row scores that breakdown matchScore //points, and each peptide mismatch loses it mismatchPenalty points int numSets = _featureClusterer.getClusterableArrays().size(); //round up int bucketEvaluationPeptideAgreementMinPeptideIds = (int) Math.round(bucketEvaluationPeptideAgreementMinPeptideIdsNumSetsMultiple * numSets); //round up int bucketEvaluationPeptideAgreementMinAligned = (int) Math.round(bucketEvaluationPeptideAgreementMinAlignedNumSetsMultiple * numSets); double bestMassOrMzBucket = 0; double bestElutionBucket = 0; int[][] agreeingPeptides = new int[massOrMzBuckets.length][elutionBuckets.length]; int bestNumAgreeingPeptides = -1; //evaluate all combinations of mass and hydrophobicity buckets for (int iMass = 0; iMass < massOrMzBuckets.length; iMass++) { for (int iHydrophobicity = 0; iHydrophobicity < elutionBuckets.length; iHydrophobicity++) { double elutionBucketsize = elutionBuckets[iHydrophobicity]; double massOrMzBucketsize = massOrMzBuckets[iMass]; if (elutionBucketsize <= 0 || massOrMzBucketsize <= 0) continue; split2D(massOrMzBucketsize, elutionBucketsize); int bucketScore = 0; Clusterer2D.BucketSummary[] bucketSummaries = summarize(); for (Clusterer2D.BucketSummary bucketSummary : bucketSummaries) { //see if there's one feature from each represented set in the bucket if (bucketSummary.featureCount >= bucketEvaluationPeptideAgreementMinAligned && bucketSummary.setCount == bucketSummary.featureCount) { String commonPeptide = null; boolean failed = false; int numMatched = 0; for (Clusterer2D.Clusterable clusterable : bucketSummary.getParentList()) { Feature parentFeature = ((FeatureClusterer.FeatureClusterable) clusterable).getParentFeature(); String peptide = MS2ExtraInfoDef.getFirstPeptide(parentFeature); if (peptide == null) continue; if (commonPeptide == null) { commonPeptide = peptide; numMatched++; } else { if (commonPeptide.equals(peptide)) numMatched++; else { failed=true; break; } } } if (failed) bucketScore -= bucketEvaluationPeptideAgreementMismatchPenalty; else if (numMatched >= bucketEvaluationPeptideAgreementMinPeptideIds) { bucketScore += bucketEvaluationPeptideAgreementMatchScore; } } } _log.debug("Evaluating bucket size: mass " + massOrMzBucketsize + ", elution " + elutionBucketsize); _log.debug(" Bucket score: " + bucketScore); agreeingPeptides[iMass][iHydrophobicity] = bucketScore; if (agreeingPeptides[iMass][iHydrophobicity] > bestNumAgreeingPeptides) { bestMassOrMzBucket = massOrMzBucketsize; bestElutionBucket = elutionBucketsize; bestNumAgreeingPeptides = agreeingPeptides[iMass][iHydrophobicity]; } } } ApplicationContext.setMessage("Best number of agreeing peptides: " + bestNumAgreeingPeptides); bestBuckets = new Pair<Double, Double>(bestMassOrMzBucket, bestElutionBucket); break; } return bestBuckets; } /** * Old school entry point; never does normalization */ public void writePeptideArray(PrintWriter writer) { writePeptideArray(writer, false, false); } /** * by default, don't sum intensities * @param writer * @param normalize * @return the array of summaries intensities, normalized if normalize==true */ public float[][] writePeptideArray(PrintWriter writer, boolean normalize) { return writePeptideArray(writer, normalize, false); } public static Feature[] getFeatures(Clusterer2D.BucketSummary bucketSummary) { List<Feature> resultList = new ArrayList<Feature>(); for (Clusterer2D.TreeEntry treeEntry : bucketSummary.entries()) resultList.add(((FeatureClusterer.FeatureClusterable) treeEntry.parent).getParentFeature()); return resultList.toArray(new Feature[0]); } public static List<Feature> getFeaturesFromSet(int setIndex, Clusterer2D.BucketSummary bucketSummary) { List<Feature> resultList = new ArrayList<Feature>(); for (Clusterer2D.Clusterable clusterable : bucketSummary.getParentListForSetIndex(setIndex)) resultList.add(((FeatureClusterer.FeatureClusterable) clusterable).getParentFeature()); return resultList; } /** * Write out a peptide array. Optionally normalize. * If sumIntensities is true, all intensities from a given FeatureSet in the bucket will * be summed. Otherwise, the intensity included will be from the feature that was picked * to represent the bucket * @param writer * @param normalize * @return the array of summaries intensities, normalized if normalize==true */ public float[][] writePeptideArray(PrintWriter writer, boolean normalize, boolean sumIntensities) { //should we be writing out peptides and proteins? Only if any featuresets //have those fields boolean writePeptidesAndProteins = false; //Write peptides and proteins iff any FeatureSet has peptide //and protein info for (int i = 0; i < _featureSets.size() && (!writePeptidesAndProteins); i++) { writePeptidesAndProteins = _featureSets.get(i).hasExtraInformationType(MS2ExtraInfoDef.getSingletonInstance()); } List<String> columns = new ArrayList<String>(); columns.add("id"); if (_shouldGroupByCharge) columns.add("charge"); if (_useMassInsteadOfMz) { columns.add("minMass"); columns.add("maxMass"); } else { columns.add("minMz"); columns.add("maxMz"); } //dhmay changing 20100311. Now the peptide array will have different column names depending on scan or // time mode switch (_featureClusterer.getElutionMode()) { case FeatureClusterer.ELUTION_MODE_TIME: columns.add("minTime"); columns.add("maxTime"); break; default: columns.add("minScan"); columns.add("maxScan"); } columns.add("featureCount"); columns.add("setCount"); for (int i = 0; i < _featureSets.size(); i++) { FeatureSet fs = _featureSets.get(i); String featureSetColumnSuffix = buildFeatureSetColumnSuffix(fs, i+1); columns.add("intensity" + featureSetColumnSuffix); if (writePeptidesAndProteins) { columns.add("peptide" + featureSetColumnSuffix); columns.add("protein" + featureSetColumnSuffix); } } for (int i=0; i<columns.size(); i++) { if (i>0) writer.print("\t"); writer.print(columns.get(i)); } writer.println(); //TODO: probably this should be a parameter that gets passed in. //True, though, it only makes sense if writePeptidesAndProteins is true boolean showMultiplePeptideProteinMatches = writePeptidesAndProteins; List<FeatureClusterer> clusterersToSummarize = new ArrayList<FeatureClusterer>(); List<Integer> chargesOfClusterers = new ArrayList<Integer>(); if (_shouldGroupByCharge) { for (int charge : _chargeClustererMap.keySet()) { FeatureClusterer clusterer = _chargeClustererMap.get(charge); if (clusterer.countAllEntries() == 0) { _log.debug("split2D: skipping clusterer for charge " + charge + ", with 0 entries"); } else { clusterersToSummarize.add(clusterer); chargesOfClusterers.add(charge); } } } else { clusterersToSummarize.add(_featureClusterer); } int numTotalRows = 0; List<Integer> firstRowIndexesEachSummarizer = new ArrayList<Integer>(); List<Clusterer2D.BucketSummary[]> summariesAllClusterers = new ArrayList<Clusterer2D.BucketSummary[]>(); for (int i=0; i<clusterersToSummarize.size(); i++) { FeatureClusterer clusterer = clusterersToSummarize.get(i); firstRowIndexesEachSummarizer.add(numTotalRows); if (_shouldGroupByCharge) ApplicationContext.setMessage("\tSummarizing charge " + chargesOfClusterers.get(i) + "..."); Clusterer2D.BucketSummary[] summaries = clusterer.summarize(); summariesAllClusterers.add(summaries); if (_shouldGroupByCharge) ApplicationContext.setMessage("\t" + summaries.length + " clusters this charge"); numTotalRows += summaries.length; } ApplicationContext.setMessage("Done clustering."); Clusterer2D.BucketSummary[] allSummaries = new Clusterer2D.BucketSummary[numTotalRows]; for (int i=0; i<summariesAllClusterers.size(); i++) { Clusterer2D.BucketSummary[] summaries = summariesAllClusterers.get(i); System.arraycopy(summaries, 0, allSummaries, firstRowIndexesEachSummarizer.get(i), summaries.length); } summariesAllClusterers = null; System.gc(); ApplicationContext.setMessage("Calculating intensities for " + allSummaries.length + " cells...."); //resolve conflicts, determine intensities/features to represent rows //stupid java 1.5 can't handle generic array creation List<Feature>[][] allSummariesFeatures = (List<Feature>[][]) (new List[numTotalRows][_featureSets.size()]); float[][] allSummariesIntensities = new float[numTotalRows][_featureSets.size()]; int arrayRowIndex = -1; FeatureSet[] featureSetsArray = new FeatureSet[_featureSets.size()]; for (int i=0; i<_featureSets.size(); i++) featureSetsArray[i] = _featureSets.get(i); for (Clusterer2D.BucketSummary summary : allSummaries) { arrayRowIndex++; //order features. If the resolver is currently "sum", //order them by "best" anyway; might as well do that so that the //peptides and proteins have a known ordering. //Order is descending order of goodness, whatever goodness is allSummariesFeatures[arrayRowIndex] = orderClusterFeatures(summary, featureSetsArray, _conflictResolver == CONFLICT_RESOLVER_SUM ? CONFLICT_RESOLVER_BEST : _conflictResolver); //Now, assign intensities. for (int j = 0; j < _featureSets.size(); j++) { allSummariesIntensities[arrayRowIndex][j] = 0; List<Feature> thisFeatureList = allSummariesFeatures[arrayRowIndex][j]; if (thisFeatureList != null && !thisFeatureList.isEmpty()) { //sum all intensities if (_conflictResolver == CONFLICT_RESOLVER_SUM) { for (Feature feature : thisFeatureList) { allSummariesIntensities[arrayRowIndex][j] += feature.getIntensity(); } } else { //pick just one intensity. take the intensity of the //first one, because they're in descending order of goodness. allSummariesIntensities[arrayRowIndex][j] = thisFeatureList.get(0).getIntensity(); } } } } ApplicationContext.setMessage("Done calculating intensities."); //throw away return value of normalizeSummaryIntensities if (normalize) { ApplicationContext.setMessage("Beginning normalization..."); normalizeSummaryIntensities(allSummariesIntensities); ApplicationContext.setMessage("Normalization complete."); } int currentCharge = 0; int nextSummarizerFirstRowIndex = 0; int currentSummarizerIndex = -1; if (_shouldGroupByCharge) { currentCharge = chargesOfClusterers.get(0); } ApplicationContext.setMessage("Writing array..."); for (int i = 0; i < numTotalRows; i++) { writer.print(i + 1 + "\t"); if (_shouldGroupByCharge) { if (i == nextSummarizerFirstRowIndex) { currentSummarizerIndex++; currentCharge = chargesOfClusterers.get(currentSummarizerIndex); ApplicationContext.setMessage("\tWriting charge " + currentCharge); if (currentSummarizerIndex < chargesOfClusterers.size()-1) nextSummarizerFirstRowIndex = firstRowIndexesEachSummarizer.get(currentSummarizerIndex+1); } writer.print(currentCharge + "\t"); } writer.println( createArrayRow(allSummaries[i],allSummariesFeatures[i], allSummariesIntensities[i], writePeptidesAndProteins, showMultiplePeptideProteinMatches)); } return allSummariesIntensities; } /** * build the column suffix for this featureset. * * Separating this to make it easy to find and mess with. * @param featureSet * @return */ protected String buildFeatureSetColumnSuffix(FeatureSet featureSet, int featureSetNumber) { StringBuffer resultBuf = new StringBuffer("_"); if (featureSet.getSourceFile() == null) { resultBuf.append(featureSetNumber); return resultBuf.toString(); } //add characters from the filename until you run into the first "." //turn whitespace into _ for (byte charByte : featureSet.getSourceFile().getName().getBytes()) { char thisChar = (char) charByte; if (charByte == '.') break; if (Character.isSpaceChar(thisChar)) thisChar = '_'; resultBuf.append(thisChar); } return resultBuf.toString(); } public float getElution(Feature feature) { switch (_featureClusterer.getElutionMode()) { case FeatureClusterer.ELUTION_MODE_SCAN: return feature.getScan(); default: return feature.getTime(); } } /** * Filter each FeatureSet to include only those features that align across * at least minAligned sets. Intended for use when runs are grouped by * some tag (e.g. "case" or "control") */ public FeatureSet[] filterByGroupedAlignment(int minAligned) { boolean useTree = false; Clusterer2D.BucketSummary[] clusters = _featureClusterer.summarize(); FeatureSet[] featureSets = new FeatureSet[_featureSets.size()]; for (int i = 0; i < _featureSets.size(); i++) { Tree2D tree = null; if (useTree) { tree = new Tree2D(); for (Feature f :_featureSets.get(i).getFeatures()) tree.add(f.getMass(), getElution(f), f); } ArrayList<Feature> filteredFeatures = new ArrayList<Feature>(); for (int j = 0; j < clusters.length; j++) { Clusterer2D.BucketSummary cluster = clusters[j]; if (cluster.setCount >= minAligned) { float minMass = (float) cluster.minDimension1; float maxMass = (float) cluster.maxDimension1; int minElution = (int) cluster.minDimension2; int maxElution = (int) cluster.maxDimension2; if (useTree) { // use Tree2D to pull out all features within the bucket ArrayList<Feature> bucketFeatures = (ArrayList<Feature>) tree.getPoints(minMass, minElution, maxMass, maxElution); for (Feature f : bucketFeatures) filteredFeatures.add(f); } else { // Tree2D is not inclusive... for (Feature f :_featureSets.get(i).getFeatures()) { if (f.getMass() >= minMass && getElution(f) >= minElution && f.getMass() <= maxMass && getElution(f) <= maxElution) filteredFeatures.add(f); } } } } Feature[] features = filteredFeatures.toArray(new Feature[0]); featureSets[i] = (FeatureSet) _featureSets.get(i).clone(); featureSets[i].setFeatures(features); } return featureSets; } /** * Cover method * @param cluster * @param featureSets * @param conflictResolver * @return */ public static List<Feature>[] orderClusterFeatures(Clusterer2D.BucketSummary cluster, List<FeatureSet> featureSets, int conflictResolver) { return orderClusterFeatures(cluster, featureSets.toArray(new FeatureSet[0]), conflictResolver); } /** * Resolve conflicts between features in a cluster by ordering them along * some metric * * Result is a list of Features in descending order of goodness * @param cluster * @param conflictResolver * @return an array of Features, one representing each FeatureSet from * featureSets if at least one feature from that FeatureSet was present. */ public static List<Feature>[] orderClusterFeatures(Clusterer2D.BucketSummary cluster, FeatureSet[] featureSetArray, int conflictResolver) { //stupid Java can't create arrays of generics List<Feature>[] result = (List<Feature>[]) new List[featureSetArray.length]; switch (conflictResolver) { case CONFLICT_RESOLVER_MAX: Comparator<Feature> intensityDescComp = new Feature.IntensityDescComparator(); for (int i = 0; i < featureSetArray.length; i++) { List<Clusterer2D.Clusterable> thisSetClusterables = cluster.getParentListForSetIndex(i); if (!thisSetClusterables.isEmpty()) { ArrayList<Feature> featureList = new ArrayList<Feature>(thisSetClusterables.size()); for (Clusterer2D.Clusterable featureClusterable : thisSetClusterables) { featureList.add(((FeatureClusterer.FeatureClusterable) featureClusterable).getParentFeature()); } Collections.sort(featureList, intensityDescComp); result[i] = featureList; } } break; case CONFLICT_RESOLVER_BEST: for (int i = 0; i < featureSetArray.length; i++) { List<Clusterer2D.Clusterable> thisSetClusterables = cluster.getParentListForSetIndex(i); if (!thisSetClusterables.isEmpty()) { ArrayList<Feature> featureList = new ArrayList<Feature>(thisSetClusterables.size()); for (Clusterer2D.Clusterable featureClusterable : thisSetClusterables) { featureList.add(((FeatureClusterer.FeatureClusterable) featureClusterable).getParentFeature()); } Collections.sort(featureList, Feature.PeaksKLComparatorDesc.getSingletonInstance()); result[i] = featureList; } } break; } return result; } public Clusterer2D.BucketSummary[] summarize() { List<Clusterer2D.BucketSummary> resultList = new ArrayList<Clusterer2D.BucketSummary>(); if (_shouldGroupByCharge) { for (int charge : _chargeClustererMap.keySet()) { FeatureClusterer clusterer = _chargeClustererMap.get(charge); if (clusterer.countAllEntries() == 0) { _log.debug("split2D: skipping clusterer for charge " + charge + ", with 0 entries"); } else { for (Clusterer2D.BucketSummary bucket : _chargeClustererMap.get(charge).summarize()) { resultList.add(bucket); } } } } return resultList.toArray(new Clusterer2D.BucketSummary[resultList.size()]); } /** * Create a single row for a peptide array. * @param summary * @param bucketFeatures * @param bucketIntensities * @return */ protected static String createArrayRow(Clusterer2D.BucketSummary summary, List<Feature>[] bucketFeatures, float[] bucketIntensities, boolean writePeptidesAndProteins, boolean writeMultipleFeatures) { StringBuffer resultBuf = new StringBuffer(summary.toString()); for (int i = 0; i < bucketIntensities.length; i++) { resultBuf.append("\t"); if (bucketIntensities[i] > 0) resultBuf.append(bucketIntensities[i]); List<Feature> featureList = bucketFeatures[i]; //make sure there are actually features to write out. boolean hasFeatures = (featureList != null && !featureList.isEmpty()); //if we're writing peptides and proteins, we've got a lot of work to do //to determine exactly how to do it if (writePeptidesAndProteins) { resultBuf.append("\t"); if (writeMultipleFeatures) { StringBuffer peptideBuf = new StringBuffer(""); StringBuffer proteinBuf = new StringBuffer(""); //If no features to write out, buffers remain empty if (hasFeatures) { List<String> peptideListStrings = new ArrayList<String>(); List<String> proteinListStrings = new ArrayList<String>(); //For each feature, write out its peptide and protein lists, //bracket-separated if necessary. //Do not add duplicate peptide or protein lists for (Feature feature : featureList) { List<String> peptideList = MS2ExtraInfoDef.getPeptideList(feature); List<String> proteinList = MS2ExtraInfoDef.getProteinList(feature); //if this peptide list contains duplicate peptides in //sequence, only keep the first of each series. //maintain the protein list in parallel. if (peptideList != null && !peptideList.isEmpty()) { MS2ExtraInfoDef.collapsePeptideSequentialDuplicates(feature); String peptideListString = MS2ExtraInfoDef.convertStringListToString(peptideList); //even if we have a peptide list with something in it, //only add it if we don't already have an identical //list if (!peptideListStrings.contains(peptideListString)) { peptideListStrings.add(peptideListString); String proteinListString = null; if (proteinList != null && !proteinList.isEmpty()) { MS2ExtraInfoDef.collapseProteinSequentialDuplicates( feature); proteinListString = MS2ExtraInfoDef.convertStringListToString( proteinList); } proteinListStrings.add(proteinListString); } } } //if we've got multiple peptides to write, have to //separate them boolean multiplePeptidesToWrite = (peptideListStrings.size() > 1); for (int j=0; j<peptideListStrings.size(); j++) { if (multiplePeptidesToWrite) peptideBuf.append("["); peptideBuf.append(peptideListStrings.get(j)); if (multiplePeptidesToWrite) peptideBuf.append("]"); //System.err.println("writing eptides and proteins. Peptides: " + peptideListStrings.size() + ", proteins: " + proteinListStrings.size()); if (proteinListStrings.get(j) != null) { if (multiplePeptidesToWrite) proteinBuf.append("["); //System.err.println(" Writing protein " + proteinListStrings.get(j)); proteinBuf.append(proteinListStrings.get(j)); if (multiplePeptidesToWrite) proteinBuf.append("]"); } } } resultBuf.append(peptideBuf); resultBuf.append("\t"); resultBuf.append(proteinBuf); } else { //not writing multiple features, just take the first if (hasFeatures && MS2ExtraInfoDef.getFirstPeptide(featureList.get(0)) != null) resultBuf.append(MS2ExtraInfoDef.getFirstPeptide(featureList.get(0))); resultBuf.append("\t"); if (hasFeatures && MS2ExtraInfoDef.getFirstProtein(featureList.get(0)) != null) resultBuf.append(MS2ExtraInfoDef.getFirstProtein(featureList.get(0))); } } } return resultBuf.toString(); } /** * Write out all feature details (except peptide, protein) * @param writer * If true, writes the original, un-deconvoluted features */ public void writeArrayDetails(PrintWriter writer, boolean writeMS2) { writer.print("id\t"); writer.print("file\t"); FeatureExtraInformationDef[] extraInfos = writeMS2 ? new FeatureExtraInformationDef[] {MS2ExtraInfoDef.getSingletonInstance()} : null; writer.println(Feature.getFeatureHeader(extraInfos)); List<FeatureClusterer> clusterersToSummarize = new ArrayList<FeatureClusterer>(); if (_shouldGroupByCharge) { for (int charge : _chargeClustererMap.keySet()) { clusterersToSummarize.add(_chargeClustererMap.get(charge)); } } else { clusterersToSummarize.add(_featureClusterer); } int curRow = 0; for (FeatureClusterer clusterer : clusterersToSummarize) { if (clusterer.countAllEntries() == 0) { _log.debug("Empty clusterer, skipping"); continue; } Clusterer2D.BucketSummary[] summaries = clusterer.summarize(); for (Clusterer2D.BucketSummary summary : summaries) { curRow++; Clusterer2D.TreeEntry[] entries= summary.entries(); for (Clusterer2D.TreeEntry entry : entries) { String setName = getSetName(entry.iSet); Feature deconvolutedFeature = ((FeatureClusterer.FeatureClusterable) (entry.parent)).getParentFeature(); writeDetailsFeatureRow(writer, curRow, setName, deconvolutedFeature, extraInfos); } writer.flush(); } } } /** * Helper method to write a single row in a details file representing one feature * @param writer * @param rowId * @param setName * @param feature */ protected void writeDetailsFeatureRow(PrintWriter writer, int rowId, String setName, Feature feature, FeatureExtraInformationDef[] extraInfos) { writer.println((rowId) + "\t" + setName + "\t" + feature.toString(extraInfos)); } /** * get the name of a FeatureSet, for the details file * @param i * @return */ private String getSetName(int i) { FeatureSet fs = _featureSets.get(i); if (fs.getSourceFile() != null) return fs.getSourceFile().getName(); else return String.valueOf(i); } /** * Normalize the intensities across all bucket summaries. Note that we don't * attempt to reflect normalization in the feature "details" file. */ private boolean normalizeSummaryIntensities(float[][] allSummariesIntensities) { List<float[]> rows = new ArrayList<float[]>(); for (int i = 0; i < allSummariesIntensities.length; i++) rows.add(allSummariesIntensities[i]); return Normalizer.normalize(rows, showCharts); } //cover methods on FeatureClusterer /** * Split based on a set of bucket sizes. If not splitting by charge, just passes the command along to the clusterer. * If splitting by charge, splits each of the per-charge clusterers * @param dimension1Bucket * @param dimension2Bucket */ public void split2D(double dimension1Bucket, double dimension2Bucket) { if (_shouldGroupByCharge) { for (FeatureClusterer featureClustererThisCharge : _chargeClustererMap.values()) { featureClustererThisCharge.split2D(dimension1Bucket, dimension2Bucket); } } else { _featureClusterer.split2D(dimension1Bucket, dimension2Bucket); } } public int numBuckets() { return _featureClusterer.numBuckets(); } public int rowsWithOneFromEach() { if (_shouldGroupByCharge) { int oneFromEachRowCount = 0; for (int charge : _allObservedCharges) { oneFromEachRowCount += _chargeClustererMap.get(charge).rowsWithOneFromEach(); } return oneFromEachRowCount; } else return _featureClusterer.rowsWithOneFromEach(); } //getters, setters public boolean getGroupByMass() { return _useMassInsteadOfMz; } public void setGroupByMass(boolean groupByMass) { this._useMassInsteadOfMz = groupByMass; if (_shouldGroupByCharge) { for (FeatureClusterer fc : _chargeClustererMap.values()) fc.setMassMzMode(_useMassInsteadOfMz ? FeatureClusterer.MASS_MZ_MODE_MASS : FeatureClusterer.MASS_MZ_MODE_MZ); } else { _featureClusterer.setMassMzMode(_useMassInsteadOfMz ? FeatureClusterer.MASS_MZ_MODE_MASS : FeatureClusterer.MASS_MZ_MODE_MZ); } } /** * Add a featureset. If we're not separating by charge, this is trivial. If we are, then we need to * separate out features by charge, then create a featureset for each charge, then add each of those to * the featureclusterer for that charge * @param fs */ public void addSet(FeatureSet fs) { if (_shouldGroupByCharge) { Map<Integer,List<Feature>> chargeFeatureMap = new HashMap<Integer,List<Feature>>(); Set<Integer> observedCharges = new HashSet<Integer>(); for (Feature feature : fs.getFeatures()) { if (feature.getCharge() == 0) continue; observedCharges.add(feature.charge); List<Feature> featuresThisCharge = chargeFeatureMap.get(feature.getCharge()); if (featuresThisCharge == null) { featuresThisCharge = new ArrayList<Feature>(); chargeFeatureMap.put(feature.getCharge(), featuresThisCharge); _allObservedCharges.add(feature.getCharge()); } featuresThisCharge.add(feature); } for (int charge : observedCharges) { FeatureSet fsThisCharge = (FeatureSet) fs.clone(); if (chargeFeatureMap.containsKey(charge)) fsThisCharge.setFeatures(chargeFeatureMap.get(charge).toArray(new Feature[chargeFeatureMap.get(charge).size()])); else fsThisCharge.setFeatures(new Feature[0]); FeatureClusterer featureClustererThisCharge = _chargeClustererMap.get(charge); if (featureClustererThisCharge == null) { featureClustererThisCharge = createFeatureClusterer(); _chargeClustererMap.put(charge, featureClustererThisCharge); } featureClustererThisCharge.addSet(fsThisCharge); } } else { _featureClusterer.addSet(fs); } _featureSets.add(fs); } public FeatureSet getSet(int i) { return _featureClusterer.getSet(i); } public int getConflictResolver() { return _conflictResolver; } public void setConflictResolver(int conflictResolver) { this._conflictResolver = conflictResolver; } public double getBucketEvaluationPeptideAgreementMinAlignedNumSetsMultiple() { return bucketEvaluationPeptideAgreementMinAlignedNumSetsMultiple; } public void setBucketEvaluationPeptideAgreementMinAlignedNumSetsMultiple(double bucketEvaluationPeptideAgreementMinAlignedNumSetsMultiple) { this.bucketEvaluationPeptideAgreementMinAlignedNumSetsMultiple = bucketEvaluationPeptideAgreementMinAlignedNumSetsMultiple; } public double getBucketEvaluationPeptideAgreementMinPeptideIdsNumSetsMultiple() { return bucketEvaluationPeptideAgreementMinPeptideIdsNumSetsMultiple; } public void setBucketEvaluationPeptideAgreementMinPeptideIdsNumSetsMultiple(double bucketEvaluationPeptideAgreementMinPeptideIdsNumSetsMultiple) { this.bucketEvaluationPeptideAgreementMinPeptideIdsNumSetsMultiple = bucketEvaluationPeptideAgreementMinPeptideIdsNumSetsMultiple; } public int getBucketEvaluationPeptideAgreementMatchScore() { return bucketEvaluationPeptideAgreementMatchScore; } public void setBucketEvaluationPeptideAgreementMatchScore(int bucketEvaluationPeptideAgreementMatchScore) { this.bucketEvaluationPeptideAgreementMatchScore = bucketEvaluationPeptideAgreementMatchScore; } public int getBucketEvaluationPeptideAgreementMismatchPenalty() { return bucketEvaluationPeptideAgreementMismatchPenalty; } public void setBucketEvaluationPeptideAgreementMismatchPenalty(int bucketEvaluationPeptideAgreementMismatchPenalty) { this.bucketEvaluationPeptideAgreementMismatchPenalty = bucketEvaluationPeptideAgreementMismatchPenalty; } public int getMassType() { return _featureClusterer.getMassType(); } public void setMassType(int _massType) { if (_shouldGroupByCharge) { for (FeatureClusterer fc : _chargeClustererMap.values()) fc.setMassType(_massType); } else { _featureClusterer.setMassType(_massType); } } public int getElutionMode() { return _featureClusterer.getElutionMode(); } public void setElutionMode(int _elutionMode) { if (_shouldGroupByCharge) { for (FeatureClusterer fc : _chargeClustererMap.values()) fc.setElutionMode(_elutionMode); } else { _featureClusterer.setElutionMode(_elutionMode); } } public boolean isGroupByCharge() { return _shouldGroupByCharge; } public void setGroupByCharge(boolean _shouldGroupByCharge) { this._shouldGroupByCharge = _shouldGroupByCharge; } public FeatureClusterer getFeatureClusterer() { return _featureClusterer; } public boolean isShowCharts() { return showCharts; } public void setShowCharts(boolean showCharts) { this.showCharts = showCharts; } }