/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.viewer.amt; import java.util.*; import java.util.List; import java.io.IOException; import java.io.File; import java.awt.*; import org.apache.log4j.Logger; import org.fhcrc.cpl.toolbox.proteomics.feature.Feature; import org.fhcrc.cpl.toolbox.proteomics.feature.matching.FeatureSetMatcher; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.AmtExtraInfoDef; import org.fhcrc.cpl.toolbox.*; import org.fhcrc.cpl.toolbox.statistics.RInterface; import org.fhcrc.cpl.toolbox.statistics.BasicStatistics; import org.fhcrc.cpl.toolbox.filehandler.TempFileManager; import org.fhcrc.cpl.toolbox.datastructure.Pair; import org.fhcrc.cpl.toolbox.gui.chart.*; import org.fhcrc.cpl.toolbox.proteomics.ModifiedAminoAcid; /** * This class assigns a probability to every AMT match, based on the distribution of * target and decoy mass and H errors. * * First, all matches (including multiple matches to the same feature) are mined for their mass * and H error values. Those values are fed to an Expectation-Maximization algorithm, implemented * in R, which estimates parameters for a mixed distribution. We assume the true distribution to * be a uniform distribution of false matches mixed with a bivariate normal distribution of true * matches. 
We also feed the EM algorithm an initial estimate of the proportion of matches in the
 * bivariate normal distribution, which is based on a match to a decoy database.
 *
 * The EM algorithm returns distribution parameters, as well as a probability for each match indicating
 * how likely it is that the match is in the bivariate normal distribution of true matches.
 *
 * Lots of charts are provided in order to let the user judge the success of the algorithm in modeling
 * the distribution.
 */
public class AmtMatchProbabilityAssigner
{
    static Logger _log = Logger.getLogger(AmtMatchProbabilityAssigner.class);

    //values for scaffolding printlns (development/debugging aid only; see isInScaffRange)
    protected static float minXScaff = 450f;
    protected static float maxXScaff = 500f;
    protected static float minYScaff = -600f;
    protected static float maxYScaff = -500f;

    //Debugging aid: true if (x,y) falls within the scaffolding window defined above
    protected boolean isInScaffRange(double x, double y)
    {
        return (x>=minXScaff && x<=maxXScaff && y>=minYScaff && y<=maxYScaff);
    }

    //maximum time to wait for R to assign probabilities. Any value here is dangerous.
    public static final int MAX_R_PROB_ASSIGNMENT_MILLIS = 900000;
    protected int maxRProbAssignmentMillis = MAX_R_PROB_ASSIGNMENT_MILLIS;

    //boundaries for the entire match space
    protected float minDeltaMass;
    protected float maxDeltaMass;
    protected float minDeltaElution;
    protected float maxDeltaElution;

    //ranges for the mass space, calculated from the above (set in the constructor)
    protected float totalMassRange;
    protected float totalElutionRange;

    //Use EM estimation for probabilities, rather than nonparametric
    public static final int DEFAULT_MIN_EM_ITERATIONS = 30;
    public static final int DEFAULT_MAX_EM_ITERATIONS = 200;
    public static final float DEFAULT_EM_MAX_DELTA_P_FOR_STABLE = .005f;
    public static final int DEFAULT_EM_MAX_ITERATIONS_STABLE_FOR_CONVERGENCE = 1;
    protected int minEMIterations = DEFAULT_MIN_EM_ITERATIONS;
    protected int maxEMIterations = DEFAULT_MAX_EM_ITERATIONS;

    //minimum match probability to keep in output
    protected float minMatchProbability;
    protected float maxMatchFDR;

    //Kolmogorov-Smirnov p-value below which we warn that the normality assumption may fail
    public static final float KS_CUTOFF_FOR_WARN = 0.005f;

    //Default minimum match probability to keep. Because ProteinProphet uses 0.05 and above.
    public static final float DEFAULT_MIN_MATCH_PROBABILITY = 0.1f;
    //Default max FDR to keep. Keep everything
    public static final float DEFAULT_MAX_MATCH_FDR = 1f;

    //Hard maximum on second-best probability
    protected float maxSecondBestProbability = DEFAULT_MAX_SECONDBEST_PROBABILITY;
    //Minimum difference between second-best probability and best probability
    protected float minSecondBestProbabilityDifference = DEFAULT_MIN_SECONDBEST_PROBABILITY_DIFFERENCE;

    //maximum depth of the quadtree. We have to set a maximum or we run out of
    //precision, even when using doubles.
    public static final int MAX_TREE_DEPTH = 30;

    //Scaling factor for error values. This is purely cosmetic.
    public static final int SCALING_FACTOR = 100;

    //The number of points to find around each point in order to estimate the area.
    //Density will always be estimated using this number of points unless there _aren't_ that
    //many points in the distribution
    public static final int NUM_TARGET_POINTS = 30;
    public static final int NUM_DECOY_POINTS = 30;

    //Hard maximum on second-best probability
    public static final float DEFAULT_MAX_SECONDBEST_PROBABILITY = 0.5f;
    //Minimum difference between second-best probability and best probability
    public static final float DEFAULT_MIN_SECONDBEST_PROBABILITY_DIFFERENCE = 0.1f;

    //these statistics are kept for methods development only, controlled by the keepStatistics flag.
    //For heaven's sake, turn this flag off in the shipped code.
    protected boolean keepStatistics = false;
    protected List<Double> targetAreas = new ArrayList<Double>();
    protected List<Double> decoyAreas = new ArrayList<Double>();
    protected List<Integer> targetWindowFeatureNumbers = new ArrayList<Integer>();
    protected List<Integer> decoyWindowFeatureNumbers = new ArrayList<Integer>();
    protected float initialProportionTrue = 0f;
    protected float meanProbability = 0f;
    protected float expectedTrue = 0f;

    //Kolmogorov-Smirnov test p-values from the EM fit, one per error dimension
    protected float ks_score_x = 0f;
    protected float ks_score_y = 0f;
    //quantile-quantile diagnostics from the EM fit
    protected float quantileCorrX = 0f;
    protected float quantileCorrY = 0f;
    protected float quantileBetaX = 0f;
    protected float quantileBetaY = 0f;

    //fitted bivariate normal parameters and mixture proportion returned by the EM algorithm
    protected float mu_x = 0f;
    protected float mu_y = 0f;
    protected float sigma_x = 0f;
    protected float sigma_y = 0f;
    protected int num_iterations = 0;
    protected boolean converged = false;
    protected float proportion = 0f;

    /**
     * Set the loose matching parameters. Possibly should just specify the target and decoy
     * matching results here, too.
* @param minDeltaMass * @param maxDeltaMass * @param minDeltaElution * @param maxDeltaElution * @param minMatchProbability * @param maxMatchFDR */ public AmtMatchProbabilityAssigner(float minDeltaMass, float maxDeltaMass, float minDeltaElution, float maxDeltaElution, float minMatchProbability, float maxMatchFDR) { this.minDeltaMass = minDeltaMass; this.maxDeltaMass = maxDeltaMass; this.minDeltaElution = minDeltaElution; this.maxDeltaElution = maxDeltaElution; //this relies on maxDeltaMass > 0 and minDeltaMass < 0, etc. totalMassRange = maxDeltaMass + Math.abs(minDeltaMass); totalElutionRange = maxDeltaElution + Math.abs(minDeltaElution); this.minMatchProbability = minMatchProbability; this.maxMatchFDR = maxMatchFDR; } /** * Helper method to analyze a matchingresult and create the error data, both * dimensions, that is fed the probability model. * The data structure that this returns is atrocious, I know. It returns * the list of mass errors, the list of H errors, and a map from pairs of features * in the matchingResult to indices in the lists. 
* @param matchingResult * @return */ Pair<Pair<List<Float>, List<Float>>,Map<Pair<Feature,Feature>, Integer>> createMassAndHErrorLists( FeatureSetMatcher.FeatureMatchingResult matchingResult) { List<Float> massErrorDataList = new ArrayList<Float>(); List<Float> hErrorDataList = new ArrayList<Float>(); Map<Pair<Feature,Feature>, Integer> amtMatchErrorIndexMap = new HashMap<Pair<Feature,Feature>, Integer>(); for (Feature ms1Feature : matchingResult.getMasterSetFeatures()) { for (Feature matchedAmtFeature : matchingResult.get(ms1Feature)) { amtMatchErrorIndexMap.put(new Pair<Feature,Feature>(ms1Feature, matchedAmtFeature), massErrorDataList.size()); //convert to ppm massErrorDataList.add((ms1Feature.getMass() - matchedAmtFeature.getMass()) * (1000000 / ms1Feature.getMass())); hErrorDataList.add((float) (AmtExtraInfoDef.getObservedHydrophobicity(ms1Feature) - (AmtExtraInfoDef.getObservedHydrophobicity(matchedAmtFeature)))); } } Pair<List<Float>, List<Float>> errorLists = new Pair<List<Float>, List<Float>>(massErrorDataList, hErrorDataList); return new Pair<Pair<List<Float>, List<Float>>,Map<Pair<Feature,Feature>, Integer>>( errorLists, amtMatchErrorIndexMap); } /** * Use the initial (loose-tolerance) matching data, to both the target and decoy databases, * to determine a probability for each loose match. Probabilities get assigned directly * to the MS1 features. This method also makes the peptide assignments: one peptide per * MS1 feature, break ties using probability of match. 
     *
     * Probability for each match is
     * (estimated target density - estimated decoy density) / estimated target density
     *
     * @param targetMatchingResult loose-tolerance matches against the target AMT database
     * @param decoyMatchingResult loose-tolerance matches against the decoy AMT database
     * @param showCharts whether to display diagnostic charts
     * @return the MS1 features that received a peptide assignment
     * @throws IOException
     */
    public List<Feature> assignMatchesAndProbabilities(
            FeatureSetMatcher.FeatureMatchingResult targetMatchingResult,
            FeatureSetMatcher.FeatureMatchingResult decoyMatchingResult,
            boolean showCharts)
            throws IOException
    {
        //Create lists of error data and auxiliary data structures for target matches
        Pair<Pair<List<Float>, List<Float>>,Map<Pair<Feature,Feature>, Integer>>
                targetErrorDataListsAndMap = createMassAndHErrorLists(targetMatchingResult);
        Pair<List<Float>, List<Float>> targetErrorDataLists = targetErrorDataListsAndMap.first;
        Map<Pair<Feature,Feature>, Integer> targetAmtMatchErrorIndexMap = targetErrorDataListsAndMap.second;
        List<Float> targetMassErrorList = targetErrorDataLists.first;
        List<Float> targetHErrorList = targetErrorDataLists.second;
        int numTargetPoints = targetMassErrorList.size();

        //Create lists of error data for decoy matches
        Pair<Pair<List<Float>, List<Float>>,Map<Pair<Feature,Feature>, Integer>>
                decoyErrorDataListsAndMap = createMassAndHErrorLists(decoyMatchingResult);
        Pair<List<Float>, List<Float>> decoyErrorDataLists = decoyErrorDataListsAndMap.first;
        List<Float> decoyMassErrorList = decoyErrorDataLists.first;
        List<Float> decoyHErrorList = decoyErrorDataLists.second;
        int numDecoyPoints = decoyMassErrorList.size();

        if (showCharts)
        {
            PanelWithScatterPlot pspDecoy = new PanelWithScatterPlot(decoyHErrorList, decoyMassErrorList, "Decoy data");
            pspDecoy.setPointSize(2);
            pspDecoy.setAxisLabels("deltaNRT (NRT units)", "deltaMass (ppm)");
            pspDecoy.displayInTab();
        }

        _log.debug("Target matches: " + targetMatchingResult.size() + ", data points: " + numTargetPoints);
        _log.debug("\tDecoy matches: " + decoyMatchingResult.size() + ", decoy data points: " + numDecoyPoints);
        ApplicationContext.setMessage("Starting probability assignment...");

        //calculate the probabilities. This can take a while, so track how long.
        Date beforeProbDate = new Date();
        float[] allMatchProbabilities = null;
        float[] allMatchFDRs = null;

        //initial EM estimate of the proportion of true matches: the fraction of target
        //matches in excess of the decoy match count
        initialProportionTrue = (float) (numTargetPoints - numDecoyPoints) / (float) numTargetPoints;
        //fudge in case of no advantage for target. This is to allow decoy matches
        if (initialProportionTrue <= 0)
        {
            ApplicationContext.setMessage("WARNING: Adjusting initial proportion true from " + initialProportionTrue +
                    " to 0.001. This should only be necessary for a decoy match. If you are not performing a decoy " +
                    "match, this message indicates that matching has performed VERY POORLY: you have at least as " +
                    "many decoy matches as target matches.");
            initialProportionTrue = 0.001f;
        }
        ApplicationContext.setMessage("\tinitial proportion true: " + initialProportionTrue);

        ApplicationContext.setMessage("Calculating probabilities with EM (parametric) method...");
        Pair<float[], float[]> matchProbsAndFDRs = calculateProbabilitiesAndFDRsEM(targetMassErrorList,
                targetHErrorList, initialProportionTrue, showCharts);
        allMatchProbabilities = matchProbsAndFDRs.first;
        allMatchFDRs = matchProbsAndFDRs.second;
        //for (int i=0; i<allMatchProbabilities.length; i++) System.err.println("**" + allMatchProbabilities[i] + ", " + allMatchFDRs[i]);
        ApplicationContext.setMessage("Done.");

        if (showCharts)
        {
            PanelWithLineChart pwlcProbFDR = new PanelWithLineChart(allMatchProbabilities, allMatchFDRs, "prob vs FDR");
            pwlcProbFDR.setAxisLabels("probability", "False Discovery Rate");
            pwlcProbFDR.displayInTab();
        }

        Date afterProbDate = new Date();
        long probMillis = afterProbDate.getTime() - beforeProbDate.getTime();
        ApplicationContext.setMessage("Probability assignment took " + ((float) probMillis / 1000.0f) + " seconds");

        //assign probabilities -- and peptides -- to features. If there are multiple matches per feature,
        //assign the highest-probability match
        List<Float> featureMatchProbs = new ArrayList<Float>();
        int numPosProbMatches=0;
        int numPoint95Matches=0;

        //per-feature parallel arrays, used for diagnostic charts and second-best-match filtering
        float[] featureProbabilities = new float[targetMatchingResult.size()];
        float[] featureKLs = new float[targetMatchingResult.size()];
        float[] featureLogIntensities = new float[targetMatchingResult.size()];
        float[] featureSecondBestProbabilities = new float[targetMatchingResult.size()];
        String[] featureSecondBestPeptides = new String[targetMatchingResult.size()];

        List<Feature> matchedMS1Features = new ArrayList<Feature>();
        //NOTE(review): ms1FeatureIndex is NOT advanced for features whose best probability
        //is <= 0 (the "continue" below skips the increment at the bottom of the loop), so
        //those slots in the arrays above get reused by the next feature. The arrays are only
        //consumed by the charting code below -- confirm this reuse is intended.
        int ms1FeatureIndex=0;
        for (Feature ms1Feature : targetMatchingResult.getMasterSetFeatures())
        {
            List<Feature> matchedAmtFeatures = targetMatchingResult.get(ms1Feature);

            Feature bestAmtMatchFeature = null;
            float bestMatchProbThisFeature = -1f;
            float bestMatchFDRThisFeature = 1f;

            //only use the best AMT match. This is incomplete, of course.
            for (Feature amtFeature : matchedAmtFeatures)
            {
                int probabilityIndex = targetAmtMatchErrorIndexMap.get(new Pair<Feature,Feature>(ms1Feature, amtFeature));
                float matchProbability = allMatchProbabilities[probabilityIndex];
                float matchFDR = allMatchFDRs[probabilityIndex];
                String peptide = MS2ExtraInfoDef.getFirstPeptide(amtFeature);
                if (matchProbability > bestMatchProbThisFeature)
                {
                    //the previous best match becomes the second-best
                    featureSecondBestProbabilities[ms1FeatureIndex] = bestMatchProbThisFeature;
                    if (bestAmtMatchFeature != null)
                        featureSecondBestPeptides[ms1FeatureIndex] = MS2ExtraInfoDef.getFirstPeptide(bestAmtMatchFeature);
                    bestMatchProbThisFeature = matchProbability;
                    bestMatchFDRThisFeature = matchFDR;
                    bestAmtMatchFeature = amtFeature;
                }
                else if (matchProbability > featureSecondBestProbabilities[ms1FeatureIndex])
                {
                    featureSecondBestProbabilities[ms1FeatureIndex] = matchProbability;
                    featureSecondBestPeptides[ms1FeatureIndex] = peptide;
                }
            }

            featureProbabilities[ms1FeatureIndex] = bestMatchProbThisFeature;
            featureKLs[ms1FeatureIndex] = ms1Feature.kl;
            featureLogIntensities[ms1FeatureIndex] = (float) Math.log(ms1Feature.intensity);

            //no positive-probability match at all for this feature; skip it entirely
            if (bestMatchProbThisFeature <= 0)
            {
                continue;
            }
            numPosProbMatches++;
            if (bestMatchProbThisFeature > 0.95) numPoint95Matches++;

            //If we've exceeded the minimum match probability, process the match
            if (bestMatchProbThisFeature >= minMatchProbability && bestMatchFDRThisFeature <= maxMatchFDR)
            {
                //Filter out this match if:
                //(1. There is a second-best match with high-enough probability, or
                // 2. There is a second-best match with close-enough probability to the highest)
                //AND the first and second match peptides are different
                if (featureSecondBestProbabilities[ms1FeatureIndex] > 0 &&
                        ((featureSecondBestProbabilities[ms1FeatureIndex] > maxSecondBestProbability ||
                                (bestMatchProbThisFeature - featureSecondBestProbabilities[ms1FeatureIndex]) < minSecondBestProbabilityDifference)) &&
                        !featureSecondBestPeptides[ms1FeatureIndex].equals(MS2ExtraInfoDef.getFirstPeptide(bestAmtMatchFeature)))
                {
                    _log.debug("Filtering peptide " + MS2ExtraInfoDef.getFirstPeptide(bestAmtMatchFeature) +
                            "with probability " + bestMatchProbThisFeature + " due to second-best probability " +
                            featureSecondBestProbabilities[ms1FeatureIndex]);
                }
                else
                {
                    //Everything's OK, record the match
                    MS2ExtraInfoDef.setSinglePeptide(ms1Feature, MS2ExtraInfoDef.getFirstPeptide(bestAmtMatchFeature));
                    List<ModifiedAminoAcid>[] modifiedAAs = MS2ExtraInfoDef.getModifiedAminoAcids(bestAmtMatchFeature);
                    if (modifiedAAs != null)
                    {
                        MS2ExtraInfoDef.setModifiedAminoAcids(ms1Feature, modifiedAAs);
                    }
                    //set probability of match
                    AmtExtraInfoDef.setMatchProbability(ms1Feature, bestMatchProbThisFeature);
                    AmtExtraInfoDef.setMatchFDR(ms1Feature, bestMatchFDRThisFeature);
                    //probability that the ID in the database is correct -- this is stored on the AMT feature,
                    //modeled for now as PeptideProphet. I'm hedging my bets in case somehow that PeptideProphet
                    //value didn't get assigned
                    float idProbability = 1;
                    if (MS2ExtraInfoDef.hasPeptideProphet(bestAmtMatchFeature) &&
                            MS2ExtraInfoDef.getPeptideProphet(bestAmtMatchFeature) > 0)
                        idProbability = (float) MS2ExtraInfoDef.getPeptideProphet(bestAmtMatchFeature);
                    //This represents the overall probability of correct ID:
                    //match probability times ID probability
                    float probabilityToAssign = bestMatchProbThisFeature * idProbability;
                    MS2ExtraInfoDef.setPeptideProphet(ms1Feature, probabilityToAssign);
                    matchedMS1Features.add(ms1Feature);
                }
            }
            featureMatchProbs.add(bestMatchProbThisFeature);
            ms1FeatureIndex++;
        }

        if (showCharts)
        {
            PanelWithScatterPlot pwspIntProb = new PanelWithScatterPlot(featureLogIntensities, featureProbabilities, "logintensity vs prob");
            pwspIntProb.setAxisLabels("intensity", "probability");
            pwspIntProb.displayInTab();

            PanelWithScatterPlot pwspKLProb = new PanelWithScatterPlot(featureKLs, featureProbabilities, "KL vs prob");
            pwspKLProb.setAxisLabels("KL", "probability");
            pwspKLProb.displayInTab();
        }

        meanProbability = (float)BasicStatistics.mean(featureMatchProbs);
        expectedTrue = meanProbability * featureMatchProbs.size();
        ApplicationContext.infoMessage("Actual matches made with prob>0: " + numPosProbMatches +
                ", with prob>.95: " + numPoint95Matches + ", mean probability = " + meanProbability +
                ", expected true = " + expectedTrue);

        return matchedMS1Features;
    }

    /**
     * Calculate EM match probabilities, then derive an FDR for each probability.
     * @param targetMassErrorDataList mass errors (ppm) for all target matches
     * @param targetHErrorDataList hydrophobicity (NRT) errors for all target matches
     * @param proportionTrue initial estimate of the proportion of true matches
     * @param showCharts whether to display diagnostic charts
     * @return a pair of (probabilities, FDRs), parallel to the input lists
     * @throws IOException
     */
    public Pair<float[],float[]> calculateProbabilitiesAndFDRsEM(List<Float> targetMassErrorDataList,
                                                                 List<Float> targetHErrorDataList,
                                                                 float proportionTrue, boolean showCharts)
            throws IOException
    {
        float[] probabilities = calculateProbabilitiesEM(targetMassErrorDataList,
                targetHErrorDataList,proportionTrue,showCharts);
        return new Pair<float[], float[]>(probabilities, calculateFDRsForProbabilities(probabilities));
    }

    /**
     * Given an array of probabilities, calculate FDR by summing up true and false probabilities, starting
     * with the best probability and
working downward * @param probabilities * @return */ public static float[] calculateFDRsForProbabilities(float[] probabilities) { float[] probabilitiesSorted = new float[probabilities.length]; System.arraycopy(probabilities, 0, probabilitiesSorted, 0, probabilities.length); Arrays.sort(probabilitiesSorted); float sumProbBad = 0f; float sumProbGood = 0f; Map<Float, Float> probFDRMap = new HashMap<Float, Float>(probabilities.length); for (int i=probabilitiesSorted.length-1; i>=0; i--) { double prob = probabilitiesSorted[i]; sumProbBad += (1 - prob); sumProbGood += prob; probFDRMap.put(probabilitiesSorted[i], sumProbBad / (sumProbBad + sumProbGood)); } float[] fdrs = new float[probabilities.length]; for (int i=0; i<fdrs.length; i++) { fdrs[i] = probFDRMap.get(probabilities[i]); } return fdrs; } /** * Use the Expectation Maximization algorithm to determine match probabilities by * modeling the true hits as a normal distribution, and the false hits as a uniform * distribution, superimposed in the target distribution. 
* @param targetMassErrorDataList * @param targetHErrorDataList * @param proportionTrue * @param showCharts * @return * @throws IOException */ public float[] calculateProbabilitiesEM(List<Float> targetMassErrorDataList, List<Float> targetHErrorDataList, float proportionTrue, boolean showCharts) throws IOException { int numPoints = targetMassErrorDataList.size(); double[] targetMassErrorData = new double[numPoints]; double[] targetHErrorData = new double[numPoints]; for (int i=0; i<numPoints; i++) { targetHErrorData[i] = targetHErrorDataList.get(i); targetMassErrorData[i] = targetMassErrorDataList.get(i); } Map<String,Object> rScalarVarMap = new HashMap<String,Object>(); Map<String,double[]> rVectorVarMap = new HashMap<String,double[]>(); rVectorVarMap.put("targetx",targetHErrorData); rVectorVarMap.put("targety",targetMassErrorData); rScalarVarMap.put("proportion",(double) proportionTrue); double area = (maxDeltaMass - minDeltaMass) * (maxDeltaElution - minDeltaElution); rScalarVarMap.put("area",area); rScalarVarMap.put("miniterations",(double) minEMIterations); rScalarVarMap.put("maxiterations",(double) maxEMIterations); rScalarVarMap.put("max_deltap_proportion_for_stable",(double) DEFAULT_EM_MAX_DELTA_P_FOR_STABLE); rScalarVarMap.put("max_deltap_proportion",(double) DEFAULT_EM_MAX_DELTA_P_FOR_STABLE + 0.001); rScalarVarMap.put("iters_stable_for_converg",(double) DEFAULT_EM_MAX_ITERATIONS_STABLE_FOR_CONVERGENCE); rScalarVarMap.put("showcharts",showCharts ? 
"TRUE" : "FALSE"); File outChartFile = null; File normTestOutChartFile = null; if (showCharts) { outChartFile = TempFileManager.createTempFile("em_plots.jpg", this); normTestOutChartFile = TempFileManager.createTempFile("em_normtest_plots.jpg", this); rScalarVarMap.put("chart_out_file", "'" + RInterface.generateRFriendlyPath(outChartFile) + "'"); rScalarVarMap.put("normtest_chart_out_file", "'" + RInterface.generateRFriendlyPath(normTestOutChartFile) + "'"); } String calculateProbabilitiesCommand = RInterface.readResourceFile("/org/fhcrc/cpl/viewer/amt/assign_amt_probabilities_em.R"); //this timeout is arbitrary and dangerous. Woo hoo! int timeoutMilliseconds=maxRProbAssignmentMillis; String rResult = RInterface.evaluateRExpression(calculateProbabilitiesCommand, rScalarVarMap, rVectorVarMap, null, null, timeoutMilliseconds); Map<String, String> varResultStringMap = RInterface.extractVariableStringsFromListOutput(rResult); float[] probabilities = new float[numPoints]; int probsIndex = 0; String[] probChunks = varResultStringMap.get("probs").split("\\s"); for (String probChunk : probChunks) { if (!probChunk.contains("[") && probChunk.length() > 0) { probabilities[probsIndex++] = Float.parseFloat(probChunk); } } if (probsIndex != numPoints) throw new IOException("FAILED to read probabilities correctly back from R!"); converged = varResultStringMap.get("converged").contains("TRUE"); String numIterString = varResultStringMap.get("num_iterations"); numIterString = numIterString.substring(numIterString.indexOf("]") + 1).trim(); num_iterations = Integer.parseInt(numIterString); if (converged) { ApplicationContext.setMessage("EM estimation converged after " + num_iterations + " iterations"); } else { ApplicationContext.infoMessage("WARNING!!! 
EM estimation failed to converge after " + num_iterations + " iterations"); } String[] ksChunks = varResultStringMap.get("ksresults").split("\\s"); ks_score_x = Float.parseFloat(ksChunks[2]); ks_score_y = Float.parseFloat(ksChunks[3]); //todo: move parsing of vector results into RInterface String[] distParamValues = varResultStringMap.get("dist_params").split("\\s"); List<Float> paramVals = new ArrayList<Float>(); for (String paramVal : distParamValues) if (paramVal != null && paramVal.length() > 1 && !paramVal.contains("[")) paramVals.add(Float.parseFloat(paramVal)); mu_x = paramVals.get(0); mu_y = paramVals.get(1); sigma_x = paramVals.get(2); sigma_y = paramVals.get(3); proportion = paramVals.get(4); _log.debug("Distribution params: mu_x=" + mu_x + ", mu_y=" + mu_y + ", sigma_x=" + sigma_x + ", sigma_y=" + sigma_y + ", proportion=" + proportion); if (ks_score_x < KS_CUTOFF_FOR_WARN|| ks_score_y < KS_CUTOFF_FOR_WARN) { _log.debug("WARNING!!!! Kolmogorov-Smirnov test indicates that these matching data " + "may not conform to a bivariate normal distribution intermingled with a uniform " + "distribution. If this key assumption fails, match probabilities will not be " + "accurate. Please consider re-running this analysis in non-parametric mode. " + "KS p-values: X=" + ks_score_x + ", Y=" + ks_score_y); } else { ApplicationContext.setMessage("KS normality test passed. 
KS values: x = " + ks_score_x + ", y = " + ks_score_y); } String[] corrChunks = varResultStringMap.get("corresults").split("\\s"); quantileCorrX = Float.parseFloat(corrChunks[2]); quantileCorrY = Float.parseFloat(corrChunks[3]); String[] qBetaChunks = varResultStringMap.get("qbetas").split("\\s"); quantileBetaX = Float.parseFloat(qBetaChunks[2]); quantileBetaY = Float.parseFloat(qBetaChunks[3]); if (showCharts) { try { PanelWithBlindImageChart pwbic = new PanelWithBlindImageChart(outChartFile,"EM Parameters"); pwbic.displayInTab(); } catch (Exception e) { ApplicationContext.errorMessage("Error displaying error cutoff chart images, file: " + outChartFile.getAbsolutePath(),e); } try { PanelWithBlindImageChart pwbic = new PanelWithBlindImageChart(normTestOutChartFile,"EM dist analysis"); pwbic.displayInTab(); } catch (Exception e) { ApplicationContext.infoMessage("Error displaying error cutoff chart images, file: " + normTestOutChartFile.getAbsolutePath() + ", error: " + e.getMessage()); } //perspective plot, with distributions indicated try { double minH = Double.MAX_VALUE; double maxH = Double.MIN_VALUE; double minMass = Double.MAX_VALUE; double maxMass = Double.MIN_VALUE; for (int i=0; i<targetHErrorData.length; i++) { minH = Math.min(minH, targetHErrorData[i]); maxH = Math.max(maxH, targetHErrorData[i]); minMass = Math.min(minMass, targetMassErrorData[i]); maxMass = Math.max(maxMass, targetMassErrorData[i]); } int numBinsEachDim = 25; double rangeH = maxH - minH; double rangeMass = maxMass - minMass; double binSizeH = rangeH / numBinsEachDim; double binSizeMass = rangeMass / numBinsEachDim; _log.debug("Building perspective chart..."); PanelWithRPerspectivePlot persp = new PanelWithRPerspectivePlot(); persp.setMillisToWait(300000); persp.setName("Distribution"); persp.setRotationAngle(30); persp.setChartWidth(1100); persp.setChartHeight(1100); persp.setAxisRVariableNames("HError","MassError","Count"); persp.setForegroundColorString("lightblue"); double unifHeight = 
(targetHErrorData.length * (1 - proportion)) / (numBinsEachDim * numBinsEachDim); _log.debug("Uniform density height: " + unifHeight); int numDistLines = 50; int numDistLinePoints = 125; double expectedNumTrue = numPoints * proportion; double distanceBetweenDistLinePointsH = rangeH / (numDistLinePoints -1); double distanceBetweenDistLinePointsMass = rangeMass / (numDistLinePoints -1); double distanceBetweenDistLinesH = rangeH / (numDistLines -1); double distanceBetweenDistLinesMass = rangeMass / (numDistLines -1); for (int i=0; i<numDistLines; i++) { double[] distLine1x = new double[numDistLinePoints]; double[] distLine1y = new double[numDistLinePoints]; double[] distLine1z = new double[numDistLinePoints]; double[] distLine2x = new double[numDistLinePoints]; double[] distLine2y = new double[numDistLinePoints]; double[] distLine2z = new double[numDistLinePoints]; for (int j=0; j<numDistLinePoints; j++) { distLine1x[j] = minH + (i * distanceBetweenDistLinesH); distLine1y[j] = minMass + (j * distanceBetweenDistLinePointsMass); distLine1z[j] = calculateLocalNormalizedDensity(distLine1x[j], distLine1y[j], expectedNumTrue, binSizeH, binSizeMass) * expectedNumTrue; distLine1z[j] += unifHeight; //System.err.println("xx=" + distLine1x[j] + ", y=" + distLine1y[j] + ", z=" + distLine1z[j]); distLine2x[j] = minH + (j * distanceBetweenDistLinePointsH); distLine2y[j] = minMass + (i * distanceBetweenDistLinesMass); distLine2z[j] = calculateLocalNormalizedDensity(distLine2x[j], distLine2y[j], expectedNumTrue, binSizeH, binSizeMass) * expectedNumTrue; distLine2z[j] += unifHeight; } persp.addLine(distLine1x, distLine1y, distLine1z, "red"); persp.addLine(distLine2x, distLine2y, distLine2z, "red"); } //add lines representing uniform and normal distributions persp.plotPointsSummary(targetHErrorData, targetMassErrorData, binSizeH, binSizeMass); persp.displayInTab(); _log.debug("Done building perspective chart"); } catch (Exception e) { ApplicationContext.infoMessage("Error displaying 
perspective plot: " + e.getMessage()); e.printStackTrace(System.err); } PanelWithScatterPlot psp = new PanelWithScatterPlot(); psp.setPointSize(2); psp.setAxisLabels("deltaNRT (NRT units)", "deltaMass (ppm)"); psp.setName("Error Data with Prob"); int numGroups = 10; for (int j=0; j<numGroups; j++) { float minProbThisGroup = (float)j/(float)numGroups; float maxProbThisGroup = (float)(j+1)/(float)numGroups; int red = (255/numGroups) * j; int blue = 255 - (255/numGroups) * j; Color color = new Color(blue, 10,red); List<Float> thisGroupMass = new ArrayList<Float>(); List<Float> thisGroupH = new ArrayList<Float>(); for (int k=0; k<probabilities.length; k++) { if (probabilities[k] <= maxProbThisGroup && probabilities[k] >= minProbThisGroup) { thisGroupMass.add((float)targetMassErrorData[k]); thisGroupH.add((float)targetHErrorData[k]); } } psp.addData(thisGroupH, thisGroupMass, ""+minProbThisGroup); psp.setSeriesColor(j, color); psp.setAxisLabels("deltaNRT (NRT units)", "deltaMass (ppm)"); psp.setPointSize(2); } psp.displayInTab(); PanelWithHistogram pwh = new PanelWithHistogram(probabilities, "Probabilities"); pwh.displayInTab(); } TempFileManager.deleteTempFiles(this); return probabilities; } /** * Use the normal CDF to calculate the density of a small area of the distribution * @param xpos * @param ypos * @param expectedNumTrue * @param binSizeX * @param binSizeY * @return */ protected double calculateLocalNormalizedDensity(double xpos, double ypos, double expectedNumTrue, double binSizeX, double binSizeY) { double proportionX = Math.abs(BasicStatistics.calcNormalCumDensity(mu_x, sigma_x, xpos) - BasicStatistics.calcNormalCumDensity(mu_x, sigma_x, xpos + binSizeX)); double proportionY = Math.abs(BasicStatistics.calcNormalCumDensity(mu_y, sigma_y, ypos ) - BasicStatistics.calcNormalCumDensity(mu_y, sigma_y, ypos + binSizeY)); return proportionX * proportionY; } public float getInitialProportionTrue() { return initialProportionTrue; } public void 
setInitialProportionTrue(float initialProportionTrue)
    {
        this.initialProportionTrue = initialProportionTrue;
    }

    //Simple accessors for the EM-fit diagnostics and tuning parameters

    public float getExpectedTrue()
    {
        return expectedTrue;
    }

    public float getKsScoreX()
    {
        return ks_score_x;
    }

    public float getKsScoreY()
    {
        return ks_score_y;
    }

    public float getQuantileCorrX()
    {
        return quantileCorrX;
    }

    public float getQuantileCorrY()
    {
        return quantileCorrY;
    }

    public float getQuantileBetaX()
    {
        return quantileBetaX;
    }

    public float getQuantileBetaY()
    {
        return quantileBetaY;
    }

    public int getMaxRProbAssignmentMillis()
    {
        return maxRProbAssignmentMillis;
    }

    public void setMaxRProbAssignmentMillis(int maxRProbAssignmentMillis)
    {
        this.maxRProbAssignmentMillis = maxRProbAssignmentMillis;
    }

    public float getMuX()
    {
        return mu_x;
    }

    public float getMuY()
    {
        return mu_y;
    }

    public float getSigmaX()
    {
        return sigma_x;
    }

    public float getSigmaY()
    {
        return sigma_y;
    }

    public int getNumIterations()
    {
        return num_iterations;
    }

    public float getMaxSecondBestProbability()
    {
        return maxSecondBestProbability;
    }

    public void setMaxSecondBestProbability(float maxSecondBestProbability)
    {
        this.maxSecondBestProbability = maxSecondBestProbability;
    }

    public float getMinSecondBestProbabilityDifference()
    {
        return minSecondBestProbabilityDifference;
    }

    public void setMinSecondBestProbabilityDifference(float minSecondBestProbabilityDifference)
    {
        this.minSecondBestProbabilityDifference = minSecondBestProbabilityDifference;
    }

    public int getMaxEMIterations()
    {
        return maxEMIterations;
    }

    public void setMaxEMIterations(int maxEMIterations)
    {
        this.maxEMIterations = maxEMIterations;
    }

    public int getMinEMIterations()
    {
        return minEMIterations;
    }

    public void setMinEMIterations(int minEMIterations)
    {
        this.minEMIterations = minEMIterations;
    }

    //whether the most recent EM run reported convergence
    public boolean isConverged()
    {
        return converged;
    }
}