/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.viewer.amt; import java.util.*; import java.util.List; import java.io.IOException; import java.io.File; import java.awt.*; import org.apache.log4j.Logger; import org.fhcrc.cpl.toolbox.proteomics.feature.Feature; import org.fhcrc.cpl.toolbox.proteomics.feature.matching.FeatureSetMatcher; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.AmtExtraInfoDef; import org.fhcrc.cpl.toolbox.*; import org.fhcrc.cpl.toolbox.statistics.RInterface; import org.fhcrc.cpl.toolbox.statistics.BasicStatistics; import org.fhcrc.cpl.toolbox.filehandler.TempFileManager; import org.fhcrc.cpl.toolbox.datastructure.Pair; import org.fhcrc.cpl.toolbox.gui.chart.*; import org.fhcrc.cpl.toolbox.proteomics.ModifiedAminoAcid; /** * This class assigns a probability to every AMT match, based on the distribution of * target and decoy mass and H errors. * * First, all matches (including multiple matches to the same feature) are mined for their mass * and H error values. Those values are fed to an Expectation-Maximization algorithm, implemented * in R, which estimates parameters for a mixed distribution. We assume the true distribution to * be a uniform distribution of false matches mixed with a bivariate normal distribution of true * matches. 
We also feed the EM algorithm an initial estimate of the proportion of matches in the
 * bivariate normal distribution, which is based on a match to a decoy database.
 *
 * The EM algorithm returns distribution parameters, as well as a probability for each match indicating
 * how likely it is that the match is in the bivariate normal distribution of true matches.
 *
 * Lots of charts are provided in order to let the user judge the success of the algorithm in modeling
 * the distribution.
 */
public class AmtMatchProbabilityAssigner
{
    static Logger _log = Logger.getLogger(AmtMatchProbabilityAssigner.class);

    //values for scaffolding printlns (development/debugging aid only; see isInScaffRange)
    protected static float minXScaff = 450f;
    protected static float maxXScaff = 500f;
    protected static float minYScaff = -600f;
    protected static float maxYScaff = -500f;

    //Debugging aid: true if (x,y) falls within the scaffolding window defined above
    protected boolean isInScaffRange(double x, double y)
    {
        return (x>=minXScaff && x<=maxXScaff && y>=minYScaff && y<=maxYScaff);
    }

    //maximum time to wait for R to assign probabilities. Any value here is dangerous.
    public static final int MAX_R_PROB_ASSIGNMENT_MILLIS = 900000;
    protected int maxRProbAssignmentMillis = MAX_R_PROB_ASSIGNMENT_MILLIS;

    //boundaries for the entire match space
    protected float minDeltaMass;
    protected float maxDeltaMass;
    protected float minDeltaElution;
    protected float maxDeltaElution;

    //ranges for the mass space, calculated from the above (set in the constructor)
    protected float totalMassRange;
    protected float totalElutionRange;

    //Use EM estimation for probabilities, rather than nonparametric
    public static final int DEFAULT_MIN_EM_ITERATIONS = 30;
    public static final int DEFAULT_MAX_EM_ITERATIONS = 200;
    public static final float DEFAULT_EM_MAX_DELTA_P_FOR_STABLE = .005f;
    public static final int DEFAULT_EM_MAX_ITERATIONS_STABLE_FOR_CONVERGENCE = 1;
    protected int minEMIterations = DEFAULT_MIN_EM_ITERATIONS;
    protected int maxEMIterations = DEFAULT_MAX_EM_ITERATIONS;

    //minimum match probability to keep in output
    protected float minMatchProbability;
    protected float maxMatchFDR;

    //Kolmogorov-Smirnov p-value below which we warn that the normality assumption may fail
    public static final float KS_CUTOFF_FOR_WARN = 0.005f;

    //Default minimum match probability to keep. Because ProteinProphet uses 0.05 and above.
    public static final float DEFAULT_MIN_MATCH_PROBABILITY = 0.1f;
    //Default max FDR to keep. Keep everything
    public static final float DEFAULT_MAX_MATCH_FDR = 1f;

    //Hard maximum on second-best probability
    protected float maxSecondBestProbability = DEFAULT_MAX_SECONDBEST_PROBABILITY;
    //Minimum difference between second-best probability and best probability
    protected float minSecondBestProbabilityDifference = DEFAULT_MIN_SECONDBEST_PROBABILITY_DIFFERENCE;

    //maximum depth of the quadtree. We have to set a maximum or we run out of
    //precision, even when using doubles.
    public static final int MAX_TREE_DEPTH = 30;

    //Scaling factor for error values. This is purely cosmetic.
    public static final int SCALING_FACTOR = 100;

    //The number of points to find around each point in order to estimate the area.
    //Density will always be estimated using this number of points unless there _aren't_ that
    //many points in the distribution
    public static final int NUM_TARGET_POINTS = 30;
    public static final int NUM_DECOY_POINTS = 30;

    //Hard maximum on second-best probability
    public static final float DEFAULT_MAX_SECONDBEST_PROBABILITY = 0.5f;
    //Minimum difference between second-best probability and best probability
    public static final float DEFAULT_MIN_SECONDBEST_PROBABILITY_DIFFERENCE = 0.1f;

    //these statistics are kept for methods development only, controlled by the keepStatistics flag.
    //For heaven's sake, turn this flag off in the shipped code.
    protected boolean keepStatistics = false;
    protected List<Double> targetAreas = new ArrayList<Double>();
    protected List<Double> decoyAreas = new ArrayList<Double>();
    protected List<Integer> targetWindowFeatureNumbers = new ArrayList<Integer>();
    protected List<Integer> decoyWindowFeatureNumbers = new ArrayList<Integer>();
    protected float initialProportionTrue = 0f;
    protected float meanProbability = 0f;
    protected float expectedTrue = 0f;

    //Kolmogorov-Smirnov test p-values from the EM fit, one per error dimension
    protected float ks_score_x = 0f;
    protected float ks_score_y = 0f;
    //quantile-quantile diagnostics from the EM fit
    protected float quantileCorrX = 0f;
    protected float quantileCorrY = 0f;
    protected float quantileBetaX = 0f;
    protected float quantileBetaY = 0f;

    //fitted bivariate normal parameters and mixture proportion returned by the EM algorithm
    protected float mu_x = 0f;
    protected float mu_y = 0f;
    protected float sigma_x = 0f;
    protected float sigma_y = 0f;
    protected int num_iterations = 0;
    protected boolean converged = false;
    protected float proportion = 0f;

    /**
     * Set the loose matching parameters. Possibly should just specify the target and decoy
     * matching results here, too.
* @param minDeltaMass * @param maxDeltaMass * @param minDeltaElution * @param maxDeltaElution * @param minMatchProbability * @param maxMatchFDR */ public AmtMatchProbabilityAssigner(float minDeltaMass, float maxDeltaMass, float minDeltaElution, float maxDeltaElution, float minMatchProbability, float maxMatchFDR) { this.minDeltaMass = minDeltaMass; this.maxDeltaMass = maxDeltaMass; this.minDeltaElution = minDeltaElution; this.maxDeltaElution = maxDeltaElution; //this relies on maxDeltaMass > 0 and minDeltaMass < 0, etc. totalMassRange = maxDeltaMass + Math.abs(minDeltaMass); totalElutionRange = maxDeltaElution + Math.abs(minDeltaElution); this.minMatchProbability = minMatchProbability; this.maxMatchFDR = maxMatchFDR; } /** * Helper method to analyze a matchingresult and create the error data, both * dimensions, that is fed the probability model. * The data structure that this returns is atrocious, I know. It returns * the list of mass errors, the list of H errors, and a map from pairs of features * in the matchingResult to indices in the lists. 
* @param matchingResult * @return */ Pair<Pair<List<Float>, List<Float>>,Map<Pair<Feature,Feature>, Integer>> createMassAndHErrorLists( FeatureSetMatcher.FeatureMatchingResult matchingResult) { List<Float> massErrorDataList = new ArrayList<Float>(); List<Float> hErrorDataList = new ArrayList<Float>(); Map<Pair<Feature,Feature>, Integer> amtMatchErrorIndexMap = new HashMap<Pair<Feature,Feature>, Integer>(); for (Feature ms1Feature : matchingResult.getMasterSetFeatures()) { for (Feature matchedAmtFeature : matchingResult.get(ms1Feature)) { amtMatchErrorIndexMap.put(new Pair<Feature,Feature>(ms1Feature, matchedAmtFeature), massErrorDataList.size()); //convert to ppm massErrorDataList.add((ms1Feature.getMass() - matchedAmtFeature.getMass()) * (1000000 / ms1Feature.getMass())); hErrorDataList.add((float) (AmtExtraInfoDef.getObservedHydrophobicity(ms1Feature) - (AmtExtraInfoDef.getObservedHydrophobicity(matchedAmtFeature)))); } } Pair<List<Float>, List<Float>> errorLists = new Pair<List<Float>, List<Float>>(massErrorDataList, hErrorDataList); return new Pair<Pair<List<Float>, List<Float>>,Map<Pair<Feature,Feature>, Integer>>( errorLists, amtMatchErrorIndexMap); } /** * Use the initial (loose-tolerance) matching data, to both the target and decoy databases, * to determine a probability for each loose match. Probabilities get assigned directly * to the MS1 features. This method also makes the peptide assignments: one peptide per * MS1 feature, break ties using probability of match. 
     *
     * Probability for each match is
     * (estimated target density - estimated decoy density) / estimated target density
     *
     * @param targetMatchingResult loose-tolerance matches against the target AMT database
     * @param decoyMatchingResult loose-tolerance matches against the decoy AMT database
     * @param showCharts whether to display diagnostic charts
     * @return the MS1 features that received a peptide assignment
     * @throws IOException
     */
    public List<Feature> assignMatchesAndProbabilities(
            FeatureSetMatcher.FeatureMatchingResult targetMatchingResult,
            FeatureSetMatcher.FeatureMatchingResult decoyMatchingResult,
            boolean showCharts)
            throws IOException
    {
        //Create lists of error data and auxiliary data structures for target matches
        Pair<Pair<List<Float>, List<Float>>,Map<Pair<Feature,Feature>, Integer>>
                targetErrorDataListsAndMap = createMassAndHErrorLists(targetMatchingResult);
        Pair<List<Float>, List<Float>> targetErrorDataLists = targetErrorDataListsAndMap.first;
        Map<Pair<Feature,Feature>, Integer> targetAmtMatchErrorIndexMap = targetErrorDataListsAndMap.second;
        List<Float> targetMassErrorList = targetErrorDataLists.first;
        List<Float> targetHErrorList = targetErrorDataLists.second;
        int numTargetPoints = targetMassErrorList.size();

        //Create lists of error data for decoy matches
        Pair<Pair<List<Float>, List<Float>>,Map<Pair<Feature,Feature>, Integer>>
                decoyErrorDataListsAndMap = createMassAndHErrorLists(decoyMatchingResult);
        Pair<List<Float>, List<Float>> decoyErrorDataLists = decoyErrorDataListsAndMap.first;
        List<Float> decoyMassErrorList = decoyErrorDataLists.first;
        List<Float> decoyHErrorList = decoyErrorDataLists.second;
        int numDecoyPoints = decoyMassErrorList.size();

        if (showCharts)
        {
            PanelWithScatterPlot pspDecoy = new PanelWithScatterPlot(decoyHErrorList, decoyMassErrorList, "Decoy data");
            pspDecoy.setPointSize(2);
            pspDecoy.setAxisLabels("deltaNRT (NRT units)", "deltaMass (ppm)");
            pspDecoy.displayInTab();
        }

        _log.debug("Target matches: " + targetMatchingResult.size() + ", data points: " + numTargetPoints);
        _log.debug("\tDecoy matches: " + decoyMatchingResult.size() + ", decoy data points: " + numDecoyPoints);
        ApplicationContext.setMessage("Starting probability assignment...");

        //calculate the probabilities. This can take a while, so track how long.
        Date beforeProbDate = new Date();
        float[] allMatchProbabilities = null;
        float[] allMatchFDRs = null;

        //initial EM estimate of the proportion of true matches: the fraction of target
        //matches in excess of the decoy match count
        initialProportionTrue = (float) (numTargetPoints - numDecoyPoints) / (float) numTargetPoints;
        //fudge in case of no advantage for target. This is to allow decoy matches
        if (initialProportionTrue <= 0)
        {
            ApplicationContext.setMessage("WARNING: Adjusting initial proportion true from " + initialProportionTrue +
                    " to 0.001. This should only be necessary for a decoy match. If you are not performing a decoy " +
                    "match, this message indicates that matching has performed VERY POORLY: you have at least as " +
                    "many decoy matches as target matches.");
            initialProportionTrue = 0.001f;
        }
        ApplicationContext.setMessage("\tinitial proportion true: " + initialProportionTrue);

        ApplicationContext.setMessage("Calculating probabilities with EM (parametric) method...");
        Pair<float[], float[]> matchProbsAndFDRs = calculateProbabilitiesAndFDRsEM(targetMassErrorList,
                targetHErrorList, initialProportionTrue, showCharts);
        allMatchProbabilities = matchProbsAndFDRs.first;
        allMatchFDRs = matchProbsAndFDRs.second;
        //for (int i=0; i<allMatchProbabilities.length; i++) System.err.println("**" + allMatchProbabilities[i] + ", " + allMatchFDRs[i]);
        ApplicationContext.setMessage("Done.");

        if (showCharts)
        {
            PanelWithLineChart pwlcProbFDR = new PanelWithLineChart(allMatchProbabilities, allMatchFDRs, "prob vs FDR");
            pwlcProbFDR.setAxisLabels("probability", "False Discovery Rate");
            pwlcProbFDR.displayInTab();
        }

        Date afterProbDate = new Date();
        long probMillis = afterProbDate.getTime() - beforeProbDate.getTime();
        ApplicationContext.setMessage("Probability assignment took " + ((float) probMillis / 1000.0f) + " seconds");

        //assign probabilities -- and peptides -- to features. If there are multiple matches per feature,
        //assign the highest-probability match
        List<Float> featureMatchProbs = new ArrayList<Float>();
        int numPosProbMatches=0;
        int numPoint95Matches=0;

        //per-feature parallel arrays, used for diagnostic charts and second-best-match filtering
        float[] featureProbabilities = new float[targetMatchingResult.size()];
        float[] featureKLs = new float[targetMatchingResult.size()];
        float[] featureLogIntensities = new float[targetMatchingResult.size()];
        float[] featureSecondBestProbabilities = new float[targetMatchingResult.size()];
        String[] featureSecondBestPeptides = new String[targetMatchingResult.size()];

        List<Feature> matchedMS1Features = new ArrayList<Feature>();
        //NOTE(review): ms1FeatureIndex is NOT advanced for features whose best probability
        //is <= 0 (the "continue" below skips the increment at the bottom of the loop), so
        //those slots in the arrays above get reused by the next feature. The arrays are only
        //consumed by the charting code below -- confirm this reuse is intended.
        int ms1FeatureIndex=0;
        for (Feature ms1Feature : targetMatchingResult.getMasterSetFeatures())
        {
            List<Feature> matchedAmtFeatures = targetMatchingResult.get(ms1Feature);

            Feature bestAmtMatchFeature = null;
            float bestMatchProbThisFeature = -1f;
            float bestMatchFDRThisFeature = 1f;

            //only use the best AMT match. This is incomplete, of course.
            for (Feature amtFeature : matchedAmtFeatures)
            {
                int probabilityIndex = targetAmtMatchErrorIndexMap.get(new Pair<Feature,Feature>(ms1Feature, amtFeature));
                float matchProbability = allMatchProbabilities[probabilityIndex];
                float matchFDR = allMatchFDRs[probabilityIndex];
                String peptide = MS2ExtraInfoDef.getFirstPeptide(amtFeature);
                if (matchProbability > bestMatchProbThisFeature)
                {
                    //the previous best match becomes the second-best
                    featureSecondBestProbabilities[ms1FeatureIndex] = bestMatchProbThisFeature;
                    if (bestAmtMatchFeature != null)
                        featureSecondBestPeptides[ms1FeatureIndex] = MS2ExtraInfoDef.getFirstPeptide(bestAmtMatchFeature);
                    bestMatchProbThisFeature = matchProbability;
                    bestMatchFDRThisFeature = matchFDR;
                    bestAmtMatchFeature = amtFeature;
                }
                else if (matchProbability > featureSecondBestProbabilities[ms1FeatureIndex])
                {
                    featureSecondBestProbabilities[ms1FeatureIndex] = matchProbability;
                    featureSecondBestPeptides[ms1FeatureIndex] = peptide;
                }
            }

            featureProbabilities[ms1FeatureIndex] = bestMatchProbThisFeature;
            featureKLs[ms1FeatureIndex] = ms1Feature.kl;
            featureLogIntensities[ms1FeatureIndex] = (float) Math.log(ms1Feature.intensity);

            //no positive-probability match at all for this feature; skip it entirely
            if (bestMatchProbThisFeature <= 0)
            {
                continue;
            }
            numPosProbMatches++;
            if (bestMatchProbThisFeature > 0.95) numPoint95Matches++;

            //If we've exceeded the minimum match probability, process the match
            if (bestMatchProbThisFeature >= minMatchProbability && bestMatchFDRThisFeature <= maxMatchFDR)
            {
                //Filter out this match if:
                //(1. There is a second-best match with high-enough probability, or
                // 2. There is a second-best match with close-enough probability to the highest)
                //AND the first and second match peptides are different
                if (featureSecondBestProbabilities[ms1FeatureIndex] > 0 &&
                        ((featureSecondBestProbabilities[ms1FeatureIndex] > maxSecondBestProbability ||
                                (bestMatchProbThisFeature - featureSecondBestProbabilities[ms1FeatureIndex]) < minSecondBestProbabilityDifference)) &&
                        !featureSecondBestPeptides[ms1FeatureIndex].equals(MS2ExtraInfoDef.getFirstPeptide(bestAmtMatchFeature)))
                {
                    _log.debug("Filtering peptide " + MS2ExtraInfoDef.getFirstPeptide(bestAmtMatchFeature) +
                            "with probability " + bestMatchProbThisFeature + " due to second-best probability " +
                            featureSecondBestProbabilities[ms1FeatureIndex]);
                }
                else
                {
                    //Everything's OK, record the match
                    MS2ExtraInfoDef.setSinglePeptide(ms1Feature, MS2ExtraInfoDef.getFirstPeptide(bestAmtMatchFeature));
                    List<ModifiedAminoAcid>[] modifiedAAs = MS2ExtraInfoDef.getModifiedAminoAcids(bestAmtMatchFeature);
                    if (modifiedAAs != null)
                    {
                        MS2ExtraInfoDef.setModifiedAminoAcids(ms1Feature, modifiedAAs);
                    }
                    //set probability of match
                    AmtExtraInfoDef.setMatchProbability(ms1Feature, bestMatchProbThisFeature);
                    AmtExtraInfoDef.setMatchFDR(ms1Feature, bestMatchFDRThisFeature);
                    //probability that the ID in the database is correct -- this is stored on the AMT feature,
                    //modeled for now as PeptideProphet. I'm hedging my bets in case somehow that PeptideProphet
                    //value didn't get assigned
                    float idProbability = 1;
                    if (MS2ExtraInfoDef.hasPeptideProphet(bestAmtMatchFeature) &&
                            MS2ExtraInfoDef.getPeptideProphet(bestAmtMatchFeature) > 0)
                        idProbability = (float) MS2ExtraInfoDef.getPeptideProphet(bestAmtMatchFeature);
                    //This represents the overall probability of correct ID:
                    //match probability times ID probability
                    float probabilityToAssign = bestMatchProbThisFeature * idProbability;
                    MS2ExtraInfoDef.setPeptideProphet(ms1Feature, probabilityToAssign);
                    matchedMS1Features.add(ms1Feature);
                }
            }
            featureMatchProbs.add(bestMatchProbThisFeature);
            ms1FeatureIndex++;
        }

        if (showCharts)
        {
            PanelWithScatterPlot pwspIntProb = new PanelWithScatterPlot(featureLogIntensities, featureProbabilities, "logintensity vs prob");
            pwspIntProb.setAxisLabels("intensity", "probability");
            pwspIntProb.displayInTab();

            PanelWithScatterPlot pwspKLProb = new PanelWithScatterPlot(featureKLs, featureProbabilities, "KL vs prob");
            pwspKLProb.setAxisLabels("KL", "probability");
            pwspKLProb.displayInTab();
        }

        meanProbability = (float)BasicStatistics.mean(featureMatchProbs);
        expectedTrue = meanProbability * featureMatchProbs.size();
        ApplicationContext.infoMessage("Actual matches made with prob>0: " + numPosProbMatches +
                ", with prob>.95: " + numPoint95Matches + ", mean probability = " + meanProbability +
                ", expected true = " + expectedTrue);

        return matchedMS1Features;
    }

    /**
     * Calculate EM match probabilities, then derive an FDR for each probability.
     * @param targetMassErrorDataList mass errors (ppm) for all target matches
     * @param targetHErrorDataList hydrophobicity (NRT) errors for all target matches
     * @param proportionTrue initial estimate of the proportion of true matches
     * @param showCharts whether to display diagnostic charts
     * @return a pair of (probabilities, FDRs), parallel to the input lists
     * @throws IOException
     */
    public Pair<float[],float[]> calculateProbabilitiesAndFDRsEM(List<Float> targetMassErrorDataList,
                                                                 List<Float> targetHErrorDataList,
                                                                 float proportionTrue, boolean showCharts)
            throws IOException
    {
        float[] probabilities = calculateProbabilitiesEM(targetMassErrorDataList,
                targetHErrorDataList,proportionTrue,showCharts);
        return new Pair<float[], float[]>(probabilities, calculateFDRsForProbabilities(probabilities));
    }

    /**
     * Given an array of probabilities, calculate FDR by summing up true and false probabilities, starting
     * with the best probability and
working downward * @param probabilities * @return */ public static float[] calculateFDRsForProbabilities(float[] probabilities) { float[] probabilitiesSorted = new float[probabilities.length]; System.arraycopy(probabilities, 0, probabilitiesSorted, 0, probabilities.length); Arrays.sort(probabilitiesSorted); float sumProbBad = 0f; float sumProbGood = 0f; Map<Float, Float> probFDRMap = new HashMap<Float, Float>(probabilities.length); for (int i=probabilitiesSorted.length-1; i>=0; i--) { double prob = probabilitiesSorted[i]; sumProbBad += (1 - prob); sumProbGood += prob; probFDRMap.put(probabilitiesSorted[i], sumProbBad / (sumProbBad + sumProbGood)); } float[] fdrs = new float[probabilities.length]; for (int i=0; i<fdrs.length; i++) { fdrs[i] = probFDRMap.get(probabilities[i]); } return fdrs; } /** * Use the Expectation Maximization algorithm to determine match probabilities by * modeling the true hits as a normal distribution, and the false hits as a uniform * distribution, superimposed in the target distribution. 
* @param targetMassErrorDataList * @param targetHErrorDataList * @param proportionTrue * @param showCharts * @return * @throws IOException */ public float[] calculateProbabilitiesEM(List<Float> targetMassErrorDataList, List<Float> targetHErrorDataList, float proportionTrue, boolean showCharts) throws IOException { int numPoints = targetMassErrorDataList.size(); double[] targetMassErrorData = new double[numPoints]; double[] targetHErrorData = new double[numPoints]; for (int i=0; i<numPoints; i++) { targetHErrorData[i] = targetHErrorDataList.get(i); targetMassErrorData[i] = targetMassErrorDataList.get(i); } Map<String,Object> rScalarVarMap = new HashMap<String,Object>(); Map<String,double[]> rVectorVarMap = new HashMap<String,double[]>(); rVectorVarMap.put("targetx",targetHErrorData); rVectorVarMap.put("targety",targetMassErrorData); rScalarVarMap.put("proportion",(double) proportionTrue); double area = (maxDeltaMass - minDeltaMass) * (maxDeltaElution - minDeltaElution); rScalarVarMap.put("area",area); rScalarVarMap.put("miniterations",(double) minEMIterations); rScalarVarMap.put("maxiterations",(double) maxEMIterations); rScalarVarMap.put("max_deltap_proportion_for_stable",(double) DEFAULT_EM_MAX_DELTA_P_FOR_STABLE); rScalarVarMap.put("max_deltap_proportion",(double) DEFAULT_EM_MAX_DELTA_P_FOR_STABLE + 0.001); rScalarVarMap.put("iters_stable_for_converg",(double) DEFAULT_EM_MAX_ITERATIONS_STABLE_FOR_CONVERGENCE); rScalarVarMap.put("showcharts",showCharts ? 
"TRUE" : "FALSE"); File outChartFile = null; File normTestOutChartFile = null; if (showCharts) { outChartFile = TempFileManager.createTempFile("em_plots.jpg", this); normTestOutChartFile = TempFileManager.createTempFile("em_normtest_plots.jpg", this); rScalarVarMap.put("chart_out_file", "'" + RInterface.generateRFriendlyPath(outChartFile) + "'"); rScalarVarMap.put("normtest_chart_out_file", "'" + RInterface.generateRFriendlyPath(normTestOutChartFile) + "'"); } String calculateProbabilitiesCommand = RInterface.readResourceFile("/org/fhcrc/cpl/viewer/amt/assign_amt_probabilities_em.R"); //this timeout is arbitrary and dangerous. Woo hoo! int timeoutMilliseconds=maxRProbAssignmentMillis; String rResult = RInterface.evaluateRExpression(calculateProbabilitiesCommand, rScalarVarMap, rVectorVarMap, null, null, timeoutMilliseconds); Map<String, String> varResultStringMap = RInterface.extractVariableStringsFromListOutput(rResult); float[] probabilities = new float[numPoints]; int probsIndex = 0; String[] probChunks = varResultStringMap.get("probs").split("\\s"); for (String probChunk : probChunks) { if (!probChunk.contains("[") && probChunk.length() > 0) { probabilities[probsIndex++] = Float.parseFloat(probChunk); } } if (probsIndex != numPoints) throw new IOException("FAILED to read probabilities correctly back from R!"); converged = varResultStringMap.get("converged").contains("TRUE"); String numIterString = varResultStringMap.get("num_iterations"); numIterString = numIterString.substring(numIterString.indexOf("]") + 1).trim(); num_iterations = Integer.parseInt(numIterString); if (converged) { ApplicationContext.setMessage("EM estimation converged after " + num_iterations + " iterations"); } else { ApplicationContext.infoMessage("WARNING!!! 
EM estimation failed to converge after " + num_iterations + " iterations"); } String[] ksChunks = varResultStringMap.get("ksresults").split("\\s"); ks_score_x = Float.parseFloat(ksChunks[2]); ks_score_y = Float.parseFloat(ksChunks[3]); //todo: move parsing of vector results into RInterface String[] distParamValues = varResultStringMap.get("dist_params").split("\\s"); List<Float> paramVals = new ArrayList<Float>(); for (String paramVal : distParamValues) if (paramVal != null && paramVal.length() > 1 && !paramVal.contains("[")) paramVals.add(Float.parseFloat(paramVal)); mu_x = paramVals.get(0); mu_y = paramVals.get(1); sigma_x = paramVals.get(2); sigma_y = paramVals.get(3); proportion = paramVals.get(4); _log.debug("Distribution params: mu_x=" + mu_x + ", mu_y=" + mu_y + ", sigma_x=" + sigma_x + ", sigma_y=" + sigma_y + ", proportion=" + proportion); if (ks_score_x < KS_CUTOFF_FOR_WARN|| ks_score_y < KS_CUTOFF_FOR_WARN) { _log.debug("WARNING!!!! Kolmogorov-Smirnov test indicates that these matching data " + "may not conform to a bivariate normal distribution intermingled with a uniform " + "distribution. If this key assumption fails, match probabilities will not be " + "accurate. Please consider re-running this analysis in non-parametric mode. " + "KS p-values: X=" + ks_score_x + ", Y=" + ks_score_y); } else { ApplicationContext.setMessage("KS normality test passed. 
KS values: x = " + ks_score_x + ", y = " + ks_score_y); } String[] corrChunks = varResultStringMap.get("corresults").split("\\s"); quantileCorrX = Float.parseFloat(corrChunks[2]); quantileCorrY = Float.parseFloat(corrChunks[3]); String[] qBetaChunks = varResultStringMap.get("qbetas").split("\\s"); quantileBetaX = Float.parseFloat(qBetaChunks[2]); quantileBetaY = Float.parseFloat(qBetaChunks[3]); if (showCharts) { try { PanelWithBlindImageChart pwbic = new PanelWithBlindImageChart(outChartFile,"EM Parameters"); pwbic.displayInTab(); } catch (Exception e) { ApplicationContext.errorMessage("Error displaying error cutoff chart images, file: " + outChartFile.getAbsolutePath(),e); } try { PanelWithBlindImageChart pwbic = new PanelWithBlindImageChart(normTestOutChartFile,"EM dist analysis"); pwbic.displayInTab(); } catch (Exception e) { ApplicationContext.infoMessage("Error displaying error cutoff chart images, file: " + normTestOutChartFile.getAbsolutePath() + ", error: " + e.getMessage()); } //perspective plot, with distributions indicated try { double minH = Double.MAX_VALUE; double maxH = Double.MIN_VALUE; double minMass = Double.MAX_VALUE; double maxMass = Double.MIN_VALUE; for (int i=0; i<targetHErrorData.length; i++) { minH = Math.min(minH, targetHErrorData[i]); maxH = Math.max(maxH, targetHErrorData[i]); minMass = Math.min(minMass, targetMassErrorData[i]); maxMass = Math.max(maxMass, targetMassErrorData[i]); } int numBinsEachDim = 25; double rangeH = maxH - minH; double rangeMass = maxMass - minMass; double binSizeH = rangeH / numBinsEachDim; double binSizeMass = rangeMass / numBinsEachDim; _log.debug("Building perspective chart..."); PanelWithRPerspectivePlot persp = new PanelWithRPerspectivePlot(); persp.setMillisToWait(300000); persp.setName("Distribution"); persp.setRotationAngle(30); persp.setChartWidth(1100); persp.setChartHeight(1100); persp.setAxisRVariableNames("HError","MassError","Count"); persp.setForegroundColorString("lightblue"); double unifHeight = 
(targetHErrorData.length * (1 - proportion)) / (numBinsEachDim * numBinsEachDim); _log.debug("Uniform density height: " + unifHeight); int numDistLines = 50; int numDistLinePoints = 125; double expectedNumTrue = numPoints * proportion; double distanceBetweenDistLinePointsH = rangeH / (numDistLinePoints -1); double distanceBetweenDistLinePointsMass = rangeMass / (numDistLinePoints -1); double distanceBetweenDistLinesH = rangeH / (numDistLines -1); double distanceBetweenDistLinesMass = rangeMass / (numDistLines -1); for (int i=0; i<numDistLines; i++) { double[] distLine1x = new double[numDistLinePoints]; double[] distLine1y = new double[numDistLinePoints]; double[] distLine1z = new double[numDistLinePoints]; double[] distLine2x = new double[numDistLinePoints]; double[] distLine2y = new double[numDistLinePoints]; double[] distLine2z = new double[numDistLinePoints]; for (int j=0; j<numDistLinePoints; j++) { distLine1x[j] = minH + (i * distanceBetweenDistLinesH); distLine1y[j] = minMass + (j * distanceBetweenDistLinePointsMass); distLine1z[j] = calculateLocalNormalizedDensity(distLine1x[j], distLine1y[j], expectedNumTrue, binSizeH, binSizeMass) * expectedNumTrue; distLine1z[j] += unifHeight; //System.err.println("xx=" + distLine1x[j] + ", y=" + distLine1y[j] + ", z=" + distLine1z[j]); distLine2x[j] = minH + (j * distanceBetweenDistLinePointsH); distLine2y[j] = minMass + (i * distanceBetweenDistLinesMass); distLine2z[j] = calculateLocalNormalizedDensity(distLine2x[j], distLine2y[j], expectedNumTrue, binSizeH, binSizeMass) * expectedNumTrue; distLine2z[j] += unifHeight; } persp.addLine(distLine1x, distLine1y, distLine1z, "red"); persp.addLine(distLine2x, distLine2y, distLine2z, "red"); } //add lines representing uniform and normal distributions persp.plotPointsSummary(targetHErrorData, targetMassErrorData, binSizeH, binSizeMass); persp.displayInTab(); _log.debug("Done building perspective chart"); } catch (Exception e) { ApplicationContext.infoMessage("Error displaying 
perspective plot: " + e.getMessage()); e.printStackTrace(System.err); } PanelWithScatterPlot psp = new PanelWithScatterPlot(); psp.setPointSize(2); psp.setAxisLabels("deltaNRT (NRT units)", "deltaMass (ppm)"); psp.setName("Error Data with Prob"); int numGroups = 10; for (int j=0; j<numGroups; j++) { float minProbThisGroup = (float)j/(float)numGroups; float maxProbThisGroup = (float)(j+1)/(float)numGroups; int red = (255/numGroups) * j; int blue = 255 - (255/numGroups) * j; Color color = new Color(blue, 10,red); List<Float> thisGroupMass = new ArrayList<Float>(); List<Float> thisGroupH = new ArrayList<Float>(); for (int k=0; k<probabilities.length; k++) { if (probabilities[k] <= maxProbThisGroup && probabilities[k] >= minProbThisGroup) { thisGroupMass.add((float)targetMassErrorData[k]); thisGroupH.add((float)targetHErrorData[k]); } } psp.addData(thisGroupH, thisGroupMass, ""+minProbThisGroup); psp.setSeriesColor(j, color); psp.setAxisLabels("deltaNRT (NRT units)", "deltaMass (ppm)"); psp.setPointSize(2); } psp.displayInTab(); PanelWithHistogram pwh = new PanelWithHistogram(probabilities, "Probabilities"); pwh.displayInTab(); } TempFileManager.deleteTempFiles(this); return probabilities; } /** * Use the normal CDF to calculate the density of a small area of the distribution * @param xpos * @param ypos * @param expectedNumTrue * @param binSizeX * @param binSizeY * @return */ protected double calculateLocalNormalizedDensity(double xpos, double ypos, double expectedNumTrue, double binSizeX, double binSizeY) { double proportionX = Math.abs(BasicStatistics.calcNormalCumDensity(mu_x, sigma_x, xpos) - BasicStatistics.calcNormalCumDensity(mu_x, sigma_x, xpos + binSizeX)); double proportionY = Math.abs(BasicStatistics.calcNormalCumDensity(mu_y, sigma_y, ypos ) - BasicStatistics.calcNormalCumDensity(mu_y, sigma_y, ypos + binSizeY)); return proportionX * proportionY; } public float getInitialProportionTrue() { return initialProportionTrue; } public void 
setInitialProportionTrue(float initialProportionTrue)
    {
        this.initialProportionTrue = initialProportionTrue;
    }

    //Simple accessors for the EM-fit diagnostics and tuning parameters

    public float getExpectedTrue()
    {
        return expectedTrue;
    }

    public float getKsScoreX()
    {
        return ks_score_x;
    }

    public float getKsScoreY()
    {
        return ks_score_y;
    }

    public float getQuantileCorrX()
    {
        return quantileCorrX;
    }

    public float getQuantileCorrY()
    {
        return quantileCorrY;
    }

    public float getQuantileBetaX()
    {
        return quantileBetaX;
    }

    public float getQuantileBetaY()
    {
        return quantileBetaY;
    }

    public int getMaxRProbAssignmentMillis()
    {
        return maxRProbAssignmentMillis;
    }

    public void setMaxRProbAssignmentMillis(int maxRProbAssignmentMillis)
    {
        this.maxRProbAssignmentMillis = maxRProbAssignmentMillis;
    }

    public float getMuX()
    {
        return mu_x;
    }

    public float getMuY()
    {
        return mu_y;
    }

    public float getSigmaX()
    {
        return sigma_x;
    }

    public float getSigmaY()
    {
        return sigma_y;
    }

    public int getNumIterations()
    {
        return num_iterations;
    }

    public float getMaxSecondBestProbability()
    {
        return maxSecondBestProbability;
    }

    public void setMaxSecondBestProbability(float maxSecondBestProbability)
    {
        this.maxSecondBestProbability = maxSecondBestProbability;
    }

    public float getMinSecondBestProbabilityDifference()
    {
        return minSecondBestProbabilityDifference;
    }

    public void setMinSecondBestProbabilityDifference(float minSecondBestProbabilityDifference)
    {
        this.minSecondBestProbabilityDifference = minSecondBestProbabilityDifference;
    }

    public int getMaxEMIterations()
    {
        return maxEMIterations;
    }

    public void setMaxEMIterations(int maxEMIterations)
    {
        this.maxEMIterations = maxEMIterations;
    }

    public int getMinEMIterations()
    {
        return minEMIterations;
    }

    public void setMinEMIterations(int minEMIterations)
    {
        this.minEMIterations = minEMIterations;
    }

    //whether the most recent EM run reported convergence
    public boolean isConverged()
    {
        return converged;
    }
}