FeaturePepXmlWriter.java example

Explorer
msInspect-master
/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fhcrc.cpl.toolbox.proteomics.feature;

import java.io.*;
import java.util.*;
import java.math.BigInteger;

import org.apache.log4j.Logger;

import net.systemsbiology.regisWeb.pepXML.*;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.IsotopicLabelExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.MS2Modification;
import org.fhcrc.cpl.toolbox.proteomics.ModifiedAminoAcid;
import org.fhcrc.cpl.toolbox.proteomics.PeptideGenerator;
import org.fhcrc.cpl.toolbox.proteomics.QuantitationUtilities;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet;
import org.fhcrc.cpl.toolbox.proteomics.feature.AnalyzeICAT;
import org.fhcrc.cpl.toolbox.proteomics.filehandler.Q3Handler;
import org.fhcrc.cpl.toolbox.proteomics.filehandler.XPressHandler;
import org.fhcrc.cpl.toolbox.proteomics.filehandler.BasePepXmlWriter;
import org.fhcrc.cpl.toolbox.proteomics.filehandler.RelativeQuantAnalysisResult;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.Attr;

/**
 * A restrictive wrapper for writing PepXml files, using a Feature array to populate the spectrum_queries
 */
public class FeaturePepXmlWriter extends BasePepXmlWriter
{
    static Logger _log = Logger.getLogger(FeaturePepXmlWriter.class);

    //all features and modifications to be written
    protected Feature[] _features = null;


    protected boolean writeIntensitiesAsXpressResults = false;

    protected int firstSpectrumQueryIndex = 1;

    public final int RATIO_MODE_XPRESS = 0;
    public final int RATIO_MODE_Q3 = 1;

    //we have to write out ratios, whose provenance we don't know, as either Q3 or XPress.
    //This variable determines which
    protected int ratioMode = RATIO_MODE_Q3;
    //keep track of whether the FeatureSet thinks it has a ratio declaration, because we should write that
    //out even if no features have it
    protected boolean featureSetHasRatioDeclaration = false;


    protected AnalyzeICAT.IsotopicLabel _isotopicLabel = null;

    //spectrum names that have been written to the file.  Must keep track to preserve uniqueness
    protected Set<String> assignedSpectrumNames = new HashSet<String>();


    /**
     * Create doc structure, populate features and modifications
     * @param features
     * @param modifications
     */
    public FeaturePepXmlWriter(Feature[] features, MS2Modification[] modifications)
    {
        super(modifications);
        setFeatures(features);
    }

    public FeaturePepXmlWriter(FeatureSet featureSet)
    {
        super();
        setModifications(MS2ExtraInfoDef.getFeatureSetModifications(featureSet));
        setSearchDatabase(MS2ExtraInfoDef.getFeatureSetSearchDatabasePath(featureSet));

        String quantitationAlgorithm = IsotopicLabelExtraInfoDef.getFeatureSetAlgorithm(featureSet);
        if (quantitationAlgorithm != null)
        {
            featureSetHasRatioDeclaration = true;
            if (quantitationAlgorithm.equals(QuantitationUtilities.ALGORITHM_Q3))
                ratioMode = RATIO_MODE_Q3;
            else if (quantitationAlgorithm.equals(QuantitationUtilities.ALGORITHM_XPRESS))
                ratioMode = RATIO_MODE_XPRESS;
            else throw new IllegalArgumentException("Unknown quantitation algorithm " + quantitationAlgorithm);
            _log.debug("Quantitation Algorithm: " + quantitationAlgorithm);
        }
        else
            _log.debug("No quantitation algorithm found");

        String baseName = MS2ExtraInfoDef.getFeatureSetBaseName(featureSet);
        if (featureSet != null)
            setBaseName(baseName);

        int maxCleavages =
                MS2ExtraInfoDef.getFeatureSetSearchConstraintMaxIntCleavages(featureSet);
        int minTermini =
                MS2ExtraInfoDef.getFeatureSetSearchConstraintMinTermini(featureSet);

        setSearchConstraints(maxCleavages, minTermini);
                
        setFeatures(featureSet.getFeatures());
    }

    /**
     * setter for features
     * @param features
     */
    public void setFeatures(Feature[] features)
    {
        _features = features;
    }


    public boolean willWriteIntensitiesAsXpressResults()
    {
        return writeIntensitiesAsXpressResults;
    }

    public void setWriteIntensitiesAsXpressResults(boolean writeIntensitiesAsXpressResults)
    {
        this.writeIntensitiesAsXpressResults = writeIntensitiesAsXpressResults;
    }


    /**
     * Write out all features immediately
     * @param pw
     */
    protected void writeSpectrumQueries(PrintWriter pw)
    {
        if (_features == null)
            return;
        for (int i=0; i<_features.length; i++)
        {
            //only write features with peptide identifications
            if (MS2ExtraInfoDef.getFirstPeptide(_features[i]) != null)
                writeFeature(i, pw);
        }
    }

    /**
     * Add a SearchResult representing the passed-in feature to the first run of the file.
     * Write out the XML for that search result
     */
    protected void writeFeature(int featureIndex, PrintWriter pw)
    {
        Feature feature = _features[featureIndex];

        MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery spectrumQuery =
                addSpectrumQuery(feature.getScan(),
                                 feature.getScan(),
                        feature.getCharge(), featureIndex + firstSpectrumQueryIndex);
//        int indexAttribute = featureIndex + firstSpectrumQueryIndex;
        //dhmay changing the contents of the spectrum attribute, 20100419.  We've had reports that
        //the TPP can't open these files unless the format matches [basename].[start_scan].[end_scan].[assumed_charge]

        //dhmay changing again.  Sometimes, the spectrumName can be non-unique.  That is death for ProteinProphet.
        //Strategy: perturb the base by adding a number at the end.  Ensure uniqueness.
        //todo: is it OK to mess with the base in this way? Base won't always be the same.
        String spectrumName = _spectrumBaseString + feature.getScanFirst() + "." + feature.getScanLast() +
                "." + feature.charge;
        int baseSuffix=0;
        while (assignedSpectrumNames.contains(spectrumName))
        {
            String tempBase = _spectrumBaseString;
            if (tempBase.endsWith("."))
                tempBase = tempBase.substring(0, tempBase.length()-1);
            tempBase = tempBase + baseSuffix++ + ".";
            spectrumName = tempBase + feature.getScanFirst() + "." + feature.getScanLast() +
                "." + feature.charge;
        }

        spectrumQuery.setSpectrum(spectrumName);
        assignedSpectrumNames.add(spectrumName);

        float precursorNeutralMass = feature.getMass() + MS2ExtraInfoDef.getDeltaMass(feature);
        spectrumQuery.setPrecursorNeutralMass(precursorNeutralMass);

        //dhmay adding 7/1/08.  retention_time_sec isn't defined in the pepXml spec, but it's used by
        //various folks
        if (feature.getTime() > 0)
        {
            Attr retentionTimeAttr =
                    spectrumQuery.getDomNode().getOwnerDocument().createAttribute("retention_time_sec");
            retentionTimeAttr.setValue("" + feature.getTime());
            spectrumQuery.getDomNode().getAttributes().setNamedItem(retentionTimeAttr);
        }

        MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit searchHit =
                spectrumQuery.addNewSearchResult().addNewSearchHit();
        searchHit.setPeptide(MS2ExtraInfoDef.getFirstPeptide(feature));
        searchHit.setCalcNeutralPepMass(feature.getMass());
        searchHit.setHitRank(1);
        searchHit.setNumMatchedIons(BigInteger.valueOf(1));
        searchHit.setNumTotProteins(1);
        searchHit.setMassdiff(Float.toString(MS2ExtraInfoDef.getDeltaMass(feature)));
        searchHit.setNumMissedCleavages(BigInteger.valueOf(0));
        searchHit.setIsRejected(BigInteger.valueOf(0));

        //num_tol_term is important to ProteinProphet.  From David Shteynburg:
        //"In general, having NTT information is very important for distinguishing correct from
        // incorrect assignments both on the peptide and on the protein levels and this information
        // should be given whenever available."
        //If numTolTerm is unspecified but prevAA and nextAA are specified, ProteinProphet will
        //calculate numTolTerm, assuming trypsin
        searchHit.setNumTolTerm(BigInteger.valueOf(2));
        Character prevAA = MS2ExtraInfoDef.getPrevAminoAcid(feature);
        Character nextAA = MS2ExtraInfoDef.getNextAminoAcid(feature);

        if (prevAA != null)
            searchHit.setPeptidePrevAa("" + prevAA);
        if (nextAA != null)
            searchHit.setPeptideNextAa("" + nextAA);

        if (MS2ExtraInfoDef.hasNumEnzymaticEnds(feature))
            searchHit.setNumTolTerm(BigInteger.valueOf(MS2ExtraInfoDef.getNumEnzymaticEnds(feature)));


        //Properly, we should actually carry the previous and next AAs forward
        //through the Feature
//Should we do this?   The goal would be to get ProteinProphet to specify
//n_enzymatic_termini, but that doesn't seem to work
//        searchHit.setPeptidePrevAa("K");
//        searchHit.setPeptideNextAa("A");


        List<String> proteins =
                MS2ExtraInfoDef.getProteinList(feature);
        if (proteins != null && proteins.size() > 0)
        {
            searchHit.setProtein(proteins.get(0));

            List<Integer> altProteinNTTs = MS2ExtraInfoDef.getAltProteinNTTs(feature);
            for (int i=1; i<proteins.size(); i++)
            {
                MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit.AlternativeProtein altProtein =
                        searchHit.addNewAlternativeProtein();
                altProtein.setProtein(proteins.get(i));

                altProtein.setProteinDescr("(dummy description) Alternative Protein");

                int altProteinNTT = MS2ExtraInfoDef.getNumEnzymaticEnds(feature);
                if (altProteinNTTs != null && altProteinNTTs.size() >= i)
                    altProteinNTT = altProteinNTTs.get(i-1);
                if (MS2ExtraInfoDef.hasNumEnzymaticEnds(feature))
                    altProtein.setNumTolTerm(BigInteger.valueOf(altProteinNTT));
            }
        }

        //Peptide Prophet
        if (MS2ExtraInfoDef.hasPeptideProphet(feature))
        {
            addPeptideProphet(searchHit, 
                    (float) MS2ExtraInfoDef.getPeptideProphet(feature),
                    MS2ExtraInfoDef.getAllNttProb(feature),
                    (float) MS2ExtraInfoDef.getFval(feature));
        }

        //search scores
        Map<String,String> searchScoreMap = MS2ExtraInfoDef.getSearchScores(feature);
        if (searchScoreMap != null)
        {
            for (String searchScoreType : searchScoreMap.keySet())
            {
                addSearchScore(searchHit, searchScoreType, searchScoreMap.get(searchScoreType));
            }
        }

        //intensity
//        if (writeIntensitiesAsXpressResults && feature.getIntensity() > 0)
//        {
//            XPressHandler.XPressResult xpressAnalysisResult = new XPressHandler.XPressResult();
//            MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit.AnalysisResult xmlBeansXpressAnalysisResult =
//                searchHit.addNewAnalysisResult();
//            xmlBeansXpressAnalysisResult.setAnalysis(xpressAnalysisResult.getAnalysisType());
//
//            Element arElement =
//                    xmlBeansXpressAnalysisResult.getDomNode().getOwnerDocument().createElement("xpressratio_result");
//            arElement.setAttribute("ratio", feature.getIntensity() + ":1");
//            arElement.setAttribute("heavy_area", "1");
//            arElement.setAttribute("light_area",""  + feature.getIntensity());
//            xmlBeansXpressAnalysisResult.getDomNode().appendChild(arElement);
//        }
        //NOTE: we don't distinguish between Q3 and XPress results
        if (IsotopicLabelExtraInfoDef.hasRatio(feature))
        {
            String resultElementTagName = null;
            RelativeQuantAnalysisResult myAnalysisResult = null;
            switch (ratioMode)
            {
                case RATIO_MODE_Q3:
                    myAnalysisResult = new Q3Handler.Q3Result();
                    resultElementTagName = "q3ratio_result";
                    break;
                default:
                    myAnalysisResult = new XPressHandler.XPressResult();
                    resultElementTagName = "xpressratio_result";                     
                    break;
            }
            MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit.AnalysisResult xmlBeansAnalysisResult =
                    searchHit.addNewAnalysisResult();
            xmlBeansAnalysisResult.setAnalysis(myAnalysisResult.getAnalysisType());

            Element arElement =
                    xmlBeansAnalysisResult.getDomNode().getOwnerDocument().createElement(resultElementTagName);
            double ratio = IsotopicLabelExtraInfoDef.getRatio(feature);
            arElement.setAttribute("decimal_ratio", "" + ratio);
            arElement.setAttribute("heavy_area", "" + IsotopicLabelExtraInfoDef.getHeavyIntensity(feature));
            arElement.setAttribute("light_area", "" + IsotopicLabelExtraInfoDef.getLightIntensity(feature));

            float lightMass = (float) IsotopicLabelExtraInfoDef.getLightMass(feature);
            float heavyMass = (float) IsotopicLabelExtraInfoDef.getHeavyMass(feature);
            //if we haven't stored light and heavy masses, take our best guess           
            if (lightMass == 0 || heavyMass == 0)
            {
                //singly protonated mass.  This is OK if there's no n-terminal label.
                //TODO: store light and heavy mass, pass them through
                lightMass = precursorNeutralMass +
                        (float) PeptideGenerator.getMasses(true)['h'];
                float massDiff = 0;
                if (IsotopicLabelExtraInfoDef.hasLabel(feature))
                {
                    int labelCount = IsotopicLabelExtraInfoDef.getLabelCount(feature);
                    //HACK... sometimes this value doesn't get stored correctly on features
                    if (labelCount == 0) labelCount++;
                    massDiff =  labelCount * (IsotopicLabelExtraInfoDef.getLabel(feature).getHeavy() -
                                              IsotopicLabelExtraInfoDef.getLabel(feature).getLight());
                }
                else
                {
                    if (_isotopicLabel != null)
                        massDiff = _isotopicLabel.getHeavy() - _isotopicLabel.getLight();
                    arElement.setAttribute("heavy_mass", "" + (lightMass + massDiff));
                }

                heavyMass = lightMass + massDiff;
            }

            arElement.setAttribute("light_mass", "" + lightMass);
            arElement.setAttribute("heavy_mass", "" + heavyMass);
                                                                                          
            arElement.setAttribute("heavy_firstscan", "" + IsotopicLabelExtraInfoDef.getLightFirstScan(feature));
            arElement.setAttribute("heavy_lastscan", "" + IsotopicLabelExtraInfoDef.getLightLastScan(feature));
            arElement.setAttribute("light_firstscan", "" + IsotopicLabelExtraInfoDef.getHeavyFirstScan(feature));
            arElement.setAttribute("light_lastscan", "" + IsotopicLabelExtraInfoDef.getHeavyLastScan(feature));

            switch (ratioMode)
            {
                case RATIO_MODE_Q3:
                    arElement.setAttribute("q2_light_area", "" + IsotopicLabelExtraInfoDef.getLightIntensity(feature));
                    arElement.setAttribute("q2_heavy_area", "" + IsotopicLabelExtraInfoDef.getHeavyIntensity(feature));                    
                    break;
                default:
                    double heavy2LightRatio = 999;
                    if (ratio != 0)
                        heavy2LightRatio = 1.0 / ratio;
                    arElement.setAttribute("heavy2light_ratio", "" + heavy2LightRatio);
                    break;
            }

            xmlBeansAnalysisResult.getDomNode().appendChild(arElement);
        }

        List<ModifiedAminoAcid>[] modifiedAminoAcids = MS2ExtraInfoDef.getModifiedAminoAcids(feature);
        addModificationsToSearchHit(searchHit, modifiedAminoAcids, 
                MS2ExtraInfoDef.getNtermModMass(feature), MS2ExtraInfoDef.getCtermModMass(feature));

        try
        {
            String fragment = _firstRunSummary.getSpectrumQueryArray(0).xmlText(_optionsForPrinting);
//            String[] pieces = fragment.split("<[\\/]*xml-fragment[^>]*>");
//            fragment = pieces[pieces.length-1];
            fragment = fragment.replaceAll("<pep:","<");
            fragment = fragment.replaceAll("</pep:","</");

            //Empty namespace attributes are created, and I don't know of a cleaner way to get rid of them
            //TODO: find a cleaner way to get rid of xmlns attrs on q3ratio_summary, peptideprophet_result
            fragment = fragment.replaceAll("xmlns=\"\"", "");

            fragment = fragment + "\n";

            pw.print(fragment);
            pw.flush();
        }
        catch (Exception e)
        {
            e.printStackTrace(System.err);
        }


        _xmlBeansRunSummaryArray[0].removeSpectrumQuery(0);
    }

    /**
     * Make absolutely sure that we write a quantitative summary declaration at the top of the file, if either:
     * -the incoming FeatureFile told us it had ratios, or
     * -any feature in the file has ratios
     */
    protected void preWrite()
    {
        super.preWrite();


        boolean shouldWriteRatioDeclaration = featureSetHasRatioDeclaration;
        for (Feature feature : _features)
        {
            if (IsotopicLabelExtraInfoDef.hasRatio(feature))
            {
                shouldWriteRatioDeclaration = true;
                _isotopicLabel = IsotopicLabelExtraInfoDef.getLabel(feature);
                break;
            }
        }

        if (shouldWriteRatioDeclaration)
        {
            MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.AnalysisSummary quantAnalysisSummary =
                    _xmlBeansAnalysis.addNewAnalysisSummary();
            switch (ratioMode)
            {
                case RATIO_MODE_Q3:
                    quantAnalysisSummary.setAnalysis("q3");
                    Element ratioSummaryElement =
                            quantAnalysisSummary.getDomNode().getOwnerDocument().createElement("q3ratio_summary");

                    ratioSummaryElement.setAttribute("version", "1.2");
                    ratioSummaryElement.setAttribute("author","Marc Coram");
                    if (_isotopicLabel != null)
                    {
                        ratioSummaryElement.setAttribute("labeled_residues", "" + _isotopicLabel.getResidue());
                        ratioSummaryElement.setAttribute("massdiff", "" +
                                (_isotopicLabel.getHeavy() - _isotopicLabel.getLight()));
                    }

                    //TODO: fix this HACK
                    ratioSummaryElement.setAttribute("massTol", ".25");

                    quantAnalysisSummary.getDomNode().appendChild(ratioSummaryElement);

                    //HACK
                    MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.AnalysisTimestamp q3Timestamp =
                            _xmlBeansRunSummaryArray[0].addNewAnalysisTimestamp();
                    q3Timestamp.setAnalysis("q3");
                    q3Timestamp.setTime(new GregorianCalendar());
                    q3Timestamp.setId(1);

                    break;
                case RATIO_MODE_XPRESS:
                    quantAnalysisSummary.setAnalysis("xpress");
                    XpressratioSummaryDocument summaryDoc = XpressratioSummaryDocument.Factory.newInstance();
                    XpressratioSummaryDocument.XpressratioSummary xpressRatioSummaryOtherDoc =
                            summaryDoc.addNewXpressratioSummary();
                    if (_isotopicLabel != null)
                    {
                        xpressRatioSummaryOtherDoc.setXpressLight((long) _isotopicLabel.getLight());
                        xpressRatioSummaryOtherDoc.setMassdiff("" + (_isotopicLabel.getHeavy() - _isotopicLabel.getLight()));
                        xpressRatioSummaryOtherDoc.setLabeledResidues("" + _isotopicLabel.getResidue());
                    }

                    Node ratioSummaryNode =
                            quantAnalysisSummary.getDomNode().getOwnerDocument().importNode(xpressRatioSummaryOtherDoc.getDomNode(), true);
                    quantAnalysisSummary.getDomNode().appendChild(ratioSummaryNode);
                    break;
            }

        }

        //Stuff specific to trypsin
        //TODO: expose this, make it pluggable?

        MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SampleEnzyme
                xmlBeansSampleEnzyme = _firstRunSummary.addNewSampleEnzyme();
        xmlBeansSampleEnzyme.setName("trypsin");
        MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SampleEnzyme.Specificity
                xmlBeansSpecificity = xmlBeansSampleEnzyme.addNewSpecificity();
        xmlBeansSpecificity.setCut("KR");
        xmlBeansSpecificity.setNoCut("P");
        xmlBeansSpecificity.setSense(MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SampleEnzyme.Specificity.Sense.Enum.forString("C"));
    }

    /**
     * Write out the full document, with all modifications and features, to a file
     *
     * It's too bad I have to override the superclass version here.  The only reason
     * that's necessary is to strip out the "pep:" stuff, because programs like RefreshParser
     * only _pretend_ to parse XML.  In practice they're looking for actual strings like
     * "/msms_run_summary"
     * @throws IOException
     */
//    public void write(File file) throws IOException
//    {
//        preWrite();
//
//        //add a sentinel node that tells us where to split the document to insert features and modifications,
//        //which, conveniently, is the same place for both
//        Node runSummaryNode = _firstRunSummary.getDomNode();
//        Node featureLocationNode = runSummaryNode.getOwnerDocument().createElement("SENTINEL_FEATURE_LOCATION");
//
//        runSummaryNode.appendChild(featureLocationNode);
//        //create and break up the xml that defines the document structure
//        String documentShell = _xmlBeansPepXmlDoc.xmlText(_optionsForPrinting);
//
//        String[] halves = documentShell.split("<SENTINEL_FEATURE_LOCATION[^\\/]*\\/>");
//        if (halves.length != 2)
//        {
//            _log.error("Failed to create document shell for writing");
//            return;
//        }
//
//        _documentPrefix = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + halves[0];
//        _documentPostscript = halves[1];
//
//        //remove our dummy node
//        runSummaryNode.removeChild(featureLocationNode);
//
//
//        PrintWriter pw = new PrintWriter(file);
//        pw.print(_documentPrefix);
//        writeModifications(pw);
//        writeSpectrumQueries(pw);
//        pw.print(_documentPostscript);
//        pw.flush();
//    }


    public int getFirstSpectrumQueryIndex()
    {
        return firstSpectrumQueryIndex;
    }

    public void setFirstSpectrumQueryIndex(int firstSpectrumQueryIndex)
    {
        this.firstSpectrumQueryIndex = firstSpectrumQueryIndex;
    }


    public int getRatioMode()
    {
        return ratioMode;
    }

    public void setRatioMode(int ratioMode)
    {
        this.ratioMode = ratioMode;
    }
}