/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.toolbox.proteomics.filehandler; import java.io.*; import java.math.BigInteger; import java.util.List; import org.apache.log4j.Logger; import net.systemsbiology.regisWeb.pepXML.*; import org.w3c.dom.Node; import org.w3c.dom.Element; import org.w3c.dom.Attr; import org.apache.xmlbeans.XmlOptions; import org.apache.xmlbeans.XmlAnySimpleType; import org.apache.xmlbeans.xml.stream.Attribute; import org.fhcrc.cpl.toolbox.proteomics.MS2Modification; import org.fhcrc.cpl.toolbox.proteomics.ModifiedAminoAcid; /** * An abstract base class for use as a wrapper for writing PepXml files. We take advantage of XmlBeans to build * the structure of the pepxml file, and search_summaries (for modifications), but we stitch the XmlBeans XML * output for features together by hand, writing out to a file as we go, so that we don't have to hold the whole * structure in memory. * * This class is abstract because different implementations are likely to want to use different objects to * populate the individual spectrum_queries that are written out. The only implementing class at the time of * this comment is viewer.feature.FeaturePepXmlWriter, which uses an array of viewer.Feature.Feature to * populate the spectrum_queries. * * I can foresee at least two issues arising when another implementation needs to be created (likely on the * CPAS side): * 1. More file-level data may need to be written out at the top of the file. In that case, please discuss * with me (Damon) and we'll make sure that anything that's useful for everybody gets stuck in this abstract * base class * 2. Something other than MS2Modifications might be desired to populate the modifications. In that case, * this abstract base class can turn into two levels of abstract base classes. :) * * Also I think there may be some weirdness with where the MS2Modifications are declared, in the case that there's * more than one fraction. We'll burn that bridge when we come to it. */ public abstract class BasePepXmlWriter { static Logger _log = Logger.getLogger(BasePepXmlWriter.class); //root node protected MsmsPipelineAnalysisDocument _xmlBeansPepXmlDoc = null; //analysis node, one per doc protected MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis _xmlBeansAnalysis = null; //run summaries protected MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary[] _xmlBeansRunSummaryArray = null; //first run summary protected MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary _firstRunSummary = null; //search summary protected MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary _searchSummary = null; //Strings of xml representing the structure before and after the feature content protected String _documentPrefix = null; protected String _documentPostscript = null; //encapsulates printing options for all fragments protected XmlOptions _optionsForPrinting = null; //Modification objects to populate the AminoAcidModifications and terminal mods protected MS2Modification[] _modifications = null; //String to begin all spectrum attributes with protected String _spectrumBaseString = ""; //source file's base name protected String _baseName = null; public static final String PRECURSOR_MASS_TYPE_MONOISOTOPIC = "monoisotopic"; //precursor mass type. Must be an allowed value protected String _precursorMassType = PRECURSOR_MASS_TYPE_MONOISOTOPIC; public static final String SEARCH_ENGINE_XTANDEM_COMET = "X! Tandem (comet)"; public static final String DEFAULT_SEARCH_ENGINE = SEARCH_ENGINE_XTANDEM_COMET; //search engine protected String _searchEngine = DEFAULT_SEARCH_ENGINE; //dhmay adding two strings that must be pasted into the output file. //These will need to be updated if schema version changes public static final String XSI_URL = "http://www.w3.org/2001/XMLSchema-instance"; public static final String XSD_URL = "http://sashimi.sourceforge.net/schema_revision/pepXML/pepXML_v18.xsd"; /** * Constructor creates the XmlBeans representing the shell of a PepXml document, and * creates the "prefix" and "postscript" strings representing that shell */ public BasePepXmlWriter() { //Construct generic document structure _xmlBeansPepXmlDoc = MsmsPipelineAnalysisDocument.Factory.newInstance(); _xmlBeansAnalysis = _xmlBeansPepXmlDoc.addNewMsmsPipelineAnalysis(); //dhmay adding two attributes required for TPP to parse these files Node xsiAtt = _xmlBeansAnalysis.getDomNode().getOwnerDocument().createAttribute("xmlns:xsi"); xsiAtt.setNodeValue(XSI_URL); _xmlBeansAnalysis.getDomNode().getAttributes().setNamedItem(xsiAtt); Node schemaLocAtt = _xmlBeansAnalysis.getDomNode().getOwnerDocument().createAttribute("xsi:schemaLocation"); schemaLocAtt.setNodeValue(XSD_URL); _xmlBeansAnalysis.getDomNode().getAttributes().setNamedItem(schemaLocAtt); _firstRunSummary = _xmlBeansAnalysis.addNewMsmsRunSummary(); _xmlBeansRunSummaryArray = _xmlBeansAnalysis.getMsmsRunSummaryArray(); //set printing options for xml fragments _optionsForPrinting = new XmlOptions(); _optionsForPrinting.setSaveOuter(); _optionsForPrinting.setSavePrettyPrint(); _optionsForPrinting.setSavePrettyPrintOffset(0); } public MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary getSearchSummary() { if (_searchSummary == null) { MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary[] searchSummaries = _firstRunSummary.getSearchSummaryArray(); if (searchSummaries != null && searchSummaries.length > 0) _searchSummary = searchSummaries[0]; else { _log.debug("Adding search summary"); _searchSummary = _firstRunSummary.addNewSearchSummary(); } _searchSummary.setPrecursorMassType(MassType.Enum.forString(_precursorMassType)); //have to do this manually because the schema isn't aware of any useful search engines Node searchSummAtt = _searchSummary.getDomNode().getOwnerDocument().createAttribute("search_engine"); searchSummAtt.setNodeValue(_searchEngine); _searchSummary.getDomNode().getAttributes().setNamedItem(searchSummAtt); } return _searchSummary; } /** * Create doc structure, populate features and modifications * @param modifications */ public BasePepXmlWriter(MS2Modification[] modifications) { this(); setModifications(modifications); } /** * setter for modifications * @param modifications */ public void setModifications(MS2Modification[] modifications) { _modifications = modifications; } /** * setter for basename * @param baseName */ public void setBaseName(String baseName) { _baseName = baseName; } /** * Add modifications to the xml output * We assume a reasonably small number of modifications. No need to write them each out * individually and then dispose of the java object */ protected void addModificationsToXML() { if (_modifications == null || _modifications.length == 0) return; for (MS2Modification modification : _modifications) { String aminoacid = modification.getAminoAcid(); if (aminoacid.equals("n") || aminoacid.equals("c")) { MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary.TerminalModification xmlTerminalMod = getSearchSummary().addNewTerminalModification(); xmlTerminalMod.setMass(modification.getMass()); xmlTerminalMod.setMassdiff(Float.toString(modification.getMassDiff())); xmlTerminalMod.setVariable(modification.getVariable()? "Y" : "N"); xmlTerminalMod.setTerminus(aminoacid); //TODO: this is a hack. Not sure if it matters xmlTerminalMod.setProteinTerminus("N"); } else { MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary.AminoacidModification xmlModification = getSearchSummary().addNewAminoacidModification(); xmlModification.setAminoacid(modification.getAminoAcid()); xmlModification.setMassdiff(Float.toString(modification.getMassDiff())); //dhmay removing logic that set mass only if it were !=0, after finding Mascot //results with 0-mass modifications declared xmlModification.setMass(modification.getMass()); xmlModification.setVariable(modification.getVariable()? "Y" : "N"); if (modification.getVariable() && modification.getSymbol() != null && modification.getSymbol().length() > 0 && !("'".equals(modification.getSymbol()))) { AaSymbolType.Enum xmlSymbol = AaSymbolType.Enum.forString(modification.getSymbol()); //problems with " as a symbol. xml doesn't like that. No time to fix right now //For now, just not setting it. //TODO: carry forward " as a symbol correctly if (xmlSymbol != null) { xmlModification.setSymbol(xmlSymbol); _log.debug("Adding symbol for mod on var " + modification.getAminoAcid() + ". getSymbol: " + modification.getSymbol() + ", xml Symbol: " + xmlSymbol); } else _log.debug("Not adding symbol for null symbol. Var=" + modification.getAminoAcid() + ", input symbol=" + modification.getSymbol()); } } } } /** * Write out all features immediately * @param pw */ protected abstract void writeSpectrumQueries(PrintWriter pw); /** * Adds an AnalysisResult representing the peptide prophet score * @param searchHit * @param pprophet * @param allNttProb * @param fval 0 is a sentinel value meaning absent */ protected void addPeptideProphet(MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit searchHit, float pprophet, String allNttProb, float fval) { MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit.AnalysisResult analysisResult = searchHit.addNewAnalysisResult(); analysisResult.setAnalysis("peptideprophet"); Node domNode = analysisResult.getDomNode(); Element childElement = domNode.getOwnerDocument().createElement("peptideprophet_result"); childElement.setAttribute("probability", "" + pprophet); if (allNttProb == null) allNttProb = "(" + pprophet + "," + pprophet + "," + pprophet + ")"; childElement.setAttribute("all_ntt_prob", allNttProb); domNode.appendChild(childElement); if (fval != 0) { Element searchScoreSummaryElement = domNode.getOwnerDocument().createElement("search_score_summary"); childElement.appendChild(searchScoreSummaryElement); Element paramElem = domNode.getOwnerDocument().createElement("parameter"); searchScoreSummaryElement.appendChild(paramElem); paramElem.setAttribute("name", "fval"); paramElem.setAttribute("value", "" + fval); } } /** * Add a search score. Doesn't make any attempt to prevent duplicates * @param searchHit * @param searchScoreName * @param searchScoreValue */ protected void addSearchScore(MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit searchHit, String searchScoreName, String searchScoreValue) { NameValueType newSearchScore = searchHit.addNewSearchScore(); newSearchScore.setName(searchScoreName); XmlAnySimpleType valueSimpleType = XmlAnySimpleType.Factory.newInstance(); valueSimpleType.setStringValue(searchScoreValue); newSearchScore.setValue(valueSimpleType); } /** * Utility method to create a spectrumQuery, added to the first run summary * @param scanFirst * @param scanLast * @param charge * @return */ protected MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery addSpectrumQuery(int scanFirst, int scanLast, int charge, int index) { MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery spectrumQuery = _firstRunSummary.addNewSpectrumQuery(); spectrumQuery.setIndex(index); spectrumQuery.setStartScan(scanFirst); spectrumQuery.setEndScan(scanLast); spectrumQuery.setAssumedCharge(new BigInteger(Integer.toString(charge))); return spectrumQuery; } /** * Utility method for adding modified aminoacids * @param searchHit * @param modifiedAminoAcids */ protected void addModificationsToSearchHit(MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit searchHit, List<ModifiedAminoAcid>[] modifiedAminoAcids, float nTermModMass, float cTermModMass) { if (modifiedAminoAcids != null || nTermModMass != 0 || cTermModMass != 0) { MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit.ModificationInfo xmlBeansModInfo =searchHit.addNewModificationInfo(); String peptideSequence = searchHit.getPeptide(); StringBuffer modPeptideStringBuf = new StringBuffer(); //handling cterm down below because of stringbuf if (nTermModMass != 0) { xmlBeansModInfo.setModNtermMass(nTermModMass); modPeptideStringBuf.append("n[" + (int) nTermModMass + "]"); } //modifiedAminoAcids.length guaranteed == peptideSequence.length(); for (int i=0; i<modifiedAminoAcids.length; i++) { modPeptideStringBuf.append(peptideSequence.charAt(i)); if (modifiedAminoAcids[i] == null) continue; //there may be multiple modifications. If so, too bad. Keep the highest modification mass double greatestModMass = 0f; for (ModifiedAminoAcid mod : modifiedAminoAcids[i]) { MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SpectrumQuery.SearchResult.SearchHit.ModificationInfo.ModAminoacidMass xmlBeansModAminoacidMass = xmlBeansModInfo.addNewModAminoacidMass(); //0-based to 1-based array indexing conversion xmlBeansModAminoacidMass.setPosition(BigInteger.valueOf(i + 1)); xmlBeansModAminoacidMass.setMass(mod.getMass()); if (mod.getMass() > greatestModMass) greatestModMass = mod.getMass(); } if (greatestModMass > 0f) modPeptideStringBuf.append("[" + Math.round(greatestModMass) + "]"); } xmlBeansModInfo.setModifiedPeptide(modPeptideStringBuf.toString()); if (cTermModMass != 0) { xmlBeansModInfo.setModCtermMass(cTermModMass); modPeptideStringBuf.append("c[" + (int) cTermModMass + "]"); } } } /** * Better only call once * @param databasePath */ public void setSearchDatabase(String databasePath) { MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary.SearchDatabase searchDatabase = getSearchSummary().addNewSearchDatabase(); searchDatabase.setLocalPath(databasePath); } /** * Better only call once. * TODO: un-hardcode trypsin * @param maxCleavages * @param minTermini */ public void setSearchConstraints(int maxCleavages, int minTermini) { MsmsPipelineAnalysisDocument.MsmsPipelineAnalysis.MsmsRunSummary.SearchSummary.EnzymaticSearchConstraint enzSearchConstraint = getSearchSummary().addNewEnzymaticSearchConstraint(); enzSearchConstraint.setMaxNumInternalCleavages(BigInteger.valueOf(maxCleavages)); enzSearchConstraint.setMinNumberTermini(BigInteger.valueOf(minTermini)); enzSearchConstraint.setEnzyme("trypsin"); } protected void preWrite() { if (_baseName != null) { _firstRunSummary.setBaseName(_baseName); getSearchSummary().setBaseName(_baseName); } } /** * Write out the full document, with all modifications and features, to a file * @param file * @throws IOException */ public void write(File file) throws IOException { preWrite(); //add a sentinel node that tells us where to split the document to insert features and modifications, //which, conveniently, is the same place for both Node runSummaryNode = _firstRunSummary.getDomNode(); //if there isn't an explicitly defined base string for all spectra, create it from //the first part of the filename up to the /last/ ".". Exception: if ends with .pep.xml (any case), //leave out "pep.xml" if ("".equals(_spectrumBaseString)) { _spectrumBaseString = file.getName(); //special handling: if .pep.xml, change to .xml before removing extension if (_spectrumBaseString.toLowerCase().endsWith(".pep.xml")) _spectrumBaseString = _spectrumBaseString.substring(0, _spectrumBaseString.length()-8) + ".xml"; if (_spectrumBaseString.contains(".")) _spectrumBaseString = _spectrumBaseString.substring(0, _spectrumBaseString.lastIndexOf(".")); _spectrumBaseString = _spectrumBaseString + "."; } //add required namespace element // Element namespaceElement = runSummaryNode.getOwnerDocument().createElement("xmlns"); Attr nsAttr = _xmlBeansAnalysis.getDomNode().getOwnerDocument().createAttribute("xmlns"); nsAttr.setValue("http://regis-web.systemsbiology.net/pepXML"); // namespaceElement.setNodeValue("http://regis-web.systemsbiology.net/pepXML"); _xmlBeansAnalysis.getDomNode().getAttributes().setNamedItem(nsAttr); Node featureLocationNode = runSummaryNode.getOwnerDocument().createElement("SENTINEL_FEATURE_LOCATION"); runSummaryNode.appendChild(featureLocationNode); addModificationsToXML(); //create and break up the xml that defines the document structure String documentShell = _xmlBeansPepXmlDoc.xmlText(_optionsForPrinting); //By default, namespace stuff will be written to every opening and closing tag. This gives //some "parsers" heartburn documentShell = documentShell.replaceAll("<pep:","<"); documentShell = documentShell.replaceAll("</pep:","</"); //Empty namespace attributes are created, and I don't know of a cleaner way to get rid of them //TODO: find a cleaner way to get rid of xmlns attrs on q3ratio_summary, peptideprophet_result documentShell = documentShell.replaceAll("xmlns=\"\"", ""); String[] halves = documentShell.split("<SENTINEL_FEATURE_LOCATION[^\\/]*\\/>"); if (halves.length != 2) { _log.error("Failed to create document shell for writing"); return; } _documentPrefix = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + halves[0]; _documentPostscript = halves[1]; //remove our dummy node runSummaryNode.removeChild(featureLocationNode); PrintWriter pw = new PrintWriter(file); pw.print(_documentPrefix); writeSpectrumQueries(pw); pw.print(_documentPostscript); pw.flush(); } public String getSpectrumBaseString() { return _spectrumBaseString; } public void setSpectrumBaseString(String spectrumBaseString) { this._spectrumBaseString = spectrumBaseString; } public String get_precursorMassType() { return _precursorMassType; } public void set_precursorMassType(String _precursorMassType) { this._precursorMassType = _precursorMassType; } public String get_searchEngine() { return _searchEngine; } public void set_searchEngine(String _searchEngine) { this._searchEngine = _searchEngine; if (_searchSummary != null) { Node searchSummAtt = _searchSummary.getDomNode().getAttributes().getNamedItem("search_engine"); searchSummAtt.setNodeValue(_searchEngine); } } }