/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.viewer.amt;
import java.io.*;
import java.util.List;
import java.util.ArrayList;
import org.w3c.dom.Node;
import org.apache.log4j.Logger;
import org.fhcrc.cpl.toolbox.filehandler.Stax2DomBuilder;
import org.fhcrc.cpl.toolbox.proteomics.MS2Modification;
import org.fhcrc.proteomics.schemaRevision.amtXml10.*;
import javax.xml.stream.XMLStreamException;
/**
* A restrictive wrapper for reading AmtXml files. The approach is a compromise between
* the elegance of DOM/XmlBeans and the efficiency of StAX: we use Stax2DomBuilder to pull DOM subtrees
* out of the AmtXml file for the bits that we care about, namely modifications, runs and peptide entries
*/
public class AmtXmlReader
{
    static Logger _log = Logger.getLogger(AmtXmlReader.class);

    /**
     * Passed as the last argument of every Stax2DomBuilder.findTreeForName call.
     * NOTE(review): presumably the name of the required parent element -- confirm against Stax2DomBuilder.
     */
    private static final String ROOT_ELEMENT_NAME = "amt_database";

    //stores the database read from a file
    protected AmtDatabase mAmtDatabase = null;

    /**
     * Create a reader and immediately load the given AmtXml file by calling {@link #read(File)}.
     * Note that read() is public and non-final, so a subclass override runs from this constructor.
     * @param file the AmtXml file to load
     * @throws FileNotFoundException propagated from {@link #read(File)}
     * @throws XMLStreamException propagated from {@link #read(File)}
     */
    public AmtXmlReader(File file) throws FileNotFoundException, XMLStreamException
    {
        read(file);
    }

    /**
     * Read in a file in three independent passes: metadata and modifications, then runs, then
     * peptide entries. Each pass opens its own Stax2DomBuilder on the file.
     * @param file the AmtXml file to load
     * @throws FileNotFoundException propagated from the individual passes
     * @throws XMLStreamException propagated from the individual passes
     */
    public void read(File file) throws FileNotFoundException, XMLStreamException
    {
        //this is not the most efficient way to do it.  If performance is bad, reimplement.
        //On the other hand, it's simple, and it doesn't waste any memory to speak of
        _log.debug("Reading metadata");
        extractMetadataAndModifications(file);
        _log.debug("Extracting runs");
        extractRuns(file);
        _log.debug("Extracting entries");
        extractEntries(file);
    }

    /**
     * Records the source file on the database (creating the database if necessary) and adds one
     * MS2Modification per "aminoacid_modification" subtree found in the file.
     *
     * TODO: get hydrophobicity calculator and version
     * @param inputFile the AmtXml file to scan
     * @throws FileNotFoundException declared for the Stax2DomBuilder construction/scan
     * @throws XMLStreamException declared for the Stax2DomBuilder construction/scan
     */
    protected void extractMetadataAndModifications(File inputFile)
            throws FileNotFoundException, XMLStreamException
    {
        if (mAmtDatabase == null)
            mAmtDatabase = new AmtDatabase();
        mAmtDatabase.setAmtDBSourceFile(inputFile);

        // NOTE(review): the builder is never closed here; whether Stax2DomBuilder holds an open
        // stream that needs releasing cannot be determined from this file.
        Stax2DomBuilder builder = new Stax2DomBuilder(inputFile);
        AmtDatabaseDocument.AmtDatabase scratchDatabase = newScratchAmtDatabase();

        //extract top-level stuff
        // NOTE(review): scratchDatabase was created empty just above and nothing from the file has
        // been attached to it yet, so these getters cannot reflect the file's contents (see the
        // TODO in the method comment). Kept as-is to preserve existing behavior.
        mAmtDatabase.setHydrophobicityAlgorithmName(scratchDatabase.getHydrophobicityAlgorithm());
        mAmtDatabase.setHydrophobicityAlgorithmVersion(scratchDatabase.getHydrophobicityAlgVersion());

        Node treeRoot;
        while ((treeRoot = nextSubtree(builder, scratchDatabase, "aminoacid_modification")) != null)
        {
            // Temporarily attach the subtree so that XmlBeans exposes it as element 0
            scratchDatabase.getDomNode().appendChild(treeRoot);
            AmtDatabaseDocument.AmtDatabase.AminoacidModification xmlBeansModification =
                    scratchDatabase.getAminoacidModificationArray(0);

            MS2Modification dbMod = new MS2Modification();
            dbMod.setAminoAcid(xmlBeansModification.getResidue());
            dbMod.setVariable(xmlBeansModification.getVariableFlag());
            dbMod.setMassDiff(xmlBeansModification.getMassDifference().floatValue());
            mAmtDatabase.addAminoacidModification(dbMod);

            //remove the dummy node, so that the next subtree is again element 0
            scratchDatabase.getDomNode().removeChild(treeRoot);
        }
    }

    /**
     * Adds one AmtRunEntry to the database per "run" subtree found in the file. Modification ids
     * used by a run are resolved against the modifications already present in the database.
     * @param inputFile the AmtXml file to scan
     * @throws FileNotFoundException declared for the Stax2DomBuilder construction/scan
     * @throws XMLStreamException declared for the Stax2DomBuilder construction/scan
     */
    protected void extractRuns(File inputFile) throws FileNotFoundException, XMLStreamException
    {
        // Same lazy initialisation as the other extract* methods, so that calling this method on
        // its own does not fail with a NullPointerException.
        if (mAmtDatabase == null)
            mAmtDatabase = new AmtDatabase();

        // NOTE(review): the builder is never closed here; see extractMetadataAndModifications.
        Stax2DomBuilder builder = new Stax2DomBuilder(inputFile);
        AmtDatabaseDocument.AmtDatabase scratchDatabase = newScratchAmtDatabase();

        Node treeRoot;
        while ((treeRoot = nextSubtree(builder, scratchDatabase, "run")) != null)
        {
            // Temporarily attach the subtree so that XmlBeans exposes it as element 0
            scratchDatabase.getDomNode().appendChild(treeRoot);
            AmtDatabaseDocument.AmtDatabase.Run xmlBeansRun = scratchDatabase.getRunArray(0);

            List<MS2Modification> ms2ModList = new ArrayList<MS2Modification>();
            for (AmtDatabaseDocument.AmtDatabase.Run.AminoacidModUsage modUsage :
                    xmlBeansRun.getAminoacidModUsageArray())
            {
                ms2ModList.add(mAmtDatabase.getAminoacidModificationBySequence(modUsage.getModificationId()));
            }

            // Each coefficient is stored at the index given by its degree, so the order of the
            // elements in the file does not matter.
            AmtDatabaseDocument.AmtDatabase.Run.TimeHydroMappingCoeff[] mappingCoeffs =
                    xmlBeansRun.getTimeHydroMappingCoeffArray();
            double[] coeffs = new double[mappingCoeffs.length];
            for (AmtDatabaseDocument.AmtDatabase.Run.TimeHydroMappingCoeff mappingCoeff : mappingCoeffs)
            {
                coeffs[mappingCoeff.getDegree()] = mappingCoeff.getValue().doubleValue();
            }

            AmtRunEntry runEntry =
                    new AmtRunEntry(coeffs,
                            ms2ModList.toArray(new MS2Modification[0]),
                            xmlBeansRun.getTimeAdded().getTime());
            mAmtDatabase.addRunEntry(runEntry);
            //check that the sequence worked out ok
            assert(mAmtDatabase.getSequenceForRun(runEntry) == xmlBeansRun.getRunId());

            // Optional attributes: only copied over when present (and, for strings, non-empty)
            String mzXmlFilename = xmlBeansRun.getMzxmlFilename();
            if (hasText(mzXmlFilename))
                runEntry.setMzXmlFilename(mzXmlFilename);
            String pepXmlFilename = xmlBeansRun.getPepxmlFilename();
            if (hasText(pepXmlFilename))
                runEntry.setPepXmlFilename(pepXmlFilename);
            String lsid = xmlBeansRun.getLSID();
            if (hasText(lsid))
                runEntry.setLSID(lsid);
            if (xmlBeansRun.getMinPeptideProphet() != null)
                runEntry.setMinPeptideProphet(xmlBeansRun.getMinPeptideProphet().doubleValue());
            if (xmlBeansRun.getTimeAnalyzed() != null)
                runEntry.setTimeAnalyzed(xmlBeansRun.getTimeAnalyzed().getTime());

            //remove the dummy node, so that the next subtree is again element 0
            scratchDatabase.getDomNode().removeChild(treeRoot);
        }
    }

    /**
     * Adds the observations of every "peptide_entry" subtree found in the file to the database.
     * Run ids and modification ids are resolved against what is already present in the database.
     * @param inputFile the AmtXml file to scan
     * @throws FileNotFoundException declared for the Stax2DomBuilder construction/scan
     * @throws XMLStreamException declared for the Stax2DomBuilder construction/scan
     */
    protected void extractEntries(File inputFile) throws FileNotFoundException, XMLStreamException
    {
        if (mAmtDatabase == null)
            mAmtDatabase = new AmtDatabase();

        // NOTE(review): the builder is never closed here; see extractMetadataAndModifications.
        Stax2DomBuilder builder = new Stax2DomBuilder(inputFile);
        AmtDatabaseDocument.AmtDatabase scratchDatabase = newScratchAmtDatabase();

        //Because we have potentially a Very Large Number of entries, it's necessary to add them
        //and remove them one-by-one, so as not to fill up the available memory.
        Node treeRoot;
        while ((treeRoot = nextSubtree(builder, scratchDatabase, "peptide_entry")) != null)
        {
            scratchDatabase.getDomNode().appendChild(treeRoot);
            AmtDatabaseDocument.AmtDatabase.PeptideEntry xmlBeansPeptideEntry =
                    scratchDatabase.getPeptideEntryArray(0);

            AmtPeptideEntry entry = new AmtPeptideEntry();
            entry.setPeptideSequence(xmlBeansPeptideEntry.getPeptideSequence());
            // Guarded: avoids building the message for every entry when debug logging is off
            if (_log.isDebugEnabled())
                _log.debug("extractEntries: peptide " + entry.getPeptideSequence());
            entry.setPredictedHydrophobicity(xmlBeansPeptideEntry.getCalculatedHydrophobicity().doubleValue());

            for (AmtDatabaseDocument.AmtDatabase.PeptideEntry.ModificationStateEntry xmlBeansModState :
                    xmlBeansPeptideEntry.getModificationStateEntryArray())
            {
                _log.debug("extractEntries: mod state");
                List<MS2Modification>[] ms2Modifications =
                        readModifications(xmlBeansPeptideEntry, xmlBeansModState);

                _log.debug("extractEntries: mod state 2");
                AmtPeptideEntry.AmtPeptideModificationStateEntry modState =
                        entry.addModificationStateEntry(
                                xmlBeansModState.getModifiedSequence(),
                                xmlBeansModState.getModifiedMass().doubleValue(),
                                ms2Modifications);

                _log.debug("extractEntries: mod state 3");
                for (AmtDatabaseDocument.AmtDatabase.PeptideEntry.ModificationStateEntry.Observation xmlBeansObservation :
                        xmlBeansModState.getObservationArray())
                {
                    AmtPeptideEntry.AmtPeptideObservation observation =
                            AmtPeptideEntry.AmtPeptideObservation.createObservation(
                                    xmlBeansObservation.getObservedHydrophobicity().doubleValue(),
                                    xmlBeansObservation.getPeptideProphet().doubleValue(),
                                    mAmtDatabase.getRunBySequence(xmlBeansObservation.getRunId()),
                                    xmlBeansObservation.getTimeInRun().doubleValue());
                    if (xmlBeansObservation.isSetSpectralCount())
                        observation.setSpectralCount(xmlBeansObservation.getSpectralCount());
                    // Statistics are recalculated once per modification state, after the loop
                    modState.addObservationNoRecalc(observation);
                }
                _log.debug("extractEntries: mod state 4");
                modState.recalculateStats();
            }
            entry.recalculateStats();
            mAmtDatabase.addObservationsFromEntry(entry);

            // Detach the processed entry so that the next subtree is again element 0
            scratchDatabase.removePeptideEntry(0);
        }
    }

    /**
     * Accessor for loaded database
     * @return the database populated by the extract* methods, or null if none has been created
     */
    public AmtDatabase getDatabase()
    {
        return mAmtDatabase;
    }

    /**
     * Creates the empty XmlBeans amt_database element used as a throwaway parent: subtrees pulled
     * out of the file with Stax2DomBuilder are attached to it so that XmlBeans will parse them.
     */
    private static AmtDatabaseDocument.AmtDatabase newScratchAmtDatabase()
    {
        return AmtDatabaseDocument.Factory.newInstance().addNewAmtDatabase();
    }

    /**
     * Asks the builder for the next subtree with the given element name, created in the scratch
     * element's owner document so it can be appended there.
     * @return the subtree root, or null when the builder reports no further match
     */
    private static Node nextSubtree(Stax2DomBuilder builder,
                                    AmtDatabaseDocument.AmtDatabase scratchDatabase,
                                    String elementName)
            throws FileNotFoundException, XMLStreamException
    {
        return builder.findTreeForName(scratchDatabase.getDomNode().getOwnerDocument(),
                elementName, ROOT_ELEMENT_NAME);
    }

    /**
     * Builds the per-position modification lists for one modification state.
     * @return an array with one slot per residue of the peptide sequence (null slots where no
     * modification was recorded), or null if the modification state lists no modifications
     */
    private List<MS2Modification>[] readModifications(
            AmtDatabaseDocument.AmtDatabase.PeptideEntry xmlBeansPeptideEntry,
            AmtDatabaseDocument.AmtDatabase.PeptideEntry.ModificationStateEntry xmlBeansModState)
    {
        AmtDatabaseDocument.AmtDatabase.PeptideEntry.ModificationStateEntry.AminoacidModInstance[]
                xmlBeansModifications = xmlBeansModState.getAminoacidModInstanceArray();
        if (xmlBeansModifications == null || xmlBeansModifications.length == 0)
            return null;

        List<MS2Modification>[] modificationsByPosition =
                newModificationSlots(xmlBeansPeptideEntry.getPeptideSequence().length());
        //remember, position = index + 1
        // NOTE(review): despite the comment above, the position is used directly as the array
        // index. If positions in the file really are 1-based, a modification on the last residue
        // would index past the end of the array -- confirm against the code that writes the file.
        for (AmtDatabaseDocument.AmtDatabase.PeptideEntry.ModificationStateEntry.AminoacidModInstance xmlBeansModification :
                xmlBeansModifications)
        {
            int position = xmlBeansModification.getPosition();
            if (modificationsByPosition[position] == null)
                modificationsByPosition[position] = new ArrayList<MS2Modification>();
            modificationsByPosition[position].add(
                    mAmtDatabase.getAminoacidModificationBySequence(xmlBeansModification.getModificationId()));
        }
        return modificationsByPosition;
    }

    /**
     * Allocates an array of modification lists with every slot initially null.
     */
    @SuppressWarnings("unchecked") // Java cannot create a generic array directly; only List<MS2Modification> is ever stored
    private static List<MS2Modification>[] newModificationSlots(int length)
    {
        return (List<MS2Modification>[]) new List[length];
    }

    /**
     * @return true if the value is neither null nor the empty string
     */
    private static boolean hasText(String value)
    {
        return value != null && !"".equals(value);
    }
}