/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.lcms;
import org.apache.commons.io.input.ReaderInputStream;
import org.apache.commons.lang3.tuple.Pair;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathException;
import javax.xml.xpath.XPathFactory;
import java.io.FileInputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.IOException;
/**
* Parses mzML files, converting the time points contained therein into {@link com.act.lcms.LCMSSpectrum} objects.
*
* mzML has a few quirks that the user ought to be aware of:
* <ul>
* <li>
* Each mzML file may contain data for several kinds of scans, differentiated by their "function" value. For
* the Waters instrument used by ECL, there may be three different scan functions; we are currently only interested
* in function 2.
* </li>
* <li>
* The mass/charge and intensity data for a given spectrum (time point) are stored as base64-encoded lists of
* little-endian IEEE 754 floating point numbers. These lists are unpacked and zipped together by this parser.
* </li>
* <li>
* Each spectrum has a "base peak m/z" and "base peak intensity" value specified, which is the
* mass/charge with the highest intensity value (and that intensity value) at the current time point. This
* {mass/charge, intensity} pair does <b>not</b> reappear in the spectrum data: it seems to be plucked out of the
* spectrum data.
* </li>
* </ul>
*
* Note that the {@link #parse(String)} function for this class is a memory hog. Use {@link #getIterator(String)}
* wherever possible instead.
*/
public class LCMSmzMLParser extends MzMLParser<LCMSSpectrum> implements LCMSParser {
  // Paths for invariant checking.

  /** Selects the cvParam node that marks a spectrum document as MS1 data. */
  public static final String SPECTRUM_PATH_EXPECTED_VERSION = "/spectrum/cvParam[@name='MS1 spectrum']";
  /** Selects the cvParam node that marks a spectrum document as diode array data (skipped without a warning). */
  public static final String SPECTRUM_PATH_EXPECTED_VERSION_DIODE_ARRAY = "/spectrum/cvParam[@name='electromagnetic radiation spectrum']";
  /** Selects the count attribute of the scan list; exactly one scan per spectrum is accepted. */
  public static final String SPECTRUM_PATH_SCAN_LIST_COUNT = "/spectrum/scanList/@count";
  /** Selects the count attribute of the binary data array list; exactly two arrays per spectrum are accepted. */
  public static final String SPECTRUM_PATH_BINARY_DATA_ARRAY_LIST_COUNT =
      "/spectrum/binaryDataArrayList/@count";

  /** Creates a parser; all state is set up by the superclass constructor. */
  public LCMSmzMLParser() {
    super();
  }

  /**
   * Converts a single spectrum document into an {@link LCMSSpectrum}.
   *
   * Every failed check writes a warning to stderr and yields {@code null}, with one exception: documents that are
   * not MS1 spectra but are diode array spectra are skipped silently.
   *
   * @param doc the document holding one spectrum element at its root
   * @return the extracted spectrum, or {@code null} if the document was skipped
   * @throws XPathException if one of the XPath expressions cannot be evaluated
   */
  protected LCMSSpectrum handleSpectrumEntry(Document doc) throws XPathException {
    XPath xpath = getXPathFactory().newXPath();

    Double spectrumIndexD = (Double) xpath.evaluate(SPECTRUM_PATH_INDEX, doc, XPathConstants.NUMBER);
    // A NUMBER evaluation reports an absent attribute as NaN, not null; NaN.intValue() would silently become 0.
    if (isMissing(spectrumIndexD)) {
      System.err.format("WARNING: found spectrum document without index attribute.\n");
      return null;
    }
    Integer spectrumIndex = spectrumIndexD.intValue();

    if (xpath.evaluate(SPECTRUM_PATH_EXPECTED_VERSION, doc, XPathConstants.NODE) == null) {
      // Not MS1 spectrum data, so it is left out of the output. Diode array entries are expected and skipped
      // silently; anything else is genuinely unexpected and is reported to the user.
      if (xpath.evaluate(SPECTRUM_PATH_EXPECTED_VERSION_DIODE_ARRAY, doc, XPathConstants.NODE) == null) {
        System.err.format("WARNING: found unexpected MS spectrum version in spectrum document %d. Skipping.\n",
            spectrumIndex);
      }
      return null;
    }

    String spectrumId = (String) xpath.evaluate(SPECTRUM_PATH_ID, doc, XPathConstants.STRING);
    // A STRING evaluation reports an absent node as "", not null.
    if (isMissing(spectrumId)) {
      System.err.format("WARNING: no spectrum id found for documnt %d\n", spectrumIndex);
      return null;
    }

    Matcher matcher = SPECTRUM_EXTRACTION_REGEX.matcher(spectrumId);
    if (!matcher.find()) {
      System.err.format("WARNING: spectrum id for documnt %d did not match regex: %s\n", spectrumIndex, spectrumId);
      return null;
    }
    // Capture group 1 carries the scan function and group 3 the scan number.
    Integer spectrumFunction = Integer.parseInt(matcher.group(1));
    Integer spectrumScan = Integer.parseInt(matcher.group(3));

    // A missing count evaluates to NaN, whose intValue() is 0, so it fails the equality checks below as well.
    int scanListCount =
        ((Double) xpath.evaluate(SPECTRUM_PATH_SCAN_LIST_COUNT, doc, XPathConstants.NUMBER)).intValue();
    if (scanListCount != 1) {
      System.err.format("WARNING: unexpected number of scan entries in spectrum document %d: %d\n",
          spectrumIndex, scanListCount);
      return null;
    }

    int binaryDataCount =
        ((Double) xpath.evaluate(SPECTRUM_PATH_BINARY_DATA_ARRAY_LIST_COUNT, doc, XPathConstants.NUMBER)).intValue();
    if (binaryDataCount != 2) {
      System.err.format("WARNING: unexpected number of binary data entries in spectrum document %d: %d\n",
          spectrumIndex, binaryDataCount);
      return null;
    }

    Double basePeakMz = (Double) xpath.evaluate(SPECTRUM_PATH_BASE_PEAK_MZ, doc, XPathConstants.NUMBER);
    if (isMissing(basePeakMz)) {
      System.err.format("WARNING: no base peak m/z found for spectrum document %d\n", spectrumIndex);
      return null;
    }

    Double basePeakIntensity = (Double) xpath.evaluate(SPECTRUM_PATH_BASE_PEAK_INTENSITY, doc, XPathConstants.NUMBER);
    if (isMissing(basePeakIntensity)) {
      System.err.format("WARNING: no base peak intensity found for spectrum document %d\n", spectrumIndex);
      return null;
    }

    Double scanStartTime = (Double) xpath.evaluate(SPECTRUM_PATH_SCAN_START_TIME, doc, XPathConstants.NUMBER);
    if (isMissing(scanStartTime)) {
      System.err.format("WARNING: no scan start time found for spectrum document %d\n", spectrumIndex);
      return null;
    }

    String scanStartTimeUnit = (String) xpath.evaluate(SPECTRUM_PATH_SCAN_START_TIME_UNIT, doc, XPathConstants.STRING);
    if (isMissing(scanStartTimeUnit)) {
      System.err.format("WARNING: no scan start time unit found for spectrum document %d\n", spectrumIndex);
      return null;
    }

    // NOTE(review): for the two binary payloads only null is treated as missing. An empty string is still handed
    // to the decoder, exactly as before, so that scans whose arrays are empty are not newly dropped. Confirm
    // whether empty payloads should be skipped instead.
    String mzData = (String) xpath.evaluate(SPECTRUM_PATH_MZ_BINARY_DATA, doc, XPathConstants.STRING);
    if (mzData == null) {
      System.err.format("WARNING: no m/z data found for spectrum document %d\n", spectrumIndex);
      return null;
    }

    String intensityData = (String) xpath.evaluate(SPECTRUM_PATH_INTENSITY_BINARY_DATA, doc, XPathConstants.STRING);
    if (intensityData == null) {
      System.err.format("WARNING: no intensity data found for spectrum document %d\n", spectrumIndex);
      return null;
    }

    // Decode both arrays and pair them up element by element.
    List<Double> mzs = base64ToDoubleList(mzData);
    List<Double> intensities = base64ToDoubleList(intensityData);
    List<Pair<Double, Double>> mzIntensityPairs = zipLists(mzs, intensities);

    return new LCMSSpectrum(spectrumIndex, scanStartTime, scanStartTimeUnit,
        mzIntensityPairs, basePeakMz, basePeakIntensity, spectrumFunction, spectrumScan, null);
  }

  /**
   * Parses every spectrum in the given file by delegating to the superclass implementation.
   *
   * @param inputFile path of the file to parse
   * @return the parsed spectra
   * @throws ParserConfigurationException propagated from the superclass parser
   * @throws IOException propagated from the superclass parser
   * @throws XMLStreamException propagated from the superclass parser
   */
  @Override
  public List<LCMSSpectrum> parse(String inputFile)
      throws ParserConfigurationException, IOException, XMLStreamException {
    return super.parse(inputFile);
  }

  /**
   * Returns an iterator over the spectra in the given file by delegating to the superclass implementation.
   *
   * @param inputFile path of the file to parse
   * @return an iterator over the parsed spectra
   * @throws ParserConfigurationException propagated from the superclass parser
   * @throws IOException propagated from the superclass parser
   * @throws XMLStreamException propagated from the superclass parser
   */
  @Override
  public Iterator<LCMSSpectrum> getIterator(String inputFile)
      throws ParserConfigurationException, IOException, XMLStreamException {
    return super.getIterator(inputFile);
  }

  /** Tells whether a numeric XPath result stands for "no value": null, or the NaN produced for absent nodes. */
  private static boolean isMissing(Double value) {
    return value == null || value.isNaN();
  }

  /** Tells whether a string XPath result stands for "no value": null, or the empty string produced for absent nodes. */
  private static boolean isMissing(String value) {
    return value == null || value.isEmpty();
  }
}