LCMSmzMLParser.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.lcms;

import org.apache.commons.io.input.ReaderInputStream;
import org.apache.commons.lang3.tuple.Pair;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathException;
import javax.xml.xpath.XPathFactory;
import java.io.FileInputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.IOException;

/**
 * Parses mzML files, converting the time points contained therein into {@link com.act.lcms.LCMSSpectrum} objects.
 *
 * mzML has a few quirks that the user ought to be aware of:
 * <ul>
 *   <li>
 *     Each mzML file may contain data for several kinds of scans, differentiated by their "function" value.  For
 *     the Waters instrument used by ECL, there may be three different scan functions; we currently are only interested
 *     in function 2.
 *   </li>
 *   <li>
 *     The mass/charge and intensity data for a given spectrum (time point) are stored as base64-encoded lists of
 *     little-endian IEEE 754 floating point numbers.  These lists are unpacked and zipped together by this parser.
 *   </li>
 *   <li>
 *     Each spectrum has a "base peak m/z" and "base peak intensity" value specified, which is the
 *     mass/charge with the highest intensity value (and that intensity value) at the current time point.  This
 *     {mass/charge, intensity} pair does <b>not</b> reappear in the spectrum data: it seems to be plucked out of the
 *     spectrum data.
 *   </li>
 * </ul>
 *
 * Note that the {@link #parse(String)} function for this class is a memory hog.  Use {@link #getIterator(String)}
 * wherever possible instead.
 */
public class LCMSmzMLParser extends MzMLParser<LCMSSpectrum> implements LCMSParser {

  // Paths for invariant checking.
  public static final String SPECTRUM_PATH_EXPECTED_VERSION = "/spectrum/cvParam[@name='MS1 spectrum']";
  public static final String SPECTRUM_PATH_EXPECTED_VERSION_DIODE_ARRAY = "/spectrum/cvParam[@name='electromagnetic radiation spectrum']";
  public static final String SPECTRUM_PATH_SCAN_LIST_COUNT = "/spectrum/scanList/@count";
  public static final String SPECTRUM_PATH_BINARY_DATA_ARRAY_LIST_COUNT =
      "/spectrum/binaryDataArrayList/@count";

  /** Creates a parser; all state is set up by the superclass constructor. */
  public LCMSmzMLParser() {
    super();
  }

  /**
   * Converts one {@code <spectrum>} document into an {@link LCMSSpectrum}.
   *
   * <p>The entry is skipped (a warning is written to {@code System.err} and {@code null} is returned) when any
   * required attribute is absent or when the scan-list / binary-data-array counts are not 1 and 2 respectively.
   * Entries that are not tagged as an MS1 spectrum are also skipped; those tagged as an electromagnetic radiation
   * (diode array) spectrum are skipped silently.
   *
   * <p>The {@code SPECTRUM_PATH_*} constants not declared in this class, {@code SPECTRUM_EXTRACTION_REGEX},
   * {@code getXPathFactory()}, {@code base64ToDoubleList} and {@code zipLists} are not defined in this file;
   * presumably they are inherited from {@code MzMLParser} -- confirm there.
   *
   * @param doc a document whose root element is a single {@code spectrum}
   * @return the parsed spectrum, or {@code null} if the entry was skipped
   * @throws XPathException if any of the XPath expressions cannot be evaluated
   */
  protected LCMSSpectrum handleSpectrumEntry(Document doc) throws XPathException {
    XPath xpath = getXPathFactory().newXPath();

    Double spectrumIndexD = evaluateNumberOrNull(xpath, SPECTRUM_PATH_INDEX, doc);
    if (spectrumIndexD == null) {
      System.err.format("WARNING: found spectrum document without index attribute.\n");
      return null;
    }
    Integer spectrumIndex = spectrumIndexD.intValue();

    if (xpath.evaluate(SPECTRUM_PATH_EXPECTED_VERSION, doc, XPathConstants.NODE) == null) {
      // Not MS1 spectrum data, so it is left out of the output.  Diode array entries are expected and skipped
      // silently; anything else is genuinely unexpected, so tell the user about it.
      if (xpath.evaluate(SPECTRUM_PATH_EXPECTED_VERSION_DIODE_ARRAY, doc, XPathConstants.NODE) == null) {
        System.err.format("WARNING: found unexpected MS spectrum version in spectrum document %d.  Skipping.\n",
            spectrumIndex);
      }

      return null;
    }

    String spectrumId = evaluateNonEmptyStringOrNull(xpath, SPECTRUM_PATH_ID, doc);
    if (spectrumId == null) {
      System.err.format("WARNING: no spectrum id found for documnt %d\n", spectrumIndex);
      return null;
    }

    Matcher matcher = SPECTRUM_EXTRACTION_REGEX.matcher(spectrumId);
    if (!matcher.find()) {
      System.err.format("WARNING: spectrum id for documnt %d did not match regex: %s\n", spectrumIndex, spectrumId);
      return null;
    }
    // NOTE(review): groups 1 and 3 are taken to be the function and scan numbers; verify against the regex.
    Integer spectrumFunction = Integer.parseInt(matcher.group(1));
    Integer spectrumScan = Integer.parseInt(matcher.group(3));

    // A missing count attribute evaluates to NaN, whose intValue() is 0, so it is rejected by the checks below.
    Integer scanListCount =
        ((Double) xpath.evaluate(SPECTRUM_PATH_SCAN_LIST_COUNT, doc, XPathConstants.NUMBER)).intValue();
    if (!Integer.valueOf(1).equals(scanListCount)) {
      System.err.format("WARNING: unexpected number of scan entries in spectrum document %d: %d\n",
          spectrumIndex, scanListCount);
      return null;
    }

    Integer binaryDataCount =
        ((Double) xpath.evaluate(SPECTRUM_PATH_BINARY_DATA_ARRAY_LIST_COUNT, doc, XPathConstants.NUMBER)).intValue();
    if (!Integer.valueOf(2).equals(binaryDataCount)) {
      System.err.format("WARNING: unexpected number of binary data entries in spectrum document %d: %d\n",
          spectrumIndex, binaryDataCount);
      return null;
    }

    Double basePeakMz = evaluateNumberOrNull(xpath, SPECTRUM_PATH_BASE_PEAK_MZ, doc);
    if (basePeakMz == null) {
      System.err.format("WARNING: no base peak m/z found for spectrum document %d\n", spectrumIndex);
      return null;
    }

    Double basePeakIntensity = evaluateNumberOrNull(xpath, SPECTRUM_PATH_BASE_PEAK_INTENSITY, doc);
    if (basePeakIntensity == null) {
      System.err.format("WARNING: no base peak intensity found for spectrum document %d\n", spectrumIndex);
      return null;
    }

    Double scanStartTime = evaluateNumberOrNull(xpath, SPECTRUM_PATH_SCAN_START_TIME, doc);
    if (scanStartTime == null) {
      System.err.format("WARNING: no scan start time found for spectrum document %d\n", spectrumIndex);
      return null;
    }

    String scanStartTimeUnit = evaluateNonEmptyStringOrNull(xpath, SPECTRUM_PATH_SCAN_START_TIME_UNIT, doc);
    if (scanStartTimeUnit == null) {
      System.err.format("WARNING: no scan start time unit found for spectrum document %d\n", spectrumIndex);
      return null;
    }

    // XPath yields "" both for a missing node and for an empty one, so the null checks below can only fire if the
    // XPath implementation misbehaves.  Empty payloads are intentionally NOT rejected here: they are handed to
    // base64ToDoubleList exactly as before, since an empty binary array may be a legitimate zero-peak spectrum.
    String mzData = (String) xpath.evaluate(SPECTRUM_PATH_MZ_BINARY_DATA, doc, XPathConstants.STRING);
    if (mzData == null) {
      System.err.format("WARNING: no m/z data found for spectrum document %d\n", spectrumIndex);
      return null;
    }

    String intensityData = (String) xpath.evaluate(SPECTRUM_PATH_INTENSITY_BINARY_DATA, doc, XPathConstants.STRING);
    if (intensityData == null) {
      System.err.format("WARNING: no intensity data found for spectrum document %d\n", spectrumIndex);
      return null;
    }

    List<Double> mzs = base64ToDoubleList(mzData);
    List<Double> intensities = base64ToDoubleList(intensityData);
    List<Pair<Double, Double>> mzIntensityPairs = zipLists(mzs, intensities);

    return new LCMSSpectrum(spectrumIndex, scanStartTime, scanStartTimeUnit,
        mzIntensityPairs, basePeakMz, basePeakIntensity, spectrumFunction, spectrumScan, null);
  }

  /**
   * Evaluates an XPath expression as a number, mapping "not present" to {@code null}.
   *
   * <p>With {@link XPathConstants#NUMBER} a missing or non-numeric node is reported as {@code NaN} rather than
   * {@code null}, so a plain null check never detects an absent value.
   *
   * @return the numeric value, or {@code null} if the expression produced no usable number
   */
  private static Double evaluateNumberOrNull(XPath xpath, String expression, Document doc)
      throws XPathException {
    Double value = (Double) xpath.evaluate(expression, doc, XPathConstants.NUMBER);
    return (value == null || value.isNaN()) ? null : value;
  }

  /**
   * Evaluates an XPath expression as a string, mapping "not present" to {@code null}.
   *
   * <p>With {@link XPathConstants#STRING} a missing node is reported as the empty string rather than {@code null}.
   *
   * @return the string value, or {@code null} if the expression produced no text
   */
  private static String evaluateNonEmptyStringOrNull(XPath xpath, String expression, Document doc)
      throws XPathException {
    String value = (String) xpath.evaluate(expression, doc, XPathConstants.STRING);
    return (value == null || value.isEmpty()) ? null : value;
  }

  /**
   * Parses every spectrum in the given file by delegating to the superclass implementation.
   *
   * @param inputFile passed through unchanged to {@code super.parse}
   * @return the spectra produced by the superclass parser
   */
  @Override
  public List<LCMSSpectrum> parse(String inputFile)
      throws ParserConfigurationException, IOException, XMLStreamException {
    return super.parse(inputFile);
  }

  /**
   * Returns an iterator over the spectra in the given file by delegating to the superclass implementation.
   *
   * @param inputFile passed through unchanged to {@code super.getIterator}
   * @return the iterator produced by the superclass parser
   */
  @Override
  public Iterator<LCMSSpectrum> getIterator(String inputFile)
      throws ParserConfigurationException, IOException, XMLStreamException {
    return super.getIterator(inputFile);
  }
}