/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.lcms;
import org.apache.commons.lang3.tuple.Pair;
import ucar.ma2.Array;
import ucar.ma2.DataType;
import ucar.nc2.Attribute;
import ucar.nc2.NetcdfFile;
import ucar.nc2.Variable;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLStreamException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
/**
* Parses NetCDF files produced by an LCMS apparatus, converting the time points contained therein into
* {@link com.act.lcms.LCMSSpectrum} objects.
*
* <a href="http://www.unidata.ucar.edu/software/netcdf/">NetCDF</a> is a generic data format for storing array-oriented
* data. The Waters LCMS apparatus produces NetCDF files that are structured as follows:
* <ul>
* <li>
* The mass/charge and intensity values are stored as two long parallel arrays of values. The mass/charges
* from each scan are concatenated together to form an enormous 1-d array; the same is done for intensities.
* </li>
* <li>
* Several other parallel arrays are available in the file that represent attributes of each time-point (scan).
* These include the scan acquisition time, the number of mass/charge points acquired in the scan, and the offset
* of those points in the concatenated mass/charge and intensity arrays.
* </li>
* <li>
* To extract the set of {mass/charge, intensity} pairs for a given scan <i>i</i>, we grab the half-open range
* mass_values[scan_index[i] : scan_index[i] + point_count[i]) (start inclusive, end exclusive) and zip it with
* the corresponding intensity values.
* </li>
* </ul>
*
* The NetCDF API exposed in the ucar.ma2 package makes it easy to read and access these point arrays. Note, however,
* that the time/space performance characteristics of this library have not been thoroughly tested at 20n, so beware of
* excessive GC overhead or heap consumption.
*/
public class LCMSNetCDFParser implements LCMSParser {
  public static final String MASS_VALUES = "mass_values";
  public static final String INTENSITY_VALUES = "intensity_values";
  public static final String SCAN_TIME = "scan_acquisition_time";
  public static final String SCAN_POINTS_START = "scan_index";
  public static final String SCAN_POINTS_COUNT = "point_count";
  public static final String TOTAL_INTENSITY = "total_intensity";

  /**
   * Opens a NetCDF file, reads all of the per-point and per-scan arrays, and returns an iterator that assembles
   * one {@link LCMSSpectrum} per scan from those arrays.
   *
   * The file is closed before this method returns, whether or not reading succeeded: every array is read in full
   * up front (Variable.read() returns a memory-resident Array per the NetCDF-Java documentation), so the iterator
   * never touches the file and the handle cannot leak when the file has no scans, when reading fails part-way, or
   * when the caller abandons the iterator early.
   *
   * @param inputFile Path to the NetCDF file to read.
   * @return An iterator over the scans in the file, in index order.
   * @throws IOException If the file cannot be opened, read, or closed, or if an expected variable is absent.
   * @throws ParserConfigurationException Declared for compatibility with the overridden method; not thrown here.
   * @throws XMLStreamException Declared for compatibility with the overridden method; not thrown here.
   */
  @Override
  public Iterator<LCMSSpectrum> getIterator(String inputFile)
      throws ParserConfigurationException, IOException, XMLStreamException {
    final NetcdfFile netcdfFile = NetcdfFile.open(inputFile);
    try {
      // Assumption: these arrays will have the same length.
      final Array mzValues = readVariable(netcdfFile, MASS_VALUES, inputFile);
      final Array intensityValues = readVariable(netcdfFile, INTENSITY_VALUES, inputFile);
      assert(mzValues.getSize() == intensityValues.getSize());

      // Assumption: the mz/intensity values are always floats.
      assert(mzValues.getDataType() == DataType.FLOAT &&
          intensityValues.getDataType() == DataType.FLOAT);

      // Assumption: all of these variables' arrays will have the same lengths.
      final Array scanTimeArray = readVariable(netcdfFile, SCAN_TIME, inputFile);
      final Array scanPointsStartArray = readVariable(netcdfFile, SCAN_POINTS_START, inputFile);
      final Array scanPointsCountArray = readVariable(netcdfFile, SCAN_POINTS_COUNT, inputFile);
      final Array totalIntensityArray = readVariable(netcdfFile, TOTAL_INTENSITY, inputFile);
      assert(scanTimeArray.getSize() == scanPointsStartArray.getSize() &&
          scanPointsStartArray.getSize() == scanPointsCountArray.getSize() &&
          scanPointsCountArray.getSize() == totalIntensityArray.getSize());

      // Assumption: the following four columns always have these types.
      assert(scanTimeArray.getDataType() == DataType.DOUBLE &&
          scanPointsStartArray.getDataType() == DataType.INT &&
          scanPointsCountArray.getDataType() == DataType.INT &&
          totalIntensityArray.getDataType() == DataType.DOUBLE);

      return spectrumIterator(mzValues, intensityValues, scanTimeArray,
          scanPointsStartArray, scanPointsCountArray, totalIntensityArray);
    } finally {
      // All data has been read into the arrays above (or reading failed), so the file is no longer needed.
      netcdfFile.close();
    }
  }

  /**
   * Looks up a variable by name and reads its full contents, failing with a descriptive error rather than a
   * NullPointerException when the variable is absent.
   */
  private static Array readVariable(NetcdfFile netcdfFile, String variableName, String inputFile)
      throws IOException {
    Variable variable = netcdfFile.findVariable(variableName);
    if (variable == null) {
      throw new IOException(String.format(
          "NetCDF file %s does not contain expected variable '%s'", inputFile, variableName));
    }
    return variable.read();
  }

  /**
   * Builds an iterator that slices the concatenated mz/intensity arrays into one spectrum per scan.
   */
  private static Iterator<LCMSSpectrum> spectrumIterator(
      final Array mzValues, final Array intensityValues, final Array scanTimeArray,
      final Array scanPointsStartArray, final Array scanPointsCountArray, final Array totalIntensityArray) {
    final long scanCount = scanTimeArray.getSize();

    return new Iterator<LCMSSpectrum>() {
      private int scanIndex = 0;

      @Override
      public boolean hasNext() {
        return this.scanIndex < scanCount;
      }

      @Override
      public LCMSSpectrum next() {
        // Honor the Iterator contract instead of indexing past the end of the scan arrays.
        if (!hasNext()) {
          throw new NoSuchElementException(
              String.format("No more scans available (total scans: %d)", scanCount));
        }

        int pointCount = scanPointsCountArray.getInt(scanIndex);
        List<Pair<Double, Double>> mzIntPairs = new ArrayList<>(pointCount);

        // The points for this scan occupy [pointsStart, pointsEnd) in the concatenated arrays.
        int pointsStart = scanPointsStartArray.getInt(scanIndex);
        int pointsEnd = pointsStart + pointCount;
        for (int p = pointsStart; p < pointsEnd; p++) {
          // float -> double widening yields the same value as Float.doubleValue().
          Double mz = Double.valueOf(mzValues.getFloat(p));
          Double intensity = Double.valueOf(intensityValues.getFloat(p));
          mzIntPairs.add(Pair.of(mz, intensity));
        }

        LCMSSpectrum spectrum = new LCMSSpectrum(scanIndex, scanTimeArray.getDouble(scanIndex), "s", mzIntPairs,
            null, null, null, scanIndex, totalIntensityArray.getDouble(scanIndex));

        // Don't forget to advance the counter!
        this.scanIndex++;
        return spectrum;
      }
    };
  }

  /**
   * Print information about a specific Variable from a NetCDF file. Used for debugging.
   * @param name A human-readable name for this variable.
   * @param v The variable whose details to print.
   * @throws IOException If reading the variable's data fails.
   */
  public static void printVariableDetails(String name, Variable v) throws IOException {
    System.out.format("%s name and dimensions: %s\n", name, v.getNameAndDimensions());
    Array a = v.read();
    System.out.format(" rank: %d\n", a.getRank());
    System.out.format(" shape size: %d\n", a.getShape().length);
    System.out.format(" shape: %s\n", a.shapeToString());
    System.out.format(" array data type: %s\n", a.getDataType());
  }

  /**
   * Print top-level details about a NetCDF file. Used for debugging.
   * @param netcdfFile The NetCDF file whose details to print.
   */
  public static void printNetcdfFileDetails(NetcdfFile netcdfFile) {
    System.out.format("Details: %s\n", netcdfFile.getDetailInfo());
    System.out.format("File type description: %s\n", netcdfFile.getFileTypeDescription());
    System.out.format("Title: %s\n", netcdfFile.getTitle());
    System.out.println("Variables:");
    for (Variable v : netcdfFile.getVariables()) {
      System.out.format(" %s\n", v.getNameAndDimensions());
    }
    System.out.println("Global attributes:");
    for (Attribute a : netcdfFile.getGlobalAttributes()) {
      System.out.format(" %s: %s (%s)\n", a.getFullName(), a.getStringValue(), a.getDataType().toString());
    }
  }

  /**
   * Reads every scan in a NetCDF file into a list by draining {@link #getIterator(String)}.
   *
   * @param inputFile Path to the NetCDF file to read.
   * @return All spectra in the file, in scan order.
   * @throws IOException If the file cannot be opened, read, or closed, or if an expected variable is absent.
   * @throws ParserConfigurationException Declared for compatibility with the overridden method.
   * @throws XMLStreamException Declared for compatibility with the overridden method.
   */
  @Override
  public List<LCMSSpectrum> parse(String inputFile)
      throws ParserConfigurationException, IOException, XMLStreamException {
    List<LCMSSpectrum> spectra = new ArrayList<>();
    Iterator<LCMSSpectrum> iter = this.getIterator(inputFile);
    while (iter.hasNext()) {
      spectra.add(iter.next());
    }
    return spectra;
  }
}