/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.lcms;
import com.act.lcms.db.model.MS1ScanForWellAndMassCharge;
import com.act.lcms.plotter.WriteAndPlotMS1Results;
import org.apache.commons.lang3.tuple.Pair;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Arrays;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLStreamException;
public class MS1 {
public static class YZ {
Double mz;
Double intensity;
public YZ(Double mz, Double intensity) {
this.mz = mz;
this.intensity = intensity;
}
public Double getMZ() {
return mz;
}
public Double getIntensity() {
return intensity;
}
}
// In the MS1 case, we look for a very tight window
// because we do not noise to broaden our signal
public static final Double MS1_MZ_TOLERANCE_FINE = 0.001;
public static final Double MS1_MZ_TOLERANCE_COARSE = 0.01;
public static final Double MS1_MZ_TOLERANCE_DEFAULT = MS1_MZ_TOLERANCE_COARSE;
private static final Logger LOGGER = LogManager.getFormatterLogger(MS1.class);
// when aggregating the MS1 signal, we do not expect
// more than these number of measurements within the
// mz window specified by the tolerance above
static final Integer MAX_MZ_IN_WINDOW_FINE = 3;
static final Integer MAX_MZ_IN_WINDOW_COARSE = 4;
static final Integer MAX_MZ_IN_WINDOW_DEFAULT = MAX_MZ_IN_WINDOW_FINE;
// when using max intensity, the threshold is 20% of max
static final Double THRESHOLD_PERCENT = 0.20d;
// when using SNR, the threshold is 60 for snr x max_intensity/1M;
static final boolean USE_SNR_FOR_THRESHOLD_DEFAULT = true;
// we experimented and observed the following:
// At logSNR > 10.0d: only VERY strong peaks are isolated, e.g., those above 10^6 straight up
// At logSNR > 5.0d : a lot of junky smears appear
// At logSNR > 6.0d : Midway point that is lenient in allowing for peaks; while eliminating full smears
// At logSNR > 3.5d : Since we are using the more thorough standard ion analysis to compare peaks, this,
// threshold was good to compare positive vs negative control peaks.
static final Double LOGSNR_THRESHOLD = 3.5d;
// because SNR is sigma(signal)^2/sigma(ambient)^2, when ambient is close to 0 (which could happen
// because we are not looking at statistics across multiple runs, but in the same spectra), we get
// very high SNR. Therefore, we want to eliminate spectra where less than a 1000 ions are detected
static final Double NONTRIVIAL_SIGNAL = 1e3d;
// good clean signals show within about +-3-5 seconds of the peak.
static final Double PEAK_WIDTH = 10.0d; // seconds
private Double mzTolerance = MS1_MZ_TOLERANCE_DEFAULT;
private Integer maxDetectionsInWindow = MAX_MZ_IN_WINDOW_DEFAULT;
private boolean useSNRForThreshold = USE_SNR_FOR_THRESHOLD_DEFAULT;
public MS1() { }
public MS1(boolean useFineGrainedMZTolerance, boolean useSNR) {
mzTolerance = useFineGrainedMZTolerance ? MS1_MZ_TOLERANCE_FINE : MS1_MZ_TOLERANCE_COARSE;
maxDetectionsInWindow = useFineGrainedMZTolerance ? MAX_MZ_IN_WINDOW_FINE : MAX_MZ_IN_WINDOW_COARSE;
useSNRForThreshold = useSNR;
}
public double extractMZ(double mzWanted, List<Pair<Double, Double>> intensities) {
double intensityFound = 0;
int numWithinPrecision = 0;
double mzLowRange = mzWanted - this.mzTolerance;
double mzHighRange = mzWanted + this.mzTolerance;
// we expect there to be pretty much only one intensity value in the precision
// range we are looking at. But if a lot of masses show up then complain
for (Pair<Double, Double> mz_int : intensities) {
double mz = mz_int.getLeft();
double intensity = mz_int.getRight();
if (mz >= mzLowRange && mz <= mzHighRange) {
intensityFound += intensity;
numWithinPrecision++;
}
}
if (numWithinPrecision > maxDetectionsInWindow) {
LOGGER.warn("Only expected %d, but found %d in the mz range [%f, %f]",
maxDetectionsInWindow, numWithinPrecision, mzLowRange, mzHighRange);
}
return intensityFound;
}
private List<YZ> toYZSpectra(List<Pair<Double, Double>> intensities) {
List<YZ> scan = new ArrayList<>();
for (Pair<Double, Double> detected : intensities) {
// L is mz
// R is intensity
scan.add(new YZ(detected.getLeft(), detected.getRight()));
}
return scan;
}
private TIC_MzAtMax getTIC(String ms1File) throws Exception {
Iterator<LCMSSpectrum> ms1Iter = new LCMSNetCDFParser().getIterator(ms1File);
Double maxTI = null;
List<YZ> scanAtMax = null;
List<XZ> tic = new ArrayList<>();
while (ms1Iter.hasNext()) {
LCMSSpectrum timepoint = ms1Iter.next();
// what is the total intensity (across all mz) at this timepoint?
Double ti = timepoint.getTotalIntensity();
// add the data point to the TIC chromatogram
tic.add(new XZ(timepoint.getTimeVal(), ti));
// get all (mz, intensity) at this timepoint
List<Pair<Double, Double>> intensities = timepoint.getIntensities();
// update the max total intensity if it is
if (maxTI == null || maxTI < ti) {
maxTI = ti;
scanAtMax = toYZSpectra(intensities);
}
}
TIC_MzAtMax chrom = new TIC_MzAtMax();
chrom.tic = tic;
chrom.mzScanAtMaxIntensity = scanAtMax;
return chrom;
}
public MS1ScanForWellAndMassCharge getMS1(Map<String, Double> metlinMasses, String ms1File)
throws Exception {
return getMS1(metlinMasses, new LCMSNetCDFParser().getIterator(ms1File));
}
private MS1ScanForWellAndMassCharge getMS1(
Map<String, Double> metlinMasses, Iterator<LCMSSpectrum> ms1File) {
// create the map with placeholder empty lists for each ion
// we will populate this later when we go through each timepoint
MS1ScanForWellAndMassCharge scanResults = new MS1ScanForWellAndMassCharge();
for (String ionDesc : metlinMasses.keySet()) {
List<XZ> ms1 = new ArrayList<>();
scanResults.getIonsToSpectra().put(ionDesc, ms1);
}
while (ms1File.hasNext()) {
LCMSSpectrum timepoint = ms1File.next();
// get all (mz, intensity) at this timepoint
List<Pair<Double, Double>> intensities = timepoint.getIntensities();
// for this timepoint, extract each of the ion masses from the METLIN set
for (Map.Entry<String, Double> metlinMass : metlinMasses.entrySet()) {
String ionDesc = metlinMass.getKey();
Double ionMz = metlinMass.getValue();
// this time point is valid to look at if its max intensity is around
// the mass we care about. So lets first get the max peak location
double intensityForMz = extractMZ(ionMz, intensities);
// the above is Pair(mz_extracted, intensity), where mz_extracted = mz
// we now add the timepoint val and the intensity to the output
XZ intensityAtThisTime = new XZ(timepoint.getTimeVal(), intensityForMz);
scanResults.getIonsToSpectra().get(ionDesc).add(intensityAtThisTime);
}
}
// populate statistics about the curve for each ion curve
for (String ionDesc : metlinMasses.keySet()) {
computeAndStorePeakProfile(scanResults, ionDesc);
}
// set the yaxis max: if intensity used as filtering function then intensity max, else snr max
Double globalYAxis = 0.0d;
Map<String, Double> individualYMax = new HashMap<String, Double>();
for (String ionDesc : metlinMasses.keySet()) {
if (useSNRForThreshold && !isGoodPeak(scanResults, ionDesc)) {
// if we are using SNR for thresholding scans (see code in writeMS1Values)
// then for scans that have low SNR, their max is irrelevant. So it might
// be the case that the max is 100k; but SNR is 1.5, which case it will be
// a lower value signal compared to a max of 10k, and SNR 20.
continue;
}
// this scan will be included in the plots, so its max should be taken into account
Double maxInt = scanResults.getMaxIntensityForIon(ionDesc);
individualYMax.put(ionDesc, maxInt);
globalYAxis = globalYAxis == 0.0d ? maxInt : Math.max(maxInt, globalYAxis);
}
scanResults.setMaxYAxis(globalYAxis);
scanResults.setIndividualMaxIntensities(individualYMax);
return scanResults;
}
/* DO NOT change this function while `getMS1` depends on it. This approach has been thorougly manually vetted; any
* changes will eliminate our ability to compare optimized results to a consistent baseline. */
public void computeAndStorePeakProfile(MS1ScanForWellAndMassCharge scanResults, String ionDesc) {
List<XZ> curve = scanResults.getIonsToSpectra().get(ionDesc);
Integer sz = curve.size();
Double maxIntensity = null;
Double maxIntensityTime = null;
for (XZ signal : curve) {
Double intensity = signal.getIntensity();
if (maxIntensity == null) {
maxIntensity = intensity; maxIntensityTime = signal.getTime();
} else {
if (maxIntensity < intensity) {
maxIntensity = intensity; maxIntensityTime = signal.getTime();
}
}
}
// split the curve into parts that are the signal (+- from peak), and ambient
List<XZ> signalIntensities = new ArrayList<>();
List<XZ> ambientIntensities = new ArrayList<>();
for (XZ measured : curve) {
if (measured.getTime() > maxIntensityTime - PEAK_WIDTH/2.0d && measured.getTime() < maxIntensityTime + PEAK_WIDTH/2.0d) {
signalIntensities.add(measured);
} else {
ambientIntensities.add(measured);
}
}
Double avgIntensitySignal = average(signalIntensities);
Double avgIntensityAmbient = average(ambientIntensities);
Double logSNR = -100.0d, snr = null;
// only set SNR to real value if the signal is non-trivial
if (maxIntensity > NONTRIVIAL_SIGNAL) {
// snr = sigma_signal^2 / sigma_ambient^2
// where sigma = sqrt(E[(X-mean)^2])
// The above equation is a simplistic attempt at evaluating the power around the max_peak compared to the rest of
// the spectra. So we chop out the window into a signal, calculate its deviation from the full_mean
// (hopefully close to 0) and compare that against the deviation of the rest of the spectra from (again) the
// full_mean.
Double mean = avgIntensityAmbient;
snr = sigmaSquared(signalIntensities, mean) / sigmaSquared(ambientIntensities, mean);
logSNR = Math.log(snr);
}
scanResults.setIntegralForIon(ionDesc, getAreaUnder(curve));
scanResults.setMaxIntensityForIon(ionDesc, maxIntensity);
scanResults.setAvgIntensityForIon(ionDesc, avgIntensitySignal, avgIntensityAmbient);
scanResults.setLogSNRForIon(ionDesc, logSNR);
if (logSNR > -100.0d) {
LOGGER.info("%10s: logSNR: %5.1f Max: %7.0f SignalAvg: %7.0f Ambient Avg: %7.0f %s\n", ionDesc, logSNR, maxIntensity, avgIntensitySignal, avgIntensityAmbient, isGoodPeak(scanResults, ionDesc) ? "INCLUDED" : "");
}
}
boolean isGoodPeak(MS1ScanForWellAndMassCharge scans, String ion) {
Double logSNR = scans.getLogSNRForIon(ion);
return logSNR > LOGSNR_THRESHOLD;
}
public enum IonMode { POS, NEG };
public static class MetlinIonMass {
// colums in each row from METLIN data, as seen here:
// https://metlin.scripps.edu/mz_calc.php?mass=300.120902994
private IonMode mode; // POS or NEG
private String name; // M+H, M+K, etc
private Integer charge;
private Double mz;
MetlinIonMass(IonMode mode, String name, Integer charge, Double mz) {
this.mode = mode; this.name = name; this.charge = charge; this.mz = mz;
}
public IonMode getMode() {
return mode;
}
public String getName() {
return name;
}
}
// TODO: create an ion Enum rather than having to access this list by String name
public static final MetlinIonMass[] ionDeltas = new MetlinIonMass[] {
new MetlinIonMass(IonMode.POS, "M+H-2H2O", 1, 35.0128),
new MetlinIonMass(IonMode.POS, "M+H-H2O", 1, 17.0028),
new MetlinIonMass(IonMode.POS, "M-H", 1, 1.0073),
new MetlinIonMass(IonMode.POS, "M-H2O+NH4", 1, -0.0227),
new MetlinIonMass(IonMode.POS, "M+H", 1, -1.0073),
new MetlinIonMass(IonMode.POS, "M+Li", 1, -7.0160),
new MetlinIonMass(IonMode.POS, "M+NH4", 1, -18.0338),
new MetlinIonMass(IonMode.POS, "M+Na", 1, -22.9892),
new MetlinIonMass(IonMode.POS, "M+CH3OH+H", 1, -33.0335),
new MetlinIonMass(IonMode.POS, "M+K", 1, -38.9631),
new MetlinIonMass(IonMode.POS, "M+ACN+H", 1, -42.0338),
new MetlinIonMass(IonMode.POS, "M+2Na-H", 1, -44.9711),
new MetlinIonMass(IonMode.POS, "M+ACN+Na", 1, -64.0157),
new MetlinIonMass(IonMode.POS, "M+2H", 2, -1.0073),
new MetlinIonMass(IonMode.POS, "M+H+Na", 2, -11.9982),
new MetlinIonMass(IonMode.POS, "M+2Na", 2, -22.9892),
new MetlinIonMass(IonMode.POS, "M+3H", 3, -1.0072),
new MetlinIonMass(IonMode.POS, "M+2H+Na", 3, -8.3346),
new MetlinIonMass(IonMode.POS, "M+2Na+H", 3, -15.6619),
new MetlinIonMass(IonMode.NEG, "M-H2O-H", 1, 19.0184),
new MetlinIonMass(IonMode.NEG, "M-H", 1, 1.0073),
new MetlinIonMass(IonMode.NEG, "M+F", 1, -18.9984),
new MetlinIonMass(IonMode.NEG, "M+Na-2H", 1, -20.9746),
new MetlinIonMass(IonMode.NEG, "M+Cl", 1, -34.9694),
new MetlinIonMass(IonMode.NEG, "M+K-2H", 1, -36.9486),
new MetlinIonMass(IonMode.NEG, "M+FA-H", 1, -44.9982),
new MetlinIonMass(IonMode.NEG, "M+CH3COO", 1, -59.0138),
new MetlinIonMass(IonMode.NEG, "M-2H", 2, 1.0073),
new MetlinIonMass(IonMode.NEG, "M-3H", 3, 1.0073),
};
public static final Set<String> VALID_MS1_IONS;
static {
HashSet<String> names = new HashSet<>(ionDeltas.length);
for (MetlinIonMass mim : ionDeltas) {
names.add(mim.name);
}
VALID_MS1_IONS = Collections.unmodifiableSet(names);
}
public static Double computeIonMz(Double mz, MetlinIonMass delta) {
// this delta specifies how to calculate the ionMz; except we need
// to take care of the charge this ion acquires/looses
Double ionMz = mz/delta.charge - delta.mz;
return ionMz;
}
public static Double computeMassFromIonMz(Double ionMz, MetlinIonMass delta) {
// this reverses the computation of `computeIonMz`
Double mass = (ionMz + delta.mz) * delta.charge;
return mass;
}
/**
* This function takes a mass charge and mode as input and returns a list of metlin ion masses
* @param mz Mass charge
* @param ionMode Ion mode to search from
* @return A list of metlin ion masses
* @throws IOException
*/
private static List<MetlinIonMass> queryMetlin(Double mz, IonMode ionMode) throws IOException {
List<MetlinIonMass> rows = new ArrayList<>();
for (MetlinIonMass delta : ionDeltas) {
// only pick ions that are the right mode, else skip
if (delta.mode != ionMode)
continue;
rows.add(new MetlinIonMass(delta.mode, delta.name, delta.charge, computeIonMz(mz, delta)));
}
return rows;
}
/**
* This function takes as input a mass charge and ionmode and outputs a map of ion names to mass charge
* @param mz Mass charge
* @param ionMode Ion mode
* @return A map of ion names to their mass charge
* @throws IOException
*/
public static Map<String, Double> getIonMasses(Double mz, IonMode ionMode) throws IOException {
List<MetlinIonMass> rows = queryMetlin(mz, ionMode);
Map<String, Double> ionMasses = new HashMap<>();
for (MetlinIonMass metlinMass : rows) {
ionMasses.put(metlinMass.name, metlinMass.mz);
}
return ionMasses;
}
/**
* This function returns the ion mode of a given ion. There is a weird case were M-H is both in the positive
* and negative ion mode, even if it is a negative ion, so bias towards positive (since our results are more
* accurate in that mode) for such cases.
* @param ion The specific query ion
* @return The ionMode of the query ion
*/
public static IonMode getIonModeOfIon(String ion) {
if (ion.equals("M-H")) {
return IonMode.POS;
}
for (MetlinIonMass mass : ionDeltas) {
if (mass.getName().equals(ion)) {
return mass.getMode();
}
}
return null;
}
private static boolean areNCFiles(String[] fnames) {
for (String n : fnames) {
LOGGER.debug(".nc file = %s", n);
if (!n.endsWith(".nc"))
return false;
}
return true;
}
private boolean lowSignalInEntireSpectrum(List<XZ> ms1, Double threshold) {
XZ maxSignal = null;
for (XZ xz : ms1) {
if (maxSignal == null || xz.getIntensity() > maxSignal.getIntensity()) {
maxSignal = xz;
}
}
// check if the max is below the threshold
return maxSignal.getIntensity() < threshold;
}
class TIC_MzAtMax {
List<XZ> tic;
List<YZ> mzScanAtMaxIntensity;
}
public double getAreaUnder(List<XZ> curve) {
Double timePrev = curve.get(0).getTime();
Double areaTotal = 0.0d;
for (XZ curveVal : curve) {
Double height = curveVal.getIntensity();
Double timeDelta = curveVal.getTime() - timePrev;
// compute the area occupied by the slice
Double areaDelta = height * timeDelta;
// add this slice to the area under the curve
areaTotal += areaDelta;
// move the left boundary of the time slice forward
timePrev = curveVal.getTime();
}
return areaTotal;
}
public double average(List<XZ> curve) {
Double avg = 0.0d;
int sz = curve.size();
for (XZ curveVal : curve) {
Double intensity = curveVal.getIntensity();
avg += intensity / sz;
}
return avg;
}
private double sigmaSquared(List<XZ> curve, Double mean) {
// computes sigma squared, i.e., E[(X-mean)^2]
Double exp = 0.0d;
int sz = curve.size();
for (XZ curveVal : curve) {
Double X = curveVal.getIntensity();
Double devSqrd = (X - mean) * (X - mean);
exp += devSqrd / sz;
}
return exp;
}
private enum PlotModule { RAW_SPECTRA, TIC, FEEDINGS };
public static void main(String[] args) throws Exception {
if (args.length < 8 || !areNCFiles(Arrays.copyOfRange(args, 7, args.length))) {
throw new RuntimeException("Needs: \n" +
"(1) mz for main product, e.g., 431.1341983 (ononin) \n" +
"(2) ion mode = POS OR NEG \n" +
"(3) ion = M+H or M+Na etc \n" +
"(4) prefix for .data and rendered .pdf \n" +
"(5) {heatmap, default=no heatmap, i.e., 2d} \n" +
"(6) {overlay, default=separate plots} \n" +
"(7) {" + StringUtils.join(PlotModule.values(), ", ") + "} \n" +
"(8,9..) NetCDF .nc file 01.nc from MS1 run \n"
);
}
String fmt = "pdf";
Double mz = Double.parseDouble(args[0]);
IonMode ionMode = IonMode.valueOf(args[1]);
String ion = args[2];
String outPrefix = args[3];
boolean makeHeatmap = args[4].equals("heatmap");
boolean overlayPlots = args[5].equals("overlay");
PlotModule module = PlotModule.valueOf(args[6]);
String[] ms1Files = Arrays.copyOfRange(args, 7, args.length);
boolean plotsHaveIndependentYAxis = true;
WriteAndPlotMS1Results plottingUtil = new WriteAndPlotMS1Results();
MS1 c = new MS1();
Map<String, Double> metlinMasses = c.getIonMasses(mz, ionMode);
MS1ScanForWellAndMassCharge ms1ScanResults;
Map<String, List<XZ>> ms1s;
switch (module) {
case RAW_SPECTRA:
for (String ms1File : ms1Files) {
ms1ScanResults = c.getMS1(metlinMasses, ms1File);
Double maxYAxis = ms1ScanResults.getMaxYAxis();
Map<String, Double> individualMaxIntensities = null;
// if we wish each plot to have an independent y-range (to show internal structure, as opposed
// to compare across different spectra), then we set the individual maxes
if (plotsHaveIndependentYAxis) {
individualMaxIntensities = ms1ScanResults.getIndividualMaxYAxis();
}
plottingUtil.plotSpectra(ms1ScanResults.getIonsToSpectra(), maxYAxis, individualMaxIntensities, metlinMasses,
outPrefix, fmt, makeHeatmap, overlayPlots);
}
break;
case FEEDINGS:
// for now we assume the concentrations are in log ramped up
Double[] concVals = new Double[] { 0.0001d, 0.001d, 0.01d, 0.025d, 0.05d, 0.1d };
int concIdx = 0;
List<Pair<Double, MS1ScanForWellAndMassCharge>> rampUp = new ArrayList<>();
for (String ms1File : ms1Files) {
ms1ScanResults = c.getMS1(metlinMasses, ms1File);
// until we read from the db, artificial values
Double concentration = concIdx < concVals.length ? concVals[concIdx] : concVals[concVals.length - 1] * 10 * (concIdx - concVals.length + 1);
concIdx++;
LOGGER.info("Well %s label concentration: %e", ms1File, concentration);
rampUp.add(Pair.of(concentration, ms1ScanResults));
}
plottingUtil.plotFeedings(rampUp, ion, outPrefix, fmt, outPrefix + ".gnuplot");
break;
case TIC:
for (String ms1File : ms1Files) {
// get and plot Total Ion Chromatogram
TIC_MzAtMax totalChrom = c.getTIC(ms1File);
plottingUtil.plotTIC(totalChrom.tic, outPrefix + ".TIC", fmt);
plottingUtil.plotScan(totalChrom.mzScanAtMaxIntensity, outPrefix + ".MaxTICScan", fmt);
}
break;
}
}
}