/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.lcms.v2;
import com.act.lcms.MS1;
import com.act.lcms.MassCalculator;
import com.act.lcms.db.io.DB;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
public class MassChargeCalculator {
private static final Logger LOGGER = LogManager.getFormatterLogger(MassChargeCalculator.class);
/**
* An MZSource is a handle to the result of ionic mass/charge computation. Once m/z computation has been completed
* for a given set of mass sources (like InChIs, arbitrary numeric values, and the results of the very convenient
* Utils.extractMassFromString), the resulting MZSource object can be used to access all of the m/z values
* computed from that source of a monoisotopic mass. In turn, any peaks associated with a particular m/z can be
* mapped back to the source from which the search window in which the peak was found was originally derived.
*/
@JsonInclude(JsonInclude.Include.NON_NULL) // Hide unused fields since this is effectively a tagged union/sum type.
public static class MZSource implements Serializable {
private static final long serialVersionUID = -6359715907934400901L;
@JsonIgnore
private static transient AtomicInteger instanceCounter = new AtomicInteger(0);
private enum KIND {
INCHI, // An InChI whose mass to import.
ARBITRARY_MASS, // Some monoisotopic mass specified without a source molecule.
PRE_EXTRACTED_MASS, // The output of Utils.extractMassFromString, which is very handy.
UNKNOWN, // For deserialization.
;
}
@JsonProperty(value = "kind", required = true)
KIND kind = KIND.UNKNOWN;
@JsonProperty(value = "inchi")
String inchi;
@JsonProperty(value = "arbitrary_mass")
Double arbitraryMass;
// Don't use Pair here, as Jackson 2.6 chokes on it.
@JsonProperty(value = "pre_extracted_label")
String preExtractedLabel;
@JsonProperty(value = "pre_extracted_mass")
Double preExtractedMass;
/* Note: this ID is for logging/convenience only. It is not intended as a durable or consistent pointer to a
* particular MZSource. The caller is responsible for understanding the true source/meaning of an MZSource
* object; by itself, it is just a handle to some computation. */
@JsonProperty(value = "id", required = true)
@JsonDeserialize(using = MZSourceIDDeserializer.class)
Integer objectId;
/**
* Construct a source based on an InChI. Monoisotopic mass will automatically be calculated using MassCalculator.
* @param inchi The inchi whose mass to use.
*/
public MZSource(String inchi) {
this.kind = KIND.INCHI;
this.inchi = inchi;
setId();
}
/**
* Construct a source based on an arbitrary monoisotopic mass. Do not use average mass for this value.
* @param arbitraryMass Some monoisotopic mass to use as a source.
*/
public MZSource(Double arbitraryMass) {
this.kind = KIND.ARBITRARY_MASS;
this.arbitraryMass = arbitraryMass;
setId();
}
/**
* Construct a source based on the result of {@link com.act.lcms.db.analysis.Utils#extractMassFromString}.
* @param extractMassFromStringResult The results of {@link com.act.lcms.db.analysis.Utils#extractMassFromString}.
*/
public MZSource(Pair<String, Double> extractMassFromStringResult) {
this.kind = KIND.PRE_EXTRACTED_MASS;
this.preExtractedLabel = extractMassFromStringResult.getLeft();
this.preExtractedMass = extractMassFromStringResult.getRight();
setId();
}
private MZSource() { // For de/serialization.
}
private void setId() {
objectId = instanceCounter.getAndIncrement();
}
Integer getId() {
return objectId;
}
// No protection specification here = package private.
KIND getKind() {
return kind;
}
String getInchi() {
return inchi;
}
Double getArbitraryMass() {
return arbitraryMass;
}
Pair<String, Double> getPreExtractedMass() {
return Pair.of(preExtractedLabel, preExtractedMass);
}
/* Note: these methods are more complicated than they need be. Specifically, since objectId should be unique, it's
* really the only thing we should need to determine if two MZSource objects are equivalent. However, given the
* complexities of serialization, testing, etc., comprehensive implementations of these methods seem like a good
* idea for now.
*/
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
MZSource mzSource = (MZSource) o;
if (kind != mzSource.kind) return false;
if (inchi != null ? !inchi.equals(mzSource.inchi) : mzSource.inchi != null) return false;
if (arbitraryMass != null ? !arbitraryMass.equals(mzSource.arbitraryMass) : mzSource.arbitraryMass != null)
return false;
if (preExtractedMass != null ? !preExtractedMass.equals(mzSource.preExtractedMass) : mzSource.preExtractedMass != null)
return false;
return objectId.equals(mzSource.objectId);
}
@Override
public int hashCode() {
int result = kind.hashCode();
result = 31 * result + (inchi != null ? inchi.hashCode() : 0);
result = 31 * result + (arbitraryMass != null ? arbitraryMass.hashCode() : 0);
result = 31 * result + (preExtractedMass != null ? preExtractedMass.hashCode() : 0);
result = 31 * result + objectId.hashCode();
return result;
}
public static class MZSourceIDDeserializer extends JsonDeserializer<Integer> {
@Override
public Integer deserialize(JsonParser p, DeserializationContext ctxt)
throws IOException, JsonProcessingException {
final int thisId = p.getIntValue();
/* Set the instance counter to the current document's id + 1 if it isn't already higher. This will (weakly)
* ensure that any newly created documents don't collide with whatever we're reading in.
*
* TODO: these ids are awful, and are certainly something I expect to regret implementing. We should use
* something better, make ids cleanly transient, or try to get away from ids at all.
*/
instanceCounter.getAndUpdate(x -> Integer.max(x, thisId + 1));
return thisId;
}
}
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException {
in.defaultReadObject();
// See comment in JSON id serializer re: why we need special id handling.
instanceCounter.getAndUpdate(x -> Integer.max(x, getId() + 1));
}
}
/**
* A MassChargeMap is built from a list of MZSources, and stores the monoisotopic and ionic mass variants for each
* of those sources.
*/
public static class MassChargeMap implements Serializable {
private static final long serialVersionUID = -332580987490663497L;
// Tracks the monoisotopic mass for a given source, which can then be used to find collisions in the reverse map.
Map<MZSource, Double> monoisotopicMasses;
// Maps each monoisotopic mass to any colliding sources.
Map<Double, List<MZSource>> reverseMonoisotopicMasses = new HashMap<>();
// For a given ionic mass, it is a <ION_TYPE> for mass <MONOISOTOPIC MASS>, e.g., M+H for
Map<Double, List<Pair<String, Double>>> reverseIonicMasses = new HashMap<>();
// Package-private again.
MassChargeMap() {
}
public List<MZSource> monoMassToMZSources(Double monoisotopicMass) {
return Collections.unmodifiableList(reverseMonoisotopicMasses.get(monoisotopicMass));
}
public List<Pair<String, Double>> ionMZtoMonoMassAndIonName(Double mz) {
return Collections.unmodifiableList(reverseIonicMasses.get(mz));
}
public Set<MZSource> ionMZToMZSources(Double mz) {
List<Pair<String, Double>> reverseIons = reverseIonicMasses.get(mz);
if (reverseIons == null) {
return Collections.emptySet();
}
return Collections.unmodifiableSet(
reverseIons.stream().
map(p -> reverseMonoisotopicMasses.get(p.getRight())).
flatMap(List::stream). // Flattens stream of List<MZSource> into single stream of MZSource.
collect(Collectors.toSet())
);
}
/* Use Iterable -> Stream instead of Iterator, as Iterator doesn't play nicely with streams on its own.
* Iterator + Stream.generate = java.util.NoSuchElementException = :(
* http://stackoverflow.com/questions/24511052/how-to-convert-an-iterator-to-a-stream */
public Iterable<MZSource> mzSourceIter() {
return () -> monoisotopicMasses.keySet().iterator();
}
public Iterable<Double> monoisotopicMassIter() {
return () -> reverseMonoisotopicMasses.keySet().iterator();
}
public Iterable<Double> ionMZIter() {
return () -> reverseIonicMasses.keySet().iterator();
}
// Package private--this one can eat a whole lot of memory, so we prefer the streaming approaches.
List<Double> ionMZsSorted() {
List<Double> results = new ArrayList<>(reverseIonicMasses.keySet());
Collections.sort(results);
// The keySet has already unique'd all the values, so all we need to do is sort 'em and return 'em.
return Collections.unmodifiableList(results); // Make the list unmodifiable for safety's sake (we own the masses).
}
// Package private again.
void loadSourcesAndMasses(List<Pair<MZSource, Double>> sourcesAndMasses, Set<String> onlyConsiderIons)
throws IOException {
monoisotopicMasses = new LinkedHashMap<>(sourcesAndMasses.size()); // Preserve order for easier testing.
LOGGER.info("Converting %d m/z sources into ionic masses and resolving collisions", sourcesAndMasses.size());
int sourceCounter = 0;
int ionCounter = 0;
for (Pair<MZSource, Double> sourceAndMass : sourcesAndMasses) {
MZSource source = sourceAndMass.getLeft();
Double monoMass = sourceAndMass.getRight();
monoisotopicMasses.put(source, monoMass);
sourceCounter++;
if (sourceCounter % 1000 == 0) {
LOGGER.info("Resolved %d sources, handled %d ion m/z's in total", sourceCounter, ionCounter);
}
if (reverseMonoisotopicMasses.containsKey(monoMass)) {
reverseMonoisotopicMasses.get(monoMass).add(source);
/* And we're done! We've already computed the ions for this monoisotopic mass, so we've just squashed a
* duplicate. The source -> results correspondence will come out when we map the hits over the ionic masses,
* then tie those back to their corresponding MZSources. */
continue;
}
List<MZSource> sourcesWithThisMass = new ArrayList<MZSource>(1) {{
add(source);
}};
reverseMonoisotopicMasses.put(monoMass, sourcesWithThisMass);
Map<String, Double> ions = MS1.getIonMasses(monoMass, MS1.IonMode.POS);
// Filter to only the specified ions if the onlyConsider set is non-empty; allow all ions by default.
if (onlyConsiderIons.size() != 0) {
ions = ions.entrySet().stream().
filter(e -> onlyConsiderIons.contains(e.getKey())).
collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
}
for (Map.Entry<String, Double> ion : ions.entrySet()) {
List<Pair<String, Double>> reverseMapping = reverseIonicMasses.get(ion.getValue());
if (reverseMapping == null) {
reverseMapping = new ArrayList<>(1); // Assume few collisions.
reverseIonicMasses.put(ion.getValue(), reverseMapping);
}
reverseMapping.add(Pair.of(ion.getKey(), monoMass));
ionCounter++;
}
}
LOGGER.info("Done resolving %d sources, found %d ion m/z's in total", sourceCounter, ionCounter);
}
/**
* Accumulates all of the MZSources associated with this ion and the ion name per each source for a given ion m/z.
* @param ionicMassCharge The ion's m/z for which to search.
* @return A list of each corresponding MZSource and the ionic variant the specified m/z represents for this source.
* Returns an empty list if no matching sources are found.
*/
public List<Pair<MZSource, String>> mapIonMZToSources(Double ionicMassCharge) {
List<Pair<String, Double>> reverseMapping = reverseIonicMasses.get(ionicMassCharge);
if (reverseMapping == null) {
return Collections.emptyList();
}
// Flattening this list with the stream API is a bit clumsy, so just loop over all the reverse mappings.
List<Pair<MZSource, String>> results = new ArrayList<>();
for (Pair<String, Double> ionNameAndMonoMass : reverseMapping) {
List<MZSource> mzSources = reverseMonoisotopicMasses.get(ionNameAndMonoMass.getRight());
for (MZSource source : mzSources) {
results.add(Pair.of(source, ionNameAndMonoMass.getLeft()));
}
}
return results;
}
}
public static MassChargeMap makeMassChargeMap(List<MZSource> mzSources) throws IOException {
return makeMassChargeMap(mzSources, Collections.emptySet());
}
public static MassChargeMap makeMassChargeMap(List<MZSource> mzSources, Set<String> onlyConsiderIons)
throws IOException {
MassChargeMap map = new MassChargeMap();
/* Map over the sources, extracting or computing the mass as we go to encapsulate any unsafe behavior outside the
* MassChargeMap constructor. */
List<Pair<MZSource, Double>> sourcesAndMasses = new ArrayList<>();
for (MZSource source : mzSources) {
try {
sourcesAndMasses.add(Pair.of(source, computeMass(source)));
} catch (Exception e) {
LOGGER.error("MZSource %d threw an error during mass calculation, skipping", source.getId());
}
}
LOGGER.info("Consumed %d mzSources, sourcesAndMasses has %d entries", mzSources.size(), sourcesAndMasses.size());
map.loadSourcesAndMasses(sourcesAndMasses, onlyConsiderIons);
return map;
}
protected static Double computeMass(MZSource mzSource) {
Double mass;
String msg;
switch (mzSource.getKind()) {
case INCHI:
Pair<Double, Integer> massAndCharge;
try {
massAndCharge = MassCalculator.calculateMassAndCharge(mzSource.getInchi());
} catch (Exception e) {
LOGGER.error("Calculating mass for molecule %s failed: %s", mzSource.getInchi(), e.getMessage());
throw e;
}
if (massAndCharge.getRight() > 0) {
LOGGER.warn("(MZSrc %d) Molecule %s has a positive charge %d; ionization may not have the expected effect",
mzSource.getId(), mzSource.getInchi(), massAndCharge.getRight());
} else if (massAndCharge.getRight() < 0) {
LOGGER.warn("(MZSrc %d) Molecule %s has a negative charge %d; it may not fly in positive ion mode",
mzSource.getId(), mzSource.getInchi(), massAndCharge.getRight());
}
mass = massAndCharge.getLeft();
break;
case ARBITRARY_MASS:
mass = mzSource.getArbitraryMass();
break;
case PRE_EXTRACTED_MASS:
mass = mzSource.getPreExtractedMass().getRight();
break;
case UNKNOWN:
msg = String.format("mzSource with id %d has UNKNOWN kind, which is invalid", mzSource.getId());
LOGGER.error(msg);
throw new RuntimeException(msg);
default:
msg = String.format("mzSource with id %d has unknown kind (should be impossible): %s",
mzSource.getId(), mzSource.getKind());
LOGGER.error(msg);
throw new RuntimeException(msg);
}
return mass;
}
}