/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.lcms.v2;
import com.act.utils.CLIUtil;
import com.act.utils.TSVWriter;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.atomic.LongAdder;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
public class MZCollisionCounter {
private static final Logger LOGGER = LogManager.getFormatterLogger(MZCollisionCounter.class);
private static final String OPTION_INPUT_INCHI_LIST = "i";
private static final String OPTION_OUTPUT_FILE = "o";
private static final String OPTION_COUNT_WINDOW_INTERSECTIONS = "w";
private static final String OPTION_WINDOW_HALFWIDTH = "s";
private static final String OPTION_ONLY_CONSIDER_IONS = "n";
private static final Double DEFAULT_WINDOW_TOLERANCE = 0.01;
private static final String HELP_MESSAGE = StringUtils.join(new String[]{
"This class calculates the monoisoptic masses of a list of InChIs, computes ion m/z's for those masses, ",
"and computes the distribution of m/z collisions over these m/z's, producing a histogram as its output."
}, "");
public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
add(Option.builder(OPTION_INPUT_INCHI_LIST)
.argName("input-file")
.desc("An input file containing just InChIs")
.hasArg()
.required()
.longOpt("input-file")
);
add(Option.builder(OPTION_OUTPUT_FILE)
.argName("output-file")
.desc("Write collision histogram data to an output file")
.hasArg()
.required()
.longOpt("output-file")
);
add(Option.builder(OPTION_COUNT_WINDOW_INTERSECTIONS)
.desc("Count intersections of +/-0.01 Dalton mass charge windows instead of counting exact collisions; " +
"counts the number of structures that might fall within each window and bins by count")
.longOpt("window-collisions")
);
add(Option.builder(OPTION_WINDOW_HALFWIDTH)
.desc(String.format("Sets the window half-width for collision counting in window mode, default is %.3f",
DEFAULT_WINDOW_TOLERANCE))
.longOpt("window-half-width")
);
add(Option.builder(OPTION_ONLY_CONSIDER_IONS)
.argName("ions")
.desc("Only consider these ions when computing mass/charges (comma separated list)")
.hasArgs()
.valueSeparator(',')
.longOpt("only-ions")
);
}};
public static void main(String[] args) throws Exception {
CLIUtil cliUtil = new CLIUtil(MassChargeCalculator.class, HELP_MESSAGE, OPTION_BUILDERS);
CommandLine cl = cliUtil.parseCommandLine(args);
File inputFile = new File(cl.getOptionValue(OPTION_INPUT_INCHI_LIST));
if (!inputFile.exists()) {
cliUtil.failWithMessage("Input file at does not exist at %s", inputFile.getAbsolutePath());
}
List<MassChargeCalculator.MZSource> sources = new ArrayList<>();
try (BufferedReader reader = new BufferedReader(new FileReader(inputFile))) {
String line;
while ((line = reader.readLine()) != null) {
line = line.trim();
sources.add(new MassChargeCalculator.MZSource(line));
if (sources.size() % 1000 == 0) {
LOGGER.info("Loaded %d sources from input file", sources.size());
}
}
}
Set<String> considerIons = Collections.emptySet();
if (cl.hasOption(OPTION_ONLY_CONSIDER_IONS)) {
List<String> ions = Arrays.asList(cl.getOptionValues(OPTION_ONLY_CONSIDER_IONS));
LOGGER.info("Only considering ions for m/z calculation: %s", StringUtils.join(ions, ", "));
considerIons = new HashSet<>(ions);
}
TSVWriter<String, Long> tsvWriter = new TSVWriter<>(Arrays.asList("collisions", "count"));
tsvWriter.open(new File(cl.getOptionValue(OPTION_OUTPUT_FILE)));
try {
LOGGER.info("Loaded %d sources in total from input file", sources.size());
MassChargeCalculator.MassChargeMap mzMap = MassChargeCalculator.makeMassChargeMap(sources, considerIons);
if (!cl.hasOption(OPTION_COUNT_WINDOW_INTERSECTIONS)) {
// Do an exact analysis of the m/z collisions if windowing is not specified.
LOGGER.info("Computing precise collision histogram.");
Iterable<Double> mzs = mzMap.ionMZIter();
Map<Integer, Long> collisionHistogram =
histogram(StreamSupport.stream(mzs.spliterator(), false).map(mz -> { // See comment about Iterable below.
try {
return mzMap.ionMZToMZSources(mz).size();
} catch (NoSuchElementException e) {
LOGGER.error("Caught no such element exception for mz %f: %s", mz, e.getMessage());
throw e;
}
}));
List<Integer> sortedCollisions = new ArrayList<>(collisionHistogram.keySet());
Collections.sort(sortedCollisions);
for (Integer collision : sortedCollisions) {
tsvWriter.append(new HashMap<String, Long>() {{
put("collisions", collision.longValue());
put("count", collisionHistogram.get(collision));
}});
}
} else {
/* After some deliberation (thanks Gil!), the windowed variant of this calculation counts the number of
* structures whose 0.01 Da m/z windows (for some set of ions) overlap with each other.
*
* For example, let's assume we have five total input structures, and are only searching for one ion. Let's
* also assume that three of those structures have m/z A and the remaining two have m/z B. The windows might
* look like this in the m/z domain:
* |----A----|
* |----B----|
* Because A represents three structures and overlaps with B, which represents two, we assign A a count of 5--
* this is the number of structures we believe could fall into the range of A given our current peak calling
* approach. Similarly, B is assigned a count of 5, as the possibility for collision/confusion is symmetric.
*
* Note that this is an over-approximation of collisions, as we could more precisely only consider intersections
* when the exact m/z of B falls within the window around A and vice versa. However, because we have observed
* cases where the MS sensor doesn't report structures at exactly the m/z we predict, we employ this weaker
* definition of intersection to give a slightly pessimistic view of what confusions might be possible. */
// Compute windows for every m/z. We don't care about the original mz values since we just want the count.
List<Double> mzs = mzMap.ionMZsSorted();
final Double windowHalfWidth;
if (cl.hasOption(OPTION_WINDOW_HALFWIDTH)) {
// Don't use get with default for this option, as we want the exact FP value of the default tolerance.
windowHalfWidth = Double.valueOf(cl.getOptionValue(OPTION_WINDOW_HALFWIDTH));
} else {
windowHalfWidth = DEFAULT_WINDOW_TOLERANCE;
}
/* Window = (lower bound, upper bound), counter of represented m/z's that collide with this window, and number
* of representative structures (which will be used in counting collisions). */
LinkedList<CollisionWindow> allWindows = new LinkedList<CollisionWindow>() {{
for (Double mz : mzs) {
// CPU for memory trade-off: don't re-compute the window bounds over and over and over and over and over.
try {
add(new CollisionWindow(mz, windowHalfWidth, mzMap.ionMZToMZSources(mz).size()));
} catch (NoSuchElementException e) {
LOGGER.error("Caught no such element exception for mz %f: %s", mz, e.getMessage());
throw e;
}
}
}};
// Sweep line time! The window ranges are the interesting points. We just accumulate overlap counts as we go.
LinkedList<CollisionWindow> workingSet = new LinkedList<>();
List<CollisionWindow> finished = new LinkedList<>();
while (allWindows.size() > 0) {
CollisionWindow thisWindow = allWindows.pop();
// Remove any windows from the working set that don't overlap with the next window.
while (workingSet.size() > 0 && workingSet.peekFirst().getMaxMZ() < thisWindow.getMinMZ()) {
finished.add(workingSet.pop());
}
for (CollisionWindow w : workingSet) {
/* Add the size of the new overlapping window's structure count to each of the windows in the working set,
* which represents the number of possible confused structures that fall within the overlapping region.
* We exclude the window itself as it should already have counted the colliding structures it represents. */
w.getAccumulator().add(thisWindow.getStructureCount());
/* Reciprocally, add the structure counts of all windows with which the current window overlaps to it. */
thisWindow.getAccumulator().add(w.getStructureCount());
}
// Now that accumulation is complete, we can safely add the current window.
workingSet.add(thisWindow);
}
// All the interesting events are done, so drop the remaining windows into the finished set.
finished.addAll(workingSet);
Map<Long, Long> collisionHistogram = histogram(finished.stream().map(w -> w.getAccumulator().longValue()));
List<Long> sortedCollisions = new ArrayList<>(collisionHistogram.keySet());
Collections.sort(sortedCollisions);
for (Long collision : sortedCollisions) {
tsvWriter.append(new HashMap<String, Long>() {{
put("collisions", collision);
put("count", collisionHistogram.get(collision));
}});
}
}
} finally {
if (tsvWriter != null) {
tsvWriter.close();
}
}
}
private static class CollisionWindow {
Double min;
Double max;
LongAdder accumulator = new LongAdder();
Integer structureCount;
public CollisionWindow(Double mz, Double windowHalfWidth, Integer structureCount) {
this.min = mz - windowHalfWidth;
this.max = mz + windowHalfWidth;
this.structureCount = structureCount;
// Set the base
this.accumulator.add(structureCount);
}
Double getMinMZ() {
return min;
}
Double getMaxMZ() {
return max;
}
LongAdder getAccumulator() {
return accumulator;
}
Integer getStructureCount() {
return structureCount;
}
}
public static <T> Map<T, Long> histogram(Stream<T> stream) {
Map<T, Long> hist = new HashMap<>();
// This could be done with reduce (fold) or collector cleverness, but this invocation makes the intention clear.
//stream.forEach(x -> hist.merge(x, 1l, (acc, one) -> one + acc));
stream.forEach(x -> {
try {
hist.put(x, hist.getOrDefault(x, 0l) + 1);
} catch (NoSuchElementException e) {
LOGGER.error("Caught no such element exception for %s: %s", x.toString(), e.getMessage());
throw e;
}
});
return hist;
}
}