/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.lcms.v2.fullindex;
import com.act.lcms.LCMSNetCDFParser;
import com.act.lcms.LCMSSpectrum;
import com.act.lcms.MS1;
import com.act.utils.CLIUtil;
import com.act.utils.rocksdb.DBUtil;
import com.act.utils.rocksdb.RocksDBAndHandles;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.joda.time.DateTime;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.WriteOptions;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLStreamException;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
public class Builder {
private static final Logger LOGGER = LogManager.getFormatterLogger(Builder.class);
private static final Charset UTF8 = StandardCharsets.UTF_8;
/* TIMEPOINTS_KEY is a fixed key into a separate column family in the index that just holds a list of time points.
* Within that column family, there is only one entry:
* "timepoints" -> serialized array of time point doubles
* and we use this key to write/read those time points. Since time points are shared across all traces, we can
* maintain this one copy in the index and reconstruct the XZ pairs as we read trace intensity arrays. */
// All of these are intentionally package private.
static final byte[] TIMEPOINTS_KEY = "timepoints".getBytes(UTF8);
static final Double WINDOW_WIDTH_FROM_CENTER = MS1.MS1_MZ_TOLERANCE_DEFAULT;
/* This step size should make it impossible for us to miss any readings in the index due to FP error.
* We could make the index more compact by spacing these windows out a bit, but I'll leave that as a TODO. */
static final Double MZ_STEP_SIZE = MS1.MS1_MZ_TOLERANCE_DEFAULT / 2.0;
static final Double MIN_MZ = 50.0;
static final Double MAX_MZ = 950.0;
public static final String OPTION_INDEX_PATH = "x";
public static final String OPTION_SCAN_FILE = "i";
public static final String HELP_MESSAGE = StringUtils.join(new String[]{
"This class extracts and indexes readings from an LCMS scan files, ",
"and writes them to an on-disk index for later processing."
}, "");
public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
add(Option.builder(OPTION_INDEX_PATH)
.argName("index path")
.desc("A path to the directory where the on-disk index will be stored; must not already exist")
.hasArg().required()
.longOpt("index")
);
add(Option.builder(OPTION_SCAN_FILE)
.argName("scan file")
.desc("A path to the LCMS NetCDF scan file to read")
.hasArg().required()
.longOpt("input")
);
add(Option.builder("h")
.argName("help")
.desc("Prints this help message")
.longOpt("help")
);
}};
public static class Factory {
public static Builder makeBuilder(File indexDir) throws RocksDBException{
RocksDB.loadLibrary();
LOGGER.info("Creating index at %s", indexDir.getAbsolutePath());
RocksDBAndHandles<ColumnFamilies> dbAndHandles = DBUtil.createNewRocksDB(indexDir, ColumnFamilies.values());
return new Builder(dbAndHandles);
}
}
private RocksDBAndHandles<ColumnFamilies> dbAndHandles;
Builder(RocksDBAndHandles<ColumnFamilies> dbAndHandles) {
this.dbAndHandles = dbAndHandles;
}
public static void main(String[] args) throws Exception {
CLIUtil cliUtil = new CLIUtil(Builder.class, HELP_MESSAGE, OPTION_BUILDERS);
CommandLine cl = cliUtil.parseCommandLine(args);
File inputFile = new File(cl.getOptionValue(OPTION_SCAN_FILE));
if (!inputFile.exists()) {
cliUtil.failWithMessage("Cannot find input scan file at %s", inputFile.getAbsolutePath());
}
File indexDir = new File(cl.getOptionValue(OPTION_INDEX_PATH));
if (indexDir.exists()) {
cliUtil.failWithMessage("Index file at %s already exists--remove and retry", indexDir.getAbsolutePath());
}
Builder indexBuilder = Factory.makeBuilder(indexDir);
try {
indexBuilder.processScan(indexBuilder.makeTargetMasses(), inputFile);
} finally {
if (indexBuilder != null) {
indexBuilder.close();
}
}
LOGGER.info("Done");
}
public void close() throws RocksDBException {
dbAndHandles.close();
}
private List<Double> makeTargetMasses() {
List<Double> targets = new ArrayList<>();
for (double m = MIN_MZ - MZ_STEP_SIZE; m <= MAX_MZ + MZ_STEP_SIZE; m += MZ_STEP_SIZE) {
targets.add(m);
}
return targets;
}
public void processScan(List<Double> targetMZs, File scanFile)
throws RocksDBException, ParserConfigurationException, XMLStreamException, IOException {
DateTime start = DateTime.now();
LOGGER.info("Accessing scan file at %s", scanFile.getAbsolutePath());
LCMSNetCDFParser parser = new LCMSNetCDFParser();
Iterator<LCMSSpectrum> spectrumIterator = parser.getIterator(scanFile.getAbsolutePath());
WriteOptions writeOptions = new WriteOptions();
/* The write-ahead log and disk synchronization features are useful when we need our writes to be durable (i.e. to
* survive a crash). However, our index construction is effectively a one-shot deal: if it doesn't succeed, we'll
* just start from scratch. Since we don't care about durability while we're constructing the index, the WAL and
* sync features eat a lot of disk space and I/O bandwidth, which slows us down. So long as we cleanly close the
* index once it's built, nobody has to know that we disabled these features. */
writeOptions.setDisableWAL(true);
writeOptions.setSync(false);
dbAndHandles.setWriteOptions(writeOptions);
// TODO: split targetMZs into batches of ~100k and extract incrementally to allow huge input sets.
LOGGER.info("Extracting traces");
List<MZWindow> windows = targetsToWindows(targetMZs);
extractTriples(spectrumIterator, windows);
LOGGER.info("Writing search targets to on-disk index");
writeWindowsToDB(windows);
DateTime end = DateTime.now();
LOGGER.info("Index construction completed in %dms", end.getMillis() - start.getMillis());
}
private List<MZWindow> targetsToWindows(List<Double> targetMZs) {
List<MZWindow> windows = new ArrayList<MZWindow>() {{
int i = 0;
for (Double targetMZ : targetMZs) {
add(new MZWindow(i, targetMZ));
i++;
}
}};
/* We *must* ensure the windows are sorted in m/z order for the sweep line to work. However, we don't know anything
* about the input targetMZs list, which may be immutable or may be in some order the client wants to preserve.
* Rather than mess with that array, we'll sort the windows in our internal array and leave be the client's targets.
*/
Collections.sort(windows, (a, b) -> a.getTargetMZ().compareTo(b.getTargetMZ()));
return windows;
}
protected void extractTriples(
Iterator<LCMSSpectrum> iter,
List<MZWindow> windows)
throws RocksDBException, IOException {
/* Warning: this method makes heavy use of ByteBuffers to perform memory efficient collection of values and
* conversion of those values into byte arrays that RocksDB can consume. If you haven't already, go read this
* tutorial on ByteBuffers: http://mindprod.com/jgloss/bytebuffer.html
*
* ByteBuffers are quite low-level structures, and they use some terms you need to watch out for:
* capacity: The total number of bytes in the array backing the buffer. Don't write more than this.
* position: The next index in the buffer to read or write a byte. Moves with each read or write op.
* limit: A mark of where the final byte in the buffer was written. Don't read past this.
* The remaining() call is affected by the limit.
* mark: Ignore this for now, we don't use it. (We'll always, always read buffers from 0.)
*
* And here are some methods that we'll use often:
* clear: Set position = 0, limit = 0. Pretend the buffer is empty, and is ready for more writes.
* flip: Set limit = position, then position = 0. This remembers how many bytes were written to the buffer
* (as the current position), and then puts the position at the beginning.
* Always call this after the write before a read.
* rewind: Set position = 0. Buffer is ready for reading, but unless the limit was set we might now know how
* many bytes there are to read. Always call flip() before rewind(). Can rewind many times to re-read
* the buffer repeatedly.
* remaining: How many bytes do we have left to read? Requires an accurate limit value to avoid garbage bytes.
* reset: Don't use this. It uses the mark, which we don't need currently.
*
* Write/read patterns look like:
* buffer.clear(); // Clear out anything already in the buffer.
* buffer.put(thing1).put(thing2)... // write a bunch of stuff
* buffer.flip(); // Prep for reading. Call *once*!
*
* while (buffer.hasRemaining()) { buffer.get(); } // Read a bunch of stuff.
* buffer.rewind(); // Ready for reading again!
* while (buffer.hasRemaining()) { buffer.get(); } // Etc.
* buffer.reset(); // Forget what was written previously, buffer is ready for reuse.
*
* We use byte buffers because they're fast, efficient, and offer incredibly convenient means of serializing a
* stream of primitive types to their minimal binary representations. The same operations on objects + object
* streams require significantly more CPU cycles, consume more memory, and tend to be brittle (i.e. if a class
* definition changes slightly, serialization may break). Since the data we're dealing with is pretty simple, we
* opt for the low-level approach.
*/
/* Because we'll eventually use the window indices to map a mz range to a list of triples that fall within that
* range, verify that all of the indices are unique. If they're not, we'll end up overwriting the data in and
* corrupting the structure of the index. */
ensureUniqueMZWindowIndices(windows);
// For every mz window, allocate a buffer to hold the indices of the triples that fall in that window.
ByteBuffer[] mzWindowTripleBuffers = new ByteBuffer[windows.size()];
for (int i = 0; i < mzWindowTripleBuffers.length; i++) {
/* Note: the mapping between these buffers and their respective mzWindows is purely positional. Specifically,
* mzWindows.get(i).getIndex() != i, but mzWindowTripleBuffers[i] belongs to mzWindows.get(i). We'll map windows
* indices to the contents of mzWindowTripleBuffers at the very end of this function. */
mzWindowTripleBuffers[i] = ByteBuffer.allocate(Long.BYTES * 4096); // Start with 4096 longs = 8 pages per window.
}
// Every TMzI gets an index which we'll use later when we're querying by m/z and time.
long counter = -1; // We increment at the top of the loop.
// Note: we could also write to an mmapped file and just track pointers, but then we might lose out on compression.
// We allocate all the buffers strictly here, as we know how many bytes a long and a triple will take. Then reuse!
ByteBuffer counterBuffer = ByteBuffer.allocate(Long.BYTES);
ByteBuffer valBuffer = ByteBuffer.allocate(TMzI.BYTES);
List<Float> timepoints = new ArrayList<>(2000); // We can be sloppy here, as the count is small.
/* We use a sweep-line approach to scanning through the m/z windows so that we can aggregate all intensities in
* one pass over the current LCMSSpectrum (this saves us one inner loop in our extraction process). The m/z
* values in the LCMSSpectrum become our "critical" or "interesting points" over which we sweep our m/z ranges.
* The next window in m/z order is guaranteed to be the next one we want to consider since we address the points
* in m/z order as well. As soon as we've passed out of the range of one of our windows, we discard it. It is
* valid for a window to be added to and discarded from the working queue in one application of the work loop. */
LinkedList<MZWindow> tbdQueueTemplate = new LinkedList<>(windows); // We can reuse this template to init the sweep.
int spectrumCounter = 0;
while (iter.hasNext()) {
LCMSSpectrum spectrum = iter.next();
float time = spectrum.getTimeVal().floatValue();
// This will record all the m/z + intensity readings that correspond to this timepoint. Exactly sized too!
ByteBuffer triplesForThisTime = ByteBuffer.allocate(Long.BYTES * spectrum.getIntensities().size());
// Batch up all the triple writes to reduce the number of times we hit the disk in this loop.
// Note: huge success!
RocksDBAndHandles.RocksDBWriteBatch<ColumnFamilies> writeBatch = dbAndHandles.makeWriteBatch();
// Initialize the sweep line lists. Windows go follow: tbd -> working -> done (nowhere).
LinkedList<MZWindow> workingQueue = new LinkedList<>();
LinkedList<MZWindow> tbdQueue = (LinkedList<MZWindow>) tbdQueueTemplate.clone(); // clone is in the docs, so okay!
for (Pair<Double, Double> mzIntensity : spectrum.getIntensities()) {
// Very important: increment the counter for every triple. Otherwise we'll overwrite triples = Very Bad (tm).
counter++;
// Brevity = soul of wit!
Double mz = mzIntensity.getLeft();
Double intensity = mzIntensity.getRight();
// Reset the buffers so we end up re-using the few bytes we've allocated.
counterBuffer.clear(); // Empty (virtually).
counterBuffer.putLong(counter);
counterBuffer.flip(); // Prep for reading.
valBuffer.clear(); // Empty (virtually).
TMzI.writeToByteBuffer(valBuffer, time, mz, intensity.floatValue());
valBuffer.flip(); // Prep for reading.
// First, shift any applicable ranges onto the working queue based on their minimum mz.
while (!tbdQueue.isEmpty() && tbdQueue.peekFirst().getMin() <= mz) {
workingQueue.add(tbdQueue.pop());
}
// Next, remove any ranges we've passed.
while (!workingQueue.isEmpty() && workingQueue.peekFirst().getMax() < mz) {
workingQueue.pop(); // TODO: add() this to a recovery queue which can then become the tbdQueue. Edge cases!
}
/* In the old indexed trace extractor world, we could bail here if there were no target m/z's in our window set
* that matched with the m/z of our current mzIntensity. However, since we're now also recording the links
* between timepoints and their (t, m/z, i) triples, we need to keep on keepin' on regardless of whether we have
* any m/z windows in the working set right now. */
// The working queue should now hold only ranges that include this m/z value. Sweep line swept!
/* Now add this intensity to the buffers of all the windows in the working queue. Note that since we're only
* storing the *index* of the triple, these buffers are going to consume less space than they would if we
* stored everything together. */
for (MZWindow window : workingQueue) {
// TODO: count the number of times we add intensities to each window's accumulator for MS1-style warnings.
counterBuffer.rewind(); // Already flipped.
mzWindowTripleBuffers[window.getIndex()] = // Must assign when calling appendOrRealloc.
Utils.appendOrRealloc(mzWindowTripleBuffers[window.getIndex()], counterBuffer);
}
// We flipped after reading, so we should be good to rewind (to be safe) and write here.
counterBuffer.rewind();
valBuffer.rewind();
writeBatch.put(ColumnFamilies.ID_TO_TRIPLE, Utils.toCompactArray(counterBuffer), Utils.toCompactArray(valBuffer));
// Rewind again for another read.
counterBuffer.rewind();
triplesForThisTime.put(counterBuffer);
}
writeBatch.write();
assert(triplesForThisTime.position() == triplesForThisTime.capacity());
ByteBuffer timeBuffer = ByteBuffer.allocate(Float.BYTES).putFloat(time);
timeBuffer.flip(); // Prep both bufers for reading so they can be written to the DB.
triplesForThisTime.flip();
dbAndHandles.put(ColumnFamilies.TIMEPOINT_TO_TRIPLES,
Utils.toCompactArray(timeBuffer), Utils.toCompactArray(triplesForThisTime));
timepoints.add(time);
spectrumCounter++;
if (spectrumCounter % 1000 == 0) {
LOGGER.info("Extracted %d time spectra", spectrumCounter);
}
}
LOGGER.info("Extracted %d total time spectra", spectrumCounter);
// Now write all the mzWindow to triple indexes.
RocksDBAndHandles.RocksDBWriteBatch<ColumnFamilies> writeBatch = dbAndHandles.makeWriteBatch();
ByteBuffer idBuffer = ByteBuffer.allocate(Integer.BYTES);
for (int i = 0; i < mzWindowTripleBuffers.length; i++) {
idBuffer.clear();
idBuffer.putInt(windows.get(i).getIndex());
idBuffer.flip();
ByteBuffer triplesBuffer = mzWindowTripleBuffers[i];
triplesBuffer.flip(); // Prep for read.
writeBatch.put(ColumnFamilies.WINDOW_ID_TO_TRIPLES, Utils.toCompactArray(idBuffer), Utils.toCompactArray(triplesBuffer));
}
writeBatch.write();
dbAndHandles.put(ColumnFamilies.TIMEPOINTS, TIMEPOINTS_KEY, Utils.floatListToByteArray(timepoints));
dbAndHandles.flush(true);
}
private void ensureUniqueMZWindowIndices(List<MZWindow> windows) {
Set<Integer> ids = new HashSet<>(windows.size());
for (MZWindow window : windows) {
if (ids.contains(window.getIndex())) {
String msg = String.format("Assumption violation: found duplicate mzWindow index when all should be unique: %d",
window.getIndex());
LOGGER.error(msg);
// A hard invariant has been violated, so crash the program.
throw new RuntimeException(msg);
}
ids.add(window.getIndex());
}
}
/**
* Writes all MZWindows to the DB as serialized objects. Windows are keyed by target MZ (as a Double) for easy lookup
* in the case that we have a target m/z that exactly matches a window.
*
* @param windows The windows to write.
* @throws RocksDBException
* @throws IOException
*/
protected void writeWindowsToDB(List<MZWindow> windows) throws RocksDBException, IOException {
for (MZWindow window : windows) {
byte[] keyBytes = Utils.serializeObject(window.getTargetMZ());
byte[] valBytes = Utils.serializeObject(window);
dbAndHandles.put(ColumnFamilies.TARGET_TO_WINDOW, keyBytes, valBytes);
}
dbAndHandles.flush(true);
LOGGER.info("Done writing window data to index");
}
}