Searcher.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.lcms.v2.fullindex;

import com.act.utils.CLIUtil;
import com.act.utils.rocksdb.DBUtil;
import com.act.utils.rocksdb.RocksDBAndHandles;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.joda.time.DateTime;
import org.rocksdb.RocksDBException;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.function.BiFunction;
import java.util.stream.Collectors;

/**
 * This is the conjoined twin of Builder.  If Builder changes in a material way, this class should change as well.
 */
public class Searcher {
  /** Logger whose messages use printf-style placeholders (e.g. {@code %d}, {@code %.6f}). */
  private static final Logger LOGGER = LogManager.getFormatterLogger(Searcher.class);
  /** Character separating the lower and upper bound of a range given on the command line. */
  private static final Character RANGE_SEPARATOR = ':';
  /** Tab-separated column names; printed as the first line of the output. */
  private static final String OUTPUT_HEADER = String.join("\t", "id", "time", "m/z", "intensity");

  // Short names of the command line options; the long names are set in OPTION_BUILDERS.
  public static final String OPTION_OUTPUT_FILE = "o";
  public static final String OPTION_TIME_RANGE = "t";
  public static final String OPTION_MZ_RANGE   = "m";
  public static final String OPTION_INDEX_PATH = "x";

  /** Description of this tool handed to CLIUtil; further fragments may be appended to the join. */
  public static final String HELP_MESSAGE = String.join("",
      "Queries a triple index constructed by Builder for readings in some m/z and time window."
  );

  /**
   * Command line options understood by {@link #main(String[])}.
   *
   * Only the index path is required by the parser; main() additionally insists that at least one of the m/z and
   * time ranges is supplied.  The output file is optional.
   */
  public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
    add(Option.builder(OPTION_INDEX_PATH)
        .argName("index path")
        // This tool only reads an index: main() fails if the directory does not already exist.
        .desc("A path to the directory containing the on-disk index to query; must already exist")
        .hasArg().required()
        .longOpt("index")
    );
    add(Option.builder(OPTION_MZ_RANGE)
        .argName("m/z range")
        .desc("An m/z range to query separated by a colon, like 151.0:152.0")
        .hasArg()
        .longOpt("mz-range")
    );
    add(Option.builder(OPTION_OUTPUT_FILE)
        .argName("output file")
        .desc("A destination at which to write the found triples as a TSV (default is stdout)")
        .hasArg()
        .longOpt("output")
    );
    add(Option.builder(OPTION_TIME_RANGE)
        .argName("time range")
        .desc("A time range to query separated by a colon, like 45.0:50.0")
        .hasArg()
        .longOpt("time-range")
    );
  }};

  /**
   * Creates ready-to-use {@link Searcher} instances.
   */
  public static class Factory {
    /**
     * Opens the existing index in a directory and returns a Searcher that has already been initialized.
     *
     * @param indexDir The directory holding the index to open.
     * @return A Searcher on which init() has been called.
     * @throws RocksDBException
     * @throws ClassNotFoundException
     * @throws IOException
     */
    public static Searcher makeSearcher(File indexDir)
        throws RocksDBException, ClassNotFoundException, IOException {
      // Open every column family declared by the ColumnFamilies enum.
      RocksDBAndHandles<ColumnFamilies> openedIndex =
          DBUtil.openExistingRocksDB(indexDir, ColumnFamilies.values());

      Searcher initialized = new Searcher(openedIndex);
      // init() loads the timepoints and m/z windows that searches depend on.
      initialized.init();
      return initialized;
    }
  }

  /** Handle to the opened index and its column families; assigned once, at construction. */
  private final RocksDBAndHandles<ColumnFamilies> dbAndHandles;
  /** Timepoints read from the index by init(). */
  private List<Float> timepoints;
  /** m/z windows read from the index by init(), sorted there by target m/z. */
  private List<MZWindow> mzWindows;

  /**
   * Wraps an opened index.  init() must be called before searching, as it populates the timepoints and m/z windows;
   * Factory.makeSearcher does both steps.
   *
   * @param dbAndHandles The opened index to query.
   */
  Searcher(RocksDBAndHandles<ColumnFamilies> dbAndHandles) {
    this.dbAndHandles = dbAndHandles;
  }

  /**
   * Command line entry point: parses the options, runs one search against the index, and writes the matching
   * triples as a TSV to the requested file or to stdout.
   *
   * A range option that is absent yields a null range from extractRange, which is forwarded to searchIndexInRange
   * unchanged.
   *
   * @param args The raw command line arguments.
   * @throws Exception If parsing, searching, or writing fails.
   */
  public static void main(String[] args) throws Exception {
    CLIUtil cli = new CLIUtil(Searcher.class, HELP_MESSAGE, OPTION_BUILDERS);
    CommandLine parsed = cli.parseCommandLine(args);

    File indexDir = new File(parsed.getOptionValue(OPTION_INDEX_PATH));
    boolean indexIsDirectory = indexDir.exists() && indexDir.isDirectory();
    if (!indexIsDirectory) {
      cli.failWithMessage("Unable to read index directory at %s", indexDir.getAbsolutePath());
    }

    // At least one of the two ranges has to be given.
    if (!(parsed.hasOption(OPTION_MZ_RANGE) || parsed.hasOption(OPTION_TIME_RANGE))) {
      cli.failWithMessage("Extracting all readings is not currently supported; specify an m/z or time range");
    }

    Pair<Double, Double> mzRange = extractRange(parsed.getOptionValue(OPTION_MZ_RANGE));
    Pair<Double, Double> timeRange = extractRange(parsed.getOptionValue(OPTION_TIME_RANGE));

    List<TMzI> matches = Factory.makeSearcher(indexDir).searchIndexInRange(mzRange, timeRange);

    if (!parsed.hasOption(OPTION_OUTPUT_FILE)) {
      // Writing to stdout: deliberately leave the writer open (writeOutput flushes it).
      writeOutput(new PrintWriter(new OutputStreamWriter(System.out)), matches);
    } else {
      String outputPath = parsed.getOptionValue(OPTION_OUTPUT_FILE);
      try (PrintWriter fileWriter = new PrintWriter(new FileWriter(outputPath))) {
        writeOutput(fileWriter, matches);
      }
    }

    LOGGER.info("Done");
  }

  /**
   * Writes the header line followed by one tab-separated row per triple, then flushes the writer.
   *
   * Each row holds a running row number (starting at zero), the time, the m/z, and the intensity, the latter three
   * with six decimal places.  The writer is flushed but not closed; closing is the caller's decision.
   *
   * @param writer The destination for the TSV.
   * @param results The triples to write, in the order they should appear.
   * @throws IOException Declared for callers; write errors on a PrintWriter are not raised as exceptions.
   */
  private static void writeOutput(PrintWriter writer, List<TMzI> results) throws IOException {
    writer.println(OUTPUT_HEADER);
    int counter = 0;
    for (TMzI triple : results) {
      /* Pin the locale: with the default locale, %.6f emits a decimal comma in many regions, which produces a TSV
       * that numeric parsers reject. */
      writer.format(Locale.ROOT, "%d\t%.6f\t%.6f\t%.6f\n",
          counter, triple.getTime(), triple.getMz(), triple.getIntensity());
      counter++;
    }
    writer.flush();
  }

  /**
   * Parses a range argument of the form {@code lower:upper} (or a single value) into a pair of bounds.
   *
   * @param rangeStr The raw option value; may be null or empty when the option was not supplied.
   * @return null when rangeStr is null or empty; a pair with both bounds equal when a single value was given;
   *         otherwise the (lower, upper) pair.
   * @throws IllegalArgumentException If the upper bound is below the lower bound, if the string does not split into
   *         one or two parts, or (as NumberFormatException) if a part is not a valid number.
   */
  private static Pair<Double, Double> extractRange(String rangeStr) {
    // Skip empty ranges so we can just limit on time or m/z.
    if (rangeStr == null || rangeStr.isEmpty()) {
      return null;
    }
    String[] parts = StringUtils.split(rangeStr, RANGE_SEPARATOR);
    if (parts.length == 1) {
      LOGGER.info("Found only one value in ranged '%s', returning closed range (for exact extraction)", rangeStr);
      Double exactVal = Double.valueOf(parts[0]);
      return Pair.of(exactVal, exactVal);
    } else if (parts.length == 2) {
      Double lowerBound = Double.valueOf(parts[0]);
      Double upperBound = Double.valueOf(parts[1]);
      if (upperBound < lowerBound) {
        String msg = String.format(
            "Lower bound %.6f exceeds upper bound %.6f.  Cowardly refusing to search for an empty range",
            lowerBound, upperBound);
        LOGGER.error(msg);
        throw new IllegalArgumentException(msg);
      }
      return Pair.of(lowerBound, upperBound);
    } else {
      /* Both format arguments must be supplied: omitting the range string made String.format itself throw
       * MissingFormatArgumentException, hiding the real problem from the user. */
      String msg = String.format(
          "Unable to parse range string '%s'; did you use the correct separator ('%c')?",
          rangeStr, RANGE_SEPARATOR);
      LOGGER.error(msg);
      throw new IllegalArgumentException(msg);
    }
  }

  /**
   * Loads the timepoints and the m/z windows from the index into memory; must run before any search.
   *
   * @throws RocksDBException
   * @throws ClassNotFoundException
   * @throws IOException
   */
  protected void init() throws RocksDBException, ClassNotFoundException, IOException {
    LOGGER.info("Initializing DB");

    // TODO: hold onto the byte representation of the timepoints so we can use them as keys more easily.
    byte[] serializedTimepoints = dbAndHandles.get(ColumnFamilies.TIMEPOINTS, Builder.TIMEPOINTS_KEY);
    timepoints = Utils.byteArrayToFloatList(serializedTimepoints);
    LOGGER.info("Loaded %d timepoints", timepoints.size());
    // Assumes timepoints are sorted.  TODO: check!

    mzWindows = new ArrayList<>();
    RocksDBAndHandles.RocksDBIterator windowIter = dbAndHandles.newIterator(ColumnFamilies.TARGET_TO_WINDOW);
    // Only the values are read; the keys are the target m/z's, which each window also exposes.
    for (windowIter.reset(); windowIter.isValid(); windowIter.next()) {
      mzWindows.add(Utils.deserializeObject(windowIter.value()));
    }

    // Order the windows by target m/z so we can easily search through them.
    mzWindows.sort((first, second) -> first.getTargetMZ().compareTo(second.getTargetMZ()));

    LOGGER.info("Loaded %d m/z windows", mzWindows.size());
  }

  /**
   * Searches an LCMS index for all (time, m/z, intensity) triples within some time and/or m/z range.
   *
   * Either range may be null, meaning "do not constrain this dimension": candidate ids are then taken from the
   * other dimension's index alone and the final filter uses infinite bounds for the missing dimension.  At least
   * one range must be non-null.
   *
   * Note that this method is very much a first-draft/WIP.  There are many opportunities for optimization and
   * improvement here, but this works as an initial attempt.  This method is littered with TODOs, which once TODone
   * should make this a near optimal method of searching through LCMS readings.
   *
   * @param mzRange The inclusive range of m/z values for which to search, or null for no m/z constraint.
   * @param timeRange The inclusive time range for which to search, or null for no time constraint.
   * @return A list of (time, m/z, intensity) triples that fall within the specified ranges.
   * @throws IllegalArgumentException If both ranges are null.
   * @throws RocksDBException
   * @throws ClassNotFoundException
   * @throws IOException
   */
  public List<TMzI> searchIndexInRange(
      Pair<Double, Double> mzRange,
      Pair<Double, Double> timeRange)
      throws RocksDBException, ClassNotFoundException, IOException {
    // TODO: consider producing some sort of query plan structure that can be used for optimization/explanation.
    if (mzRange == null && timeRange == null) {
      throw new IllegalArgumentException(
          "Extracting all readings is not currently supported; specify an m/z or time range");
    }

    DateTime start = DateTime.now();
    /* Demote the time range to floats, as we know that that's how we stored times in the DB.  This tight coupling would
     * normally be a bad thing, but given that this class is joined at the hip with Builder necessarily, it
     * doesn't seem like a terrible thing at the moment.  A missing range becomes an infinite one so that the final
     * filter below needs no special cases. */
    final float tMin = timeRange == null ? Float.NEGATIVE_INFINITY : timeRange.getLeft().floatValue();
    final float tMax = timeRange == null ? Float.POSITIVE_INFINITY : timeRange.getRight().floatValue();
    final double mzMin = mzRange == null ? Double.NEGATIVE_INFINITY : mzRange.getLeft();
    final double mzMax = mzRange == null ? Double.POSITIVE_INFINITY : mzRange.getRight();

    LOGGER.info("Running search for %.6f <= t <= %.6f, %.6f <= m/z <= %.6f", tMin, tMax, mzMin, mzMax);

    // Ids of the triples at the timepoints in range; left null when time is unconstrained.
    List<Float> timesInRange = null;
    Set<Long> unionTimeIds = null;
    if (timeRange != null) {
      Pair<Float, Float> tRangeF = Pair.of(tMin, tMax);
      // TODO: short circuit these filters.  The first failure after success => no more possible hits.
      timesInRange = timepointsInRange(tRangeF);
      byte[][] timeIndexBytes = extractValueBytes(
          ColumnFamilies.TIMEPOINT_TO_TRIPLES,
          timesInRange,
          Float.BYTES,
          ByteBuffer::putFloat
      );
      // TODO: bail if all the timeIndexBytes lengths are zero.
      // TODO: there is no need to union the time indices since they are necessarily distinct.  Just concatenate instead.
      unionTimeIds = unionIdBuffers(timeIndexBytes);
    }

    // Ids of the triples in the m/z windows overlapping the range; left null when m/z is unconstrained.
    List<MZWindow> mzWindowsInRange = null;
    Set<Long> unionMzIds = null;
    if (mzRange != null) {
      mzWindowsInRange = mzWindowsInRange(mzRange);
      byte[][] mzIndexBytes = extractValueBytes(
          ColumnFamilies.WINDOW_ID_TO_TRIPLES,
          mzWindowsInRange,
          Integer.BYTES,
          (buff, mz) -> buff.putInt(mz.getIndex())
      );
      // TODO: bail if all the mzIndexBytes are zero.
      unionMzIds = unionIdBuffers(mzIndexBytes);
    }

    /* TODO: if the number of entries in one range is significantly smaller than the other (like an order of magnitude
     * or more, skip extraction of the other set of ids and just filter at the end.  This will be especially helpful
     * when the number of ids in the m/z domain is small, as each time point will probably have >10k ids. */

    Set<Long> candidateIds;
    if (unionTimeIds != null && unionMzIds != null) {
      LOGGER.info("Found/loaded %d matching time ranges, %d matching m/z ranges",
          timesInRange.size(), mzWindowsInRange.size());
      // TODO: handle the case where one of the sets is empty specially.  Either keep all in the other set or drop all.
      // TODO: we might be able to do this faster by intersecting two sorted lists.
      /* TODO: this is effectively a hash join, which isn't optimal for sets of wildly different cardinalities.
       * Consider using sort-merge join instead, which will reduce the object overhead (by a lot) and allow us to pass
       * over the union of the ids from each range just once when joining them.  Additionally, just skip this whole step
       * and filter at the end if one of the set's sizes is less than 1k or so and the other is large. */
      candidateIds = new HashSet<>(unionTimeIds);
      candidateIds.retainAll(unionMzIds);
      LOGGER.info("Id intersection results: t = %d, mz = %d, t ^ mz = %d",
          unionTimeIds.size(), unionMzIds.size(), candidateIds.size());
    } else if (unionTimeIds != null) {
      // Only a time range was given: every id at a matching timepoint is a candidate.
      LOGGER.info("Found/loaded %d matching time ranges; no m/z range given, keeping all %d of their ids",
          timesInRange.size(), unionTimeIds.size());
      candidateIds = unionTimeIds;
    } else {
      // Only an m/z range was given: every id in a matching window is a candidate.
      LOGGER.info("Found/loaded %d matching m/z ranges; no time range given, keeping all %d of their ids",
          mzWindowsInRange.size(), unionMzIds.size());
      candidateIds = unionMzIds;
    }

    List<Long> idsToFetch = new ArrayList<>(candidateIds);
    Collections.sort(idsToFetch); // Sort ids so we retrieve them in an order that exploits index locality.

    LOGGER.info("Collecting TMzI triples");
    // Collect all the triples for the ids we extracted.
    // TODO: don't manifest all the bytes: just create a stream of results from the cursor to reduce memory overhead.
    List<TMzI> results = new ArrayList<>(idsToFetch.size());
    byte[][] resultBytes = extractValueBytes(
        ColumnFamilies.ID_TO_TRIPLE,
        idsToFetch,
        Long.BYTES,
        ByteBuffer::putLong
    );
    for (byte[] tmziBytes : resultBytes) {
      results.add(TMzI.readNextFromByteBuffer(ByteBuffer.wrap(tmziBytes)));
    }

    /* The index lookups are coarse (a whole m/z window is pulled in if it merely overlaps the range), so check every
     * candidate against the exact bounds. */
    // TODO: do this filtering inline with the extraction.  We shouldn't have to load all the triples before filtering.
    LOGGER.info("Performing final filtering");
    int preFilterTMzICount = results.size();
    results = results.stream().filter(tmzi ->
        tmzi.getTime() >= tMin && tmzi.getTime() <= tMax &&
        tmzi.getMz() >= mzMin && tmzi.getMz() <= mzMax
    ).collect(Collectors.toList());
    LOGGER.info("Precise filtering results: %d -> %d", preFilterTMzICount, results.size());

    DateTime end = DateTime.now();
    LOGGER.info("Search completed in %dms", end.getMillis() - start.getMillis());

    // TODO: return a stream instead that can load the triples lazily.
    return results;
  }

  /**
   * Selects the loaded timepoints that lie within a time range, bounds included, preserving their order.
   *
   * @param tRange The (lower, upper) time bounds.
   * @return A new list of the matching timepoints; empty (with a warning logged) when none match.
   */
  private List<Float> timepointsInRange(Pair<Float, Float> tRange) {
    // TODO: short circuit these filters.  The first failure after success => no more possible hits.
    List<Float> matches = new ArrayList<>(); // Use an array list as we'll be accessing by index.
    for (Float candidate : timepoints) {
      if (candidate >= tRange.getLeft() && candidate <= tRange.getRight()) {
        matches.add(candidate);
      }
    }
    if (matches.isEmpty()) {
      LOGGER.warn("Found zero times in range %.6f - %.6f", tRange.getLeft(), tRange.getRight());
    }
    return matches;
  }

  /**
   * Selects the loaded m/z windows whose [min, max] span overlaps an m/z range, preserving their order.
   *
   * @param mzRange The (lower, upper) m/z bounds.
   * @return A new list of the overlapping windows; empty (with a warning logged) when none overlap.
   */
  private List<MZWindow> mzWindowsInRange(Pair<Double, Double> mzRange) {
    List<MZWindow> overlapping = new ArrayList<>(); // Array list, as callers access by index.
    for (MZWindow window : mzWindows) {
      if (rangesOverlap(mzRange.getLeft(), mzRange.getRight(), window.getMin(), window.getMax())) {
        overlapping.add(window);
      }
    }
    if (overlapping.isEmpty()) {
      LOGGER.warn("Found zero m/z windows in range %.6f - %.6f", mzRange.getLeft(), mzRange.getRight());
    }
    return overlapping;
  }

  /**
   * Looks up, in one column family, the value stored under each of a list of fixed-width primitive keys.
   *
   * A single key buffer is reused for every lookup.  Its whole backing array is handed to the index, so the key
   * sent is always keyBytes long; put is expected to write exactly that many bytes.
   *
   * @param cf The column family from which to read.
   * @param keys A list of keys whose values to extract.
   * @param keyBytes The exact number of bytes required by a key; should be uniform for primitive-typed keys
   * @param put A function that writes a key to a ByteBuffer.
   * @param <K> The type of the key.
   * @return An array of arrays of bytes, one per key, containing the values of the key at that position.
   * @throws RocksDBException
   */
  private <K> byte[][] extractValueBytes(
      ColumnFamilies cf, List<K> keys, int keyBytes, BiFunction<ByteBuffer, K, ByteBuffer> put)
      throws RocksDBException {
    byte[][] values = new byte[keys.size()][];
    ByteBuffer reusableKey = ByteBuffer.allocate(keyBytes);
    int slot = 0;
    for (K key : keys) {
      // Rewind the shared buffer, then serialize this key into it.
      reusableKey.clear();
      put.apply(reusableKey, key).flip();
      // TODO: try compacting the keyBuffer array to be safe?
      values[slot] = dbAndHandles.get(cf, reusableKey.array());
      // Only checked when assertions are enabled (-ea).
      assert(values[slot] != null);
      slot++;
    }
    return values;
  }

  /**
   * Tests whether the closed ranges [aMin, aMax] and [bMin, bMax] share at least one point; ranges that merely
   * touch at an endpoint count as overlapping.  Any NaN bound that takes part in a comparison makes the result false.
   */
  private static boolean rangesOverlap(double aMin, double aMax, double bMin, double bMax) {
    /* Two ranges intersect exactly when neither lies wholly to the left of the other.  The checks are kept in their
     * positive form so that comparisons involving NaN come out false. */
    boolean bStartsBeforeAEnds = bMin <= aMax;
    boolean aStartsBeforeBEnds = aMin <= bMax;
    return bStartsBeforeAEnds && aStartsBeforeBEnds;
  }

  /**
   * Decodes every byte array as a sequence of consecutive 8-byte longs (ByteBuffer's default byte order) and
   * returns the set of all distinct values seen across all arrays.
   *
   * @param idBytes The encoded id lists, one array per index entry; entries must not be null.
   * @return A new set holding each decoded id once.
   */
  private static Set<Long> unionIdBuffers(byte[][] idBytes) {
    /* TODO: this doesn't take advantage of the fact that all of the ids are in sorted order in every idBytes sub-array.
     * We should be able to exploit that.  For now, we'll just start by hashing the ids. */
    Set<Long> distinctIds = new HashSet<>();
    for (byte[] encodedIds : idBytes) {
      assert(encodedIds != null);
      ByteBuffer reader = ByteBuffer.wrap(encodedIds);
      // Consume the array eight bytes at a time until nothing is left.
      while (reader.hasRemaining()) {
        distinctIds.add(reader.getLong());
      }
    }
    return distinctIds;
  }
}