/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program. If not, see <http://www.gnu.org/licenses/>.  *
*                                                                        *
*************************************************************************/

package act.installer.pubchem;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.impl.SimpleIRI;
import org.eclipse.rdf4j.model.impl.SimpleLiteral;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFHandler;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.rocksdb.ColumnFamilyDescriptor;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.CompressionType;
import org.rocksdb.DBOptions;
import org.rocksdb.FlushOptions;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.RocksIterator;
import org.rocksdb.WriteOptions;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;

/**
 * This class implements a parser for Pubchem's TTL (turtle) files. These contain both the features available in the
 * full Pubchem compound corpus, as well as other features not available in that dataset.
 */
public class PubchemTTLMerger {
  private static final Logger LOGGER = LogManager.getFormatterLogger(PubchemTTLMerger.class);
  private static final Charset UTF8 = StandardCharsets.UTF_8;

  private static final Set<PC_SYNONYM_TYPES> DEFAULT_SYNONYM_DATA_TYPES =
      Collections.unmodifiableSet(Collections.singleton(PC_SYNONYM_TYPES.UNKNOWN));

  private static final String DEFAULT_ROCKSDB_COLUMN_FAMILY = "default";

  // Dunno why RocksDB needs two different types for these...
  private static final Options ROCKS_DB_CREATE_OPTIONS = new Options()
      .setCreateIfMissing(true)
      .setDisableDataSync(true)
      .setAllowMmapReads(true) // Trying all sorts of performance tweaking knobs, which are not well documented. :(
      .setAllowMmapWrites(true)
      .setWriteBufferSize(1 << 27)
      .setArenaBlockSize(1 << 20)
      .setCompressionType(CompressionType.SNAPPY_COMPRESSION) // Will hopefully trade CPU for I/O.
      ;

  public static final DBOptions ROCKS_DB_OPEN_OPTIONS = new DBOptions()
      .setCreateIfMissing(false)
      .setDisableDataSync(true)
      .setAllowMmapReads(true)
      .setAllowMmapWrites(true)
      ;

  public static final String OPTION_INDEX_PATH = "x";
  public static final String OPTION_RDF_DIRECTORY = "d";
  public static final String OPTION_ONLY_SYNONYMS = "s";
  public static final String OPTION_ONLY_MESH = "m";
  public static final String OPTION_ONLY_PUBCHEM_IDS = "p";
  public static final String OPTION_ONLY_MERGE = "g";
  public static final String OPTION_OPEN_EXISTING_OKAY = "e";

  public static final String HELP_MESSAGE = StringUtils.join(new String[]{
      "This class extracts Pubchem synonym data from RDF files into an on-disk index, then uses that index to join ",
      "the synonyms and MeSH ids with their corresponding pubchem ids."
  }, "");

  public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
    add(Option.builder(OPTION_INDEX_PATH)
        .argName("index path")
        .desc("A path to the directory where the on-disk index will be stored; must not already exist")
        .hasArg().required()
        .longOpt("index")
    );
    add(Option.builder(OPTION_RDF_DIRECTORY)
        .argName("RDF directory")
        .desc("A path to the directory of Pubchem RDF files")
        .hasArg()
        .longOpt("dir")
    );
    add(Option.builder(OPTION_ONLY_SYNONYMS)
        .desc(String.format("If set, only '%s' files will be processed, useful for debugging",
            PC_RDF_DATA_FILE_CONFIG.HASH_TO_SYNONYM.filePrefix))
        .longOpt("only-synonyms")
    );
    add(Option.builder(OPTION_ONLY_MESH)
        .desc(String.format("If set, only '%s' files will be processed, useful for debugging",
            PC_RDF_DATA_FILE_CONFIG.HASH_TO_MESH.filePrefix))
        .longOpt("only-mesh")
    );
    add(Option.builder(OPTION_ONLY_PUBCHEM_IDS)
        .desc(String.format("If set, only '%s' files will be processed, useful for debugging",
            PC_RDF_DATA_FILE_CONFIG.HASH_TO_CID.filePrefix))
        .longOpt("only-pubchem-id")
    );
    add(Option.builder(OPTION_ONLY_MERGE)
        .desc("If set, only merge on Pubchem id, assuming other columns are populated")
        .longOpt("only-merge")
    );
    add(Option.builder(OPTION_OPEN_EXISTING_OKAY)
        .desc("Use an existing index directory. By default, indexes must be created in one shot.")
        .longOpt("use-existing")
    );
    add(Option.builder("h")
        .argName("help")
        .desc("Prints this help message")
        .longOpt("help")
    );
  }};

  public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();
  static {
    HELP_FORMATTER.setWidth(100);
  }

  public PubchemTTLMerger() {
  }

  private enum PC_RDF_DATA_FILE_CONFIG {
    HASH_TO_SYNONYM("pc_synonym_value", COLUMN_FAMILIES.HASH_TO_SYNONYMS,
        PC_RDF_DATA_TYPES.SYNONYM, PC_RDF_DATA_TYPES.LITERAL, false, null),
    HASH_TO_CID("pc_synonym2compound", COLUMN_FAMILIES.CID_TO_HASHES,
        PC_RDF_DATA_TYPES.SYNONYM, PC_RDF_DATA_TYPES.COMPOUND, true, null),
    HASH_TO_MESH("pc_synonym_topic", COLUMN_FAMILIES.HASH_TO_MESH,
        PC_RDF_DATA_TYPES.SYNONYM, PC_RDF_DATA_TYPES.MeSH, false, null),
    HASH_TO_SYNONYM_TYPE("pc_synonym_type", COLUMN_FAMILIES.HASH_TO_SYNONYM_TYPE,
        PC_RDF_DATA_TYPES.SYNONYM, PC_RDF_DATA_TYPES.SIO, false,
        (String x) -> PC_SYNONYM_TYPES.getByCheminfId(x).name()), // Map CHEMINF values to synonym type designators.
    ;

    private String filePrefix;
    private COLUMN_FAMILIES columnFamily;
    private PC_RDF_DATA_TYPES keyType;
    private PC_RDF_DATA_TYPES valType;
    private boolean reverseSubjectAndObject;
    private Function<String, String> valueTransformer;

    PC_RDF_DATA_FILE_CONFIG(String filePrefix, COLUMN_FAMILIES columnFamily,
                            PC_RDF_DATA_TYPES keyType, PC_RDF_DATA_TYPES valType,
                            boolean reverseSubjectAndObject, Function<String, String> valueTransformer) {
      this.filePrefix = filePrefix;
      this.columnFamily = columnFamily;
      this.keyType = keyType;
      this.valType = valType;
      this.reverseSubjectAndObject = reverseSubjectAndObject;
      this.valueTransformer = valueTransformer;
    }

    public static PC_RDF_DATA_FILE_CONFIG getDataTypeForFile(File file) {
      String name = file.getName();
      for (PC_RDF_DATA_FILE_CONFIG t : PC_RDF_DATA_FILE_CONFIG.values()) {
        if (name.startsWith(t.filePrefix)) {
          return t;
        }
      }
      return null;
    }

    public static AbstractRDFHandler makeHandlerForDataFile(
        Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles, File file) {
      PC_RDF_DATA_FILE_CONFIG config = getDataTypeForFile(file);
      if (config == null) {
        LOGGER.info("No handler config found for file %s", file.getAbsolutePath());
        return null;
      }
      LOGGER.info("Selected handler type %s for file %s", config.name(), file.getName());

      return new PCRDFHandler(
          dbAndHandles,
          config.columnFamily,
          config.keyType,
          config.valType,
          config.reverseSubjectAndObject,
          config.valueTransformer
      );
    }
  }

  /**
   * Each triple in the RDF files takes the form:
   * <pre>[subject namespace]:[subject value] [predicate namespace]:[predicate value] [object namespace]:[object value] .</pre>
   * Some of the files contain multiple types of values, only some of which we want to store. For example, the
   * `topics` file contains both MeSH ids and "concepts" (I'm not sure what the latter actually represents). We can
   * identify the MeSH ids based on their namespace and throw everything else away.
   *
   * Additionally, rdf4j represents different types of values with different Java objects. IRI stands for
   * "internationalized resource identifier" (https://www.w3.org/TR/rdf11-concepts/#dfn-iri), and acts as a pointer
   * or identifier in the PC synonym corpus. Synonym string values are modeled as literals, which have some sort of
   * label in some language (we ignore the language for now).
   *
   * This enum is a map of the useful namespaces and associated rdf4j model types to the parts of the synonym corpus
   * we want to extract. Check out their use in PC_RDF_DATA_FILE_CONFIG to see how these are mapped to the
   * subjects and objects of different files in the synonym corpus.
   */
  private enum PC_RDF_DATA_TYPES {
    SYNONYM("http://rdf.ncbi.nlm.nih.gov/pubchem/synonym/", PCRDFHandler.OBJECT_TYPE.IRI),
    MeSH("http://id.nlm.nih.gov/mesh/", PCRDFHandler.OBJECT_TYPE.IRI),
    COMPOUND("http://rdf.ncbi.nlm.nih.gov/pubchem/compound/", PCRDFHandler.OBJECT_TYPE.IRI),
    LITERAL("langString", PCRDFHandler.OBJECT_TYPE.LITERAL),
    SIO("http://semanticscience.org/resource/", PCRDFHandler.OBJECT_TYPE.IRI),
    ;

    private String urlOrDatatypeName;
    /* We only expect one kind of RDF value object at a time depending on the value's namespace, so constrain to that
     * to allow proper dispatch within the handler. */
    private PCRDFHandler.OBJECT_TYPE valueObjectType;

    PC_RDF_DATA_TYPES(String urlOrDatatypeName, PCRDFHandler.OBJECT_TYPE valueObjectType) {
      this.urlOrDatatypeName = urlOrDatatypeName;
      this.valueObjectType = valueObjectType;
    }

    public String getUrlOrDatatypeName() {
      return this.urlOrDatatypeName;
    }

    public PCRDFHandler.OBJECT_TYPE getValueObjectType() {
      return this.valueObjectType;
    }
  }

  public enum COLUMN_FAMILIES {
    HASH_TO_SYNONYMS("hash_to_synonym"),
    CID_TO_HASHES("cid_to_hashes"),
    HASH_TO_MESH("hash_to_MeSH"),
    CID_TO_SYNONYMS("cid_to_synonyms"),
    HASH_TO_SYNONYM_TYPE("hash_to_synonym_type")
    ;

    private static final Map<String, COLUMN_FAMILIES> NAME_MAPPING = Collections.unmodifiableMap(
        new HashMap<String, COLUMN_FAMILIES>() {{
          for (COLUMN_FAMILIES family : COLUMN_FAMILIES.values()) {
            put(family.name, family);
          }
        }}
    );

    public static COLUMN_FAMILIES getFamilyByName(String name) {
      return NAME_MAPPING.get(name);
    }

    private String name;

    COLUMN_FAMILIES(String name) {
      this.name = name;
    }

    public String getName() {
      return this.name;
    }
  }

  // Note: @JsonSerialize and @JsonDeserialize didn't work here, so I've used @JsonCreator and @JsonValue instead.
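  /* Illustrative sketch (not part of the original class): with the @JsonCreator/@JsonValue pair declared on
   * PC_SYNONYM_TYPES below, Jackson round-trips synonym types through their jsonLabel strings. Assuming a standard
   * com.fasterxml.jackson.databind.ObjectMapper instance named `mapper`, the expected behavior would be roughly:
   *
   *   mapper.writeValueAsString(PC_SYNONYM_TYPES.KEGG_ID);           // -> "\"KEGG_ID\"" (the jsonLabel)
   *   mapper.readValue("\"trivial_name\"", PC_SYNONYM_TYPES.class);  // -> TRIVIAL_NAME (via getByJsonLabel)
   *   mapper.readValue("\"no_such_label\"", PC_SYNONYM_TYPES.class); // -> UNKNOWN (getByJsonLabel's fallback)
   */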
  public enum PC_SYNONYM_TYPES {
    // Names derived from the Semantic Chemistry Ontology: https://github.com/egonw/semanticchemistry
    TRIVIAL_NAME("CHEMINF_000109", "trivial name", "trivial_name"),
    DEPOSITORY_NAME("CHEMINF_000339", "depositor-supplied name", "depositor_supplied_name"),
    IUPAC_NAME("CHEMINF_000382", "IUPAC name (LexiChem)", "IUPAC_name"),
    DRUG_BANK_ID("CHEMINF_000406", "DrugBank ID", "drugbank_id"),
    CHEBI_ID("CHEMINF_000407", "ChEBI ID", "ChEBI_id"),
    KEGG_ID("CHEMINF_000409", "KEGG ID", "KEGG_ID"),
    CHEMBL_ID("CHEMINF_000412", "ChEMBL ID", "ChEMBL_id"),
    CAS_REGISTRY_NUMBER("CHEMINF_000446", "CAS registry number", "cas_number"),
    EC_NUMBER("CHEMINF_000447", "EC number", "ec_number"),
    VALIDATED_CHEM_DB_ID("CHEMINF_000467", "Validated chemical database ID", "chem_db_id"),
    DRUG_TRADE_NAME("CHEMINF_000561", "Drug trade name", "trade_name"),
    INTL_NONPROPRIETARY_NAME("CHEMINF_000562", "International non-proprietary name", "non_proprietary_name"),
    UNIQUE_INGREDIENT_ID("CHEMINF_000563", "Unique ingredient ID", "unique_ingredient_id"),
    LIPID_MAPS_ID("CHEMINF_000564", "LipidMaps ID", "lipidmaps_id"),
    NSC_NUMBER("CHEMINF_000565", "National Service Center number", "nsc_number"),
    RTECS_ID("CHEMINF_000566", "RTECS ID", "RTECS_id"),
    UNKNOWN("NO_ID", "Unknown", "unknown")
    ;

    private static final Map<String, PC_SYNONYM_TYPES> CHEMINF_TO_TYPE = new HashMap<String, PC_SYNONYM_TYPES>() {{
      for (PC_SYNONYM_TYPES type : PC_SYNONYM_TYPES.values()) {
        put(type.getCheminfId(), type);
      }
    }};

    private static final Map<String, PC_SYNONYM_TYPES> JSON_LABEL_TO_TYPE = new HashMap<String, PC_SYNONYM_TYPES>() {{
      for (PC_SYNONYM_TYPES type : PC_SYNONYM_TYPES.values()) {
        put(type.getJsonLabel(), type);
      }
    }};

    public static PC_SYNONYM_TYPES getByCheminfId(String cheminfId) {
      return CHEMINF_TO_TYPE.getOrDefault(cheminfId, UNKNOWN);
    }

    @JsonCreator
    public static PC_SYNONYM_TYPES getByJsonLabel(String jsonLabel) {
      return JSON_LABEL_TO_TYPE.getOrDefault(jsonLabel, UNKNOWN);
    }

    String cheminfId;
    String label;
    String jsonLabel;

    PC_SYNONYM_TYPES(String cheminfId, String label, String jsonLabel) {
      this.cheminfId = cheminfId;
      this.label = label;
      this.jsonLabel = jsonLabel;
    }

    public String getCheminfId() {
      return cheminfId;
    }

    public String getLabel() {
      return label;
    }

    @JsonValue
    public String getJsonLabel() {
      return jsonLabel;
    }
  }

  private static class PCRDFHandler extends AbstractRDFHandler {
    public static final Double MS_PER_S = 1000.0;

    /* The Pubchem RDF corpus represents all subjects as SimpleIRIs, but objects can be IRIs or literals. Let the child
     * class decide which one it wants to handle. */
    enum OBJECT_TYPE {
      IRI,
      LITERAL,
      ;
    }

    private RocksDB db;
    private COLUMN_FAMILIES columnFamily;
    private ColumnFamilyHandle cfh;
    // Filter out RDF types (based on namespace) that we don't recognize or don't want to process.
    PC_RDF_DATA_TYPES keyType, valueType;
    boolean reverseSubjectAndObject;
    /* This is a super janky way to map synonym types to their enum values in the index. Would be better done with a
     * subclass, but we'll leave that for a refactoring once we get this working. */
    Function<String, String> valueTransformer = null;
    DateTime startTime;
    // Is the RDF parser single threaded? We don't know, so use an atomic counter to be safe.
    AtomicLong numProcessed = new AtomicLong(0);
    // Store unrecognized namespaces so we only log once per RDF file, rather than once per entry (which is a lot).
    Set<String> seenUnrecognizedSubjectNamespaces = new HashSet<>();
    Set<String> seenUnrecognizedObjectNamespaces = new HashSet<>();

    PCRDFHandler(Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles, COLUMN_FAMILIES columnFamily,
                 PC_RDF_DATA_TYPES keyType, PC_RDF_DATA_TYPES valueType, boolean reverseSubjectAndObject,
                 Function<String, String> valueTransformer) {
      this.db = dbAndHandles.getLeft();
      this.columnFamily = columnFamily;
      this.cfh = dbAndHandles.getRight().get(columnFamily);
      this.keyType = keyType;
      this.valueType = valueType;
      this.reverseSubjectAndObject = reverseSubjectAndObject;
      this.valueTransformer = valueTransformer;
    }

    @Override
    public void startRDF() throws RDFHandlerException {
      super.startRDF();
      startTime = new DateTime().withZone(DateTimeZone.UTC);
    }

    @Override
    public void endRDF() throws RDFHandlerException {
      super.endRDF();
      DateTime endTime = new DateTime().withZone(DateTimeZone.UTC);
      Long runtimeInMillis = endTime.getMillis() - startTime.getMillis();
      Long numProcessedVal = numProcessed.get();
      LOGGER.info("PCRDFHandler reached end of RDF with %d events in %.3fs, at %.3f ms per event",
          numProcessedVal,
          runtimeInMillis.floatValue() / MS_PER_S,
          runtimeInMillis.doubleValue() / numProcessedVal.doubleValue()
      );
      try {
        db.flush(new FlushOptions().setWaitForFlush(true));
      } catch (RocksDBException e) {
        LOGGER.error("Caught RocksDB exception when flushing after completing RDF processing: %s", e.getMessage());
        throw new RDFHandlerException(e);
      }
    }

    @Override
    public void handleStatement(Statement st) {
      if (!(st.getSubject() instanceof SimpleIRI)) {
        // If we can't even recognize the type of the subject, something is very wrong.
        String msg = String.format("Unknown type of subject: %s", st.getSubject().getClass().getCanonicalName());
        LOGGER.error(msg);
        throw new RuntimeException(msg);
      }
      SimpleIRI subjectIRI = (SimpleIRI) st.getSubject();
      // Filter out keys in namespaces we're not interested in.
      if (!(keyType.getUrlOrDatatypeName().equals(subjectIRI.getNamespace()))) {
        // If we don't recognize the namespace of the subject, then we probably can't handle this triple.
        if (!seenUnrecognizedSubjectNamespaces.contains(subjectIRI.getNamespace())) {
          seenUnrecognizedSubjectNamespaces.add(subjectIRI.getNamespace());
          LOGGER.warn("Unrecognized subject namespace: %s\n", subjectIRI.getNamespace());
        }
        return;
      }

      String subject = subjectIRI.getLocalName();

      String object = null;
      // Let the configured value type tell us what kind of object (IRI or literal) to expect.
      if (this.valueType.getValueObjectType() == OBJECT_TYPE.IRI && st.getObject() instanceof SimpleIRI) {
        SimpleIRI objectIRI = (SimpleIRI) st.getObject();
        if (!valueType.getUrlOrDatatypeName().equals(objectIRI.getNamespace())) {
          // If we don't recognize the namespace of the object, then we probably can't handle this triple.
          if (!seenUnrecognizedObjectNamespaces.contains(objectIRI.getNamespace())) {
            seenUnrecognizedObjectNamespaces.add(objectIRI.getNamespace());
            LOGGER.warn("Unrecognized object namespace: %s\n", objectIRI.getNamespace());
          }
          return;
        }
        object = objectIRI.getLocalName();
      } else if (this.valueType.getValueObjectType() == OBJECT_TYPE.LITERAL &&
          st.getObject() instanceof SimpleLiteral) {
        SimpleLiteral objectLiteral = (SimpleLiteral) st.getObject();
        IRI datatype = objectLiteral.getDatatype();
        if (!valueType.getUrlOrDatatypeName().equals(datatype.getLocalName())) {
          // We're only expecting string values where we find literals.
          if (!seenUnrecognizedObjectNamespaces.contains(datatype.getLocalName())) {
            seenUnrecognizedObjectNamespaces.add(datatype.getLocalName());
            LOGGER.warn("Unrecognized simple literal datatype: %s\n", datatype.getLocalName());
          }
          return;
        }
        object = objectLiteral.getLabel();
      } else {
        String msg = String.format("Unknown type of object: %s", st.getObject().getClass().getCanonicalName());
        LOGGER.error(msg);
        throw new RuntimeException(msg);
      }

      /* I considered modeling this decision using subclasses, but it made the configuration too much of a pain. Maybe
       * we'll do something clever the next time this code needs modification... */
      Pair<String, String> kvPair;
      if (reverseSubjectAndObject) {
        // If the keys, like PC ids, are on the right, we need to swap them around before storing.
        kvPair = Pair.of(object, subject);
      } else {
        kvPair = Pair.of(subject, object);
      }

      if (valueTransformer != null) {
        kvPair = Pair.of(kvPair.getKey(), valueTransformer.apply(kvPair.getValue()));
      }

      // Store the key and value in the appropriate column family.
      appendValueToList(db, cfh, kvPair.getKey(), kvPair.getValue());
      numProcessed.incrementAndGet();
    }

    private void appendValueToList(RocksDB db, ColumnFamilyHandle cfh, String key, String val) {
      StringBuffer buffer = new StringBuffer();
      List<String> storedObjects = null;
      byte[] keyBytes = key.getBytes(UTF8);
      // TODO: pull this out into a helper class or interface. Alas, we must extend AbstractRDFHandler.
      try {
        if (db.keyMayExist(cfh, keyBytes, buffer)) {
          byte[] existingVal = db.get(cfh, keyBytes);
          if (existingVal != null) {
            ObjectInputStream oi = new ObjectInputStream(new ByteArrayInputStream(existingVal));
            storedObjects = (ArrayList<String>) oi.readObject(); // Note: assumes all values are lists.
            /* Once upon a time I had a constraint here that crashed if we expected unique keys. This was mainly to
             * guard against hypothetical synonym hash collisions. What ends up happening, however, is that Pubchem
             * stores multiple values of one hash with different normalizations (like all uppercase or all lowercase)
             * meaning there *will* be multiple values with the same hash, but these values will all be valid.
             * Instead we just ignore potential hash collisions and assume that any "collisions" are intentional. */
          } else {
            storedObjects = new ArrayList<>(1);
          }
        } else {
          storedObjects = new ArrayList<>(1);
        }
        storedObjects.add(val);

        try (ByteArrayOutputStream bos = new ByteArrayOutputStream();
             ObjectOutputStream oo = new ObjectOutputStream(bos)) {
          oo.writeObject(storedObjects);
          oo.flush();
          db.put(cfh, new WriteOptions(), keyBytes, bos.toByteArray());
        }
      } catch (RocksDBException e) {
        LOGGER.error("Caught unexpected RocksDBException: %s", e.getMessage());
        throw new RuntimeException(e);
      } catch (IOException e) {
        LOGGER.error("Caught unexpected IOException: %s", e.getMessage());
        throw new RuntimeException(e);
      } catch (ClassNotFoundException e) {
        LOGGER.error("Caught unexpected ClassNotFoundException: %s", e.getMessage());
        throw new RuntimeException(e);
      }
    }
  }

  public static void main(String[] args) throws Exception {
    org.apache.commons.cli.Options opts = new org.apache.commons.cli.Options();
    for (Option.Builder b : OPTION_BUILDERS) {
      opts.addOption(b.build());
    }

    CommandLine cl = null;
    try {
      CommandLineParser parser = new DefaultParser();
      cl = parser.parse(opts, args);
    } catch (ParseException e) {
      System.err.format("Argument parsing failed: %s\n", e.getMessage());
      HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      System.exit(1);
    }

    if (cl.hasOption("help")) {
      HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      return;
    }

    PubchemTTLMerger merger = new PubchemTTLMerger();

    File rocksDBFile = new File(cl.getOptionValue(OPTION_INDEX_PATH));

    if (cl.hasOption(OPTION_ONLY_MERGE)) {
      if (!(rocksDBFile.exists() && rocksDBFile.isDirectory())) {
        System.err.format("Must specify an existing RocksDB index when using '%s'.\n", OPTION_ONLY_MERGE);
        HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
        System.exit(1);
      }
      merger.finish(merger.merge(rocksDBFile));
      return;
    }

    File rdfDir = new File(cl.getOptionValue(OPTION_RDF_DIRECTORY));
    if (!rdfDir.isDirectory()) {
      System.err.format("Must specify a directory of RDF files to be parsed.\n");
      HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      System.exit(1);
    }

    File[] filesInDirectoryArray = rdfDir.listFiles(new FilenameFilter() {
      private static final String TTL_GZ_SUFFIX = ".ttl.gz";

      @Override
      public boolean accept(File dir, String name) {
        return name.endsWith(TTL_GZ_SUFFIX);
      }
    });

    if (filesInDirectoryArray == null || filesInDirectoryArray.length == 0) {
      System.err.format("Found zero compressed TTL files in directory at '%s'.\n", rdfDir.getAbsolutePath());
      HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      System.exit(1);
    }

    // Sort files for stability/sanity.
    List<File> filesInDirectory = Arrays.asList(filesInDirectoryArray);
    Collections.sort(filesInDirectory);

    if (cl.hasOption(OPTION_ONLY_SYNONYMS)) {
      filesInDirectory = filterByFileContents(filesInDirectory, PC_RDF_DATA_FILE_CONFIG.HASH_TO_SYNONYM);
    }
    if (cl.hasOption(OPTION_ONLY_MESH)) {
      filesInDirectory = filterByFileContents(filesInDirectory, PC_RDF_DATA_FILE_CONFIG.HASH_TO_MESH);
    }
    if (cl.hasOption(OPTION_ONLY_PUBCHEM_IDS)) {
      filesInDirectory = filterByFileContents(filesInDirectory, PC_RDF_DATA_FILE_CONFIG.HASH_TO_CID);
    }

    if (filesInDirectory.size() == 0) {
      System.err.format("Arrived at index initialization with no files to process. " +
              "Maybe too many filters were specified? synonyms: %s, MeSH: %s, Pubchem ids: %s\n",
          cl.hasOption(OPTION_ONLY_SYNONYMS), cl.hasOption(OPTION_ONLY_MESH), cl.hasOption(OPTION_ONLY_PUBCHEM_IDS));
      HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      System.exit(1);
    }

    RocksDB.loadLibrary();
    Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles = null;
    try {
      if (rocksDBFile.exists()) {
        if (!cl.hasOption(OPTION_OPEN_EXISTING_OKAY)) {
          System.err.format(
              "Index directory at '%s' already exists, delete before retrying or add '%s' option to reuse.\n",
              rocksDBFile.getAbsolutePath(), OPTION_OPEN_EXISTING_OKAY);
          HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
          System.exit(1);
        } else {
          LOGGER.info("Reusing existing index at %s", rocksDBFile.getAbsolutePath());
          dbAndHandles = openExistingRocksDB(rocksDBFile);
        }
      } else {
        LOGGER.info("Creating new index at %s", rocksDBFile.getAbsolutePath());
        dbAndHandles = createNewRocksDB(rocksDBFile);
      }
      merger.buildIndex(dbAndHandles, filesInDirectory);
      merger.merge(dbAndHandles);
    } finally {
      if (dbAndHandles != null) {
        merger.finish(dbAndHandles);
      }
    }
  }

  protected static List<File> filterByFileContents(List<File> files, PC_RDF_DATA_FILE_CONFIG fileConfig) {
    return files.stream().filter(x -> x.getName().startsWith(fileConfig.filePrefix)).collect(Collectors.toList());
  }

  protected static Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> createNewRocksDB(File pathToIndex)
      throws RocksDBException {
    RocksDB db = null; // Not auto-closable.
    Map<COLUMN_FAMILIES, ColumnFamilyHandle> columnFamilyHandles = new HashMap<>();

    Options options = ROCKS_DB_CREATE_OPTIONS;
    System.out.println("Opening index at " + pathToIndex.getAbsolutePath());
    db = RocksDB.open(options, pathToIndex.getAbsolutePath());

    for (COLUMN_FAMILIES cf : COLUMN_FAMILIES.values()) {
      LOGGER.info("Creating column family %s", cf.getName());
      ColumnFamilyHandle cfh = db.createColumnFamily(new ColumnFamilyDescriptor(cf.getName().getBytes(UTF8)));
      columnFamilyHandles.put(cf, cfh);
    }

    return Pair.of(db, columnFamilyHandles);
  }

  /**
   * Open an existing RocksDB index. Use this after successful index generation to access the map of Pubchem compound
   * ids to synonyms/MeSH ids using the column family CID_TO_SYNONYMS.
   * @param pathToIndex A path to the RocksDB index directory to use.
   * @return A pair of the opened DB and a map from column families to their handles.
   * @throws RocksDBException If the DB cannot be opened or its column families cannot be read.
   */
  public static Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> openExistingRocksDB(File pathToIndex)
      throws RocksDBException {
    List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>(COLUMN_FAMILIES.values().length + 1);
    // Must also open the "default" family or RocksDB will probably choke.
    columnFamilyDescriptors.add(new ColumnFamilyDescriptor(DEFAULT_ROCKSDB_COLUMN_FAMILY.getBytes()));
    for (COLUMN_FAMILIES family : COLUMN_FAMILIES.values()) {
      columnFamilyDescriptors.add(new ColumnFamilyDescriptor(family.getName().getBytes()));
    }
    List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>(columnFamilyDescriptors.size());

    DBOptions dbOptions = ROCKS_DB_OPEN_OPTIONS;
    dbOptions.setCreateIfMissing(false);
    RocksDB rocksDB = RocksDB.open(dbOptions, pathToIndex.getAbsolutePath(),
        columnFamilyDescriptors, columnFamilyHandles);
    Map<COLUMN_FAMILIES, ColumnFamilyHandle> columnFamilyHandleMap = new HashMap<>(COLUMN_FAMILIES.values().length);
    // TODO: can we zip these together more easily w/ Java 8?
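    /* One possible answer to the TODO above (an untested sketch, kept as a comment): zip by index with an IntStream,
     * e.g.
     *   IntStream.range(0, columnFamilyDescriptors.size()).forEach(i -> {
     *     String n = new String(columnFamilyDescriptors.get(i).columnFamilyName(), UTF8);
     *     COLUMN_FAMILIES f = COLUMN_FAMILIES.getFamilyByName(n);
     *     if (f != null) { columnFamilyHandleMap.put(f, columnFamilyHandles.get(i)); }
     *   });
     * This would need an extra java.util.stream.IntStream import and silently drops the "default" family, so the
     * explicit loop below (which crashes loudly on truly unexpected family names) is kept as-is. */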
    for (int i = 0; i < columnFamilyDescriptors.size(); i++) {
      ColumnFamilyDescriptor cfd = columnFamilyDescriptors.get(i);
      ColumnFamilyHandle cfh = columnFamilyHandles.get(i);
      String familyName = new String(cfd.columnFamilyName(), UTF8);
      COLUMN_FAMILIES descriptorFamily = COLUMN_FAMILIES.getFamilyByName(familyName);
      if (descriptorFamily == null) {
        if (!DEFAULT_ROCKSDB_COLUMN_FAMILY.equals(familyName)) {
          String msg = String.format("Found unexpected family name '%s' when trying to open RocksDB at %s",
              familyName, pathToIndex.getAbsolutePath());
          LOGGER.error(msg);
          // Crash if we don't recognize the contents of this DB.
          throw new RuntimeException(msg);
        }
        // Just skip this column family if it doesn't map to something we know but is expected.
        continue;
      }

      columnFamilyHandleMap.put(descriptorFamily, cfh);
    }

    return Pair.of(rocksDB, columnFamilyHandleMap);
  }

  protected Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> merge(File pathToRocksDB)
      throws RocksDBException, IOException, ClassNotFoundException {
    Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles = openExistingRocksDB(pathToRocksDB);
    merge(dbAndHandles);
    return dbAndHandles;
  }

  protected void merge(Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles)
      throws RocksDBException, IOException, ClassNotFoundException {
    LOGGER.info("Beginning merge on Pubchem CID");
    RocksDB db = dbAndHandles.getLeft();
    ColumnFamilyHandle pubchemIdCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.CID_TO_HASHES);
    ColumnFamilyHandle meshCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.HASH_TO_MESH);
    ColumnFamilyHandle synonymCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.HASH_TO_SYNONYMS);
    ColumnFamilyHandle synonymTypeCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.HASH_TO_SYNONYM_TYPE);
    ColumnFamilyHandle mergeResultsCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.CID_TO_SYNONYMS);

    RocksIterator cidIterator = db.newIterator(pubchemIdCFH);
    // With help from https://github.com/facebook/rocksdb/wiki/Basic-Operations
    int processed = 0;
    for (cidIterator.seekToFirst(); cidIterator.isValid(); cidIterator.next()) {
      byte[] key = cidIterator.key();
      byte[] val = cidIterator.value();
      String pubchemId = new String(key, UTF8);
      List<String> hashes;
      try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(val))) {
        // We know all our values so far have been lists of strings, so this should be completely safe.
        hashes = (List<String>) ois.readObject();
      }

      PubchemSynonyms pubchemSynonyms = new PubchemSynonyms(pubchemId);

      /* The hash keys are based on synonym value, which we can manually compute with:
       *   $ echo -n 'dimethyltin(iv)' | md5
       * This means that MeSH ids are linked to synonyms rather than pubchem ids. We need to look up each cid-linked
       * hash in both the MeSH and synonym collections, as the key may legitimately exist in both (and serve to link
       * cid to synonym and cid to MeSH). */
      for (String hash : hashes) {
        /* Note: these ids are not proper MeSH topic ids, but are internal MeSH ids found in the RDF and TTL
         * representations of the MeSH corpus. You can find them in the MeSH .nt or .xml files, but they won't turn up
         * anything on the MeSH website. */
        List<String> meshIds = getValueAsObject(db, meshCFH, hash);
        if (meshIds != null) {
          pubchemSynonyms.addMeSHIds(meshIds);
        }

        List<String> synonyms = getValueAsObject(db, synonymCFH, hash);
        // There are, surprisingly, some dangling hashes in the DB! Handle them gracefully.
        if (synonyms == null) {
          LOGGER.warn("Dangling synonym hash reference, adding empty list in place of value: cid = %s, hash = %s",
              pubchemId, hash);
          synonyms = Collections.emptyList();
        }

        List<String> synonymTypeStrings = getValueAsObject(db, synonymTypeCFH, hash);
        Set<PC_SYNONYM_TYPES> synonymTypes = DEFAULT_SYNONYM_DATA_TYPES;
        if (synonymTypeStrings != null) {
          synonymTypes = synonymTypeStrings.stream().map(PC_SYNONYM_TYPES::valueOf).collect(Collectors.toSet());
        }

        if (synonymTypes.size() == 0) {
          LOGGER.warn("Found zero synonym types for synonym, defaulting to %s: %s %s, synonyms = %s",
              PC_SYNONYM_TYPES.UNKNOWN.name(), pubchemId, hash, StringUtils.join(synonyms, ", "));
        }
        /* It turns out that *lots* of synonyms are duplicated as depositor supplied names, so don't complain about it
         * here. For performance sake we might want to consider changing the data model of PubchemSynonyms to reduce
         * synonym string duplication, as the current model is pretty inefficient. */
        for (PC_SYNONYM_TYPES synonymType : synonymTypes) {
          for (String synonym : synonyms) {
            // Let the PubchemSynonyms object do the de-duplication for us rather than reducing `synonyms` to a Set.
            pubchemSynonyms.addSynonym(synonymType, synonym);
          }
        }
      }

      try (ByteArrayOutputStream bos = new ByteArrayOutputStream();
           ObjectOutputStream oo = new ObjectOutputStream(bos)) {
        oo.writeObject(pubchemSynonyms);
        oo.flush();
        db.put(mergeResultsCFH, key, bos.toByteArray());
      }

      processed++;
      if (processed % 100000 == 0) {
        LOGGER.info("Merged %d entries on Pubchem compound id", processed);
      }
    }
    LOGGER.info("Merge complete, %d entries processed", processed);
  }

  protected <T> T getValueAsObject(RocksDB db, ColumnFamilyHandle cfh, String key)
      throws RocksDBException, ClassNotFoundException, IOException {
    StringBuffer stringBuffer = new StringBuffer();
    T val = null;
    /* Check for existence before fetching. IIRC doing otherwise might cause segfaults in the RocksDB JNI wrapper.
     * Or it might just be faster thanks to the DB's bloom filter. */
    if (db.keyMayExist(cfh, key.getBytes(), stringBuffer)) {
      byte[] valBytes = db.get(cfh, key.getBytes());
      // Make sure that the key actually exists (beware the "May" in keyMayExist).
      if (valBytes != null) {
        try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(valBytes))) {
          val = (T) ois.readObject();
        }
      }
    }
    return val;
  }

  protected void buildIndex(Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles, List<File> rdfFiles)
      throws RocksDBException, ClassNotFoundException, IOException {
    LOGGER.info("Building RocksDB index of data in RDF files");
    RDFParser parser = Rio.createParser(RDFFormat.TURTLE);
    LOGGER.info("Processing %d RDF files", rdfFiles.size());
    for (File rdfFile : rdfFiles) {
      LOGGER.info("Processing file %s", rdfFile.getAbsolutePath());
      AbstractRDFHandler handler = PC_RDF_DATA_FILE_CONFIG.makeHandlerForDataFile(dbAndHandles, rdfFile);
      if (handler == null) {
        LOGGER.info("Skipping file without defined handler: %s", rdfFile.getAbsolutePath());
        continue;
      }

      parser.setRDFHandler(handler);
      parser.parse(new GZIPInputStream(new FileInputStream(rdfFile)), "");
      LOGGER.info("Successfully parsed file at %s", rdfFile.getAbsolutePath());
    }
    LOGGER.info("Done processing RDF files");
  }

  protected void finish(Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles) {
    LOGGER.info("Closing DB to complete merge.");
    dbAndHandles.getLeft().close();
  }
}
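
/* Usage sketch (a hedged example, not part of the original file): after `main` has built and merged an index, a
 * downstream consumer can read the merged PubchemSynonyms objects back out of the CID_TO_SYNONYMS column family,
 * mirroring the deserialization pattern used in merge()/getValueAsObject() above. The path and the `cid` key below
 * are illustrative placeholders; the key format follows whatever the compound IRI's local name was at indexing time.
 *
 *   Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles =
 *       PubchemTTLMerger.openExistingRocksDB(new File("/path/to/index"));
 *   ColumnFamilyHandle cfh = dbAndHandles.getRight().get(PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_SYNONYMS);
 *   byte[] bytes = dbAndHandles.getLeft().get(cfh, cid.getBytes(StandardCharsets.UTF_8));
 *   if (bytes != null) {
 *     try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
 *       PubchemSynonyms synonyms = (PubchemSynonyms) ois.readObject();
 *       // ... inspect synonyms ...
 *     }
 *   }
 *   dbAndHandles.getLeft().close();
 */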