/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program. If not, see <http://www.gnu.org/licenses/>.  *
*                                                                        *
*************************************************************************/
package act.installer.pubchem;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.impl.SimpleIRI;
import org.eclipse.rdf4j.model.impl.SimpleLiteral;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFHandler;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.rocksdb.ColumnFamilyDescriptor;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.CompressionType;
import org.rocksdb.DBOptions;
import org.rocksdb.FlushOptions;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.RocksIterator;
import org.rocksdb.WriteOptions;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
/**
 * This class implements a parser for Pubchem's TTL (Turtle) files. These contain both the features available in
 * the full Pubchem compound corpus and other features (like synonyms and MeSH ids) that are not available in that
 * dataset.
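 *
 * A typical invocation (a sketch; the jar name and paths are hypothetical) builds the on-disk index from a
 * directory of compressed TTL files and then merges the intermediate results on Pubchem compound id:
 * <pre>{@code
 * java -cp uber-act.jar act.installer.pubchem.PubchemTTLMerger -x /path/to/new/index -d /path/to/pubchem/rdf
 * }</pre>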
*/
public class PubchemTTLMerger {
private static final Logger LOGGER = LogManager.getFormatterLogger(PubchemTTLMerger.class);
private static final Charset UTF8 = StandardCharsets.UTF_8;
private static final Set<PC_SYNONYM_TYPES> DEFAULT_SYNONYM_DATA_TYPES =
Collections.unmodifiableSet(Collections.singleton(PC_SYNONYM_TYPES.UNKNOWN));
private static final String DEFAULT_ROCKSDB_COLUMN_FAMILY = "default";
  // RocksDB uses an Options object when creating a new DB, but a DBOptions object when opening an existing one.
private static final Options ROCKS_DB_CREATE_OPTIONS = new Options()
.setCreateIfMissing(true)
.setDisableDataSync(true)
.setAllowMmapReads(true) // Trying all sorts of performance tweaking knobs, which are not well documented. :(
.setAllowMmapWrites(true)
.setWriteBufferSize(1 << 27)
.setArenaBlockSize(1 << 20)
.setCompressionType(CompressionType.SNAPPY_COMPRESSION) // Will hopefully trade CPU for I/O.
;
public static final DBOptions ROCKS_DB_OPEN_OPTIONS = new DBOptions()
.setCreateIfMissing(false)
.setDisableDataSync(true)
.setAllowMmapReads(true)
.setAllowMmapWrites(true)
;
public static final String OPTION_INDEX_PATH = "x";
public static final String OPTION_RDF_DIRECTORY = "d";
public static final String OPTION_ONLY_SYNONYMS = "s";
public static final String OPTION_ONLY_MESH = "m";
public static final String OPTION_ONLY_PUBCHEM_IDS = "p";
public static final String OPTION_ONLY_MERGE = "g";
public static final String OPTION_OPEN_EXISTING_OKAY = "e";
public static final String HELP_MESSAGE = StringUtils.join(new String[]{
"This class extracts Pubchem synonym data from RDF files into an on-disk index, then uses that index to join ",
"the synonyms and MeSH ids with their corresponding pubchem ids."
}, "");
public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
add(Option.builder(OPTION_INDEX_PATH)
.argName("index path")
.desc("A path to the directory where the on-disk index will be stored; must not already exist")
.hasArg().required()
.longOpt("index")
);
add(Option.builder(OPTION_RDF_DIRECTORY)
.argName("RDF directory")
.desc("A path to the directory of Pubchem RDF files")
.hasArg()
.longOpt("dir")
);
add(Option.builder(OPTION_ONLY_SYNONYMS)
.desc(String.format("If set, only '%s' files will be processed, useful for debugging",
PC_RDF_DATA_FILE_CONFIG.HASH_TO_SYNONYM.filePrefix))
.longOpt("only-synonyms")
);
add(Option.builder(OPTION_ONLY_MESH)
.desc(String.format("If set, only '%s' files will be processed, useful for debugging",
PC_RDF_DATA_FILE_CONFIG.HASH_TO_MESH.filePrefix))
.longOpt("only-mesh")
);
add(Option.builder(OPTION_ONLY_PUBCHEM_IDS)
.desc(String.format("If set, only '%s' files will be processed, useful for debugging",
PC_RDF_DATA_FILE_CONFIG.HASH_TO_CID.filePrefix))
.longOpt("only-pubchem-id")
);
add(Option.builder(OPTION_ONLY_MERGE)
.desc("If set, only merge on Pubchem id, assuming other columns are populated")
.longOpt("only-merge")
);
add(Option.builder(OPTION_OPEN_EXISTING_OKAY)
.desc("Use an existing index directory. By default, indexes must be created in one shot.")
.longOpt("use-existing")
);
add(Option.builder("h")
.argName("help")
.desc("Prints this help message")
.longOpt("help")
);
}};
public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();
static {
HELP_FORMATTER.setWidth(100);
}
public PubchemTTLMerger() {
}
private enum PC_RDF_DATA_FILE_CONFIG {
HASH_TO_SYNONYM("pc_synonym_value", COLUMN_FAMILIES.HASH_TO_SYNONYMS,
PC_RDF_DATA_TYPES.SYNONYM, PC_RDF_DATA_TYPES.LITERAL, false, null),
HASH_TO_CID("pc_synonym2compound", COLUMN_FAMILIES.CID_TO_HASHES,
PC_RDF_DATA_TYPES.SYNONYM, PC_RDF_DATA_TYPES.COMPOUND, true, null),
HASH_TO_MESH("pc_synonym_topic", COLUMN_FAMILIES.HASH_TO_MESH,
PC_RDF_DATA_TYPES.SYNONYM, PC_RDF_DATA_TYPES.MeSH, false, null),
HASH_TO_SYNONYM_TYPE("pc_synonym_type", COLUMN_FAMILIES.HASH_TO_SYNONYM_TYPE,
PC_RDF_DATA_TYPES.SYNONYM, PC_RDF_DATA_TYPES.SIO, false,
(String x) -> PC_SYNONYM_TYPES.getByCheminfId(x).name()), // Map CHEMINF values to synonym type designators.
;
private String filePrefix;
private COLUMN_FAMILIES columnFamily;
private PC_RDF_DATA_TYPES keyType;
private PC_RDF_DATA_TYPES valType;
private boolean reverseSubjectAndObject;
private Function<String, String> valueTransformer;
PC_RDF_DATA_FILE_CONFIG(String filePrefix, COLUMN_FAMILIES columnFamily,
PC_RDF_DATA_TYPES keyType, PC_RDF_DATA_TYPES valType,
boolean reverseSubjectAndObject, Function<String, String> valueTransformer) {
this.filePrefix = filePrefix;
this.columnFamily = columnFamily;
this.keyType = keyType;
this.valType = valType;
this.reverseSubjectAndObject = reverseSubjectAndObject;
this.valueTransformer = valueTransformer;
}
public static PC_RDF_DATA_FILE_CONFIG getDataTypeForFile(File file) {
String name = file.getName();
for (PC_RDF_DATA_FILE_CONFIG t : PC_RDF_DATA_FILE_CONFIG.values()) {
if (name.startsWith(t.filePrefix)) {
return t;
}
}
return null;
}
public static AbstractRDFHandler makeHandlerForDataFile(
Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles, File file) {
PC_RDF_DATA_FILE_CONFIG config = getDataTypeForFile(file);
if (config == null) {
LOGGER.info("No handler config found for file %s", file.getAbsolutePath());
return null;
}
LOGGER.info("Selected handler type %s for file %s", config.name(), file.getName());
return new PCRDFHandler(
dbAndHandles,
config.columnFamily,
config.keyType,
config.valType,
config.reverseSubjectAndObject,
config.valueTransformer
);
}
}
/**
* Each triple in the RDF files takes the form:
   * <pre>[subject namespace]:[subject value] [predicate namespace]:[predicate value] [object namespace]:[object value] .</pre>
* Some of the files contain multiple types of values, only some of which we want to store. For example, the
* `topics` file contains both MeSH ids and "concepts" (I'm not sure what the latter actually represents). We can
* identify the MeSH ids based on their namespace and throw everything else away.
*
* Additionally, rdf4j represents different types of values with different Java objects. IRI stands for
* "internationalized resource identifier" (https://www.w3.org/TR/rdf11-concepts/#dfn-iri), and acts as a pointer
* or identifier in the PC synonym corpus. Synonym string values are modeled as literals, which have some sort of
* label in some language (we ignore the language for now).
*
* This enum is a map of the useful namespaces and associated rdf4j model types to the parts of the synonym corpus
* we want to extract. Check out their use in PC_RDF_DATA_FILE_CONFIG to see how these are mapped to the
* subjects and objects of different files in the synonym corpus.
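   *
   * Schematically, a statement from the synonym-to-compound file looks roughly like the following (the local names
   * shown here are illustrative, not actual corpus entries):
   * <pre>
   * synonym:MD5_[hash of synonym] sio:is-attribute-of compound:CID[pubchem compound id] .
   * </pre>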
*/
private enum PC_RDF_DATA_TYPES {
SYNONYM("http://rdf.ncbi.nlm.nih.gov/pubchem/synonym/", PCRDFHandler.OBJECT_TYPE.IRI),
MeSH("http://id.nlm.nih.gov/mesh/", PCRDFHandler.OBJECT_TYPE.IRI),
COMPOUND("http://rdf.ncbi.nlm.nih.gov/pubchem/compound/", PCRDFHandler.OBJECT_TYPE.IRI),
LITERAL("langString", PCRDFHandler.OBJECT_TYPE.LITERAL),
SIO("http://semanticscience.org/resource/", PCRDFHandler.OBJECT_TYPE.IRI),
;
private String urlOrDatatypeName;
/* We only expect one kind of RDF value object at a time depending on the value's namespace, so constrain to that
* to allow proper dispatch within the handler. */
private PCRDFHandler.OBJECT_TYPE valueObjectType;
PC_RDF_DATA_TYPES(String urlOrDatatypeName, PCRDFHandler.OBJECT_TYPE valueObjectType) {
this.urlOrDatatypeName = urlOrDatatypeName;
this.valueObjectType = valueObjectType;
}
public String getUrlOrDatatypeName() {
return this.urlOrDatatypeName;
}
public PCRDFHandler.OBJECT_TYPE getValueObjectType() {
return this.valueObjectType;
}
}
public enum COLUMN_FAMILIES {
HASH_TO_SYNONYMS("hash_to_synonym"),
CID_TO_HASHES("cid_to_hashes"),
HASH_TO_MESH("hash_to_MeSH"),
CID_TO_SYNONYMS("cid_to_synonyms"),
HASH_TO_SYNONYM_TYPE("hash_to_synonym_type")
;
private static final Map<String, COLUMN_FAMILIES> NAME_MAPPING = Collections.unmodifiableMap(
new HashMap<String, COLUMN_FAMILIES>() {{
for (COLUMN_FAMILIES family : COLUMN_FAMILIES.values()) {
put(family.name, family);
}
}}
);
public static COLUMN_FAMILIES getFamilyByName(String name) {
return NAME_MAPPING.get(name);
}
private String name;
COLUMN_FAMILIES(String name) {
this.name = name;
}
public String getName() {
return this.name;
}
}
// Note: @JsonSerialize and @JsonDeserialize didn't work here, so I've used @JsonCreator and @JsonValue instead.
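  /* With @JsonValue/@JsonCreator, Jackson round-trips each constant through its jsonLabel. For example (a sketch,
   * assuming a stock ObjectMapper): writeValueAsString(PC_SYNONYM_TYPES.CHEBI_ID) produces "\"ChEBI_id\"", and
   * reading that string back invokes getByJsonLabel, which recovers CHEBI_ID. */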
public enum PC_SYNONYM_TYPES {
// Names derived from the Semantic Chemistry Ontology: https://github.com/egonw/semanticchemistry
TRIVIAL_NAME("CHEMINF_000109", "trivial name", "trivial_name"),
DEPOSITORY_NAME("CHEMINF_000339", "depositor-supplied name", "depositor_supplied_name"),
IUPAC_NAME("CHEMINF_000382", "IUPAC name (LexiChem)", "IUPAC_name"),
DRUG_BANK_ID("CHEMINF_000406", "DrugBank ID", "drugbank_id"),
CHEBI_ID("CHEMINF_000407", "ChEBI ID", "ChEBI_id"),
KEGG_ID("CHEMINF_000409", "KEGG ID", "KEGG_ID"),
CHEMBL_ID("CHEMINF_000412", "ChEMBL ID", "ChEMBL_id"),
CAS_REGISTRY_NUMBER("CHEMINF_000446", "CAS registry number", "cas_number"),
EC_NUMBER("CHEMINF_000447", "EC number", "ec_number"),
VALIDATED_CHEM_DB_ID("CHEMINF_000467", "Validated chemical database ID", "chem_db_id"),
DRUG_TRADE_NAME("CHEMINF_000561", "Drug trade name", "trade_name"),
INTL_NONPROPRIETARY_NAME("CHEMINF_000562", "International non-proprietary name", "non_proprietary_name"),
UNIQUE_INGREDIENT_ID("CHEMINF_000563", "Unique ingredient ID", "unique_ingredient_id"),
LIPID_MAPS_ID("CHEMINF_000564", "LipidMaps ID", "lipidmaps_id"),
NSC_NUMBER("CHEMINF_000565", "National Service Center number", "nsc_number"),
RTECS_ID("CHEMINF_000566", "RTECS ID", "RTECS_id"),
UNKNOWN("NO_ID", "Unknown", "unknown")
;
private static final Map<String, PC_SYNONYM_TYPES> CHEMINF_TO_TYPE = new HashMap<String, PC_SYNONYM_TYPES>() {{
for (PC_SYNONYM_TYPES type : PC_SYNONYM_TYPES.values()) {
put(type.getCheminfId(), type);
}
}};
private static final Map<String, PC_SYNONYM_TYPES> JSON_LABEL_TO_TYPE = new HashMap<String, PC_SYNONYM_TYPES>() {{
for (PC_SYNONYM_TYPES type : PC_SYNONYM_TYPES.values()) {
put(type.getJsonLabel(), type);
}
}};
public static PC_SYNONYM_TYPES getByCheminfId(String cheminfId) {
return CHEMINF_TO_TYPE.getOrDefault(cheminfId, UNKNOWN);
}
@JsonCreator
    public static PC_SYNONYM_TYPES getByJsonLabel(String jsonLabel) {
      return JSON_LABEL_TO_TYPE.getOrDefault(jsonLabel, UNKNOWN);
}
String cheminfId;
String label;
String jsonLabel;
PC_SYNONYM_TYPES(String cheminfId, String label, String jsonLabel) {
this.cheminfId = cheminfId;
this.label = label;
this.jsonLabel = jsonLabel;
}
public String getCheminfId() {
return cheminfId;
}
public String getLabel() {
return label;
}
@JsonValue
public String getJsonLabel() {
return jsonLabel;
}
}
private static class PCRDFHandler extends AbstractRDFHandler {
public static final Double MS_PER_S = 1000.0;
/* The Pubchem RDF corpus represents all subjects as SimpleIRIs, but objects can be IRIs or literals. Let the child
* class decide which one it wants to handle. */
enum OBJECT_TYPE {
IRI,
LITERAL,
;
}
private RocksDB db;
private COLUMN_FAMILIES columnFamily;
private ColumnFamilyHandle cfh;
// Filter out RDF types (based on namespace) that we don't recognize or don't want to process.
PC_RDF_DATA_TYPES keyType, valueType;
boolean reverseSubjectAndObject;
/* This is a super janky way to map synonym types to their enum values in the index. Would be better done with a
* subclass, but we'll leave that for a refactoring once we get this working. */
Function<String, String> valueTransformer = null;
DateTime startTime;
// Is the RDF parser single threaded? We don't know, so use an atomic counter to be safe.
AtomicLong numProcessed = new AtomicLong(0);
// Store unrecognized namespaces so we only log once per RDF file, rather than once per entry (which is a lot).
Set<String> seenUnrecognizedSubjectNamespaces = new HashSet<>();
Set<String> seenUnrecognizedObjectNamespaces = new HashSet<>();
PCRDFHandler(Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles, COLUMN_FAMILIES columnFamily,
PC_RDF_DATA_TYPES keyType, PC_RDF_DATA_TYPES valueType, boolean reverseSubjectAndObject,
Function<String, String> valueTransformer) {
this.db = dbAndHandles.getLeft();
this.columnFamily = columnFamily;
this.cfh = dbAndHandles.getRight().get(columnFamily);
this.keyType = keyType;
this.valueType = valueType;
this.reverseSubjectAndObject = reverseSubjectAndObject;
this.valueTransformer = valueTransformer;
}
@Override
public void startRDF() throws RDFHandlerException {
super.startRDF();
startTime = new DateTime().withZone(DateTimeZone.UTC);
}
@Override
public void endRDF() throws RDFHandlerException {
super.endRDF();
DateTime endTime = new DateTime().withZone(DateTimeZone.UTC);
      Long runtimeInMillis = endTime.getMillis() - startTime.getMillis();
      Long numProcessedVal = numProcessed.get();
      LOGGER.info("PCRDFHandler reached end of RDF with %d events in %.3fs, at %.3f ms per event",
          numProcessedVal,
          runtimeInMillis.floatValue() / MS_PER_S,
          runtimeInMillis.doubleValue() / numProcessedVal.doubleValue()
      );
try {
db.flush(new FlushOptions().setWaitForFlush(true));
} catch (RocksDBException e) {
LOGGER.error("Caught RocksDB exception when flushing after completing RDF processing: %s", e.getMessage());
throw new RDFHandlerException(e);
}
}
@Override
public void handleStatement(Statement st) {
if (!(st.getSubject() instanceof SimpleIRI)) {
// If we can't even recognize the type of the subject, something is very wrong.
String msg = String.format("Unknown type of subject: %s", st.getSubject().getClass().getCanonicalName());
LOGGER.error(msg);
throw new RuntimeException(msg);
}
SimpleIRI subjectIRI = (SimpleIRI) st.getSubject();
// Filter out keys in namespaces we're not interested in.
if (!(keyType.getUrlOrDatatypeName().equals(subjectIRI.getNamespace()))) {
// If we don't recognize the namespace of the subject, then we probably can't handle this triple.
if (!seenUnrecognizedSubjectNamespaces.contains(subjectIRI.getNamespace())) {
seenUnrecognizedSubjectNamespaces.add(subjectIRI.getNamespace());
LOGGER.warn("Unrecognized subject namespace: %s\n", subjectIRI.getNamespace());
}
return;
}
String subject = subjectIRI.getLocalName();
String object = null;
      // Let the file's configuration tell us what kind of object value to expect and extract.
if (this.valueType.getValueObjectType() == OBJECT_TYPE.IRI && st.getObject() instanceof SimpleIRI) {
SimpleIRI objectIRI = (SimpleIRI) st.getObject();
if (!valueType.getUrlOrDatatypeName().equals(objectIRI.getNamespace())) {
        // If we don't recognize the namespace of the object, then we probably can't handle this triple.
if (!seenUnrecognizedObjectNamespaces.contains(objectIRI.getNamespace())) {
seenUnrecognizedObjectNamespaces.add(objectIRI.getNamespace());
LOGGER.warn("Unrecognized object namespace: %s\n", objectIRI.getNamespace());
}
return;
}
object = objectIRI.getLocalName();
} else if (this.valueType.getValueObjectType() == OBJECT_TYPE.LITERAL &&
st.getObject() instanceof SimpleLiteral) {
SimpleLiteral objectLiteral = (SimpleLiteral) st.getObject();
IRI datatype = objectLiteral.getDatatype();
if (!valueType.getUrlOrDatatypeName().equals(datatype.getLocalName())) {
// We're only expecting string values where we find literals.
if (!seenUnrecognizedObjectNamespaces.contains(datatype.getLocalName())) {
seenUnrecognizedObjectNamespaces.add(datatype.getLocalName());
LOGGER.warn("Unrecognized simple literal datatype: %s\n", datatype.getLocalName());
}
return;
}
object = objectLiteral.getLabel();
} else {
String msg = String.format("Unknown type of object: %s", st.getObject().getClass().getCanonicalName());
LOGGER.error(msg);
throw new RuntimeException(msg);
}
      /* I considered modeling this decision using subclasses, but it made the configuration too much of a pain.
       * Maybe we'll do something clever the next time this code needs modification... */
Pair<String, String> kvPair;
if (reverseSubjectAndObject) {
// If the keys, like PC ids, are on the right, we need to swap them around before storing.
kvPair = Pair.of(object, subject);
} else {
kvPair = Pair.of(subject, object);
}
if (valueTransformer != null) {
kvPair = Pair.of(kvPair.getKey(), valueTransformer.apply(kvPair.getValue()));
}
// Store the key and value in the appropriate column family.
appendValueToList(db, cfh, kvPair.getKey(), kvPair.getValue());
numProcessed.incrementAndGet();
}
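    /* Values are stored as serialized ArrayList<String> objects so that multiple values can accumulate under a
     * single key: each call deserializes the existing list (if any), appends the new value, and writes the whole
     * list back under the same key. */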
private void appendValueToList(RocksDB db, ColumnFamilyHandle cfh, String key, String val) {
StringBuffer buffer = new StringBuffer();
List<String> storedObjects = null;
byte[] keyBytes = key.getBytes(UTF8);
      // TODO: pull this out into a helper class or interface. Alas, we must extend AbstractRDFHandler here.
try {
if (db.keyMayExist(cfh, keyBytes, buffer)) {
byte[] existingVal = db.get(cfh, keyBytes);
if (existingVal != null) {
            try (ObjectInputStream oi = new ObjectInputStream(new ByteArrayInputStream(existingVal))) {
              storedObjects = (ArrayList<String>) oi.readObject(); // Note: assumes all values are lists.
            }
            /* Once upon a time I had a constraint here that crashed if a key already existed, mainly to guard
             * against hypothetical synonym hash collisions. What ends up happening, however, is that Pubchem stores
             * multiple values for one hash with different normalizations (like all uppercase or all lowercase),
             * meaning there *will* be multiple values with the same hash, but those values will all be valid.
             * Instead we just ignore potential hash collisions and assume that any "collisions" are intentional. */
} else {
storedObjects = new ArrayList<>(1);
}
} else {
storedObjects = new ArrayList<>(1);
}
storedObjects.add(val);
try (ByteArrayOutputStream bos = new ByteArrayOutputStream();
ObjectOutputStream oo = new ObjectOutputStream(bos)) {
oo.writeObject(storedObjects);
oo.flush();
db.put(cfh, new WriteOptions(), keyBytes, bos.toByteArray());
}
      } catch (RocksDBException e) {
        LOGGER.error("Caught unexpected RocksDBException: %s", e.getMessage());
        throw new RuntimeException(e);
      } catch (IOException e) {
        LOGGER.error("Caught unexpected IOException: %s", e.getMessage());
        throw new RuntimeException(e);
      } catch (ClassNotFoundException e) {
        LOGGER.error("Caught unexpected ClassNotFoundException: %s", e.getMessage());
        throw new RuntimeException(e);
      }
}
}
public static void main(String[] args) throws Exception {
org.apache.commons.cli.Options opts = new org.apache.commons.cli.Options();
for (Option.Builder b : OPTION_BUILDERS) {
opts.addOption(b.build());
}
CommandLine cl = null;
try {
CommandLineParser parser = new DefaultParser();
cl = parser.parse(opts, args);
} catch (ParseException e) {
System.err.format("Argument parsing failed: %s\n", e.getMessage());
HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
System.exit(1);
}
if (cl.hasOption("help")) {
HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
return;
}
PubchemTTLMerger merger = new PubchemTTLMerger();
File rocksDBFile = new File(cl.getOptionValue(OPTION_INDEX_PATH));
if (cl.hasOption(OPTION_ONLY_MERGE)) {
if (!(rocksDBFile.exists() && rocksDBFile.isDirectory())) {
System.err.format("Must specify an existing RocksDB index when using '%s'.\n", OPTION_ONLY_MERGE);
HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
System.exit(1);
}
merger.finish(merger.merge(rocksDBFile));
return;
}
File rdfDir = new File(cl.getOptionValue(OPTION_RDF_DIRECTORY));
if (!rdfDir.isDirectory()) {
System.err.format("Must specify a directory of RDF files to be parsed.\n");
HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
System.exit(1);
}
File[] filesInDirectoryArray = rdfDir.listFiles(new FilenameFilter() {
private static final String TTL_GZ_SUFFIX = ".ttl.gz";
@Override
public boolean accept(File dir, String name) {
return name.endsWith(TTL_GZ_SUFFIX);
}
});
if (filesInDirectoryArray == null || filesInDirectoryArray.length == 0) {
System.err.format("Found zero compressed TTL files in directory at '%s'.\n", rdfDir.getAbsolutePath());
HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
System.exit(1);
}
// Sort files for stability/sanity.
List<File> filesInDirectory = Arrays.asList(filesInDirectoryArray);
Collections.sort(filesInDirectory);
if (cl.hasOption(OPTION_ONLY_SYNONYMS)) {
filesInDirectory = filterByFileContents(filesInDirectory, PC_RDF_DATA_FILE_CONFIG.HASH_TO_SYNONYM);
}
if (cl.hasOption(OPTION_ONLY_MESH)) {
filesInDirectory = filterByFileContents(filesInDirectory, PC_RDF_DATA_FILE_CONFIG.HASH_TO_MESH);
}
if (cl.hasOption(OPTION_ONLY_PUBCHEM_IDS)) {
filesInDirectory = filterByFileContents(filesInDirectory, PC_RDF_DATA_FILE_CONFIG.HASH_TO_CID);
}
if (filesInDirectory.size() == 0) {
System.err.format("Arrived at index initialization with no files to process. " +
"Maybe too many filters were specified? synonyms: %s, MeSH: %s, Pubchem ids: %s\n",
cl.hasOption(OPTION_ONLY_SYNONYMS), cl.hasOption(OPTION_ONLY_MESH), cl.hasOption(OPTION_ONLY_PUBCHEM_IDS));
HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
System.exit(1);
}
RocksDB.loadLibrary();
Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles = null;
try {
if (rocksDBFile.exists()) {
if (!cl.hasOption(OPTION_OPEN_EXISTING_OKAY)) {
System.err.format(
"Index directory at '%s' already exists, delete before retrying or add '%s' option to reuse.\n",
rocksDBFile.getAbsolutePath(), OPTION_OPEN_EXISTING_OKAY);
HELP_FORMATTER.printHelp(PubchemTTLMerger.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
System.exit(1);
} else {
LOGGER.info("Reusing existing index at %s", rocksDBFile.getAbsolutePath());
dbAndHandles = openExistingRocksDB(rocksDBFile);
}
} else {
LOGGER.info("Creating new index at %s", rocksDBFile.getAbsolutePath());
dbAndHandles = createNewRocksDB(rocksDBFile);
}
merger.buildIndex(dbAndHandles, filesInDirectory);
merger.merge(dbAndHandles);
} finally {
if (dbAndHandles != null) {
merger.finish(dbAndHandles);
}
}
}
protected static List<File> filterByFileContents(List<File> files, PC_RDF_DATA_FILE_CONFIG fileConfig) {
return files.stream().filter(x -> x.getName().startsWith(fileConfig.filePrefix)).collect(Collectors.toList());
}
protected static Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> createNewRocksDB(File pathToIndex)
throws RocksDBException {
RocksDB db = null; // Not auto-closable.
Map<COLUMN_FAMILIES, ColumnFamilyHandle> columnFamilyHandles = new HashMap<>();
Options options = ROCKS_DB_CREATE_OPTIONS;
System.out.println("Opening index at " + pathToIndex.getAbsolutePath());
db = RocksDB.open(options, pathToIndex.getAbsolutePath());
for (COLUMN_FAMILIES cf : COLUMN_FAMILIES.values()) {
LOGGER.info("Creating column family %s", cf.getName());
ColumnFamilyHandle cfh =
db.createColumnFamily(new ColumnFamilyDescriptor(cf.getName().getBytes(UTF8)));
columnFamilyHandles.put(cf, cfh);
}
return Pair.of(db, columnFamilyHandles);
}
/**
* Open an existing RocksDB index. Use this after successful index generation to access the map of Pubchem compound
* ids to synonyms/MeSH ids using the column family CID_TO_SYNONYMS.
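   *
   * A minimal read of the merged results might look like this (a sketch; the index path and compound id are
   * hypothetical):
   * <pre>{@code
   * Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles =
   *     PubchemTTLMerger.openExistingRocksDB(new File("/path/to/index"));
   * byte[] bytes = dbAndHandles.getLeft().get(
   *     dbAndHandles.getRight().get(COLUMN_FAMILIES.CID_TO_SYNONYMS), "CID2244".getBytes());
   * // bytes, if non-null, holds a serialized PubchemSynonyms object.
   * }</pre>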
* @param pathToIndex A path to the RocksDB index directory to use.
   * @return A pair of the opened RocksDB and a map from column family labels to their open handles.
   * @throws RocksDBException if RocksDB cannot open the index or one of its column families.
*/
public static Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> openExistingRocksDB(File pathToIndex)
throws RocksDBException {
List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>(COLUMN_FAMILIES.values().length + 1);
// Must also open the "default" family or RocksDB will probably choke.
columnFamilyDescriptors.add(new ColumnFamilyDescriptor(DEFAULT_ROCKSDB_COLUMN_FAMILY.getBytes()));
for (COLUMN_FAMILIES family : COLUMN_FAMILIES.values()) {
columnFamilyDescriptors.add(new ColumnFamilyDescriptor(family.getName().getBytes()));
}
List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>(columnFamilyDescriptors.size());
DBOptions dbOptions = ROCKS_DB_OPEN_OPTIONS;
dbOptions.setCreateIfMissing(false);
RocksDB rocksDB = RocksDB.open(dbOptions, pathToIndex.getAbsolutePath(),
columnFamilyDescriptors, columnFamilyHandles);
Map<COLUMN_FAMILIES, ColumnFamilyHandle> columnFamilyHandleMap = new HashMap<>(COLUMN_FAMILIES.values().length);
// TODO: can we zip these together more easily w/ Java 8?
for (int i = 0; i < columnFamilyDescriptors.size(); i++) {
ColumnFamilyDescriptor cfd = columnFamilyDescriptors.get(i);
ColumnFamilyHandle cfh = columnFamilyHandles.get(i);
String familyName = new String(cfd.columnFamilyName(), UTF8);
COLUMN_FAMILIES descriptorFamily = COLUMN_FAMILIES.getFamilyByName(familyName);
if (descriptorFamily == null) {
if (!DEFAULT_ROCKSDB_COLUMN_FAMILY.equals(familyName)) {
String msg = String.format("Found unexpected family name '%s' when trying to open RocksDB at %s",
familyName, pathToIndex.getAbsolutePath());
LOGGER.error(msg);
// Crash if we don't recognize the contents of this DB.
throw new RuntimeException(msg);
}
// Just skip this column family if it doesn't map to something we know but is expected.
continue;
}
columnFamilyHandleMap.put(descriptorFamily, cfh);
}
return Pair.of(rocksDB, columnFamilyHandleMap);
}
protected Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> merge(File pathToRocksDB)
throws RocksDBException, IOException, ClassNotFoundException {
Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles = openExistingRocksDB(pathToRocksDB);
merge(dbAndHandles);
return dbAndHandles;
}
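  /**
   * Merges the intermediate column families on Pubchem compound id: for each CID, the hashes stored under
   * CID_TO_HASHES are resolved through the hash-keyed MeSH, synonym, and synonym-type families, and the combined
   * result is written to CID_TO_SYNONYMS as a serialized PubchemSynonyms object.
   */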
protected void merge(Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles)
throws RocksDBException, IOException, ClassNotFoundException {
LOGGER.info("Beginning merge on Pubchem CID");
RocksDB db = dbAndHandles.getLeft();
ColumnFamilyHandle pubchemIdCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.CID_TO_HASHES);
ColumnFamilyHandle meshCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.HASH_TO_MESH);
ColumnFamilyHandle synonymCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.HASH_TO_SYNONYMS);
ColumnFamilyHandle synonymTypeCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.HASH_TO_SYNONYM_TYPE);
ColumnFamilyHandle mergeResultsCFH = dbAndHandles.getRight().get(COLUMN_FAMILIES.CID_TO_SYNONYMS);
RocksIterator cidIterator = db.newIterator(pubchemIdCFH);
// With help from https://github.com/facebook/rocksdb/wiki/Basic-Operations
int processed = 0;
for (cidIterator.seekToFirst(); cidIterator.isValid(); cidIterator.next()) {
byte[] key = cidIterator.key();
byte[] val = cidIterator.value();
String pubchemId = new String(key, UTF8);
List<String> hashes;
try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(val))) {
// We know all our values so far have been lists of strings, so this should be completely safe.
hashes = (List<String>) ois.readObject();
}
PubchemSynonyms pubchemSynonyms = new PubchemSynonyms(pubchemId);
/* The hash keys are based on synonym value, which we can manually compute with:
* $ echo -n 'dimethyltin(iv)' | md5
* This means that MeSH ids are linked to synonyms rather than pubchem ids. We need to look up each cid-linked
* hash in both the MeSH and synonym collections, as the key may legitimately exist in both (and serve to link
* cid to synonym and cid to MeSH). */
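      /* An equivalent of that manual check in Java (an illustrative sketch, not used by the merge itself; it
       * assumes the hash local names are "MD5_" plus the lowercase hex MD5 digest of the synonym string):
       *   MessageDigest md = MessageDigest.getInstance("MD5");
       *   String hash = "MD5_" + DatatypeConverter.printHexBinary(md.digest(synonym.getBytes(UTF8))).toLowerCase();
       */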
for (String hash : hashes) {
/* Note: these ids are not proper MeSH topic ids, but are internal MeSH ids found in the RDF and TTL
* representations of the MeSH corpus. You can find them in the MeSH .nt or .xml files, but they won't turn up
* anything on the MeSH website. */
List<String> meshIds = getValueAsObject(db, meshCFH, hash);
if (meshIds != null) {
pubchemSynonyms.addMeSHIds(meshIds);
}
List<String> synonyms = getValueAsObject(db, synonymCFH, hash);
// There are, surprisingly, some dangling hashes in the DB! Handle them gracefully.
if (synonyms == null) {
LOGGER.warn("Dangling synonym hash reference, adding empty list in place of value: cid = %s, hash = %s",
pubchemId, hash);
synonyms = Collections.emptyList();
}
List<String> synonymTypeStrings = getValueAsObject(db, synonymTypeCFH, hash);
Set<PC_SYNONYM_TYPES> synonymTypes = DEFAULT_SYNONYM_DATA_TYPES;
if (synonymTypeStrings != null) {
synonymTypes = synonymTypeStrings.stream().map(PC_SYNONYM_TYPES::valueOf).collect(Collectors.toSet());
}
if (synonymTypes.size() == 0) {
LOGGER.warn("Found zero synonym types for synonym, defaulting to %s: %s %s, synonyms = %s",
PC_SYNONYM_TYPES.UNKNOWN.name(), pubchemId, hash, StringUtils.join(synonyms, ", "));
}
/* It turns out that *lots* of synonyms are duplicated as depositor supplied names, so don't complain about it
* here. For performance sake we might want to consider changing the data model of PubchemSynonyms to reduce
* synonym string duplication, as the current model is pretty inefficient. */
for (PC_SYNONYM_TYPES synonymType : synonymTypes) {
for (String synonym : synonyms) {
// Let the PubchemSynonyms object do the de-duplication for us rather than reducing `synonyms` to a Set.
pubchemSynonyms.addSynonym(synonymType, synonym);
}
}
}
try (ByteArrayOutputStream bos = new ByteArrayOutputStream();
ObjectOutputStream oo = new ObjectOutputStream(bos)) {
oo.writeObject(pubchemSynonyms);
oo.flush();
db.put(mergeResultsCFH, key, bos.toByteArray());
}
processed++;
if (processed % 100000 == 0) {
LOGGER.info("Merged %d entries on Pubchem compound id", processed);
}
}
LOGGER.info("Merge complete, %d entries processed", processed);
}
protected <T> T getValueAsObject(RocksDB db, ColumnFamilyHandle cfh, String key)
throws RocksDBException, ClassNotFoundException, IOException {
StringBuffer stringBuffer = new StringBuffer();
T val = null;
/* Check for existence before fetching. IIRC doing otherwise might cause segfaults in the RocksDB JNI wrapper.
* Or it might just be faster thanks to the DB's bloom filter. */
if (db.keyMayExist(cfh, key.getBytes(), stringBuffer)) {
byte[] valBytes = db.get(cfh, key.getBytes());
      // Make sure that the key actually exists (beware the "May" in keyMayExist).
if (valBytes != null) {
try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(valBytes))) {
val = (T) ois.readObject();
}
}
}
return val;
}
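  /**
   * Parses each gzipped TTL file with rdf4j's Rio Turtle parser, routing statements through a PCRDFHandler that
   * writes key/value pairs into the column family configured for that file's name prefix.
   */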
protected void buildIndex(Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles, List<File> rdfFiles)
throws RocksDBException, ClassNotFoundException, IOException {
LOGGER.info("Building RocksDB index of data in RDF files");
RDFParser parser = Rio.createParser(RDFFormat.TURTLE);
LOGGER.info("Processing %d RDF files", rdfFiles.size());
for (File rdfFile : rdfFiles) {
LOGGER.info("Processing file %s", rdfFile.getAbsolutePath());
AbstractRDFHandler handler = PC_RDF_DATA_FILE_CONFIG.makeHandlerForDataFile(dbAndHandles, rdfFile);
if (handler == null) {
LOGGER.info("Skipping file without defined handler: %s", rdfFile.getAbsolutePath());
continue;
}
parser.setRDFHandler(handler);
parser.parse(new GZIPInputStream(new FileInputStream(rdfFile)), "");
LOGGER.info("Successfully parsed file at %s", rdfFile.getAbsolutePath());
}
LOGGER.info("Done processing RDF files");
}
protected void finish(Pair<RocksDB, Map<COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles) {
LOGGER.info("Closing DB to complete merge.");
dbAndHandles.getLeft().close();
}
}