/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package act.installer.pubchem; import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.ParseException; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.rocksdb.ColumnFamilyHandle; import org.rocksdb.RocksDB; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.ObjectInputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; public class PubchemSynonymFinder { private static final Logger LOGGER = LogManager.getFormatterLogger(PubchemSynonymFinder.class); private static final Charset UTF8 = StandardCharsets.UTF_8; private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final String OPTION_INDEX_PATH = "x"; public static final String OPTION_PUBCHEM_COMPOUND_ID = "c"; public static final String OPTION_IDS_FILE = "f"; public static final String OPTION_OUTPUT = "o"; public static final String HELP_MESSAGE = StringUtils.join(new String[]{ "This class finds and prints Pubchem synonym data from a RocksDB index created from Pubchem RDF files. ", "Specify one or more Pubchem compound ids to find." }, ""); public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{ add(Option.builder(OPTION_INDEX_PATH) .argName("index path") .desc("A path to the directory where the on-disk index will be stored; must not already exist") .hasArg().required() .longOpt("index") ); add(Option.builder(OPTION_PUBCHEM_COMPOUND_ID) .argName("compound id") .desc("Lookup one compound ID in the database") .hasArg() .longOpt("pc-cid") ); add(Option.builder(OPTION_IDS_FILE) .argName("compound ids file") .desc("Lookup a list of compound ids and print them as one large JSON document; comments (#) will be ignored") .hasArg() .longOpt("pc-cids-file") ); add(Option.builder(OPTION_OUTPUT) .argName("output file") .desc("Write output to a file; default is stdout") .hasArg() .longOpt("output") ); add(Option.builder("h") .argName("help") .desc("Prints this help message") .longOpt("help") ); }}; public static final HelpFormatter HELP_FORMATTER = new HelpFormatter(); static { HELP_FORMATTER.setWidth(100); } private static final Pattern PC_CID_PATTERN = Pattern.compile("^CID\\d+$"); public static void main(String[] args) throws Exception { org.apache.commons.cli.Options opts = new org.apache.commons.cli.Options(); for (Option.Builder b : OPTION_BUILDERS) { opts.addOption(b.build()); } CommandLine cl = null; try { CommandLineParser parser = new DefaultParser(); cl = parser.parse(opts, args); } catch (ParseException e) { System.err.format("Argument parsing failed: %s\n", e.getMessage()); HELP_FORMATTER.printHelp(PubchemSynonymFinder.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } if (cl.hasOption("help")) { HELP_FORMATTER.printHelp(PubchemSynonymFinder.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); return; } File rocksDBFile = new File(cl.getOptionValue(OPTION_INDEX_PATH)); if (!rocksDBFile.isDirectory()) { System.err.format("Index directory does not exist or is not a directory at '%s'", rocksDBFile.getAbsolutePath()); HELP_FORMATTER.printHelp(PubchemSynonymFinder.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } List<String> compoundIds = null; if (cl.hasOption(OPTION_PUBCHEM_COMPOUND_ID)) { compoundIds = Collections.singletonList(cl.getOptionValue(OPTION_PUBCHEM_COMPOUND_ID)); } else if (cl.hasOption(OPTION_IDS_FILE)) { File idsFile = new File(cl.getOptionValue(OPTION_IDS_FILE)); if (!idsFile.exists()) { System.err.format("Cannot find Pubchem CIDs file at %s", idsFile.getAbsolutePath()); HELP_FORMATTER.printHelp(PubchemSynonymFinder.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } compoundIds = getCIDsFromFile(idsFile); if (compoundIds.size() == 0) { System.err.format("Found zero Pubchem CIDs to process in file at '%s', exiting", idsFile.getAbsolutePath()); HELP_FORMATTER.printHelp(PubchemSynonymFinder.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } } else { System.err.format("Must specify one of '%s' or '%s'; index is too big to print all synonyms.", OPTION_PUBCHEM_COMPOUND_ID, OPTION_IDS_FILE); HELP_FORMATTER.printHelp(PubchemSynonymFinder.class.getCanonicalName(), HELP_MESSAGE, opts, null, true); System.exit(1); } // Run a quick check to warn users of malformed ids. compoundIds.forEach(x -> { if (!PC_CID_PATTERN.matcher(x).matches()) { // Use matches() for complete matching. LOGGER.warn("Specified compound id does not match expected format: %s", x); } }); LOGGER.info("Opening DB and searching for %d Pubchem CIDs", compoundIds.size()); Pair<RocksDB, Map<PubchemTTLMerger.COLUMN_FAMILIES, ColumnFamilyHandle>> dbAndHandles = null; Map<String, PubchemSynonyms> results = new LinkedHashMap<>(compoundIds.size()); try { dbAndHandles = PubchemTTLMerger.openExistingRocksDB(rocksDBFile); RocksDB db = dbAndHandles.getLeft(); ColumnFamilyHandle cidToSynonymsCfh = dbAndHandles.getRight().get(PubchemTTLMerger.COLUMN_FAMILIES.CID_TO_SYNONYMS); for (String cid : compoundIds) { PubchemSynonyms synonyms = null; byte[] val = db.get(cidToSynonymsCfh, cid.getBytes(UTF8)); if (val != null) { ObjectInputStream oi = new ObjectInputStream(new ByteArrayInputStream(val)); // We're relying on our use of a one-value-type per index model here so we can skip the instanceof check. synonyms = (PubchemSynonyms) oi.readObject(); } else { LOGGER.warn("No synonyms available for compound id '%s'", cid); } results.put(cid, synonyms); } } finally { if (dbAndHandles != null) { dbAndHandles.getLeft().close(); } } try (OutputStream outputStream = cl.hasOption(OPTION_OUTPUT) ? new FileOutputStream(cl.getOptionValue(OPTION_OUTPUT)) : System.out) { OBJECT_MAPPER.writerWithDefaultPrettyPrinter().writeValue(outputStream, results); new OutputStreamWriter(outputStream).append('\n'); } LOGGER.info("Done searching for Pubchem synonyms"); } private static List<String> getCIDsFromFile(File idsFile) throws IOException { List<String> compoundIds = new ArrayList<>(); try (BufferedReader reader = new BufferedReader(new FileReader(idsFile))) { String line; while ((line = reader.readLine()) != null) { line = line.trim(); if (line.startsWith("#")) { // skip comments continue; } compoundIds.add(line); } } return compoundIds; } }