/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.analysis.similarity;
import act.server.DBIterator;
import act.server.MongoDB;
import act.shared.Chemical;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.license.LicenseManager;
import chemaxon.sss.SearchConstants;
import chemaxon.sss.search.MolSearch;
import chemaxon.sss.search.MolSearchOptions;
import chemaxon.sss.search.SearchException;
import chemaxon.struc.Molecule;
import chemaxon.util.MolHandler;
import com.act.utils.TSVParser;
import com.act.utils.TSVWriter;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;
/**
 * Runs a Chemaxon substructure search for a single SMILES query against every chemical read from either a TSV
 * file (which must contain an 'inchi' column) or the chemicals collection of a Mongo DB, writing every matching
 * row to an output TSV.  Non-matching and unparseable molecules are not written.
 *
 * This class is based on Chris's substructure search from the biointerpretation branch.
 */
public class SubstructureSearch {
  private static final Logger LOGGER = LogManager.getFormatterLogger(SubstructureSearch.class);

  // TODO: are these options sufficient?  Are there others we might want to use?
  /* Chemaxon exposes a very non-uniform means of configuring substructure search.  Hence the mess of lambdas below.
   * Consumer solves the Function<T, void> problem. */
  private static final Map<String, Consumer<MolSearchOptions>> SEARCH_OPTION_ENABLERS =
      Collections.unmodifiableMap(new HashMap<String, Consumer<MolSearchOptions>>() {{
        put("CHARGE_MATCHING_EXACT", (so -> so.setChargeMatching(SearchConstants.CHARGE_MATCHING_EXACT)));
        put("CHARGE_MATCHING_IGNORE", (so -> so.setChargeMatching(SearchConstants.CHARGE_MATCHING_IGNORE)));
        put("IMPLICIT_H_MATCHING_ENABLED",
            (so -> so.setImplicitHMatching(SearchConstants.IMPLICIT_H_MATCHING_ENABLED)));
        put("IMPLICIT_H_MATCHING_DISABLED",
            (so -> so.setImplicitHMatching(SearchConstants.IMPLICIT_H_MATCHING_DISABLED)));
        put("IMPLICIT_H_MATCHING_IGNORE",
            (so -> so.setImplicitHMatching(SearchConstants.IMPLICIT_H_MATCHING_IGNORE)));
        put("STEREO_EXACT", (so -> so.setStereoSearchType(SearchConstants.STEREO_EXACT)));
        put("STEREO_IGNORE", (so -> so.setStereoSearchType(SearchConstants.STEREO_IGNORE)));
        put("STEREO_MODEL_COMPREHENSIVE", (so -> so.setStereoModel(SearchConstants.STEREO_MODEL_COMPREHENSIVE)));
        put("STEREO_MODEL_GLOBAL", (so -> so.setStereoModel(SearchConstants.STEREO_MODEL_GLOBAL)));
        put("STEREO_MODEL_LOCAL", (so -> so.setStereoModel(SearchConstants.STEREO_MODEL_LOCAL)));
        put("TAUTOMER_SEARCH_ON", (so -> so.setTautomerSearch(SearchConstants.TAUTOMER_SEARCH_ON)));
        put("TAUTOMER_SEARCH_OFF", (so -> so.setTautomerSearch(SearchConstants.TAUTOMER_SEARCH_OFF)));
        put("TAUTOMER_SEARCH_ON_IGNORE_TAUTOMERSTEREO",
            (so -> so.setTautomerSearch(SearchConstants.TAUTOMER_SEARCH_ON_IGNORE_TAUTOMERSTEREO)));
        put("VAGUE_BOND_OFF", (so -> so.setVagueBondLevel(SearchConstants.VAGUE_BOND_OFF)));
        put("VAGUE_BOND_LEVEL_HALF", (so -> so.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL_HALF)));
        put("VAGUE_BOND_LEVEL1", (so -> so.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL1)));
        put("VAGUE_BOND_LEVEL2", (so -> so.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL2)));
        put("VAGUE_BOND_LEVEL3", (so -> so.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL3)));
        put("VAGUE_BOND_LEVEL4", (so -> so.setVagueBondLevel(SearchConstants.VAGUE_BOND_LEVEL4)));
      }});

  /** The names accepted by the search-options flag (the keys of the enabler map), in sorted order. */
  public static final List<String> VALID_SEARCH_OPTION_SORTED;
  static {
    List<String> keys = new ArrayList<>(SEARCH_OPTION_ENABLERS.keySet());
    Collections.sort(keys);
    VALID_SEARCH_OPTION_SORTED = Collections.unmodifiableList(keys);
  }

  // Short names of the command line options.
  public static final String OPTION_INPUT_FILE = "f";
  public static final String OPTION_INPUT_DB = "d";
  public static final String OPTION_INPUT_DB_HOST = "s";
  public static final String OPTION_INPUT_DB_PORT = "p";
  public static final String OPTION_OUTPUT_FILE = "o";
  public static final String OPTION_QUERY = "q";
  public static final String OPTION_LICENSE_FILE = "l";
  public static final String OPTION_SEARCH_OPTIONS = "x";

  // Names of the row fields used when reading input and when emitting rows built from the DB.
  public static final String FIELD_INCHI = "inchi";
  public static final String FIELD_ID = "id";

  // Connection defaults used when the host/port options are not supplied.
  public static final String DEFAULT_HOST = "localhost";
  public static final String DEFAULT_PORT = "27017";

  public static final String HELP_MESSAGE = StringUtils.join(new String[] {
      "This class does substructure matching against an installer DB or a TSV file using a single SMILES query.  ",
      "All matching chemicals are outputted; non-matches are ignored."
  }, "");

  public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();
  static {
    HELP_FORMATTER.setWidth(100);
  }

  public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
    add(Option.builder(OPTION_INPUT_FILE)
        .argName("tsv file")
        .desc(String.format("The name of the input TSV file to read (must contain an '%s' field)", FIELD_INCHI))
        .hasArg()
        .longOpt("tsv")
    );
    add(Option.builder(OPTION_INPUT_DB)
        .argName("db name")
        .desc("The name of the database to read")
        .hasArg()
        .longOpt("db")
    );
    add(Option.builder(OPTION_INPUT_DB_HOST)
        .argName("db host")
        .desc("The host to which connect when reading from a DB")
        .hasArg()
        .longOpt("host")
    );
    add(Option.builder(OPTION_INPUT_DB_PORT)
        .argName("db port")
        .desc("The port to which connect when reading from a DB")
        .hasArg()
        .longOpt("port")
    );
    add(Option.builder(OPTION_OUTPUT_FILE)
        .argName("file name")
        .desc("The name of the output tsv to write")
        .hasArg().required()
        .longOpt("out")
    );
    add(Option.builder(OPTION_QUERY)
        .argName("query string")
        .desc("The SMILES query string for which to search")
        .hasArg().required()
        .longOpt("query")
    );
    add(Option.builder(OPTION_LICENSE_FILE)
        .argName("license file")
        .desc("A chemaxon license file to use")
        .hasArg()
        .longOpt("license")
    );
    add(Option.builder(OPTION_SEARCH_OPTIONS)
        .argName("search options")
        .desc(String.format("Options to supply to the substructure search.  See " +
            "https://www.chemaxon.com/jchem/doc/dev/java/api/chemaxon/sss/SearchConstants.html " +
            "for explanations.  Valid options are: %s", StringUtils.join(VALID_SEARCH_OPTION_SORTED, ", ")))
        .hasArgs().valueSeparator(',')
        .longOpt("search-opts")
    );
    // Everybody needs a little help from their friends.
    add(Option.builder("h")
        .argName("help")
        .desc("Prints this help message")
        .longOpt("help")
    );
  }};

  // The searcher is reused for every target; init() installs the options and query, matchSubstructure() the target.
  MolSearch ms = new MolSearch();
  // From https://docs.chemaxon.com/display/jchembase/Bond+specific+search+options.
  private final MolSearchOptions searchOptions = new MolSearchOptions(SearchConstants.SUBSTRUCTURE);

  /**
   * Configures this searcher with a query structure and any extra search options.  Must be called before
   * {@link #matchSubstructure(Molecule)}.
   *
   * @param smilesQuery The SMILES string describing the substructure to look for.
   * @param extraOpts Names of additional search options to enable; each must be one of
   *                  {@link #VALID_SEARCH_OPTION_SORTED}.  A null list is treated as empty.
   * @throws IllegalArgumentException If any of the option names is not recognized.
   * @throws IOException Declared for callers; not thrown directly by the code in this method.
   * @throws MolFormatException Declared for callers that want to report an invalid query structure.
   */
  public void init(String smilesQuery, List<String> extraOpts)
      throws IllegalArgumentException, IOException, MolFormatException {
    List<String> requestedOpts = extraOpts == null ? Collections.<String>emptyList() : extraOpts;

    // Apply all the specified extra search options using the key -> function mapping above.
    for (String opt : requestedOpts) {
      Consumer<MolSearchOptions> enabler = SEARCH_OPTION_ENABLERS.get(opt);
      if (enabler == null) {
        throw new IllegalArgumentException(String.format("Unrecognized search option: %s", opt));
      }
      enabler.accept(searchOptions);
    }

    ms.setSearchOptions(searchOptions);
    // NOTE(review): the boolean argument presumably tells MolHandler to import the string as a query -- confirm.
    ms.setQuery(new MolHandler(smilesQuery, true).getMolecule());
  }

  /**
   * Tests whether the configured query structure occurs anywhere in the specified target molecule.
   *
   * @param target The molecule in which to search for the query.
   * @return True if the search reports at least one non-empty hit, false otherwise.
   * @throws SearchException If the underlying search fails.
   */
  public boolean matchSubstructure(Molecule target) throws SearchException {
    ms.setTarget(target);
    /* hits are arrays of atom ids in the target that matched the query.  If multiple sites in the target matched,
     * then there should be multiple arrays of atom ids (but we don't care since we're just looking for any match). */
    int[][] hits = ms.findAll();
    if (hits == null) {
      return false;
    }
    for (int[] hit : hits) {
      if (hit.length > 0) {
        return true;
      }
    }
    return false;
  }

  /** Prints the usage message for this tool, built from the supplied option definitions. */
  private static void printHelp(Options opts) {
    HELP_FORMATTER.printHelp(SubstructureSearch.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
  }

  /**
   * Command line entry point: parses arguments, initializes the searcher, then streams rows from the input TSV
   * or DB and writes the rows whose InChI matches the query to the output TSV.  If both a TSV file and a DB are
   * specified, the TSV file is used.
   *
   * @param args The command line arguments; see {@link #OPTION_BUILDERS}.
   * @throws Exception If reading the input, searching, or writing the output fails.
   */
  public static void main(String[] args) throws Exception {
    Options opts = new Options();
    for (Option.Builder b : OPTION_BUILDERS) {
      opts.addOption(b.build());
    }

    CommandLine cl = null;
    try {
      CommandLineParser parser = new DefaultParser();
      cl = parser.parse(opts, args);
    } catch (ParseException e) {
      System.err.format("Argument parsing failed: %s\n", e.getMessage());
      printHelp(opts);
      System.exit(1);
    }

    if (cl.hasOption("help")) {
      printHelp(opts);
      return;
    }

    if (cl.hasOption(OPTION_LICENSE_FILE)) {
      LicenseManager.setLicenseFile(cl.getOptionValue(OPTION_LICENSE_FILE));
    }

    List<String> searchOpts = Collections.emptyList();
    if (cl.hasOption(OPTION_SEARCH_OPTIONS)) {
      searchOpts = Arrays.asList(cl.getOptionValues(OPTION_SEARCH_OPTIONS));
    }

    // Make sure we can initialize correctly before opening any file handles for writing.
    SubstructureSearch matcher = new SubstructureSearch();
    try {
      matcher.init(cl.getOptionValue(OPTION_QUERY), searchOpts);
    } catch (IllegalArgumentException e) {
      System.err.format("Unable to initialize substructure search.  %s\n", e.getMessage());
      printHelp(opts);
      System.exit(1);
    } catch (MolFormatException e) {
      System.err.format("Invalid SMILES structure query.  %s\n", e.getMessage());
      printHelp(opts);
      System.exit(1);
    }

    Pair<List<String>, Iterator<Map<String, String>>> iterPair = null;
    if (cl.hasOption(OPTION_INPUT_FILE)) {
      File inFile = new File(cl.getOptionValue(OPTION_INPUT_FILE));
      if (!inFile.exists()) {
        System.err.format("File at %s does not exist\n", inFile.getAbsolutePath());
        printHelp(opts);
        System.exit(1);
      }
      iterPair = iterateOverTSV(inFile);
    } else if (cl.hasOption(OPTION_INPUT_DB)) {
      // The port must come from the port option, not the host option.
      String portString = cl.getOptionValue(OPTION_INPUT_DB_PORT, DEFAULT_PORT);
      int port = -1;
      try {
        port = Integer.parseInt(portString);
      } catch (NumberFormatException e) {
        System.err.format("Invalid DB port '%s': must be an integer.\n", portString);
        printHelp(opts);
        System.exit(1);
      }
      iterPair = iterateOverDB(
          cl.getOptionValue(OPTION_INPUT_DB_HOST, DEFAULT_HOST),
          port,
          cl.getOptionValue(OPTION_INPUT_DB)
      );
    } else {
      System.err.format("Must specify either input TSV file or input DB from which to read.\n");
      printHelp(opts);
      System.exit(1);
    }

    // The output uses the same header as the input source.
    TSVWriter<String, String> writer = new TSVWriter<>(iterPair.getLeft());
    writer.open(new File(cl.getOptionValue(OPTION_OUTPUT_FILE)));

    LOGGER.info("Searching for substructure '%s'", cl.getOptionValue(OPTION_QUERY));

    try {
      Iterator<Map<String, String>> rows = iterPair.getRight();
      int rowNum = 0;
      while (rows.hasNext()) {
        Map<String, String> row = rows.next();
        rowNum++;
        try {
          String inchi = row.get(FIELD_INCHI);

          // Molecules that cannot be imported are logged and skipped rather than aborting the whole run.
          Molecule target = null;
          try {
            target = MolImporter.importMol(inchi);
          } catch (Exception e) {
            LOGGER.warn("Skipping molecule %d due to exception: %s\n", rowNum, e.getMessage());
            continue;
          }

          if (matcher.matchSubstructure(target)) {
            writer.append(row);
            // Flush after every match so that results written so far are pushed out as the search proceeds.
            writer.flush();
          } else {
            // Don't output if not a match.
            LOGGER.debug("Found non-matching molecule: %s", inchi);
          }
        } catch (SearchException e) {
          LOGGER.error("Exception on input line %d: %s\n", rowNum, e.getMessage());
          throw e;
        }
      }
    } finally {
      writer.close();
    }
    LOGGER.info("Done with substructure search");
  }

  /**
   * Parses a TSV file and exposes its rows for searching.
   *
   * @param inputFile The TSV file to parse.
   * @return A pair of the file's header and an iterator over its parsed rows.
   * @throws Exception If the file cannot be parsed.
   */
  public static Pair<List<String>, Iterator<Map<String, String>>> iterateOverTSV(File inputFile) throws Exception {
    TSVParser parser = new TSVParser();
    parser.parse(inputFile);
    List<String> header = parser.getHeader();
    Iterator<Map<String, String>> chemsIter = parser.getResults().iterator();
    return Pair.of(header, chemsIter);
  }

  /**
   * Exposes the chemicals in a DB as rows with an 'id' and an 'inchi' field.
   *
   * @param host The host of the DB to read.
   * @param port The port of the DB to read.
   * @param dbName The name of the DB to read.
   * @return A pair of the fixed header (id, inchi) and an iterator that builds one row per chemical.
   * @throws Exception If the DB connection or chemicals iterator cannot be created.
   */
  public static Pair<List<String>, Iterator<Map<String, String>>> iterateOverDB(
      String host, Integer port, String dbName) throws Exception {
    MongoDB db = new MongoDB(host, port, dbName);
    final DBIterator iter = db.getIteratorOverChemicals();

    Iterator<Map<String, String>> chemsIter = new Iterator<Map<String, String>>() {
      @Override
      public boolean hasNext() {
        return iter.hasNext();
      }

      @Override
      public Map<String, String> next() {
        Chemical c = db.getNextChemical(iter);
        // A plain map avoids creating an anonymous subclass per row that holds a reference to this iterator.
        Map<String, String> row = new HashMap<>();
        row.put(FIELD_ID, c.getUuid().toString());
        row.put(FIELD_INCHI, c.getInChI());
        return row;
      }
    };

    return Pair.of(Arrays.asList(FIELD_ID, FIELD_INCHI), chemsIter);
  }
}