/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program. If not, see <http://www.gnu.org/licenses/>.  *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.sarinference;

import chemaxon.clustering.LibraryMCS;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.struc.Molecule;
import com.act.biointerpretation.l2expansion.L2FilteringDriver;
import com.act.biointerpretation.l2expansion.L2Prediction;
import com.act.biointerpretation.l2expansion.L2PredictionCorpus;
import com.act.jobs.FileChecker;
import com.act.jobs.JavaRunnable;
import com.act.lcms.db.io.report.IonAnalysisInterchangeModel;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

public class LibMcsClustering {

  private static final Logger LOGGER = LogManager.getFormatterLogger(LibMcsClustering.class);

  private static final String OPTION_PREDICTION_CORPUS = "c";
  private static final String OPTION_POSITIVE_INCHIS = "p";
  private static final String OPTION_SARTREE_PATH = "t";
  private static final String OPTION_SCORED_SAR_PATH = "s";
  private static final String OPTION_HELP = "h";

  public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
    {
      add(Option.builder(OPTION_PREDICTION_CORPUS)
          .argName("input corpus path")
          .desc("The absolute path to the input prediction corpus.")
          .hasArg()
          .longOpt("input-corpus-path")
          .required()
      );
      add(Option.builder(OPTION_POSITIVE_INCHIS)
          .argName("positive inchis file")
          .desc("The path to a file of positive inchis from LCMS analysis of the prediction corpus.")
          .hasArg()
          .longOpt("input-positive-inchis")
          .required()
      );
      add(Option.builder(OPTION_SARTREE_PATH)
          .argName("sartree path")
          .desc("The path to which to write the intermediate file produced by structure clustering.")
          .hasArg()
          .longOpt("sartree-path")
          .required()
      );
      add(Option.builder(OPTION_SCORED_SAR_PATH)
          .argName("scored sar path")
          .desc("The path to which to write the final output file of scored sars.")
          .hasArg()
          .longOpt("scored-sar-path")
          .required()
      );
      add(Option.builder(OPTION_HELP)
          .argName("help")
          .desc("Prints this help message.")
          .longOpt("help")
      );
    }
  };

  public static final String HELP_MESSAGE =
      "This class is used to build SARs from an L2Prediction run and LCMS analysis results. The inputs are an " +
          "L2PredictionCorpus and a file with all the product inchis that came up as positive in the LCMS analysis. " +
          "The output is a list of SARs with percentageHits scores based on how predictive they are of the " +
          "reactions in the corpus.";

  public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

  static {
    HELP_FORMATTER.setWidth(100);
  }

  // This scoring function heuristically balances two concerns: SARs with a high hit percentage should rank
  // highly, but SARs with only a couple of total matches should not.
  private static final SarTreeNode.ScoringFunctions SAR_SCORING_FUNCTION = SarTreeNode.ScoringFunctions.HIT_MINUS_MISS;

  private static final Integer THRESHOLD_TREE_SIZE = 2; // Any SAR that is not simply one specific substrate is allowed.

  public static void main(String[] args) throws Exception {

    // Build command line parser.
    Options opts = new Options();
    for (Option.Builder b : OPTION_BUILDERS) {
      opts.addOption(b.build());
    }

    CommandLine cl = null;
    try {
      CommandLineParser parser = new DefaultParser();
      cl = parser.parse(opts, args);
    } catch (ParseException e) {
      LOGGER.error("Argument parsing failed: %s", e.getMessage());
      exitWithHelp(opts);
    }

    // Print help.
    if (cl.hasOption(OPTION_HELP)) {
      HELP_FORMATTER.printHelp(LibMcsClustering.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      return;
    }

    File inputCorpusFile = new File(cl.getOptionValue(OPTION_PREDICTION_CORPUS));
    File positiveInchisFile = new File(cl.getOptionValue(OPTION_POSITIVE_INCHIS));
    File sartreeFile = new File(cl.getOptionValue(OPTION_SARTREE_PATH));
    File scoredSarFile = new File(cl.getOptionValue(OPTION_SCORED_SAR_PATH));

    JavaRunnable clusterer = getClusterer(inputCorpusFile, sartreeFile);
    JavaRunnable scorer = getSarScorer(
        inputCorpusFile,
        sartreeFile,
        positiveInchisFile,
        scoredSarFile,
        SAR_SCORING_FUNCTION,
        THRESHOLD_TREE_SIZE);

    LOGGER.info("Running clustering.");
    clusterer.run();

    LOGGER.info("Running SAR scoring.");
    scorer.run();

    LOGGER.info("Complete!");
  }

  private static void exitWithHelp(Options opts) {
    HELP_FORMATTER.printHelp(LibMcsClustering.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
    System.exit(1);
  }

  /**
   * Reads in a prediction corpus, containing only one RO's predictions, and builds a clustering tree of the
   * substrates. Returns every SarTreeNode in the tree.
   * This method is static because it does not rely on any properties of the enclosing class to construct the job.
   * TODO: It would probably make more sense to make this its own class, i.e. <Clusterer implements JavaRunnable>
   *
   * @param predictionCorpusInput The prediction corpus input file.
   * @param sarTreeNodesOutput The file to which to write the SarTreeNodeList of every node in the clustering tree.
   * @return A JavaRunnable to run the appropriate clustering.
   */
  public static JavaRunnable getClusterer(File predictionCorpusInput, File sarTreeNodesOutput) {
    return new JavaRunnable() {

      @Override
      public void run() throws IOException {
        // Verify input and output files
        FileChecker.verifyInputFile(predictionCorpusInput);
        FileChecker.verifyAndCreateOutputFile(sarTreeNodesOutput);

        // Build input corpus and substrate list
        L2PredictionCorpus inputCorpus = L2PredictionCorpus.readPredictionsFromJsonFile(predictionCorpusInput);

        // Get list of molecules, tagged by PredictionId so we can track back to the corresponding predictions later on.
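        // Each Molecule carries its originating prediction ids as a List<Integer> stored under
        // SarTreeNode.PREDICTION_ID_KEY (see importMoleculesWithPredictionIds below), so the ids survive even when
        // LibraryMCS later normalizes the structures and changes their inchis.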
        Collection<Molecule> molecules = importMoleculesWithPredictionIds(inputCorpus);

        // Run substrate clustering
        SarTree sarTree = new SarTree();
        try {
          sarTree.buildByClustering(new LibraryMCS(), molecules);
        } catch (InterruptedException e) {
          LOGGER.error("Threw interrupted exception during buildByClustering: %s", e.getMessage());
          throw new RuntimeException(e);
        }

        // Write output to file
        SarTreeNodeList nodeList = new SarTreeNodeList(new ArrayList<>(sarTree.getNodes()));
        nodeList.writeToFile(sarTreeNodesOutput);
      }

      @Override
      public String toString() {
        return "SarClusterer:" + predictionCorpusInput.getName();
      }

      /**
       * Imports the substrate molecules from the prediction corpus, and tags them with their prediction ids from
       * the corpus, using Chemaxon's setPropertyObject. These prediction ids can then be recovered from the molecules
       * later, and mapped back to the original inchis, even if the molecule has been changed so that its inchi is no
       * longer the same. In particular, LibraryMCS strips stereo and other layers from the inchis, which would make
       * it hard to track LCMS hits and misses between the output molecules of LibraryMCS and the initial inchis fed
       * into LibraryMCS.
       *
       * @param inputCorpus The prediction corpus.
       * @return A list of molecules, each tagged with a property object which contains the corresponding list of
       * prediction ids.
       * @throws MolFormatException If a molecule cannot be imported from an inchi.
       */
      private Collection<Molecule> importMoleculesWithPredictionIds(L2PredictionCorpus inputCorpus)
          throws MolFormatException {
        Map<String, Molecule> inchiToMoleculeMap = new HashMap<>();

        for (L2Prediction prediction : inputCorpus.getCorpus()) {
          for (String substrateInchi : prediction.getSubstrateInchis()) { // For now this should only be one substrate
            if (!inchiToMoleculeMap.containsKey(substrateInchi)) {
              Molecule mol = importMoleculeWithPredictionId(substrateInchi, prediction.getId());
              inchiToMoleculeMap.put(substrateInchi, mol);
            } else {
              List<Integer> predictionIds = (ArrayList<Integer>) inchiToMoleculeMap
                  .get(substrateInchi)
                  .getPropertyObject(SarTreeNode.PREDICTION_ID_KEY);
              predictionIds.add(prediction.getId());
            }
          }
        }

        return inchiToMoleculeMap.values();
      }

      private Molecule importMoleculeWithPredictionId(String inchi, Integer id) throws MolFormatException {
        Molecule mol = MolImporter.importMol(inchi, "inchi");
        List<Integer> predictionIds = new ArrayList<>();
        predictionIds.add(id);
        mol.setPropertyObject(SarTreeNode.PREDICTION_ID_KEY, predictionIds);
        return mol;
      }
    };
  }

  /**
   * Reads in an already-built SarTree from a SarTreeNodeList, and scores the SARs based on LCMS results loaded from
   * an IonAnalysisInterchangeModel file.
   * This method is static because it does not rely on any properties of the enclosing class to construct the job.
   * TODO: It would probably make more sense to make this its own class, i.e. <SarScorer implements JavaRunnable>
   *
   * @param predictionsFile The prediction corpus file; used, together with the LCMS results, to score the SARs.
   * @param sarTreeInput An input file containing a SarTreeNodeList with all SARs from the clustering tree.
   * @param lcmsInput File with LCMS hits.
   * @param sarTreeNodeOutput The output file to which to write the relevant SARs from the corpus, sorted in decreasing
   * order of confidence.
   * @param scoringFunction The function used to score and rank the SARs.
   * @param subtreeThreshold The minimum number of leaves a SAR should match to be returned.
   * @return A JavaRunnable to run the SAR scoring.
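   *
   * A minimal usage sketch (the file names here are illustrative, not part of the API):
   * <pre>
   *   JavaRunnable scorer = LibMcsClustering.getSarScorer(
   *       new File("predictions.json"),     // L2PredictionCorpus the SarTree was built from
   *       new File("sartree_nodes.json"),   // SarTreeNodeList written by getClusterer
   *       new File("lcms_results.json"),    // IonAnalysisInterchangeModel of LCMS hits
   *       new File("scored_sars.json"),     // output: scored SARs, sorted by the scoring function
   *       SarTreeNode.ScoringFunctions.HIT_MINUS_MISS,
   *       2);
   *   scorer.run();
   * </pre>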
   */
  public static JavaRunnable getSarScorer(
      File predictionsFile,
      File sarTreeInput,
      File lcmsInput,
      File sarTreeNodeOutput,
      SarTreeNode.ScoringFunctions scoringFunction,
      Integer subtreeThreshold) {

    return new JavaRunnable() {

      @Override
      public void run() throws IOException {
        // Verify input and output files
        FileChecker.verifyInputFile(predictionsFile);
        FileChecker.verifyInputFile(lcmsInput);
        FileChecker.verifyInputFile(sarTreeInput);
        FileChecker.verifyAndCreateOutputFile(sarTreeNodeOutput);

        L2PredictionCorpus predictionCorpus = L2PredictionCorpus.readPredictionsFromJsonFile(predictionsFile);

        // Build SAR tree from input file
        SarTreeNodeList nodeList = new SarTreeNodeList();
        nodeList.loadFromFile(sarTreeInput);
        SarTree sarTree = new SarTree();
        nodeList.getSarTreeNodes().forEach(node -> sarTree.addNode(node));

        // Build LCMS results
        IonAnalysisInterchangeModel lcmsResults = new IonAnalysisInterchangeModel();
        lcmsResults.loadResultsFromFile(lcmsInput);

        // Build sar scorer from LCMS and prediction corpus
        SarTreeBasedCalculator sarScorer = new SarTreeBasedCalculator(sarTree, predictionCorpus, lcmsResults);

        // Score SARs
        // Calculate hits and misses
        sarTree.applyToNodes(sarScorer, subtreeThreshold);
        // Retain nodes that are not repeats or leaves, and have more than 0 LCMS hits
        SarTreeNodeList treeNodeList = sarTree.getExplanatoryNodes(subtreeThreshold, 0.0);
        // Sort by the supplied scoring function.
        treeNodeList.sortBy(scoringFunction);

        // Write out output.
        treeNodeList.writeToFile(sarTreeNodeOutput);
      }

      @Override
      public String toString() {
        return "SarScorer:" + sarTreeInput.getName();
      }
    };
  }
}
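
// Illustrative end-to-end invocation (a sketch only; the jar name and file paths below are hypothetical, while the
// long option names match OPTION_BUILDERS above):
//
//   java -cp act.jar com.act.biointerpretation.sarinference.LibMcsClustering \
//       --input-corpus-path predictions.json \
//       --input-positive-inchis positive_inchis.json \
//       --sartree-path sartree_nodes.json \
//       --scored-sar-path scored_sars.json
//
// This runs getClusterer on the prediction corpus and then getSarScorer on the resulting SarTreeNodeList, writing
// the scored SARs to the --scored-sar-path file.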