/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package act.installer.reachablesexplorer; import act.server.DBIterator; import act.server.MongoDB; import com.act.biointerpretation.l2expansion.L2InchiCorpus; import com.act.jobs.FileChecker; import com.act.utils.CLIUtil; import com.act.utils.ProcessRunner; import com.act.workflow.tool_manager.workflow.workflow_mixins.mongo.ChemicalKeywords; import com.act.workflow.tool_manager.workflow.workflow_mixins.mongo.MongoKeywords; import com.mongodb.BasicDBObject; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Option; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; public class WordCloudGenerator { /** * This class allow the generation of wordclouds, using R, for any inchi having a Bing reference. * It requires an R script, that takes an InChI as argument and writes a word cloud to a file */ private static final String RSCRIPT_EXE_PATH = "/usr/bin/Rscript"; // TODO: find this using `env` instead. private static final String RSCRIPT_LOCATION = "src/main/r/RWordCloudGenerator.R"; private static final Logger LOGGER = LogManager.getFormatterLogger(WordCloudGenerator.class); private static final String PNG_EXTENSION = ".png"; private static final long CHILD_PROCESS_TIMEOUT_IN_SECONDS = 60; // Thomas thinks this is plenty of time for a cloud. private static final String OPTION_DB_HOST = "H"; private static final String OPTION_DB_PORT = "p"; private static final String OPTION_INSTALLER_SOURCE_DB = "i"; private static final String OPTION_RENDERING_CACHE = "e"; private static final String OPTION_INPUT_INCHIS = "l"; private static final String OPTION_RSCRIPT_EXE_PATH = "r"; private static final String DEFAULT_ASSETS_LOCATION = "data/reachables-explorer-rendering-cache"; // Default host. If running on a laptop, please set a SSH bridge to access speakeasy private static final String DEFAULT_HOST = "localhost"; private static final String DEFAULT_PORT = "27017"; private static final String DEFAULT_CHEMICALS_DATABASE = "SHOULD_COME_FROM_CMDLINE"; // "jarvis_2016-12-09"; public static final String HELP_MESSAGE = StringUtils.join(new String[]{ "This class allows WordCloud generation as a separate process from the Loader" }, " "); public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{ add(Option.builder(OPTION_DB_HOST) .argName("DB host") .desc(String.format("The database host to which to connect (default: %s)", DEFAULT_HOST)) .hasArg() .longOpt("db-host") ); add(Option.builder(OPTION_DB_PORT) .argName("DB port") .desc(String.format( "The port on which to connect to the database (default: %s)", DEFAULT_PORT)) .hasArg() .longOpt("db-port") ); add(Option.builder(OPTION_INSTALLER_SOURCE_DB) .argName("DB name") .desc(String.format( "The name of the database from which to fetch chemicals and reactions (default: %s)", DEFAULT_CHEMICALS_DATABASE)) .hasArg() .longOpt("source-db-name") .required() ); add(Option.builder(OPTION_RENDERING_CACHE) .argName("path to cache") .desc(String.format( "A directory in which to cache rendered images for reachables documents (default: %s)", DEFAULT_ASSETS_LOCATION)) .hasArg() .longOpt("cache-dir") ); add(Option.builder(OPTION_INPUT_INCHIS) .argName("path to inchis list") .desc("A list of input inchis for which to compute word clouds") .hasArg() .required() .longOpt("inchis-path") ); add(Option.builder(OPTION_RSCRIPT_EXE_PATH) .argName("rscript exe path") .desc(String.format( "The path to the Rscript exe for running R scripts. Default is %s. Can be determined by running \"which Rscript\"", RSCRIPT_EXE_PATH)) .hasArg() .required() .longOpt("r-location") ); }}; private File rScript; private String rScriptExePath; private String host; private Integer port; private String database; private MongoDB bingDb; private Set<String> inchisSet; private File assetLocation; public WordCloudGenerator(String host, Integer port, String database, String assetLocation, String rScriptExePath) { this.host = host; this.port = port; this.database = database; this.bingDb = new MongoDB(host, port, database); this.inchisSet = getBingInchis(); this.assetLocation = new File(assetLocation); this.rScript = new File(RSCRIPT_LOCATION); this.rScriptExePath = rScriptExePath; try { FileChecker.verifyInputFile(this.rScript); } catch (IOException e) { String msg = String.format("Failed to locate R script at %s", this.rScript.getAbsolutePath()); LOGGER.error(msg); throw new RuntimeException(msg); } if (!this.assetLocation.exists() || !this.assetLocation.isDirectory()) { String msg = String.format("Failed to locate asset location directory at %s", this.assetLocation.getAbsolutePath()); LOGGER.error(msg); throw new RuntimeException(msg); } } public WordCloudGenerator(String host, Integer port, String database, String assetLocation) { this(host, port, database, assetLocation, RSCRIPT_EXE_PATH); } public WordCloudGenerator(String host, Integer port, String database) { this(host, port, database, DEFAULT_ASSETS_LOCATION, RSCRIPT_EXE_PATH); } public Set<String> getBingInchis() { BasicDBObject query = new BasicDBObject("xref.BING.metadata.usage_terms.0", new BasicDBObject(MongoKeywords.EXISTS$.MODULE$.value(), true)); BasicDBObject keys = new BasicDBObject(ChemicalKeywords.INCHI$.MODULE$.value(), true); DBIterator ite = bingDb.getIteratorOverChemicals(query, keys); Set<String> bingSet = new HashSet<>(); while (ite.hasNext()) { BasicDBObject o = (BasicDBObject) ite.next(); String inchi = o.getString(ChemicalKeywords.INCHI$.MODULE$.value()); if (inchi != null) { bingSet.add(inchi); } } return bingSet; } public File getWordcloudFile(String inchi) { String md5 = DigestUtils.md5Hex(inchi); String postfix = new StringBuilder("-").append(md5).append(PNG_EXTENSION).toString(); String wordcloudFilename = String.join("", "wordcloud", postfix); return Paths.get(this.assetLocation.getPath(), wordcloudFilename).toFile(); } public File generateWordCloud(String inchi) { // TODO: improve wordcloud generation. Currently, each instance open a mongo connection on the R side. // By doing data manipulation in Java and utilizing Rengine, we could make this much better // Wordclouds could be generated ahead of time this way, using the inchi coprus File wordcloud = getWordcloudFile(inchi); if (!Files.exists(wordcloud.toPath()) && inchisSet.contains(inchi)) { try { ProcessRunner.runProcess( rScriptExePath, // TODO: remove hardcoded database from R script Arrays.asList(rScript.getAbsolutePath(), inchi, wordcloud.getAbsolutePath(), host, port.toString(), database), CHILD_PROCESS_TIMEOUT_IN_SECONDS); FileChecker.verifyInputFile(wordcloud); } catch (IOException e) { LOGGER.error("Unable to generate wordcloud for %s at location %s", inchi, wordcloud.toPath().toString()); return null; } catch (InterruptedException e) { LOGGER.error("Child process was interrupted: %s", e.getMessage()); return null; } } return wordcloud; } public static void main(String[] args) { CLIUtil cliUtil = new CLIUtil(Loader.class, HELP_MESSAGE, OPTION_BUILDERS); CommandLine cl = cliUtil.parseCommandLine(args); // TODO add possibility to run wordcloud generation as a post processing step, from a loaded reachables database File inchisFile = new File(cl.getOptionValue(OPTION_INPUT_INCHIS)); L2InchiCorpus inchiCorpus = new L2InchiCorpus(); try { inchiCorpus.loadCorpus(inchisFile); } catch (IOException e) { cliUtil.failWithMessage("Could not load inchi corpus from input file %s", inchisFile.getAbsolutePath()); } WordCloudGenerator wordCloudGenerator = new WordCloudGenerator( cl.getOptionValue(OPTION_DB_HOST, DEFAULT_HOST), Integer.parseInt(cl.getOptionValue(OPTION_DB_PORT, DEFAULT_PORT)), cl.getOptionValue(OPTION_INSTALLER_SOURCE_DB, DEFAULT_CHEMICALS_DATABASE), cl.getOptionValue(OPTION_RENDERING_CACHE, DEFAULT_ASSETS_LOCATION), cl.getOptionValue(OPTION_RSCRIPT_EXE_PATH, RSCRIPT_EXE_PATH) ); inchiCorpus.getInchiList().forEach(wordCloudGenerator::generateWordCloud); } }