/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.analysis.surfactant;

import chemaxon.license.LicenseManager;
import com.act.lcms.db.io.LoadPlateCompositionIntoDB;
import com.act.utils.TSVParser;
import com.act.utils.TSVWriter;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Command-line driver for {@link SurfactantAnalysis}.
 *
 * <p>Runs the analysis either on a single InChI given on the command line or on every row of an
 * input TSV, and optionally writes the resulting features to an output TSV.  If the output file
 * already exists its rows are read back, re-written, and any input row whose {@code id} already
 * appears there is skipped.
 */
public class AnalysisDriver {
  public static final String OPTION_LICENSE_FILE = "l";
  public static final String OPTION_INCHI = "n";
  public static final String OPTION_INPUT_FILE = "i";
  public static final String OPTION_OUTPUT_FILE = "o";
  public static final String OPTION_DISPLAY = "d";
  public static final String OPTION_HELP = "h";

  public static final String HELP_MESSAGE = StringUtils.join(new String[]{
      "This is a driver for the SurfactantAnalysis class.  Given a list of input molecules or a single InChI, ",
      "it will apply the SurfactantAnalysis's structural metrics to the molecule(s) and write them to an output TSV ",
      "if a file is specified.  Visualization can also be enabled if a single InChI is provided."
  }, "");

  public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

  static {
    HELP_FORMATTER.setWidth(100);
  }

  public static final List<Option.Builder> OPTION_BUILDERS = createOptionBuilders();

  /**
   * Builds the list of command-line option builders.
   *
   * <p>A plain {@link ArrayList} is used rather than double-brace initialization, which would
   * create an anonymous subclass of {@code ArrayList}.
   *
   * @return a mutable list of option builders, one per supported option
   */
  private static List<Option.Builder> createOptionBuilders() {
    List<Option.Builder> builders = new ArrayList<>();
    builders.add(Option.builder(OPTION_LICENSE_FILE)
        .argName("path")
        .desc("The Chemaxon license file to load")
        .hasArg().required()
        .longOpt("license")
    );
    builders.add(Option.builder(OPTION_INCHI)
        .argName("inchi")
        .desc("A single inchi to analyze")
        .hasArg()
        .longOpt("inchi")
    );
    builders.add(Option.builder(OPTION_INPUT_FILE)
        .argName("input file")
        .desc("An input TSV of chemicals to analyze")
        .hasArg()
        .longOpt("input-file")
    );
    builders.add(Option.builder(OPTION_OUTPUT_FILE)
        .argName("output file")
        .desc("An output TSV in which to write features")
        .hasArg()
        .longOpt("output-file")
    );
    builders.add(Option.builder(OPTION_DISPLAY)
        .desc(String.format("Display the specified molecule (only works with -%s)", OPTION_INCHI))
        .longOpt("display")
    );
    // main() checks for "help", so the option has to be registered for that check to ever be true.
    builders.add(Option.builder(OPTION_HELP)
        .desc("Prints this help message")
        .longOpt("help")
    );
    return builders;
  }

  /**
   * Entry point: parses arguments, prepares the optional output TSV, and runs the analysis.
   *
   * @param args command-line arguments; see {@link #OPTION_BUILDERS}
   * @throws RuntimeException if neither an InChI nor an input file is specified
   * @throws Exception if parsing a TSV, writing the output, or a single-InChI analysis fails
   */
  public static void main(String[] args) throws Exception {
    Options opts = new Options();
    for (Option.Builder builder : OPTION_BUILDERS) {
      opts.addOption(builder.build());
    }

    CommandLine cl = null;
    try {
      CommandLineParser parser = new DefaultParser();
      cl = parser.parse(opts, args);
    } catch (ParseException e) {
      System.err.format("Argument parsing failed: %s\n", e.getMessage());
      printHelp(opts);
      System.exit(1);
      return; // Not reached at runtime; makes it clear that cl is non-null below.
    }

    if (cl.hasOption("help")) {
      printHelp(opts);
      return;
    }

    // Reject an unusable invocation before the output file is opened (and thereby rewritten).
    if (!cl.hasOption(OPTION_INCHI) && !cl.hasOption(OPTION_INPUT_FILE)) {
      throw new RuntimeException("Must specify inchi or input file");
    }

    Set<String> seenOutputIds = new HashSet<>();
    TSVWriter<String, String> tsvWriter = null;
    try {
      if (cl.hasOption(OPTION_OUTPUT_FILE)) {
        File outputFile = new File(cl.getOptionValue(OPTION_OUTPUT_FILE));
        List<Map<String, String>> oldResults = null;
        if (outputFile.exists()) {
          System.err.format("Output file already exists, reading old results and skipping processed molecules.\n");
          TSVParser outputParser = new TSVParser();
          outputParser.parse(outputFile);
          oldResults = outputParser.getResults();
          for (Map<String, String> row : oldResults) {
            // TODO: verify that the last row was written cleanly/completely.
            seenOutputIds.add(row.get("id"));
          }
        }

        // TODO: make this API more auto-closable friendly.
        TSVWriter<String, String> writer = new TSVWriter<>(buildHeader());
        writer.open(outputFile);
        // Only hand the writer to the finally block once it is actually open.
        tsvWriter = writer;

        if (oldResults != null) {
          System.out.format("Re-writing %d existing result rows\n", oldResults.size());
          tsvWriter.append(oldResults);
        }
      }

      LicenseManager.setLicenseFile(cl.getOptionValue(OPTION_LICENSE_FILE));

      if (cl.hasOption(OPTION_INCHI)) {
        analyzeSingleInchi(cl, tsvWriter);
      } else {
        // The guard above guarantees that an input file was given when no InChI was.
        analyzeInputFile(cl, tsvWriter, seenOutputIds);
      }
    } finally {
      if (tsvWriter != null) {
        tsvWriter.close();
      }
    }
  }

  /** Prints usage information for this driver using the shared help formatter. */
  private static void printHelp(Options opts) {
    HELP_FORMATTER.printHelp(AnalysisDriver.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
  }

  /** Builds the output TSV header: the fixed identifying columns followed by one column per feature. */
  private static List<String> buildHeader() {
    List<String> header = new ArrayList<>();
    header.add("name");
    header.add("id");
    header.add("inchi");
    header.add("label");
    for (SurfactantAnalysis.FEATURES f : SurfactantAnalysis.FEATURES.values()) {
      header.add(f.toString());
    }
    return header;
  }

  /**
   * Converts analysis features to a map of column name to formatted value.
   * Features are converted to strings to avoid some weird formatting issues.
   */
  private static Map<String, String> formatFeatures(Map<SurfactantAnalysis.FEATURES, Double> analysisFeatures) {
    Map<String, String> tsvFeatures = new HashMap<>();
    for (Map.Entry<SurfactantAnalysis.FEATURES, Double> entry : analysisFeatures.entrySet()) {
      tsvFeatures.put(entry.getKey().toString(), String.format("%.6f", entry.getValue()));
    }
    return tsvFeatures;
  }

  /** Analyzes the InChI given on the command line and appends its features if a writer is open. */
  private static void analyzeSingleInchi(CommandLine cl, TSVWriter<String, String> tsvWriter) throws Exception {
    Map<SurfactantAnalysis.FEATURES, Double> analysisFeatures =
        SurfactantAnalysis.performAnalysis(cl.getOptionValue(OPTION_INCHI), cl.hasOption(OPTION_DISPLAY));
    Map<String, String> tsvFeatures = formatFeatures(analysisFeatures);
    tsvFeatures.put("name", "direct-inchi-input");
    if (tsvWriter != null) {
      tsvWriter.append(tsvFeatures);
    }
  }

  /**
   * Analyzes every usable row of the input TSV, appending and flushing features row by row.
   *
   * <p>Rows missing any of name/id/inchi, rows whose id is in {@code seenOutputIds}, and rows whose
   * analysis throws are reported and skipped.  Display is always disabled in this mode.
   */
  private static void analyzeInputFile(CommandLine cl, TSVWriter<String, String> tsvWriter,
                                       Set<String> seenOutputIds) throws Exception {
    TSVParser parser = new TSVParser();
    parser.parse(new File(cl.getOptionValue(OPTION_INPUT_FILE)));
    List<Map<String, String>> inputRows = parser.getResults();

    int rowNumber = 0; // Just for warning messages.
    for (Map<String, String> row : inputRows) {
      rowNumber++;
      if (!row.containsKey("name") || !row.containsKey("id") || !row.containsKey("inchi")) {
        System.err.format("WARNING: TSV rows must contain at least name, id, and inchi, skipping row %d\n",
            rowNumber);
        continue;
      }
      if (seenOutputIds.contains(row.get("id"))) {
        System.out.format("Skipping input row with id already in output: %s\n", row.get("id"));
        continue;
      }

      System.out.format("Analysis for chemical %s\n", row.get("name"));
      Map<SurfactantAnalysis.FEATURES, Double> analysisFeatures;
      try {
        analysisFeatures = SurfactantAnalysis.performAnalysis(row.get("inchi"), false);
      } catch (Exception e) {
        // Ignore exceptions for now.  Sometimes the regression analysis or Chemaxon processing chokes unexpectedly.
        System.err.format("ERROR caught exception while processing '%s':\n", row.get("name"));
        System.err.format("%s\n", e.getMessage());
        e.printStackTrace(System.err);
        System.err.println("Skipping...");
        continue;
      }
      System.out.format("--- Done analysis for chemical %s\n", row.get("name"));

      Map<String, String> tsvFeatures = formatFeatures(analysisFeatures);
      tsvFeatures.put("name", row.get("name"));
      tsvFeatures.put("id", row.get("id"));
      tsvFeatures.put("inchi", row.get("inchi"));
      tsvFeatures.put("label", row.containsKey("label") ? row.get("label") : "?");
      if (tsvWriter != null) {
        tsvWriter.append(tsvFeatures);
        // Flush every time in case we crash or get interrupted.  The features must flow!
        tsvWriter.flush();
      }
    }
  }
}