/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.lcms.db.analysis;
import com.act.lcms.Gnuplotter;
import com.act.lcms.db.io.DB;
import com.act.lcms.db.io.LoadPlateCompositionIntoDB;
import com.act.utils.TSVParser;
import com.act.lcms.db.model.LCMSWell;
import com.act.lcms.db.model.Plate;
import com.act.lcms.db.model.ScanFile;
import com.act.lcms.db.model.StandardWell;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import java.io.File;
import java.io.FileOutputStream;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ConfigurableAnalysis {
public static final String OPTION_DIRECTORY = "d";
public static final String OPTION_OUTPUT_PREFIX = "o";
public static final String OPTION_CONFIG_FILE = "c";
public static final String OPTION_USE_HEATMAP = "e";
public static final String OPTION_FONT_SCALE = "s";
public static final String OPTION_USE_SNR = "r";
/* TODO: this format was based on a TSV sample Chris produced. We should improve it to better match the format of the
* data that is available in the DB. */
public static final String HEADER_SAMPLE = "trigger/sample";
public static final String HEADER_LABEL = "label";
public static final String HEADER_EXACT_MASS = "exact_mass";
public static final String HEADER_PRECISION = "precision";
public static final String HEADER_INTENSITY_RANGE_GROUP = "normalization_group";
public static final String[] EXPECTED_HEADER_FIELDS = new String[]{
HEADER_SAMPLE, HEADER_LABEL, HEADER_EXACT_MASS, HEADER_PRECISION, HEADER_INTENSITY_RANGE_GROUP
};
public static final String HELP_MESSAGE = StringUtils.join(new String[]{
"LCMS analysis class that takes as its input a configuration TSV and outputs plots ",
"of the plates/masses specified in that configuration file. The expected rows in the ",
"configuration file are based on a mockup Chris created, and look like:\n",
"`<plate barcode>|<well coordinates>\\t<label>\\t<mass or chemical>\\t",
"<resolution (only 0.01 or 0.001)>\\t<y axis range group>`\n\n",
"Expected TSV fields are:\n",
StringUtils.join(EXPECTED_HEADER_FIELDS, ", "), "\n\n"
}, "");
public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();
static {
HELP_FORMATTER.setWidth(100);
}
public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {{
add(Option.builder(OPTION_DIRECTORY)
.argName("directory")
.desc("The directory where LCMS analysis results live")
.hasArg().required()
.longOpt("data-dir")
);
add(Option.builder(OPTION_OUTPUT_PREFIX)
.argName("output prefix")
.desc("A prefix for the output data/pdf files")
.hasArg().required()
.longOpt("output-prefix")
);
add(Option.builder(OPTION_CONFIG_FILE)
.argName("configuration file")
.desc("A configuration TSV that determines the layout of the analysis")
.hasArg().required()
.longOpt("config-file")
);
add(Option.builder(OPTION_USE_SNR)
.desc("Use signal-to-noise ratio instead of max intensity for peak identification")
.longOpt("use-snr")
);
add(Option.builder(OPTION_USE_HEATMAP)
.desc("Produce a heat map rather than a 2d line plot")
.longOpt("heat-map")
);
add(Option.builder(OPTION_FONT_SCALE)
.argName("font scale")
.desc("A Gnuplot fontscale value, should be between 0.1 and 0.5 (0.4 works if the graph text is large")
.hasArg()
.longOpt("font-scale")
);
}};
static {
// Add DB connection options.
OPTION_BUILDERS.addAll(DB.DB_OPTION_BUILDERS);
}
public static class AnalysisStep {
public enum KIND {
HEADER,
SEPARATOR_LARGE,
SEPARATOR_SMALL,
SAMPLE,
}
private Integer index;
private KIND kind;
private String plateBarcode;
private String plateCoords;
private String label;
private Double exactMass;
private Boolean useFineGrainedMZTolerance;
private Integer intensityRangeGroup;
public AnalysisStep(Integer index, KIND kind, String plateBarcode, String plateCoords, String label,
Double exactMass, Boolean useFineGrainedMZTolerance, Integer intensityRangeGroup) {
this.index = index;
this.kind = kind;
this.plateBarcode = plateBarcode;
this.plateCoords = plateCoords;
this.label = label;
this.exactMass = exactMass;
this.useFineGrainedMZTolerance = useFineGrainedMZTolerance;
this.intensityRangeGroup = intensityRangeGroup;
}
public Integer getIndex() {
return index;
}
public KIND getKind() {
return kind;
}
public String getPlateBarcode() {
return plateBarcode;
}
public String getPlateCoords() {
return plateCoords;
}
public String getLabel() {
return label;
}
public Double getExactMass() {
return exactMass;
}
public Boolean getUseFineGrainedMZTolerance() {
return useFineGrainedMZTolerance;
}
public Integer getIntensityRangeGroup() {
return intensityRangeGroup;
}
}
public static final Pattern SAMPLE_WELL_PATTERN = Pattern.compile("^(.*)\\|([A-Za-z]+[0-9]+)$");
public static final String EXPECTED_LOW_PRECISION_STRING = "0.01";
public static final String EXPECTED_HIGH_PRECISION_STRING = "0.001";
public static AnalysisStep mapToStep(DB db, Integer index, Map<String, String> map) throws SQLException {
String sample = map.get(HEADER_SAMPLE);
if (sample == null || sample.isEmpty()) {
System.err.format("Missing required field %s in line %d, skipping\n", HEADER_SAMPLE, index);
return null;
}
if (sample.startsWith("<header")) {
return new AnalysisStep(index, AnalysisStep.KIND.HEADER, null, null, map.get(HEADER_LABEL), null, null, null);
}
if (sample.equals("<large_line_separator>")) {
return new AnalysisStep(index, AnalysisStep.KIND.SEPARATOR_LARGE, null, null, null, null, null, null);
}
if (sample.equals("<small_line_separator>")) {
return new AnalysisStep(index, AnalysisStep.KIND.SEPARATOR_SMALL, null, null, null, null, null, null);
}
Matcher sampleMatcher = SAMPLE_WELL_PATTERN.matcher(sample);
if (!sampleMatcher.matches()) {
System.err.format("Unable to interpret sample field %s in line %d, skipping\n", sample, index);
return null;
}
String plateBarcode = sampleMatcher.group(1);
String plateCoords = sampleMatcher.group(2);
String label = map.get(HEADER_LABEL);
Pair<String, Double> exactMass = Utils.extractMassFromString(db, map.get(HEADER_EXACT_MASS));
Boolean useFineGrainedMZTolerance = false;
String precisionString = map.get(HEADER_PRECISION);
if (precisionString != null && !precisionString.isEmpty()) {
switch (map.get(HEADER_PRECISION)) {
case EXPECTED_LOW_PRECISION_STRING:
useFineGrainedMZTolerance = false;
break;
case EXPECTED_HIGH_PRECISION_STRING:
useFineGrainedMZTolerance = true;
break;
case "low":
useFineGrainedMZTolerance = false;
break;
case "high":
useFineGrainedMZTolerance = true;
break;
case "coarse":
useFineGrainedMZTolerance = false;
break;
case "fine":
useFineGrainedMZTolerance = true;
break;
default:
System.err.format("Invalid precision value %s, defaulting to coarse-grained analysis\n", precisionString);
useFineGrainedMZTolerance = false;
break;
}
}
String intensityGroupString = map.get(HEADER_INTENSITY_RANGE_GROUP);
Integer intensityGroup = -1;
if (intensityGroupString != null && !intensityGroupString.isEmpty()) {
intensityGroup = Integer.parseInt(intensityGroupString);
}
return new AnalysisStep(index, AnalysisStep.KIND.SAMPLE, plateBarcode, plateCoords, label, exactMass.getRight(),
useFineGrainedMZTolerance, intensityGroup);
}
// TODO: do we want to make this configurable? Or should we always just search for M?
public static final Set<String> SEARCH_IONS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList("M+H")));
public static final Set<String> EMPTY_SET = Collections.unmodifiableSet(new HashSet<>(0));
public static void runAnalysis(DB db, File lcmsDir, String outputPrefix, List<AnalysisStep> steps,
boolean makeHeatmaps, Double fontScale, boolean useSNR)
throws SQLException, Exception {
HashMap<String, Plate> platesByBarcode = new HashMap<>();
HashMap<Integer, Plate> platesById = new HashMap<>();
Map<Integer, Pair<List<ScanData<LCMSWell>>, Double>> lcmsResults = new HashMap<>();
Map<Integer, Pair<List<ScanData<StandardWell>>, Double>> standardResults = new HashMap<>();
Map<Integer, Double> intensityGroupMaximums = new HashMap<>();
for (AnalysisStep step : steps) {
// There's no trace analysis to perform for non-sample steps.
if (step.getKind() != AnalysisStep.KIND.SAMPLE) {
continue;
}
Plate p = platesByBarcode.get(step.getPlateBarcode());
if (p == null) {
p = Plate.getPlateByBarcode(db, step.getPlateBarcode());
if (p == null) {
throw new IllegalArgumentException(String.format("Found invalid plate barcode '%s' for analysis component %d",
step.getPlateBarcode(), step.getIndex()));
}
platesByBarcode.put(p.getBarcode(), p);
platesById.put(p.getId(), p);
}
Pair<Integer, Integer> coords = Utils.parsePlateCoordinates(step.getPlateCoords());
List<Pair<String, Double>> searchMZs =
Collections.singletonList(Pair.of("Configured m/z value", step.getExactMass()));
Double maxIntesnsity = null;
switch (p.getContentType()) {
case LCMS:
// We don't know which of the scans are positive samples and which are negatives, so call them all positive.
List<LCMSWell> lcmsSamples = Collections.singletonList(
LCMSWell.getInstance().getByPlateIdAndCoordinates(db, p.getId(), coords.getLeft(), coords.getRight()));
Pair<List<ScanData<LCMSWell>>, Double> lcmsScanData =
AnalysisHelper.processScans(db, lcmsDir, searchMZs, ScanData.KIND.POS_SAMPLE, platesById, lcmsSamples,
step.getUseFineGrainedMZTolerance(), SEARCH_IONS, EMPTY_SET, useSNR);
lcmsResults.put(step.getIndex(), lcmsScanData);
maxIntesnsity = lcmsScanData.getRight();
break;
case STANDARD:
List<StandardWell> standardSamples = Collections.singletonList(
StandardWell.getInstance().getStandardWellsByPlateIdAndCoordinates(db, p.getId(), coords.getLeft(), coords.getRight()));
Pair<List<ScanData<StandardWell>>, Double> standardScanData =
AnalysisHelper.processScans(db, lcmsDir, searchMZs, ScanData.KIND.STANDARD, platesById, standardSamples,
step.getUseFineGrainedMZTolerance(), SEARCH_IONS, EMPTY_SET, useSNR);
standardResults.put(step.getIndex(), standardScanData);
maxIntesnsity = standardScanData.getRight();
break;
default:
throw new IllegalArgumentException(
String.format("Invalid plate content kind %s for plate %s in analysis component %d",
p.getContentType(), p.getBarcode(), step.getIndex()));
}
Double existingMax = intensityGroupMaximums.get(step.getIntensityRangeGroup());
if (existingMax == null || existingMax < maxIntesnsity) { // TODO: is this the right max intensity?
intensityGroupMaximums.put(step.getIntensityRangeGroup(), maxIntesnsity);
}
}
// Prep the chart labels/types, write out the data, and plot the charts.
File dataFile = new File(outputPrefix + ".data");
List<Gnuplotter.PlotConfiguration> plotConfigurations = new ArrayList<>(steps.size());
int numGraphs = 0;
try (FileOutputStream fos = new FileOutputStream(dataFile)) {
for (AnalysisStep step : steps) {
if (step.getKind() == AnalysisStep.KIND.HEADER) {
// TODO: change the Gnuplotter API to add headings and update this.
plotConfigurations.add(new Gnuplotter.PlotConfiguration(
Gnuplotter.PlotConfiguration.KIND.HEADER,
step.getLabel(),
null,
null
));
continue;
}
if (step.getKind() == AnalysisStep.KIND.SEPARATOR_LARGE ||
step.getKind() == AnalysisStep.KIND.SEPARATOR_SMALL) {
// TODO: change the Gnuplotter API to add headings and update this.
plotConfigurations.add(new Gnuplotter.PlotConfiguration(
Gnuplotter.PlotConfiguration.KIND.SEPARATOR,
"",
null,
null
));
continue;
}
Plate p = platesByBarcode.get(step.getPlateBarcode());
Double maxIntensity = intensityGroupMaximums.get(step.getIntensityRangeGroup());
switch (p.getContentType()) {
case LCMS:
Pair<List<ScanData<LCMSWell>>, Double> lcmsPair = lcmsResults.get(step.getIndex());
if (lcmsPair.getLeft().size() > 1) {
System.err.format("Found multiple scan files for LCMW well %s @ %s, using first\n",
step.getPlateBarcode(), step.getPlateCoords());
}
AnalysisHelper.writeScanData(fos, lcmsDir, maxIntensity,
lcmsPair.getLeft().get(0), makeHeatmaps, false);
break;
case STANDARD:
Pair<List<ScanData<StandardWell>>, Double> stdPair = standardResults.get(step.getIndex());
if (stdPair.getLeft().size() > 1) {
System.err.format("Found multiple scan files for standard well %s @ %s, using first\n",
step.getPlateBarcode(), step.getPlateCoords());
}
AnalysisHelper.writeScanData(fos, lcmsDir, maxIntensity,
stdPair.getLeft().get(0), makeHeatmaps, false);
break;
default:
// This case represents a bug, so it's a RuntimeException.
throw new RuntimeException(String.format("Found unexpected content type %s for plate %s on analysis step %d",
p.getContentType(), p.getBarcode(), step.getIndex()));
}
plotConfigurations.add(new Gnuplotter.PlotConfiguration(
Gnuplotter.PlotConfiguration.KIND.GRAPH,
step.getLabel(),
numGraphs,
maxIntensity
));
numGraphs++;
}
String fmt = "pdf";
File imgFile = new File(outputPrefix + "." + fmt);
Gnuplotter plotter = fontScale == null ? new Gnuplotter() : new Gnuplotter(fontScale);
if (makeHeatmaps) {
plotter.plotHeatmap(dataFile.getAbsolutePath(), imgFile.getAbsolutePath(),
fmt, null, null, plotConfigurations, imgFile + ".gnuplot");
} else {
plotter.plot2D(dataFile.getAbsolutePath(), imgFile.getAbsolutePath(),
"time", "intensity", fmt, null, null, plotConfigurations, imgFile + ".gnuplot");
}
}
}
public static void main(String[] args) throws Exception {
Options opts = new Options();
for (Option.Builder b : OPTION_BUILDERS) {
opts.addOption(b.build());
}
CommandLine cl = null;
try {
CommandLineParser parser = new DefaultParser();
cl = parser.parse(opts, args);
} catch (ParseException e) {
System.err.format("Argument parsing failed: %s\n", e.getMessage());
HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
System.exit(1);
}
if (cl.hasOption("help")) {
HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
return;
}
File lcmsDir = new File(cl.getOptionValue(OPTION_DIRECTORY));
if (!lcmsDir.isDirectory()) {
System.err.format("File at %s is not a directory\n", lcmsDir.getAbsolutePath());
HELP_FORMATTER.printHelp(LoadPlateCompositionIntoDB.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
System.exit(1);
}
Double fontScale = null;
if (cl.hasOption("font-scale")) {
try {
fontScale = Double.parseDouble(cl.getOptionValue("font-scale"));
} catch (IllegalArgumentException e) {
System.err.format("Argument for font-scale must be a floating point number.\n");
System.exit(1);
}
}
File configFile = new File(cl.getOptionValue(OPTION_CONFIG_FILE));
if (!configFile.isFile()) {
throw new IllegalArgumentException(String.format("Not a regular file at %s", configFile.getAbsolutePath()));
}
TSVParser parser = new TSVParser();
parser.parse(configFile);
try (DB db = DB.openDBFromCLI(cl)) {
System.out.format("Loading/updating LCMS scan files into DB\n");
ScanFile.insertOrUpdateScanFilesInDirectory(db, lcmsDir);
List<AnalysisStep> steps = new ArrayList<>(parser.getResults().size());
int i = 0;
for (Map<String, String> row : parser.getResults()) {
AnalysisStep step = mapToStep(db, i, row);
if (step != null) {
System.out.format("%d: %s '%s' %s %s %f %s\n", step.getIndex(), step.getKind(), step.getLabel(),
step.getPlateBarcode(), step.getPlateCoords(), step.getExactMass(), step.getUseFineGrainedMZTolerance());
}
steps.add(step);
i++;
}
System.out.format("Running analysis\n");
runAnalysis(db, lcmsDir, cl.getOptionValue(OPTION_OUTPUT_PREFIX),
steps, cl.hasOption(OPTION_USE_HEATMAP), fontScale, cl.hasOption(OPTION_USE_SNR));
}
}
}