/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.sarinference;
import com.act.biointerpretation.l2expansion.L2FilteringDriver;
import com.act.biointerpretation.l2expansion.L2Prediction;
import com.act.biointerpretation.l2expansion.L2PredictionCorpus;
import com.act.jobs.FileChecker;
import com.act.jobs.JavaRunnable;
import com.act.lcms.db.io.report.IonAnalysisInterchangeModel;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
/**
 * Command-line entry point and job factory for ranking the predictions of an
 * {@link L2PredictionCorpus} by the ranking score of the best SAR matched to each prediction.
 *
 * <p>The scoring logic lives in the {@link JavaRunnable} returned by
 * {@link #getProductScorer(File, File, File, File)}; {@link #main(String[])} only parses the
 * command line and runs that job.
 */
public class ProductScorer {

  private static final Logger LOGGER = LogManager.getFormatterLogger(ProductScorer.class);
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  static {
    OBJECT_MAPPER.enable(SerializationFeature.INDENT_OUTPUT);
  }

  private static final String OPTION_PREDICTION_CORPUS = "c";
  private static final String OPTION_LCMS_RESULTS = "p";
  private static final String OPTION_SCORED_SARS = "s";
  private static final String OPTION_OUTPUT_PATH = "o";

  /** Builders for every command line option understood by {@link #main(String[])}. */
  public static final List<Option.Builder> OPTION_BUILDERS = buildOptionBuilders();

  public static final String HELP_MESSAGE =
      "This class is used to rank the products of PredictionCorpus according to a set of SARs.";

  public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

  static {
    HELP_FORMATTER.setWidth(100);
  }

  /**
   * Assembles the list of option builders as a plain {@link ArrayList}, avoiding the anonymous
   * subclass that double-brace initialization would create.
   */
  private static List<Option.Builder> buildOptionBuilders() {
    List<Option.Builder> builders = new ArrayList<>();
    builders.add(Option.builder(OPTION_PREDICTION_CORPUS)
        .argName("input corpus path")
        .desc("The absolute path to the input prediction corpus.")
        .hasArg()
        .longOpt("input-corpus-path")
        .required(true)
    );
    // main() unconditionally builds a File from this option's value, so it must be supplied;
    // marking it required turns a NullPointerException into a usage message.
    builders.add(Option.builder(OPTION_LCMS_RESULTS)
        .argName("lcms results")
        .desc("The path to a file of lcms results.")
        .hasArg()
        .longOpt("input-lcms-results")
        .required(true)
    );
    // Required for the same reason as the LCMS results option above.
    builders.add(Option.builder(OPTION_SCORED_SARS)
        .argName("scored sars corpus")
        .desc("The path to a file of scored SARs.")
        .hasArg()
        .longOpt("input-scored-sars")
        .required(true)
    );
    builders.add(Option.builder(OPTION_OUTPUT_PATH)
        .argName("output path")
        .desc("The path to which to write the output.")
        .hasArg()
        .longOpt("output-path")
        .required(true)
    );
    return builders;
  }

  /**
   * Parses the command line, builds the product scoring job and runs it. On a parse failure the
   * error is logged, usage help is printed, and the JVM exits with status 1.
   *
   * @param args The command line arguments; see {@link #OPTION_BUILDERS}.
   * @throws Exception If the scoring job fails.
   */
  public static void main(String[] args) throws Exception {
    // Build command line parser.
    Options opts = new Options();
    for (Option.Builder b : OPTION_BUILDERS) {
      opts.addOption(b.build());
    }

    CommandLine cl = null;
    try {
      CommandLineParser parser = new DefaultParser();
      cl = parser.parse(opts, args);
    } catch (ParseException e) {
      LOGGER.error("Argument parsing failed: %s", e.getMessage());
      // Report usage under this class's own name.
      HELP_FORMATTER.printHelp(ProductScorer.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      System.exit(1);
      return;
    }

    File inputCorpusFile = new File(cl.getOptionValue(OPTION_PREDICTION_CORPUS));
    File lcmsFile = new File(cl.getOptionValue(OPTION_LCMS_RESULTS));
    File scoredSarsFile = new File(cl.getOptionValue(OPTION_SCORED_SARS));
    File outputFile = new File(cl.getOptionValue(OPTION_OUTPUT_PATH));

    JavaRunnable productScoreRunner = getProductScorer(
        inputCorpusFile,
        scoredSarsFile,
        lcmsFile,
        outputFile);

    LOGGER.info("Scoring products.");
    productScoreRunner.run();
    LOGGER.info("Complete!.");
  }

  /**
   * Reads in scored SARs, checks them against a prediction corpus and LCMS results to get a product ranking.
   * This method is static because it does not rely on any properties of the enclosing class to construct the job.
   * TODO: It would probably make more sense to make this its own class, i.e. <ProductScorer implements JavaRunnable>
   * TODO: improve the data structure used to store scored products- using an L2PredictionCorpus is pretty ugly
   *
   * @param predictionCorpus The file containing the prediction corpus to score.
   * @param scoredSars The file containing the scored SARs to use.
   * @param lcmsFile The file of LCMS results, whose per-prediction data is appended to each projector name.
   * @param outputFile The file to which the ranked prediction corpus is written as JSON.
   * @return A JavaRunnable to run the product scoring.
   */
  public static JavaRunnable getProductScorer(File predictionCorpus, File scoredSars, File lcmsFile, File outputFile) {
    return new JavaRunnable() {
      @Override
      public void run() throws IOException {
        // Verify files
        FileChecker.verifyInputFile(predictionCorpus);
        FileChecker.verifyInputFile(scoredSars);
        FileChecker.verifyInputFile(lcmsFile);
        FileChecker.verifyAndCreateOutputFile(outputFile);

        // Build SAR node list and best sar finder
        SarTreeNodeList nodeList = new SarTreeNodeList();
        nodeList.loadFromFile(scoredSars);
        BestSarFinder sarFinder = new BestSarFinder(nodeList);

        // Build prediction corpus
        L2PredictionCorpus predictions = L2PredictionCorpus.readPredictionsFromJsonFile(predictionCorpus);

        // Build LCMS results
        IonAnalysisInterchangeModel lcmsResults = new IonAnalysisInterchangeModel();
        lcmsResults.loadResultsFromFile(lcmsFile);

        // Build map from predictions to their scores based on SAR.
        // For each prediction, we add on auxiliary info about its SARs and score to its projector name.
        // TODO: build data structure to store a scored prediction, instead of hijacking the projector name.
        //
        // A LinkedHashMap keeps the predictions in corpus order, so that the stable sort below yields
        // a deterministic ordering among predictions with equal scores.
        Map<L2Prediction, Double> predictionToScoreMap = new LinkedHashMap<>();

        LOGGER.info("Scoring predictions.");
        for (L2Prediction prediction : predictions.getCorpus()) {
          // Always tack LCMS result onto name
          String nameAppendage = lcmsResults.getLcmsDataForPrediction(prediction).toString();

          Optional<SarTreeNode> maybeBestSar = sarFinder.apply(prediction);
          if (maybeBestSar.isPresent()) {
            // If a SAR was matched, add info about it to the projector name, and put its score into the map
            SarTreeNode bestSar = maybeBestSar.get();
            nameAppendage += ":" + bestSar.getHierarchyId() + ":" + bestSar.getRankingScore();
            prediction.setProjectorName(prediction.getProjectorName() + nameAppendage);
            predictionToScoreMap.put(prediction, bestSar.getRankingScore());
          } else {
            // If no SAR is found, append "NO_SAR" to the prediction, and give it a ranking score of 0.
            // NOTE(review): unlike the matched branch, no ":" separator precedes NO_SAR. The format is
            // left untouched because downstream consumers may rely on it -- confirm before changing.
            nameAppendage += "NO_SAR";
            prediction.setProjectorName(prediction.getProjectorName() + nameAppendage);
            predictionToScoreMap.put(prediction, 0D);
          }
        }

        LOGGER.info("Sorting predictions in decreasing order of best associated SAR rank.");
        List<L2Prediction> predictionList = new ArrayList<>(predictionToScoreMap.keySet());
        // Arguments are swapped (b before a) to sort from highest score to lowest.
        predictionList.sort((a, b) -> Double.compare(
            predictionToScoreMap.get(b),
            predictionToScoreMap.get(a)));

        // Wrap results in a corpus and write to file.
        L2PredictionCorpus finalCorpus = new L2PredictionCorpus(predictionList);
        finalCorpus.writePredictionsToJsonFile(outputFile);
        LOGGER.info("Complete!.");
      }

      @Override
      public String toString() {
        return "ProductScorer:" + scoredSars.getName();
      }
    };
  }
}