/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.sarinference;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import chemaxon.struc.Molecule;
import com.act.biointerpretation.l2expansion.L2Prediction;
import com.act.biointerpretation.sars.Sar;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.util.Arrays;
import java.util.Optional;
/**
* Test a given prediction against all Sars in a corpus, and return the highest-scored matching Sar. This assumes that
* the SarTreeNodes used have all already been scored in some way. This class then checks, for any given
* prediction, which Sars in the SarTreeNodeList match that prediction, and returns the one that has the highest
* confidence, so that we can associate that confident SAR with the prediction. i.e., a prediction that matches a
* SAR which has an 80% hit rate should be considered a much more likely prediction than one which only matches
* SARs with 10% hit rates. Roughly, we'd expect the first one to be correct with probability .8, and the second to
* be correct with only probability .1.
*/
public class BestSarFinder {

  private static final Logger LOGGER = LogManager.getFormatterLogger(BestSarFinder.class);

  // The scored Sar corpus that apply() searches. Visibility is left package-level, as originally declared.
  SarTreeNodeList sarTreeNodes;

  /**
   * Build a finder over the given corpus of SarTreeNodes.
   *
   * @param sarTreeNodes The SarTreeNodes whose Sars will be tested against each prediction.
   */
  public BestSarFinder(SarTreeNodeList sarTreeNodes) {
    this.sarTreeNodes = sarTreeNodes;
  }

  /**
   * Find the highest scored Sar in the SarTreeNodeList that matches the given prediction.
   *
   * The score used for ranking is the value returned by {@code SarTreeNode.getPercentageHits()}. When several
   * matching Sars share the highest score, the first one encountered in the list is returned. A matching Sar whose
   * score is 0 is still a match and can be returned; a Sar whose score is NaN is never selected, because NaN
   * compares false against every value.
   *
   * @param prediction The prediction to score. It must have exactly one substrate.
   * @return The highest scored matching Sar, or empty if no Sar matches, if the prediction does not have exactly
   * one substrate, or if the substrate's inchi cannot be imported.
   */
  public Optional<SarTreeNode> apply(L2Prediction prediction) {
    if (prediction.getSubstrates().size() != 1) {
      LOGGER.error("BestSarFinder only works on single substrate predictions.");
      return Optional.empty();
    }

    // Import the substrate into chemaxon
    String substrateInchi = prediction.getSubstrateInchis().get(0);
    Molecule substrate;
    try {
      substrate = MolImporter.importMol(substrateInchi, "inchi");
    } catch (MolFormatException e) {
      LOGGER.error("Couldn't import molecule %s: %s", substrateInchi, e.getMessage());
      return Optional.empty();
    }

    // Iterate over the SarTreeNodes. For each one that matches the substrate, get its score. Return the highest
    // scored matching Sar as the "best sar" for this prediction.
    //
    // The running maximum starts at negative infinity rather than 0 so that a matching Sar with a score of exactly
    // 0 is still reported as a match instead of being indistinguishable from "no Sar matched".
    double bestScore = Double.NEGATIVE_INFINITY;
    Optional<SarTreeNode> bestSarTreeNode = Optional.empty();

    for (SarTreeNode scoredSar : sarTreeNodes.getSarTreeNodes()) {
      Sar sar = scoredSar.getSar();
      if (!sar.test(Arrays.asList(substrate))) {
        continue;
      }

      double sarScore = scoredSar.getPercentageHits();
      // Strict comparison: ties keep the earlier node, and NaN scores are skipped.
      if (sarScore > bestScore) {
        bestScore = sarScore;
        bestSarTreeNode = Optional.of(scoredSar);
      }
    }

    return bestSarTreeNode;
  }
}