/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.sarinference;
import com.act.biointerpretation.l2expansion.L2Prediction;
import com.act.biointerpretation.l2expansion.L2PredictionCorpus;
import com.act.lcms.db.io.report.IonAnalysisInterchangeModel;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.util.List;
import java.util.function.Consumer;
/**
* Calculates a SAR's hit percentage score by seeing which of the leaves of its subtree are LCMS hits,
* and which are LCMS misses. This is not a perfect scoring across an entire prediction corpus; it is essentially
* a much faster, heuristic approach to simulating what SarHitPercentageCalculator does. The key assumption here is
* that we don't have to look outside a given SAR's subtree when we're trying to assign it a score, since there is
* probably almost nothing outside its subtree (in the clustering hierarchy) which will match it anyway. However,
* as LibMCS clustering is not always perfect, there may in fact be molecules that match a SAR, which do not get
* clustered under that SAR's subtree. In such cases, this heuristic score diverges from the full score calculated
* across the entire corpus.
*/
public class SarTreeBasedCalculator implements Consumer<SarTreeNode> {

  // Formatter logger: messages use printf-style placeholders (%s), not "{}".
  private static final Logger LOGGER = LogManager.getFormatterLogger(SarTreeBasedCalculator.class);

  private final SarTree sarTree;
  private final L2PredictionCorpus predictionCorpus;
  private final IonAnalysisInterchangeModel lcmsResults;

  /**
   * Creates a calculator that scores nodes of the given tree.
   *
   * @param sarTree The tree whose subtrees are traversed when scoring a node.
   * @param corpus The prediction corpus used to resolve a node's prediction IDs into predictions.
   * @param lcmsResults The LCMS results consulted for each resolved prediction.
   */
  public SarTreeBasedCalculator(SarTree sarTree, L2PredictionCorpus corpus, IonAnalysisInterchangeModel lcmsResults) {
    this.sarTree = sarTree;
    this.predictionCorpus = corpus;
    this.lcmsResults = lcmsResults;
  }

  /**
   * Sets the hits and misses on the given sarTreeNode based on which of the leaves in its subtree are hits,
   * and which are misses. Leaves whose LCMS result is neither HIT nor MISS contribute to neither count.
   *
   * @param sarTreeNode The node to score.
   * @throws IllegalArgumentException If a leaf in the subtree has no prediction IDs
   *                                  (propagated from {@link #getLcmsDataForNode(SarTreeNode)}).
   */
  @Override
  public void accept(SarTreeNode sarTreeNode) {
    int hits = 0;
    int misses = 0;

    for (SarTreeNode node : sarTree.traverseSubtree(sarTreeNode)) {
      // Only leaves are scored; nodes that have children are skipped.
      if (!sarTree.getChildren(node).isEmpty()) {
        continue;
      }

      switch (getLcmsDataForNode(node)) {
        case HIT:
          hits++;
          break;
        case MISS:
          misses++;
          break;
        default:
          // Any other LCMS result is deliberately left out of both counts.
          break;
      }
    }

    sarTreeNode.setNumberHits(hits);
    sarTreeNode.setNumberMisses(misses);
  }

  /**
   * Returns the LCMS result for a given SarTreeNode, taken from the first of the node's predictions.
   * The results of all of a node's predictions are expected to agree; any prediction whose result differs
   * from the first one is reported at error level, but the first result is still the one returned.
   *
   * @param node The SarTreeNode.
   * @return The LCMS result of the node's first prediction.
   * @throws IllegalArgumentException If the node has no prediction IDs.
   */
  public IonAnalysisInterchangeModel.LCMS_RESULT getLcmsDataForNode(SarTreeNode node) {
    List<Integer> predictionIds = node.getPredictionIds();
    if (predictionIds.isEmpty()) {
      throw new IllegalArgumentException("Cannot get LCMS results for a node with no predictions:" +
          node.getHierarchyId());
    }

    // The results of the predictions should be the same for one substrate and one RO, as all of the results
    // will have the same MZ value. Thus, we pick out the first one, check that the others are all the same
    // for sanity, and then return the LCMS result of the first one.
    L2Prediction firstPrediction = predictionCorpus.getPredictionFromId(predictionIds.get(0));
    IonAnalysisInterchangeModel.LCMS_RESULT firstResult = lcmsResults.getLcmsDataForPrediction(firstPrediction);

    // Start at the second ID: comparing the first prediction against itself would only repeat the lookup.
    for (Integer id : predictionIds.subList(1, predictionIds.size())) {
      IonAnalysisInterchangeModel.LCMS_RESULT otherResult =
          lcmsResults.getLcmsDataForPrediction(predictionCorpus.getPredictionFromId(id));
      if (otherResult != firstResult) {
        LOGGER.error("Different LCMS results for same substrate! %s and %s", firstResult, otherResult);
      }
    }

    return firstResult;
  }
}