/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.act.biointerpretation.sarinference; import com.act.biointerpretation.l2expansion.L2Prediction; import com.act.biointerpretation.l2expansion.L2PredictionCorpus; import com.act.lcms.db.io.report.IonAnalysisInterchangeModel; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.util.function.Consumer; /** * Calculates a SARs hit percentage score by seeing which of the leaves of its subtree are LCMS hits, * and which are LCMS misses. This is not a perfect scoring across an entire prediction corpus; it is essentially * a much faster, heuristic approach to simulating what SarHitPercentageCalculator does. The key assumption here is * that we don't have to look outside a given SAR's subtree when we're trying to assign it a score, since there is * probably almost nothing outside its subtree (in the clustering hierarchy) which will match it anyway. However, * as LibMCS clustering is not always perfect, there may in fact be molecules that match a SAR, which do not get * clustered under that SAR's subtree. In such cases, this heuristic score diverges from the full score calculated * across the entire corpus. */ public class SarTreeBasedCalculator implements Consumer<SarTreeNode> { private static final Logger LOGGER = LogManager.getFormatterLogger(SarTreeBasedCalculator.class); private final SarTree sarTree; private final L2PredictionCorpus predictionCorpus; private final IonAnalysisInterchangeModel lcmsResults; public SarTreeBasedCalculator(SarTree sarTree, L2PredictionCorpus corpus, IonAnalysisInterchangeModel lcmsResults) { this.sarTree = sarTree; this.predictionCorpus = corpus; this.lcmsResults = lcmsResults; } /** * Set the hits and missess on the given sarTreeNode based on which of the leaves in its subtree are hits, * and which are misses. * * @param sarTreeNode The node to score. */ @Override public void accept(SarTreeNode sarTreeNode) { int hits = 0; int misses = 0; for (SarTreeNode node : sarTree.traverseSubtree(sarTreeNode)) { // Only calculate on leaves if (sarTree.getChildren(node).isEmpty()) { switch (getLcmsDataForNode(node)) { case HIT: hits++; break; case MISS: misses++; break; } } } sarTreeNode.setNumberHits(hits); sarTreeNode.setNumberMisses(misses); } /** * Calculates whether a given SarTreeNode is a hit or not. If any of that node's predictions have positive * products, we consider it a hit. * * @param node The SarTreeNode. * @return True if at least one prediction Id of the node is an LCMS hit. */ public IonAnalysisInterchangeModel.LCMS_RESULT getLcmsDataForNode(SarTreeNode node) { if (node.getPredictionIds().isEmpty()) { throw new IllegalArgumentException("Cannot get LCMS results for a node with no predictions:" + node.getHierarchyId()); } /** * The results of the predictions should be the same for one substrate and one RO, as all of the results will * have the same MZ value. Thus, we pick out the first one, check that the others are all the same for sanity, * and then return the LCMS result of the first one. */ L2Prediction prediction = predictionCorpus.getPredictionFromId(node.getPredictionIds().get(0)); IonAnalysisInterchangeModel.LCMS_RESULT firstResult = lcmsResults.getLcmsDataForPrediction(prediction); for (Integer id : node.getPredictionIds()) { IonAnalysisInterchangeModel.LCMS_RESULT otherResult = lcmsResults.getLcmsDataForPrediction(predictionCorpus.getPredictionFromId(id)); if (otherResult != firstResult) { LOGGER.error("Different LCMS results for same substrate! %s and %s", firstResult, otherResult); } } return firstResult; } }