/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.l2expansion;
import act.server.DBIterator;
import act.server.MongoDB;
import act.shared.Chemical;
import act.shared.Reaction;
import chemaxon.formats.MolFormatException;
import com.act.analysis.chemicals.molecules.MoleculeFormat;
import com.act.analysis.chemicals.molecules.MoleculeImporter;
import com.github.benmanes.caffeine.cache.Cache;
import com.github.benmanes.caffeine.cache.Caffeine;
import org.apache.commons.lang3.tuple.Pair;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
/**
* This class iterates over all reactions in a MongoDB that contain only valid InChIs as substrates or products,
* returning the substrates of each such reaction. This should limit the set of returned reactions to only those that
* are eligible for mechanistic validation.
*
* TODO: generalize this to iterate over reactions in addition to just substrates.
*/
public class ValidReactionSubstratesIterator implements Iterator<String[]> {
  private static final int DEFAULT_CACHE_SIZE = 10000;
  // Substring that marks an InChI as a placeholder; such InChIs are rejected without being parsed.
  private static final String FAKE_INCHI_MARKER = "FAKE";
  // Caffeine rejects null values, so chemicals with no InChI at all are cached under this placeholder.
  private static final String MISSING_INCHI_PLACEHOLDER = "";

  private final MongoDB db;
  private final DBIterator dbIter;
  private final Cache<Long, String> validInchiCache;
  private final Cache<Long, String> invalidInchiCache;
  // The reaction found by the most recent hasNext() call that has not yet been consumed by next(), or null.
  private Reaction nextValidReaction;

  /**
   * Creates an iterator over the reactions of the given DB.
   * @param db The DB whose reactions (and chemicals) are read.
   */
  public ValidReactionSubstratesIterator(MongoDB db) {
    this.db = db;
    this.dbIter = db.getIteratorOverReactions();
    this.validInchiCache = Caffeine.newBuilder().maximumSize(DEFAULT_CACHE_SIZE).build();
    this.invalidInchiCache = Caffeine.newBuilder().maximumSize(DEFAULT_CACHE_SIZE).build();
  }

  /* This iterator opportunistically loads a reaction when hasNext() is called, as it must inspect one or more
   * reactions in order to determine whether any more valid reactions exist in the DB.
   *
   * Once hasNext() has primed the iterator, next() simply extracts the reaction's substrates and fetches their
   * InChIs, which will usually already have been cached in this iterator.
   */
  @Override
  public boolean hasNext() {
    if (nextValidReaction != null) {
      return true; // hasNext should be safely callable any number of times.
    }

    // Skip over invalid reactions until a valid one is found or the DB iterator runs dry.
    while (dbIter.hasNext()) {
      Reaction candidate = db.getNextReaction(dbIter);
      if (candidate == null) {
        // Presumably the DB signals exhaustion this way; treat it as the end of iteration.
        return false;
      }
      if (reactionChemicalsAreValid(candidate)) {
        nextValidReaction = candidate;
        return true;
      }
    }
    return false;
  }

  /**
   * Returns the substrate InChIs of the next valid reaction, with each InChI repeated once per unit of its
   * substrate coefficient (a missing coefficient counts as one).
   * @return The substrate InChIs of the next valid reaction.
   * @throws NoSuchElementException If no valid reactions remain.
   */
  @Override
  public String[] next() {
    // Prime the iterator ourselves so that next() works without a preceding hasNext(), per the Iterator contract.
    if (!hasNext()) {
      throw new NoSuchElementException("next() called on an exhausted iterator");
    }

    Reaction r = nextValidReaction;
    nextValidReaction = null; // Consume the reaction so the following call advances to a new one.

    List<String> substrateInchis = new ArrayList<>(r.getSubstrates().length);
    for (Long id : r.getSubstrates()) {
      /* The cache-hit flag is deliberately not asserted on: the caches are size-bounded, so an entry written
       * during validation may already have been evicted.  The DB fallback in the lookup covers that case. */
      String inchi = getInchiAndIsCacheHit(id).getLeft();

      Integer coefficient = r.getSubstrateCoefficient(id);
      if (coefficient == null) {
        coefficient = 1; // Default to one if we can't find a coefficient for this substrate.
      }

      // Add the inchi once per coefficient count.
      for (int i = 0; i < coefficient; i++) {
        substrateInchis.add(inchi);
      }
    }
    return substrateInchis.toArray(new String[substrateInchis.size()]);
  }

  /**
   * Returns true iff the reaction has at least one substrate and all of its substrates/products have valid InChIs.
   * @param r The reaction to test.
   * @return True if the reaction's substrates/products have valid InChIs; false otherwise.
   */
  private boolean reactionChemicalsAreValid(Reaction r) {
    if (r.getSubstrates() == null || r.getSubstrates().length == 0) {
      return false;
    }

    for (Long id : r.getSubstrates()) {
      if (!validateChemicalForId(id)) {
        return false;
      }
    }

    if (r.getProducts() != null) {
      for (Long id : r.getProducts()) {
        if (!validateChemicalForId(id)) {
          return false;
        }
      }
    }
    return true;
  }

  /**
   * Validates and caches the InChI for a given chemical id.  InChIs are partitioned into different caches depending
   * on whether they're valid or not to reduce the incidence of invalid InChIs forcing valid ones out of the cache, but
   * still enjoying the performance benefit of caching for chemicals with invalid InChIs.
   *
   * A chemical is invalid if it cannot be found, has no InChI, has an InChI containing the fake-InChI marker, or has
   * an InChI that fails to parse.
   * @param id The chemical id whose InChI to fetch.
   * @return True if the chemical has a valid InChI, false otherwise.
   */
  private boolean validateChemicalForId(Long id) {
    if (invalidInchiCache.getIfPresent(id) != null) {
      return false;
    }

    if (validInchiCache.getIfPresent(id) != null) {
      return true;
    }

    Chemical c = db.getChemicalFromChemicalUUID(id);
    String inchi = c == null ? null : c.getInChI();
    if (inchi == null) {
      // Nothing to validate; remember the failure so the DB isn't queried again for this id.
      invalidInchiCache.put(id, MISSING_INCHI_PLACEHOLDER);
      return false;
    }

    if (inchi.contains(FAKE_INCHI_MARKER)) {
      // Reject immediately: falling through to the parser could otherwise land this id in both caches.
      invalidInchiCache.put(id, inchi);
      return false;
    }

    // TODO: can we skip this step and let the SPARK nodes do it?
    try {
      MoleculeImporter.importMolecule(inchi, MoleculeFormat.inchi$.MODULE$);
    } catch (MolFormatException e) {
      invalidInchiCache.put(id, inchi);
      return false;
    }

    validInchiCache.put(id, inchi);
    return true;
  }

  /**
   * Tries to fetch a chemical's InChI from the cache; falls back to the DB on a miss.  Does not update the cache
   * itself, as validation and cache partitioning is done elsewhere.
   * @param chemicalId The id of the chemical to look up.
   * @return A pair of the chemical's InChI and a boolean indicating whether the chemical was found in the valid cache.
   */
  private Pair<String, Boolean> getInchiAndIsCacheHit(Long chemicalId) {
    String inchi = validInchiCache.getIfPresent(chemicalId);
    if (inchi != null) {
      return Pair.of(inchi, true);
    }

    Chemical c = db.getChemicalFromChemicalUUID(chemicalId);
    return Pair.of(c.getInChI(), false);
  }
}