/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.desalting;
import act.server.NoSQLAPI;
import act.shared.Chemical;
import act.shared.Reaction;
import act.shared.helpers.P;
import chemaxon.license.LicenseProcessingException;
import chemaxon.reaction.ReactionException;
import com.act.biointerpretation.BiointerpretationProcessor;
import com.act.biointerpretation.Utils.ReactionComponent;
import com.act.biointerpretation.Utils.ReactionProjector;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import static com.act.biointerpretation.Utils.ReactionComponent.PRODUCT;
import static com.act.biointerpretation.Utils.ReactionComponent.SUBSTRATE;
/**
 * ReactionDesalter does the processing of the database using an instance of {@link Desalter}.
 * This class creates Synapse from Dr. Know. Synapse is the database in which the chemicals
 * have been inspected for containing multiple species or ionized forms, and corrected.
 *
 * <p>Processing happens in two passes: every chemical is desalted first (building an
 * old-id -> new-ids cache plus a per-(old id, new id) fragment multiplier), and then every
 * reaction is rewritten in terms of the desalted chemical ids using that cache.
 *
 * Created by jca20n on 10/22/15.
 */
public class ReactionDesalter extends BiointerpretationProcessor {
  private static final Logger LOGGER = LogManager.getFormatterLogger(ReactionDesalter.class);

  private static final String PROCESSOR_NAME = "Desalter";
  private static final String FAKE = "FAKE";

  // Don't use the superclass's maps, as we might convert one chemical into many.
  private final Map<Long, List<Long>> oldChemicalIdToNewChemicalIds = new HashMap<>();
  private final Map<String, Long> inchiToNewId = new HashMap<>();
  // (old id, new id) -> number of copies of the new fragment produced by desalting the old chemical.
  private final Map<Pair<Long, Long>, Integer> desalterMultiplierMap = new HashMap<>();

  private Desalter desalter;
  private int desalterFailuresCounter = 0;

  @Override
  public String getName() {
    return PROCESSOR_NAME;
  }

  public ReactionDesalter(NoSQLAPI inputApi) {
    super(inputApi);
  }

  /**
   * Creates the underlying {@link Desalter}, initializes its reactors, and marks this processor as initialized.
   */
  @Override
  public void init() throws IOException, ReactionException, LicenseProcessingException {
    desalter = new Desalter(new ReactionProjector());
    desalter.initReactors();
    markInitialized();
  }

  /**
   * This function reads the chemicals and reactions from the db, desalts them and writes them back.
   * Chemicals are processed first so that the id mappings exist when the reactions are migrated.
   */
  @Override
  public void run() throws IOException, LicenseProcessingException, ReactionException {
    failIfNotInitialized();
    LOGGER.debug("Starting Reaction Desalter");
    long startTime = new Date().getTime();

    desaltAllChemicals();
    desaltAllReactions();

    long endTime = new Date().getTime();
    // LOGGER is a formatter logger, so hand it the pattern and arguments rather than a pre-formatted string.
    LOGGER.debug("Time in seconds: %d", (endTime - startTime) / 1000);
  }

  /**
   * Desalts every chemical read from the input DB, populating the id/multiplier caches used by
   * {@link #desaltAllReactions()}.
   */
  public void desaltAllChemicals() throws IOException, LicenseProcessingException, ReactionException {
    Iterator<Chemical> chemicals = getNoSQLAPI().readChemsFromInKnowledgeGraph();
    while (chemicals.hasNext()) {
      // Ignore results, as the cached mapping will be used for reaction desalting.
      desaltChemical(chemicals.next());
    }
    LOGGER.info("Encountered %d failures while desalting all molecules", desalterFailuresCounter);
  }

  /**
   * Scans through all reactions in the input DB and writes a desalted copy of each to the output DB.
   * Requires {@link #desaltAllChemicals()} to have run first; a reaction referencing an unmapped chemical
   * id results in a RuntimeException.
   */
  public void desaltAllReactions() throws IOException, LicenseProcessingException, ReactionException {
    Iterator<Reaction> reactionIterator = getNoSQLAPI().readRxnsFromInKnowledgeGraph();
    while (reactionIterator.hasNext()) {
      Reaction oldRxn = reactionIterator.next();

      // I don't like modifying reaction objects in place, so we'll create a fresh one and write it to the new DB.
      Reaction desaltedReaction = new Reaction(
          -1, // Assume the id will be set when the reaction is written to the DB.
          new Long[0],
          new Long[0],
          new Long[0],
          new Long[0],
          new Long[0],
          oldRxn.getECNum(),
          oldRxn.getConversionDirection(),
          oldRxn.getPathwayStepDirection(),
          oldRxn.getReactionName(),
          oldRxn.getRxnDetailType()
      );

      // Add the data source and references from the source to the destination
      desaltedReaction.setDataSource(oldRxn.getDataSource());
      for (P<Reaction.RefDataSource, String> ref : oldRxn.getReferences()) {
        desaltedReaction.addReference(ref.fst(), ref.snd());
      }

      migrateReactionSubsProdsWCoeffs(desaltedReaction, oldRxn);

      int newId = getNoSQLAPI().writeToOutKnowlegeGraph(desaltedReaction);

      migrateAllProteins(desaltedReaction, oldRxn, Long.valueOf(oldRxn.getUUID()));

      // Update the reaction in the DB with the newly migrated protein data.
      getNoSQLAPI().getWriteDB().updateActReaction(desaltedReaction, newId);
    }
  }

  /**
   * Copies substrates, products, their coefficients, and both cofactor lists from the old reaction onto the
   * new reaction, translating every chemical id through the desalting caches.
   *
   * @param newReaction The reaction to populate.
   * @param oldReaction The reaction whose chemicals should be translated.
   */
  private void migrateReactionSubsProdsWCoeffs(Reaction newReaction, Reaction oldReaction) {
    Pair<List<Long>, Map<Long, Integer>> newSubstratesAndCoefficients =
        buildIdAndCoefficientMapping(oldReaction, SUBSTRATE);
    List<Long> newSubstrates = newSubstratesAndCoefficients.getLeft();
    newReaction.setSubstrates(newSubstrates.toArray(new Long[newSubstrates.size()]));
    newReaction.setAllSubstrateCoefficients(newSubstratesAndCoefficients.getRight());

    List<Long> newSubstrateCofactors = buildIdMapping(oldReaction.getSubstrateCofactors());
    newReaction.setSubstrateCofactors(newSubstrateCofactors.toArray(new Long[newSubstrateCofactors.size()]));

    Pair<List<Long>, Map<Long, Integer>> newProductsAndCoefficients =
        buildIdAndCoefficientMapping(oldReaction, PRODUCT);
    List<Long> newProducts = newProductsAndCoefficients.getLeft();
    newReaction.setProducts(newProducts.toArray(new Long[newProducts.size()]));
    newReaction.setAllProductCoefficients(newProductsAndCoefficients.getRight());

    List<Long> newProductCofactors = buildIdMapping(oldReaction.getProductCofactors());
    newReaction.setProductCofactors(newProductCofactors.toArray(new Long[newProductCofactors.size()]));
  }

  /**
   * Translates a list of old chemical ids into the de-duplicated list of new chemical ids, preserving the
   * order in which new ids are first encountered.
   *
   * @param oldChemIds Old chemical ids to translate.
   * @return The distinct new chemical ids, in first-seen order.
   * @throws RuntimeException if an old id has no cached mapping.
   */
  private List<Long> buildIdMapping(Long[] oldChemIds) {
    LinkedHashSet<Long> newIDs = new LinkedHashSet<>(oldChemIds.length);
    for (Long oldChemId : oldChemIds) {
      newIDs.addAll(lookupNewChemicalIds(oldChemId));
    }
    // The copy constructor adds elements in the source collection's iteration order, which for a
    // LinkedHashSet is insertion order.
    return new ArrayList<>(newIDs);
  }

  /**
   * Translates the substrates or products of a reaction into new chemical ids and computes the coefficient of
   * each new chemical. A new chemical's coefficient is the sum, over every old chemical that produced it, of
   * (old reaction coefficient * desalting multiplier). If any contributing coefficient is null the result for
   * that new chemical is null.
   *
   * @param oldRxn The reaction being migrated.
   * @param sOrP   Whether to process the reaction's substrates or its products.
   * @return The distinct new ids in first-seen order, paired with a map of new id -> coefficient.
   * @throws RuntimeException if an old id has no cached mapping or a required multiplier is missing.
   */
  private Pair<List<Long>, Map<Long, Integer>> buildIdAndCoefficientMapping(Reaction oldRxn, ReactionComponent sOrP) {
    Long[] oldChemIds = sOrP == SUBSTRATE ? oldRxn.getSubstrates() : oldRxn.getProducts();
    List<Long> resultIds = new ArrayList<>(oldChemIds.length);
    Map<Long, Integer> newIdToCoefficientMap = new HashMap<>(oldChemIds.length);

    for (Long oldChemId : oldChemIds) {
      // This value is shared by every fragment of oldChemId, so it must never be modified inside the inner loop.
      final Integer originalRxnCoefficient = sOrP == SUBSTRATE ?
          oldRxn.getSubstrateCoefficient(oldChemId) : oldRxn.getProductCoefficient(oldChemId);

      for (Long newChemId : lookupNewChemicalIds(oldChemId)) {
        // Deduplicate new chemicals in the list based on whether we've assigned coefficients for them or not.
        if (newIdToCoefficientMap.containsKey(newChemId)) {
          Integer coefficientAccumulator = newIdToCoefficientMap.get(newChemId);

          if ((coefficientAccumulator == null) != (originalRxnCoefficient == null)) {
            // If only one coefficient is null, we have a problem. Just write null and hope we can figure it out later.
            LOGGER.error("Found null coefficient that needs to be merged with non-null coefficient. " +
                    "New chem id: %d, old chem id: %d, coefficient value: %d, old rxn id: %d",
                newChemId, oldChemId, originalRxnCoefficient, oldRxn.getUUID());
            newIdToCoefficientMap.put(newChemId, null);
          } else if (coefficientAccumulator != null) {
            /* Neither is null: multiply the coefficient to be added by the desalting multiplier and sum that
             * product with the existing count for this molecule.  The scaled value is kept in its own local so
             * the multiplier for this fragment does not leak into later fragments of the same old chemical. */
            int scaledCoefficient = originalRxnCoefficient * lookupDesalterMultiplier(oldChemId, newChemId);
            newIdToCoefficientMap.put(newChemId, coefficientAccumulator + scaledCoefficient);
          } // Else both are null and we don't need to do anything.
          // We don't need to add this new id to the list of substrates/products because it's already there.
        } else {
          resultIds.add(newChemId); // Add the new id to the subs/prods list.
          Integer desalterMultiplier = lookupDesalterMultiplier(oldChemId, newChemId);
          if (originalRxnCoefficient == null) {
            if (!desalterMultiplier.equals(1)) {
              LOGGER.warn("Ignoring >1 desalting multipler due to existing null coefficient. " +
                      "New chem id: %d, old chem id: %d, coefficient value: null, multiplier: %d, old rxn id: %d",
                  newChemId, oldChemId, desalterMultiplier, oldRxn.getUUID());
            }
            newIdToCoefficientMap.put(newChemId, null);
          } else {
            newIdToCoefficientMap.put(newChemId, originalRxnCoefficient * desalterMultiplier);
          }
        }
      }
    }

    return Pair.of(resultIds, newIdToCoefficientMap);
  }

  /**
   * Fetches the cached new chemical ids for an old chemical id.
   *
   * @throws RuntimeException if the old id was never desalted.
   */
  private List<Long> lookupNewChemicalIds(Long oldChemId) {
    List<Long> newChemIds = oldChemicalIdToNewChemicalIds.get(oldChemId);
    if (newChemIds == null) {
      throw new RuntimeException(
          String.format("Found old chemical id %d that is not in the old -> new chem id map", oldChemId));
    }
    return newChemIds;
  }

  /**
   * Fetches the desalting multiplier recorded for an (old id, new id) pair.
   *
   * @throws IllegalStateException if no multiplier was recorded for the pair.
   */
  private Integer lookupDesalterMultiplier(Long oldChemId, Long newChemId) {
    Integer multiplier = desalterMultiplierMap.get(Pair.of(oldChemId, newChemId));
    if (multiplier == null) {
      throw new IllegalStateException(String.format(
          "No desalting multiplier recorded for old chemical id %d -> new chemical id %d", oldChemId, newChemId));
    }
    return multiplier;
  }

  /**
   * Writes a chemical to the destination DB without altering it, and caches the resulting one-to-one mapping
   * (with a multiplier of 1) so reactions that reference the old id can still be migrated.
   *
   * @param chemical   The chemical to write as-is.
   * @param originalId The chemical's id in the source DB.
   * @param inchi      The chemical's InChI, used as the key in the InChI -> new id cache.
   * @return An immutable single-element list holding the new id; the same instance that is cached.
   */
  private List<Long> migrateUnaltered(Chemical chemical, Long originalId, String inchi) {
    long newId = getNoSQLAPI().writeToOutKnowlegeGraph(chemical); // Write to the db
    List<Long> singletonId = Collections.singletonList(newId); // Already immutable.
    inchiToNewId.put(inchi, newId);
    desalterMultiplierMap.put(Pair.of(originalId, newId), 1);
    oldChemicalIdToNewChemicalIds.put(originalId, singletonId);
    return singletonId;
  }

  /**
   * This function desalts a single chemical and returns the resulting ids of the modified chemicals that have been
   * written to the destination DB. The results of the desalting process are also cached for later use in mapping
   * chemicals from the old DB to the new. If the chemicals cannot be desalted, we just migrate the chemical unaltered.
   * If the chemical was already desalted, the previously cached ids are returned and nothing is written.
   *
   * @param chemical A chemical to desalt.
   * @return A list of output ids of desalted chemicals
   */
  private List<Long> desaltChemical(Chemical chemical) throws IOException, ReactionException {
    Long originalId = chemical.getUuid();

    // A chemical should only be desalted once.  If we see it again, report it and reuse the cached result rather
    // than writing duplicate entries to the destination DB and clobbering the existing mapping.
    List<Long> previouslyDesalted = oldChemicalIdToNewChemicalIds.get(originalId);
    if (previouslyDesalted != null) {
      LOGGER.error("desaltChemical was called on a chemical that was already desalted: %d", originalId);
      return previouslyDesalted;
    }

    // Otherwise need to clean the chemical
    String inchi = chemical.getInChI();

    // If it's FAKE, just go with it
    if (inchi.contains(FAKE)) {
      return migrateUnaltered(chemical, originalId, inchi);
    }

    Map<String, Integer> cleanedInchis;
    try {
      cleanedInchis = desalter.desaltInchi(inchi);
    } catch (Exception e) {
      // TODO: probably should handle this error differently, currently just letting pass unaltered
      LOGGER.error("Exception caught when desalting chemical %d: %s", originalId, e.getMessage());
      desalterFailuresCounter++;
      return migrateUnaltered(chemical, originalId, inchi);
    }

    List<Long> newIds = new ArrayList<>();
    // For each cleaned chemical, put in DB or update ID
    for (Map.Entry<String, Integer> fragment : cleanedInchis.entrySet()) {
      String cleanInchi = fragment.getKey();

      // If the cleaned inchi is already in DB, use existing ID, and hash the id
      long newId;
      if (inchiToNewId.containsKey(cleanInchi)) {
        newId = inchiToNewId.get(cleanInchi);
      } else {
        // Otherwise update the chemical, put into DB, and hash the id and inchi
        chemical.setInchi(cleanInchi);
        newId = getNoSQLAPI().writeToOutKnowlegeGraph(chemical); // Write to the db
        inchiToNewId.put(cleanInchi, newId);
      }

      /* The desalter converts complex molecules into a set of unique fragments, but maintains a count of those
       * fragments.  That fragment count must be multiplied by any reaction-specific coefficient to get the
       * true count of fragments participating in a reaction.
       *
       * Because different salts may have different coefficients, we must key the multiplier on (oldId, newId) to
       * ensure that multiple occurrences of the same desalted molecule in a reaction don't overwrite each other.
       *
       * We must save every (oldId, newId) pair, as each pair will be unique even if newId was seen before. */
      desalterMultiplierMap.put(Pair.of(originalId, newId), fragment.getValue());
      newIds.add(newId);
    }

    // Store and return the cached list of chemical ids that we just created.  Make them immutable for safety's sake.
    List<Long> resultsToCache = Collections.unmodifiableList(newIds);
    oldChemicalIdToNewChemicalIds.put(originalId, resultsToCache);
    return resultsToCache;
  }
}