/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.reactionmerging;
import act.server.NoSQLAPI;
import act.shared.Reaction;
import act.shared.helpers.P;
import com.act.biointerpretation.BiointerpretationProcessor;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.biopax.paxtools.model.level3.ConversionDirectionType;
import org.biopax.paxtools.model.level3.StepDirection;
import org.json.JSONObject;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
/**
* This creates Dr. Know from Lucille. Dr. Know is the database in which all Reactions
* have been merged based on the sameness of their substrate and product ids.
*/
public class ReactionMerger extends BiointerpretationProcessor {
private static final Logger LOGGER = LogManager.getFormatterLogger(ReactionMerger.class);
private static final String PROCESSOR_NAME = "Reaction Merger";
/**
* Returns this processor's name.
*
* @return the {@code PROCESSOR_NAME} constant, {@code "Reaction Merger"}
*/
@Override
public String getName() {
return PROCESSOR_NAME;
}
/**
* Creates a reaction merger.
*
* @param noSQLAPI the DB API object, forwarded unchanged to the superclass constructor
*/
public ReactionMerger(NoSQLAPI noSQLAPI) {
super(noSQLAPI);
}
/**
* Initializes this processor. The only work done here is the call to {@code markInitialized()}.
*/
@Override
public void init() {
// Do nothing for this class, as there's no initialization necessary.
markInitialized();
}
/**
 * Reads every reaction from the input knowledge graph, groups the reaction ids by their
 * {@link SubstratesProducts} key, and then merges each group.
 */
@Override
public void processReactions() {
  LOGGER.info("Reading all reactions");
  Map<SubstratesProducts, PriorityQueue<Long>> groupedIds =
      hashReactions(getNoSQLAPI().readRxnsFromInKnowledgeGraph());

  LOGGER.info("Found %d reaction groups, merging", groupedIds.size());
  mergeAllReactions(groupedIds);

  LOGGER.info("Done merging reactions");
}
/**
 * Groups reaction ids by the {@link SubstratesProducts} key built from each reaction.
 *
 * @param reactionIterator the reactions to group; it is consumed until exhausted
 * @return a map from each distinct key to a priority queue holding the ids of all
 *         reactions that produced that key
 */
protected static Map<SubstratesProducts, PriorityQueue<Long>> hashReactions(Iterator<Reaction> reactionIterator) {
  // TODO: spill this map to disk if the map gets too large.
  HashMap<SubstratesProducts, PriorityQueue<Long>> groups = new HashMap<>();

  while (reactionIterator.hasNext()) {
    Reaction current = reactionIterator.next();
    SubstratesProducts key = new SubstratesProducts(current);

    // Create the id queue the first time a key is seen, then add to it unconditionally.
    PriorityQueue<Long> idsForKey = groups.get(key);
    if (idsForKey == null) {
      idsForKey = new PriorityQueue<>(1);
      groups.put(key, idsForKey);
    }
    idsForKey.add(Long.valueOf(current.getUUID()));
  }

  return groups;
}
/**
 * Key used to decide whether two reactions can be merged. Two keys are equal when their
 * substrate, product, cofactor and coenzyme id sets, their substrate and product coefficient
 * maps, their EC number, and both direction attributes are all equal.
 */
public static class SubstratesProducts {
  Set<Long> substrates;
  Set<Long> products;
  Set<Long> substrateCofactors;
  Set<Long> productCofactors;
  Set<Long> coenzymes;
  Map<Long, Integer> substrateCoefficients;
  Map<Long, Integer> productCoefficients;
  String ecnum;
  ConversionDirectionType conversionDirectionType;
  StepDirection pathwayStepDirection;

  /**
   * Builds a key from the given reaction. Each id array is copied into a new set and the
   * coefficients are copied into new maps, so the key holds its own collections.
   *
   * @param reaction the reaction whose attributes make up the key
   */
  public SubstratesProducts(Reaction reaction) {
    this.substrates = toIdSet(reaction.getSubstrates());
    this.products = toIdSet(reaction.getProducts());
    this.substrateCofactors = toIdSet(reaction.getSubstrateCofactors());
    this.productCofactors = toIdSet(reaction.getProductCofactors());
    this.coenzymes = toIdSet(reaction.getCoenzymes());

    this.substrateCoefficients = new HashMap<>(this.substrates.size());
    for (Long chemicalId : reaction.getSubstrateIdsOfSubstrateCoefficients()) {
      this.substrateCoefficients.put(chemicalId, reaction.getSubstrateCoefficient(chemicalId));
    }

    this.productCoefficients = new HashMap<>(this.products.size());
    for (Long chemicalId : reaction.getProductIdsOfProductCoefficients()) {
      this.productCoefficients.put(chemicalId, reaction.getProductCoefficient(chemicalId));
    }

    this.ecnum = reaction.getECNum();
    this.conversionDirectionType = reaction.getConversionDirection();
    this.pathwayStepDirection = reaction.getPathwayStepDirection();
  }

  /** Copies an array of chemical ids into a new hash set. */
  private static Set<Long> toIdSet(Long[] ids) {
    return new HashSet<>(Arrays.asList(ids));
  }

  /**
   * Compares every attribute of the key; the EC number and the two direction values may be null.
   */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }

    SubstratesProducts other = (SubstratesProducts) o;
    return substrates.equals(other.substrates)
        && products.equals(other.products)
        && substrateCofactors.equals(other.substrateCofactors)
        && productCofactors.equals(other.productCofactors)
        && coenzymes.equals(other.coenzymes)
        && substrateCoefficients.equals(other.substrateCoefficients)
        && productCoefficients.equals(other.productCoefficients)
        && (ecnum == null ? other.ecnum == null : ecnum.equals(other.ecnum))
        && conversionDirectionType == other.conversionDirectionType
        && pathwayStepDirection == other.pathwayStepDirection;
  }

  /**
   * Combines the hashes of the same attributes that {@link #equals(Object)} compares, using
   * 0 for a null EC number or direction.
   */
  @Override
  public int hashCode() {
    int hash = substrates.hashCode();
    hash = hash * 31 + products.hashCode();
    hash = hash * 31 + substrateCofactors.hashCode();
    hash = hash * 31 + productCofactors.hashCode();
    hash = hash * 31 + coenzymes.hashCode();
    hash = hash * 31 + substrateCoefficients.hashCode();
    hash = hash * 31 + productCoefficients.hashCode();
    hash = hash * 31 + (ecnum == null ? 0 : ecnum.hashCode());
    hash = hash * 31 + (conversionDirectionType == null ? 0 : conversionDirectionType.hashCode());
    hash = hash * 31 + (pathwayStepDirection == null ? 0 : pathwayStepDirection.hashCode());
    return hash;
  }
}
/**
 * Merges a group of reactions into one new reaction and stores it in the output DB.
 *
 * <p>The first list element supplies the substrates, products, cofactors, coenzymes, EC number,
 * directions, name, detail type and initial data source of the merged reaction. References and
 * protein data are collected from every element. The merged reaction is written once to obtain
 * its id and updated again after the references and protein data have been added.
 *
 * @param reactions the reactions to merge
 * @return the merged reaction, or {@code null} when {@code reactions} is empty
 */
private Reaction mergeReactions(List<Reaction> reactions) {
  if (reactions.isEmpty()) {
    return null;
  }

  // The first reaction acts as the template for the attributes shared by the group.
  Reaction template = reactions.get(0);
  Reaction merged = new Reaction(
      -1, // Assume the id will be set when the reaction is written to the DB.
      template.getSubstrates(),
      template.getProducts(),
      template.getSubstrateCofactors(),
      template.getProductCofactors(),
      template.getCoenzymes(),
      template.getECNum(),
      template.getConversionDirection(),
      template.getPathwayStepDirection(),
      template.getReactionName(),
      template.getRxnDetailType()
  );
  merged.setDataSource(template.getDataSource());

  // Write stub reaction to DB to get its id, which is required for migrating sequences.
  int mergedId = getNoSQLAPI().writeToOutKnowlegeGraph(merged);

  // TODO: are there other fields we need to capture in this merge?
  // TODO: add source ids for the various attributes to make debugging easier.
  for (Reaction source : reactions) {
    // TODO: should these be sorted before adding?
    for (P<Reaction.RefDataSource, String> reference : source.getReferences()) {
      merged.addReference(reference.fst(), reference.snd());
    }

    for (JSONObject proteinEntry : source.getProteinData()) {
      // Save the source reaction ID for debugging/verification purposes. TODO: is adding a field like this okay?
      proteinEntry.put("source_reaction_id", source.getUUID());
      // NOTE(review): this call sits inside the protein loop, so it runs once per protein entry
      // and not at all for a source reaction without protein data -- confirm that is intended.
      writeMigratedReactionMap((long) source.getUUID(), (long) mergedId);
      merged.addProteinData(migrateProteinData(proteinEntry));
    }

    // Set the data source as MERGED if this is a combination of multiple sources. The protein data will store which.
    boolean alreadyMarkedMerged = merged.getDataSource() == Reaction.RxnDataSource.MERGED;
    if (!alreadyMarkedMerged && merged.getDataSource() != source.getDataSource()) {
      merged.setDataSource(Reaction.RxnDataSource.MERGED);
    }
  }

  migrateReactionChemicals(merged, template);

  // Update the reaction in the DB with the newly migrated protein data.
  getNoSQLAPI().getWriteDB().updateActReaction(merged, mergedId);
  return merged;
}
/**
 * Merges every reaction group, visiting the groups in ascending order of their smallest
 * reaction id and the members of each group in ascending id order.
 *
 * <p>Stability makes life easier in a number of ways (easier testing, deterministic output,
 * general sanity), so both orderings are established by explicit sorting. The queues in
 * {@code reactionGroups} are only read, never modified.
 *
 * @param reactionGroups reaction ids grouped by their {@link SubstratesProducts} key; groups
 *                       that are {@code null} or empty are skipped
 */
protected void mergeAllReactions(Map<SubstratesProducts, PriorityQueue<Long>> reactionGroups) {
  // Index each group by its minimum reaction id so the groups can be visited in source-DB order.
  final HashMap<Long, PriorityQueue<Long>> minGroupIdsToGroups = new HashMap<>(reactionGroups.size());
  for (PriorityQueue<Long> group : reactionGroups.values()) {
    if (group == null || group.isEmpty()) {
      // An empty group has no minimum id (peek() would yield null) and nothing to merge.
      continue;
    }
    minGroupIdsToGroups.put(group.peek(), group);
  }

  List<Long> orderedIds = new ArrayList<>(minGroupIdsToGroups.keySet());
  Collections.sort(orderedIds);

  for (Long nextId : orderedIds) {
    // PriorityQueue's iterator does not promise any particular traversal order, so sort a copy
    // of the ids rather than relying on for-each over the queue itself.
    List<Long> groupIds = new ArrayList<>(minGroupIdsToGroups.get(nextId));
    Collections.sort(groupIds);

    List<Reaction> reactions = new ArrayList<>(groupIds.size());
    for (Long id : groupIds) {
      // Since we've only installed reaction IDs based on instances we've seen, this should be safe.
      reactions.add(getNoSQLAPI().readReactionFromInKnowledgeGraph(id));
    }
    mergeReactions(reactions);
  }
}
}