/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation;

import act.server.NoSQLAPI;
import act.shared.Chemical;
import act.shared.Organism;
import act.shared.Reaction;
import act.shared.Seq;
import act.shared.helpers.MongoDBToJSON;
import act.shared.helpers.P;
import chemaxon.license.LicenseProcessingException;
import chemaxon.reaction.ReactionException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Base class for a biointerpretation step that reads chemicals, sequences and reactions from a read DB
 * and writes (possibly modified) copies to a write DB, both reached through a {@link NoSQLAPI}.
 *
 * <p>{@link #run()} drives the migration in a fixed order: chemicals, the after-chemicals hook, sequences,
 * reactions, and the after-reactions hook. Subclasses customize the step by overriding the protected hooks.
 * While migrating, the class keeps in-memory maps from read-DB ids to write-DB ids for chemicals, organisms,
 * sequences and reactions.
 */
public abstract class BiointerpretationProcessor {
  private static final Logger LOGGER = LogManager.getFormatterLogger(BiointerpretationProcessor.class);

  private final NoSQLAPI api;

  // Read-DB chemical id -> write-DB chemical id, filled in by processChemicals().
  private final Map<Long, Long> oldChemIdToNewChemId = new HashMap<>();
  // Write-DB chemical id -> InChI, filled in by processChemicals().
  private final Map<Long, String> newChemIdToInchi = new HashMap<>();
  // Read-DB organism id -> write-DB organism id (the value may be null, see migrateOrganism()).
  private final Map<Long, Long> organismMigrationMap = new HashMap<>();
  // Read-DB sequence id -> write-DB sequence id, filled in by migrateProteinData().
  private final Map<Long, Long> sequenceMigrationMap = new HashMap<>();
  // Read-DB reaction id -> write-DB reaction id, filled in by processReactions().
  private final Map<Long, Long> reactionMigrationMap = new HashMap<>();

  // Set by markInitialized(); run() refuses to start while this is false.
  boolean initCalled = false;

  /**
   * Returns the name of this biointerpretation step.
   * @return A name string for logging.
   */
  public abstract String getName();

  /**
   * Creates a processor that reads from and writes to the DBs exposed by the given API object.
   * @param api The API object giving access to the read and write DBs.
   */
  public BiointerpretationProcessor(NoSQLAPI api) {
    this.api = api;
  }

  /**
   * Initializes this processing step.  Must be called before run().
   * @throws Exception
   */
  public abstract void init() throws Exception;

  /**
   * Subclasses should call this in their init() implementations to prevent an exception being thrown when run() is
   * called.  This prevents run attempts without initialization.
   *
   * This isn't the most elegant way of handling this (a factory or dependency injection would be better), but this is
   * quick, safe, and effective.
   */
  protected void markInitialized() {
    initCalled = true;
  }

  /**
   * Logs an error and throws if {@link #markInitialized()} has not been called yet.
   * @throws RuntimeException If this processor has not been marked as initialized.
   */
  protected void failIfNotInitialized() {
    if (!initCalled) {
      String msg = String.format("run() called without initialization for biointerpretation processor '%s'",
          getName());
      LOGGER.error(msg);
      throw new RuntimeException(msg);
    }
  }

  /**
   * Runs the biointerpretation processing on the read DB data and writes it to the write DB.
   *
   * <p>Order of operations: chemicals, {@link #afterProcessChemicals()}, sequences, reactions,
   * {@link #afterProcessReactions()}.
   *
   * @throws IOException Propagated from the processing steps and hooks.
   * @throws LicenseProcessingException Declared for callers; nothing in this class throws it directly.
   * @throws ReactionException Propagated from the processing steps and hooks.
   * @throws RuntimeException If the processor has not been marked as initialized.
   */
  public void run() throws IOException, LicenseProcessingException, ReactionException {
    // TODO: are these enough?
    failIfNotInitialized();

    LOGGER.debug("Starting %s", getName());
    long startTime = new Date().getTime();

    LOGGER.info("Processing chemicals");
    processChemicals();
    LOGGER.info("Done processing chemicals");
    afterProcessChemicals();

    LOGGER.info("Processing sequences");
    processSequences();

    LOGGER.info("Processing reactions");
    processReactions();
    LOGGER.info("Done processing reactions");
    afterProcessReactions();

    long endTime = new Date().getTime();
    // LOGGER is a formatter logger, so it applies the %d format itself.
    LOGGER.debug("Time in seconds: %d", (endTime - startTime) / 1000);
    LOGGER.info("Done %s", getName());
  }

  /**
   * A hook that runs after all chemicals have been processed/migrated.  This is meant to
   * be overridden, as it does nothing by default.
   */
  protected void afterProcessChemicals() throws IOException, ReactionException {
  }

  /**
   * A hook that runs after all reactions have been processed/migrated.  The default implementation rewrites the
   * reaction references of every Seq entry in the write DB, replacing read-DB reaction ids with the write-DB ids
   * recorded during reaction migration.
   *
   * <p>References to reactions that have no migrated counterpart (for example because
   * {@link #preProcessReaction(Reaction)} returned null for them) are dropped rather than stored as null.
   */
  protected void afterProcessReactions() throws IOException, ReactionException {
    Iterator<Seq> writtenSeqIterator = api.getWriteDB().getSeqIterator();
    while (writtenSeqIterator.hasNext()) {
      Seq writtenSeq = writtenSeqIterator.next();
      Set<Long> oldRxnRefs = writtenSeq.getReactionsCatalyzed();
      Set<Long> newRxnRefs = new HashSet<>();

      for (Long oldRxnRef : oldRxnRefs) {
        Long newRxnRef = reactionMigrationMap.get(oldRxnRef);
        if (newRxnRef == null) {
          // The referenced reaction was never written to the write DB, so there is nothing to point at.
          LOGGER.debug("No migrated reaction found for source reaction %d, dropping sequence reference", oldRxnRef);
          continue;
        }
        newRxnRefs.add(newRxnRef);
      }

      writtenSeq.setReactionsCatalyzed(newRxnRefs);
      api.getWriteDB().updateRxnRefs(writtenSeq);
    }
  }

  /**
   * @return The API object used to reach the read and write DBs.
   */
  protected NoSQLAPI getNoSQLAPI() {
    return this.api;
  }

  /**
   * Records that the read-DB reaction {@code oldId} was written to the write DB as {@code newId}.
   * @param oldId The reaction id in the read DB.
   * @param newId The reaction id in the write DB.
   */
  protected void writeMigratedReactionMap(Long oldId, Long newId) {
    reactionMigrationMap.put(oldId, newId);
  }

  /**
   * Looks up the write-DB id recorded for a read-DB reaction.
   * @param oldId The reaction id in the read DB.
   * @return The recorded write-DB reaction id, or null if none has been recorded.
   */
  protected Long readMigrationReactionMap(Long oldId) {
    return reactionMigrationMap.get(oldId);
  }

  /**
   * @return The live (not copied) map from read-DB chemical ids to write-DB chemical ids.
   */
  protected Map<Long, Long> getOldChemIdToNewChemId() {
    return this.oldChemIdToNewChemId;
  }

  /**
   * @return The live (not copied) map from write-DB chemical ids to InChIs.
   */
  protected Map<Long, String> getNewChemIdToInchi() {
    return this.newChemIdToInchi;
  }

  /**
   * Maps a read-DB chemical id to its write-DB id.
   * @param oldChemId The chemical id in the read DB.
   * @return The write-DB chemical id, or null if the chemical has not been migrated.
   */
  protected Long mapOldChemIdToNewId(Long oldChemId) {
    // TODO: maybe raise a runtime exception if the result is null?
    return this.oldChemIdToNewChemId.get(oldChemId);
  }

  /**
   * Maps a write-DB chemical id to the InChI cached for it during chemical migration.
   * @param newChemId The chemical id in the write DB.
   * @return The cached InChI, or null if no chemical with that id has been migrated.
   */
  protected String mapNewChemIdToInChI(Long newChemId) {
    return this.newChemIdToInchi.get(newChemId);
  }

  /**
   * Process and migrate chemicals.  Each chemical from the read DB is passed through
   * {@link #runSpecializedChemicalProcessing(Chemical)} and written to the write DB; the old-to-new id mapping
   * and the new id's InChI are cached for later use.
   * @throws IOException Declared for overriding implementations.
   * @throws ReactionException Declared for overriding implementations.
   */
  protected void processChemicals() throws IOException, ReactionException {
    Iterator<Chemical> chemicals = api.readChemsFromInKnowledgeGraph();
    while (chemicals.hasNext()) {
      // TODO: should we apply the blacklist here so everybody can benefit from it?
      Chemical chem = chemicals.next();
      // Capture the source id before the subclass hook gets a chance to replace the object.
      Long oldId = chem.getUuid();

      chem = runSpecializedChemicalProcessing(chem);
      Long newId = api.writeToOutKnowlegeGraph(chem);

      // Cache the old-to-new id mapping so we don't have to hit the DB for each chemical.
      oldChemIdToNewChemId.put(oldId, newId);
      // Cache the id to InChI mapping so we don't have to re-load the chem documents just to get the InChI.
      newChemIdToInchi.put(newId, chem.getInChI());
    }
  }

  /**
   * Process and migrate sequences.  This is meant to be overridden, as it does nothing by default.
   */
  protected void processSequences() {
  }

  /**
   * A hook that runs on each chemical from the read DB just before it is written to the write DB.  This is meant
   * to be overridden, as it returns its argument unchanged by default.
   * @param chem The chem object about to be written.
   * @return The chemical to write.
   */
  protected Chemical runSpecializedChemicalProcessing(Chemical chem) {
    return chem;
  }

  /**
   * Process and migrate reactions.  For every reaction in the read DB that survives
   * {@link #preProcessReaction(Reaction)}, a new reaction is created and written to obtain an id, its chemicals
   * and proteins are migrated, subclasses get a chance to modify it, and the stored reaction is then updated.
   * The old-to-new reaction id mapping is recorded for use by {@link #afterProcessReactions()}.
   * @throws IOException Propagated from the subclass hooks.
   * @throws ReactionException Propagated from the subclass hooks.
   */
  protected void processReactions() throws IOException, ReactionException {
    // Scan through all Reactions and process each
    Iterator<Reaction> iterator = api.readRxnsFromInKnowledgeGraph();
    while (iterator.hasNext()) {
      // Get reaction from the read db
      Reaction oldRxn = iterator.next();
      Long oldId = Long.valueOf(oldRxn.getUUID());

      oldRxn = preProcessReaction(oldRxn);
      // preProcessReaction can return null to indicate that this reaction shouldn't be written to the new DB.
      if (oldRxn == null) {
        LOGGER.debug("preProcessReaction returned null for reaction %d, not saving to write DB", oldId);
        continue;
      }

      Reaction newRxn = new Reaction(
          -1, // Assume the id will be set when the reaction is written to the DB.
          new Long[0],
          new Long[0],
          new Long[0],
          new Long[0],
          new Long[0],
          oldRxn.getECNum(),
          oldRxn.getConversionDirection(),
          oldRxn.getPathwayStepDirection(),
          oldRxn.getReactionName(),
          oldRxn.getRxnDetailType()
      );

      // Add the data source and references from the source to the destination
      newRxn.setDataSource(oldRxn.getDataSource());
      for (P<Reaction.RefDataSource, String> ref : oldRxn.getReferences()) {
        newRxn.addReference(ref.fst(), ref.snd());
      }

      // Write once up front to obtain the new id; the full content is stored by the update below.
      int newId = api.writeToOutKnowlegeGraph(newRxn);
      Long newIdL = Long.valueOf(newId);

      migrateReactionChemicals(newRxn, oldRxn);
      migrateAllProteins(newRxn, oldRxn, oldId);

      // Give the subclasses a chance at the reactions.
      newRxn = runSpecializedReactionProcessing(newRxn, newIdL);

      reactionMigrationMap.put(oldId, newIdL);

      // Update the reaction in the DB with the newly migrated protein data.
      api.getWriteDB().updateActReaction(newRxn, newId);
    }
  }

  /**
   * A hook that runs on the reaction from the read DB before it's written to the write DB.  This is meant to
   * be overridden, as it does nothing by default.
   *
   * Return an original or modified reaction to be migrated to the DB, or return null to have this reaction skipped.
   *
   * @param rxn The reaction object from the read DB.
   * @return The modified reaction or null if nothing should be written to the DB.
   */
  protected Reaction preProcessReaction(Reaction rxn) throws IOException, ReactionException {
    return rxn;
  }

  /**
   * A hook that runs after the reaction's chemicals and proteins have been prepped for writing.  This is meant to
   * be overridden, as it does nothing by default.
   * @param rxn The reaction object about to be written.
   * @param rxnId The id the reaction was assigned in the write DB.
   * @return The modified reaction.
   */
  protected Reaction runSpecializedReactionProcessing(Reaction rxn, Long rxnId)
      throws IOException, ReactionException {
    return rxn;
  }

  /**
   * Migrates all protein data from oldRxn to newRxn, preserving the source reaction id on the protein objects.
   * @param newRxn The reaction to which to write protein data.
   * @param oldRxn The reaction from which to read protein data.
   * @param oldId The old reaction's ID (taken as a parameter for symmetry with newId).
   */
  protected void migrateAllProteins(Reaction newRxn, Reaction oldRxn, Long oldId) {
    for (JSONObject protein : oldRxn.getProteinData()) {
      JSONObject newProteinData = migrateProteinData(protein);
      // Save the source reaction ID for debugging/verification purposes.  TODO: is adding a field like this okay?
      newProteinData.put("source_reaction_id", oldId);
      newRxn.addProteinData(newProteinData);
    }
  }

  /**
   * Default implementation just copies chemicals, cofactors, coenzymes, and coefficients, translating every
   * chemical id from the read DB to the write DB.
   * @param newRxn The new Reaction object to be written into the write DB.
   * @param oldRxn The old Reaction object read from the read DB.
   * @throws RuntimeException If any chemical id of the old reaction has no migrated counterpart, which shows up
   *                          as a length mismatch between the old and the migrated id lists.
   */
  protected void migrateReactionChemicals(Reaction newRxn, Reaction oldRxn) {
    // TODO: this has been written/re-written too many times.  Lift this into a shared superclass.
    Long[] oldSubstrates = oldRxn.getSubstrates();
    Long[] oldProducts = oldRxn.getProducts();
    List<Long> migratedSubstrates = mapChemicalIds(oldSubstrates);
    List<Long> migratedProducts = mapChemicalIds(oldProducts);

    // Substrate/product counts must be identical before and after migration.
    if (migratedSubstrates.size() != oldSubstrates.length ||
        migratedProducts.size() != oldProducts.length) {
      throw new RuntimeException(String.format(
          "Pre/post substrate/product migration lengths don't match for source reaction %d: %d -> %d, %d -> %d",
          oldRxn.getUUID(), oldSubstrates.length, migratedSubstrates.size(),
          oldProducts.length, migratedProducts.size()
      ));
    }

    newRxn.setSubstrates(migratedSubstrates.toArray(new Long[migratedSubstrates.size()]));
    newRxn.setProducts(migratedProducts.toArray(new Long[migratedProducts.size()]));

    // Copy over substrate/product coefficients one at a time based on index, which should be consistent.
    // (The length check above guarantees no id was filtered out, so positions still line up.)
    for (int i = 0; i < migratedSubstrates.size(); i++) {
      newRxn.setSubstrateCoefficient(migratedSubstrates.get(i), oldRxn.getSubstrateCoefficient(oldSubstrates[i]));
    }

    for (int i = 0; i < migratedProducts.size(); i++) {
      newRxn.setProductCoefficient(migratedProducts.get(i), oldRxn.getProductCoefficient(oldProducts[i]));
    }

    Long[] oldSubstrateCofactors = oldRxn.getSubstrateCofactors();
    Long[] oldProductCofactors = oldRxn.getProductCofactors();
    List<Long> migratedSubstrateCofactors = mapChemicalIds(oldSubstrateCofactors);
    List<Long> migratedProductCofactors = mapChemicalIds(oldProductCofactors);

    if (migratedSubstrateCofactors.size() != oldSubstrateCofactors.length ||
        migratedProductCofactors.size() != oldProductCofactors.length) {
      throw new RuntimeException(String.format(
          "Pre/post sub/prod cofactor migration lengths don't match for source reaction %d: %d -> %d, %d -> %d",
          oldRxn.getUUID(), oldSubstrateCofactors.length, migratedSubstrateCofactors.size(),
          oldProductCofactors.length, migratedProductCofactors.size()
      ));
    }

    newRxn.setSubstrateCofactors(migratedSubstrateCofactors.toArray(new Long[migratedSubstrateCofactors.size()]));
    newRxn.setProductCofactors(migratedProductCofactors.toArray(new Long[migratedProductCofactors.size()]));

    Long[] oldCoenzymes = oldRxn.getCoenzymes();
    List<Long> migratedCoenzymes = mapChemicalIds(oldCoenzymes);

    if (migratedCoenzymes.size() != oldCoenzymes.length) {
      throw new RuntimeException(String.format(
          "Pre/post coenzyme migration lengths don't match for source reaction %d: %d -> %d",
          oldRxn.getUUID(), oldCoenzymes.length, migratedCoenzymes.size()
      ));
    }

    newRxn.setCoenzymes(migratedCoenzymes.toArray(new Long[migratedCoenzymes.size()]));
  }

  /**
   * Translates read-DB chemical ids to write-DB ids, in order.  Ids without a migrated counterpart are left out,
   * so callers detect them by comparing the result's size with the input length.
   */
  private List<Long> mapChemicalIds(Long[] chemIds) {
    return Arrays.stream(chemIds)
        .map(oldChemIdToNewChemId::get)
        .filter(x -> x != null)
        .collect(Collectors.toList());
  }

  /**
   * Returns the write-DB organism id for a read-DB organism id, creating the organism entry in the write DB when
   * no entry with the same name exists there yet.  Results (including null) are cached locally to speed up
   * migration.
   * @param oldOrganismId The organism id in the read DB.
   * @return The organism id in the write DB, or null if the read DB has no name for the organism.
   */
  private Long migrateOrganism(Long oldOrganismId) {
    // containsKey (rather than a null check on get) so that cached null results are honored too.
    if (organismMigrationMap.containsKey(oldOrganismId)) {
      return organismMigrationMap.get(oldOrganismId);
    }

    String organismName = api.getReadDB().getOrganismNameFromId(oldOrganismId);

    Long newOrganismId = null;

    // Assume any valid organism entry will have a name.
    if (organismName != null) {
      // TODO: reading from the writeDB is not so good, but we need to not insert twice.  Is there a better way?
      long writeDBOrganismId = api.getWriteDB().getOrganismId(organismName);
      if (writeDBOrganismId != -1) { // -1 is used in MongoDB.java for missing values.
        // Reuse the existing organism entry if we can find a matching one.
        newOrganismId = writeDBOrganismId;
      } else {
        // Use -1 for no NCBI Id.  Note that the NCBI parameter isn't even stored in the DB at present.
        Organism newOrganism = new Organism(oldOrganismId, -1, organismName);
        api.getWriteDB().submitToActOrganismNameDB(newOrganism);
        newOrganismId = newOrganism.getUUID();
      }
    }

    organismMigrationMap.put(oldOrganismId, newOrganismId);

    return newOrganismId;
  }

  /**
   * Produces a copy of a protein object whose organism and sequence ids refer to the write DB.  Sequences that
   * have not been migrated yet are read from the read DB and submitted to the write DB as a side effect.
   * @param oldProtein The protein object from a read-DB reaction; it is read here under the keys "organism" or
   *                   "organisms", and "sequences".
   * @return A new protein object with migrated organism and sequence ids.
   */
  protected JSONObject migrateProteinData(JSONObject oldProtein) {
    // Copy the protein object for modification.
    // With help from http://stackoverflow.com/questions/12809779/how-do-i-clone-an-org-json-jsonobject-in-java.
    JSONObject newProtein = new JSONObject(oldProtein, JSONObject.getNames(oldProtein));

    // TODO: unify the Protein object schema so this sort of handling isn't necessary.
    if (oldProtein.has("organism")) {
      // BRENDA protein entries just have one organism, so the migration is a little easier.
      Long oldOrganismId = oldProtein.getLong("organism");
      Long newOrganismId = migrateOrganism(oldOrganismId);
      newProtein.put("organism", newOrganismId);
    } else if (oldProtein.has("organisms")) {
      // Metacyc proteins use the plural "organisms."  Unclear why, but oh well.
      JSONArray oldOrganisms = oldProtein.getJSONArray("organisms");
      List<Long> newOrganisms = new ArrayList<>(oldOrganisms.length());
      for (int i = 0; i < oldOrganisms.length(); i++) {
        Long oldOrganismId = oldOrganisms.getLong(i);
        Long newOrganismId = migrateOrganism(oldOrganismId);
        newOrganisms.add(newOrganismId);
      }
      newProtein.put("organisms", new JSONArray(newOrganisms));
    }

    JSONArray sequences = oldProtein.getJSONArray("sequences");
    List<Long> newSequenceIds = new ArrayList<>(sequences.length());
    for (int i = 0; i < sequences.length(); i++) {
      Long sequenceId = sequences.getLong(i);

      // checks if sequence has already been written/migrated
      if (sequenceMigrationMap.containsKey(sequenceId)) {
        // add migrated sequence ID to list of referenced sequences in the reaction protein object
        Long writtenSeqId = sequenceMigrationMap.get(sequenceId);
        newSequenceIds.add(writtenSeqId);
      } else {
        Seq seq = api.getReadDB().getSeqFromID(sequenceId);
        Long oldSeqOrganismId = seq.getOrgId();
        Long newSeqOrganismId = migrateOrganism(oldSeqOrganismId);

        // Keep the read-DB sequence id in the metadata of the migrated document.
        seq.getMetadata().put("source_sequence_ids", sequenceId);

        // Store the seq document to get an id that'll be stored in the protein object.
        int seqId = api.getWriteDB().submitToActSeqDB(
            seq.getSrcdb(),
            seq.getEc(),
            seq.getOrgName(),
            newSeqOrganismId, // Use freshly migrated organism id to replace the old one.
            seq.getSequence(),
            seq.getReferences(),
            seq.getReactionsCatalyzed(), // these will be updated in afterProcessReactions()
            MongoDBToJSON.conv(seq.getMetadata())
        );
        // TODO: we should migrate all the seq documents with zero references over to the new DB.

        sequenceMigrationMap.put(sequenceId, (long) seqId);

        // Convert to Long to match ID type seen in MongoDB.  TODO: clean up all the IDs, make them all Longs.
        newSequenceIds.add(Long.valueOf(seqId));
      }
    }
    // Store the migrated sequence ids for this protein.
    newProtein.put("sequences", new JSONArray(newSequenceIds));

    return newProtein;
  }
}