BiointerpretationProcessor.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation;

import act.server.NoSQLAPI;
import act.shared.Chemical;
import act.shared.Organism;
import act.shared.Reaction;
import act.shared.Seq;
import act.shared.helpers.MongoDBToJSON;
import act.shared.helpers.P;
import chemaxon.license.LicenseProcessingException;
import chemaxon.reaction.ReactionException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Base class for a biointerpretation step that copies chemicals, sequences, and reactions from the read DB to the
 * write DB exposed by a {@link NoSQLAPI}, giving subclasses hooks to alter or filter the data along the way.
 *
 * Typical usage: construct, call {@link #init()} (which must call {@link #markInitialized()}), then {@link #run()}.
 * The id-migration caches held by this class are plain (unsynchronized) maps that are filled in while run() executes.
 */
public abstract class BiointerpretationProcessor {
  private static final Logger LOGGER = LogManager.getFormatterLogger(BiointerpretationProcessor.class);

  private final NoSQLAPI api;
  // Read-DB chemical id -> write-DB chemical id, filled in by processChemicals().
  private final Map<Long, Long> oldChemIdToNewChemId = new HashMap<>();
  // Write-DB chemical id -> InChI of the written chemical, filled in by processChemicals().
  private final Map<Long, String> newChemIdToInchi = new HashMap<>();
  // Read-DB organism id -> write-DB organism id (value may be null when the organism has no name in the read DB).
  private final Map<Long, Long> organismMigrationMap = new HashMap<>();
  // Read-DB sequence id -> write-DB sequence id, filled in as proteins are migrated.
  private final Map<Long, Long> sequenceMigrationMap = new HashMap<>();
  // Read-DB reaction id -> write-DB reaction id, filled in by processReactions() and writeMigratedReactionMap().
  private final Map<Long, Long> reactionMigrationMap = new HashMap<>();

  boolean initCalled = false;

  /**
   * Returns the name of this biointerpretation step.
   * @return A name string for logging.
   */
  public abstract String getName();

  /**
   * @param api The API object through which the read DB and write DB are accessed.
   */
  public BiointerpretationProcessor(NoSQLAPI api) {
    this.api = api;
  }

  /**
   * Initializes this processing step.  Must be called before run(), and implementations must call
   * {@link #markInitialized()} or run() will refuse to start.
   * @throws Exception
   */
  public abstract void init() throws Exception;

  /**
   * Subclasses should call this in their init() implementations to prevent an exception being thrown when run() is
   * called.  This prevents run attempts without initialization.
   *
   * This isn't the most elegant way of handling this (a factory or dependency injection would be better), but this is
   * quick, safe, and effective.
   */
  protected void markInitialized() {
    initCalled = true;
  }

  /**
   * Logs an error and throws if {@link #markInitialized()} has not been called yet.
   * @throws IllegalStateException (a RuntimeException) when the processor has not been initialized.
   */
  protected void failIfNotInitialized() {
    if (!initCalled) {
      String msg = String.format("run() called without initialization for biointerpretation processor '%s'", getName());
      // LOGGER is a formatter logger, so pass the already-formatted message as an argument rather than as a pattern.
      LOGGER.error("%s", msg);
      throw new IllegalStateException(msg);
    }
  }

  /**
   * Runs the biointerpretation processing on the read DB data and writes it to the write DB.  The steps run in a
   * fixed order: chemicals, the after-chemicals hook, sequences, reactions, and finally the after-reactions hook.
   * @throws IOException Propagated from the processing steps and hooks.
   * @throws LicenseProcessingException Declared for subclasses; nothing in this class throws it directly.
   * @throws ReactionException Propagated from the processing steps and hooks.
   * @throws IllegalStateException If the processor was not initialized before this call.
   */
  public void run()
      throws IOException, LicenseProcessingException, ReactionException { // TODO: are these enough?
    failIfNotInitialized();

    LOGGER.debug("Starting %s", getName());
    long startTime = System.currentTimeMillis();

    LOGGER.info("Processing chemicals");
    processChemicals();
    LOGGER.info("Done processing chemicals");
    afterProcessChemicals();
    LOGGER.info("Processing sequences");
    processSequences();
    LOGGER.info("Processing reactions");
    processReactions();
    LOGGER.info("Done processing reactions");
    afterProcessReactions();

    long endTime = System.currentTimeMillis();
    LOGGER.debug("Time in seconds: %d", (endTime - startTime) / 1000);

    LOGGER.info("Done %s", getName());
  }

  /**
   * A hook that runs after all chemicals have been processed/migrated.  This is meant to
   * be overridden, as it does nothing by default.
   */
  protected void afterProcessChemicals() throws IOException, ReactionException {

  }

  /**
   * A hook that runs after all reactions have been processed/migrated.  The default implementation walks every Seq
   * in the write DB and rewrites its catalyzed-reaction ids from read-DB ids to write-DB ids using the reaction
   * migration map.  References to reactions with no entry in that map (e.g. reactions that preProcessReaction chose
   * to skip) are dropped instead of being written back as null ids.
   */
  protected void afterProcessReactions() throws IOException, ReactionException {
    Iterator<Seq> writtenSeqIterator = api.getWriteDB().getSeqIterator();

    while (writtenSeqIterator.hasNext()) {
      Seq writtenSeq = writtenSeqIterator.next();

      Set<Long> oldRxnRefs = writtenSeq.getReactionsCatalyzed();
      Set<Long> newRxnRefs = new HashSet<>();
      for (Long oldRxnRef : oldRxnRefs) {
        Long newRxnRef = reactionMigrationMap.get(oldRxnRef);
        if (newRxnRef == null) {
          // The source reaction was never migrated, so there is nothing valid to point at in the write DB.
          LOGGER.debug("Dropping reference to reaction %d, which has no migrated counterpart in the write DB",
              oldRxnRef);
          continue;
        }
        newRxnRefs.add(newRxnRef);
      }

      writtenSeq.setReactionsCatalyzed(newRxnRefs);
      api.getWriteDB().updateRxnRefs(writtenSeq);
    }
  }

  /**
   * @return The API object this processor was constructed with.
   */
  protected NoSQLAPI getNoSQLAPI() {
    return this.api;
  }

  /**
   * Records (or overwrites) the write-DB id that a read-DB reaction id was migrated to.
   * @param oldId The reaction's id in the read DB.
   * @param newId The reaction's id in the write DB.
   */
  protected void writeMigratedReactionMap(Long oldId, Long newId) {
    reactionMigrationMap.put(oldId, newId);
  }

  /**
   * @param oldId A reaction id from the read DB.
   * @return The write-DB id recorded for that reaction, or null if no mapping has been recorded.
   */
  protected Long readMigrationReactionMap(Long oldId) {
    return reactionMigrationMap.get(oldId);
  }

  /**
   * @return The live (not copied) map from read-DB chemical ids to write-DB chemical ids.
   */
  protected Map<Long, Long> getOldChemIdToNewChemId() {
    return this.oldChemIdToNewChemId;
  }

  /**
   * @return The live (not copied) map from write-DB chemical ids to InChI strings.
   */
  protected Map<Long, String> getNewChemIdToInchi() {
    return this.newChemIdToInchi;
  }

  /**
   * @param oldChemId A chemical id from the read DB.
   * @return The write-DB id recorded for that chemical, or null if no mapping has been recorded.
   */
  protected Long mapOldChemIdToNewId(Long oldChemId) {
    // TODO: maybe raise a runtime exception if the result is null?
    return this.oldChemIdToNewChemId.get(oldChemId);
  }

  /**
   * @param newChemId A chemical id from the write DB.
   * @return The InChI recorded for that chemical, or null if none has been recorded.
   */
  protected String mapNewChemIdToInChI(Long newChemId) {
    return this.newChemIdToInchi.get(newChemId);
  }


  /**
   * Process and migrate chemicals.  The default implementation passes each chemical from the read DB through
   * {@link #runSpecializedChemicalProcessing(Chemical)}, writes the result to the write DB, and caches the
   * old-id-to-new-id and new-id-to-InChI mappings.
   * @throws IOException Declared for overriding implementations.
   * @throws ReactionException Declared for overriding implementations.
   */
  protected void processChemicals() throws IOException, ReactionException {
    Iterator<Chemical> chemicals = api.readChemsFromInKnowledgeGraph();
    while (chemicals.hasNext()) {
      // TODO: should we apply the blacklist here so everybody can benefit from it?
      Chemical chem = chemicals.next();
      // Capture the source id before the hook runs, since the hook may hand back a different object.
      Long oldId = chem.getUuid();
      chem = runSpecializedChemicalProcessing(chem);
      Long newId = api.writeToOutKnowlegeGraph(chem);
      // Cache the old-to-new id mapping so we don't have to hit the DB for each chemical.
      oldChemIdToNewChemId.put(oldId, newId);
      // Cache the id to InChI mapping so we don't have to re-load the chem documents just to get the InChI.
      newChemIdToInchi.put(newId, chem.getInChI());
    }
  }

  /**
   * Process and migrate sequences. This is meant to be overridden, as it does nothing by default.
   */
  protected void processSequences() {

  }


  /**
   * A hook that runs on each chemical read from the read DB just before it is written to the write DB.  This is
   * meant to be overridden, as it returns its argument unchanged by default.
   * @param chem The chem object about to be written.
   * @return The chemical to write; processChemicals() writes whatever is returned here.
   */
  protected Chemical runSpecializedChemicalProcessing(Chemical chem) {
    return chem;
  }

  /**
   * Process and migrate reactions.  For each reaction in the read DB the default implementation:
   * runs {@link #preProcessReaction(Reaction)} (skipping the reaction if that returns null), writes a new reaction
   * carrying the source's descriptive fields, data source, and references, migrates its chemicals and proteins,
   * runs {@link #runSpecializedReactionProcessing(Reaction, Long)}, records the old-to-new id mapping, and finally
   * updates the written reaction in the write DB.
   * @throws IOException Propagated from the reaction hooks.
   * @throws ReactionException Propagated from the reaction hooks.
   */
  protected void processReactions() throws IOException, ReactionException {
    //Scan through all Reactions and process each
    Iterator<Reaction> iterator = api.readRxnsFromInKnowledgeGraph();

    while (iterator.hasNext()) {
      // Get reaction from the read db
      Reaction oldRxn = iterator.next();
      // Capture the source id up front: the pre-processing hook may replace or null out the reaction.
      Long oldId = Long.valueOf(oldRxn.getUUID());

      oldRxn = preProcessReaction(oldRxn);

      // preProcessReaction can return null to indicate that this reaction shouldn't be written to the new DB.
      if (oldRxn == null) {
        LOGGER.debug("preProcessReaction returned null for reaction %d, not saving to write DB", oldId);
        continue;
      }

      Reaction newRxn = new Reaction(
          -1, // Assume the id will be set when the reaction is written to the DB.
          new Long[0],
          new Long[0],
          new Long[0],
          new Long[0],
          new Long[0],
          oldRxn.getECNum(),
          oldRxn.getConversionDirection(),
          oldRxn.getPathwayStepDirection(),
          oldRxn.getReactionName(),
          oldRxn.getRxnDetailType()
      );

      // Add the data source and references from the source to the destination
      newRxn.setDataSource(oldRxn.getDataSource());
      for (P<Reaction.RefDataSource, String> ref : oldRxn.getReferences()) {
        newRxn.addReference(ref.fst(), ref.snd());
      }

      // Write first to obtain the new id; the chemical/protein data is filled in below and saved by the update call.
      int newId = api.writeToOutKnowlegeGraph(newRxn);
      Long newIdL = Long.valueOf(newId);

      migrateReactionChemicals(newRxn, oldRxn);
      migrateAllProteins(newRxn, oldRxn, oldId);

      // Give the subclasses a chance at the reactions.
      newRxn = runSpecializedReactionProcessing(newRxn, newIdL);

      reactionMigrationMap.put(oldId, newIdL);

      // Update the reaction in the DB with the newly migrated protein data.
      api.getWriteDB().updateActReaction(newRxn, newId);
    }
  }

  /**
   * A hook that runs on the reaction from the read DB before it's written to the write DB.  This is meant to
   * be overridden, as it does nothing by default.
   *
   * Return an original or modified reaction to be migrated to the DB, or return null to have this reaction skipped.
   *
   * @param rxn The reaction object from the read DB.
   * @return The modified reaction or null if nothing should be written to the DB.
   */
  protected Reaction preProcessReaction(Reaction rxn) throws IOException, ReactionException {
    return rxn;
  }

  /**
   * A hook that runs after the reaction's chemicals and proteins have been prepped for writing.  This is meant to
   * be overridden, as it does nothing by default.
   * @param rxn The reaction object about to be written.
   * @param rxnId The id the reaction was assigned when it was first written to the write DB.
   * @return The modified reaction.
   */
  protected Reaction runSpecializedReactionProcessing(Reaction rxn, Long rxnId) throws IOException, ReactionException {
    return rxn;
  }

  /**
   * Migrates all protein data from oldRxn to newRxn, preserving the source reaction id on the protein objects.
   * @param newRxn The reaction to which to write protein data.
   * @param oldRxn The reaction from which to read protein data.
   * @param oldId The old reaction's ID (taken as a parameter for symmetry with newId).
   */
  protected void migrateAllProteins(Reaction newRxn, Reaction oldRxn, Long oldId) {
    for (JSONObject protein : oldRxn.getProteinData()) {
      JSONObject newProteinData = migrateProteinData(protein);
      // Save the source reaction ID for debugging/verification purposes.  TODO: is adding a field like this okay?
      newProteinData.put("source_reaction_id", oldId);
      newRxn.addProteinData(newProteinData);
    }
  }


  /**
   * Default implementation just copies chemicals, cofactors, coenzymes, and coefficients, translating every chemical
   * id from its read-DB value to its write-DB value.
   * @param newRxn The new Reaction object to be written into the write DB.
   * @param oldRxn The old Reaction object read from the read DB.
   * @throws RuntimeException If any chemical id has no migrated counterpart, i.e. a migrated id list is shorter than
   *                          its source list.
   */
  protected void migrateReactionChemicals(Reaction newRxn, Reaction oldRxn) {
    // TODO: this has been written/re-written too many times.  Lift this into a shared superclass.
    Long[] oldSubstrates = oldRxn.getSubstrates();
    Long[] oldProducts = oldRxn.getProducts();
    List<Long> migratedSubstrates = mapChemicalIds(oldSubstrates);
    List<Long> migratedProducts = mapChemicalIds(oldProducts);

    // Substrate/product counts must be identical before and after migration.
    if (migratedSubstrates.size() != oldSubstrates.length ||
        migratedProducts.size() != oldProducts.length) {
      throw new RuntimeException(String.format(
          "Pre/post substrate/product migration lengths don't match for source reaction %d: %d -> %d, %d -> %d",
          oldRxn.getUUID(), oldSubstrates.length, migratedSubstrates.size(), oldProducts.length, migratedProducts.size()
      ));
    }

    newRxn.setSubstrates(migratedSubstrates.toArray(new Long[migratedSubstrates.size()]));
    newRxn.setProducts(migratedProducts.toArray(new Long[migratedProducts.size()]));

    // Copy over substrate/product coefficients one at a time based on index, which should be consistent.
    // (The length checks above guarantee no id was dropped, so position i in both lists is the same chemical.)
    for (int i = 0; i < migratedSubstrates.size(); i++) {
      newRxn.setSubstrateCoefficient(migratedSubstrates.get(i), oldRxn.getSubstrateCoefficient(oldSubstrates[i]));
    }

    for (int i = 0; i < migratedProducts.size(); i++) {
      newRxn.setProductCoefficient(migratedProducts.get(i), oldRxn.getProductCoefficient(oldProducts[i]));
    }

    Long[] oldSubstrateCofactors = oldRxn.getSubstrateCofactors();
    Long[] oldProductCofactors = oldRxn.getProductCofactors();

    List<Long> migratedSubstrateCofactors = mapChemicalIds(oldSubstrateCofactors);
    List<Long> migratedProductCofactors = mapChemicalIds(oldProductCofactors);

    if (migratedSubstrateCofactors.size() != oldSubstrateCofactors.length ||
        migratedProductCofactors.size() != oldProductCofactors.length) {
      throw new RuntimeException(String.format(
          "Pre/post sub/prod cofactor migration lengths don't match for source reaction %d: %d -> %d, %d -> %d",
          oldRxn.getUUID(), oldSubstrateCofactors.length, migratedSubstrateCofactors.size(),
          oldProductCofactors.length, migratedProductCofactors.size()
      ));
    }

    newRxn.setSubstrateCofactors(migratedSubstrateCofactors.toArray(new Long[migratedSubstrateCofactors.size()]));
    newRxn.setProductCofactors(migratedProductCofactors.toArray(new Long[migratedProductCofactors.size()]));

    Long[] oldCoenzymes = oldRxn.getCoenzymes();
    List<Long> migratedCoenzymes = mapChemicalIds(oldCoenzymes);

    if (migratedCoenzymes.size() != oldCoenzymes.length) {
      throw new RuntimeException(String.format(
          "Pre/post coenzyme migration lengths don't match for source reaction %d: %d -> %d",
          oldRxn.getUUID(), oldCoenzymes.length, migratedCoenzymes.size()
      ));
    }
    newRxn.setCoenzymes(migratedCoenzymes.toArray(new Long[migratedCoenzymes.size()]));
  }

  /**
   * Translates read-DB chemical ids to write-DB ids, in order.  Ids with no recorded mapping are omitted, so callers
   * detect missing chemicals by comparing the result's size with the input's length.
   */
  private List<Long> mapChemicalIds(Long[] chemIds) {
    return Arrays.stream(chemIds)
        .map(oldChemIdToNewChemId::get)
        .filter(newId -> newId != null)
        .collect(Collectors.toList());
  }

  /**
   * Returns the write-DB organism id for a read-DB organism id, creating the organism in the write DB if no entry
   * with the same name exists there yet.  Results (including null for organisms without a name in the read DB) are
   * cached locally to speed up migration.
   */
  private Long migrateOrganism(Long oldOrganismId) {
    // containsKey rather than a null check on get(): a cached null (nameless organism) is a valid answer.
    if (organismMigrationMap.containsKey(oldOrganismId)) {
      return organismMigrationMap.get(oldOrganismId);
    }

    String organismName = api.getReadDB().getOrganismNameFromId(oldOrganismId);

    Long newOrganismId = null;

    // Assume any valid organism entry will have a name.
    if (organismName != null) {
      // TODO: reading from the writeDB is not so good, but we need to not insert twice.  Is there a better way?
      long writeDBOrganismId = api.getWriteDB().getOrganismId(organismName);
      if (writeDBOrganismId != -1) { // -1 is used in MongoDB.java for missing values.
        // Reuse the existing organism entry if we can find a matching one.
        newOrganismId = writeDBOrganismId;
      } else {
        // Use -1 for no NCBI Id.  Note that the NCBI parameter isn't even stored in the DB at present.
        Organism newOrganism = new Organism(oldOrganismId, -1, organismName);
        api.getWriteDB().submitToActOrganismNameDB(newOrganism);
        newOrganismId = newOrganism.getUUID();
      }

    }

    organismMigrationMap.put(oldOrganismId, newOrganismId);

    return newOrganismId;
  }

  /**
   * Builds a copy of a reaction's protein object whose organism and sequence ids refer to the write DB.  Organisms
   * and sequences that have not been migrated yet are written to the write DB as a side effect.
   * @param oldProtein The protein object attached to a reaction in the read DB; must contain a "sequences" array.
   * @return A new protein object with migrated "organism"/"organisms" and "sequences" values.
   */
  protected JSONObject migrateProteinData(JSONObject oldProtein) {
    // Copy the protein object for modification.
    // With help from http://stackoverflow.com/questions/12809779/how-do-i-clone-an-org-json-jsonobject-in-java.
    JSONObject newProtein = new JSONObject(oldProtein, JSONObject.getNames(oldProtein));

    if (oldProtein.has("organism")) {
      // BRENDA protein entries just have one organism, so the migration is a little easier.
      Long oldOrganismId = oldProtein.getLong("organism");
      Long newOrganismId = migrateOrganism(oldOrganismId);
      newProtein.put("organism", newOrganismId);
    } else if (oldProtein.has("organisms")) { // Metacyc proteins use the plural "organisms."  Unclear why, but oh well.
      JSONArray oldOrganisms = oldProtein.getJSONArray("organisms");
      List<Long> newOrganisms = new ArrayList<>(oldOrganisms.length());
      for (int i = 0; i < oldOrganisms.length(); i++) {
        Long oldOrganismId = oldOrganisms.getLong(i);
        Long newOrganismId = migrateOrganism(oldOrganismId);
        newOrganisms.add(newOrganismId);
      }
      newProtein.put("organisms", new JSONArray(newOrganisms));
    }
    // TODO: unify the Protein object schema so this sort of handling isn't necessary.

    JSONArray sequences = oldProtein.getJSONArray("sequences");
    List<Long> newSequenceIds = new ArrayList<>(sequences.length());
    for (int i = 0; i < sequences.length(); i++) {
      newSequenceIds.add(migrateSequence(sequences.getLong(i)));
    }
    // Store the migrated sequence ids for this protein.
    newProtein.put("sequences", new JSONArray(newSequenceIds));

    return newProtein;
  }

  /**
   * Returns the write-DB id for a read-DB sequence id, copying the sequence into the write DB the first time the id
   * is seen and reusing the cached id afterwards.
   */
  private Long migrateSequence(Long sequenceId) {
    // checks if sequence has already been written/migrated
    if (sequenceMigrationMap.containsKey(sequenceId)) {
      return sequenceMigrationMap.get(sequenceId);
    }

    Seq seq = api.getReadDB().getSeqFromID(sequenceId);
    if (seq == null) {
      // Defensive: fail with a message that names the offending id instead of an anonymous NullPointerException.
      throw new IllegalStateException(
          String.format("Sequence %d is referenced by a protein but could not be loaded from the read DB", sequenceId));
    }

    Long oldSeqOrganismId = seq.getOrgId();
    Long newSeqOrganismId = migrateOrganism(oldSeqOrganismId);

    // Remember where this sequence came from in its own metadata.
    seq.getMetadata().put("source_sequence_ids", sequenceId);

    // Store the seq document to get an id that'll be stored in the protein object.
    int seqId = api.getWriteDB().submitToActSeqDB(
        seq.getSrcdb(),
        seq.getEc(),
        seq.getOrgName(),
        newSeqOrganismId, // Use freshly migrated organism id to replace the old one.
        seq.getSequence(),
        seq.getReferences(),
        seq.getReactionsCatalyzed(), // these will be updated in afterProcessReactions()
        MongoDBToJSON.conv(seq.getMetadata())
    );
    // TODO: we should migrate all the seq documents with zero references over to the new DB.

    // Convert to Long to match ID type seen in MongoDB.  TODO: clean up all the IDs, make them all Longs.
    Long newSequenceId = Long.valueOf(seqId);
    sequenceMigrationMap.put(sequenceId, newSequenceId);
    return newSequenceId;
  }

}