ReactionDesalter.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.desalting;

import act.server.NoSQLAPI;
import act.shared.Chemical;
import act.shared.Reaction;
import act.shared.helpers.P;
import chemaxon.license.LicenseProcessingException;
import chemaxon.reaction.ReactionException;
import com.act.biointerpretation.BiointerpretationProcessor;
import com.act.biointerpretation.Utils.ReactionComponent;
import com.act.biointerpretation.Utils.ReactionProjector;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;

import static com.act.biointerpretation.Utils.ReactionComponent.PRODUCT;
import static com.act.biointerpretation.Utils.ReactionComponent.SUBSTRATE;

/**
 * ReactionDesalter itself does the processing of the database using an instance of Desalter.
 * This class creates Synapse from Dr. Know.  Synapse is the database in which the chemicals
 * have been inspected for containing multiple species or ionized forms, and corrected.
 *
 * <p>Processing is done in two passes (see {@link #run()}): first every chemical is desalted and written to the
 * output DB while an old-id -> new-ids mapping is cached; then every reaction is rebuilt against the new chemical
 * ids, with its coefficients scaled by the per-fragment counts reported by the desalter.
 *
 * Created by jca20n on 10/22/15.
 */
public class ReactionDesalter extends BiointerpretationProcessor {
  private static final Logger LOGGER = LogManager.getFormatterLogger(ReactionDesalter.class);
  private static final String PROCESSOR_NAME = "Desalter";

  // InChIs containing this marker are migrated as-is without being run through the desalter.
  private static final String FAKE = "FAKE";

  // Don't use the superclass's maps, as we might convert one chemical into many.
  private final Map<Long, List<Long>> oldChemicalIdToNewChemicalIds = new HashMap<>();
  // Desalted (or unaltered) InChI -> id in the output DB; used to avoid writing the same fragment twice.
  private final Map<String, Long> inchiToNewId = new HashMap<>();
  // Old + new ids -> coeff. multiplier.
  private final Map<Pair<Long, Long>, Integer> desalterMultiplerMap = new HashMap<>();
  private Desalter desalter;
  private int desalterFailuresCounter = 0;

  @Override
  public String getName() {
    return PROCESSOR_NAME;
  }

  public ReactionDesalter(NoSQLAPI inputApi) {
    super(inputApi);
  }

  /**
   * Creates the desalter, initializes its reactors and marks this processor as initialized.
   */
  @Override
  public void init() throws IOException, ReactionException, LicenseProcessingException {
    desalter = new Desalter(new ReactionProjector());
    desalter.initReactors();
    markInitialized();
  }

  /**
   * This function reads the chemicals and reactions from the db, desalts them and writes them back.
   * Chemicals are processed first so that the old -> new id mapping is complete before reactions are migrated.
   */
  @Override
  public void run() throws IOException, LicenseProcessingException, ReactionException {
    failIfNotInitialized();

    LOGGER.debug("Starting Reaction Desalter");
    long startTime = System.currentTimeMillis();

    desaltAllChemicals();
    desaltAllReactions();

    long endTime = System.currentTimeMillis();
    // LOGGER is a formatter logger, so it can do the formatting itself.
    LOGGER.debug("Time in seconds: %d", (endTime - startTime) / 1000);
  }

  /**
   * Desalts every chemical read from the input DB, caching the old -> new id mappings for the reaction pass.
   */
  public void desaltAllChemicals() throws IOException, LicenseProcessingException, ReactionException {
    Iterator<Chemical> chemicals = getNoSQLAPI().readChemsFromInKnowledgeGraph();
    while (chemicals.hasNext()) {
      Chemical chem = chemicals.next();
      desaltChemical(chem); // Ignore results, as the cached mapping will be used for reaction desalting.
    }
    LOGGER.info("Encountered %d failures while desalting all molecules", desalterFailuresCounter);
  }

  /**
   * Rebuilds every reaction read from the input DB against the desalted chemical ids and writes it to the output DB.
   * Expects the chemical id mappings to have been populated already (see {@link #desaltAllChemicals()}); a reaction
   * that references an unmapped chemical id results in a RuntimeException.
   */
  public void desaltAllReactions() throws IOException, LicenseProcessingException, ReactionException {
    //Scan through all Reactions and process each one.
    Iterator<Reaction> reactionIterator = getNoSQLAPI().readRxnsFromInKnowledgeGraph();

    while (reactionIterator.hasNext()) {
      Reaction oldRxn = reactionIterator.next();

      // I don't like modifying reaction objects in place, so we'll create a fresh one and write it to the new DB.
      Reaction desaltedReaction = new Reaction(
          -1, // Assume the id will be set when the reaction is written to the DB.
          new Long[0],
          new Long[0],
          new Long[0],
          new Long[0],
          new Long[0],
          oldRxn.getECNum(),
          oldRxn.getConversionDirection(),
          oldRxn.getPathwayStepDirection(),
          oldRxn.getReactionName(),
          oldRxn.getRxnDetailType()
      );

      // Add the data source and references from the source to the destination
      desaltedReaction.setDataSource(oldRxn.getDataSource());
      for (P<Reaction.RefDataSource, String> ref : oldRxn.getReferences()) {
        desaltedReaction.addReference(ref.fst(), ref.snd());
      }

      migrateReactionSubsProdsWCoeffs(desaltedReaction, oldRxn);

      int newId = getNoSQLAPI().writeToOutKnowlegeGraph(desaltedReaction);

      migrateAllProteins(desaltedReaction, oldRxn, Long.valueOf(oldRxn.getUUID()));

      // Update the reaction in the DB with the newly migrated protein data.
      getNoSQLAPI().getWriteDB().updateActReaction(desaltedReaction, newId);
    }

  }

  /**
   * Copies substrates, products, their coefficients and the cofactors from the old reaction to the new one,
   * translating every old chemical id into its desalted id(s).
   *
   * @param newReaction The reaction to populate.
   * @param oldReaction The source reaction whose chemical ids are translated.
   */
  private void migrateReactionSubsProdsWCoeffs(Reaction newReaction, Reaction oldReaction) {
    Pair<List<Long>, Map<Long, Integer>> newSubstratesAndCoefficients =
        buildIdAndCoefficientMapping(oldReaction, SUBSTRATE);
    newReaction.setSubstrates(newSubstratesAndCoefficients.getLeft().toArray(new Long[0]));
    newReaction.setAllSubstrateCoefficients(newSubstratesAndCoefficients.getRight());

    List<Long> newSubstrateCofactors = buildIdMapping(oldReaction.getSubstrateCofactors());
    newReaction.setSubstrateCofactors(newSubstrateCofactors.toArray(new Long[0]));

    Pair<List<Long>, Map<Long, Integer>> newProductsAndCoefficients =
        buildIdAndCoefficientMapping(oldReaction, PRODUCT);
    newReaction.setProducts(newProductsAndCoefficients.getLeft().toArray(new Long[0]));
    newReaction.setAllProductCoefficients(newProductsAndCoefficients.getRight());

    List<Long> newProductCofactors = buildIdMapping(oldReaction.getProductCofactors());
    newReaction.setProductCofactors(newProductCofactors.toArray(new Long[0]));
  }

  /**
   * Translates old chemical ids to their desalted ids, dropping duplicates while keeping first-seen order.
   *
   * @param oldChemIds Chemical ids from the input DB.
   * @return The corresponding de-duplicated ids in the output DB.
   * @throws RuntimeException if an old id has no cached mapping.
   */
  private List<Long> buildIdMapping(Long[] oldChemIds) {
    LinkedHashSet<Long> newIDs = new LinkedHashSet<>(oldChemIds.length);

    for (Long oldChemId : oldChemIds) {
      List<Long> newChemIds = oldChemicalIdToNewChemicalIds.get(oldChemId);
      if (newChemIds == null) {
        throw new RuntimeException(
            String.format("Found old chemical id %d that is not in the old -> new chem id map", oldChemId));
      }

      newIDs.addAll(newChemIds);
    }

    // The copy constructor adds elements in the source collection's iteration order, i.e. insertion order here.
    return new ArrayList<>(newIDs);
  }

  /**
   * Translates the substrates or products of a reaction to desalted ids and computes a coefficient for each new id.
   * A new id's coefficient is the sum, over every old chemical that desalts to it, of the old reaction coefficient
   * times the desalting multiplier for that (old id, new id) pair.  If any contributing coefficient is null the
   * result for that new id is null.
   *
   * @param oldRxn The reaction whose chemicals should be translated.
   * @param sOrP   Whether to process the substrates or the products of the reaction.
   * @return The de-duplicated new ids in first-seen order, and a map from new id to (possibly null) coefficient.
   * @throws RuntimeException if an old id has no cached mapping or no desalting multiplier.
   */
  private Pair<List<Long>, Map<Long, Integer>> buildIdAndCoefficientMapping(Reaction oldRxn, ReactionComponent sOrP) {
    Long[] oldChemIds = sOrP == SUBSTRATE ? oldRxn.getSubstrates() : oldRxn.getProducts();
    List<Long> resultIds = new ArrayList<>(oldChemIds.length);
    Map<Long, Integer> newIdToCoefficientMap = new HashMap<>(oldChemIds.length);

    for (Long oldChemId : oldChemIds) {
      // This value is shared by every fragment of oldChemId, so it must never be modified inside the loop below.
      final Integer originalRxnCoefficient = sOrP == SUBSTRATE ?
          oldRxn.getSubstrateCoefficient(oldChemId) : oldRxn.getProductCoefficient(oldChemId);

      List<Long> newChemIds = oldChemicalIdToNewChemicalIds.get(oldChemId);
      if (newChemIds == null) {
        throw new RuntimeException(
            String.format("Found old chemical id %d that is not in the old -> new chem id map", oldChemId));
      }

      for (Long newChemId : newChemIds) {
        // Deduplicate new chemicals in the list based on whether we've assigned coefficients for them or not.
        if (newIdToCoefficientMap.containsKey(newChemId)) {
          Integer coefficientAccumulator = newIdToCoefficientMap.get(newChemId);

          // If only one coefficient is null, we have a problem.  Just write null and hope we can figure it out later.
          if ((coefficientAccumulator == null && originalRxnCoefficient != null) ||
              (coefficientAccumulator != null && originalRxnCoefficient == null)) {
            LOGGER.error("Found null coefficient that needs to be merged with non-null coefficient. " +
                "New chem id: %d, old chem id: %d, coefficient value: %d, old rxn id: %d",
                newChemId, oldChemId, originalRxnCoefficient, oldRxn.getUUID());
            newIdToCoefficientMap.put(newChemId, null);
          } else if (coefficientAccumulator != null && originalRxnCoefficient != null) {
            /* If neither are null, multiply the coefficient to be added by the desalting multiplier and sum that
             * product with the existing count for this molecule.  The product is kept in a local so that the
             * scaling for this fragment does not leak into the other fragments of the same old chemical. */
            int scaledCoefficient = originalRxnCoefficient * lookupDesalterMultiplier(oldChemId, newChemId);
            newIdToCoefficientMap.put(newChemId, coefficientAccumulator + scaledCoefficient);
          } // Else both are null we don't need to do anything.

          // We don't need to add this new id to the list of substrates/products because it's already there.
        } else {
          resultIds.add(newChemId); // Add the new id to the subs/prods list.
          int desalterMultiplier = lookupDesalterMultiplier(oldChemId, newChemId);
          if (originalRxnCoefficient == null) {
            if (desalterMultiplier != 1) {
              LOGGER.warn("Ignoring >1 desalting multipler due to existing null coefficient.  " +
                    "New chem id: %d, old chem id: %d, coefficient value: null, multiplier: %d, old rxn id: %d",
                newChemId, oldChemId, desalterMultiplier, oldRxn.getUUID());
            }
            newIdToCoefficientMap.put(newChemId, null);
          } else {
            newIdToCoefficientMap.put(newChemId, originalRxnCoefficient * desalterMultiplier);
          }
        }
      }
    }
    return Pair.of(resultIds, newIdToCoefficientMap);
  }

  /**
   * Fetches the desalting multiplier cached for an (old id, new id) pair.
   *
   * @param oldChemId Chemical id in the input DB.
   * @param newChemId Chemical id in the output DB.
   * @return The number of copies of the new chemical produced by desalting the old one.
   * @throws RuntimeException if no multiplier was cached for the pair.
   */
  private int lookupDesalterMultiplier(Long oldChemId, Long newChemId) {
    Integer multiplier = desalterMultiplerMap.get(Pair.of(oldChemId, newChemId));
    if (multiplier == null) {
      throw new RuntimeException(String.format(
          "Found no desalting multiplier for old chemical id %d -> new chemical id %d", oldChemId, newChemId));
    }
    return multiplier;
  }

  /**
   * This function desalts a single chemical and returns the resulting ids of the modified chemicals that have been
   * written to the destination DB.  The results of the desalting process are also cached for later use in mapping
   * chemicals from the old DB to the new.  If the chemicals cannot be desalted, we just migrate the chemical unaltered.
   * If the chemical was already desalted, an error is logged and the cached ids are returned without writing
   * anything to the DB again.
   *
   * @param chemical A chemical to desalt.
   * @return A list of output ids of desalted chemicals
   */
  private List<Long> desaltChemical(Chemical chemical) throws IOException, ReactionException {
    Long originalId = chemical.getUuid();

    // If the chemical's ID maps to a pre-seen entry, use its existing new ids rather than writing duplicates.
    List<Long> previouslyDesaltedIds = oldChemicalIdToNewChemicalIds.get(originalId);
    if (previouslyDesaltedIds != null) {
      LOGGER.error("desaltChemical was called on a chemical that was already desalted: %d", originalId);
      return previouslyDesaltedIds;
    }

    // Otherwise need to clean the chemical
    String inchi = chemical.getInChI();

    // If it's FAKE, just go with it
    if (inchi.contains(FAKE)) {
      return migrateUnalteredChemical(chemical, originalId, inchi);
    }

    Map<String, Integer> cleanedInchis;
    try {
      cleanedInchis = desalter.desaltInchi(inchi);
    } catch (Exception e) {
      // TODO: probably should handle this error differently, currently just letting pass unaltered
      // Pass the exception itself along so that the stack trace is not lost.
      LOGGER.error(String.format("Exception caught when desalting chemical %d: %s", originalId, e.getMessage()), e);
      desalterFailuresCounter++;
      return migrateUnalteredChemical(chemical, originalId, inchi);
    }

    List<Long> newIds = new ArrayList<>();
    // For each cleaned chemical, put in DB or update ID
    for (Map.Entry<String, Integer> pair : cleanedInchis.entrySet()) {
      String cleanInchi = pair.getKey();
      // If the cleaned inchi is already in DB, use existing ID, and hash the id
      long newId;

      if (inchiToNewId.containsKey(cleanInchi)) {
        newId = inchiToNewId.get(cleanInchi);
      } else {
        // Otherwise update the chemical, put into DB, and hash the id and inchi
        chemical.setInchi(cleanInchi);
        newId = getNoSQLAPI().writeToOutKnowlegeGraph(chemical); // Write to the db
        inchiToNewId.put(cleanInchi, newId);
      }
      /* The desalter converts complex molecules into a set of unique fragments, but maintains a count of those
       * fragments.  That fragment count must be multiplied by any reaction-specific coefficient to get the
       * true count of fragments participating in a reaction.
       *
       * Because different salts may have different coefficients, we must key the multiplier on (oldId, newId) to
       * ensure that multiple occurrences of the same desalted molecule in a reaction don't overwrite each other.
       *
       * We must save every (oldId, newId) pair, as each pair will be unique even if newId was seen before. */
      desalterMultiplerMap.put(Pair.of(originalId, newId), pair.getValue());

      newIds.add(newId);
    }

    // Store and return the cached list of chemical ids that we just created.  Make them immutable for safety's sake.
    List<Long> resultsToCache = Collections.unmodifiableList(newIds);
    oldChemicalIdToNewChemicalIds.put(originalId, resultsToCache);
    return resultsToCache;
  }

  /**
   * Writes a chemical to the output DB without modification and caches a one-to-one mapping (multiplier 1) for it.
   *
   * @param chemical   The chemical to write as-is.
   * @param originalId The chemical's id in the input DB.
   * @param inchi      The chemical's (unmodified) InChI.
   * @return An immutable single-element list holding the chemical's id in the output DB.
   */
  private List<Long> migrateUnalteredChemical(Chemical chemical, Long originalId, String inchi)
      throws IOException, ReactionException {
    long newId = getNoSQLAPI().writeToOutKnowlegeGraph(chemical); // Write to the db
    List<Long> singletonId = Collections.singletonList(newId); // singletonList is already immutable.
    inchiToNewId.put(inchi, newId);
    desalterMultiplerMap.put(Pair.of(originalId, newId), 1);
    oldChemicalIdToNewChemicalIds.put(originalId, singletonId);
    return singletonId;
  }
}