/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program. If not, see <http://www.gnu.org/licenses/>.  *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.cofactorremoval;

import act.server.NoSQLAPI;
import act.shared.Chemical;
import act.shared.Reaction;
import com.act.biointerpretation.BiointerpretationProcessor;
import com.act.biointerpretation.Utils.ReactionComponent;
import com.act.biointerpretation.mechanisminspection.BlacklistedInchisCorpus;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import static com.act.biointerpretation.Utils.ReactionComponent.PRODUCT;
import static com.act.biointerpretation.Utils.ReactionComponent.SUBSTRATE;

/**
 * This class reads in reactions from a read DB and processes each one so that cofactors are binned into
 * separate substrate/product cofactor lists.  It removes both concrete cofactors (i.e., ones with precise InChIs)
 * as well as abstract ones (i.e., FAKE InChIs).  It sequentially removes cofactors in a prioritized manner until
 * only one substrate and one product remain.
 *
 * Uniqueness in this database is determined by the combination of:
 * 1. the remaining substrate
 * 2. the remaining product
 * 3. the names of the substrate cofactors
 * 4. the names of the product cofactors
 *
 * Created by jca20n on 2/15/16.
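 *
 * <p>Illustrative end-to-end usage sketch (not taken from this file: the DB names are hypothetical, and it
 * assumes the {@code NoSQLAPI(readDB, writeDB)} constructor plus the {@code run()} driver inherited from
 * {@link BiointerpretationProcessor}):
 * <pre>{@code
 *   NoSQLAPI api = new NoSQLAPI("source_db", "dest_db"); // hypothetical read/write DB names
 *   CofactorRemover remover = new CofactorRemover(api);
 *   remover.init(); // loads the cofactor and blacklisted-InChI corpuses
 *   remover.run();  // migrates chemicals, then partitions cofactors out of each reaction
 * }</pre>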
 */
public class CofactorRemover extends BiointerpretationProcessor {
  private static final Logger LOGGER = LogManager.getFormatterLogger(CofactorRemover.class);
  private static final String PROCESSOR_NAME = "Cofactor Remover";
  private static final String FAKE = "FAKE";

  private FakeCofactorFinder fakeFinder;
  private CofactorsCorpus cofactorsCorpus;
  private Set<Long> knownCofactorReadDBIds = new HashSet<>();
  private Set<Long> knownCofactorWriteDBIds = null;
  private BlacklistedInchisCorpus blacklistedInchisCorpus;

  @Override
  public String getName() {
    return PROCESSOR_NAME;
  }

  public CofactorRemover(NoSQLAPI api) {
    super(api);
    fakeFinder = new FakeCofactorFinder();
  }

  public void init() throws IOException {
    cofactorsCorpus = new CofactorsCorpus();
    cofactorsCorpus.loadCorpus();

    blacklistedInchisCorpus = new BlacklistedInchisCorpus();
    blacklistedInchisCorpus.loadCorpus();

    markInitialized();
  }

  @Override
  protected Chemical runSpecializedChemicalProcessing(Chemical chem) {
    return assignCofactorStatus(chem);
  }

  private Chemical assignCofactorStatus(Chemical chemical) {
    Long oldId = chemical.getUuid();

    // First, check if the InChI needs to be updated.  A few cofactors are known to have broken InChIs.
    String inchi = blacklistedInchisCorpus.renameInchiIfFoundInBlacklist(chemical.getInChI());
    chemical.setInchi(inchi);

    boolean isCofactor = false;
    if (cofactorsCorpus.getInchiToName().containsKey(inchi)) {
      isCofactor = true;
    } else if (inchi.contains(FAKE) && (fakeFinder.scanAndReturnCofactorNameIfItExists(chemical) != null)) {
      // TODO: Abstract the FAKE InChI checks into their own utility class.
      isCofactor = true;
    }

    // Set isCofactor *without* looking at previous determinations.  This is the single source of truth for cofactors.
    chemical.setIsCofactor(isCofactor);
    if (isCofactor) {
      knownCofactorReadDBIds.add(oldId);
    }

    return chemical;
  }

  @Override
  protected void afterProcessChemicals() {
    LOGGER.info("Found %d cofactors amongst %d migrated chemicals",
        knownCofactorReadDBIds.size(), getOldChemIdToNewChemId().size());
    LOGGER.info("Building cofactor status map for new chemical ids to facilitate cofactor removal");
    knownCofactorWriteDBIds = new HashSet<>(knownCofactorReadDBIds.size());
    for (Long oldId : knownCofactorReadDBIds) {
      knownCofactorWriteDBIds.add(mapOldChemIdToNewId(oldId));
    }

    if (knownCofactorWriteDBIds.size() != knownCofactorReadDBIds.size()) {
      String msg = String.format("Old and new cofactor id sets do not match in size: %d vs. %d",
          knownCofactorReadDBIds.size(), knownCofactorWriteDBIds.size());
      if (knownCofactorWriteDBIds.size() > knownCofactorReadDBIds.size()) {
        LOGGER.error(msg);
        throw new RuntimeException(msg);
      } else {
        LOGGER.warn("%s (might be the result of blacklisted InChI correction, " +
            "which can reduce the number of cofactors in the new DB)", msg);
      }
    }

    LOGGER.info("New cofactor id map constructed, ready to process reactions.");

    /* TODO: we want to prevent any further access to the old map of ids to avoid accidental use instead of
     * knownCofactorWriteDBIds.  Is there a better way than this? */
    knownCofactorReadDBIds = null;
  }

  @Override
  protected Reaction preProcessReaction(Reaction rxn) {
    findAndIsolateCoenzymesFromReaction(rxn);
    // Make sure there are enough co/products and co/substrates in the processed reaction.
    if ((rxn.getSubstrates().length == 0 && rxn.getSubstrateCofactors().length == 0) ||
        (rxn.getProducts().length == 0 && rxn.getProductCofactors().length == 0)) {
      LOGGER.warn("Reaction %d does not have any products or substrates after coenzyme removal.", rxn.getUUID());
      return null;
    }
    return rxn;
  }

  /**
   * Removes chemicals that appear on both sides of the reaction (coenzymes) from the substrates and products, and
   * de-duplicates the entries within each category.
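   *
   * <p>Worked example with illustrative chemical ids (not drawn from any real DB):
   * <pre>{@code
   *   // before: substrates = [1, 2, 3], products = [3, 4], coenzymes = []
   *   // after:  substrates = [1, 2],    products = [4],    coenzymes = [3]
   * }</pre>
   *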
   * @param reaction The reaction being updated.
   */
  private void findAndIsolateCoenzymesFromReaction(Reaction reaction) {
    // Build ordered sets of the substrates/products.
    LinkedHashSet<Long> substrates = new LinkedHashSet<>(Arrays.asList(reaction.getSubstrates()));
    LinkedHashSet<Long> products = new LinkedHashSet<>(Arrays.asList(reaction.getProducts()));

    // Compute the intersection between the sets.
    Set<Long> intersection = new HashSet<>(substrates);
    intersection.retainAll(products);

    // A - intersection(A, B) = A \ B
    substrates.removeAll(intersection);
    products.removeAll(intersection);

    // Update the reaction with the new (ordered) substrates/products + coenzymes.
    reaction.setSubstrates(substrates.toArray(new Long[substrates.size()]));
    reaction.setProducts(products.toArray(new Long[products.size()]));

    // Keep any existing coenzymes, but don't use them when computing the difference--they might be there for a reason.
    intersection.addAll(Arrays.asList(reaction.getCoenzymes()));
    reaction.setCoenzymes(intersection.toArray(new Long[intersection.size()]));
  }

  @Override
  protected Reaction runSpecializedReactionProcessing(Reaction rxn, Long newId) {
    // Bump the cofactors up to the cofactor lists and update all substrates/products and their coefficients accordingly.
    updateReactionProductOrSubstrate(rxn, SUBSTRATE);
    updateReactionProductOrSubstrate(rxn, PRODUCT);
    return rxn;
  }

  /**
   * This function is the meat of the cofactor removal process.  It extracts all cofactors based on their ids and
   * places them in the appropriate collection within the reaction.  Note that because this is executed by
   * BiointerpretationProcessor's `runSpecializedReactionProcessing` hook, the chemical ids have already been updated
   * to reference the chemical entries in the WriteDB.
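   *
   * <p>Illustrative example, assuming (hypothetically) that 11 is the only one of these write-DB ids that was
   * flagged as a cofactor during chemical processing:
   * <pre>{@code
   *   // before: substrates = [10, 11, 12], substrateCofactors = []
   *   // after:  substrates = [10, 12],     substrateCofactors = [11]
   * }</pre>
   *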
   * @param reaction The reaction to update.
   * @param component Update substrates or products.
   */
  private void updateReactionProductOrSubstrate(Reaction reaction, ReactionComponent component) {
    Long[] chemIds, originalCofactorIds;
    if (component == SUBSTRATE) {
      chemIds = reaction.getSubstrates();
      originalCofactorIds = reaction.getSubstrateCofactors();
    } else {
      chemIds = reaction.getProducts();
      originalCofactorIds = reaction.getProductCofactors();
    }

    Map<Boolean, List<Long>> partitionedIds =
        Arrays.stream(chemIds).collect(Collectors.partitioningBy(knownCofactorWriteDBIds::contains));

    List<Long> cofactorIds = partitionedIds.containsKey(true) ? partitionedIds.get(true) : Collections.emptyList();
    List<Long> nonCofactorIds = partitionedIds.containsKey(false) ? partitionedIds.get(false) : Collections.emptyList();

    // Retain previously partitioned cofactors if any exist.
    if (originalCofactorIds != null && originalCofactorIds.length > 0) {
      // Use an ordered set to de-duplicate the partitioned and previously specified cofactors.  Original cofactors go first.
      LinkedHashSet<Long> uniqueCofactorIds = new LinkedHashSet<>(Arrays.asList(originalCofactorIds));
      uniqueCofactorIds.addAll(cofactorIds);
      /* We do this potentially expensive de-duplication step only in the presumably rare case that we find a reaction
       * that already has cofactors set.  A reaction that has not already undergone cofactor removal is very unlikely
       * to have cofactors partitioned from substrates/products. */
      cofactorIds = new ArrayList<>(uniqueCofactorIds);
    }

    // Coefficients for cofactors should automatically fall out when we update the substrate/product list.
    if (component == SUBSTRATE) {
      reaction.setSubstrateCofactors(cofactorIds.toArray(new Long[cofactorIds.size()]));
      reaction.setSubstrates(nonCofactorIds.toArray(new Long[nonCofactorIds.size()]));
      /* Coefficients should already have been set when the reaction was migrated to the new DB, so no need to update.
       * Note that this assumption depends strongly on the current coefficient implementation in the Reaction model. */
    } else {
      reaction.setProductCofactors(cofactorIds.toArray(new Long[cofactorIds.size()]));
      reaction.setProducts(nonCofactorIds.toArray(new Long[nonCofactorIds.size()]));
    }
  }

  /**
   * Removes cofactors from a single reaction by its ID.
   *
   * Important: do not call this on an object that has been/will be used to process an entire DB (via the `run` method,
   * for example).  The two approaches to cofactor removal use the same cache objects, which will be corrupted if the
   * object is reused (hence this method being protected).
   *
   * @param rxnId The id of the reaction to process.
   * @return The original and modified reaction objects.
   * @throws IOException
   */
  protected Pair<Reaction, Reaction> removeCofactorsFromOneReaction(Long rxnId) throws IOException {
    Reaction oldRxn = getNoSQLAPI().readReactionFromInKnowledgeGraph(rxnId);
    if (oldRxn == null) {
      LOGGER.error("Could not find reaction %d in the DB", rxnId);
      return null;
    }

    Set<Long> allChemicalIds = new HashSet<>();
    allChemicalIds.addAll(Arrays.asList(oldRxn.getSubstrates()));
    allChemicalIds.addAll(Arrays.asList(oldRxn.getProducts()));
    allChemicalIds.addAll(Arrays.asList(oldRxn.getSubstrateCofactors()));
    allChemicalIds.addAll(Arrays.asList(oldRxn.getProductCofactors()));
    allChemicalIds.addAll(Arrays.asList(oldRxn.getCoenzymes()));

    for (Long id : allChemicalIds) {
      Chemical chem = getNoSQLAPI().readChemicalFromInKnowledgeGraph(id);
      if (chem == null) {
        LOGGER.error("Unable to find chemical %d for reaction %d in the DB", id, rxnId);
        return null;
      }
      // Simulate chemical migration so we play nicely with the cofactor remover.
      getOldChemIdToNewChemId().put(id, id);
      getNewChemIdToInchi().put(id, chem.getInChI());
      chem = assignCofactorStatus(chem);
      if (chem.isCofactor()) {
        LOGGER.info("Found participating cofactor %d: %s", chem.getUuid(), chem.getInChI());
      }
    }

    Reaction newRxn = new Reaction(
        -1,
        oldRxn.getSubstrates(),
        oldRxn.getProducts(),
        oldRxn.getSubstrateCofactors(),
        oldRxn.getProductCofactors(),
        oldRxn.getCoenzymes(),
        oldRxn.getECNum(),
        oldRxn.getConversionDirection(),
        oldRxn.getPathwayStepDirection(),
        oldRxn.getReactionName(),
        oldRxn.getRxnDetailType()
    );

    findAndIsolateCoenzymesFromReaction(newRxn);
    newRxn = runSpecializedReactionProcessing(newRxn, -1L);

    return Pair.of(oldRxn, newRxn);
  }
}
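
/* Illustrative single-reaction sketch (hypothetical DB names and reaction id).  Note that
 * removeCofactorsFromOneReaction is protected, so a call like this would live in this package or in a subclass,
 * and the CofactorRemover instance must not also be used to run() over an entire DB:
 *
 *   NoSQLAPI api = new NoSQLAPI("source_db", "dest_db");
 *   CofactorRemover remover = new CofactorRemover(api);
 *   remover.init();
 *   Pair<Reaction, Reaction> beforeAndAfter = remover.removeCofactorsFromOneReaction(12345L);
 */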