ReactionMerger.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.reactionmerging;

import act.server.NoSQLAPI;
import act.shared.Reaction;
import act.shared.helpers.P;
import com.act.biointerpretation.BiointerpretationProcessor;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.biopax.paxtools.model.level3.ConversionDirectionType;
import org.biopax.paxtools.model.level3.StepDirection;
import org.json.JSONObject;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;

/**
 * This creates Dr. Know from Lucille.  Dr. Know is the database in which all Reactions
 * have been merged based on the sameness of the reactions and product ids.
 */
public class ReactionMerger extends BiointerpretationProcessor {
  private static final Logger LOGGER = LogManager.getFormatterLogger(ReactionMerger.class);
  private static final String PROCESSOR_NAME = "Reaction Merger";

  @Override
  public String getName() {
    return PROCESSOR_NAME;
  }

  public ReactionMerger(NoSQLAPI noSQLAPI) {
    super(noSQLAPI);
  }

  @Override
  public void init() {
    // Do nothing for this class, as there's no initialization necessary.
    markInitialized();
  }

  @Override
  public void processReactions() {
    LOGGER.info("Reading all reactions");
    Iterator<Reaction> rxns = getNoSQLAPI().readRxnsFromInKnowledgeGraph();
    Map<SubstratesProducts, PriorityQueue<Long>> reactionGroups = hashReactions(rxns);
    LOGGER.info("Found %d reaction groups, merging", reactionGroups.size());
    mergeAllReactions(reactionGroups);
    LOGGER.info("Done merging reactions");
  }

  protected static Map<SubstratesProducts, PriorityQueue<Long>> hashReactions(Iterator<Reaction> reactionIterator) {
    HashMap<SubstratesProducts, PriorityQueue<Long>> reactionGroups = new HashMap<>();

    // Add the next available reaction to the map of substrates+products -> ids.
    // TODO: spill this map to disk if the map gets too large.
    while (reactionIterator.hasNext()) {
      Reaction rxn = reactionIterator.next();
      SubstratesProducts sp = new SubstratesProducts(rxn);
      PriorityQueue<Long> pq = reactionGroups.get(sp);
      Long id = Long.valueOf(rxn.getUUID());

      if (pq != null) {
        pq.add(id);
      } else {
        pq = new PriorityQueue<>(1);
        pq.add(id);
        reactionGroups.put(sp, pq);
      }
    }

    return reactionGroups;
  }

  public static class SubstratesProducts {
    // TODO: also consider ec-umber, coefficients, and other reaction attributes.
    Set<Long> substrates = null, products = null,
        substrateCofactors = null, productCofactors = null, coenzymes = null;
    Map<Long, Integer> substrateCoefficients = null, productCoefficients = null;
    String ecnum = null;
    ConversionDirectionType conversionDirectionType = null;
    StepDirection pathwayStepDirection = null;

    public SubstratesProducts(Reaction reaction) {
      // TODO: should we copy these to be safe, or just assume nobody will mess with them?
      this.substrates = new HashSet<>(Arrays.asList(reaction.getSubstrates()));
      this.products = new HashSet<>(Arrays.asList(reaction.getProducts()));
      this.substrateCofactors = new HashSet<>(Arrays.asList(reaction.getSubstrateCofactors()));
      this.productCofactors = new HashSet<>(Arrays.asList(reaction.getProductCofactors()));
      this.coenzymes = new HashSet<>(Arrays.asList(reaction.getCoenzymes()));

      this.substrateCoefficients = new HashMap<>(this.substrates.size());
      for (Long id : reaction.getSubstrateIdsOfSubstrateCoefficients()) {
        this.substrateCoefficients.put(id, reaction.getSubstrateCoefficient(id));
      }

      this.productCoefficients = new HashMap<>(this.products.size());
      for (Long id : reaction.getProductIdsOfProductCoefficients()) {
        this.productCoefficients.put(id, reaction.getProductCoefficient(id));
      }

      this.ecnum = reaction.getECNum();
      this.conversionDirectionType = reaction.getConversionDirection();
      this.pathwayStepDirection = reaction.getPathwayStepDirection();
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) return true;
      if (o == null || getClass() != o.getClass()) return false;

      SubstratesProducts that = (SubstratesProducts) o;

      if (!substrates.equals(that.substrates)) return false;
      if (!products.equals(that.products)) return false;
      if (!substrateCofactors.equals(that.substrateCofactors)) return false;
      if (!productCofactors.equals(that.productCofactors)) return false;
      if (!coenzymes.equals(that.coenzymes)) return false;
      if (!substrateCoefficients.equals(that.substrateCoefficients)) return false;
      if (!productCoefficients.equals(that.productCoefficients)) return false;
      if (ecnum != null ? !ecnum.equals(that.ecnum) : that.ecnum != null) return false;
      return conversionDirectionType == that.conversionDirectionType &&
          pathwayStepDirection == that.pathwayStepDirection;

    }

    @Override
    public int hashCode() {
      int result = substrates.hashCode();
      result = 31 * result + products.hashCode();
      result = 31 * result + substrateCofactors.hashCode();
      result = 31 * result + productCofactors.hashCode();
      result = 31 * result + coenzymes.hashCode();
      result = 31 * result + substrateCoefficients.hashCode();
      result = 31 * result + productCoefficients.hashCode();
      result = 31 * result + (ecnum != null ? ecnum.hashCode() : 0);
      result = 31 * result + (conversionDirectionType != null ? conversionDirectionType.hashCode() : 0);
      result = 31 * result + (pathwayStepDirection != null ? pathwayStepDirection.hashCode() : 0);
      return result;
    }
  }

  private Reaction mergeReactions(List<Reaction> reactions) {
    if (reactions.size() < 1) {
      return null;
    }
    Reaction fr = reactions.get(0); // fr = First reaction; we'll refer to it a lot in a moment.
    Reaction mergedReaction = new Reaction(
        -1, // Assume the id will be set when the reaction is written to the DB.
        fr.getSubstrates(),
        fr.getProducts(),
        fr.getSubstrateCofactors(),
        fr.getProductCofactors(),
        fr.getCoenzymes(),
        fr.getECNum(),
        fr.getConversionDirection(),
        fr.getPathwayStepDirection(),
        fr.getReactionName(),
        fr.getRxnDetailType()
    );
    mergedReaction.setDataSource(fr.getDataSource());
    // Write stub reaction to DB to get its id, which is required for migrating sequences.
    int newId = getNoSQLAPI().writeToOutKnowlegeGraph(mergedReaction);

    // TODO: are there other fields we need to capture in this merge?
    // TODO: add source ids for the various attributes to make debugging easier.
    for (Reaction r : reactions) {
      // TODO: should these be sorted before adding?
      for (P<Reaction.RefDataSource, String> ref : r.getReferences()) {
        mergedReaction.addReference(ref.fst(), ref.snd());
      }
      for (JSONObject protein : r.getProteinData()) {
        // Save the source reaction ID for debugging/verification purposes.  TODO: is adding a field like this okay?
        protein.put("source_reaction_id", r.getUUID());
        writeMigratedReactionMap((long) r.getUUID(), (long) newId);
        JSONObject newProteinData = migrateProteinData(protein);
        mergedReaction.addProteinData(newProteinData);
      }

      // Set the data source as MERGED if this is a combination of multiple sources.  The protein data will store which.
      if (mergedReaction.getDataSource() != Reaction.RxnDataSource.MERGED &&
          mergedReaction.getDataSource() != r.getDataSource()) {
        mergedReaction.setDataSource(Reaction.RxnDataSource.MERGED);
      }
    }

    migrateReactionChemicals(mergedReaction, fr);

    // Update the reaction in the DB with the newly migrated protein data.
    getNoSQLAPI().getWriteDB().updateActReaction(mergedReaction, newId);

    return mergedReaction;
  }

  protected void mergeAllReactions(Map<SubstratesProducts, PriorityQueue<Long>> reactionGroups) {
    /* Maintain stability by constructing the ordered set of minimum group reaction ids so that we can iterate
     * over reactions in the same order they occur in the source DB.  Stability makes life easier in a number of ways
     * (easier testing, deterministic output, general sanity) so we go to the trouble here. */
    final HashMap<Long, PriorityQueue<Long>> minGroupIdsToGroups = new HashMap<>(reactionGroups.size());
    for (Map.Entry<SubstratesProducts, PriorityQueue<Long>> entry : reactionGroups.entrySet()) {
      minGroupIdsToGroups.put(entry.getValue().peek(), entry.getValue());
    }

    List<Long> orderedIds = Arrays.asList(minGroupIdsToGroups.keySet().toArray(new Long[minGroupIdsToGroups.size()]));
    Collections.sort(orderedIds);

    for (Long nextId : orderedIds) {
      PriorityQueue<Long> groupIds = minGroupIdsToGroups.get(nextId);
      List<Reaction> reactions = new ArrayList<>(groupIds.size());
      for (Long id : groupIds) {
        // Since we've only installed reaction IDs based on instances we've seen, this should be safe.
        reactions.add(getNoSQLAPI().readReactionFromInKnowledgeGraph(id));
      }

      mergeReactions(reactions);
    }
  }
}