SequenceMerger.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.sequencemerging;

import act.installer.GenbankInstaller;
import act.installer.UniprotInstaller;
import act.server.NoSQLAPI;
import act.shared.Organism;
import act.shared.Reaction;
import act.shared.Seq;
import act.shared.helpers.MongoDBToJSON;
import com.act.biointerpretation.BiointerpretationProcessor;
import com.act.biointerpretation.Utils.OrgMinimalPrefixGenerator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;

public class SequenceMerger extends BiointerpretationProcessor {
  // Formatter logger: the log calls in this class use printf-style placeholders such as %d.
  private static final Logger LOGGER = LogManager.getFormatterLogger(SequenceMerger.class);
  // Value returned by getName().
  private static final String PROCESSOR_NAME = "Sequence Merger";
  // Seq metadata key: array of synonym strings.
  private static final String SYNONYMS = "synonyms";
  // Seq metadata key written by processSequences: IDs of the sequences that were merged into the entry.
  private static final String SOURCE_SEQUENCE_IDS = "source_sequence_ids";
  // Reaction protein-entry key written by processReactions: ID of the reaction the entry was read from.
  private static final String SOURCE_REACTION_ID = "source_reaction_id";
  // Seq metadata key that mergeSequences removes from the merged entry.
  private static final String PROTEIN_EXISTENCE = "proteinExistence";
  // Seq metadata key: array of comment objects, each read through its TEXT and TYPE keys.
  private static final String COMMENT = "comment";
  private static final String TEXT = "text";
  private static final String TYPE = "type";
  // Used both as the TYPE value that marks a BRENDA comment and as the key inside the XREF object.
  private static final String BRENDA_ID = "brenda_id";
  // Seq metadata key: object that receives the BRENDA ids collected from the comments.
  private static final String XREF = "xref";
  // Seq metadata key: gene name.
  private static final String NAME = "name";
  // Seq metadata key: array of product name strings.
  private static final String PRODUCT_NAMES = "product_names";
  // Seq metadata key: object holding accessions, merged through GenbankInstaller.updateAccessions.
  private static final String ACCESSION = "accession";
  // Reference-object key naming the reference source; compared against PATENT and PMID below.
  private static final String SRC = "src";
  private static final String PATENT = "Patent";
  // Reference-object key read for the PubMed id of a PMID reference.
  private static final String VAL = "val";
  // Reference-object keys that together identify a patent reference.
  private static final String PATENT_YEAR = "patent_year";
  private static final String COUNTRY_CODE = "country_code";
  private static final String PATENT_NUMBER = "patent_number";
  private static final String PMID = "PMID";
  // Reaction protein-entry keys: array of sequence IDs and the organism ID.
  private static final String SEQUENCES = "sequences";
  private static final String ORGANISM = "organism";

  // Old sequence ID -> ID of the sequence written to the write DB; filled by processSequences, read by
  // processReactions.
  private Map<Long, Long> sequenceMigrationMap = new HashMap<>();
  // Organism ID found on a read sequence -> organism ID for its minimal-prefix name; filled by migrateOrganism.
  private Map<Long, Long> organismMigrationMap = new HashMap<>();

  // Organism name -> minimal prefix name, as produced by OrgMinimalPrefixGenerator in init().
  private Map<String, String> minimalPrefixMapping;

  /**
   * Creates a sequence merger on top of the given database API.
   *
   * @param noSQLAPI handed unchanged to the BiointerpretationProcessor constructor
   */
  public SequenceMerger(NoSQLAPI noSQLAPI) {
    super(noSQLAPI);
  }

  /**
   * @return the fixed display name of this processor (PROCESSOR_NAME)
   */
  @Override
  public String getName() {
    return PROCESSOR_NAME;
  }

  /**
   * Builds the organism name to minimal-prefix lookup from the organisms of the input knowledge graph and then
   * marks this processor as initialized.
   */
  @Override
  public void init() {
    minimalPrefixMapping =
        new OrgMinimalPrefixGenerator(getNoSQLAPI().readOrgsFromInKnowledgeGraph()).getMinimalPrefixMapping();

    markInitialized();
  }

  /**
   * Writes every reaction of the input knowledge graph to the output knowledge graph and reports each
   * (old ID, new ID) pair through writeMigratedReactionMap.
   *
   * Before a reaction is written, each of its protein entries is rewritten in place:
   * 1) the sequence IDs are replaced by the IDs recorded in sequenceMigrationMap
   * 2) the organism ID is replaced by the ID recorded in organismMigrationMap (organism names were mapped to
   * their minimal prefix)
   * 3) source_reaction_id is set to the ID the reaction had when it was read
   */
  @Override
  public void processReactions() {
    Iterator<Reaction> rxns = getNoSQLAPI().readRxnsFromInKnowledgeGraph();

    while (rxns.hasNext()) {
      Reaction rxn = rxns.next();
      Set<JSONObject> proteinEntries = rxn.getProteinData();

      for (JSONObject proteinEntry : proteinEntries) {
        JSONArray oldSeqIds = proteinEntry.getJSONArray(SEQUENCES);

        // a set, because several old sequences may have been merged into the same new entry
        Set<Long> migratedSeqIds = new HashSet<>();
        int index = 0;
        while (index < oldSeqIds.length()) {
          Long migratedSeqId = sequenceMigrationMap.get(oldSeqIds.getLong(index));
          migratedSeqIds.add(migratedSeqId);
          index++;
        }
        proteinEntry.put(SEQUENCES, new JSONArray(migratedSeqIds));

        Long migratedOrgId = organismMigrationMap.get(proteinEntry.getLong(ORGANISM));
        proteinEntry.put(ORGANISM, migratedOrgId);

        proteinEntry.put(SOURCE_REACTION_ID, (long) rxn.getUUID());
      }

      rxn.setProteinData(proteinEntries);

      Long writtenRxnId = (long) getNoSQLAPI().writeToOutKnowlegeGraph(rxn);

      writeMigratedReactionMap((long) rxn.getUUID(), writtenRxnId);
    }
  }


  /**
   * Reads every sequence of the input knowledge graph, merges the entries that share the same EC number,
   * organism name (after minimal-prefix mapping) and protein sequence, and writes the result to the write DB.
   *
   * Sequences that lack an organism name, protein sequence or EC number cannot be grouped; they are copied to
   * the write DB as they are. In both cases the mapping from each old sequence ID to the ID of the written
   * entry is recorded in sequenceMigrationMap.
   */
  @Override
  public void processSequences() {
    Iterator<Seq> sequences = getNoSQLAPI().readSeqsFromInKnowledgeGraph();
    Map<UniqueSeq, List<Seq>> sequenceGroups = new HashMap<>();

    int numberOfSequencesMerged = 0;

    // # of sequences that aren't merged due to lack of Seq entry matches
    int numberOfSequencesUnmerged = 0;

    // # of sequences that aren't merged due to lack of information
    int numberOfSequencesUnmergedInfo = 0;

    // stores all sequences with the same ecnum, organism (accounts for prefix), and protein sequence in the same list
    while (sequences.hasNext()) {
      Seq sequence = sequences.next();

      /* changes the organism name to its minimal prefix; must occur before stored in the sequenceGroup map so that
      all seq entries with the same minimal prefix org name, ecnum, & protein sequence are merged */
      migrateOrganism(sequence);

      if (sequence.getOrgName() == null || sequence.getOrgName().isEmpty() ||
          sequence.getSequence() == null || sequence.getSequence().isEmpty() ||
          sequence.getEc() == null || sequence.getEc().isEmpty()) {
        // copy sequence directly, no merging will be possible
        Long copiedSeqId = writeSequence(sequence);

        // processReactions looks every sequence ID up in this map, so the copied entry needs a mapping too
        sequenceMigrationMap.put((long) sequence.getUUID(), copiedSeqId);

        numberOfSequencesUnmergedInfo++;

        // without this the sequence would also be grouped below and written a second time
        continue;
      }

      UniqueSeq uniqueSeq = new UniqueSeq(sequence);

      List<Seq> group = sequenceGroups.get(uniqueSeq);
      if (group == null) {
        // first sequence with this ecnum, organism & protein sequence: start a new modifiable list
        group = new ArrayList<>();
        sequenceGroups.put(uniqueSeq, group);
      }
      group.add(sequence);
    }

    for (Map.Entry<UniqueSeq, List<Seq>> sequenceGroup : sequenceGroups.entrySet()) {
      List<Seq> allMatchedSeqs = sequenceGroup.getValue();

      if (allMatchedSeqs.size() == 1) {
        numberOfSequencesUnmerged++;
      } else {
        numberOfSequencesMerged += allMatchedSeqs.size();
      }

      // stores the IDs of all sequences that are about to be merged
      Set<Long> matchedSeqsIDs = new HashSet<>();

      for (Seq sequence : allMatchedSeqs) {
        matchedSeqsIDs.add((long) sequence.getUUID());
      }

      // merges all sequences that share the same ecnum, organism and protein sequence
      Seq mergedSequence = mergeSequences(allMatchedSeqs);

      // for reference, adds all the seq IDs that were merged
      mergedSequence.getMetadata().put(SOURCE_SEQUENCE_IDS, matchedSeqsIDs);

      Long mergedSeqId = writeSequence(mergedSequence);

      // maps the old duplicate sequences to the new merged sequence entry
      for (Long matchedSeqId : matchedSeqsIDs) {
        sequenceMigrationMap.put(matchedSeqId, mergedSeqId);
      }
    }

    LOGGER.info("%d number of sequences merged", numberOfSequencesMerged);
    LOGGER.info("%d number of sequences unmerged due to lack of information", numberOfSequencesUnmergedInfo);
    LOGGER.info("%d number of sequences unmerged due to lack of Seq entry matches", numberOfSequencesUnmerged);
  }

  /**
   * Submits the given sequence to the sequence collection of the write DB. The organism ID that is written is
   * the one recorded in organismMigrationMap for the sequence's current organism ID.
   *
   * @param sequence the Seq entry to write
   * @return the ID returned by submitToActSeqDB for the written entry
   */
  private Long writeSequence(Seq sequence) {
    Long migratedOrgId = organismMigrationMap.get(sequence.getOrgId());
    JSONObject metadata = sequence.getMetadata();

    return (long) getNoSQLAPI().getWriteDB().submitToActSeqDB(
        sequence.getSrcdb(),
        sequence.getEc(),
        sequence.getOrgName(),
        migratedOrgId,
        sequence.getSequence(),
        sequence.getReferences(),
        sequence.getReactionsCatalyzed(),
        MongoDBToJSON.conv(metadata)
    );
  }

  /**
   * Replaces the organism name of a sequence with its minimal prefix and records, in organismMigrationMap, which
   * write-DB organism ID the sequence's organism ID now corresponds to. The prefix name is submitted to the
   * write DB when it is not known there yet. Sequences without an organism name are left untouched.
   *
   * @param sequence the Seq entry we are updating
   */
  private void migrateOrganism(Seq sequence) {
    String currentName = sequence.getOrgName();
    boolean hasName = currentName != null && !currentName.isEmpty();

    if (hasName) {
      String prefixName = checkForOrgPrefix(currentName);
      sequence.setOrgName(prefixName);

      Long prefixOrgId = getNoSQLAPI().getWriteDB().getOrganismId(prefixName);

      // -1 is treated as "no such organism in the write DB yet"
      if (prefixOrgId == -1) {
        prefixOrgId = getNoSQLAPI().getWriteDB().submitToActOrganismNameDB(prefixName);
      }

      organismMigrationMap.put(sequence.getOrgId(), prefixOrgId);
    }
  }

  /**
   * Looks up the minimal prefix recorded for an organism name in minimalPrefixMapping.
   *
   * When the mapping has no entry for the name, the name itself is returned. Returning null here would make
   * the caller overwrite the sequence's organism name with null and look up / submit a null organism.
   *
   * @param orgName the organism name you are checking for a valid prefix
   * @return the mapped minimal prefix, or orgName unchanged when no prefix is recorded for it
   */
  private String checkForOrgPrefix(String orgName) {
    String minimalPrefix = minimalPrefixMapping.get(orgName);

    if (minimalPrefix == null) {
      return orgName;
    }

    return minimalPrefix;
  }

  /**
   * Grouping key for sequences: two keys are equal exactly when their EC number, organism name and protein
   * sequence are all equal (null only matches null).
   */
  private static class UniqueSeq {
    String ecnum;
    String organism;
    String protSeq;

    private UniqueSeq (Seq sequence) {
      this.ecnum = sequence.getEc();
      this.organism = sequence.getOrgName();
      this.protSeq = sequence.getSequence();
    }

    @Override
    public boolean equals(Object o) {
      if (o == this) {
        return true;
      }
      if (o == null || o.getClass() != getClass()) {
        return false;
      }

      UniqueSeq other = (UniqueSeq) o;

      return sameValue(ecnum, other.ecnum)
          && sameValue(organism, other.organism)
          && sameValue(protSeq, other.protSeq);
    }

    @Override
    public int hashCode() {
      // same 31-based combination as before, so the hash values (and map iteration order) do not change
      int hash = hashOf(ecnum);
      hash = 31 * hash + hashOf(organism);
      hash = 31 * hash + hashOf(protSeq);
      return hash;
    }

    /** Null-tolerant string equality: null equals only null. */
    private static boolean sameValue(String left, String right) {
      if (left == null) {
        return right == null;
      }
      return left.equals(right);
    }

    /** Hash of a possibly-null string; null hashes to 0. */
    private static int hashOf(String value) {
      return value == null ? 0 : value.hashCode();
    }
  }

  /**
   * Merges a group of sequences that share EC number, organism name and protein sequence into one Seq.
   *
   * A group of one is returned as it is, without any metadata rewriting. For larger groups the merged entry
   * starts from the first sequence: its proteinExistence field is dropped, its BRENDA ids are moved from the
   * comment array into an xref object, and then references, metadata and reaction refs of every sequence in
   * the group are merged in. The metadata object of the first sequence is modified in place while doing so.
   *
   * @param sequences the group to merge
   * @return the single sequence of a one-element group, otherwise a new Seq with ID -1
   * @throws RuntimeException if the group is null or empty, or if a member differs from the first sequence in
   *                          EC number, protein sequence or organism name
   */
  private Seq mergeSequences(List<Seq> sequences) {
    if (sequences == null || sequences.isEmpty()) {
      throw new RuntimeException("0 matched sequences in this sequence group");
    } else if (sequences.size() == 1) {
      return sequences.get(0);
    }

    Seq firstSequence = sequences.get(0);
    JSONObject firstSeqMetadata = firstSequence.getMetadata();

    // this field is empty for every Seq entry, so we're removing it
    firstSeqMetadata.remove(PROTEIN_EXISTENCE);

    /* we want to convert the brenda_ids from being stored in a comment JSONArray to being stored in
    an xref map (JSONObject) */
    Set<Long> brendaIds = new HashSet<>();

    // getJSONArray throws on a missing key, so only read the comments when the field is present
    // (mergeMetadata guards the same read the same way)
    if (firstSeqMetadata.has(COMMENT)) {
      JSONArray comment = firstSeqMetadata.getJSONArray(COMMENT);

      for (int i = 0; i < comment.length(); i++) {
        JSONObject commentObject = comment.getJSONObject(i);

        if (commentObject.has(TEXT) && commentObject.has(TYPE) &&
            commentObject.getString(TYPE).equals(BRENDA_ID)) {
          brendaIds.add(commentObject.getLong(TEXT));
        }
      }

      firstSeqMetadata.remove(COMMENT);
    }

    JSONObject xrefObject = new JSONObject();
    xrefObject.put(BRENDA_ID, brendaIds);

    firstSeqMetadata.put(XREF, xrefObject);

    // initialized mergedSequence with firstSequence
    Seq mergedSequence = new Seq(
        -1, // assume ID will be set when the sequence is written to the DB
        firstSequence.getEc(),
        firstSequence.getOrgId(),
        firstSequence.getOrgName(),
        firstSequence.getSequence(),
        firstSequence.getReferences(),
        MongoDBToJSON.conv(firstSequence.getMetadata()),
        firstSequence.getSrcdb()
    );

    mergedSequence.setReactionsCatalyzed(firstSequence.getReactionsCatalyzed());

    // merge all matched sequences into mergedSequence
    // NOTE(review): the loop also visits the first sequence, which mergedSequence was built from; that pass is
    // expected to add nothing new -- confirm before changing it to skip index 0
    for (Seq sequence : sequences) {
      if (!mergedSequence.getEc().equals(sequence.getEc()) ||
          !mergedSequence.getSequence().equals(sequence.getSequence()) ||
          !mergedSequence.getOrgName().equals(sequence.getOrgName())) {

        String msg = "matching sequence map constructed improperly; at least one of ec #, protein sequence, & " +
            "organism don't match";

        LOGGER.error(msg);
        throw new RuntimeException(msg);
      }

      mergeReferences(mergedSequence.getReferences(), sequence.getReferences());

      mergeMetadata(mergedSequence.getMetadata(), sequence.getMetadata());

      mergeReactionRefs(mergedSequence.getReactionsCatalyzed(), sequence.getReactionsCatalyzed());
    }

    return mergedSequence;
  }

  /**
   * Adds every reaction ref of newReactionRefs to mergedReactionRefs, in place.
   *
   * The previous version assigned newReactionRefs to the mergedReactionRefs parameter when the merged set was
   * empty; that only changed the local variable, so the caller's empty set never received the new refs.
   *
   * @param mergedReactionRefs the set to add to; when null there is nothing to add to and the call does nothing
   * @param newReactionRefs    the refs to add; null is treated as "no refs"
   */
  private void mergeReactionRefs(Set<Long> mergedReactionRefs, Set<Long> newReactionRefs) {
    if (mergedReactionRefs == null || newReactionRefs == null) {
      return;
    }

    // Set semantics already ignore refs that are present in mergedReactionRefs
    mergedReactionRefs.addAll(newReactionRefs);
  }

  /**
   * Merges the metadata of one sequence into the metadata of the merged sequence, in place.
   *
   * Fields handled: name (kept from the merged entry when both have one; a differing new name is recorded as a
   * synonym), synonyms, product_names, accession, and the BRENDA ids found in the comment array, which are
   * collected into xref.brenda_id. newMetadata is only read.
   *
   * @param mergedMetadata metadata of the merged sequence; when null there is nothing to merge into and the
   *                       call does nothing
   * @param newMetadata    metadata of the sequence being merged in; null is treated as "nothing to merge"
   */
  private void mergeMetadata(JSONObject mergedMetadata, JSONObject newMetadata) {
    // Reassigning the parameter (as the old "empty metadata" branch did) cannot reach the caller's object, and
    // an empty mergedMetadata is filled correctly by the field-by-field merge below.
    if (mergedMetadata == null || newMetadata == null) {
      return;
    }

    // NOTE(review): the GenbankInstaller helpers below are assumed to update the object they are given and
    // return it; if they returned a different object the caller would not see later changes -- confirm.

    // ensures that the new gene name is added to the synonyms list in the case that it doesn't match the old gene name
    boolean geneNameMatches = true;

    if (newMetadata.has(NAME) && mergedMetadata.has(NAME)) {
      geneNameMatches = newMetadata.getString(NAME).equals(mergedMetadata.getString(NAME));
    }

    if (!mergedMetadata.has(NAME) && newMetadata.has(NAME)) {
      mergedMetadata.put(NAME, newMetadata.getString(NAME));
    }

    if (newMetadata.has(SYNONYMS)) {
      JSONArray newSynonyms = newMetadata.getJSONArray(SYNONYMS);

      if (mergedMetadata.has(SYNONYMS)) {
        for (int i = 0; i < newSynonyms.length(); i++) {
          mergedMetadata = GenbankInstaller.updateArrayField(SYNONYMS, newSynonyms.getString(i), mergedMetadata);
        }
      } else {
        // copy the array so that later additions to the merged synonyms never write through to newMetadata
        JSONArray copiedSynonyms = new JSONArray();
        for (int i = 0; i < newSynonyms.length(); i++) {
          copiedSynonyms.put(newSynonyms.get(i));
        }
        mergedMetadata.put(SYNONYMS, copiedSynonyms);
      }
    }

    // A differing gene name is kept as a synonym of the merged entry. This happens whether or not newMetadata
    // has a synonyms field of its own, and without modifying newMetadata.
    if (!geneNameMatches) {
      String differingName = newMetadata.getString(NAME);

      if (mergedMetadata.has(SYNONYMS)) {
        mergedMetadata = GenbankInstaller.updateArrayField(SYNONYMS, differingName, mergedMetadata);
      } else {
        // append creates the array when the key is absent
        mergedMetadata.append(SYNONYMS, differingName);
      }
    }

    if (newMetadata.has(PRODUCT_NAMES)) {
      JSONArray newProductNames = newMetadata.getJSONArray(PRODUCT_NAMES);

      if (mergedMetadata.has(PRODUCT_NAMES)) {
        for (int i = 0; i < newProductNames.length(); i++) {
          mergedMetadata = GenbankInstaller.updateArrayField(PRODUCT_NAMES, newProductNames.getString(i),
              mergedMetadata);
        }
      } else {
        mergedMetadata.put(PRODUCT_NAMES, newProductNames);
      }
    }

    if (newMetadata.has(ACCESSION)) {
      JSONObject newAccession = newMetadata.getJSONObject(ACCESSION);

      if (mergedMetadata.has(ACCESSION)) {
        mergedMetadata = GenbankInstaller.updateAccessions(newAccession, mergedMetadata, Seq.AccType.genbank_nucleotide,
            GenbankInstaller.NUCLEOTIDE_ACCESSION_PATTERN);
        mergedMetadata = GenbankInstaller.updateAccessions(newAccession, mergedMetadata, Seq.AccType.genbank_protein,
            GenbankInstaller.PROTEIN_ACCESSION_PATTERN);
        mergedMetadata = GenbankInstaller.updateAccessions(newAccession, mergedMetadata, Seq.AccType.uniprot,
            UniprotInstaller.UNIPROT_ACCESSION_PATTERN);
      } else {
        mergedMetadata.put(ACCESSION, newAccession);
      }
    }

    // converts old comment JSONArrays to fit the new xref JSONObject model
    if (newMetadata.has(COMMENT)) {
      JSONArray comment = newMetadata.getJSONArray(COMMENT);

      Set<Long> newBrendaIds = new HashSet<>();
      for (int i = 0; i < comment.length(); i++) {
        JSONObject commentObject = comment.getJSONObject(i);

        if (commentObject.has(TEXT) && commentObject.has(TYPE) &&
            commentObject.getString(TYPE).equals(BRENDA_ID)) {
          newBrendaIds.add(commentObject.getLong(TEXT));
        }
      }

      if (mergedMetadata.has(XREF) && mergedMetadata.getJSONObject(XREF).has(BRENDA_ID)) {
        JSONArray brendaIds = mergedMetadata.getJSONObject(XREF).getJSONArray(BRENDA_ID);

        Set<Long> allBrendaIds = new HashSet<>();

        for (int i = 0; i < brendaIds.length(); i++) {
          // getLong accepts any stored number type; a (Long) cast fails when the array holds Integer values
          allBrendaIds.add(brendaIds.getLong(i));
        }

        // set operations handle duplicate case
        allBrendaIds.addAll(newBrendaIds);

        mergedMetadata.getJSONObject(XREF).put(BRENDA_ID, allBrendaIds);
      } else {
        JSONObject xrefObject = new JSONObject();
        xrefObject.put(BRENDA_ID, newBrendaIds);
        mergedMetadata.put(XREF, xrefObject);
      }
    }
  }

  /**
   * Merges the references of one sequence into the reference list of the merged sequence, in place.
   *
   * When the merged list is empty it takes over all new references. Otherwise a PMID reference is added when
   * its PubMed id is not in the list yet, and a patent reference is added when no patent with the same country
   * code, patent number and patent year is in the list yet; references with any other source are not added.
   *
   * The previous version assigned newRefs to the mergedRefs parameter when the merged list was empty; that
   * only changed the local variable, so the caller's empty list never received the new references.
   *
   * @param mergedRefs the list to add to; when null there is nothing to add to and the call does nothing
   * @param newRefs    the references to merge in; null is treated as "no references"
   */
  private void mergeReferences(List<JSONObject> mergedRefs, List<JSONObject> newRefs) {
    if (mergedRefs == null || newRefs == null) {
      return;
    }

    if (mergedRefs.isEmpty()) {
      mergedRefs.addAll(newRefs);
      return;
    }

    // PubMed ids already present in mergedRefs; built on the first PMID reference and kept up to date afterwards
    // instead of being recomputed for every new reference
    Set<String> knownPmids = null;

    for (JSONObject newRef : newRefs) {
      String source = newRef.getString(SRC);

      if (source.equals(PMID)) {
        String newPmid = newRef.getString(VAL);

        if (knownPmids == null) {
          knownPmids = new HashSet<>();

          for (JSONObject mergedRef : mergedRefs) {
            if (mergedRef.getString(SRC).equals(PMID)) {
              knownPmids.add(mergedRef.getString(VAL));
            }
          }
        }

        // Set.add returns false when the id was already known
        if (knownPmids.add(newPmid)) {
          mergedRefs.add(newRef);
        }
      } else if (source.equals(PATENT)) {
        boolean patentExists = false;
        String newCountryCode = newRef.getString(COUNTRY_CODE);
        String newPatentNumber = newRef.getString(PATENT_NUMBER);
        String newPatentYear = newRef.getString(PATENT_YEAR);

        for (JSONObject mergedRef : mergedRefs) {
          if (mergedRef.getString(SRC).equals(PATENT) &&
              mergedRef.getString(COUNTRY_CODE).equals(newCountryCode) &&
              mergedRef.getString(PATENT_NUMBER).equals(newPatentNumber) &&
              mergedRef.getString(PATENT_YEAR).equals(newPatentYear)) {
            patentExists = true;
            break;
          }
        }

        if (!patentExists) {
          mergedRefs.add(newRef);
        }
      }
    }
  }

}