/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.biointerpretation.sequencemerging;

import act.installer.GenbankInstaller;
import act.installer.UniprotInstaller;
import act.server.NoSQLAPI;
import act.shared.Organism;
import act.shared.Reaction;
import act.shared.Seq;
import act.shared.helpers.MongoDBToJSON;
import com.act.biointerpretation.BiointerpretationProcessor;
import com.act.biointerpretation.Utils.OrgMinimalPrefixGenerator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;

/**
 * Merges Seq entries that share the same EC number, organism name (after the name has been replaced by its
 * minimal prefix) and protein sequence into a single entry in the write DB, and rewrites the sequence and
 * organism IDs referenced by each reaction's protein data to point at the migrated entries.
 */
public class SequenceMerger extends BiointerpretationProcessor {
  private static final Logger LOGGER = LogManager.getFormatterLogger(SequenceMerger.class);
  private static final String PROCESSOR_NAME = "Sequence Merger";
  private static final String SYNONYMS = "synonyms";
  private static final String SOURCE_SEQUENCE_IDS = "source_sequence_ids";
  private static final String SOURCE_REACTION_ID = "source_reaction_id";
  private static final String PROTEIN_EXISTENCE = "proteinExistence";
  private static final String COMMENT = "comment";
  private static final String TEXT = "text";
  private static final String TYPE = "type";
  private static final String BRENDA_ID = "brenda_id";
  private static final String XREF = "xref";
  private static final String NAME = "name";
  private static final String PRODUCT_NAMES = "product_names";
  private static final String ACCESSION = "accession";
  private static final String SRC = "src";
  private static final String PATENT = "Patent";
  private static final String VAL = "val";
  private static final String PATENT_YEAR = "patent_year";
  private static final String COUNTRY_CODE = "country_code";
  private static final String PATENT_NUMBER = "patent_number";
  private static final String PMID = "PMID";
  private static final String SEQUENCES = "sequences";
  private static final String ORGANISM = "organism";

  // old (read DB) sequence ID -> new (write DB) sequence ID
  private final Map<Long, Long> sequenceMigrationMap = new HashMap<>();
  // old (read DB) organism ID -> ID of the minimal-prefix organism in the write DB
  private final Map<Long, Long> organismMigrationMap = new HashMap<>();
  // organism name -> minimal prefix organism name; populated by init()
  private Map<String, String> minimalPrefixMapping;

  public SequenceMerger(NoSQLAPI noSQLAPI) {
    super(noSQLAPI);
  }

  @Override
  public String getName() {
    return PROCESSOR_NAME;
  }

  /**
   * Builds the organism-name to minimal-prefix mapping from the organisms in the read DB and marks this
   * processor as initialized.
   */
  @Override
  public void init() {
    Iterator<Organism> orgIterator = getNoSQLAPI().readOrgsFromInKnowledgeGraph();
    OrgMinimalPrefixGenerator prefixGenerator = new OrgMinimalPrefixGenerator(orgIterator);
    minimalPrefixMapping = prefixGenerator.getMinimalPrefixMapping();

    markInitialized();
  }

  /**
   * Copies all reactions over to the WriteDB and stores the mapping from old ID to new ID in reactionMigrationMap
   *
   * For each Reaction:
   * 1) Updates the IDs of the merged sequences to the new merged Sequence ID
   * 2) Since organism names were mapped to their minimal prefix, updates the IDs of the organisms to the
   *    ID of the minimal prefix
   * 3) Updates the source_reaction_id
   *
   * Sequence IDs that have no entry in the sequence migration map are skipped (with a warning) instead of
   * being written as null entries.  Protein entries without a sequences array or organism are left without
   * those fields.
   */
  @Override
  public void processReactions() {
    Iterator<Reaction> reactionIterator = getNoSQLAPI().readRxnsFromInKnowledgeGraph();

    while (reactionIterator.hasNext()) {
      Reaction oldRxn = reactionIterator.next();
      long oldRxnId = (long) oldRxn.getUUID();

      Set<JSONObject> proteins = oldRxn.getProteinData();
      for (JSONObject protein : proteins) {
        if (!protein.isNull(SEQUENCES)) {
          JSONArray sequenceIDs = protein.getJSONArray(SEQUENCES);
          Set<Long> newSequenceIDs = new HashSet<>();
          for (int i = 0; i < sequenceIDs.length(); i++) {
            long oldSeqId = sequenceIDs.getLong(i);
            Long newSeqId = sequenceMigrationMap.get(oldSeqId);
            if (newSeqId == null) {
              LOGGER.warn("Reaction %d references sequence %d, which was not migrated; dropping the reference",
                  oldRxnId, oldSeqId);
              continue;
            }
            newSequenceIDs.add(newSeqId);
          }
          protein.put(SEQUENCES, new JSONArray(newSequenceIDs));
        }

        if (!protein.isNull(ORGANISM)) {
          // a null mapping removes the organism key, exactly as JSONObject.put(String, Object) specifies
          protein.put(ORGANISM, organismMigrationMap.get(protein.getLong(ORGANISM)));
        }

        protein.put(SOURCE_REACTION_ID, oldRxnId);
      }

      oldRxn.setProteinData(proteins);

      Long newId = (long) getNoSQLAPI().writeToOutKnowlegeGraph(oldRxn);
      writeMigratedReactionMap(oldRxnId, newId);
    }
  }

  /**
   * Reads every Seq from the read DB, groups the ones that share EC number, minimal-prefix organism name and
   * protein sequence, merges each group into one Seq, writes it to the write DB and records the old ID to new
   * ID mapping for every source sequence.
   *
   * Sequences missing an organism name, protein sequence or EC number cannot be grouped; they are copied
   * over once, unmerged, and their ID mapping is recorded as well.
   */
  @Override
  public void processSequences() {
    Iterator<Seq> sequences = getNoSQLAPI().readSeqsFromInKnowledgeGraph();
    Map<UniqueSeq, List<Seq>> sequenceGroups = new HashMap<>();

    int numberOfSequencesMerged = 0;
    // # of sequences that aren't merged due to lack of Seq entry matches
    int numberOfSequencesUnmerged = 0;
    // # of sequences that aren't merged due to lack of information
    int numberOfSequencesUnmergedInfo = 0;

    // stores all sequences with the same ecnum, organism (accounts for prefix), and protein sequence in the same list
    while (sequences.hasNext()) {
      Seq sequence = sequences.next();

      /* changes the organism name to its minimal prefix; must occur before stored in the sequenceGroup map so that
      all seq entries with the same minimal prefix org name, ecnum, & protein sequence are merged */
      migrateOrganism(sequence);

      if (isNullOrEmpty(sequence.getOrgName())
          || isNullOrEmpty(sequence.getSequence())
          || isNullOrEmpty(sequence.getEc())) {
        // copy sequence directly, no merging will be possible
        Long copiedSeqId = writeSequence(sequence);
        sequenceMigrationMap.put((long) sequence.getUUID(), copiedSeqId);
        numberOfSequencesUnmergedInfo++;
        // must not fall through: the sequence would otherwise be grouped and written a second time
        continue;
      }

      UniqueSeq uniqueSeq = new UniqueSeq(sequence);
      List<Seq> group = sequenceGroups.get(uniqueSeq);
      if (group == null) {
        // first sequence with this ecnum, organism & protein sequence
        group = new ArrayList<>();
        sequenceGroups.put(uniqueSeq, group);
      }
      group.add(sequence);
    }

    for (Map.Entry<UniqueSeq, List<Seq>> sequenceGroup : sequenceGroups.entrySet()) {
      List<Seq> allMatchedSeqs = sequenceGroup.getValue();

      if (allMatchedSeqs.size() == 1) {
        numberOfSequencesUnmerged++;
      } else {
        numberOfSequencesMerged += allMatchedSeqs.size();
      }

      // stores the IDs of all sequences that are about to be merged
      Set<Long> matchedSeqsIDs = new HashSet<>();
      for (Seq sequence : allMatchedSeqs) {
        matchedSeqsIDs.add((long) sequence.getUUID());
      }

      // merges all sequences that share the same ecnum, organism and protein sequence
      Seq mergedSequence = mergeSequences(allMatchedSeqs);

      // for reference, adds all the seq IDs that were merged
      mergedSequence.getMetadata().put(SOURCE_SEQUENCE_IDS, matchedSeqsIDs);

      Long mergedSeqId = writeSequence(mergedSequence);

      // maps the old duplicate sequences to the new merged sequence entry
      for (Long matchedSeqId : matchedSeqsIDs) {
        sequenceMigrationMap.put(matchedSeqId, mergedSeqId);
      }
    }

    LOGGER.info("%d number of sequences merged", numberOfSequencesMerged);
    LOGGER.info("%d number of sequences unmerged due to lack of information", numberOfSequencesUnmergedInfo);
    LOGGER.info("%d number of sequences unmerged due to lack of Seq entry matches", numberOfSequencesUnmerged);
  }

  private static boolean isNullOrEmpty(String value) {
    return value == null || value.isEmpty();
  }

  /**
   * Submits the sequence to the write DB, substituting the migrated organism ID for the sequence's own.
   * @param sequence the Seq entry to write
   * @return the value returned by the write DB's submitToActSeqDB, widened to a Long
   */
  private Long writeSequence(Seq sequence) {
    return (long) getNoSQLAPI().getWriteDB().submitToActSeqDB(
        sequence.getSrcdb(),
        sequence.getEc(),
        sequence.getOrgName(),
        organismMigrationMap.get(sequence.getOrgId()),
        sequence.getSequence(),
        sequence.getReferences(),
        sequence.getReactionsCatalyzed(),
        MongoDBToJSON.conv(sequence.getMetadata())
    );
  }

  /**
   * Changes organism name to its minimal prefix and updates the organism ID appropriately
   * @param sequence the Seq entry we are updating; left untouched when it has no organism name
   */
  private void migrateOrganism(Seq sequence) {
    if (isNullOrEmpty(sequence.getOrgName())) {
      return;
    }

    String organismName = checkForOrgPrefix(sequence.getOrgName());
    sequence.setOrgName(organismName);

    Long newOrgId = getNoSQLAPI().getWriteDB().getOrganismId(organismName);
    if (newOrgId == -1) {
      // the organism is not in the write DB yet
      newOrgId = getNoSQLAPI().getWriteDB().submitToActOrganismNameDB(organismName);
    }

    organismMigrationMap.put(sequence.getOrgId(), newOrgId);
  }

  /**
   * Checks if there is an existing organism prefix in the prefix mapping;
   * @param orgName the organism name you are checking for a valid prefix
   * @return the mapped minimal prefix, or orgName itself when the mapping has no entry for it (so that the
   *         caller never replaces a known organism name with null)
   */
  private String checkForOrgPrefix(String orgName) {
    String prefix = minimalPrefixMapping.get(orgName);
    return prefix != null ? prefix : orgName;
  }

  /**
   * This class is used to group sequences that share the same ecnum, organism and protein sequence
   */
  private static class UniqueSeq {
    private final String ecnum;
    private final String organism;
    private final String protSeq;

    private UniqueSeq(Seq sequence) {
      this.ecnum = sequence.getEc();
      this.organism = sequence.getOrgName();
      this.protSeq = sequence.getSequence();
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (o == null || getClass() != o.getClass()) {
        return false;
      }

      UniqueSeq other = (UniqueSeq) o;
      return nullSafeEquals(ecnum, other.ecnum)
          && nullSafeEquals(organism, other.organism)
          && nullSafeEquals(protSeq, other.protSeq);
    }

    @Override
    public int hashCode() {
      int result = ecnum != null ? ecnum.hashCode() : 0;
      result = 31 * result + (organism != null ? organism.hashCode() : 0);
      result = 31 * result + (protSeq != null ? protSeq.hashCode() : 0);
      return result;
    }

    private static boolean nullSafeEquals(String a, String b) {
      return a != null ? a.equals(b) : b == null;
    }
  }

  /**
   * Merges a group of sequences that share ecnum, organism name and protein sequence into one Seq.
   *
   * A single-element group is returned as-is.  Otherwise the merged Seq starts from the first sequence (with
   * proteinExistence removed and its brenda_id comments converted into an xref object) and the references,
   * metadata and reaction refs of every sequence in the group are folded in.  The merged Seq owns its own
   * references list and reaction-ref set, so merging does not modify the collections of the first sequence.
   *
   * @param sequences the group to merge; must not be empty
   * @return the merged Seq, with ID -1
   * @throws RuntimeException if the group is empty or its members do not actually match
   */
  private Seq mergeSequences(List<Seq> sequences) {
    if (sequences.size() < 1) {
      throw new RuntimeException("0 matched sequences in this sequence group");
    } else if (sequences.size() == 1) {
      return sequences.get(0);
    }

    Seq firstSequence = sequences.get(0);
    JSONObject firstSeqMetadata = firstSequence.getMetadata();

    // this field is empty for every Seq entry, so we're removing it
    firstSeqMetadata.remove(PROTEIN_EXISTENCE);

    /* we want to convert the brenda_ids from being stored in a comment JSONArray to being stored in an xref map
    (JSONObject) */
    Set<Long> brendaIds = extractBrendaIds(firstSeqMetadata);
    firstSeqMetadata.remove(COMMENT);
    JSONObject xrefObject = new JSONObject();
    xrefObject.put(BRENDA_ID, brendaIds);
    firstSeqMetadata.put(XREF, xrefObject);

    // fresh, never-null collections so the merge helpers always have a target to add to
    List<JSONObject> mergedRefs = new ArrayList<>();
    List<JSONObject> firstRefs = firstSequence.getReferences();
    if (firstRefs != null) {
      mergedRefs.addAll(firstRefs);
    }

    Set<Long> mergedReactionRefs = new HashSet<>();
    Set<Long> firstReactionRefs = firstSequence.getReactionsCatalyzed();
    if (firstReactionRefs != null) {
      mergedReactionRefs.addAll(firstReactionRefs);
    }

    // initialized mergedSequence with firstSequence
    Seq mergedSequence = new Seq(
        -1, // assume ID will be set when the sequence is written to the DB
        firstSequence.getEc(),
        firstSequence.getOrgId(),
        firstSequence.getOrgName(),
        firstSequence.getSequence(),
        mergedRefs,
        MongoDBToJSON.conv(firstSequence.getMetadata()),
        firstSequence.getSrcdb()
    );
    mergedSequence.setReactionsCatalyzed(mergedReactionRefs);

    // merge the rest of the matched sequences
    for (Seq sequence : sequences) {
      if (!mergedSequence.getEc().equals(sequence.getEc())
          || !mergedSequence.getSequence().equals(sequence.getSequence())
          || !mergedSequence.getOrgName().equals(sequence.getOrgName())) {
        String msg = "matching sequence map constructed improperly; at least one of ec #, protein sequence, & "
            + "organism don't match";
        LOGGER.error(msg);
        throw new RuntimeException(msg);
      }

      mergeReferences(mergedSequence.getReferences(), sequence.getReferences());
      mergeMetadata(mergedSequence.getMetadata(), sequence.getMetadata());
      mergeReactionRefs(mergedSequence.getReactionsCatalyzed(), sequence.getReactionsCatalyzed());
    }

    return mergedSequence;
  }

  /**
   * Collects the brenda_id values stored in the metadata's comment array.
   * @param metadata the metadata to read; not modified
   * @return the ids of all comment entries that have a text and whose type is brenda_id; empty when the
   *         metadata has no comment array
   */
  private static Set<Long> extractBrendaIds(JSONObject metadata) {
    Set<Long> brendaIds = new HashSet<>();

    JSONArray comment = metadata.optJSONArray(COMMENT);
    if (comment == null) {
      return brendaIds;
    }

    for (int i = 0; i < comment.length(); i++) {
      JSONObject commentObject = comment.getJSONObject(i);
      if (commentObject.has(TEXT) && commentObject.has(TYPE)
          && commentObject.getString(TYPE).equals(BRENDA_ID)) {
        brendaIds.add(commentObject.getLong(TEXT));
      }
    }

    return brendaIds;
  }

  /**
   * Adds every new reaction ref to the merged set, in place.
   * @param mergedReactionRefs the set being accumulated; nothing can be merged when it is null
   * @param newReactionRefs the refs to add; may be null
   */
  private void mergeReactionRefs(Set<Long> mergedReactionRefs, Set<Long> newReactionRefs) {
    if (mergedReactionRefs == null || newReactionRefs == null) {
      return;
    }

    // Set operations automatically handle the case that a new ref already exists in the mergedReactionRefs
    mergedReactionRefs.addAll(newReactionRefs);
  }

  /**
   * Folds the name, synonyms, product names, accessions and brenda_id comments of newMetadata into
   * mergedMetadata, in place.
   * @param mergedMetadata the metadata being accumulated; nothing can be merged when it is null
   * @param newMetadata the metadata to merge in; may be null
   */
  private void mergeMetadata(JSONObject mergedMetadata, JSONObject newMetadata) {
    if (mergedMetadata == null || newMetadata == null) {
      return;
    }

    // decided before the name is copied over, so a name that is merely adopted is not also added as a synonym
    boolean geneNameDiffers = newMetadata.has(NAME) && mergedMetadata.has(NAME)
        && !newMetadata.getString(NAME).equals(mergedMetadata.getString(NAME));

    if (!mergedMetadata.has(NAME) && newMetadata.has(NAME)) {
      mergedMetadata.put(NAME, newMetadata.getString(NAME));
    }

    if (newMetadata.has(SYNONYMS)) {
      JSONArray newSynonyms = newMetadata.getJSONArray(SYNONYMS);

      if (mergedMetadata.has(SYNONYMS)) {
        for (int i = 0; i < newSynonyms.length(); i++) {
          mergedMetadata = GenbankInstaller.updateArrayField(SYNONYMS, newSynonyms.getString(i), mergedMetadata);
        }
      } else {
        mergedMetadata.put(SYNONYMS, newSynonyms);
      }
    }

    // ensures that the new gene name is added to the synonyms list in the case that it doesn't match the old gene
    // name, whether or not the new metadata carries a synonyms list of its own
    if (geneNameDiffers) {
      String newName = newMetadata.getString(NAME);
      if (mergedMetadata.has(SYNONYMS)) {
        mergedMetadata = GenbankInstaller.updateArrayField(SYNONYMS, newName, mergedMetadata);
      } else {
        // JSONObject.append creates the array when the key is absent
        mergedMetadata.append(SYNONYMS, newName);
      }
    }

    if (newMetadata.has(PRODUCT_NAMES)) {
      JSONArray newProductNames = newMetadata.getJSONArray(PRODUCT_NAMES);

      if (mergedMetadata.has(PRODUCT_NAMES)) {
        for (int i = 0; i < newProductNames.length(); i++) {
          mergedMetadata =
              GenbankInstaller.updateArrayField(PRODUCT_NAMES, newProductNames.getString(i), mergedMetadata);
        }
      } else {
        mergedMetadata.put(PRODUCT_NAMES, newProductNames);
      }
    }

    if (newMetadata.has(ACCESSION)) {
      JSONObject newAccession = newMetadata.getJSONObject(ACCESSION);

      if (mergedMetadata.has(ACCESSION)) {
        mergedMetadata = GenbankInstaller.updateAccessions(newAccession, mergedMetadata,
            Seq.AccType.genbank_nucleotide, GenbankInstaller.NUCLEOTIDE_ACCESSION_PATTERN);
        mergedMetadata = GenbankInstaller.updateAccessions(newAccession, mergedMetadata,
            Seq.AccType.genbank_protein, GenbankInstaller.PROTEIN_ACCESSION_PATTERN);
        mergedMetadata = GenbankInstaller.updateAccessions(newAccession, mergedMetadata,
            Seq.AccType.uniprot, UniprotInstaller.UNIPROT_ACCESSION_PATTERN);
      } else {
        mergedMetadata.put(ACCESSION, newAccession);
      }
    }

    // converts old comment JSONArrays to fit the new xref JSONObject model
    if (newMetadata.has(COMMENT)) {
      // reuse an existing xref object so that any other entries it holds are kept
      JSONObject xrefObject = mergedMetadata.optJSONObject(XREF);
      if (xrefObject == null) {
        xrefObject = new JSONObject();
        mergedMetadata.put(XREF, xrefObject);
      }

      Set<Long> allBrendaIds = new HashSet<>();
      JSONArray oldBrendaIds = xrefObject.optJSONArray(BRENDA_ID);
      if (oldBrendaIds != null) {
        for (int i = 0; i < oldBrendaIds.length(); i++) {
          // getLong rather than a (Long) cast: the stored number may be an Integer
          allBrendaIds.add(oldBrendaIds.getLong(i));
        }
      }

      // set operations handle duplicate case
      allBrendaIds.addAll(extractBrendaIds(newMetadata));

      xrefObject.put(BRENDA_ID, allBrendaIds);
    }
  }

  /**
   * Adds to mergedRefs, in place, every PMID or Patent reference of newRefs that it does not already contain.
   * PMID references are identified by their val; patents by country code, patent number and patent year.
   * References with any other src are not merged.
   * @param mergedRefs the list being accumulated; nothing can be merged when it is null
   * @param newRefs the references to merge in; may be null
   */
  private void mergeReferences(List<JSONObject> mergedRefs, List<JSONObject> newRefs) {
    if (mergedRefs == null || newRefs == null) {
      return;
    }

    // PMIDs already present; kept up to date as new PMID refs are added below
    Set<String> knownPmids = new HashSet<>();
    for (JSONObject mergedRef : mergedRefs) {
      if (mergedRef.getString(SRC).equals(PMID)) {
        knownPmids.add(mergedRef.getString(VAL));
      }
    }

    for (JSONObject newRef : newRefs) {
      String src = newRef.getString(SRC);

      if (src.equals(PMID)) {
        if (knownPmids.add(newRef.getString(VAL))) {
          mergedRefs.add(newRef);
        }
      } else if (src.equals(PATENT)) {
        if (!containsPatent(mergedRefs, newRef)) {
          mergedRefs.add(newRef);
        }
      }
    }
  }

  /**
   * @return true if refs holds a Patent reference with the same country code, patent number and patent year
   *         as the given patent reference
   */
  private static boolean containsPatent(List<JSONObject> refs, JSONObject patent) {
    String countryCode = patent.getString(COUNTRY_CODE);
    String patentNumber = patent.getString(PATENT_NUMBER);
    String patentYear = patent.getString(PATENT_YEAR);

    for (JSONObject ref : refs) {
      if (ref.getString(SRC).equals(PATENT)
          && ref.getString(COUNTRY_CODE).equals(countryCode)
          && ref.getString(PATENT_NUMBER).equals(patentNumber)
          && ref.getString(PATENT_YEAR).equals(patentYear)) {
        return true;
      }
    }

    return false;
  }
}