UniprotSeqEntry.java example

Explorer
act-master
/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package act.installer.sequence;

import act.server.MongoDB;
import act.shared.Seq;
import act.shared.helpers.MongoDBToJSON;
import com.mongodb.DBObject;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
 * A {@link SequenceEntry} built from the DOM of a single UniProt XML entry.
 *
 * <p>The constructor extracts everything that can be read from the document alone (EC number, accessions, gene
 * name and synonyms, product names, catalytic activity, sequence, organism name and PubMed references).
 * {@link #init(MongoDB)} then resolves the values that need the database (organism id and matching {@link Seq}
 * entries).
 */
public class UniprotSeqEntry extends SequenceEntry {
  private static final Logger LOGGER = LogManager.getFormatterLogger(UniprotSeqEntry.class);

  // XML tag names, attribute names and attribute values used by the UniProt format, plus the keys written into
  // the metadata / reference JSON objects.
  private static final String PROTEIN = "protein";
  private static final String RECOMMENDED_NAME = "recommendedName";
  private static final String EC_NUMBER = "ecNumber";
  private static final String ACCESSION = "accession";
  private static final String GENE = "gene";
  private static final String NAME = "name";
  private static final String TYPE = "type";
  private static final String TEXT = "text";
  private static final String COMMENT = "comment";
  private static final String XREF = "xref";
  private static final String CATALYTIC_ACTIVITY = "catalytic activity";
  private static final String CATALYTIC_ACTIVITY_SNAKE = "catalytic_activity";
  private static final String VAL = "val";
  private static final String SRC = "src";
  private static final String PRIMARY = "primary";
  private static final String SYNONYM = "synonym";
  private static final String FULL_NAME = "fullName";
  private static final String SEQUENCE = "sequence";
  private static final String ORGANISM = "organism";
  private static final String SCIENTIFIC = "scientific";
  private static final String REFERENCE = "reference";
  private static final String CITATION = "citation";
  private static final String DB_REFERENCE = "dbReference";
  private static final String PUBMED = "PubMed";
  private static final String ID = "id";
  private static final String PMID = "PMID";
  private static final String EMBL = "EMBL";
  private static final String PROPERTY = "property";
  private static final String PROTEIN_SEQUENCE_ID = "protein sequence ID";
  private static final String VALUE = "value";
  private static final String SUBMITTED_NAME = "submittedName";
  private static final String ALTERNATIVE_NAME = "alternativeName";
  private static final String SYNONYMS = "synonyms";
  private static final String PRODUCT_NAMES = "product_names";
  private static final String UNCHARACTERIZED = "uncharacterized";

  // Values extracted from the document in the constructor.
  private final Document seqFile;
  private final String ec;
  private final JSONObject accessions;
  private final String geneName;
  private final List<String> geneSynonyms;
  private final List<String> productNames;
  private final String catalyticActivity;
  private final DBObject metadata;
  private final String sequence;
  private final String org;
  private final List<JSONObject> references;
  private final Set<Long> catalyzedRxns;

  // Values that require a database and are therefore only populated by init(MongoDB).
  private List<Seq> matchingSeqs;
  private Long orgId;

  // Top-level tag lookups, populated (and length-checked) by checkNodeListLengths().
  private NodeList proteinNodeList;
  private NodeList sequenceNodeList;
  private NodeList organismNodeList;
  private NodeList geneNodeList;

  /**
   * Parses the given UniProt document.
   *
   * @param doc the DOM of one UniProt entry
   * @param minimalPrefixMapping organism name to replacement name, as generated by OrgMinimalPrefixGenerator;
   *                             a {@code null} mapping is treated as "no replacements"
   * @throws RuntimeException if the document has more than one protein, gene, organism or sequence tag, no
   *                          sequence tag, or more than one EC number
   */
  UniprotSeqEntry(Document doc, Map<String, String> minimalPrefixMapping) {
    this.seqFile = doc;
    // must run first: the extract* methods below read the node lists it populates
    checkNodeListLengths();
    this.ec = extractEc();
    this.accessions = extractAccessions();
    this.geneName = extractGeneName();
    this.geneSynonyms = extractGeneSynonyms();
    this.productNames = extractProductNames();
    this.catalyticActivity = extractCatalyticActivity();
    // must run after the name, synonym, product name, accession and catalytic activity fields are set
    this.metadata = extractMetadata();
    this.sequence = extractSequence();
    this.org = extractOrg(minimalPrefixMapping);
    this.references = extractReferences();
    this.catalyzedRxns = new HashSet<>();
  }

  /**
   * Resolves the database-dependent fields. Until this is called, {@link #getOrgId()} and
   * {@link #getMatchingSeqs()} return {@code null}.
   *
   * @param db the database used to look up the organism id and the matching sequences
   */
  void init(MongoDB db) {
    this.orgId = extractOrgId(db);
    this.matchingSeqs = extractMatchingSeqs(db);
  }

  public DBObject getMetadata() { return this.metadata; }
  public JSONObject getAccession() { return this.accessions; }
  public String getGeneName() { return this.geneName; }
  public List<String> getGeneSynonyms() { return this.geneSynonyms; }
  public List<String> getProductName() { return this.productNames; }
  public List<Seq> getMatchingSeqs() { return this.matchingSeqs; }
  public List<JSONObject> getRefs() { return this.references; }
  public Long getOrgId() { return this.orgId; }
  public String getOrg() { return this.org; }
  public String getSeq() { return this.sequence; }
  public String getEc() { return this.ec; }
  public String getCatalyticActivity() { return this.catalyticActivity; }
  public Set<Long> getCatalyzedRxns() { return this.catalyzedRxns; }

  /**
   * Looks up the protein, gene, sequence and organism tags and verifies that each appears at most once, and that
   * the sequence tag appears exactly once.
   *
   * @throws RuntimeException if any of those cardinality constraints is violated
   */
  private void checkNodeListLengths() {
    proteinNodeList = seqFile.getElementsByTagName(PROTEIN);
    geneNodeList = seqFile.getElementsByTagName(GENE);
    sequenceNodeList = seqFile.getElementsByTagName(SEQUENCE);
    organismNodeList = seqFile.getElementsByTagName(ORGANISM);

    if (proteinNodeList.getLength() > 1) {
      throw new RuntimeException("multiple protein tags parsed");
    }

    if (geneNodeList.getLength() > 1) {
      throw new RuntimeException("multiple gene tags parsed");
    }

    if (organismNodeList.getLength() > 1) {
      throw new RuntimeException("multiple organism tags parsed");
    }

    if (sequenceNodeList.getLength() > 1) {
      throw new RuntimeException("multiple sequence tags parsed");
    } else if (sequenceNodeList.getLength() == 0) {
      throw new RuntimeException("no sequence tags parsed");
    }
  }

  /**
   * Tells whether a node is an element whose tag name is one of the given names.
   *
   * @param node the node to test
   * @param names the accepted tag names
   * @return true if the node is an element carrying one of the names
   */
  private static boolean isElementNamed(Node node, String... names) {
    if (node.getNodeType() != Node.ELEMENT_NODE) {
      return false;
    }

    String nodeName = node.getNodeName();
    for (String name : names) {
      if (name.equals(nodeName)) {
        return true;
      }
    }

    return false;
  }

  /**
   * Collects the text of every direct {@code <name type="...">} child of the single node in the given list.
   *
   * @param parentNodeList a node list expected to hold zero or one parent node (e.g. the gene or organism tag)
   * @param type the required value of the child's type attribute
   * @return the matching names in document order; empty if the list does not hold exactly one node
   */
  private static List<String> childNamesOfType(NodeList parentNodeList, String type) {
    List<String> names = new ArrayList<>();

    if (parentNodeList.getLength() != 1) {
      return names;
    }

    NodeList childNodes = parentNodeList.item(0).getChildNodes();

    for (int i = 0; i < childNodes.getLength(); i++) {
      Node childNode = childNodes.item(i);

      // getAttribute returns "" for a missing attribute, so no separate hasAttribute check is needed
      if (isElementNamed(childNode, NAME) && type.equals(((Element) childNode).getAttribute(TYPE))) {
        names.add(childNode.getTextContent());
      }
    }

    return names;
  }

  /**
   * EC Numbers are stored as:
   *  <protein>
   *    <recommendedName>
   *      <fullName evidence="33">Alcohol dehydrogenase class-P</fullName>
   *      <shortName evidence="30">AtADH</shortName>
   *      <ecNumber evidence="18 22">1.1.1.1</ecNumber>
   *    </recommendedName>
   *  </protein>
   * Sometimes the <recommendedName> tag is replaced with a <submittedName> tag
   * @return the Ecnum as a string, or null if there is no protein tag or no EC number
   * @throws RuntimeException if a recommended/submitted name carries more than one EC number
   */
  private String extractEc() {
    if (proteinNodeList.getLength() != 1) {
      return null;
    }

    NodeList proteinChildNodes = proteinNodeList.item(0).getChildNodes();

    for (int i = 0; i < proteinChildNodes.getLength(); i++) {
      Node proteinChildNode = proteinChildNodes.item(i);

      if (!isElementNamed(proteinChildNode, RECOMMENDED_NAME, SUBMITTED_NAME)) {
        continue;
      }

      NodeList ecNumberNodeList = ((Element) proteinChildNode).getElementsByTagName(EC_NUMBER);

      if (ecNumberNodeList.getLength() > 1) {
        throw new RuntimeException("multiple ec numbers per protein");
      } else if (ecNumberNodeList.getLength() == 1) {
        return ecNumberNodeList.item(0).getTextContent();
      }
    }

    return null;
  }

  /**
   * Uniprot accessions are stored as:
   * <accession>Q9SX08</accession>
   *
   * Nucleotide Accession in this example is: M12196
   * Protein Accession in this example is: AAA32728
   *<dbReference type="EMBL" id="M12196">
   *  <property type="protein sequence ID" value="AAA32728.1"/>
   *  <property type="molecule type" value="Genomic_DNA"/>
   *</dbReference>
   * @return a mapping of uniprot, genbank_nucleotide, and genbank_protein accessions
   */
  private JSONObject extractAccessions() {
    List<String> uniprotAccessions = new ArrayList<>();
    List<String> genbankNucleotideAccessions = new ArrayList<>();
    List<String> genbankProteinAccessions = new ArrayList<>();

    NodeList accessionNodeList = seqFile.getElementsByTagName(ACCESSION);

    for (int i = 0; i < accessionNodeList.getLength(); i++) {
      uniprotAccessions.add(accessionNodeList.item(i).getTextContent());
    }

    NodeList dbReferenceNodeList = seqFile.getElementsByTagName(DB_REFERENCE);

    for (int i = 0; i < dbReferenceNodeList.getLength(); i++) {
      // getElementsByTagName only ever returns elements
      Element dbReferenceElement = (Element) dbReferenceNodeList.item(i);

      // EMBL and Genbank Accession IDs are the same
      if (!EMBL.equals(dbReferenceElement.getAttribute(TYPE)) || !dbReferenceElement.hasAttribute(ID)) {
        continue;
      }

      NodeList propertyNodeList = dbReferenceElement.getElementsByTagName(PROPERTY);

      /* there are some duplicate dbReferenceElements, so we want to make sure we only add those with
       'property' sub tags */
      if (propertyNodeList.getLength() > 0) {
        genbankNucleotideAccessions.add(dbReferenceElement.getAttribute(ID));
      }

      for (int j = 0; j < propertyNodeList.getLength(); j++) {
        Element propertyElement = (Element) propertyNodeList.item(j);

        if (PROTEIN_SEQUENCE_ID.equals(propertyElement.getAttribute(TYPE)) && propertyElement.hasAttribute(VALUE)) {
          // example: <property type="protein sequence ID" value="BAA19616.1"/>
          // keep only the part before the version suffix; indexOf/substring cannot fail on odd values the way
          // split("\\.")[0] does for a value that consists only of dots
          String versionedAccession = propertyElement.getAttribute(VALUE);
          int dotIndex = versionedAccession.indexOf('.');
          genbankProteinAccessions.add(dotIndex >= 0 ? versionedAccession.substring(0, dotIndex) : versionedAccession);
        }
      }
    }

    JSONObject accessions = new JSONObject();
    accessions.put(Seq.AccType.uniprot.toString(), uniprotAccessions);
    accessions.put(Seq.AccType.genbank_nucleotide.toString(), genbankNucleotideAccessions);
    accessions.put(Seq.AccType.genbank_protein.toString(), genbankProteinAccessions);

    return accessions;
  }

  /**
   * The gene name is stored with the type="primary"
   *<gene>
   *  <name type="primary" evidence="32">ADH1</name>
   *  <name type="synonym" evidence="31">ADH</name>
   *  <name type="ordered locus" evidence="36">At1g77120</name>
   *  <name type="ORF" evidence="35">F22K20.19</name>
   *</gene>
   *
   * @return the first primary gene name as a string, or null if there is none
   */
  private String extractGeneName() {
    List<String> primaryNames = childNamesOfType(geneNodeList, PRIMARY);
    return primaryNames.isEmpty() ? null : primaryNames.get(0);
  }

  /**
   * The gene name synonyms are stored with the type="synonym"
   *<gene>
   *  <name type="primary" evidence="32">ADH1</name>
   *  <name type="synonym" evidence="31">ADH</name>
   *  <name type="ordered locus" evidence="36">At1g77120</name>
   *  <name type="ORF" evidence="35">F22K20.19</name>
   *</gene>
   *
   * @return the gene name synonyms as a list (empty if there are none)
   */
  private List<String> extractGeneSynonyms() {
    return childNamesOfType(geneNodeList, SYNONYM);
  }

  /**
   * Product names are stored as:
   *<protein>
   *  <recommendedName>
   *    <fullName>Amine sulfotransferase</fullName>
   *    <ecNumber>2.8.2.3</ecNumber>
   *  </recommendedName>
   *  <alternativeName>
   *    <fullName>SULT-X2</fullName>
   *  </alternativeName>
   *  <alternativeName>
   *    <fullName>Sulfotransferase 3A1</fullName>
   *    <shortName>ST3A1</shortName>
   *  </alternativeName>
   *</protein>
   * Sometimes the <recommendedName> tag is replaced with a <submittedName> tag
   *
   * Collection stops at the first name containing "uncharacterized" (case-insensitive); names gathered before
   * it are kept.
   * @return the list of product names
   */
  private List<String> extractProductNames() {
    List<String> productNames = new ArrayList<>();

    if (proteinNodeList.getLength() != 1) {
      return productNames;
    }

    NodeList proteinChildNodes = proteinNodeList.item(0).getChildNodes();

    for (int i = 0; i < proteinChildNodes.getLength(); i++) {
      Node proteinChildNode = proteinChildNodes.item(i);

      if (!isElementNamed(proteinChildNode, RECOMMENDED_NAME, SUBMITTED_NAME, ALTERNATIVE_NAME)) {
        continue;
      }

      NodeList fullNameNodeList = ((Element) proteinChildNode).getElementsByTagName(FULL_NAME);

      if (fullNameNodeList.getLength() == 0) {
        continue;
      }

      // there should only be one full name
      String productName = fullNameNodeList.item(0).getTextContent();

      // handles cases: Uncharacterized protein, Putative uncharacterized protein, etc
      // Locale.ROOT keeps the match independent of the JVM default locale (e.g. Turkish dotless i)
      if (productName.toLowerCase(Locale.ROOT).contains(UNCHARACTERIZED)) {
        LOGGER.error("Skipping uncharacterized protein");
        break;
      }

      productNames.add(productName);
    }

    return productNames;
  }

  /**
   * Catalytic activity strings are stored as:
   *<comment type="catalytic activity">
   *  <text evidence="18 22">An alcohol + NAD(+) = an aldehyde or ketone + NADH.</text>
   *</comment>
   *
   * Only element children of the comment are considered, so whitespace text nodes produced by pretty-printed XML
   * do not hide the <text> element.
   * @return the catalytic activity string, or null if no comment has exactly one <text> element child
   */
  private String extractCatalyticActivity() {
    NodeList commentNodeList = seqFile.getElementsByTagName(COMMENT);

    for (int i = 0; i < commentNodeList.getLength(); i++) {
      // getElementsByTagName only ever returns elements
      Element commentElement = (Element) commentNodeList.item(i);

      if (!CATALYTIC_ACTIVITY.equals(commentElement.getAttribute(TYPE))) {
        continue;
      }

      NodeList commentChildNodes = commentElement.getChildNodes();
      Node firstElementChild = null;
      int elementChildCount = 0;

      for (int j = 0; j < commentChildNodes.getLength(); j++) {
        Node commentChildNode = commentChildNodes.item(j);

        if (commentChildNode.getNodeType() == Node.ELEMENT_NODE) {
          if (firstElementChild == null) {
            firstElementChild = commentChildNode;
          }
          elementChildCount++;
        }
      }

      // there should only be one text element child containing the string of interest
      if (elementChildCount == 1 && TEXT.equals(firstElementChild.getNodeName())) {
        return firstElementChild.getTextContent();
      } else if (elementChildCount > 1) {
        LOGGER.error("more than one catalytic activity string");
      }
    }

    return null;
  }

  /**
   * Bundles the gene name, gene synonyms, product names, an empty xref object, the accessions and the catalytic
   * activity into one JSON object and converts it with {@link MongoDBToJSON#conv}.
   *
   * @return the metadata object for this entry
   */
  private DBObject extractMetadata() {
    JSONObject obj = new JSONObject();

    obj.put(NAME, geneName);
    obj.put(SYNONYMS, geneSynonyms);
    obj.put(PRODUCT_NAMES, productNames);
    obj.put(XREF, new JSONObject());
    obj.put(ACCESSION, accessions);
    obj.put(CATALYTIC_ACTIVITY_SNAKE, catalyticActivity);

    return MongoDBToJSON.conv(obj);
  }

  /**
   * Sequence strings are stored as:
   *<sequence length="379" mass="41178" checksum="32550529538B9669" modified="2007-05-29" version="2">
   *MSTTGQIIRCKAAVAWEAGKPLVIEEVEVAPPQKHEVRIKILFTSLCHTDVYFWEAKGQT
   *PLFPRIFGHEAGGIVESVGEGVTDLQPGDHVLPIFTGECGECRHCHSEESNMCDLLRINT
   *ERGGMIHDGESRFSINGKPIYHFLGTSTFSEYTVVHSGQVAKINPDAPLDKVCIVSCGLS
   *TGLGATLNVAKPKKGQSVAIFGLGAVGLGAAEGARIAGASRIIGVDFNSKRFDQAKEFGV
   *TECVNPKDHDKPIQQVIAEMTDGGVDRSVECTGSVQAMIQAFECVHDGWGVAVLVGVPSK
   *DDAFKTHPMNFLNERTLKGTFFGNYKPKTDIPGVVEKYMNKELELEKFITHTVPFSEINK
   *AFDYMLKGESIRCIITMGA
   *</sequence>
   * @return the text content of the sequence tag, exactly as it appears in the document
   */
  private String extractSequence() {
    // checkNodeListLengths() guarantees exactly one sequence node
    return sequenceNodeList.item(0).getTextContent();
  }

  /**
   * The organism name is stored with the type="scientific"
   * <organism>
   *  <name type="scientific">Arabidopsis thaliana</name>
   *  <name type="common">Mouse-ear cress</name>
   *  <dbReference type="NCBI Taxonomy" id="3702"/>
   *</organism>
   * @param minimalPrefixMapping organism name to replacement name; may be null
   * @return the mapped value for the scientific name if the mapping has one, otherwise the scientific name itself;
   *         null if the document has no scientific organism name
   */
  private String extractOrg(Map<String, String> minimalPrefixMapping) {
    List<String> scientificNames = childNamesOfType(organismNodeList, SCIENTIFIC);

    if (scientificNames.isEmpty()) {
      return null;
    }

    String orgName = scientificNames.get(0);

    if (minimalPrefixMapping != null && minimalPrefixMapping.containsKey(orgName)) {
      return minimalPrefixMapping.get(orgName);
    }

    return orgName;
  }

  /**
   * Looks up the id of this entry's organism, registering the organism in the database if it is not there yet.
   *
   * @param db the database to query
   * @return the organism id
   */
  private Long extractOrgId(MongoDB db) {
    long id = db.getOrganismId(org);

    // if id == -1L, this means this organism does not exist in the database
    if (id != -1L) {
      return id;
    } else {
      return db.submitToActOrganismNameDB(org);
    }
  }

  /**
   * The Pubmed Ids are stored in the <dbReference> tags with type="PubMed"
   *<reference key="4">
   *  <citation type="journal article" date="1996" name="Mol. Biol. Evol." volume="13" first="433" last="436">
   *    <title>Intra- and interspecific variation of the alcohol dehydrogenase locus region in wild plants Arabis gemmifera and Arabidopsis thaliana.</title>
   *    <authorList>
   *      <person name="Miyashita N.T."/>
   *    </authorList>
   *    <dbReference type="PubMed" id="8587508"/>
   *    <dbReference type="DOI" id="10.1093/oxfordjournals.molbev.a025603"/>
   *  </citation>
   *  <scope>NUCLEOTIDE SEQUENCE [GENOMIC DNA]</scope>
   *  <source>
   *    <strain>cv. Aa-0</strain>
   *  </source>
   *</reference>
   *
   * A reference without a citation is skipped; a reference with several citations is logged and only its first
   * citation is read.
   * @return a list of JSONObjects containing the extracted PubMed Ids
   */
  private List<JSONObject> extractReferences() {
    NodeList referenceNodeList = seqFile.getElementsByTagName(REFERENCE);

    List<JSONObject> references = new ArrayList<>();

    for (int i = 0; i < referenceNodeList.getLength(); i++) {
      // getElementsByTagName only ever returns elements
      Element referenceElement = (Element) referenceNodeList.item(i);

      NodeList citationNodeList = referenceElement.getElementsByTagName(CITATION);

      if (citationNodeList.getLength() == 0) {
        // skip just this reference; the remaining references may still carry PubMed ids
        continue;
      } else if (citationNodeList.getLength() > 1) {
        LOGGER.error("more than one citation per reference");
      }

      Element citationElement = (Element) citationNodeList.item(0);

      NodeList dbReferenceNodeList = citationElement.getElementsByTagName(DB_REFERENCE);

      for (int j = 0; j < dbReferenceNodeList.getLength(); j++) {
        Element dbReferenceElement = (Element) dbReferenceNodeList.item(j);

        if (PUBMED.equals(dbReferenceElement.getAttribute(TYPE)) && dbReferenceElement.hasAttribute(ID)) {
          JSONObject obj = new JSONObject();

          obj.put(VAL, dbReferenceElement.getAttribute(ID));
          obj.put(SRC, PMID);

          references.add(obj);
        }
      }
    }

    return references;
  }

  /**
   * In the case that an ecnum is found in the uniprot file, this retrieves all matches on sequence, ecnum and org
   * from the installer database.
   * In the case that there is no ecnum, but there is a genbank protein accession number, this
   * retrieves all sequences that carry that genbank protein accession number.
   * In the case that there is no ecnum or genbank protein accession number, but there is a genbank nucleotide accession
   * number, this retrieves all sequences that carry that genbank nucleotide accession number and protein sequence.
   * If none of this is the case, then returns the empty list.
   * @param db the installer database to query
   * @return the list of Seq entries that should be updated with the data from the uniprot file
   */
  private List<Seq> extractMatchingSeqs(MongoDB db) {
    if (ec != null) {
      return db.getSeqFromSeqEcOrg(sequence, ec, org);
    }

    // extractAccessions() always stores both arrays, and getJSONArray throws rather than returning null
    JSONArray genbankProteinAccessions = accessions.getJSONArray(Seq.AccType.genbank_protein.toString());
    JSONArray genbankNucleotideAccessions = accessions.getJSONArray(Seq.AccType.genbank_nucleotide.toString());

    List<Seq> seqs = new ArrayList<>();

    if (genbankProteinAccessions.length() > 0) {

      for (int i = 0; i < genbankProteinAccessions.length(); i++) {
        seqs.addAll(db.getSeqFromGenbankProtAccession(genbankProteinAccessions.getString(i)));
      }

    } else if (genbankNucleotideAccessions.length() > 0) {

      for (int i = 0; i < genbankNucleotideAccessions.length(); i++) {

        List<Seq> seqFromNucAcc =
            db.getSeqFromGenbankNucAccessionSeq(genbankNucleotideAccessions.getString(i), sequence);

        if (seqFromNucAcc.size() > 1) {
          LOGGER.error("multiple seq entries match nucleotide accession + protein sequence");
        }

        seqs.addAll(seqFromNucAcc);
      }

    }

    return seqs;
  }

}