/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package act.installer.sequence;
import act.server.MongoDB;
import act.shared.Seq;
import act.shared.helpers.MongoDBToJSON;
import com.mongodb.DBObject;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
public class UniprotSeqEntry extends SequenceEntry {
private static final Logger LOGGER = LogManager.getFormatterLogger(UniprotSeqEntry.class);
// The string constants below are the XML tag names, attribute names and attribute values this class looks up in
// the Uniprot document, together with the keys it uses when building its JSON output.
private static final String PROTEIN = "protein";
private static final String RECOMMENDED_NAME = "recommendedName";
private static final String EC_NUMBER = "ecNumber";
private static final String ACCESSION = "accession";
private static final String GENE = "gene";
private static final String NAME = "name";
private static final String TYPE = "type";
private static final String TEXT = "text";
private static final String COMMENT = "comment";
private static final String XREF = "xref";
// value of the `type` attribute that marks a <comment> as a catalytic activity comment
private static final String CATALYTIC_ACTIVITY = "catalytic activity";
// metadata key under which the catalytic activity string is stored (note: the constant name is misspelled)
private static final String CATALYTIC_ACITIVITY_SNAKE = "catalytic_activity";
// VAL and SRC are the keys of each reference object produced by extractReferences()
private static final String VAL = "val";
private static final String SRC = "src";
private static final String PRIMARY = "primary";
private static final String SYNONYM = "synonym";
private static final String FULL_NAME = "fullName";
private static final String SEQUENCE = "sequence";
private static final String ORGANISM = "organism";
private static final String SCIENTIFIC = "scientific";
private static final String REFERENCE = "reference";
private static final String CITATION = "citation";
private static final String DB_REFERENCE = "dbReference";
// value of the `type` attribute on a <dbReference> that carries a PubMed id
private static final String PUBMED = "PubMed";
private static final String ID = "id";
// value stored under SRC for every reference object
private static final String PMID = "PMID";
private static final String EMBL = "EMBL";
private static final String PROPERTY = "property";
private static final String PROTEIN_SEQUENCE_ID = "protein sequence ID";
private static final String VALUE = "value";
private static final String SUBMITTED_NAME = "submittedName";
private static final String ALTERNATIVE_NAME = "alternativeName";
private static final String SYNONYMS = "synonyms";
private static final String PRODUCT_NAMES = "product_names";
// lower-case substring used to detect uncharacterized product names
private static final String UNCHARACTERIZED = "uncharacterized";
// the parsed Uniprot XML document this entry is extracted from
private Document seqFile;
// values extracted from seqFile by the constructor
private String ec;
private JSONObject accessions;
private String geneName;
private List<String> geneSynonyms;
private List<String> productNames;
// assigned by init(MongoDB), not by the constructor
private List<Seq> matchingSeqs;
private String catalyticActivity;
private DBObject metadata;
private String sequence;
private String org;
// assigned by init(MongoDB), not by the constructor
private Long orgId;
private List<JSONObject> references;
// initialised to an empty set by the constructor
private Set<Long> catalyzedRxns;
// node lists cached by checkNodeListLengths() so the extract* methods do not have to repeat the lookups
private NodeList proteinNodeList;
private NodeList sequenceNodeList;
private NodeList organismNodeList;
private NodeList geneNodeList;
/**
 * Builds an entry from a parsed Uniprot XML document, extracting every field that does not need database access.
 * The database-dependent fields (organism id, matching sequences) are filled in later by {@code init(MongoDB)}.
 *
 * @param doc the parsed Uniprot XML document
 * @param minimalPrefixMapping organism name mapping consulted by {@code extractOrg}; it is generated by
 *                             OrgMinimalPrefixGenerator
 * @throws RuntimeException if the document fails the tag-count checks in {@code checkNodeListLengths}
 */
UniprotSeqEntry(Document doc, Map<String, String> minimalPrefixMapping) {
  seqFile = doc;
  // caches the protein/gene/sequence/organism node lists used by the extractors below
  checkNodeListLengths();

  ec = extractEc();
  accessions = extractAccessions();
  geneName = extractGeneName();
  geneSynonyms = extractGeneSynonyms();
  productNames = extractProductNames();
  catalyticActivity = extractCatalyticActivity();
  // must run after the assignments above: extractMetadata reads those fields
  metadata = extractMetadata();

  sequence = extractSequence();
  org = extractOrg(minimalPrefixMapping);
  references = extractReferences();
  catalyzedRxns = new HashSet<>();
}
/**
 * Fills in the fields that require database access: the organism id and the list of matching sequence entries.
 *
 * @param db the database handle passed on to {@code extractOrgId} and {@code extractMatchingSeqs}
 */
void init(MongoDB db) {
  orgId = extractOrgId(db);
  matchingSeqs = extractMatchingSeqs(db);
}
/** @return the metadata object built by the constructor */
public DBObject getMetadata() {
  return metadata;
}

/** @return the JSON object holding the uniprot, genbank nucleotide and genbank protein accession lists */
public JSONObject getAccession() {
  return accessions;
}

/** @return the primary gene name, or null if none was found */
public String getGeneName() {
  return geneName;
}

/** @return the gene name synonyms (possibly empty) */
public List<String> getGeneSynonyms() {
  return geneSynonyms;
}

/** @return the extracted product names (possibly empty) */
public List<String> getProductName() {
  return productNames;
}

/** @return the matching sequence entries; null until {@code init(MongoDB)} has been called */
public List<Seq> getMatchingSeqs() {
  return matchingSeqs;
}

/** @return the extracted PubMed references */
public List<JSONObject> getRefs() {
  return references;
}

/** @return the organism id; null until {@code init(MongoDB)} has been called */
public Long getOrgId() {
  return orgId;
}

/** @return the organism name, or null if none was found */
public String getOrg() {
  return org;
}

/** @return the text content of the sequence tag */
public String getSeq() {
  return sequence;
}

/** @return the EC number, or null if none was found */
public String getEc() {
  return ec;
}

/** @return the catalytic activity string, or null if none was found */
public String getCatalyticActivity() {
  return catalyticActivity;
}

/** @return the set of catalyzed reaction ids (created empty by the constructor) */
public Set<Long> getCatalyzedRxns() {
  return catalyzedRxns;
}
/**
 * Looks up the protein, gene, sequence and organism tags in the document, caches the resulting node lists in the
 * corresponding fields, and validates how many of each were found.
 *
 * @throws RuntimeException if more than one protein, gene, organism or sequence tag is present, or if there is no
 *                          sequence tag at all
 */
private void checkNodeListLengths() {
  proteinNodeList = seqFile.getElementsByTagName(PROTEIN);
  geneNodeList = seqFile.getElementsByTagName(GENE);
  sequenceNodeList = seqFile.getElementsByTagName(SEQUENCE);
  organismNodeList = seqFile.getElementsByTagName(ORGANISM);

  int proteinCount = proteinNodeList.getLength();
  if (proteinCount > 1) {
    throw new RuntimeException("multiple protein tags parsed");
  }

  int geneCount = geneNodeList.getLength();
  if (geneCount > 1) {
    throw new RuntimeException("multiple gene tags parsed");
  }

  int organismCount = organismNodeList.getLength();
  if (organismCount > 1) {
    throw new RuntimeException("multiple organism tags parsed");
  }

  // the sequence is the only tag that is mandatory: exactly one must be present
  int sequenceCount = sequenceNodeList.getLength();
  if (sequenceCount > 1) {
    throw new RuntimeException("multiple sequence tags parsed");
  }
  if (sequenceCount == 0) {
    throw new RuntimeException("no sequence tags parsed");
  }
}
/**
 * Extracts the EC number of the protein. EC numbers are stored as:
 * <pre>{@code
 * <protein>
 *   <recommendedName>
 *     <fullName evidence="33">Alcohol dehydrogenase class-P</fullName>
 *     <shortName evidence="30">AtADH</shortName>
 *     <ecNumber evidence="18 22">1.1.1.1</ecNumber>
 *   </recommendedName>
 * </protein>
 * }</pre>
 * Sometimes the recommendedName tag is replaced with a submittedName tag; both are inspected.
 *
 * @return the EC number as a string, or null if there is no protein tag or no EC number under it
 * @throws RuntimeException if a single name element carries more than one EC number
 */
private String extractEc() {
  if (proteinNodeList.getLength() != 1) {
    return null;
  }

  // only the direct children of the single protein node are considered
  NodeList nameCandidates = proteinNodeList.item(0).getChildNodes();
  for (int idx = 0; idx < nameCandidates.getLength(); idx++) {
    Node candidate = nameCandidates.item(idx);
    String tag = candidate.getNodeName();
    boolean isNameTag = RECOMMENDED_NAME.equals(tag) || SUBMITTED_NAME.equals(tag);
    if (!isNameTag || candidate.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }

    NodeList ecNodes = ((Element) candidate).getElementsByTagName(EC_NUMBER);
    int ecCount = ecNodes.getLength();
    if (ecCount > 1) {
      throw new RuntimeException("multiple ec numbers per protein");
    }
    if (ecCount == 1) {
      return ecNodes.item(0).getTextContent();
    }
  }
  return null;
}
/**
 * Collects the accession ids found in the document.
 *
 * Uniprot accessions are stored as:
 * <pre>{@code
 * <accession>Q9SX08</accession>
 * }</pre>
 *
 * Genbank accessions come from EMBL dbReference tags. In the example below the nucleotide accession is M12196 and
 * the protein accession is AAA32728 (the part after the first '.' is dropped):
 * <pre>{@code
 * <dbReference type="EMBL" id="M12196">
 *   <property type="protein sequence ID" value="AAA32728.1"/>
 *   <property type="molecule type" value="Genomic_DNA"/>
 * </dbReference>
 * }</pre>
 *
 * @return a mapping of uniprot, genbank_nucleotide, and genbank_protein accessions
 */
private JSONObject extractAccessions() {
  List<String> uniprotIds = new ArrayList<>();
  NodeList accessionNodes = seqFile.getElementsByTagName(ACCESSION);
  for (int i = 0; i < accessionNodes.getLength(); i++) {
    uniprotIds.add(accessionNodes.item(i).getTextContent());
  }

  List<String> nucleotideIds = new ArrayList<>();
  List<String> proteinIds = new ArrayList<>();
  NodeList dbRefNodes = seqFile.getElementsByTagName(DB_REFERENCE);
  for (int i = 0; i < dbRefNodes.getLength(); i++) {
    Node dbRefNode = dbRefNodes.item(i);
    if (dbRefNode.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }
    Element dbRef = (Element) dbRefNode;

    // EMBL and Genbank Accession IDs are the same
    boolean isEmblWithId =
        dbRef.hasAttribute(TYPE) && EMBL.equals(dbRef.getAttribute(TYPE)) && dbRef.hasAttribute(ID);
    if (!isEmblWithId) {
      continue;
    }

    NodeList properties = dbRef.getElementsByTagName(PROPERTY);
    // there are some duplicate dbReference elements, so only those with 'property' sub tags are recorded
    if (properties.getLength() > 0) {
      nucleotideIds.add(dbRef.getAttribute(ID));
    }

    for (int j = 0; j < properties.getLength(); j++) {
      Node propertyNode = properties.item(j);
      if (propertyNode.getNodeType() != Node.ELEMENT_NODE) {
        continue;
      }
      Element property = (Element) propertyNode;
      boolean isProteinSequenceId = property.hasAttribute(TYPE)
          && PROTEIN_SEQUENCE_ID.equals(property.getAttribute(TYPE))
          && property.hasAttribute(VALUE);
      if (isProteinSequenceId) {
        // example: <property type="protein sequence ID" value="BAA19616.1"/>
        proteinIds.add(property.getAttribute(VALUE).split("\\.")[0]);
      }
    }
  }

  JSONObject result = new JSONObject();
  result.put(Seq.AccType.uniprot.toString(), uniprotIds);
  result.put(Seq.AccType.genbank_nucleotide.toString(), nucleotideIds);
  result.put(Seq.AccType.genbank_protein.toString(), proteinIds);
  return result;
}
/**
 * Extracts the primary gene name, i.e. the name child of the gene tag whose type is "primary":
 * <pre>{@code
 * <gene>
 *   <name type="primary" evidence="32">ADH1</name>
 *   <name type="synonym" evidence="31">ADH</name>
 *   <name type="ordered locus" evidence="36">At1g77120</name>
 *   <name type="ORF" evidence="35">F22K20.19</name>
 * </gene>
 * }</pre>
 *
 * @return the primary gene name as a string, or null if there is no gene tag or no primary name under it
 */
private String extractGeneName() {
  if (geneNodeList.getLength() != 1) {
    return null;
  }

  // only the direct children of the single gene node are considered
  NodeList nameCandidates = geneNodeList.item(0).getChildNodes();
  for (int idx = 0; idx < nameCandidates.getLength(); idx++) {
    Node candidate = nameCandidates.item(idx);
    if (!NAME.equals(candidate.getNodeName()) || candidate.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }
    Element nameElement = (Element) candidate;
    if (nameElement.hasAttribute(TYPE) && PRIMARY.equals(nameElement.getAttribute(TYPE))) {
      return nameElement.getTextContent();
    }
  }
  return null;
}
/**
 * Extracts the gene name synonyms, i.e. every name child of the gene tag whose type is "synonym":
 * <pre>{@code
 * <gene>
 *   <name type="primary" evidence="32">ADH1</name>
 *   <name type="synonym" evidence="31">ADH</name>
 *   <name type="ordered locus" evidence="36">At1g77120</name>
 *   <name type="ORF" evidence="35">F22K20.19</name>
 * </gene>
 * }</pre>
 *
 * @return the gene name synonyms as a list; empty if there is no gene tag or no synonym under it
 */
private List<String> extractGeneSynonyms() {
  List<String> synonyms = new ArrayList<>();
  if (geneNodeList.getLength() != 1) {
    return synonyms;
  }

  // only the direct children of the single gene node are considered
  NodeList nameCandidates = geneNodeList.item(0).getChildNodes();
  for (int idx = 0; idx < nameCandidates.getLength(); idx++) {
    Node candidate = nameCandidates.item(idx);
    if (!NAME.equals(candidate.getNodeName()) || candidate.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }
    Element nameElement = (Element) candidate;
    if (nameElement.hasAttribute(TYPE) && SYNONYM.equals(nameElement.getAttribute(TYPE))) {
      synonyms.add(nameElement.getTextContent());
    }
  }
  return synonyms;
}
/**
 * Extracts the product names of the protein. Product names are stored as:
 * <pre>{@code
 * <protein>
 *   <recommendedName>
 *     <fullName>Amine sulfotransferase</fullName>
 *     <ecNumber>2.8.2.3</ecNumber>
 *   </recommendedName>
 *   <alternativeName>
 *     <fullName>SULT-X2</fullName>
 *   </alternativeName>
 *   <alternativeName>
 *     <fullName>Sulfotransferase 3A1</fullName>
 *     <shortName>ST3A1</shortName>
 *   </alternativeName>
 * </protein>
 * }</pre>
 * Sometimes the recommendedName tag is replaced with a submittedName tag. The first fullName found under each
 * recommendedName, submittedName and alternativeName child is collected, in document order.
 *
 * As soon as a name containing "uncharacterized" (case-insensitively) is met, scanning stops: that name and any
 * names after it are not collected, while names collected before it are kept.
 *
 * @return the list of product names; empty if there is no protein tag or no usable name
 */
private List<String> extractProductNames() {
  List<String> names = new ArrayList<>();
  if (proteinNodeList.getLength() != 1) {
    return names;
  }

  // only the direct children of the single protein node are considered
  NodeList nameCandidates = proteinNodeList.item(0).getChildNodes();
  for (int idx = 0; idx < nameCandidates.getLength(); idx++) {
    Node candidate = nameCandidates.item(idx);
    String tag = candidate.getNodeName();
    boolean isNameTag =
        RECOMMENDED_NAME.equals(tag) || SUBMITTED_NAME.equals(tag) || ALTERNATIVE_NAME.equals(tag);
    if (!isNameTag || candidate.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }

    NodeList fullNames = ((Element) candidate).getElementsByTagName(FULL_NAME);
    if (fullNames.getLength() == 0) {
      continue;
    }

    // there should only be one full name
    String productName = fullNames.item(0).getTextContent();

    // handles cases: Uncharacterized protein, Putative uncharacterized protein, etc.
    // Locale.ROOT keeps the case folding independent of the JVM default locale (e.g. under a Turkish locale
    // "UNCHARACTERIZED" would otherwise lower-case to a string with a dotless i and slip past this check).
    if (productName.toLowerCase(Locale.ROOT).contains(UNCHARACTERIZED)) {
      LOGGER.error("Skipping uncharacterized protein");
      break;
    }
    names.add(productName);
  }
  return names;
}
/**
 * Extracts the catalytic activity string. Catalytic activity strings are stored as:
 * <pre>{@code
 * <comment type="catalytic activity">
 *   <text evidence="18 22">An alcohol + NAD(+) = an aldehyde or ketone + NADH.</text>
 * </comment>
 * }</pre>
 * The first catalytic activity comment whose only child element is a text element wins. Non-element children
 * (such as the whitespace text nodes a DOM parser keeps for indented XML) are ignored when counting children, so
 * pretty-printed documents are handled the same way as compact ones.
 *
 * @return the catalytic activity string, or null if no suitable comment was found
 */
private String extractCatalyticActivity() {
  NodeList commentNodeList = seqFile.getElementsByTagName(COMMENT);
  for (int i = 0; i < commentNodeList.getLength(); i++) {
    Node commentNode = commentNodeList.item(i);
    if (commentNode.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }
    Element commentElement = (Element) commentNode;
    if (!commentElement.hasAttribute(TYPE) || !CATALYTIC_ACTIVITY.equals(commentElement.getAttribute(TYPE))) {
      continue;
    }

    // count only element children; remember the first one so it can be returned if it is the only one
    NodeList commentChildNodes = commentElement.getChildNodes();
    Node firstElementChild = null;
    int elementChildCount = 0;
    for (int j = 0; j < commentChildNodes.getLength(); j++) {
      Node child = commentChildNodes.item(j);
      if (child.getNodeType() == Node.ELEMENT_NODE) {
        elementChildCount++;
        if (firstElementChild == null) {
          firstElementChild = child;
        }
      }
    }

    // there should only be one text element child containing the string of interest
    if (elementChildCount == 1 && TEXT.equals(firstElementChild.getNodeName())) {
      return firstElementChild.getTextContent();
    } else if (elementChildCount > 1) {
      LOGGER.error("more than one catalytic activity string");
    }
  }
  return null;
}
/**
 * Assembles the metadata object from the already-extracted gene name, gene synonyms, product names, accessions and
 * catalytic activity, plus an empty xref object, and converts it with {@code MongoDBToJSON.conv}.
 * Must be called after those fields have been assigned.
 *
 * @return the converted metadata object
 */
private DBObject extractMetadata() {
  JSONObject metadataJson = new JSONObject();

  metadataJson.put(NAME, geneName);
  metadataJson.put(SYNONYMS, geneSynonyms);
  metadataJson.put(PRODUCT_NAMES, productNames);
  metadataJson.put(XREF, new JSONObject());
  metadataJson.put(ACCESSION, accessions);
  metadataJson.put(CATALYTIC_ACITIVITY_SNAKE, catalyticActivity);

  DBObject converted = MongoDBToJSON.conv(metadataJson);
  return converted;
}
/**
 * Returns the text content of the single sequence tag. Sequence strings are stored as:
 * <pre>{@code
 * <sequence length="379" mass="41178" checksum="32550529538B9669" modified="2007-05-29" version="2">
 * MSTTGQIIRCKAAVAWEAGKPLVIEEVEVAPPQKHEVRIKILFTSLCHTDVYFWEAKGQT
 * PLFPRIFGHEAGGIVESVGEGVTDLQPGDHVLPIFTGECGECRHCHSEESNMCDLLRINT
 * ...
 * AFDYMLKGESIRCIITMGA
 * </sequence>
 * }</pre>
 * The text content is returned as-is; no whitespace is removed here.
 *
 * @return the sequence string
 */
private String extractSequence() {
  // checkNodeListLengths() has already guaranteed that exactly one sequence node exists
  Node sequenceNode = sequenceNodeList.item(0);
  return sequenceNode.getTextContent();
}
/**
 * Extracts the organism name, i.e. the name child of the organism tag whose type is "scientific":
 * <pre>{@code
 * <organism>
 *   <name type="scientific">Arabidopsis thaliana</name>
 *   <name type="common">Mouse-ear cress</name>
 *   <dbReference type="NCBI Taxonomy" id="3702"/>
 * </organism>
 * }</pre>
 * If the scientific name is a key of {@code minimalPrefixMapping}, the mapped value is returned instead of the
 * name itself.
 *
 * @param minimalPrefixMapping mapping consulted with the scientific name as key
 * @return the organism as a string, or null if there is no organism tag or no scientific name under it
 */
private String extractOrg(Map<String, String> minimalPrefixMapping) {
  if (organismNodeList.getLength() != 1) {
    return null;
  }

  // only the direct children of the single organism node are considered
  NodeList nameCandidates = organismNodeList.item(0).getChildNodes();
  for (int idx = 0; idx < nameCandidates.getLength(); idx++) {
    Node candidate = nameCandidates.item(idx);
    if (!NAME.equals(candidate.getNodeName()) || candidate.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }
    Element nameElement = (Element) candidate;
    if (!nameElement.hasAttribute(TYPE) || !SCIENTIFIC.equals(nameElement.getAttribute(TYPE))) {
      continue;
    }

    String scientificName = nameElement.getTextContent();
    return minimalPrefixMapping.containsKey(scientificName)
        ? minimalPrefixMapping.get(scientificName)
        : scientificName;
  }
  return null;
}
/**
 * Resolves the id of this entry's organism, registering the organism name through
 * {@code db.submitToActOrganismNameDB} when the lookup does not find it.
 *
 * @param db the database handle used for the lookup and, if needed, the submission
 * @return the id returned by the lookup, or the result of the submission when the lookup returned -1
 */
private Long extractOrgId(MongoDB db) {
  long existingId = db.getOrganismId(org);

  // -1L is the lookup's marker for an organism that does not exist in the database
  if (existingId == -1L) {
    return db.submitToActOrganismNameDB(org);
  }
  return existingId;
}
/**
 * Extracts the PubMed ids of the entry. They are stored in the dbReference tags with type="PubMed" found under
 * the citation of each reference:
 * <pre>{@code
 * <reference key="4">
 *   <citation type="journal article" date="1996" name="Mol. Biol. Evol." volume="13" first="433" last="436">
 *     <title>Intra- and interspecific variation of the alcohol dehydrogenase locus region ...</title>
 *     <authorList>
 *       <person name="Miyashita N.T."/>
 *     </authorList>
 *     <dbReference type="PubMed" id="8587508"/>
 *     <dbReference type="DOI" id="10.1093/oxfordjournals.molbev.a025603"/>
 *   </citation>
 *   <scope>NUCLEOTIDE SEQUENCE [GENOMIC DNA]</scope>
 *   <source>
 *     <strain>cv. Aa-0</strain>
 *   </source>
 * </reference>
 * }</pre>
 * A reference without any citation is skipped and the remaining references are still processed. When a reference
 * has several citations an error is logged and only the first one is used.
 *
 * @return a list of JSONObjects containing the extracted PubMed Ids, each with a "val" (the id) and a "src"
 *         ("PMID") entry
 */
private List<JSONObject> extractReferences() {
  List<JSONObject> pubmedReferences = new ArrayList<>();
  NodeList referenceNodeList = seqFile.getElementsByTagName(REFERENCE);

  for (int i = 0; i < referenceNodeList.getLength(); i++) {
    Node referenceNode = referenceNodeList.item(i);
    if (referenceNode.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }

    NodeList citationNodeList = ((Element) referenceNode).getElementsByTagName(CITATION);
    int citationCount = citationNodeList.getLength();
    if (citationCount == 0) {
      // nothing to extract here; move on to the next reference instead of abandoning the rest
      continue;
    }
    if (citationCount > 1) {
      LOGGER.error("more than one citation per reference");
    }

    Node citationNode = citationNodeList.item(0);
    if (citationNode.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }

    NodeList dbReferenceNodeList = ((Element) citationNode).getElementsByTagName(DB_REFERENCE);
    for (int j = 0; j < dbReferenceNodeList.getLength(); j++) {
      Node dbReferenceNode = dbReferenceNodeList.item(j);
      if (dbReferenceNode.getNodeType() != Node.ELEMENT_NODE) {
        continue;
      }
      Element dbReferenceElement = (Element) dbReferenceNode;
      boolean isPubmedWithId = dbReferenceElement.hasAttribute(TYPE)
          && PUBMED.equals(dbReferenceElement.getAttribute(TYPE))
          && dbReferenceElement.hasAttribute(ID);
      if (isPubmedWithId) {
        JSONObject obj = new JSONObject();
        obj.put(VAL, dbReferenceElement.getAttribute(ID));
        obj.put(SRC, PMID);
        pubmedReferences.add(obj);
      }
    }
  }
  return pubmedReferences;
}
/**
 * Retrieves the database sequence entries that correspond to this Uniprot entry, using the first strategy that
 * applies:
 * <ol>
 *   <li>an EC number was extracted: query by sequence, EC number and organism;</li>
 *   <li>otherwise, genbank protein accessions exist: collect the entries for each protein accession;</li>
 *   <li>otherwise, genbank nucleotide accessions exist: collect the entries for each nucleotide accession
 *       combined with the protein sequence (an error is logged when one accession yields several entries);</li>
 *   <li>otherwise: return an empty list.</li>
 * </ol>
 *
 * @param db the database handle the queries are issued against
 * @return the list of Seq entries that should be updated with the data from the uniprot file
 */
private List<Seq> extractMatchingSeqs(MongoDB db) {
  JSONArray proteinAccessions = accessions.getJSONArray(Seq.AccType.genbank_protein.toString());
  JSONArray nucleotideAccessions = accessions.getJSONArray(Seq.AccType.genbank_nucleotide.toString());

  if (ec != null) {
    return db.getSeqFromSeqEcOrg(sequence, ec, org);
  }

  List<Seq> matches = new ArrayList<>();

  if (proteinAccessions != null && proteinAccessions.length() > 0) {
    for (int idx = 0; idx < proteinAccessions.length(); idx++) {
      String proteinAccession = proteinAccessions.getString(idx);
      matches.addAll(db.getSeqFromGenbankProtAccession(proteinAccession));
    }
    return matches;
  }

  if (nucleotideAccessions != null && nucleotideAccessions.length() > 0) {
    for (int idx = 0; idx < nucleotideAccessions.length(); idx++) {
      String nucleotideAccession = nucleotideAccessions.getString(idx);
      List<Seq> hits = db.getSeqFromGenbankNucAccessionSeq(nucleotideAccession, sequence);
      if (hits.size() > 1) {
        LOGGER.error("multiple seq entries match nucleotide accession + protein sequence");
      }
      matches.addAll(hits);
    }
  }

  return matches;
}
}