/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package act.installer.sequence;
import act.server.MongoDB;
import act.shared.Seq;
import act.shared.helpers.MongoDBToJSON;
import com.mongodb.DBObject;
import org.biojava.nbio.core.sequence.features.FeatureInterface;
import org.biojava.nbio.core.sequence.features.Qualifier;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.json.JSONArray;
import org.json.JSONObject;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class GenbankSeqEntry extends SequenceEntry {
private static final String PROTEIN_SEQ_TYPE = "Protein";
private static final String DNA_SEQ_TYPE = "DNA";
private static final String TRANSLATION = "translation";
private static final String EC_NUMBER = "EC_number";
private static final String PMID = "PMID";
private static final String PATENT = "Patent";
private static final String COUNTRY_CODE = "countryCode";
private static final String PATENT_NUMBER = "patentNumber";
private static final String PATENT_YEAR = "patentYear";
private static final String COUNTRY_CODE_SNAKE = "country_code";
private static final String PATENT_NUMBER_SNAKE = "patent_number";
private static final String PATENT_YEAR_SNAKE = "patent_year";
private static final String SOURCE = "source";
private static final String ORGANISM = "organism";
private static final String PROTEIN_ID = "protein_id";
private static final String PROTEIN = "Protein";
private static final String NAME = "name";
private static final String GENE = "gene";
private static final String GENE_SYNONYM = "gene_synonym";
private static final String PRODUCT = "product";
private static final String VAL = "val";
private static final String SRC = "src";
private static final String SYNONYMS = "synonyms";
private static final String PRODUCT_NAMES = "product_names";
private static final String XREF = "xref";
private static final String ACCESSION = "accession";
private static final Pattern GENE_NAME_PATTERN = Pattern.compile("(\\S*)\\s*.*");
private AbstractSequence seqObject;
private Map<String, List<Qualifier>> cdsQualifierMap;
private String seqType;
private DBObject metadata;
private JSONObject accessions;
private List<JSONObject> references;
private List<JSONObject> pmids;
private List<JSONObject> patents;
private String sequence;
private String geneName;
private List<String> productNames;
private List<String> geneSynonyms;
private List<Seq> matchingSeqs;
private String org;
private Long orgId;
private String ec;
private Set<Long> catalyzedRxns;
// the minimalPrefixMapping is generated by OrgMinimalPrefixGenerator
GenbankSeqEntry(AbstractSequence sequence) {
this.seqObject = sequence;
this.seqType = PROTEIN_SEQ_TYPE;
}
GenbankSeqEntry(AbstractSequence sequence, Map<String, List<Qualifier>> cdsQualifierMap) {
this.seqObject = sequence;
this.seqType = DNA_SEQ_TYPE;
this.cdsQualifierMap = cdsQualifierMap;
}
void init(MongoDB db, Map<String, String> minimalPrefixMapping) {
this.ec = extractEc();
this.accessions = extractAccessions();
this.geneName = extractGeneName();
this.geneSynonyms = extractGeneSynonyms();
this.productNames = extractProductName();
this.metadata = extractMetadata();
this.sequence = extractSequence();
this.org = extractOrg(minimalPrefixMapping);
this.orgId = extractOrgId(db);
this.references = extractReferences();
this.matchingSeqs = extractMatchingSeqs(db);
this.catalyzedRxns = new HashSet<>();
}
public DBObject getMetadata() { return this.metadata; }
public JSONObject getAccession() { return this.accessions; }
public String getGeneName() { return this.geneName; }
public List<String> getGeneSynonyms() { return this.geneSynonyms; }
public List<String> getProductName() { return this.productNames; }
public List<JSONObject> getPmids() { return this.pmids; }
public List<JSONObject> getPatents() { return this.patents; }
public List<JSONObject> getRefs() { return this.references; }
public List<Seq> getMatchingSeqs() { return this.matchingSeqs; }
public Long getOrgId() { return this.orgId; }
public String getOrg() { return this.org; }
public String getSeq() { return this.sequence; }
public String getEc() { return this.ec; }
public Set<Long> getCatalyzedRxns() { return this.catalyzedRxns; }
private String extractSequence() {
if (seqType.equals(PROTEIN_SEQ_TYPE)) {
return seqObject.getSequenceAsString();
} else if (seqType.equals(DNA_SEQ_TYPE)) {
if (cdsQualifierMap != null && cdsQualifierMap.containsKey(TRANSLATION)) {
return cdsQualifierMap.get(TRANSLATION).get(0).getValue();
}
}
return null;
}
private String extractEc() {
Map<String, List<Qualifier>> qualifierMap = null;
if (seqType.equals(PROTEIN_SEQ_TYPE)) {
qualifierMap = getQualifierMap(PROTEIN_SEQ_TYPE);
} else if (seqType.equals(DNA_SEQ_TYPE)) {
qualifierMap = cdsQualifierMap;
}
if (qualifierMap != null && qualifierMap.containsKey(EC_NUMBER)) {
String ec_value = qualifierMap.get(EC_NUMBER).get(0).getValue();
// there was a case where the EC_Number qualifier existed, but the value was empty or null
if (ec_value == null || ec_value.isEmpty()) {
return null;
}
return ec_value;
} else {
return null;
}
}
private List<JSONObject> extractPmids() {
List<String> pmids = seqObject.getPMIDS();
List<JSONObject> references = new ArrayList<>();
for (String pmid : pmids) {
JSONObject obj = new JSONObject();
obj.put(VAL, pmid);
obj.put(SRC, PMID);
references.add(obj);
}
this.pmids = references;
return references;
}
private List<JSONObject> extractPatents() {
List<Map> patents = seqObject.getPatents();
List<JSONObject> references = new ArrayList<>();
for (Map patent : patents) {
JSONObject obj = new JSONObject();
obj.put(SRC, PATENT);
obj.put(COUNTRY_CODE_SNAKE, patent.get(COUNTRY_CODE));
obj.put(PATENT_NUMBER_SNAKE, patent.get(PATENT_NUMBER));
obj.put(PATENT_YEAR_SNAKE, patent.get(PATENT_YEAR));
references.add(obj);
}
this.patents = references;
return references;
}
private List<JSONObject> extractReferences() {
List<JSONObject> references = new ArrayList<>();
references.addAll(extractPmids());
references.addAll(extractPatents());
return references;
}
private String extractOrg(Map<String, String> minimalPrefixMapping) {
Map<String, List<Qualifier>> qualifierMap = getQualifierMap(SOURCE);
if (qualifierMap != null && qualifierMap.containsKey(ORGANISM)) {
String orgName = qualifierMap.get(ORGANISM).get(0).getValue();
if (minimalPrefixMapping.containsKey(orgName)) {
return minimalPrefixMapping.get(orgName);
} else {
return orgName;
}
}
return null;
}
private Long extractOrgId(MongoDB db) {
long id = db.getOrganismId(org);
if (id != -1L) {
return id;
} else {
return db.submitToActOrganismNameDB(org);
}
}
/** accessions are stored in a JSONObject where the keys are either "genbank-protein" or "genbank-nucleotide" and
* the values are JSONArrays of the accession keys
* @return the accession JSONObject
*/
private JSONObject extractAccessions() {
JSONArray proteinAccessions = null;
JSONArray nucleotideAccessions = null;
if (seqType.equals(PROTEIN_SEQ_TYPE)) {
proteinAccessions = new JSONArray(Collections.singletonList(seqObject.getAccession().getID()));
} else if (seqType.equals(DNA_SEQ_TYPE)) {
if (cdsQualifierMap != null && cdsQualifierMap.containsKey(PROTEIN_ID)) {
// example: /protein_id="BAA25015.1"
String[] splitId = cdsQualifierMap.get(PROTEIN_ID).get(0).getValue().split("\\.");
proteinAccessions = new JSONArray(Collections.singletonList(splitId[0]));
}
}
if (seqType.equals(DNA_SEQ_TYPE)) {
nucleotideAccessions = new JSONArray(Collections.singletonList(seqObject.getAccession().getID()));
}
JSONObject accessions = new JSONObject();
if (proteinAccessions != null) {
accessions.put(Seq.AccType.genbank_protein.toString(), proteinAccessions);
}
if (nucleotideAccessions != null) {
accessions.put(Seq.AccType.genbank_nucleotide.toString(), nucleotideAccessions);
}
return accessions;
}
private List<Seq> extractMatchingSeqs(MongoDB db) {
if (ec != null) {
return db.getSeqFromSeqEcOrg(sequence, ec, org);
} else {
return db.getSeqFromGenbankProtAccession((accessions.getJSONArray(Seq.AccType.genbank_protein.toString())).getString(0));
}
}
private String extractGeneName() {
if (seqType.equals(PROTEIN_SEQ_TYPE)) {
Map<String, List<Qualifier>> proteinQualifierMap = getQualifierMap(PROTEIN);
// check if gene name is in Protein feature key, otherwise check for gene name in header
if (proteinQualifierMap != null && proteinQualifierMap.containsKey(NAME)) {
return proteinQualifierMap.get(NAME).get(0).getValue();
} else {
String header = seqObject.getOriginalHeader();
Matcher m = GENE_NAME_PATTERN.matcher(header);
if (m.find()) {
// some cases where genbank files have accession id's in the place of the gene name in the header of the file
if (m.group(1).equals((accessions.getJSONArray(Seq.AccType.genbank_protein.toString())).getString(0))) {
return null;
}
return m.group(1);
}
}
} else if (seqType.equals(DNA_SEQ_TYPE)) {
if (cdsQualifierMap != null && cdsQualifierMap.containsKey(GENE)) {
return cdsQualifierMap.get(GENE).get(0).getValue();
}
}
return null;
}
private DBObject extractMetadata() {
JSONObject obj = new JSONObject();
obj.put(NAME, geneName);
obj.put(SYNONYMS, geneSynonyms);
obj.put(PRODUCT_NAMES, productNames);
obj.put(XREF, new JSONObject());
obj.put(ACCESSION, accessions);
return MongoDBToJSON.conv(obj);
}
private List<String> extractGeneSynonyms() {
ArrayList<String> geneSynonyms = new ArrayList<>();
if (seqType.equals(PROTEIN_SEQ_TYPE)) {
{
Map<String, List<Qualifier>> qualifierMap = getQualifierMap(PROTEIN);
if (qualifierMap != null && qualifierMap.containsKey(GENE_SYNONYM)) {
for (Qualifier qualifier : qualifierMap.get(GENE_SYNONYM)) {
geneSynonyms.add(qualifier.getValue());
}
}
if (qualifierMap != null && qualifierMap.containsKey(GENE)) {
for (Qualifier qualifier : qualifierMap.get(GENE)) {
geneSynonyms.add(qualifier.getValue());
}
}
}
{
Map<String, List<Qualifier>> qualifierMap = getQualifierMap(GENE);
if (qualifierMap != null) {
if (qualifierMap.containsKey(GENE)) {
for (Qualifier qualifier : qualifierMap.get(GENE)) {
if (!geneSynonyms.contains(qualifier.getValue())) {
geneSynonyms.add(qualifier.getValue());
}
}
}
if (qualifierMap.containsKey(GENE_SYNONYM)) {
for (Qualifier qualifier : qualifierMap.get(GENE_SYNONYM)) {
if (!geneSynonyms.contains(qualifier.getValue())) {
geneSynonyms.add(qualifier.getValue());
}
}
}
}
}
}
return geneSynonyms;
}
private List<String> extractProductName() {
Map<String, List<Qualifier>> qualifierMap = null;
if (seqType.equals(PROTEIN_SEQ_TYPE)) {
qualifierMap = getQualifierMap(PROTEIN);
} else if (seqType.equals(DNA_SEQ_TYPE)) {
qualifierMap = cdsQualifierMap;
}
if (qualifierMap != null && qualifierMap.containsKey(PRODUCT)) {
return Collections.singletonList(qualifierMap.get(PRODUCT).get(0).getValue());
}
return null;
}
private Map<String, List<Qualifier>> getQualifierMap(String feature_type) {
for (FeatureInterface<AbstractSequence<Compound>, Compound> feature :
(List<FeatureInterface<AbstractSequence<Compound>, Compound>>) seqObject.getFeatures()) {
if (feature.getType().equals(feature_type)) {
return feature.getQualifiers();
}
}
return null;
}
}