/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package act.installer;
import act.installer.sequence.GenbankSeqEntry;
import act.installer.sequence.GenbankSeqEntryFactory;
import act.server.DBIterator;
import act.server.MongoDB;
import act.shared.Organism;
import act.shared.Seq;
import com.act.biointerpretation.Utils.OrgMinimalPrefixGenerator;
import com.act.utils.parser.GenbankInterpreter;
import com.mongodb.DBObject;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.biojava.nbio.core.sequence.features.FeatureInterface;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.json.JSONArray;
import org.json.JSONObject;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
public class GenbankInstaller {
// Formatter logger: messages passed to it use printf-style placeholders such as %s.
private static final Logger LOGGER = LogManager.getFormatterLogger(GenbankInstaller.class);
// Factory used by init() to turn parsed sequences into GenbankSeqEntry objects.
private static final GenbankSeqEntryFactory seqEntryFactory = new GenbankSeqEntryFactory();
// Short names of the command-line options registered in OPTION_BUILDERS.
private static final String OPTION_GENBANK_PATH = "p";
private static final String OPTION_DB_NAME = "d";
private static final String OPTION_SEQ_TYPE = "s";
// Keys read from / written to a sequence's metadata JSON object.
private static final String ACCESSION = "accession";
private static final String NAME = "name";
// Keys compared on patent reference objects when checking whether a patent is already recorded.
private static final String COUNTRY_CODE = "country_code";
private static final String PATENT_NUMBER = "patent_number";
private static final String PATENT_YEAR = "patent_year";
// Metadata keys whose values are arrays that new values get appended to.
private static final String SYNONYMS = "synonyms";
private static final String PRODUCT_NAMES = "product_names";
// Sequence type value for which init() installs one entry per qualifying CDS feature.
private static final String DNA = "DNA";
// Feature type and qualifier key a DNA feature must have for init() to install it.
private static final String CDS = "CDS";
private static final String PROTEIN_ID = "protein_id";
// Sequence type value for which init() installs one entry per sequence.
private static final String PROTEIN = "Protein";
// Keys of a reference JSON object: VAL holds the reference value, SRC names its source.
private static final String VAL = "val";
private static final String SRC = "src";
// Values of the SRC key that addSeqEntryToDb distinguishes between.
private static final String PMID = "PMID";
private static final String PATENT = "Patent";
// Accession formats are described at http://www.ncbi.nlm.nih.gov/Sequin/acc.html
// Protein accession: three letters followed by five digits.
public static final Pattern PROTEIN_ACCESSION_PATTERN = Pattern.compile("[a-zA-Z]{3}\\d{5}");
// Nucleotide accession: 1 letter + 5 digits, 2 letters + 6 digits, 4 letters + 8-10 digits, or
// 5 letters + 7 digits. The latter forms cover the WGS and MGA sequence accession patterns, since they
// appear in Nucleotide files as well.
public static final Pattern NUCLEOTIDE_ACCESSION_PATTERN =
Pattern.compile("[a-zA-Z]\\d{5}|[a-zA-Z]{2}\\d{6}|[a-zA-Z]{4}\\d{8,10}|[a-zA-Z]{5}\\d{7}");
/** Description passed to the help formatter as the header of the usage text. */
public static final String HELP_MESSAGE = String.join("",
    "This class is the driver to write sequence data from a Genbank file to our database. It can be used on the ",
    "command line with a file path as a parameter.");

/** Builders for every command-line option; the genbank, database and sequence options are required. */
public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<>();

/** Formatter used to print the usage text; its width is set to 100 columns below. */
public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

static {
  // Path of the Genbank file to install.
  Option.Builder genbankPath = Option.builder(OPTION_GENBANK_PATH)
      .argName("genbank file")
      .desc("genbank file containing sequence and annotations")
      .hasArg()
      .longOpt("genbank")
      .required();
  OPTION_BUILDERS.add(genbankPath);

  // Name of the database to read from and write to.
  Option.Builder dbName = Option.builder(OPTION_DB_NAME)
      .argName("db name")
      .desc("name of the database to be queried")
      .hasArg()
      .longOpt("database")
      .required();
  OPTION_BUILDERS.add(dbName);

  // Whether the file holds DNA or Protein sequences.
  Option.Builder sequenceType = Option.builder(OPTION_SEQ_TYPE)
      .argName("sequence type")
      .desc("declares whether the sequence type is DNA or Protein")
      .hasArg()
      .longOpt("sequence")
      .required();
  OPTION_BUILDERS.add(sequenceType);

  // Help flag; its description doubles as a usage example.
  Option.Builder help = Option.builder("h")
      .argName("help")
      .desc("Example of usage: -p filepath.gb -d marvin -s DNA")
      .longOpt("help");
  OPTION_BUILDERS.add(help);

  HELP_FORMATTER.setWidth(100);
}
// Genbank file handed to the GenbankInterpreter in init().
File genbankFile;
// Sequence type of the file; init() handles "DNA" and "Protein".
String seqType;
// Database that matching sequences are looked up in and written to.
MongoDB db;
// Mapping produced by OrgMinimalPrefixGenerator; passed through to the sequence entry factory.
Map<String, String> minimalPrefixMapping;

/**
 * Creates an installer for one Genbank file. Nothing is read or written until {@link #init()} is called.
 * @param genbankFile the Genbank file to install
 * @param seqType the type of sequences in the file
 * @param db the database to install the sequences into
 * @param minimalPrefixMapping the mapping generated by OrgMinimalPrefixGenerator
 */
public GenbankInstaller (File genbankFile, String seqType, MongoDB db, Map<String, String> minimalPrefixMapping) {
  this.db = db;
  this.minimalPrefixMapping = minimalPrefixMapping;
  this.seqType = seqType;
  this.genbankFile = genbankFile;
}
/**
 * Parses the Genbank file and installs its sequences in the database.
 *
 * For "DNA" files, one entry is installed for every CDS feature that carries a protein_id qualifier. For
 * "Protein" files, one entry is installed per sequence. Any other sequence type installs nothing and is
 * reported as an error in the log.
 * @throws Exception if reading or parsing the Genbank file fails
 */
public void init() throws Exception {
  GenbankInterpreter reader = new GenbankInterpreter(genbankFile, seqType);
  reader.init();
  List<AbstractSequence> sequences = reader.getSequences();

  int sequenceCount = 0;

  // The sequence type is fixed for the whole file, so decide once which path to take. The constant-first
  // comparisons also keep a null seqType from throwing.
  if (DNA.equals(seqType)) {
    for (AbstractSequence sequence : sequences) {
      // getFeatures() is called on a raw AbstractSequence, so the element type has to be restored by a cast.
      @SuppressWarnings("unchecked")
      List<FeatureInterface<AbstractSequence<Compound>, Compound>> features =
          (List<FeatureInterface<AbstractSequence<Compound>, Compound>>) sequence.getFeatures();

      for (FeatureInterface<AbstractSequence<Compound>, Compound> feature : features) {
        // Only coding sequences that name a protein are installed.
        if (CDS.equals(feature.getType()) && feature.getQualifiers().containsKey(PROTEIN_ID)) {
          GenbankSeqEntry seqEntry = seqEntryFactory.createFromDNASequenceReference(sequence,
              feature.getQualifiers(), db, minimalPrefixMapping);
          addSeqEntryToDb(seqEntry, db);
          sequenceCount++;
        }
      }
    }
  } else if (PROTEIN.equals(seqType)) {
    for (AbstractSequence sequence : sequences) {
      GenbankSeqEntry seqEntry =
          seqEntryFactory.createFromProteinSequenceReference(sequence, db, minimalPrefixMapping);
      addSeqEntryToDb(seqEntry, db);
      sequenceCount++;
    }
  } else {
    // Previously this case fell through silently and only "0 sequences installed" was logged.
    LOGGER.error("Unsupported sequence type '%s'; expected '%s' or '%s'. No sequences were installed.",
        seqType, DNA, PROTEIN);
  }

  LOGGER.info("%s sequences installed in the db", sequenceCount);
}
/**
 * Verifies the accession string according to the standard Genbank/Uniprot accession qualifications.
 * The whole string has to match the pattern; a match on only part of the string is not enough.
 * @param proteinAccession the accession string to be validated (despite the name, any accession type may be
 *                         checked as long as the matching pattern is supplied); may be null
 * @param accessionPattern the pattern that the accession string should match
 * @return true if the accession string matches the pattern in its entirety; false if it does not match or is
 *         null
 */
public static boolean verifyAccession(String proteinAccession, Pattern accessionPattern) {
  // A missing accession can never be valid; answer false instead of failing with a NullPointerException.
  if (proteinAccession == null) {
    return false;
  }
  return accessionPattern.matcher(proteinAccession).matches();
}
/**
 * Adds a value to an array-valued field of the metadata unless the array already holds it.
 * A null or empty value leaves the metadata untouched. Existing elements are compared by their string form.
 * @param field the key referring to the array in the metadata we wish to update
 * @param value the value we wish to add to the array
 * @param data the metadata
 * @return the metadata JSONObject, with the value appended to the field if it was not already there
 */
public static JSONObject updateArrayField(String field, String value, JSONObject data) {
  boolean nothingToAdd = value == null || value.isEmpty();
  if (nothingToAdd) {
    return data;
  }

  if (data.has(field)) {
    JSONArray existingValues = data.getJSONArray(field);
    int index = 0;
    while (index < existingValues.length()) {
      String existing = existingValues.get(index).toString();
      if (value.equals(existing)) {
        // already recorded, nothing to do
        return data;
      }
      index++;
    }
  }

  return data.append(field, value);
}
/**
 * Merges the accessions of one type from a new accession object into the accession object stored in the
 * metadata. Accessions that do not match the given pattern are logged and skipped; accessions already present
 * are not added a second time.
 * @param newAccessionObject the new accession object to load in the new accessions of the given type
 * @param metadata contains the accession object to be updated
 * @param accType the type of accessions to update
 * @param accessionPattern the accession pattern to validate the accession string according to Genbank/Uniprot
 *                         standards
 * @return the metadata containing the updated accession mapping
 */
public static JSONObject updateAccessions(JSONObject newAccessionObject, JSONObject metadata, Seq.AccType accType,
                                          Pattern accessionPattern) {
  JSONObject mergedAccessions = metadata.getJSONObject(ACCESSION);
  String typeKey = accType.toString();

  // nothing of this type in the new object: store the existing accessions back unchanged
  if (!newAccessionObject.has(typeKey)) {
    return metadata.put(ACCESSION, mergedAccessions);
  }

  JSONArray candidates = newAccessionObject.getJSONArray(typeKey);
  for (int index = 0; index < candidates.length(); index++) {
    String candidate = candidates.getString(index);
    if (verifyAccession(candidate, accessionPattern)) {
      mergedAccessions = updateArrayField(typeKey, candidate, mergedAccessions);
    } else {
      LOGGER.error("%s accession not the right format: %s\n", typeKey, candidate);
    }
  }

  return metadata.put(ACCESSION, mergedAccessions);
}
/**
 * Installs one Genbank entry. If no sequence in the database matches the entry, the entry is written as a new
 * record. Otherwise the metadata and references of every matching sequence are updated with the information
 * extracted from the file.
 * @param se an instance of the GenbankSeqEntry class that extracts all the relevant information from a sequence
 *           object
 * @param db reference to the database that should be queried and updated
 */
private void addSeqEntryToDb(GenbankSeqEntry se, MongoDB db) {
  List<Seq> seqs = se.getMatchingSeqs();

  // no prior data on this sequence
  if (seqs.isEmpty()) {
    se.writeToDB(db, Seq.AccDB.genbank);
    return;
  }

  // update prior data
  for (Seq seq : seqs) {
    seq.setMetadata(mergeMetadata(se, seq.getMetadata()));
    db.updateMetadata(seq);

    List<JSONObject> oldRefs = seq.getReferences();
    if (oldRefs == null || oldRefs.isEmpty()) {
      // nothing recorded yet: take the entry's references as they are
      seq.setReferences(se.getRefs());
    } else {
      mergeReferences(se, oldRefs);
      seq.setReferences(oldRefs);
    }

    if (seq.getReferences() != null) {
      db.updateReferences(seq);
    }
  }
}

/**
 * Folds the accessions, gene name, gene synonyms and first product name of the entry into the metadata.
 * @param se the entry supplying the new values
 * @param metadata the metadata of a matching sequence
 * @return the updated metadata
 */
private static JSONObject mergeMetadata(GenbankSeqEntry se, JSONObject metadata) {
  JSONObject accessions = se.getAccession();
  if (!metadata.has(ACCESSION)) {
    metadata.put(ACCESSION, accessions);
  } else {
    metadata = updateAccessions(accessions, metadata, Seq.AccType.genbank_nucleotide,
        NUCLEOTIDE_ACCESSION_PATTERN);
    metadata = updateAccessions(accessions, metadata, Seq.AccType.genbank_protein, PROTEIN_ACCESSION_PATTERN);
  }

  // Work on a copy: the same entry is merged into every matching sequence, so its own synonym list must not
  // pick up an extra gene name on each pass.
  List<String> geneSynonyms = new ArrayList<>();
  if (se.getGeneSynonyms() != null) {
    geneSynonyms.addAll(se.getGeneSynonyms());
  }

  String geneName = se.getGeneName();
  if (geneName != null) {
    // isNull covers both a missing key and an explicit JSON null. JSONObject.get never returns Java null, so
    // comparing its result against null would not detect either case.
    if (metadata.isNull(NAME)) {
      metadata.put(NAME, geneName);
    } else if (!geneName.equals(metadata.get(NAME))) {
      geneSynonyms.add(geneName);
    }
  }

  // opt returns null instead of throwing when the record has no name at all.
  Object primaryName = metadata.opt(NAME);
  for (String geneSynonym : geneSynonyms) {
    if (geneSynonym != null && !geneSynonym.equals(primaryName)) {
      metadata = updateArrayField(SYNONYMS, geneSynonym, metadata);
    }
  }

  // Only the first product name is recorded; an empty list simply contributes nothing.
  List<String> productNames = se.getProductName();
  if (productNames != null && !productNames.isEmpty()) {
    metadata = updateArrayField(PRODUCT_NAMES, productNames.get(0), metadata);
  }

  return metadata;
}

/**
 * Appends the PMID and patent references of the entry that are not yet present to the given reference list.
 * The list is modified in place.
 * @param se the entry supplying the new references
 * @param refs the non-empty list of references already recorded for a matching sequence
 */
private static void mergeReferences(GenbankSeqEntry se, List<JSONObject> refs) {
  Set<String> knownPmids = new HashSet<>();
  for (JSONObject ref : refs) {
    // opt tolerates references that carry no source field
    if (PMID.equals(ref.opt(SRC))) {
      knownPmids.add(ref.getString(VAL));
    }
  }

  for (JSONObject newPmidRef : se.getPmids()) {
    // Set.add reports whether the PMID is new, which also filters duplicates within the entry itself.
    if (knownPmids.add(newPmidRef.getString(VAL))) {
      refs.add(newPmidRef);
    }
  }

  for (JSONObject newPatentRef : se.getPatents()) {
    if (!containsPatent(refs, newPatentRef)) {
      refs.add(newPatentRef);
    }
  }
}

/**
 * Checks whether the reference list already holds a patent with the same country code, patent number and
 * patent year as the given patent reference.
 * @param refs the references to search
 * @param patent the patent reference to look for
 * @return true if an equivalent patent reference is present
 */
private static boolean containsPatent(List<JSONObject> refs, JSONObject patent) {
  String countryCode = (String) patent.get(COUNTRY_CODE);
  String patentNumber = (String) patent.get(PATENT_NUMBER);
  String patentYear = (String) patent.get(PATENT_YEAR);

  for (JSONObject ref : refs) {
    // The new patent's fields are the receivers, so a recorded reference that lacks a field just fails to
    // match instead of raising an exception.
    if (PATENT.equals(ref.opt(SRC))
        && countryCode.equals(ref.opt(COUNTRY_CODE))
        && patentNumber.equals(ref.opt(PATENT_NUMBER))
        && patentYear.equals(ref.opt(PATENT_YEAR))) {
      return true;
    }
  }
  return false;
}
/**
 * Command-line entry point. Parses the options, connects to the named database on localhost:27017, builds the
 * organism minimal prefix mapping from the organisms in that database and installs the Genbank file.
 *
 * The usage text is printed and the process exits with status 1 when the arguments cannot be parsed or when
 * help is requested.
 * @param args the command-line arguments, e.g. {@code -p filepath.gb -d marvin -s DNA}
 * @throws IllegalArgumentException if the sequence type is neither "DNA" nor "Protein"
 * @throws RuntimeException if the Genbank file does not exist
 * @throws Exception if installing the file fails
 */
public static void main(String[] args) throws Exception {
  Options opts = new Options();
  for (Option.Builder b : OPTION_BUILDERS) {
    opts.addOption(b.build());
  }

  CommandLine cl = null;
  try {
    CommandLineParser parser = new DefaultParser();
    cl = parser.parse(opts, args);
  } catch (ParseException e) {
    LOGGER.error("Argument parsing failed: %s", e.getMessage());
    HELP_FORMATTER.printHelp(GenbankInstaller.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
    System.exit(1);
  }

  if (cl.hasOption("help")) {
    HELP_FORMATTER.printHelp(GenbankInstaller.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
    System.exit(1);
  }

  File genbankFile = new File(cl.getOptionValue(OPTION_GENBANK_PATH));
  String dbName = cl.getOptionValue(OPTION_DB_NAME);
  String seqType = cl.getOptionValue(OPTION_SEQ_TYPE);

  if (!genbankFile.exists()) {
    String msg = String.format("Genbank file does not exist: %s", genbankFile.getAbsolutePath());
    // LOGGER is a formatter logger, so the message is passed as an argument rather than as the format string;
    // a '%' in the path would otherwise be read as a format specifier.
    LOGGER.error("%s", msg);
    throw new RuntimeException(msg);
  }

  // Only these two types are installed by init(); reject anything else before connecting to the database
  // and scanning every organism for a run that would install nothing.
  if (!DNA.equals(seqType) && !PROTEIN.equals(seqType)) {
    String msg = String.format("Unsupported sequence type '%s': expected '%s' or '%s'", seqType, DNA, PROTEIN);
    LOGGER.error("%s", msg);
    throw new IllegalArgumentException(msg);
  }

  final MongoDB db = new MongoDB("localhost", 27017, dbName);
  final DBIterator iter = db.getDbIteratorOverOrgs();

  // Adapts the database cursor to an Iterator over Organism objects.
  Iterator<Organism> orgIterator = new Iterator<Organism> () {
    @Override
    public boolean hasNext() {
      boolean hasNext = iter.hasNext();
      // release the cursor as soon as it is exhausted
      if (!hasNext) {
        iter.close();
      }
      return hasNext;
    }

    @Override
    public Organism next() {
      DBObject o = iter.next();
      return db.convertDBObjectToOrg(o);
    }
  };

  OrgMinimalPrefixGenerator prefixGenerator = new OrgMinimalPrefixGenerator(orgIterator);
  Map<String, String> minimalPrefixMapping = prefixGenerator.getMinimalPrefixMapping();

  GenbankInstaller installer = new GenbankInstaller(genbankFile, seqType, db, minimalPrefixMapping);
  installer.init();
}
}