/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package act.installer;
import act.installer.sequence.UniprotSeqEntry;
import act.installer.sequence.UniprotSeqEntryFactory;
import act.server.DBIterator;
import act.server.MongoDB;
import act.shared.Organism;
import act.shared.Seq;
import com.act.biointerpretation.Utils.OrgMinimalPrefixGenerator;
import com.act.utils.parser.UniprotInterpreter;
import com.mongodb.DBObject;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.json.JSONArray;
import org.json.JSONObject;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
/**
 * Command-line driver that reads a single Uniprot file and writes its sequence data to the database. If the
 * entry matches sequences that are already stored, their metadata and references are merged with the new data
 * instead of creating a new record.
 */
public class UniprotInstaller {
  private static final Logger LOGGER = LogManager.getFormatterLogger(UniprotInstaller.class);
  private static final UniprotSeqEntryFactory seqEntryFactory = new UniprotSeqEntryFactory();

  private static final String OPTION_UNIPROT_PATH = "p";
  private static final String OPTION_DB_NAME = "d";

  // Keys used in the sequence metadata and reference JSON objects.
  private static final String NAME = "name";
  private static final String ACCESSION = "accession";
  private static final String SYNONYMS = "synonyms";
  private static final String PRODUCT_NAMES = "product_names";
  private static final String VAL = "val";
  private static final String SRC = "src";
  private static final String PMID = "PMID";
  private static final String CATALYTIC_ACTIVITY = "catalytic_activity";

  // http://www.uniprot.org/help/accession_numbers
  public static final Pattern UNIPROT_ACCESSION_PATTERN =
      Pattern.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}");

  public static final String HELP_MESSAGE = StringUtils.join(new String[]{
      "This class is the driver to write sequence data from a Uniprot file to our database. It can be used on the ",
      "command line with a file path as a parameter."}, "");

  public static final List<Option.Builder> OPTION_BUILDERS = buildOptionBuilders();

  public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

  static {
    HELP_FORMATTER.setWidth(100);
  }

  File uniprotFile;
  MongoDB db;
  // the minimalPrefixMapping is generated by OrgMinimalPrefixGenerator
  Map<String, String> minimalPrefixMapping;

  /**
   * @param uniprotFile the Uniprot file to install
   * @param db the database to read matching sequences from and write results to
   * @param minimalPrefixMapping the mapping produced by OrgMinimalPrefixGenerator; it is handed to the
   *                             sequence entry factory unchanged
   */
  public UniprotInstaller (File uniprotFile, MongoDB db, Map<String, String> minimalPrefixMapping) {
    this.uniprotFile = uniprotFile;
    this.db = db;
    this.minimalPrefixMapping = minimalPrefixMapping;
  }

  /**
   * Builds the command-line option definitions. A plain ArrayList is used instead of double-brace
   * initialization so that no anonymous ArrayList subclass is created.
   * @return the list of option builders accepted by {@link #main(String[])}
   */
  private static List<Option.Builder> buildOptionBuilders() {
    List<Option.Builder> builders = new ArrayList<>();
    builders.add(Option.builder(OPTION_UNIPROT_PATH)
        .argName("uniprot file")
        .desc("uniprot file containing sequence and annotations")
        .hasArg()
        .longOpt("uniprot")
        .required()
    );
    builders.add(Option.builder(OPTION_DB_NAME)
        .argName("db name")
        .desc("name of the database to be queried")
        .hasArg()
        .longOpt("database")
        .required()
    );
    builders.add(Option.builder("h")
        .argName("help")
        .desc("Example of usage: -p filepath.gb -d marvin")
        .longOpt("help")
    );
    return builders;
  }

  /**
   * Parses the Uniprot file, builds a sequence entry from its XML document and installs that entry in the
   * database.
   */
  public void init() throws IOException, SAXException, ParserConfigurationException, CompoundNotFoundException {
    UniprotInterpreter uniprotInterpreter = new UniprotInterpreter(uniprotFile);
    uniprotInterpreter.init();
    UniprotSeqEntry seqEntry = seqEntryFactory.createFromDocumentReference(uniprotInterpreter.getXmlDocument(), db,
        minimalPrefixMapping);
    addSeqEntryToDb(seqEntry, db);
  }

  /**
   * Verifies the accession string according to the standard Genbank/Uniprot accession qualifications
   * @param proteinAccession the accession string to be validated
   * @param accessionPattern the pattern that the accession string should match
   * @return true if the whole accession string matches the pattern, false otherwise
   */
  private boolean verifyAccession(String proteinAccession, Pattern accessionPattern) {
    return accessionPattern.matcher(proteinAccession).matches();
  }

  /**
   * Checks if the new value already exists in the field. If so, doesn't update the metadata. If it doesn't exist,
   * appends the new value to the data. Null and empty values are ignored.
   * @param field the key referring to the array in the metadata we wish to update
   * @param value the value we wish to add to the array
   * @param data the metadata
   * @return the updated metadata JSONObject
   */
  private JSONObject updateArrayField(String field, String value, JSONObject data) {
    if (value == null || value.isEmpty()) {
      return data;
    }

    if (data.has(field)) {
      JSONArray fieldData = data.getJSONArray(field);
      for (int i = 0; i < fieldData.length(); i++) {
        if (fieldData.get(i).toString().equals(value)) {
          return data;
        }
      }
    }

    return data.append(field, value);
  }

  /**
   * Updates the accession JSONObject for the given accessions type
   * @param newAccessionObject the new accession object to load in the new accessions of the given type
   * @param metadata contains the accession object to be updated; it must already have an accession entry
   * @param accType the type of accessions to update
   * @param accessionPattern the accession pattern to validate the accession string according to Genbank/Uniprot
   *                         standards
   * @return the metadata containing the updated accession mapping
   */
  private JSONObject updateAccessions(JSONObject newAccessionObject, JSONObject metadata, Seq.AccType accType,
                                      Pattern accessionPattern) {
    JSONObject oldAccessionObject = metadata.getJSONObject(ACCESSION);
    String accTypeKey = accType.toString();

    if (newAccessionObject.has(accTypeKey)) {
      JSONArray newAccTypeAccessions = newAccessionObject.getJSONArray(accTypeKey);

      for (int i = 0; i < newAccTypeAccessions.length(); i++) {
        String accession = newAccTypeAccessions.getString(i);
        // Malformed accessions are logged and skipped rather than stored.
        if (!verifyAccession(accession, accessionPattern)) {
          LOGGER.error("%s accession not the right format: %s\n", accTypeKey, accession);
          continue;
        }
        oldAccessionObject = updateArrayField(accTypeKey, accession, oldAccessionObject);
      }
    }

    return metadata.put(ACCESSION, oldAccessionObject);
  }

  /**
   * Updates metadata and reference fields with the information extracted from file
   * @param se an instance of the UniprotSeqEntry class that extracts all the relevant information from a sequence
   *           object
   * @param db reference to the database that should be queried and updated
   */
  private void addSeqEntryToDb(UniprotSeqEntry se, MongoDB db) {
    List<Seq> seqs = se.getMatchingSeqs();

    // no prior data on this sequence
    if (seqs.isEmpty()) {
      se.writeToDB(db, Seq.AccDB.uniprot);
      return;
    }

    // update prior data
    for (Seq seq : seqs) {
      JSONObject metadata = seq.getMetadata();

      metadata = mergeAccessions(se, metadata);
      metadata = mergeGeneNames(se, metadata);
      metadata = mergeProductNames(se, metadata);

      if (se.getCatalyticActivity() != null) {
        metadata.put(CATALYTIC_ACTIVITY, se.getCatalyticActivity());
      }

      seq.setMetadata(metadata);
      db.updateMetadata(seq);

      mergeReferences(se, seq);
      if (seq.getReferences() != null) {
        db.updateReferences(seq);
      }
    }
  }

  /**
   * Merges the entry's accessions into the metadata: stored as-is when the metadata has no accession object
   * yet, otherwise validated and appended per accession type.
   */
  private JSONObject mergeAccessions(UniprotSeqEntry se, JSONObject metadata) {
    JSONObject accessions = se.getAccession();

    if (!metadata.has(ACCESSION)) {
      metadata.put(ACCESSION, accessions);
      return metadata;
    }

    metadata = updateAccessions(accessions, metadata, Seq.AccType.genbank_nucleotide,
        GenbankInstaller.NUCLEOTIDE_ACCESSION_PATTERN);
    metadata = updateAccessions(accessions, metadata, Seq.AccType.genbank_protein,
        GenbankInstaller.PROTEIN_ACCESSION_PATTERN);
    return updateAccessions(accessions, metadata, Seq.AccType.uniprot, UNIPROT_ACCESSION_PATTERN);
  }

  /**
   * Sets the gene name when the metadata has none, and records every other name of the entry as a synonym.
   */
  private JSONObject mergeGeneNames(UniprotSeqEntry se, JSONObject metadata) {
    // Work on a copy so the entry's own synonym list is not modified while handling one matched sequence.
    List<String> geneSynonyms = new ArrayList<>(se.getGeneSynonyms());
    String geneName = se.getGeneName();

    if (geneName != null) {
      if (!metadata.has(NAME) || metadata.isNull(NAME)) {
        metadata.put(NAME, geneName);
      } else if (!geneName.equals(metadata.get(NAME))) {
        geneSynonyms.add(geneName);
      }
    }

    // opt() instead of get(): the metadata may still have no name here (entry without a gene name), and
    // get() would throw a JSONException in that case.
    Object currentName = metadata.opt(NAME);
    for (String geneSynonym : geneSynonyms) {
      if (!geneSynonym.equals(currentName)) {
        metadata = updateArrayField(SYNONYMS, geneSynonym, metadata);
      }
    }

    return metadata;
  }

  /**
   * Appends each of the entry's product names that the metadata does not already list.
   */
  private JSONObject mergeProductNames(UniprotSeqEntry se, JSONObject metadata) {
    for (String productName : se.getProductName()) {
      metadata = updateArrayField(PRODUCT_NAMES, productName, metadata);
    }
    return metadata;
  }

  /**
   * Sets the references of seq to its existing references plus those references of the entry whose PMID value
   * is not present yet. When seq has no references, the entry's references are used as they are.
   */
  private void mergeReferences(UniprotSeqEntry se, Seq seq) {
    List<JSONObject> oldRefs = seq.getReferences();
    List<JSONObject> newPmidRefs = se.getRefs();

    // A missing reference list is handled the same way as an empty one.
    if (oldRefs == null || oldRefs.isEmpty()) {
      seq.setReferences(newPmidRefs);
      return;
    }

    Set<String> knownPmids = new HashSet<>();
    for (JSONObject oldRef : oldRefs) {
      // opt() tolerates stored references that have no "src" key.
      if (PMID.equals(oldRef.opt(SRC))) {
        knownPmids.add(oldRef.getString(VAL));
      }
    }

    List<JSONObject> mergedRefs = new ArrayList<>(oldRefs);
    for (JSONObject newPmidRef : newPmidRefs) {
      // Set.add returns false for a value seen before, which also drops duplicates among the new references.
      if (knownPmids.add(newPmidRef.getString(VAL))) {
        mergedRefs.add(newPmidRef);
      }
    }

    seq.setReferences(mergedRefs);
  }

  /**
   * Command-line entry point. Parses the options, connects to the named database on localhost:27017, builds
   * the organism prefix mapping and installs the given Uniprot file.
   * @param args the command-line arguments, see {@link #OPTION_BUILDERS}
   * @throws RuntimeException if the given Uniprot file does not exist
   */
  public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException,
      CompoundNotFoundException {
    Options opts = new Options();
    for (Option.Builder b : OPTION_BUILDERS) {
      opts.addOption(b.build());
    }

    CommandLine cl = null;
    try {
      CommandLineParser parser = new DefaultParser();
      cl = parser.parse(opts, args);
    } catch (ParseException e) {
      LOGGER.error("Argument parsing failed: %s", e.getMessage());
      HELP_FORMATTER.printHelp(UniprotInstaller.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      System.exit(1);
      return;
    }

    if (cl.hasOption("help")) {
      HELP_FORMATTER.printHelp(UniprotInstaller.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
      System.exit(1);
      return;
    }

    File uniprotFile = new File(cl.getOptionValue(OPTION_UNIPROT_PATH));
    String dbName = cl.getOptionValue(OPTION_DB_NAME);

    if (!uniprotFile.exists()) {
      String msg = "Uniprot file does not exist: " + uniprotFile.getPath();
      // Passed as an argument because LOGGER is a formatter logger and the path may contain '%'.
      LOGGER.error("%s", msg);
      throw new RuntimeException(msg);
    }

    MongoDB db = new MongoDB("localhost", 27017, dbName);
    DBIterator iter = db.getDbIteratorOverOrgs();

    // Adapts the database iterator to an Iterator over Organism objects; the underlying iterator is closed
    // once it reports that no elements are left.
    Iterator<Organism> orgIterator = new Iterator<Organism> () {
      @Override
      public boolean hasNext() {
        boolean hasNext = iter.hasNext();
        if (!hasNext) {
          iter.close();
        }
        return hasNext;
      }

      @Override
      public Organism next() {
        DBObject o = iter.next();
        return db.convertDBObjectToOrg(o);
      }
    };

    OrgMinimalPrefixGenerator prefixGenerator = new OrgMinimalPrefixGenerator(orgIterator);
    Map<String, String> minimalPrefixMapping = prefixGenerator.getMinimalPrefixMapping();

    UniprotInstaller installer = new UniprotInstaller(uniprotFile, db, minimalPrefixMapping);
    installer.init();
  }
}