package org.genedb.db.loading.auxiliary;
import org.genedb.db.loading.GoInstance;
import org.genedb.db.loading.ParsingException;
import org.gmod.schema.feature.Polypeptide;
import org.gmod.schema.feature.PolypeptideDomain;
import org.gmod.schema.mapped.Analysis;
import org.gmod.schema.mapped.DbXRef;
import org.gmod.schema.mapped.FeatureDbXRef;
import org.apache.log4j.Logger;
import org.hibernate.HibernateException;
import org.hibernate.Session;
import org.hibernate.jdbc.Work;
import org.springframework.orm.hibernate3.SessionFactoryUtils;
import org.springframework.transaction.annotation.Transactional;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
/**
 * A generic loader for polypeptide domain results.
 * The input file is assumed to be in a format defined in a DomainRow class.
 * This loader has several options:
 * <dl>
 *   <dt><code>program</code></dt>
 *   <dd>The name of the analysis program that produced the results. It is recorded on the
 *       analysis and passed to the input file parser; when it is <code>iprscan</code>, each
 *       loaded domain is additionally linked to an InterPro dbxref.</dd>
 *   <dt><code>program-version</code></dt>
 *   <dd>The version of pfam_scan/prosite etc used. Required.</dd>
 *   <dt><code>not-found-not-fatal</code></dt>
 *   <dd>Boolean (<code>true</code> or <code>false</code>). When true, attempting to load domains
 *       for a gene or polypeptide that is missing from the database is logged rather than
 *       being fatal.</dd>
 *   <dt><code>key-type</code></dt>
 *   <dd>Whose possible values are:
 *     <dl>
 *       <dt><code>gene</code></dt><dd>The keys in the input file are gene names. This is the default.</dd>
 *       <dt><code>polypeptide</code></dt><dd>The keys in the input file are polypeptide names
 *           (possibly with colons converted to doubled dots).</dd>
 *     </dl>
 *   </dd>
 * </dl>
 *
 * @author rh11
 * @author te3
 */
public class DomainLoader extends Loader {
    private static final Logger logger = Logger.getLogger(DomainLoader.class);

    /** Value of the <code>program</code> option for which InterPro dbxrefs are attached. */
    private static final String INTERPRO_PROGRAM = "iprscan";

    /** Number of keys loaded between successive flush-and-clear cycles of the session. */
    private static final int KEYS_PER_SESSION_CLEAR = 5;

    // Loader options; populated by processOption() (package-private, as before)
    String analysisProgramVersion;
    String analysisProgram;
    boolean notFoundNotFatal = false;
    Pfam2GoFile pfam2GoFile;

    /** How the keys of the input file are to be interpreted. */
    private enum KeyType { GENE, POLYPEPTIDE }

    private KeyType keyType = KeyType.GENE;

    /** The analysis record created for the current load; set by loadDomainFile(). */
    private Analysis analysis;

    /**
     * Lists the option names this loader understands.
     *
     * @return a fresh, mutable set containing <code>key-type</code>, <code>program-version</code>,
     *         <code>not-found-not-fatal</code> and <code>program</code>
     */
    @Override
    protected Set<String> getOptionNames() {
        Set<String> options = new HashSet<String>();
        Collections.addAll(options, "key-type", "program-version", "not-found-not-fatal", "program");
        return options;
    }

    /**
     * Applies a single option to this loader.
     * Comparisons are written constant-first so that a null name or value is
     * rejected (or stored, for the free-text options) instead of raising a
     * NullPointerException.
     *
     * @param optionName  the name of the option
     * @param optionValue the value supplied for the option; may be null
     * @return true if the option was recognised and its value accepted, false otherwise
     */
    @Override
    protected boolean processOption(String optionName, String optionValue) {
        if ("program-version".equals(optionName)) {
            analysisProgramVersion = optionValue;
            return true;
        }
        if ("program".equals(optionName)) {
            analysisProgram = optionValue;
            return true;
        }
        if ("not-found-not-fatal".equals(optionName)) {
            // Only the exact strings "true" and "false" are accepted.
            if (!"true".equals(optionValue) && !"false".equals(optionValue)) {
                return false;
            }
            notFoundNotFatal = Boolean.parseBoolean(optionValue);
            return true;
        }
        if ("key-type".equals(optionName)) {
            if ("gene".equals(optionValue)) {
                keyType = KeyType.GENE;
                return true;
            }
            if ("polypeptide".equals(optionValue)) {
                keyType = KeyType.POLYPEPTIDE;
                return true;
            }
        }
        return false;
    }

    /**
     * Parses the input stream as a domain file for the configured program and loads it.
     *
     * @param inputStream the domain results to load
     * @param session     the Hibernate session to load into
     * @throws IOException if loading the file fails with an I/O error
     */
    @Override
    protected void doLoad(InputStream inputStream, Session session) throws IOException {
        loadDomainFile(new DomainFile(analysisProgram, inputStream), session);
    }

    /**
     * Persists a new analysis for the configured program and version, loads the
     * domains for every key of the file, and finally asks DeleteRedundantGOTerms
     * to remove redundant GO terms.
     *
     * NOTE(review): this method is invoked directly from doLoad(); whether the
     * Transactional annotation takes effect on such a self-call depends on how
     * transactions are woven in this project - confirm.
     *
     * @param domainFile the parsed domain file
     * @param session    the Hibernate session, flushed and cleared periodically
     * @throws IOException if loading a key fails with an I/O error
     */
    @Transactional
    protected void loadDomainFile(DomainFile domainFile, Session session) throws IOException {
        // Add analysis
        analysis = new Analysis();
        analysis.setProgram(analysisProgram);
        analysis.setProgramVersion(analysisProgramVersion);
        sequenceDao.persist(analysis);

        Collection<String> keys = domainFile.keys();
        int total = keys.size();
        int n = 0;
        for (String key : keys) {
            n++;
            logger.info(String.format("Processing key '%s' [%d/%d]", key, n, total));
            loadKey(domainFile, key);

            /*
             * If the session isn't cleared out every so often, it
             * starts to get pretty slow after a while if we're loading
             * a large file.
             */
            if (n % KEYS_PER_SESSION_CLEAR == 0) {
                session.flush();
                logger.info("Clearing session");
                session.clear();
            }
        }

        // Push out whatever the last (partial) batch of keys left pending, so the
        // cleanup below can see it. The cleanup reports SQLException, so it
        // presumably works at the SQL level - TODO confirm.
        session.flush();

        // Remove redundant GO terms. Failure here is deliberately not fatal to
        // the load, but it must be visible: log it as an error with its stack trace.
        try {
            DeleteRedundantGOTerms.deleteRedundantGOTerms(session);
        } catch (SQLException sqle) {
            logger.error("Failed to remove redundant GO terms", sqle);
        }
    }

    /**
     * Loads every accession group recorded in the file for a single key.
     *
     * @param domainFile the parsed domain file
     * @param key        a gene or polypeptide name, according to the key-type option
     * @throws IOException      if loading a group fails with an I/O error
     * @throws RuntimeException if no polypeptide is found for the key and
     *                          notFoundNotFatal is false
     */
    private void loadKey(DomainFile domainFile, String key) throws IOException {
        Polypeptide polypeptide = getPolypeptideForKey(key);
        if (polypeptide == null) {
            String errorMessage = String.format("Failed to find %s '%s'", keyType, key);
            if (!notFoundNotFatal) {
                throw new RuntimeException(errorMessage);
            }
            logger.error(errorMessage);
            return;
        }

        for (DomainAcc acc : domainFile.accsForKey(key)) {
            logger.debug(String.format("Processing '%s'", acc.getId()));
            loadGroup(domainFile, key, acc, polypeptide);
        }
    }

    /**
     * Looks up the polypeptide that a key of the input file refers to,
     * interpreting the key according to the key-type option.
     *
     * @param key a gene name or (mangled) polypeptide name
     * @return whatever the corresponding lookup returns; callers treat null as "not found"
     */
    private Polypeptide getPolypeptideForKey(String key) {
        switch (keyType) {
        case GENE:
            return getPolypeptideForGene(key);
        case POLYPEPTIDE:
            return getPolypeptideByMangledName(key);
        default:
            throw new RuntimeException("keyType does not take a legitimate value. This should be impossible.");
        }
    }

    /**
     * Creates one polypeptide domain per row of the given key/accession group,
     * attaches the row's GO terms, and - for iprscan results with a non-NULL
     * accession - links each domain to the InterPro dbxref of the accession.
     *
     * @param domainFile  the parsed domain file
     * @param key         the key whose rows are being loaded
     * @param acc         the accession that groups the rows
     * @param polypeptide the polypeptide the domains belong to
     * @throws IOException      declared for callers; propagated from the helpers used
     * @throws RuntimeException if no dbxref can be obtained for a row's database
     */
    private void loadGroup(DomainFile domainFile, String key, DomainAcc acc, Polypeptide polypeptide) throws IOException {
        logger.debug("In loadGroup()");

        DbXRef interProDbxref = null;
        // Constant-first comparison: the program is null if the option was not supplied.
        if (acc != DomainAcc.NULL && INTERPRO_PROGRAM.equals(analysis.getProgram())) {
            logger.debug(String.format("Creating InterPro dbxref for '%s' with description '%s'",
                acc.getId(), acc.getDescription()));
            interProDbxref = objectManager.getDbXRef("InterPro", acc.getId(), acc.getDescription());
        }

        int rowIndex = 0;
        for (DomainRow row : domainFile.rows(key, acc)) {
            logger.debug(row);

            // Insert polypeptide_domain
            DbXRef dbxref = objectManager.getDbXRef(row.db(), row.nativeAcc(), row.nativeDesc());
            if (dbxref == null) {
                throw new RuntimeException(String.format("Could not find database '%s' on line %d", row.db(), row.lineNumber()));
            }

            // Fall back to the row's native accession when the group accession has no id.
            String accessionNumber = acc.getId();
            if (accessionNumber == null) {
                accessionNumber = row.nativeAcc();
            }

            // The first row of a group gets the plain name; later rows get a
            // numeric suffix (1, 2, ...) to keep the unique names distinct.
            String domainUniqueName;
            if (rowIndex == 0) {
                domainUniqueName = String.format("%s:%s:%s",
                    polypeptide.getUniqueName(), row.db(), accessionNumber);
            } else {
                domainUniqueName = String.format("%s:%s:%s:%d",
                    polypeptide.getUniqueName(), row.db(), accessionNumber, rowIndex);
            }

            PolypeptideDomain polypeptideDomain = sequenceDao.createPolypeptideDomain(domainUniqueName, polypeptide,
                row.score(), row.acc().getDescription(), row.fmin(), row.fmax(),
                dbxref, row.evalue(), analysis);

            // add GO terms
            addGoTerms(row.getGoTerms(), polypeptide, polypeptideDomain, row.getGoTermComment());

            // Link to the InterPro dbxref if applicable; it is only ever non-null
            // for iprscan results, so no further program check is needed.
            if (interProDbxref != null) {
                FeatureDbXRef featureDbXRef = new FeatureDbXRef(interProDbxref, polypeptideDomain, true);
                sequenceDao.persist(featureDbXRef);
                polypeptideDomain.addFeatureDbXRef(featureDbXRef);
            }

            rowIndex++;
        }
    }

    /**
     * Creates GO entries on the polypeptide for each of the given GO terms,
     * skipping terms the polypeptide's GO collection already contains.
     * A term that fails with a ParsingException is logged and skipped; the
     * remaining terms are still processed.
     *
     * @param goTerms           the GO terms to add
     * @param polypeptide       the polypeptide that receives the GO entries
     * @param polypeptideDomain the domain the terms were derived from (used for logging)
     * @param comment           the comment passed on to the created GO entries
     */
    private void addGoTerms(Set<GoInstance> goTerms, Polypeptide polypeptide, PolypeptideDomain polypeptideDomain, String comment) {
        for (GoInstance goInstance : goTerms) {
            try {
                if (polypeptide.getGo().contains(goInstance.getId())) {
                    logger.info(String.format("The GO term '%s' has already been added to polypeptide '%s'",
                        goInstance.getId(), polypeptide));
                    continue;
                }

                logger.info(String.format("Creating %s GO term '%s' for domain '%s'",
                    comment, goInstance.getId(), polypeptideDomain.getUniqueName()));

                DbXRef withFrom = null;
                if (goInstance.getWithFrom() != null) {
                    withFrom = featureUtils.findOrCreateDbXRefForWithFrom(goInstance.getWithFrom());
                }

                featureUtils.createGoEntries(polypeptide, goInstance, comment, withFrom);
            } catch (ParsingException e) {
                // Keep going with the other terms, but record which term failed and why.
                logger.error(String.format("Failed to create GO term '%s' for polypeptide '%s'",
                    goInstance.getId(), polypeptide.getUniqueName()), e);
            }
        }
    }

    /**
     * Runs ClearDomains for the given organism and analysis program on the
     * JDBC connection of the current Hibernate session.
     * The session is obtained without permission to create a new one, so a
     * session must already be available for the session factory.
     *
     * @param organismCommonName the common name of the organism, passed to ClearDomains
     * @param analysisProgram    the analysis program name, passed to ClearDomains
     * @throws HibernateException if the work cannot be executed on the session
     * @throws SQLException       declared for callers of the clearing operation
     */
    @Transactional
    void clear(final String organismCommonName, final String analysisProgram) throws HibernateException, SQLException {
        Session session = SessionFactoryUtils.getSession(sessionFactory, false);
        session.doWork(new Work() {
            public void execute(Connection connection) throws SQLException {
                new ClearDomains(connection, organismCommonName, analysisProgram).clear();
            }
        });
    }
}