package org.genedb.db.loading.auxiliary;
import org.genedb.db.dao.CvDao;
import org.genedb.db.dao.GeneralDao;
import org.genedb.db.dao.PubDao;
import org.genedb.db.dao.SequenceDao;
import org.genedb.db.loading.FeatureUtils;
import org.gmod.schema.feature.AbstractGene;
import org.gmod.schema.feature.Polypeptide;
import org.gmod.schema.feature.ProductiveTranscript;
import org.gmod.schema.feature.Transcript;
import org.gmod.schema.utils.ObjectManager;
import org.apache.log4j.Logger;
import org.hibernate.Session;
import org.hibernate.SessionFactory;
import org.springframework.orm.hibernate3.SessionFactoryUtils;
import org.springframework.transaction.annotation.Transactional;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.Collections;
import java.util.Set;
/**
 * Base class for auxiliary data loaders.
 *
 * <p>A concrete loader implements {@link #doLoad(InputStream, Session)} and may
 * optionally declare command-line options by overriding {@link #getOptionNames()}
 * and {@link #processOption(String, String)}. The DAOs, the {@link FeatureUtils},
 * the {@link SessionFactory} and the {@link ObjectManager} are supplied through
 * the setters below; {@link #afterPropertiesSet()} then hands the DAOs on to the
 * object manager.
 */
public abstract class Loader {
    private static final Logger logger = Logger.getLogger(Loader.class);

    protected GeneralDao generalDao;
    protected SequenceDao sequenceDao;
    protected CvDao cvDao;
    protected PubDao pubDao;
    protected FeatureUtils featureUtils;
    protected SessionFactory sessionFactory;
    protected ObjectManager objectManager;

    /**
     * What options does this loader accept?
     *
     * @return a set of option names. The default implementation returns an
     *         empty set, i.e. no options are accepted.
     */
    protected Set<String> getOptionNames() {
        return Collections.emptySet();
    }

    /**
     * Does this loader expect input files?
     *
     * @return <code>true</code> if so, <code>false</code> if not.
     *         The default implementation returns <code>true</code>.
     */
    protected boolean loadsFromFile() {
        return true;
    }

    /**
     * Pass the specified option to the loader, if it is a valid option.
     * This is used by {@link Load} to pass command-line options to the loader.
     *
     * @param optionName the option name
     * @param optionValue the option value
     * @return <code>true</code> if the option was valid and successfully processed,
     *         <code>false</code> if invalid.
     */
    protected boolean processOptionIfValid(String optionName, String optionValue) {
        if (getOptionNames().contains(optionName)) {
            return processOption(optionName, optionValue);
        }
        return false;
    }

    /**
     * Process a single option whose name is one of {@link #getOptionNames()}.
     * Loaders that declare any option names must override this method; the
     * default implementation always throws.
     *
     * @param optionName the option name
     * @param optionValue the option value
     * @return the value that {@link #processOptionIfValid(String, String)}
     *         passes back to its caller
     * @throws IllegalStateException always, in this default implementation
     */
    protected boolean processOption(@SuppressWarnings("unused") String optionName, @SuppressWarnings("unused") String optionValue) {
        throw new IllegalStateException("processOption() must be overridden if options are specified");
    }

    /**
     * Transactional entry point: looks up the Hibernate session for
     * {@link #sessionFactory} (without asking for a new one to be created)
     * and delegates to {@link #doLoad(InputStream, Session)}. The transaction
     * is configured to roll back on {@link IOException}.
     *
     * @param inputStream the stream handed on to <code>doLoad</code>
     * @throws IOException if <code>doLoad</code> throws it
     */
    @Transactional(rollbackFor=IOException.class)
    void load(InputStream inputStream) throws IOException {
        Session session = SessionFactoryUtils.getSession(sessionFactory, false);
        doLoad(inputStream, session);
    }

    /**
     * Load data. This method must be implemented by all implementing classes.
     * It will be called once for each input file.
     *
     * @param inputStream a stream from which the input data may be read, or <code>null</code>
     *          if {@link #loadsFromFile()} returns <code>false</code>
     * @param session the Hibernate session
     * @throws IOException
     */
    protected abstract void doLoad(InputStream inputStream, Session session)
        throws IOException;

    public void setFeatureUtils(FeatureUtils featureUtils) {
        this.featureUtils = featureUtils;
    }

    public void setSequenceDao(SequenceDao sequenceDao) {
        this.sequenceDao = sequenceDao;
    }

    public void setGeneralDao(GeneralDao generalDao) {
        this.generalDao = generalDao;
    }

    public void setCvDao(CvDao cvDao) {
        this.cvDao = cvDao;
    }

    public void setPubDao(PubDao pubDao) {
        this.pubDao = pubDao;
    }

    public void setSessionFactory(SessionFactory sessionFactory) {
        this.sessionFactory = sessionFactory;
    }

    public void setObjectManager(ObjectManager objectManager) {
        this.objectManager = objectManager;
    }

    public SessionFactory getSessionFactory() {
        return sessionFactory;
    }

    /**
     * Passes the injected DAOs on to the {@link ObjectManager}. Must be called
     * after all the setters above have been invoked.
     */
    public void afterPropertiesSet() {
        /*
         * We cannot set the DAOs of the objectManager
         * directly in Load.xml, because that creates a circular
         * reference that (understandably) causes Spring to
         * throw a tantrum. Thus we inject them into
         * here, and pass them to the ObjectManager after Spring
         * configuration.
         */
        objectManager.setGeneralDao(generalDao);
        objectManager.setCvDao(cvDao);
        objectManager.setPubDao(pubDao);
    }

    /**
     * Find the polypeptide associated with the named gene.
     *
     * <p>The gene is looked up by unique name; among its transcripts, the
     * {@link ProductiveTranscript} with the smallest feature ID is selected and
     * its protein returned. If the gene has more than one productive transcript
     * an error is logged, but the selected transcript's protein is still returned.
     *
     * @param geneUniqueName the unique name of the gene
     * @return the protein of the selected transcript, or <code>null</code>
     *         (after logging an error) if the gene is not found, has no
     *         transcripts, or has no productive transcripts
     */
    protected Polypeptide getPolypeptideForGene(String geneUniqueName) {
        AbstractGene gene = sequenceDao.getFeatureByUniqueName(geneUniqueName,
            AbstractGene.class);
        if (gene == null) {
            logger.error(String.format("Gene '%s' not found in database", geneUniqueName));
            return null;
        }

        Collection<Transcript> transcripts = gene.getTranscripts();
        if (transcripts.isEmpty()) {
            logger.error(String.format("Gene '%s' has no transcripts", geneUniqueName));
            return null;
        }

        // Select the coding transcript with the least feature_id,
        // logging an error if there's more than one.
        ProductiveTranscript selectedTranscript = null;
        int numberOfProductiveTranscripts = 0;
        for (Transcript transcript : transcripts) {
            if (transcript instanceof ProductiveTranscript) {
                ++numberOfProductiveTranscripts;
                if (selectedTranscript == null
                        || transcript.getFeatureId() < selectedTranscript.getFeatureId()) {
                    selectedTranscript = (ProductiveTranscript) transcript;
                }
            }
        }

        if (selectedTranscript == null) {
            // The message is a format string, so it must go through String.format
            // for the gene name to appear in the log.
            logger.error(String.format("Gene '%s' has no coding transcripts.", geneUniqueName));
            return null;
        }

        if (numberOfProductiveTranscripts > 1) {
            logger.error(String.format("The gene '%s' is alternatively spliced: " +
                "we don't know to which transcript the protein data applies.\n" +
                "We've selected '%s', the first coding transcript by loading " +
                "order: there's no reason to believe that is right!",
                geneUniqueName, selectedTranscript.getUniqueName()));
        }

        return selectedTranscript.getProtein();
    }

    /**
     * If ever anything deserved an explanation, this does!
     *
     * InterPro doesn't allow the sequence names in its input FASTA
     * files to contain colons: it seems to treat the colon as a
     * separator of some sort. Therefore, when the polypeptide
     * sequences are extracted, colons in the polypeptide name
     * are transliterated to dots. Unfortunately it didn't occur
     * to me at the time that this transformation is not easily
     * reversible, given that the names can and do contain dots.
     *
     * What we do, therefore, is to successively change dots into
     * colons from right to left, until we find the thus-named
     * polypeptide in the database. One can imagine
     * plausible-looking pairs of names for which this procedure
     * would fail: for example, suppose we had genes called XY001023
     * and XY001023.1, which might be possible under some naming scheme.
     * If the former is alternatively spliced, the polypeptide produced
     * from its second transcript would be XY001023:1:pep; the polypeptide
     * produced from (the first transcript of) the latter would be XY001023.1:pep.
     *
     * To be on the safe side, in future we'll translate a colon into a
     * double dot (laying the colon on its side, as it were). To allow for
     * this, the very first thing we do is to check whether the supplied
     * mangled name contains a double dot. (If it does, we assume that
     * the double-dot translation has been used.)
     *
     * TODO When the present, complicated, logic is no longer needed, remove
     * it and this explanation, and just use the double-dot translation.
     *
     * @param mangledName the polypeptide name with its colons replaced
     *          either by single dots or by double dots
     * @return the result of the first lookup that finds a polypeptide; if no
     *         dot-to-colon candidate is found, the result of looking up
     *         <code>mangledName</code> unchanged
     */
    protected Polypeptide getPolypeptideByMangledName(String mangledName) {
        if (mangledName.contains("..")) {
            return sequenceDao.getFeatureByUniqueName(mangledName.replaceAll("\\.\\.", ":"), Polypeptide.class);
        }

        // Work from the rightmost dot leftwards, turning one more dot into a
        // colon on each pass. The replacements are cumulative. A dot at index 0
        // is never replaced, since the loop requires a strictly positive index.
        StringBuilder name = new StringBuilder(mangledName);
        int lastDot;
        while (0 < (lastDot = name.lastIndexOf("."))) {
            name.setCharAt(lastDot, ':');
            if (logger.isDebugEnabled()) {
                logger.debug(String.format("Looking for polypeptide feature '%s'", name));
            }
            Polypeptide polypeptide = sequenceDao.getFeatureByUniqueName(name.toString(), Polypeptide.class);
            if (polypeptide != null) {
                return polypeptide;
            }
        }

        // No candidate matched: fall back to the name exactly as supplied.
        return sequenceDao.getFeatureByUniqueName(mangledName, Polypeptide.class);
    }
}