package org.genedb.db.loading;
import org.genedb.db.dao.CvDao;
import org.genedb.db.dao.GeneralDao;
import org.genedb.db.dao.OrganismDao;
import org.genedb.db.dao.PubDao;
import org.genedb.util.Counters;
import org.genedb.util.IterableArray;
import org.gmod.schema.cfg.FeatureTypeUtils;
import org.gmod.schema.feature.AbstractExon;
import org.gmod.schema.feature.AbstractGene;
import org.gmod.schema.feature.Centromere;
import org.gmod.schema.feature.Contig;
import org.gmod.schema.feature.DirectRepeatRegion;
import org.gmod.schema.feature.FivePrimeUTR;
import org.gmod.schema.feature.Gap;
import org.gmod.schema.feature.Gene;
import org.gmod.schema.feature.InvertedRepeatRegion;
import org.gmod.schema.feature.MRNA;
import org.gmod.schema.feature.NcRNA;
import org.gmod.schema.feature.Polypeptide;
import org.gmod.schema.feature.PolypeptideMotif;
import org.gmod.schema.feature.ProductiveTranscript;
import org.gmod.schema.feature.Pseudogene;
import org.gmod.schema.feature.PseudogenicTranscript;
import org.gmod.schema.feature.RRNA;
import org.gmod.schema.feature.Region;
import org.gmod.schema.feature.RepeatRegion;
import org.gmod.schema.feature.RepeatUnit;
import org.gmod.schema.feature.SECISElement;
import org.gmod.schema.feature.SnRNA;
import org.gmod.schema.feature.SnoRNA;
import org.gmod.schema.feature.Supercontig;
import org.gmod.schema.feature.TRNA;
import org.gmod.schema.feature.ThreePrimeUTR;
import org.gmod.schema.feature.TopLevelFeature;
import org.gmod.schema.feature.Transcript;
import org.gmod.schema.feature.UTR;
import org.gmod.schema.mapped.Analysis;
import org.gmod.schema.mapped.DbXRef;
import org.gmod.schema.mapped.Feature;
import org.gmod.schema.mapped.FeatureCvTerm;
import org.gmod.schema.mapped.HasPubsAndDbXRefs;
import org.gmod.schema.mapped.Organism;
import org.gmod.schema.mapped.Pub;
import org.gmod.schema.mapped.PubDbXRef;
import org.gmod.schema.mapped.Synonym;
import org.gmod.schema.utils.ObjectManager;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.hibernate.Session;
import org.hibernate.SessionFactory;
import org.hibernate.criterion.Restrictions;
import org.springframework.orm.hibernate3.SessionFactoryUtils;
import org.springframework.transaction.annotation.Transactional;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Deals with loading an organism from an EMBL file into a Chado database.
* It's expected to be configured as a singleton Spring bean. The main
* calling point (and only public method, apart from the property setters)
* is {@link #load(EmblFile)}.
*
* @author rh11
*
*/
class EmblLoader {
private static final Logger logger = Logger.getLogger(EmblLoader.class);
// Constants
/**
* A unique number should be appended to the uniquename of the singly
* spliced transcript features in the database. We must do this because
* Artemis requires each feature to have a globally unique name, and does
* not work correctly if a transcript has the same uniquename as its gene.
* Similarly the GFF3 feature format requires feature names to be
* globally unique, and we want to be able to export our data in GFF3 format.
*
* For alternatively-spliced genes, on the other hand, there is no need to
* append the transcript type, because the transcript will have an assigned
* uniquename (the /systematic_id of the CDS) that is different from the
* uniquename of the gene (the /shared_id of the CDS).
*/
//private enum AppendType { ALWAYS, NEVER, SINGLY_SPLICED_ONLY };
//private static final AppendType APPEND_TYPE_TO_TRANSCRIPT_UNIQUENAME = AppendType.SINGLY_SPLICED_ONLY;
// Injected beans
private CvDao cvDao;
private GeneralDao generalDao;
// Used by setOrganismCommonName to look up the target organism.
private OrganismDao organismDao;
private PubDao pubDao;
// Supplies DbXRef and Pub objects (see loadCentromere) and is flushed by init().
private ObjectManager objectManager; // See #afterPropertiesSet()
// Source of the Hibernate session obtained in doLoad and getTopLevelFeature.
private SessionFactory sessionFactory;
private FeatureUtils featureUtils;
// Created here rather than injected; handed the current session at the start of doLoad.
private SynonymManager synonymManager = new SynonymManager();
// Configurable parameters
// The organism to load into; resolved by setOrganismCommonName.
private Organism organism;
// Class used when doLoad has to create a new top-level feature.
private Class<? extends TopLevelFeature> topLevelFeatureClass = Supercontig.class;
// When true, loadFeatures logs a DataError and moves on to the next feature instead of rethrowing.
private boolean continueOnError = false;
// Policy applied by getTopLevelFeature when a feature with the same unique name already exists.
public enum OverwriteExisting {YES, NO, MERGE}
private OverwriteExisting overwriteExisting = OverwriteExisting.NO;
private boolean sloppyControlledCuration = false;
// When true (and the file has a feature table), load() reports unused qualifiers after loading.
private boolean reportUnusedQualifiers = true;
private boolean goTermErrorsAreNotFatal = false;
// Ignore rules collected by ignoreFeature/ignoreQualifier; copied onto each file's
// FeatureTable by propagateIgnoreFeaturesAndQualifiers.
private Collection<String> ignoredFeatures = new HashSet<String>();
private Collection<String> ignoredQualifiers = new HashSet<String>();
// Keyed by feature type; each value is the set of qualifier names ignored on that type.
private Map<String,Collection<String>> ignoredQualifiersByFeatureType = new HashMap<String,Collection<String>>();
/**
* Set the organism into which to load data.
*
* @param organismCommonName the common name of the organism
*/
public void setOrganismCommonName(String organismCommonName) {
    // The field is assigned before it is validated, so a failed lookup
    // leaves this loader with a null organism as well as throwing.
    organism = organismDao.getOrganismByCommonName(organismCommonName);
    if (organism != null) {
        return;
    }
    throw new IllegalArgumentException(String.format("Organism '%s' not found", organismCommonName));
}
/**
* Set the class of top-level feature that this EMBL file represents.
* The default, if this method is not called, is <code>Supercontig</code>.
*
* @param topLevelFeatureClass
*/
public void setTopLevelFeatureClass(Class<? extends TopLevelFeature> topLevelFeatureClass) {
// Read by doLoad when it has to create a new top-level feature.
this.topLevelFeatureClass = topLevelFeatureClass;
}
/**
* Whether we should overwrite an existing top-level feature if it has
* the same name as the one specified in this file. The default, if this
* method is not called, is <code>NO</code>.
*
* If overwriteExisting is <code>NO</code>, the file will be skipped on the
* grounds that it's already loaded. If it's <code>YES</code>, the previously
* existing top-level feature, and features located on it, will
* be deleted first. If it's <code>MERGE</code>, the existing top-level feature
* and all features located on it will be retained, and any features
* specified in this file will be loaded in addition.
*
* @param overwriteExisting <code>YES</code> if we should overwrite an
* existing top-level feature, <code>NO</code> if not, or <code>MERGE</code>
* if we should merge the contents of this file with an existing feature.
*/
public void setOverwriteExisting(OverwriteExisting overwriteExisting) {
// Stored without validation; getTopLevelFeature consults it on every load.
this.overwriteExisting = overwriteExisting;
}
// Returns the overwrite policy as last set, or the field's initial value (NO) if it was never set.
public OverwriteExisting getOverwriteExisting() {
return this.overwriteExisting;
}
/**
* Whether to deal with controlled_curation qualifiers that don't have the expected
* format. The default, if this method is not called, is <code>false</code>.
*
* If set to true, we simply parse any dbxref from the /controlled_curation qualifier,
* and add the complete text as a /curation qualifier.
*
* @param sloppyControlledCuration
*/
public void setSloppyControlledCuration(boolean sloppyControlledCuration) {
// NOTE(review): nothing in the visible part of this class reads this flag;
// presumably the gene-loading code further down does - confirm.
this.sloppyControlledCuration = sloppyControlledCuration;
}
/**
* Whether GO term errors - in particular the case where the database does not contain
* a term with the specified accession number - should be logged and ignored rather than
* fatal.
*
* @param goTermErrorsAreNotFatal
*/
public void setGoTermErrorsAreNotFatal(boolean goTermErrorsAreNotFatal) {
// NOTE(review): nothing in the visible part of this class reads this flag;
// presumably the GO-term handling further down does - confirm.
this.goTermErrorsAreNotFatal = goTermErrorsAreNotFatal;
}
/**
* Whether we should continue if we encounter an error while loading a feature.
* You should not usually set this option; it can be useful if you need to load
* a file quickly and don't mind if some features are missing from the result.
* <p>
* In particular, you should <b>not</b> use this option when loading production
* data!
*
* @param continueOnError
*/
public void setContinueOnError(boolean continueOnError) {
    this.continueOnError = continueOnError;
    if (!continueOnError) {
        return;
    }
    // Make it obvious in the log that errors will not abort the load.
    logger.warn("We will continue if an error is encountered loading a feature");
}
/**
* Whether we should log a list of unused qualifiers once the file has been loaded.
* If set to true, this list is logged as a series of WARN messages, one for each
* type of feature encountered in the file that has unused qualifiers. The default
* value is <code>true</code>.
*
* @param reportUnusedQualifiers
*/
public void setReportUnusedQualifiers(boolean reportUnusedQualifiers) {
// Checked by load() once the file has been loaded.
this.reportUnusedQualifiers = reportUnusedQualifiers;
}
/**
* Ignore features of the named type.
*
* @param featureType the name of the feature type to ignore
*/
public void ignoreFeature(String featureType) {
// Only recorded here; propagateIgnoreFeaturesAndQualifiers copies it onto
// the FeatureTable of each file passed to load().
ignoredFeatures.add(featureType);
}
/**
* Ignore the named qualifier.
*
* @param qualifier the name of the qualifier to ignore
*/
public void ignoreQualifier(String qualifier) {
// Only recorded here; propagateIgnoreFeaturesAndQualifiers copies it onto
// the FeatureTable of each file passed to load().
ignoredQualifiers.add(qualifier);
}
/**
* Ignore the named qualifier when it appears on a feature of the specified type.
*
* @param qualifier the name of the qualifier to ignore
* @param featureType the type of feature on which to ignore the named qualifier
*/
public void ignoreQualifier(String qualifier, String featureType) {
    synchronized (ignoredQualifiersByFeatureType) {
        // Fetch the per-type set, creating and registering it on first use.
        Collection<String> qualifiersForType = ignoredQualifiersByFeatureType.get(featureType);
        if (qualifiersForType == null) {
            qualifiersForType = new HashSet<String>();
            ignoredQualifiersByFeatureType.put(featureType, qualifiersForType);
        }
        qualifiersForType.add(qualifier);
    }
}
/**
* Copy the ignore rules configured on this loader onto the feature table
* of the file that is about to be loaded.
* <p>
* An EMBL file may have no feature table at all, in which case there is
* nothing to configure and this method returns immediately.
*
* @param featureTable the feature table of the file being loaded; may be <code>null</code>
*/
private void propagateIgnoreFeaturesAndQualifiers(FeatureTable featureTable) {
    if (featureTable == null) {
        // Without this guard, a file with no feature table caused a
        // NullPointerException as soon as any ignore rule was configured.
        return;
    }
    for (String featureType: ignoredFeatures) {
        featureTable.ignoreFeature(featureType);
    }
    for (String qualifier: ignoredQualifiers) {
        featureTable.ignoreQualifier(qualifier);
    }
    for (Map.Entry<String,Collection<String>> entry: ignoredQualifiersByFeatureType.entrySet()) {
        String featureType = entry.getKey();
        for (String qualifier: entry.getValue()) {
            featureTable.ignoreQualifier(qualifier, featureType);
        }
    }
}
// Hibernate session for the load in progress; assigned at the start of doLoad.
private Session session;
/**
* The main calling point for this class. Takes a parsed EMBL file, and loads
* it into the database. Each call to this method constitutes a separate
* Hibernate transaction. Even though the EMBL file has been parsed before
* this method is called, its internal consistency has not been verified.
* If we encounter a problem, a <code>DataError</code> is thrown and the
* transaction is rolled back.
*
* @param emblFile the parsed EMBL file
* @throws DataError if a data problem is discovered
*/
public void load(EmblFile emblFile) throws DataError {
    propagateIgnoreFeaturesAndQualifiers(emblFile.getFeatureTable());

    // Work out whether we are creating, replacing or merging onto the
    // top-level feature. A policy violation is logged and the file skipped.
    final TopLevelFeature existingTopLevelFeature;
    try {
        existingTopLevelFeature = getTopLevelFeature(emblFile.getAccession());
    } catch (TopLevelFeatureException e) {
        logger.error(e.getMessage());
        return;
    }

    doLoad(emblFile, existingTopLevelFeature);

    /* Unused qualifiers can only be reported if there is a feature table.
     * Some of our EMBL files have no features (e.g. Etenella contigs), and
     * looking in a missing feature table for unused qualifiers made the
     * loader fail; hence the null check on the feature table below
     * (reportUnusedQualifiers is true by default).
     * nds 26 august 2010
     */
    if (!reportUnusedQualifiers || emblFile.getFeatureTable() == null) {
        logger.debug("Not reporting on unused qualifiers");
        return;
    }
    reportUnusedQualifiers(emblFile.getFeatureTable());
}
@Transactional(rollbackFor=DataError.class) // Will also rollback for runtime exceptions, by default
private void doLoad(EmblFile emblFile, TopLevelFeature topLevelFeature) throws DataError {
    /*
     * The @Transactional annotation is relied upon to have Spring open a
     * transaction around this call: committed on a normal return, rolled
     * back if an exception escapes.
     */
    session = SessionFactoryUtils.doGetSession(sessionFactory, false);
    synonymManager.startSession(session);

    taxonomicDivision = emblFile.getTaxonomicDivision();
    logger.trace("taxonomicDivision = " + taxonomicDivision);

    if (topLevelFeature != null) {
        // Re-attach the feature found by getTopLevelFeature to this session.
        topLevelFeature = (TopLevelFeature) session.merge(topLevelFeature);
    } else {
        final String accession = emblFile.getAccession();
        logger.info("Creating topLevelFeature: " + accession);
        topLevelFeature = TopLevelFeature.make(topLevelFeatureClass, accession, organism);
        topLevelFeature.markAsTopLevelFeature();
        topLevelFeature.setResidues(emblFile.getSequence());
        session.persist(topLevelFeature);

        // The organism now has data loaded: flag it, unless it already is.
        organism = (Organism) session.merge(organism);
        if (!organism.isPopulated()) {
            logger.info(String.format("Marking organism '%s' as populated", organism));
            session.persist(organism.addProperty("genedb_misc", "populated"));
        }
    }

    init(topLevelFeature);

    final EmblLocation.Join contigLocations = emblFile.getContigLocations();
    if (contigLocations != null) {
        loadContigsAndGaps(contigLocations);
    }
    loadFeatures(emblFile.getFeatureTable());
}
/**
* This exception is thrown by the <code>getTopLevelFeature</code> method
*
* @author rh11
*
*/
private static class TopLevelFeatureException extends Exception {
// Checked exception: load() catches it, logs the message and returns without loading the file.
public TopLevelFeatureException(String message) {
super(message);
}
}
/**
* Get the top-level feature onto which we should add our features, based on
* the <code>overwriteExisting</code> policy. In the commonest case, overwriteExisting
* will have its default value of <code>NO</code>, and this method will either return
* <code>null</code> if the feature doesn't already exist or throw an exception if it
* does.
*
* @param uniqueName the unique name of the top-level feature we are loading
* @return the existing top-level feature to use, or <code>null</code> if we should create a new one.
* @throws TopLevelFeatureException if there is a problem; i.e. the feature exists when it shouldn't,
* or fails to exist (or isn't a top-level feature) when it should.
*/
@Transactional(rollbackFor=TopLevelFeatureException.class) // Will also rollback for runtime exceptions, by default
private TopLevelFeature getTopLevelFeature(String uniqueName)
        throws TopLevelFeatureException {
    Session session = SessionFactoryUtils.doGetSession(sessionFactory, false);
    Feature existing = (Feature) session.createCriteria(Feature.class)
        .add(Restrictions.eq("organism", organism))
        .add(Restrictions.eq("uniqueName", uniqueName))
        .uniqueResult();

    // Nothing there yet: fine unless we were asked to merge onto it.
    if (existing == null) {
        if (overwriteExisting == OverwriteExisting.MERGE) {
            throw new TopLevelFeatureException(String.format("Cannot MERGE because feature '%s' does not exist", uniqueName));
        }
        session.flush();
        return null;
    }

    final boolean existingIsTopLevel = existing instanceof TopLevelFeature;
    switch (overwriteExisting) {
    case NO:
        throw new TopLevelFeatureException(String.format("The organism '%s' already has feature '%s'",
            organism.getCommonName(), uniqueName));
    case MERGE:
        if (!existingIsTopLevel) {
            throw new TopLevelFeatureException(String.format("We can't merge onto the feature '%s', because it's not a top-level feature",
                existing.getUniqueName()));
        }
        return (TopLevelFeature) existing;
    case YES:
        logger.trace(String.format("Deleting existing feature '%s' (ID=%d)",
            existing.getUniqueName(), existing.getFeatureId()));
        if (!existingIsTopLevel) {
            // Deleted regardless; the warning just records the oddity.
            logger.warn(String.format("Existing feature is %s, not a top-level feature",
                existing.getClass()));
        }
        existing.delete();
        break;
    }

    // Reached after a delete: flush so the deletion is sent to the database
    // before the caller creates the replacement feature.
    session.flush();
    return null;
}
// Per-file working state. Set in doLoad; the collections are cleared by init().
// Taxonomic division reported by the EMBL file being loaded.
private String taxonomicDivision;
// The top-level feature onto which the current file's features are located.
private TopLevelFeature topLevelFeature;
private Map<String,AbstractGene> genesByUniqueName = new HashMap<String,AbstractGene>();
private Map<String,Transcript> transcriptsByUniqueName = new HashMap<String,Transcript>();
// Contigs keyed by their start position on the top-level feature (see loadContigsAndGaps);
// locate() uses a floor lookup on this map to find the contig that may contain a feature.
private NavigableMap<Integer,Contig> contigsByStart = new TreeMap<Integer,Contig>();
// Unique names already generated for repeat regions/units in this file, used to skip duplicates.
private Set<String> repeatRegionUniqueNames = new HashSet<String>();
private Set<String> repeatUnitUniqueNames = new HashSet<String>();
private Map<String,Integer> syntheticNcRNAIndexByType = new HashMap<String,Integer>();
/**
* We want to create a single Analysis object/row for each distinct analysis program
* referenced in /similarity qualifiers in this file. These are stored in this map.
*/
private Map<String,Analysis> similarityAnalysisByProgram = new HashMap<String,Analysis>();
/**
* Reset all our local state: necessary if the user retries after an error,
* or if the same EmblLoader object is used more than once (to load more than one file).
*
* @param topLevelFeature
*/
private void init(TopLevelFeature topLevelFeature) {
    if (topLevelFeature == null) {
        throw new IllegalArgumentException("topLevelFeature cannot be null");
    }
    this.topLevelFeature = topLevelFeature;

    // Lookup tables built up while loading a file
    genesByUniqueName.clear();
    transcriptsByUniqueName.clear();
    contigsByStart.clear();
    similarityAnalysisByProgram.clear();

    // Duplicate-detection sets and counters behind generated unique names
    repeatRegionUniqueNames.clear();
    repeatUnitUniqueNames.clear();
    syntheticNcRNAIndexByType.clear();
    archivedFeatureIndexes.clear();
    motifIndex = 1;

    objectManager.flush();
}
private void loadContigsAndGaps(EmblLocation.Join locations) throws DataError {
    int offset = 0; // Interbase position on topLevelFeature of the next contig or gap
    for (EmblLocation part : locations.locations) {
        final EmblLocation.External contigSource;
        final String contigName;

        if (part instanceof EmblLocation.External) {
            contigSource = (EmblLocation.External) part;
            contigName = contigSource.accession;
        } else if (part instanceof EmblLocation.Complement) {
            // A complemented entry must wrap an external reference; the
            // contig gets a "_reversed" suffix on its unique name.
            EmblLocation wrapped = ((EmblLocation.Complement) part).location;
            if (!(wrapped instanceof EmblLocation.External)) {
                throw new DataError("The CO section should contain only external references and gaps");
            }
            contigSource = (EmblLocation.External) wrapped;
            contigName = contigSource.accession + "_reversed";
        } else if (part instanceof EmblLocation.Gap) {
            int gapLength = ((EmblLocation.Gap) part).getLength();
            logger.debug(String.format("Creating gap at %d-%d", offset, offset + gapLength));
            Gap gap = topLevelFeature.addGap(offset, offset + gapLength);
            session.persist(gap);
            offset += gapLength;
            continue;
        } else {
            throw new DataError("The CO section should contain only external references and gaps");
        }

        // Common tail for plain and complemented contigs
        int contigLength = contigSource.simple.getLength();
        Contig contig = createContig(offset, contigLength, contigName);
        contigsByStart.put(offset, contig);
        offset += contigLength;
    }
}
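/**
* Create a contig covering a stretch of the top-level feature, persist it,
* and locate it on the top-level feature.
*
* @param pos the start position of the contig on the top-level feature
* @param contigLength the length of the contig
* @param contigUniqueName the unique name to give the contig
* @return the newly-created contig
*/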
private Contig createContig(int pos, int contigLength, String contigUniqueName) {
    final int end = pos + contigLength;
    logger.debug(String.format("Creating contig '%s' at %d-%d", contigUniqueName, pos, end));

    Contig created = TopLevelFeature.make(Contig.class, contigUniqueName, organism);
    // The contig's sequence is the matching slice of the top-level sequence.
    created.setResidues(topLevelFeature.getResidues(pos, end));
    session.persist(created);
    topLevelFeature.addLocatedChild(created, pos, end, (short) 0, 0);
    return created;
}
private void locate(Feature feature, EmblLocation location) {
    // Unpack the EMBL location and delegate; no phase is supplied.
    final int fmin = location.getFmin();
    final int fmax = location.getFmax();
    final short strand = (short) location.getStrand();
    locate(feature, fmin, fmax, strand, null);
}
/**
* Locate a feature on the top-level feature and, if it lies wholly within
* one of the contigs registered in <code>contigsByStart</code>, on that
* contig as well (in contig-relative coordinates).
*
* @param feature the feature to locate
* @param fmin start of the feature on the top-level feature
* @param fmax end of the feature on the top-level feature
* @param strand the strand of the feature
* @param phase the phase, or <code>null</code> if not applicable
*/
private void locate(Feature feature, int fmin, int fmax, short strand, Integer phase) {
    topLevelFeature.addLocatedChild(feature, fmin, fmax, strand, phase);

    // floorEntry returns null both for an empty map and when every contig
    // starts after fmin (e.g. the scaffold begins with a gap). The entry
    // must therefore be null-checked before it is dereferenced.
    Map.Entry<Integer,Contig> entry = contigsByStart.floorEntry(fmin);
    Contig contig = (entry == null) ? null : entry.getValue();
    if (contig == null || fmax > contig.getFmax()) {
        logger.debug(String.format("The feature '%s' (%s) is not contained in a contig",
            feature.getUniqueName(), feature.getName()));
        return;
    }
    logger.debug(String.format("The feature '%s' lies on contig '%s'", feature.getUniqueName(), contig.getUniqueName()));
    contig.addLocatedChild(feature, fmin - contig.getFmin(), fmax - contig.getFmin(), strand, phase, 1, 1);
}
private void loadFeatures(FeatureTable featureTable) throws DataError {
    if (featureTable == null) {
        logger.error("No feature table found!");
        return;
    }

    // First pass: load everything except UTRs, which loadFeature sets aside
    // in this list.
    final List<FeatureTable.Feature> deferredUtrs = new ArrayList<FeatureTable.Feature>();
    for (FeatureTable.Feature current : featureTable.getFeatures()) {
        try {
            loadFeature(deferredUtrs, current);
        } catch (DataError e) {
            e.setLineNumber(current.lineNumber);
            if (!continueOnError) {
                throw e;
            }
            logger.error("Continuing after error", e);
        }
    }

    // Second pass: the UTRs. An error here is always rethrown, whatever the
    // continueOnError setting.
    for (FeatureTable.Feature utr : deferredUtrs) {
        try {
            loadUTR(utr);
        } catch (DataError e) {
            logger.debug("Caught DataError while loading UTR. Setting line number to " + utr.lineNumber);
            e.setLineNumber(utr.lineNumber);
            throw e;
        }
    }
}
/**
* Load a single feature from the feature table, dispatching on its type.
* <p>
* UTR features are not loaded here: they are appended to <code>utrs</code>
* for the caller to load afterwards. Feature types with no dedicated
* handler are archived. When a handler returns a feature, any qualifiers
* left unused on the EMBL feature are archived onto it.
*
* @param utrs list to which 3'UTR and 5'UTR features are appended
* @param feature the EMBL feature to load
* @throws DataError if the location has fmax before fmin, if the feature is
*          an LTR (not yet supported), or if the type-specific loader fails
*/
private void loadFeature(List<FeatureTable.Feature> utrs,
        FeatureTable.Feature feature) throws DataError {
    String featureType = feature.type;
    if (feature.location.getFmax() < feature.location.getFmin()) {
        throw new DataError("Location has fmax before fmin");
    }

    Feature focalFeature = null;
    if (featureType.equals("repeat_region")) {
        focalFeature = loadRepeatRegion(feature);
    }
    else if (featureType.equals("repeat_unit")) {
        focalFeature = loadRepeatUnit(feature);
    }
    else if (featureType.equals("CDS")) {
        focalFeature = loadCDS((FeatureTable.CDSFeature) feature);
    }
    else if (featureType.equals("tRNA")) {
        focalFeature = loadNcRNA(TRNA.class, featureType, feature);
    }
    else if (featureType.equals("rRNA")) {
        focalFeature = loadNcRNA(RRNA.class, featureType, feature);
    }
    else if (featureType.equals("snRNA")) {
        focalFeature = loadNcRNA(SnRNA.class, featureType, feature);
    }
    else if (featureType.equals("snoRNA")) {
        focalFeature = loadNcRNA(SnoRNA.class, featureType, feature);
    }
    else if (featureType.equals("misc_RNA") || featureType.equals("ncRNA")) {
        focalFeature = loadNcRNA(NcRNA.class, featureType, feature);
    }
    else if (featureType.equals("3'UTR") || featureType.equals("5'UTR")) {
        // Deferred: the caller loads UTRs after all other features.
        utrs.add(feature);
    }
    else if (featureType.equals("gap")) {
        focalFeature = loadGap(feature);
    }
    else if (featureType.equals("CDS_motif")) {
        focalFeature = loadMotif(feature);
    }
    else if (featureType.equals("LTR")) {
        // TODO LTR features are not supported yet
        throw new DataError("Tell Robin he needs to write code for loading LTR features!");
    }
    else if (featureType.equals("fasta_record")) {
        // These are often used to identify individual contigs within a bin chromosome.
        // loadFastaRecord archives the unused qualifiers itself.
        loadFastaRecord(feature);
    }
    else if (featureType.equals("misc_feature") && feature.getQualifierValues("note").contains("centromere")) {
        /* Centromeres are pulled out as misc_features with note="centromere" with writedb_entry.
         * Here we check for this note and load the centromere. Rest of the misc features are archived.
         * nds, 30 Nov 2010
         */
        focalFeature = loadCentromere(feature);
    }
    else if (featureType.equals("misc_feature") && feature.getQualifierValues("note").contains("SECIS_element")) {
        /* SECIS_elements are pulled out with a note='SECIS_element' */
        focalFeature = loadSECISElement(feature);
    }
    else {
        // archiveFeature archives the unused qualifiers itself.
        logger.info(String.format("Archiving %s feature", featureType));
        archiveFeature(feature);
    }

    if (focalFeature != null) {
        archiveUnusedQualifiers(feature, focalFeature);
    }
}
private void loadFastaRecord(FeatureTable.Feature feature) {
    // The first /label value becomes the unique name; a feature with no
    // label fails here on get(0).
    final String recordName = feature.getQualifierValues("label").get(0);
    logger.warn(String.format("Creating archived contig feature '%s' from '%s' feature on line %d",
        recordName, feature.type, feature.lineNumber));

    final Timestamp now = new Timestamp(System.currentTimeMillis());
    Feature archived = new Region(organism, recordName, /*analysis:*/false, /*obsolete:*/true, now);
    locate(archived, feature.location);

    // One rank counter is shared by all the properties added below.
    int nextRank = 0;
    final String provenance = String.format("Archived from %s feature %s with location %s; file '%s', line %d",
        feature.type, recordName, feature.location, feature.getFilePath(), feature.lineNumber);
    archived.addFeatureProp(provenance, "feature_property", "comment", nextRank++);

    for (String note : feature.getQualifierValues("note")) {
        archived.addFeatureProp(note, "feature_property", "comment", nextRank++);
    }
    for (String colour : feature.getQualifierValues("colour")) {
        archived.addFeatureProp(colour, "genedb_misc", "colour", nextRank++);
    }

    session.persist(archived);
    archiveUnusedQualifiers(feature, archived);
}
// Centromeres
private Centromere loadCentromere(FeatureTable.Feature feature) throws DataError {
    final EmblLocation where = feature.location;
    final String name = feature.getUniqueName();
    logger.info(String.format("Adding a centromere %s at %d-%d on %s",
        name, where.getFmin(), where.getFmax(), topLevelFeature.getUniqueName() ));

    Centromere created = Centromere.make(topLevelFeature, name, where.getFmin(), where.getFmax());
    session.persist(created);

    // Attach literature references. This duplicates the processLiterature()
    // logic that lives in the gene loader. Fix later.
    final Pattern pmidPattern = Pattern.compile("(?:PMID:)?\\s*(\\d+)(?:;.*)?");
    for (String reference : feature.getQualifierValues("literature", "citation")) {
        final Matcher parsed = pmidPattern.matcher(reference);
        if (!parsed.matches()) {
            throw new DataError("Failed to parse literature/citation qualifier: " + reference);
        }
        final String accession = parsed.group(1);
        final DbXRef pmidXRef = objectManager.getDbXRef("PMID", accession);
        final Pub publication = objectManager.getPub(String.format("PMID:%s", accession), "unfetched");
        session.persist(publication.addDbXRef(pmidXRef, true));
        session.persist(created.addPub(publication));
    }
    return created;
}
//SECIS_elements
private SECISElement loadSECISElement(FeatureTable.Feature feature) throws DataError {
    final EmblLocation where = feature.location;
    final String name = feature.getUniqueName();
    logger.info(String.format("Adding a SECIS_element %s at %d-%d on %s",
        name, where.getFmin(), where.getFmax(), topLevelFeature.getUniqueName() ));

    SECISElement element = new SECISElement(organism, name);
    locate(element, where);
    session.persist(element);

    int nextRank = 0;
    for (String note : feature.getQualifierValues("note")) {
        // Skip the marker note itself, and note=false (which just means "not obsolete").
        boolean isMarker = note.equalsIgnoreCase("SECIS_element") || note.equalsIgnoreCase("false");
        if (isMarker) {
            continue;
        }
        element.addFeatureProp(note, "feature_property", "comment", nextRank++);
    }
    return element;
}
// Counters keyed by feature type, used by archiveFeature to number archived features; cleared by init().
private Counters archivedFeatureIndexes = new Counters();
private void archiveFeature(FeatureTable.Feature feature) {
    // Synthesise a unique name of the form <top-level>:archived:<type>:<n>
    final String archivedName = String.format("%s:archived:%s:%d",
        topLevelFeature.getUniqueName(), feature.type,
        archivedFeatureIndexes.nextval(feature.type));
    logger.warn(String.format("Archiving '%s' feature on line %d as '%s'",
        feature.type, feature.lineNumber, archivedName));

    // The feature is stored as an obsolete Region.
    final Timestamp now = new Timestamp(System.currentTimeMillis());
    Feature archived = new Region(organism, archivedName, /*analysis:*/false, /*obsolete:*/true, now);
    locate(archived, feature.location);

    final String provenance = String.format("Archived from %s feature with location %s; file '%s', line %d",
        feature.type, feature.location, feature.getFilePath(), feature.lineNumber);
    archived.addFeatureProp(provenance, "feature_property", "comment", 0);

    session.persist(archived);
    archiveUnusedQualifiers(feature, archived);
}
private void archiveUnusedQualifiers(FeatureTable.Feature feature, Feature focalFeature) {
    // Each unused qualifier is stored on the database feature as an
    // EMBL_qualifier property, ranked in iteration order from zero.
    int nextRank = 0;
    for (String qualifierText : feature.getUnusedQualifiers()) {
        final String traceMessage = String.format("Archiving qualifier on '%s': %s",
            focalFeature.getUniqueName(), qualifierText);
        logger.trace(traceMessage);
        focalFeature.addFeatureProp(qualifierText, "genedb_misc", "EMBL_qualifier", nextRank);
        nextRank++;
    }
}
private void reportUnusedQualifiers(FeatureTable featureTable) {
    // Gather the distinct unused qualifier names, grouped by feature type.
    final Map<String,Set<String>> namesByType = new HashMap<String, Set<String>>();
    for (FeatureTable.Feature feature : featureTable.getFeatures()) {
        for (String qualifierName : feature.getUnusedQualifierNames()) {
            Set<String> namesForType = namesByType.get(feature.type);
            if (namesForType == null) {
                namesForType = new HashSet<String>();
                namesByType.put(feature.type, namesForType);
            }
            namesForType.add(qualifierName);
        }
    }

    if (namesByType.isEmpty()) {
        logger.info("No unused qualifiers to report");
        return;
    }

    // One WARN message per feature type, listing that type's qualifiers.
    for (Map.Entry<String,Set<String>> group : namesByType.entrySet()) {
        StringBuilder message = new StringBuilder();
        message.append(String.format("Unused qualifiers for %s features:%n", group.getKey()));
        for (String qualifierName : group.getValue()) {
            message.append(String.format("\t/%s%n", qualifierName));
        }
        logger.warn(message);
    }
}
private Gap loadGap(FeatureTable.Feature gapFeature) {
    final EmblLocation where = gapFeature.location;
    logger.debug(String.format("Creating gap at %d-%d", where.getFmin(), where.getFmax()));

    Gap created = topLevelFeature.addGap(where.getFmin(), where.getFmax());
    session.persist(created);

    // Each /note becomes a comment property, ranked in order.
    int nextRank = 0;
    for (String note : gapFeature.getQualifierValues("note")) {
        created.addFeatureProp(note, "feature_property", "comment", nextRank);
        nextRank++;
    }
    return created;
}
// Next index to use in a motif unique name (see loadMotif); reset to 1 by init().
private int motifIndex = 1;
private PolypeptideMotif loadMotif(FeatureTable.Feature motifFeature) {
    // Motifs get a synthetic name: <top-level>:motif:<running index>
    final String uniqueName = String.format("%s:motif:%d", topLevelFeature.getUniqueName(), motifIndex++);

    PolypeptideMotif created = new PolypeptideMotif(organism, uniqueName);
    session.persist(created);
    locate(created, motifFeature.location);

    int nextRank = 0;
    for (String note : motifFeature.getQualifierValues("note")) {
        created.addFeatureProp(note, "feature_property", "comment", nextRank);
        nextRank++;
    }
    return created;
}
/**
* Load a <code>repeat_region</code> feature.
* <p>
* The unique name is derived from the top-level feature and the location,
* so a second repeat region at the same coordinates in the same file is
* skipped with a warning.
*
* @param repeatRegionFeature the EMBL feature to load
* @return the new repeat region, or <code>null</code> if it duplicates one
*          already loaded from this file
* @throws DataError if the /rpt_type qualifier is neither "direct" nor
*          "inverted" (compared case-insensitively)
*/
private Feature loadRepeatRegion(FeatureTable.Feature repeatRegionFeature) throws DataError {
    String repeatRegionName = repeatRegionFeature.getQualifierValue("FEAT_NAME");
    EmblLocation repeatRegionLocation = repeatRegionFeature.location;
    int fmin = repeatRegionLocation.getFmin();
    int fmax = repeatRegionLocation.getFmax();

    String repeatType = repeatRegionFeature.getQualifierValue("rpt_type");
    final Class<? extends RepeatRegion> repeatRegionClass;
    if (repeatType == null) {
        repeatRegionClass = RepeatRegion.class;
    } else {
        // Locale-independent lower-casing: under some default locales
        // (e.g. Turkish) "DIRECT".toLowerCase() does not yield "direct".
        repeatType = repeatType.toLowerCase(java.util.Locale.ROOT);
        if (repeatType.equals("direct")) {
            repeatRegionClass = DirectRepeatRegion.class;
        } else if (repeatType.equals("inverted")) {
            repeatRegionClass = InvertedRepeatRegion.class;
        } else {
            throw new DataError(String.format("Unknown repeat type '%s'", repeatType));
        }
    }

    String repeatRegionUniqueName = String.format("%s:repeat:%d-%d", topLevelFeature.getUniqueName(), fmin, fmax);
    if (repeatRegionUniqueNames.contains(repeatRegionUniqueName)) {
        logger.warn(String.format("The repeat region '%s' already exists. " +
            "Ignoring second (or subsequent) occurrence at line %d",
            repeatRegionUniqueName, repeatRegionFeature.lineNumber));
        return null;
    }
    repeatRegionUniqueNames.add(repeatRegionUniqueName);

    logger.debug(String.format("Creating repeat region '%s' (%s) of type '%s' at %d-%d",
        repeatRegionUniqueName, repeatRegionName, repeatRegionClass.getSimpleName(), fmin, fmax));
    RepeatRegion repeatRegion = RepeatRegion.make(repeatRegionClass,
        organism, repeatRegionUniqueName, repeatRegionName);

    // /label, every /note and /rpt_family are all stored as comments,
    // sharing one rank sequence.
    int rank = 0;
    String label = repeatRegionFeature.getQualifierValue("label");
    if (label != null) {
        repeatRegion.addFeatureProp(String.format("/label=%s", label), "feature_property", "comment", rank++);
    }
    for (String note : repeatRegionFeature.getQualifierValues("note")) {
        repeatRegion.addFeatureProp(note, "feature_property", "comment", rank++);
    }
    // Add a comment for the /rpt_family, if present
    String rptFamily = repeatRegionFeature.getQualifierValue("rpt_family");
    if (rptFamily != null) {
        repeatRegion.addFeatureProp(String.format("/rpt_family=%s", rptFamily),
            "feature_property", "comment", rank++);
    }

    session.persist(repeatRegion);
    locate(repeatRegion, fmin, fmax, (short)0, null);
    return repeatRegion;
}
// TODO loadRepeatUnit is very similar to loadRepeatRegion: unify?
/**
* Load a <code>repeat_unit</code> feature.
* <p>
* The unique name is derived from the top-level feature and the location,
* so a second repeat unit at the same coordinates in the same file is
* skipped with a warning.
*
* @param repeatUnitFeature the EMBL feature to load
* @return the new repeat unit, or <code>null</code> if it duplicates one
*          already loaded from this file
* @throws DataError declared for symmetry with loadRepeatRegion; nothing
*          in this method's own code throws it
*/
private Feature loadRepeatUnit(FeatureTable.Feature repeatUnitFeature) throws DataError {
    EmblLocation repeatUnitLocation = repeatUnitFeature.location;
    int fmin = repeatUnitLocation.getFmin();
    int fmax = repeatUnitLocation.getFmax();

    String repeatUnitUniqueName = String.format("%s:repeat_unit:%d-%d", topLevelFeature.getUniqueName(), fmin, fmax);
    if (repeatUnitUniqueNames.contains(repeatUnitUniqueName)) {
        // The message used to say "repeat region" (copied from
        // loadRepeatRegion) and ran its two sentences together.
        logger.warn(String.format("The repeat unit '%s' already exists. " +
            "Ignoring second (or subsequent) occurrence at line %d",
            repeatUnitUniqueName, repeatUnitFeature.lineNumber));
        return null;
    }
    repeatUnitUniqueNames.add(repeatUnitUniqueName);

    logger.debug(String.format("Creating repeat unit '%s' at %d-%d",
        repeatUnitUniqueName, fmin, fmax));
    RepeatUnit repeatUnit = RepeatUnit.make(RepeatUnit.class,
        organism, repeatUnitUniqueName, null);

    String colour = repeatUnitFeature.getQualifierValue("colour");
    if (colour != null) {
        repeatUnit.addFeatureProp(colour, "genedb_misc", "colour", 0);
    }

    // /label and every /note are stored as comments, sharing one rank sequence.
    int rank = 0;
    String label = repeatUnitFeature.getQualifierValue("label");
    if (label != null) {
        repeatUnit.addFeatureProp(String.format("/label=%s", label), "feature_property", "comment", rank++);
    }
    for (String note: repeatUnitFeature.getQualifierValues("note")) {
        repeatUnit.addFeatureProp(note, "feature_property", "comment", rank++);
    }

    session.persist(repeatUnit);
    locate(repeatUnit, fmin, fmax, (short)0, null);
    return repeatUnit;
}
// Can't define static fields in inner classes, grr.
/*
 * The subqualifier keys that are accepted inside a /GO qualifier. processGO
 * rejects any key that is not in this set with a DataError. "residue" is
 * included because processGO has an explicit branch that handles it; without
 * the entry that branch could never be reached.
 */
private static final Set<String> goQualifiers = new HashSet<String>();
static {
    Collections.addAll(goQualifiers, "aspect", "GOid", "term", "qualifier",
        "evidence", "db_xref", "with", "from", "date", "autocomment", "attribution",
        "residue");
}
/**
* Abstract superclass for gene loaders.
* <p>
* It is the responsibility of each implementing class to set at least
* the fields <code>geneUniqueName</code>, <code>transcriptUniqueName</code>
* and <code>geneName</code> in its constructor.
*
* @author rh11
*
*/
abstract class GeneLoader {
/** The feature-table entry being loaded; supplied to the constructor. */
protected FeatureTable.Feature feature;
/** The location of {@link #feature}, copied from it in the constructor. */
protected EmblLocation location;
/** Whether getGeneClass() selects Pseudogene rather than Gene. */
protected boolean isPseudo = false;
/**
 * Consulted by loadOrFetchGene: when true a new gene is always created;
 * when false a gene already registered under geneUniqueName is reused.
 */
protected boolean singlySpliced = true;
/** Obsolete flag copied onto the gene, transcript, polypeptide and exons created here. */
protected boolean isObsolete = false;
/** Unique name of the gene; load() fails if this is still null. */
protected String geneUniqueName = null;
/** Unique name of the transcript; load() fails if this is still null. */
protected String transcriptUniqueName = null;
/** Name passed to AbstractGene.make when the gene is created. */
protected String geneName;
/** The transcript created by loadTranscript. */
protected Transcript transcript;
/**
* The focal feature is the one to which annotation is added.
* This is the polypeptide where possible, or the transcript if
* there is no polypeptide.
*/
protected Feature focalFeature;
/** Phase passed to transcript.createExon for every exon; null by default. */
protected Integer phase = null;
/**
 * Create a loader for the given feature-table entry, remembering both the
 * entry and its location.
 *
 * @param feature the feature-table entry to load
 */
public GeneLoader(FeatureTable.Feature feature) {
    this.location = feature.location;
    this.feature = feature;
}
/**
 * Choose the class of gene to create.
 *
 * @return <code>Pseudogene.class</code> if <code>isPseudo</code> is set,
 *         otherwise <code>Gene.class</code>
 */
protected Class<? extends AbstractGene> getGeneClass() {
    if (isPseudo) {
        return Pseudogene.class;
    }
    return Gene.class;
}
/**
 * The class of transcript this loader creates. The result is passed to
 * <code>gene.makeTranscript</code> and used to look up the transcript's
 * feature type.
 *
 * @return the transcript class
 */
protected abstract Class<? extends Transcript> getTranscriptClass();
/**
 * Look up the feature-type term for this loader's transcript class.
 * <p>
 * This relies on transcript feature classes being annotated with a term
 * rather than an accession, which is true at the time of writing.
 *
 * @return the term of the transcript class's feature type
 */
protected String getTranscriptType() {
    Class<? extends Transcript> transcriptClass = getTranscriptClass();
    return FeatureTypeUtils.getFeatureTypeForClass(transcriptClass).term();
}
/**
 * The main entry point to a gene loader: obtain the gene (creating it if
 * necessary), load the transcript onto it, and hand back the focal feature.
 *
 * @return the focal feature, i.e. the one to which annotation is added
 * @throws DataError if loading the transcript fails because of bad data
 * @throws RuntimeException if the gene or transcript unique name has not been set
 */
public Feature load() throws DataError {
    if (null == geneUniqueName) {
        throw new RuntimeException("Cannot load a gene with no uniqueName");
    }
    if (null == transcriptUniqueName) {
        throw new RuntimeException("Cannot load a transcript with no uniqueName");
    }

    AbstractGene gene = loadOrFetchGene();
    loadTranscript(gene);
    return focalFeature;
}
/**
 * Find the gene that the transcript should be attached to.
 * <ul>
 * <li>If the top-level feature is itself a Gene, that feature is returned
 *     and no new gene is created.</li>
 * <li>For an alternately-spliced gene that is already registered under
 *     <code>geneUniqueName</code>, the registered gene is returned.</li>
 * <li>Otherwise a new gene is created and registered.</li>
 * </ul>
 *
 * @return the gene to which the transcript belongs
 */
private AbstractGene loadOrFetchGene() {
    if (Gene.class.equals(topLevelFeature.getClass())) {
        // The gene is acting as the top-level feature: reuse it as-is.
        logger.debug("The toplevel feature is a gene");
        return (AbstractGene) topLevelFeature;
    }

    if (!singlySpliced && genesByUniqueName.containsKey(geneUniqueName)) {
        logger.debug(String.format("Gene for shared ID '%s' already exists", geneUniqueName));
        return genesByUniqueName.get(geneUniqueName);
    }

    // Either singly spliced, or the first transcript of a shared gene.
    AbstractGene newGene;
    if (singlySpliced) {
        newGene = createSinglySplicedGene();
    } else {
        newGene = createGene();
    }
    genesByUniqueName.put(geneUniqueName, newGene);
    return newGene;
}
/**
 * Create the gene for a singly-spliced transcript, warning first if the
 * transcript's unique name contains a dot.
 *
 * @return the newly created gene
 */
private AbstractGene createSinglySplicedGene() {
    boolean nameHasDot = transcriptUniqueName.indexOf('.') >= 0;
    if (nameHasDot) {
        String warning = String.format(
            "The transcript '%s' is not alternately spliced, yet its systematic name contains a dot",
            transcriptUniqueName);
        logger.warn(warning);
    }
    return createGene();
}
/**
 * Make a gene of the class given by getGeneClass(), copy the obsolete flag
 * onto it, locate it at this loader's location and persist it.
 *
 * @return the newly created gene
 */
private AbstractGene createGene() {
    logger.debug(String.format("Creating gene '%s' (%s)", geneUniqueName, geneName));

    Class<? extends AbstractGene> geneClass = getGeneClass();
    AbstractGene createdGene = AbstractGene.make(geneClass, organism, geneUniqueName, geneName);

    createdGene.setObsolete(isObsolete);
    logger.info(String.format("Setting gene %s 's obsolete status to %s",
        createdGene.getUniqueName(), isObsolete));

    locate(createdGene, location);
    session.persist(createdGene);
    return createdGene;
}
/**
 * Create the transcript on the given gene, choose the focal feature, then
 * load the exons and process the transcript qualifiers.
 *
 * @param gene the gene to which the transcript belongs
 * @throws DataError if the exons or qualifiers cannot be loaded
 */
private void loadTranscript(AbstractGene gene) throws DataError {
    String uniqueNameOfGene = gene.getUniqueName();
    logger.debug(String.format("Creating transcript '%s' for gene '%s'", transcriptUniqueName, uniqueNameOfGene));

    /*
     * When the transcript would have the same uniquename as its gene (which
     * happens for singly-spliced genes) ".1" is appended to the transcript
     * uniquename. Artemis requires every feature to have a globally unique
     * name and misbehaves if a transcript shares its gene's uniquename; the
     * GFF3 format, which we want to export to, has the same requirement.
     *
     * Alternatively-spliced transcripts need no suffix: their uniquename
     * (the /systematic_id of the CDS) already differs from that of the gene
     * (the /shared_id of the CDS).
     */
    String actualTranscriptUniqueName = transcriptUniqueName.equals(uniqueNameOfGene)
        ? transcriptUniqueName + ".1"
        : transcriptUniqueName;

    this.transcript = gene.makeTranscript(getTranscriptClass(), actualTranscriptUniqueName,
        location.getFmin(), location.getFmax(), gene, location);
    transcript.setObsolete(isObsolete);
    session.persist(transcript);

    // Annotation goes on the polypeptide when there is one, else on the transcript.
    focalFeature = transcript;
    if (transcript instanceof ProductiveTranscript) {
        ProductiveTranscript productive = (ProductiveTranscript) transcript;
        Polypeptide protein = productive.getProtein();
        if (protein != null) {
            protein.setObsolete(isObsolete);
            focalFeature = protein;
        }
    }

    transcriptsByUniqueName.put(actualTranscriptUniqueName, transcript);
    loadExons(actualTranscriptUniqueName);
    processTranscriptQualifiers();
}
/**
 * Add a synonym of type <code>synonymType</code> to the transcript for each
 * distinct value of the <code>/qualifierName</code> qualifier. A repeated
 * value is logged as an error and skipped.
 *
 * @param qualifierName the name of the qualifier
 * @param synonymType the type of synonym. Should be a term in the <code>genedb_synonym_type</code> CV
 * @param isCurrent whether the synonym is current or not
 */
protected void addTranscriptSynonymsFromQualifier(String qualifierName, String synonymType, boolean isCurrent) {
    Set<String> alreadyAdded = new HashSet<String>();
    for (String synonymString : feature.getQualifierValues(qualifierName)) {
        boolean isNew = alreadyAdded.add(synonymString);
        if (!isNew) {
            logger.error(String.format("The qualifier /%s=\"%s\" is repeated on transcript '%s'",
                qualifierName, synonymString, transcriptUniqueName));
            continue;
        }

        logger.debug(String.format("Adding %s '%s' for transcript", synonymType, synonymString));
        Synonym synonym = synonymManager.getSynonym(synonymType, synonymString);
        session.persist(transcript.addSynonym(synonym, isCurrent, /*isInternal:*/ false));
    }
}
/**
 * Add a property of the specified type to the focal feature (the polypeptide
 * if there is one, or else the transcript) for each value of the
 * <code>/qualifierName</code> qualifier. If a parser is registered for the
 * qualifier in <code>qualifierParsers</code>, values are passed through it first.
 *
 * @param qualifierName the qualifier name
 * @param propertyCvName the name of the CV to which the property term belongs.
 *          Should be either <code>feature_property</code> for built-in Chado
 *          properties, or <code>genedb_misc</code> for local additions.
 * @param propertyTermName the term name corresponding to the property to add.
 *          If it belongs to the <code>genedb_misc</code> CV, it should be a child of
 *          the term <code>genedb_misc:feature_props</code>.
 * @param isUnique whether this qualifier may appear only once.
 * @return the number of properties that were added
 * @throws DataError if <code>isUnique</code> is set and more than one distinct value is found
 */
protected int processPropertyQualifier(String qualifierName, String propertyCvName, String propertyTermName, boolean isUnique) throws DataError {
    TermParser registeredParser = qualifierParsers.get(qualifierName);
    return processPropertyQualifier(qualifierName, propertyCvName, propertyTermName, registeredParser, isUnique);
}
/**
 * As the four-argument form, for a qualifier that is allowed to appear
 * more than once.
 *
 * @param qualifierName the qualifier name
 * @param propertyCvName the name of the CV to which the property term belongs
 * @param propertyTermName the term name corresponding to the property to add
 * @return the number of properties that were added
 * @throws DataError propagated from the underlying implementation
 */
protected int processPropertyQualifier(String qualifierName, String propertyCvName, String propertyTermName) throws DataError {
    TermParser registeredParser = qualifierParsers.get(qualifierName);
    final boolean isUnique = false;
    return processPropertyQualifier(qualifierName, propertyCvName, propertyTermName, registeredParser, isUnique);
}
/**
 * Walk the values of the qualifier, optionally expanding each one through
 * the parser, and add each resulting value as a property.
 *
 * @param parser the parser to apply to each raw value, or <code>null</code>
 *          to use the raw values unchanged
 * @return the rank following that of the last property added
 */
private int processPropertyQualifier(String qualifierName, String propertyCvName, String propertyTermName,
        TermParser parser, boolean isUnique) throws DataError {
    Set<String> seenValues = new HashSet<String>();
    int nextRank = 0;

    for (String rawValue : feature.getQualifierValues(qualifierName)) {
        if (parser == null) {
            nextRank = processNormalisedProperty(qualifierName, propertyCvName, propertyTermName,
                isUnique, seenValues, nextRank, rawValue);
            continue;
        }
        for (String parsedValue : parser.parse(rawValue)) {
            nextRank = processNormalisedProperty(qualifierName, propertyCvName, propertyTermName,
                isUnique, seenValues, nextRank, parsedValue);
        }
    }
    return nextRank;
}
/**
 * Add a single property value to the focal feature unless it has been seen
 * before, in which case a warning is logged and nothing is added.
 *
 * @param values the values added so far for this qualifier; updated here
 * @param rank the rank to give the property if it is added
 * @return the rank to use for the next property: <code>rank + 1</code> if the
 *         value was added, otherwise <code>rank</code> unchanged
 * @throws DataError if <code>isUnique</code> is set and a different value
 *          has already been added
 */
private int processNormalisedProperty(String qualifierName, String propertyCvName,
        String propertyTermName, boolean isUnique, Set<String> values, int rank,
        String normalisedValue) throws DataError {
    if (values.contains(normalisedValue)) {
        logger.warn(String.format("Qualifier /%s=\"%s\" appears more than once on feature at line %d. Ignoring subsequent occurrences.",
            qualifierName, normalisedValue, this.feature.lineNumber));
        return rank;
    }

    boolean anotherValueExists = !values.isEmpty();
    if (isUnique && anotherValueExists) {
        throw new DataError(String.format("More than one /%s qualifier found", qualifierName));
    }

    logger.debug(String.format("Adding %s:%s '%s' for transcript",
        propertyCvName, propertyTermName, normalisedValue));
    values.add(normalisedValue);
    focalFeature.addFeatureProp(normalisedValue, propertyCvName, propertyTermName, rank);
    return rank + 1;
}
/**
 * Add a CV term to the focal feature for each value of the qualifier, using
 * whatever parser is registered for the qualifier in <code>qualifierParsers</code>
 * (possibly none).
 *
 * @throws DataError if a term cannot be found or created
 */
protected void processCvTermQualifier(String qualifierName, String cvName, String dbName, boolean createTerms)
        throws DataError {
    TermParser registeredParser = qualifierParsers.get(qualifierName);
    processCvTermQualifier(qualifierName, cvName, dbName, createTerms, registeredParser);
}
/**
 * Add a CV term to the focal feature for each value of the qualifier.
 * When a parser is supplied, each raw value is expanded through it and the
 * resulting terms are trimmed; otherwise the raw value is used unchanged.
 *
 * @param termParser the parser to apply, or <code>null</code> for none
 * @throws DataError if a term cannot be found or created
 */
protected void processCvTermQualifier(String qualifierName, String cvName, String dbName,
        boolean createTerms, TermParser termParser)
        throws DataError {
    Set<String> seenTerms = new HashSet<String>();
    for (String rawTerm : feature.getQualifierValues(qualifierName)) {
        if (termParser == null) {
            processNormalisedCvTermQualifier(qualifierName, cvName, dbName, createTerms, seenTerms, rawTerm, rawTerm);
            continue;
        }
        for (String parsedTerm : termParser.parse(rawTerm)) {
            processNormalisedCvTermQualifier(qualifierName, cvName, dbName, createTerms, seenTerms, rawTerm,
                parsedTerm.trim());
        }
    }
}
/**
 * Attach a single CV term to the focal feature and persist the link.
 * Terms are de-duplicated case-insensitively: a repeat is logged and skipped.
 *
 * @param terms lower-cased terms seen so far for this qualifier; updated here
 * @param term the raw qualifier value, used only in the duplicate warning
 * @param normalisedTerm the term actually looked up
 * @throws DataError if <code>addCvTerm</code> returns null for the term
 */
private void processNormalisedCvTermQualifier(String qualifierName, String cvName,
        String dbName, boolean createTerms, Set<String> terms, String term, String normalisedTerm)
        throws DataError {
    String dedupKey = normalisedTerm.toLowerCase();
    boolean firstOccurrence = terms.add(dedupKey);
    if (!firstOccurrence) {
        logger.warn(
            String.format("The qualifier /%s=\"%s\" appears more than once. Ignoring subsequent copies.",
                qualifierName, term));
        return;
    }

    FeatureCvTerm link = focalFeature.addCvTerm(cvName, normalisedTerm, dbName, createTerms);
    if (link == null) {
        throw new DataError(
            String.format("Failed to find term '%s' in CV '%s'", normalisedTerm, cvName));
    }
    session.persist(link);
}
/**
 * Load the exons using the transcript unique name taken from the EMBL file,
 * for callers that do not specify one.
 */
private void loadExons() throws DataError {
    String nameFromEmblFile = transcriptUniqueName;
    loadExons(nameFromEmblFile);
}
/*
 * Create and persist one exon per part of the location. Passing the
 * transcript uniquename in explicitly lets the modified name (with ".1"
 * appended, for singly spliced genes) be used when building the exon
 * uniquenames, which have the form "<transcript>:exon:<n>" with n counting from 1.
 */
private void loadExons(String actualTranscriptUniqueName) throws DataError {
    int exonNumber = 0;
    for (EmblLocation part : location.getParts()) {
        if (part instanceof EmblLocation.External) {
            throw new DataError("Found an external exon (trans-splicing). We can't handle that yet.");
        }

        exonNumber = exonNumber + 1;
        String exonUniqueName = String.format("%s:exon:%d", actualTranscriptUniqueName, exonNumber);
        int exonFmin = part.getFmin();
        int exonFmax = part.getFmax();
        logger.debug(String.format("Creating exon '%s' at %d-%d", exonUniqueName, exonFmin, exonFmax));

        AbstractExon newExon = transcript.createExon(exonUniqueName, exonFmin, exonFmax, phase);
        newExon.setObsolete(isObsolete);
        session.persist(newExon);
    }
}
/**
 * Parse every <code>/GO</code> qualifier of the feature and create the
 * corresponding GO entries on the focal feature.
 * <p>
 * Each qualifier value is a semicolon-separated list of <code>key=value</code>
 * subqualifiers. Only keys listed in <code>goQualifiers</code> are accepted.
 * The <code>autocomment</code> subqualifier, if present, replaces the default
 * comment "From EMBL file" for that qualifier only.
 * <p>
 * When <code>goTermErrorsAreNotFatal</code> is set, a DataError raised while
 * creating the GO entries is logged as a warning and loading continues;
 * otherwise it propagates.
 *
 * @throws DataError if a subqualifier has no '=', has an unknown key, or has
 *          an unparseable evidence code; or if creating the GO entries fails
 *          and such errors are fatal
 */
protected void processGO() throws DataError {
    /* TODO: Temp fix to avoid duplicate pubdbxref entries,
     * fix properly later using the object manager: nds
     * (Compiled once per call rather than once per db_xref subqualifier.) */
    final Pattern pubDbXRefPattern = Pattern.compile("(\\w+):(\\w+)");

    for (String go: feature.getQualifierValues("GO")) {
        // Default value for autocomment. Reset for every /GO qualifier so that an
        // autocomment on one qualifier does not leak into the ones that follow it.
        String comment = "From EMBL file";
        GoInstance goInstance = new GoInstance();

        for (String subqualifier: go.split("; ?")) {
            subqualifier = subqualifier.trim();
            if (subqualifier.length() == 0) {
                continue;
            }
            int equalsIndex = subqualifier.indexOf('=');
            if (equalsIndex == -1) {
                throw new DataError(String.format("Failed to parse /GO=\"%s\"", go));
            }
            String key = subqualifier.substring(0, equalsIndex);
            String value = subqualifier.substring(equalsIndex + 1);

            /* nds (24.6.2010): It is rare but sometimes the key here
             * has acquired an unnecessary space either by data errors
             * in Chado or in the EMBL file.
             * Example: au tocomment="From EMBL file"
             * The replace below was put in place to deal with this problem.
             */
            key = key.replaceAll("\\s","").trim();

            if (!goQualifiers.contains(key)) {
                throw new DataError(String.format("Failed to parse /GO=\"%s\"; don't know what to do with %s=%s", go, key, value));
            }

            /* nds (25.6.2010): Sometimes the values have the same
             * problem as above but a replace cannot be applied for
             * all the values, so values are deliberately left untouched.
             * It's possible this was a problem specific to
             * Plasmodium reichenowi.
             */

            if (key.equals("GOid")) {
                goInstance.setId(value);
            } else if (key.equals("date")) {
                goInstance.setDate(value);
            } else if (key.equals("evidence")) {
                GoEvidenceCode evidenceCode = GoEvidenceCode.parse(value);
                if (evidenceCode == null) {
                    throw new DataError(String.format("Failed to parse GO evidence code '%s'", value));
                }
                goInstance.setEvidence(evidenceCode);
            } else if (key.equals("qualifier")) {
                goInstance.addQualifier(value);
            } else if (key.equals("with") || key.equals("from")) {
                goInstance.setWithFrom(value);
            } else if (key.equals("aspect")) {
                goInstance.setSubtype(value);
            } else if (key.equals("attribution")) {
                goInstance.setAttribution(value);
            } else if (key.equals("residue")) {
                goInstance.setResidue(value);
            } else if (key.equals("db_xref")) {
                goInstance.setRef(value);
                // Remember PubMed accessions so that the same publication is
                // not added again by a later /literature or /citation qualifier.
                Matcher matcher = pubDbXRefPattern.matcher(value);
                if (matcher.matches()
                        && (matcher.group(1).equalsIgnoreCase("PMID") || matcher.group(1).equalsIgnoreCase("PUBMED"))) {
                    seenPubAccessions.add(matcher.group(2));
                }
            } else if (key.equals("autocomment")) {
                comment = value;
            }
        }

        if (goTermErrorsAreNotFatal) {
            try {
                featureUtils.createGoEntries(focalFeature, goInstance, comment, (DbXRef) null);
            } catch (DataError e) {
                // Deliberately non-fatal, but no longer silent: record what was skipped.
                logger.warn(String.format("Ignoring GO term error for /GO=\"%s\" on feature at line %d",
                    go, feature.lineNumber), e);
            }
        } else {
            featureUtils.createGoEntries(focalFeature, goInstance, comment, (DbXRef) null);
        }
    }
}
/* Here are some examples of /similarity qualifiers from chromosome 1 of Trypanosoma brucei:
FT /similarity="blastp; SWALL:Q26723 (EMBL:M20871);
FT Trypanosoma brucei brucei; variant-specific antigen;
FT ESAG3; ; id=70%; ; E()=2e-42; score=438; ; ;"
FT /similarity="fasta; SWALL:P26328 (EMBL:56768); Trypanosoma
FT brucei brucei; variant surface glycoprotein ILTat 1.23
FT precursor; ; length 532 aa; id=30.35%; ungapped id=32.34%;
FT E()=1.2e-34; ; 537 aa overlap; query 9-528 aa; subject
FT 10-530 aa"
FT /similarity="fasta; SWALL:O97352 (EMBL:AJ012199);
FT Trypanosoma brucei; ILTat 1.61 metacyclic VSG protein; ;
FT length 518 aa; id=29.83%; ungapped id=32.99%; E()=3.6e-31;
FT ; 543 aa overlap; query 2-528 aa; subject 10-516 aa"
FT /similarity="blastp; SWALL:Q8WPR3 (EMBL:AL671259);
FT Trypanosoma brucei; ESAG3; H25N7.29; ; id=74%; ;
FT E()=4e-28; score=310; ; ;"
And here are some examples from Schistosoma mansoni Smp_scaff000604, to show how minimal
the provided data can sometimes be:
FT /similarity="blastp; RF:XP_970827.1; ; ; ; ; id=61.0%; ;
FT E()=3.9e-21; ; ; ;"
FT /similarity="blastp; RF:NP_956088.1; ; ; ; ; id=58.4%; ;
FT E()=1.6e-17; ; ; ;"
FT /similarity="blastp; GB:BAD74067.1; ; ; ; ; id=54.4%; ;
FT E()=2.6e-17; ; ; ;"
And from Leishmania major chromosome 32, to show the use of multiple secondary cross-references:
FT /similarity="fasta; SWALL:Q9BUG7 (EMBL:BC002634,
FT AAH02634); Homo sapiens; hypothetical protein; ; length
FT 322 aa; id=40.063%; ungapped id=46.691%; E()=1.2e-32; ;
FT 301 aa overlap; query 23-324 aa; subject 12-298 aa"
I don't know how normal this is, but there's at least one with a line-break in the
middle of the E() value! (Again from Leishmania major, chromosome 32.)
FT /similarity="fasta; SWALL:EAA26969 (EMBL:AABX01000759,
FT EAA26969); Neurospora crassa; hypothetical protein; ;
FT length 335 aa; id=38.462%; ungapped id=47.273%; E()=6.4e-
FT 17; ; 303 aa overlap; query 3-306 aa; subject 13-321 aa"
And here is an example from Eimeria tenella where the algorithm, program and version are
all specified in the first field.
FT /similarity="ComparativeBlastX_uni blastall v2.2.6;
FT SWALL:A6WB28.1; ; ; ; ; ; ; E()=19.0063; ; 58 aa overlap;
FT query 24-81 aa; subject 875-1042 aa"
*/
/* Edited regex below to understand decimal points in the raw score, and
* forward slashes & hyphens in db names and accessions. Matching this
* using one long regex like the one below is probably not sustainable.
* TODO: Find better alternative
* nds, 26 Oct 2011
*
* The pattern is applied with Matcher.matches(), so the whole /similarity
* value must conform. Fields are separated by semicolons; the numbered
* comments on each line give the capturing groups that processSimilarityQualifier
* reads. Optional fields leave their groups null when absent.
*/
private final Pattern similarityPattern = Pattern.compile(
"(\\w+|\\w+ +\\w+ +v[\\d.]+);" + // 1. Algorithm, e.g. fasta, blastp
"\\s*([\\w+\\-/]+):([\\-\\w.]+)" + // 2,3. Primary dbxref, e.g. SWALL:Q26723
"(?:\\s+\\(([\\w+\\-/]+):([\\-\\w.]+(?:,\\s*(?:[\\w+\\-/]+:)?[\\-\\w.]+)*)\\))?;" + // 4,5. Optional secondary dbxrefs, e.g. "EMBL:M20871", "EMBL:BC002634, AAH02634"
"\\s*([^;]+)?;" + // 6. Organism name
"\\s*([^;]+)?;" + // 7. Product name
"\\s*([^;]+)?;" + // 8. Gene name
"\\s*(?:length\\s+(\\d+)\\s+aa)?;" + // 9. Optional match length
"\\s*(?:id=(\\d{1,3}(?:\\.\\d{1,3})?)%)?;" + // 10. Optional degree of identity (percentage)
"\\s*(?:ungapped\\s+id=(\\d{1,3}(?:\\.\\d{1,3})?)%)?;" + // 11. Optional ungapped identity (percentage)
"\\s*E\\(\\)=(\\d*(?:\\.\\d+)?(?:e[+-]? ?\\d+)?);" + // 12. E-value
"\\s*(?:score=(\\d+\\.*\\d*))?;" + // 13. Optional score
"\\s*(?:(\\d+)\\s+aa\\s+overlap)?;" + // 14. Optional overlap length (integer)
"\\s*(?:query\\s+(\\d+)-\\s*(\\d+) aa)?;" + // 15,16. Optional query location
"\\s*(?:subject\\s+(\\d+)-\\s*?(\\d+) aa)?"); // 17,18. Optional subject location
/**
 * Process each <code>/similarity</code> qualifier of the feature in turn.
 *
 * @throws DataError if any of the qualifiers cannot be parsed or loaded
 */
protected void processSimilarityQualifiers() throws DataError {
    for (String qualifierValue : feature.getQualifierValues("similarity")) {
        processSimilarityQualifier(qualifierValue);
    }
}
// How many similarities this loader has seen for each primary dbxref (keyed by
// the dbxref's string form); used to build unique similarity identifiers.
private Map<String,Integer> numberOfSimilaritiesByPrimaryDbXRef = new HashMap<String,Integer>();

/**
 * Parse one <code>/similarity</code> qualifier value with
 * <code>similarityPattern</code> and add the resulting similarity to the
 * focal feature.
 * <p>
 * An Analysis object is created the first time each program string is seen
 * and is saved or updated through the session on every call.
 *
 * @param similarityString the raw value of the qualifier
 * @throws DataError if the value does not match the expected format, a
 *          referenced database cannot be found, or a numeric field cannot
 *          be parsed
 */
private void processSimilarityQualifier(String similarityString) throws DataError {
    Matcher matcher = similarityPattern.matcher(similarityString);
    if (!matcher.matches()) {
        throw new DataError(String.format("Failed to parse /similarity=\"%s\"", similarityString));
    }

    Similarity similarity = new Similarity();

    // Group 1: the program, either a bare name or "algorithm program version".
    String program = matcher.group(1);
    if (!similarityAnalysisByProgram.containsKey(program)) {
        logger.trace(String.format("Creating Analysis object for program '%s'", program));
        Analysis newAnalysis = new Analysis();
        if (program.indexOf(' ') > 0) {
            // Program string contains spaces, so it's of the form "algorithm program version"
            String[] splitProgram = program.split(" +");
            if (splitProgram.length != 3) {
                throw new DataError("Unexpected problem parsing similarity program: " + program);
            }
            newAnalysis.setAlgorithm(splitProgram[0]);
            newAnalysis.setProgram(splitProgram[1]);
            newAnalysis.setProgramVersion(splitProgram[2]);
        } else {
            newAnalysis.setProgram(program);
            newAnalysis.setProgramVersion("unknown");
        }
        similarityAnalysisByProgram.put(program, newAnalysis);
    }
    Analysis analysis = similarityAnalysisByProgram.get(program);
    session.saveOrUpdate(analysis);
    similarity.setAnalysis(analysis);

    // Groups 2,3: the primary dbxref.
    DbXRef primaryDbXRef = objectManager.getDbXRef(matcher.group(2), matcher.group(3));
    if (primaryDbXRef == null) {
        throw new DataError(String.format("Could not find database '%s' for primary dbxref of /similarity", matcher.group(2)));
    }
    similarity.setPrimaryDbXRef(primaryDbXRef);

    // Set the unique identifier to something unique: transcript, dbxref and
    // a running count of similarities seen for that dbxref.
    String primaryDbXRefString = primaryDbXRef.toString();
    Integer previousCount = numberOfSimilaritiesByPrimaryDbXRef.get(primaryDbXRefString);
    int numberOfSimilarities = (previousCount == null) ? 1 : previousCount + 1;
    numberOfSimilaritiesByPrimaryDbXRef.put(primaryDbXRefString, numberOfSimilarities);
    similarity.setUniqueIdentifier(String.format("%s_%s_%d", transcriptUniqueName, primaryDbXRefString, numberOfSimilarities));

    // Groups 4,5: optional secondary dbxrefs. An accession without its own
    // "db:" prefix inherits the database of the accession before it.
    if (matcher.group(4) != null) {
        String dbName = matcher.group(4);
        for (String accession: matcher.group(5).split(",\\s*")) {
            int colonIndex = accession.indexOf(':');
            if (colonIndex >= 0) {
                dbName = accession.substring(0, colonIndex);
                accession = accession.substring(colonIndex + 1);
            }
            DbXRef secondaryDbXRef = objectManager.getDbXRef(dbName, accession);
            if (secondaryDbXRef == null) {
                // Report the database that was actually looked up, which may
                // differ from group 4 when the accession carried its own prefix.
                throw new DataError(String.format("Could not find database '%s' for secondary dbxref of /similarity", dbName));
            }
            similarity.addDbXRef(secondaryDbXRef);
        }
    }

    // These three may be null, which is okay
    similarity.setOrganismName(matcher.group(6));
    similarity.setProduct(matcher.group(7));
    similarity.setGeneName(matcher.group(8));

    if (matcher.group(9) != null) {
        try {
            similarity.setLength(Integer.parseInt(matcher.group(9)));
        } catch (NumberFormatException e) {
            throw new DataError("Failed to parse length field of /similarity: " + matcher.group(9));
        }
    }

    if (matcher.group(10) != null) {
        try {
            similarity.setId(Double.parseDouble(matcher.group(10)));
        } catch (NumberFormatException e) {
            throw new DataError("Failed to parse id field of /similarity: " + matcher.group(10));
        }
    }

    if (matcher.group(11) != null) {
        try {
            similarity.setUngappedId(Double.parseDouble(matcher.group(11)));
        } catch (NumberFormatException e) {
            throw new DataError("Failed to parse ungapped id field of /similarity: " + matcher.group(11));
        }
    }

    // Group 12: the E-value. A bare exponent such as "e-42" is read as "1e-42",
    // and embedded whitespace (e.g. from a line break) is removed before parsing.
    String eValueString = matcher.group(12);
    if (eValueString.startsWith("e") || eValueString.startsWith("E")) {
        eValueString = "1" + eValueString;
    }
    try {
        similarity.setEValue(Double.parseDouble(eValueString.replaceAll("\\s+", "")));
    } catch (NumberFormatException e) {
        throw new DataError("Failed to parse E() field of /similarity: " + eValueString);
    }

    if (matcher.group(13) != null) {
        try {
            similarity.setRawScore(Double.parseDouble(matcher.group(13)));
        } catch (NumberFormatException e) {
            throw new DataError("Failed to parse score field of /similarity: " + matcher.group(13));
        }
    }

    if (matcher.group(14) != null) {
        try {
            similarity.setOverlap(Integer.parseInt(matcher.group(14)));
        } catch (NumberFormatException e) {
            // Report the overlap field itself, not the score field.
            throw new DataError("Failed to parse overlap field of /similarity: " + matcher.group(14));
        }
    }

    if (matcher.group(15) != null) {
        try {
            similarity.setQueryStart(Integer.parseInt(matcher.group(15)));
            similarity.setQueryEnd(Integer.parseInt(matcher.group(16)));
        } catch (NumberFormatException e) {
            throw new DataError(String.format("Failed to parse query location of /similarity: %s-%s", matcher.group(15), matcher.group(16)));
        }
    }

    if (matcher.group(17) != null) {
        try {
            similarity.setTargetStart(Integer.parseInt(matcher.group(17)));
            similarity.setTargetEnd(Integer.parseInt(matcher.group(18)));
        } catch (NumberFormatException e) {
            throw new DataError(String.format("Failed to parse subject location of /similarity: %s-%s", matcher.group(17), matcher.group(18)));
        }
    }

    focalFeature.addSimilarity(similarity);
}
/*
 * Matches one "key=value" subqualifier of a /controlled_curation value per
 * find(). \G anchors each match at the end of the previous one; whitespace
 * before the key and after the '=' is skipped. Group 1 is the key (everything
 * up to the '='), group 2 the value (everything up to the next ';' or the
 * end of the input).
 */
private final Pattern subqualifierPattern = Pattern.compile("\\G\\s*([^=]+)=\\s*([^;]*)\\s*(?:;|\\z)");
/**
 * Add the <code>/curation</code> qualifiers as <code>genedb_misc:curation</code>
 * properties, then process the <code>/controlled_curation</code> qualifiers
 * in either sloppy or strict mode, as selected by
 * <code>sloppyControlledCuration</code>.
 *
 * @throws DataError if any of the qualifiers cannot be processed
 */
protected void processCuration() throws DataError {
    processPropertyQualifier("curation", "genedb_misc", "curation");

    if (!sloppyControlledCuration) {
        processControlledCurationStrict();
        return;
    }
    processControlledCurationSloppy();
}
/**
 * Sloppy handling of <code>/controlled_curation</code>: each value is stored,
 * wrapped in square brackets, as an additional <code>genedb_misc:curation</code>
 * property, ranked after the plain <code>/curation</code> values. Any non-empty
 * <code>db_xref</code> subqualifier is also added to the focal feature as a
 * dbxref; one without a database prefix is assumed to be a PMID.
 *
 * @throws DataError if a dbxref cannot be added
 */
private void processControlledCurationSloppy() throws DataError {
    int nextRank = feature.getQualifierValues("curation").size();

    for (String controlledCuration : feature.getQualifierValues("controlled_curation")) {
        String curation = "[" + controlledCuration + "]";
        logger.trace(String.format("Sloppy curation: adding /curation=\"%s\" with rank %d", curation, nextRank));
        focalFeature.addFeatureProp(curation, "genedb_misc", "curation", nextRank);
        nextRank++;

        Matcher subqualifier = subqualifierPattern.matcher(controlledCuration);
        while (subqualifier.find()) {
            String key = subqualifier.group(1).toLowerCase();
            String value = subqualifier.group(2);

            if (!key.equals("db_xref") || value.length() == 0) {
                continue;
            }
            if (!value.contains(":")) {
                value = "PMID:" + value;
            }
            logger.trace(String.format("Sloppy controlled_curation: adding dbxref for '%s'", value));
            addDbXRefs(value);
        }
    }
}
private Set<String> subqualifiers = new HashSet<String>() {{
Collections.addAll(this,
"term", "cv", "qualifier", "evidence", "db_xref", "residue", "attribution", "date");
}};
/**
 * Strict handling of <code>/controlled_curation</code>: each value is parsed
 * into subqualifiers and its <code>term</code> is added to the focal feature
 * as a CV term, in the CV named by the <code>cv</code> subqualifier or
 * <code>CC_genedb_controlledcuration</code> if there is none. The date,
 * attribution, evidence and qualifier subqualifiers become properties of the
 * feature CV term, and any <code>db_xref</code> is attached to it.
 * <p>
 * A second qualifier with the same CV and term is logged and ignored.
 * Subqualifier keys not listed in <code>subqualifiers</code> are dropped.
 *
 * @throws DataError if a qualifier has no <code>term</code> subqualifier, or
 *          a dbxref cannot be added
 */
private void processControlledCurationStrict() throws DataError {
    Set<String> seenQualifiedTerms = new HashSet<String>();

    for (String controlledCuration: feature.getQualifierValues("controlled_curation")) {
        Matcher matcher = subqualifierPattern.matcher(controlledCuration);
        Map<String, String> valuesByKey = new HashMap<String, String>();
        while (matcher.find()) {
            String key = matcher.group(1).toLowerCase();
            String value = matcher.group(2);
            if (subqualifiers.contains(key)) {
                valuesByKey.put(key, value);
            }
        }

        if (!valuesByKey.containsKey("term")) {
            throw new DataError("/controlled_curation has no 'term' field");
        }
        String term = valuesByKey.get("term");
        String cv = valuesByKey.containsKey("cv") ? valuesByKey.get("cv") : "CC_genedb_controlledcuration";

        String qualifiedTerm = String.format("%s:%s", cv, term);
        if (seenQualifiedTerms.contains(qualifiedTerm)) {
            // The two sentences of the message are now separated by a space.
            logger.warn(String.format(
                "There is more than one /controlled_curation qualifier with term '%s' in %s feature on line %d. " +
                "Ignoring subsequent occurrences.",
                qualifiedTerm, feature.type, feature.lineNumber));
            continue;
        }
        seenQualifiedTerms.add(qualifiedTerm);

        logger.trace(String.format("/controlled_curation: adding term '%s:%s' to %s",
            cv, term, focalFeature));
        FeatureCvTerm featureCvTerm = focalFeature.addCvTerm(cv, term);

        featureCvTerm.addPropIfNotNull("feature_property", "date", valuesByKey.get("date"));
        featureCvTerm.addPropIfNotNull("genedb_misc", "attribution", valuesByKey.get("attribution"));
        featureCvTerm.addPropIfNotNull("genedb_misc", "evidence", valuesByKey.get("evidence"));
        featureCvTerm.addPropIfNotNull("genedb_misc", "qualifier", valuesByKey.get("qualifier"));

        if (valuesByKey.containsKey("db_xref")) {
            addDbXRefs(featureCvTerm, valuesByKey.get("db_xref"));
        }
        session.persist(featureCvTerm);
    }
}
// Splits "database:accession" at the first colon: group 1 is the database
// name (at least one non-colon character), group 2 the rest of the string.
private Pattern dbxrefPattern = Pattern.compile("([^:]+):(.*)");
/**
 * Add each of the given DbXRefs to the focal feature.
 *
 * @param dbxrefs a pipe-separated list of <code>db:accession</code>
 * @throws DataError if the string cannot be parsed or the database does not exist
 */
protected void addDbXRefs(String dbxrefs) throws DataError {
    String[] individualDbXRefs = dbxrefs.split("\\|");
    for (int i = 0; i < individualDbXRefs.length; i++) {
        addDbXRef(focalFeature, individualDbXRefs[i]);
    }
}
/**
 * Parse a <code>db:accession</code> string and add the DbXRef it names to
 * the specified object.
 *
 * @param target the object to which the reference should be added
 * @param dbxref a string of the form <code>db:accession</code>
 * @throws DataError if the string cannot be parsed or the database does not exist
 */
private void addDbXRef(HasPubsAndDbXRefs target, String dbxref) throws DataError {
    Matcher parsed = dbxrefPattern.matcher(dbxref);
    if (parsed.matches()) {
        addDbXRef(target, parsed.group(1), parsed.group(2));
        return;
    }
    throw new DataError(String.format("db_xref '%s' is not of the form database:accession", dbxref));
}
/**
 * Look up the DbXRef for the given database and accession and add it to the
 * target. PMID references are a special case: they are added as
 * publications rather than as plain dbxrefs.
 *
 * @param target the object to which the reference should be added
 * @param dbName the name of the database
 * @param accession the accession within that database
 * @throws DataError if no DbXRef can be obtained for the database
 */
private void addDbXRef(HasPubsAndDbXRefs target, String dbName, String accession) throws DataError {
    DbXRef reference = objectManager.getDbXRef(dbName, accession);
    if (reference == null) {
        throw new DataError(String.format("Database '%1$s' does not exist (for dbxref '%1$s:%2$s')",
            dbName, accession));
    }

    boolean isPubMedReference = dbName.equals("PMID");
    if (!isPubMedReference) {
        session.persist(target.addDbXRef(reference));
        return;
    }
    // PMID is a special case; these are stored as FeaturePubs
    addPub(target, accession, reference);
}
/**
 * Attach the publication with the given PubMed accession to the target.
 * The publication is obtained from the object manager under the unique name
 * <code>PMID:accession</code>; the dbxref is added to the publication, and
 * the publication to the target, persisting both links.
 *
 * @param target the object to which the publication should be added
 * @param accession the PubMed accession
 * @param dbXRef the dbxref corresponding to the accession
 */
private void addPub(HasPubsAndDbXRefs target, String accession, DbXRef dbXRef) {
    String description = target.toString();
    logger.trace(String.format("Adding publication id '%s' to %s",
        accession, description));

    String pubUniqueName = String.format("PMID:%s", accession);
    Pub publication = objectManager.getPub(pubUniqueName, "unfetched");
    session.persist(publication.addDbXRef(dbXRef, true));
    session.persist(target.addPub(publication));
}
// PubMed accessions already recorded for this loader, so that the same
// publication is not added twice.
private Set<String> seenPubAccessions = new HashSet<String>();

/**
 * Add the publication with the given PubMed accession to the focal feature,
 * unless that accession has been seen before, in which case the duplicate is
 * logged and ignored.
 *
 * @param accession the PubMed accession
 */
private void addPub(String accession) {
    boolean firstOccurrence = !seenPubAccessions.contains(accession);
    if (firstOccurrence) {
        DbXRef pmidDbXRef = objectManager.getDbXRef("PMID", accession);
        addPub(focalFeature, accession, pmidDbXRef);
        seenPubAccessions.add(accession);
        return;
    }

    logger.info(String.format(
        "Ignoring duplicate publication with accession '%s' on %s feature at line %d",
        accession, feature.type, feature.lineNumber));
}
// An optional "PMID:" prefix, the numeric id (group 1), and optionally a
// semicolon followed by anything.
private Pattern literaturePattern = Pattern.compile("(?:PMID:)?\\s*(\\d+)(?:;.*)?");

/**
 * Add a publication to the focal feature for each <code>/literature</code>
 * and <code>/citation</code> qualifier.
 *
 * @throws DataError if a qualifier value is not a PubMed id in the expected form
 */
protected void processLiterature() throws DataError {
    for (String qualifierValue : feature.getQualifierValues("literature", "citation")) {
        Matcher parsed = literaturePattern.matcher(qualifierValue);
        if (parsed.matches()) {
            addPub(parsed.group(1));
            continue;
        }
        throw new DataError("Failed to parse literature/citation qualifier: " + qualifierValue);
    }
}
/**
 * Add each of the given DbXRefs to the specified FeatureCvTerm.
 *
 * @param featureCvTerm the feature CV term to which the references should be added
 * @param dbxrefs a pipe-separated list of <code>db:accession</code>
 * @throws DataError if a reference cannot be parsed or the database does not exist
 */
protected void addDbXRefs(FeatureCvTerm featureCvTerm, String dbxrefs) throws DataError {
    String[] individualDbXRefs = dbxrefs.split("\\|");
    for (int i = 0; i < individualDbXRefs.length; i++) {
        addDbXRef(featureCvTerm, individualDbXRefs[i]);
    }
}
/**
* Use the qualifiers of the CDS feature to add various bits of annotation
* to the transcript (or to the polypeptide, if there is one). Specifically,
* add synonyms, properties and products, followed by dbxrefs, GO terms,
* similarities, curation and literature references.
* <p>
* Comment properties share one rank counter (<code>commentRank</code>), which
* starts after the <code>/note</code> comments and is advanced by each further
* comment added here, so the order of the statements matters.
*
* @throws DataError if any of the qualifiers cannot be processed
*/
protected void processTranscriptQualifiers() throws DataError {
checkForPreviousSystematicIdEqualToSystematicId();
addTranscriptSynonymsFromQualifier("synonym", "synonym", true);
// NOTE(review): previous systematic ids are added with isCurrent=true -- confirm this is intended.
addTranscriptSynonymsFromQualifier("previous_systematic_id", "previous_systematic_id", true);
// Each /note becomes a comment; the returned rank is where later comments continue.
int commentRank = processPropertyQualifier("note", "feature_property", "comment");
// Qualifiers listed in qualifierProperties map directly to genedb_misc properties of the same name.
for (String name: qualifierProperties) {
processPropertyQualifier(name, "genedb_misc", name, uniqueQualifiers.contains(name));
}
// A /translation supplies the residues directly: whitespace is stripped and the sequence upper-cased.
for (String translation: feature.getQualifierValues("translation")) {
translation = translation.replaceAll("\\s+", "");
translation = translation.toUpperCase();
logger.debug(String.format("Setting translation to sequence from EMBL file : %s", translation));
focalFeature.setResidues(translation);
focalFeature.addFeatureProp("Translation loaded from EMBL", "feature_property", "comment", commentRank++);
}
addColourToExons();
// /class terms must already exist in the RILEY CV; /product terms are created on demand.
processCvTermQualifier("class", "RILEY", "RILEY", false, normaliseRileyNumber);
processCvTermQualifier("product", "genedb_products", "PRODUCT", true);
String label = feature.getQualifierValue("label");
if (label != null) {
focalFeature.addFeatureProp(String.format("/label=%s", label), "feature_property", "comment", commentRank++);
}
if (taxonomicDivision.equals("PRO")) {
// Bacteria don't have splicing, so a CDS feature is a gene and
// a transcript and that is the end of it. One or more /gene
// or /synonym qualifiers may be used to indicate synonyms.
addTranscriptSynonymsFromQualifier("gene", "synonym", true);
}
// A /partial qualifier is recorded as a plain "partial" comment.
if (feature.hasQualifier("partial")) {
logger.trace(String.format("Marking feature '%s' as partial", focalFeature.getUniqueName()));
focalFeature.addFeatureProp("partial", "feature_property", "comment", commentRank++);
}
// Each /db_xref value may itself be a pipe-separated list.
for (String dbxrefs: feature.getQualifierValues("db_xref")) {
addDbXRefs(dbxrefs);
}
// processGO runs before processLiterature: it records PubMed accessions
// in seenPubAccessions, which addPub consults to skip duplicates.
processGO();
processSimilarityQualifiers();
processCuration();
processLiterature();
}
/**
 * Copy the /colour qualifier of the current feature, if it has one, onto every
 * exon of the transcript as a <code>genedb_misc</code> colour property. If a
 * parser is registered for /colour, the value is normalised with it first.
 *
 * @throws DataError if the colour value cannot be normalised
 */
protected void addColourToExons() throws DataError {
    final String rawColour = feature.getQualifierValue("colour");
    if (rawColour == null) {
        return;
    }

    final TermParser parser = qualifierParsers.get("colour");
    final String colourValue;
    if (parser == null) {
        // No parser registered: use the qualifier value verbatim.
        colourValue = rawColour;
    } else if (parser instanceof TermNormaliser) {
        colourValue = ((TermNormaliser) parser).normalise(rawColour);
    } else {
        throw new RuntimeException("The /colour parser is not a TermNormaliser?!");
    }

    logger.trace(String.format("Adding /colour=\"%s\" to exons of '%s'", colourValue, transcript.getUniqueName()));
    for (AbstractExon exon: transcript.getExons()) {
        logger.trace(String.format("Adding /colour=\"%s\" to exon '%s'", colourValue, exon.getUniqueName()));
        exon.addFeatureProp(colourValue, "genedb_misc", "colour", 0);
    }
}
/**
 * Fail early when a /previous_systematic_id qualifier repeats the feature's own
 * /systematic_id. Without this explicit check the problem only surfaces later as
 * a constraint violation, which is difficult to understand and track down.
 * Nothing is checked when the feature has no /systematic_id.
 *
 * @throws DataError if a /previous_systematic_id equals the /systematic_id
 */
private void checkForPreviousSystematicIdEqualToSystematicId() throws DataError {
    final String currentId = feature.getQualifierValue("systematic_id");
    if (currentId != null) {
        for (String previousId: feature.getQualifierValues("previous_systematic_id")) {
            boolean clash = currentId.equals(previousId);
            if (clash) {
                throw new DataError("Feature has /previous_systematic_id with the same value as /systematic_id");
            }
        }
    }
}
}
/**
* Parse a property value. What this means will depend on the specific parser used.
* The parser may return multiple values, e.g. for semicolon-separated product terms.
*/
private static interface TermParser {
public Iterable<String> parse(String term) throws DataError;
}
/**
 * A TermParser for the common case where parsing produces exactly one string.
 * Subclasses implement {@link #normalise}; {@link #parse} wraps its result in
 * a single-element collection.
 */
private abstract static class TermNormaliser implements TermParser {
    /**
     * @param term the raw value to normalise
     * @return the normalised form of <code>term</code>
     * @throws DataError if the term is not acceptable to this normaliser
     */
    public abstract String normalise(String term) throws DataError;

    public final Iterable<String> parse(String term) throws DataError {
        String normalised = normalise(term);
        return Collections.singleton(normalised);
    }
}
/**
 * A term parser for products. The term is split wherever a semicolon is
 * followed by one or more whitespace characters; a semicolon that is not
 * followed by whitespace does not separate products.
 */
private static final TermParser productParser = new TermParser() {
    public Iterable<String> parse(String term) throws DataError {
        String[] products = term.split(";\\s+");
        return IterableArray.fromArray(products);
    }
};
/**
 * A term normaliser for Riley classification numbers, which for example
 * will normalise "2.2.07" to "2.2.7". A Riley number must consist of exactly
 * three dot-separated components of one or two digits each.
 */
private static final TermNormaliser normaliseRileyNumber = new TermNormaliser() {
    private final Pattern RILEY_PATTERN = Pattern.compile("(\\d{1,2})\\.(\\d{1,2})\\.(\\d{1,2})");

    /**
     * @param term the Riley number as written in the file
     * @return the same number with leading zeroes removed from each component
     * @throws DataError if the term is not a well-formed Riley number
     */
    @Override
    public String normalise(String term) throws DataError {
        Matcher matcher = RILEY_PATTERN.matcher(term);
        if (!matcher.matches()) {
            throw new DataError(String.format("Failed to parse Riley number '%s'", term));
        }
        // Build the result with StringBuilder rather than String.format("%d"):
        // %d localises digits according to the default locale, and the value
        // produced here must always use plain ASCII digits.
        StringBuilder normalised = new StringBuilder();
        for (int group = 1; group <= matcher.groupCount(); group++) {
            if (group > 1) {
                normalised.append('.');
            }
            normalised.append(Integer.parseInt(matcher.group(group)));
        }
        return normalised.toString();
    }
};
/**
 * A term normaliser that accepts only integers, returning the canonical
 * decimal representation of the parsed value.
 */
private static final TermNormaliser normaliseInteger = new TermNormaliser() {
    @Override
    public String normalise(String term) throws DataError {
        int value;
        try {
            value = Integer.parseInt(term);
        } catch (NumberFormatException e) {
            throw new DataError(String.format("Failed to parse integer '%s'", term));
        }
        return Integer.toString(value);
    }
};
/**
 * The qualifiers that correspond directly to similarly-named
 * properties in the <code>genedb_misc</code> CV.
 */
private static final List<String> qualifierProperties = new ArrayList<String>();

/** Term parsers registered for particular qualifiers, keyed by qualifier name. */
private static final Map<String,TermParser> qualifierParsers = new HashMap<String,TermParser>();

/**
 * Qualifiers for which <code>true</code> is passed as the final argument of
 * <code>processPropertyQualifier</code>; presumably those allowed at most one value.
 */
private static final Set<String> uniqueQualifiers = new HashSet<String>();

static {
    final String[] propertyNames = {
        "method", "colour", "status",
        "blast_file", "blastn_file", "blastp+go_file", "blastp_file",
        "blastx_file", "fasta_file", "fastx_file", "tblastn_file",
        "tblastx_file", "clustalx_file", "sigcleave_file", "pepstats_file",
        "EC_number", "private",
    };
    for (String propertyName: propertyNames) {
        qualifierProperties.add(propertyName);
    }

    uniqueQualifiers.add("colour");
    uniqueQualifiers.add("status");

    qualifierParsers.put("colour", normaliseInteger);
    qualifierParsers.put("product", productParser);
    // No integer parser is registered for /status: some files
    // (e.g. Streptococcus_pneumoniae_D39.embl) have things other than integers in /status.
}
/**
 * Gene loader for CDS features. The transcript is loaded as a pseudogenic
 * transcript when the CDS is marked as pseudo, and as an mRNA otherwise.
 */
class CDSLoader extends GeneLoader {
    /**
     * @param cdsFeature the CDS feature to load
     * @throws DataError if the /codon_start qualifier is not an integer from 1 to 3
     */
    public CDSLoader(FeatureTable.CDSFeature cdsFeature) throws DataError {
        super(cdsFeature);

        isPseudo = cdsFeature.isPseudo();
        isObsolete = cdsFeature.isObsolete();
        geneUniqueName = cdsFeature.getSharedId();
        transcriptUniqueName = cdsFeature.getUniqueName();

        // Bacteria don't have splicing, so for them a CDS feature is both the gene
        // and the transcript. Their /gene qualifiers may indicate synonyms, so the
        // gene name is taken from the (optional) /primary_name instead.
        boolean prokaryotic = taxonomicDivision.equals("PRO");
        geneName = prokaryotic
            ? cdsFeature.getQualifierValue("primary_name")
            : cdsFeature.getGeneName();

        String codonStart = cdsFeature.getQualifierValue("codon_start");
        if (codonStart != null) {
            int codonStartValue;
            try {
                codonStartValue = Integer.parseInt(codonStart);
            } catch (NumberFormatException e) {
                throw new DataError(
                    String.format("Could not parse value of /codon_start qualifier ('%s')", codonStart));
            }
            // /codon_start is 1-based, whereas the phase is 0-based.
            phase = codonStartValue - 1;
            if (!(phase >= 0 && phase <= 2)) {
                throw new DataError(
                    String.format("Value of /codon_start qualifier out of range (%d)", phase+1));
            }
        }

        // Without a shared ID the transcript's own name doubles as the gene name.
        singlySpliced = (geneUniqueName == null);
        if (singlySpliced) {
            geneUniqueName = transcriptUniqueName;
        }
    }

    @Override
    protected Class<? extends Transcript> getTranscriptClass() {
        if (isPseudo) {
            return PseudogenicTranscript.class;
        }
        return MRNA.class;
    }
}
/**
 * Load a CDS feature using a {@link CDSLoader}.
 *
 * @param cdsFeature the CDS feature to load
 * @return the feature returned by the loader
 * @throws DataError if the feature cannot be loaded
 */
private Feature loadCDS(FeatureTable.CDSFeature cdsFeature) throws DataError {
    CDSLoader cdsLoader = new CDSLoader(cdsFeature);
    return cdsLoader.load();
}
/**
 * Loader for non-coding RNA features. The gene and the transcript share one
 * unique name: the feature's own if it has one, otherwise a synthetic name built
 * from the top-level feature's name, the RNA type and a per-type counter.
 */
private class NcRNALoader extends GeneLoader {
    private Class<? extends NcRNA> transcriptClass;
    private String type;

    /**
     * @param transcriptClass the class of transcript to create
     * @param type the RNA type, used when a synthetic name has to be made up
     * @param feature the feature to load
     * @throws DataError if the superclass constructor rejects the feature
     */
    public NcRNALoader(Class<? extends NcRNA> transcriptClass, String type,
            FeatureTable.Feature feature)
        throws DataError
    {
        super(feature);
        this.transcriptClass = transcriptClass;
        this.type = type;

        String explicitName = feature.getUniqueName(false);
        String chosenName = (explicitName != null) ? explicitName : makeSyntheticName();
        transcriptUniqueName = chosenName;
        geneUniqueName = chosenName;
    }

    /**
     * Advance the counter for this RNA type and build a name of the form
     * <code>&lt;top-level name&gt;_&lt;type&gt;&lt;counter&gt;</code>.
     */
    private String makeSyntheticName() {
        int nextIndex = 1;
        if (syntheticNcRNAIndexByType.containsKey(type)) {
            nextIndex += syntheticNcRNAIndexByType.get(type);
        }
        syntheticNcRNAIndexByType.put(type, nextIndex);

        return String.format("%s_%s%d",
            topLevelFeature.getUniqueName(), type, nextIndex);
    }

    /**
     * Add notes, the anticodon (for tRNA classes only), colour, product, label,
     * synonyms (for prokaryotes only), cross-references, curation and literature
     * from the qualifiers of the feature.
     */
    @Override
    protected void processTranscriptQualifiers() throws DataError {
        int commentRank = processPropertyQualifier("note", "feature_property", "comment");

        boolean isTransferRNA = TRNA.class.isAssignableFrom(transcriptClass);
        if (isTransferRNA) {
            processPropertyQualifier("anticodon", "feature_property", "anticodon", true);
        }

        processPropertyQualifier("colour", "genedb_misc", "colour", true);
        processCvTermQualifier("product", "genedb_products", "PRODUCT", true);
        addColourToExons();

        String label = feature.getQualifierValue("label");
        if (label != null) {
            logger.trace(String.format("Adding /label=\"%s\" as comment on '%s'",
                label, focalFeature.getUniqueName()));
            String labelComment = String.format("/label=%s", label);
            focalFeature.addFeatureProp(labelComment, "feature_property", "comment", commentRank++);
        }

        if (taxonomicDivision.equals("PRO")) {
            // For prokaryotes both /gene and /synonym qualifiers are loaded as synonyms.
            addTranscriptSynonymsFromQualifier("gene", "synonym", true);
            addTranscriptSynonymsFromQualifier("synonym", "synonym", true);
        }

        for (String dbxrefList: feature.getQualifierValues("db_xref")) {
            addDbXRefs(dbxrefList);
        }

        processCuration();
        processLiterature();
    }

    @Override
    protected Class<? extends Transcript> getTranscriptClass() {
        return this.transcriptClass;
    }
}
/**
 * Load a non-coding RNA feature using an {@link NcRNALoader}.
 *
 * @param rnaClass the class of transcript to create
 * @param rnaType the RNA type, passed on to the loader
 * @param feature the feature to load
 * @return the feature returned by the loader
 * @throws DataError if the feature cannot be loaded
 */
private Feature loadNcRNA(Class<? extends NcRNA> rnaClass, String rnaType,
        FeatureTable.Feature feature) throws DataError {
    NcRNALoader ncRNALoader = new NcRNALoader(rnaClass, rnaType, feature);
    return ncRNALoader.load();
}
/* UTR */

/** Old-fashioned transcript name of the form <code>gene.N:mRNA</code>; group 1 is the gene name. */
private static final Pattern UTR_TRANSCRIPT_NAME_WITH_NUMBER_AND_MRNA = Pattern.compile("(\\S+)\\.\\d+:mRNA");

/** Old-fashioned transcript name of the form <code>gene:mRNA</code>; group 1 is the gene name. */
private static final Pattern UTR_TRANSCRIPT_NAME_WITH_ONLY_MRNA = Pattern.compile("(\\S+):mRNA");

/**
 * Load a 3' or 5' UTR feature, creating and persisting one UTR per part of
 * its location.
 * <p>
 * The owning transcript is looked up by the UTR's unique name; if there is
 * no transcript with exactly that name, {@link #guessTranscriptForUTR} is used
 * to find one. Each UTR is named <code>&lt;transcript&gt;:3utr</code> or
 * <code>&lt;transcript&gt;:5utr</code>, with <code>:&lt;part number&gt;</code>
 * appended when the location has more than one part.
 *
 * @param utrFeature the UTR feature to load
 * @return the UTR features created, in location order
 * @throws DataError if no transcript, or more than one candidate transcript, is found
 * @throws RuntimeException if the feature type is neither <code>3'UTR</code> nor <code>5'UTR</code>
 */
private List<UTR> loadUTR(FeatureTable.Feature utrFeature) throws DataError {
    String utrType = utrFeature.type;
    EmblLocation utrLocation = utrFeature.location;
    String uniqueName = utrFeature.getUniqueName();
    logger.debug(String.format("Loading %s for '%s' at %s", utrType, uniqueName, utrLocation));

    // Straightforward case: a transcript with exactly this name.
    Transcript transcript = transcriptsByUniqueName.get(uniqueName);
    if (transcript == null) {
        transcript = guessTranscriptForUTR(uniqueName, utrType);
    }

    Class<? extends UTR> utrClass;
    int prime;
    if (utrType.equals("3'UTR")) {
        utrClass = ThreePrimeUTR.class;
        prime = 3;
    } else if (utrType.equals("5'UTR")) {
        utrClass = FivePrimeUTR.class;
        prime = 5;
    } else {
        throw new RuntimeException(String.format("Unrecognised UTR feature type '%s'", utrType));
    }

    // Plain concatenation rather than String.format("%d"), whose digits depend
    // on the default locale: this string is used as a unique name.
    String baseUniqueName = transcript.getUniqueName() + ":" + prime + "utr";

    List<UTR> utrs = new ArrayList<UTR>();
    List<EmblLocation> utrParts = utrLocation.getParts();
    int part = 1;
    for (EmblLocation utrPartLocation: utrParts) {
        String utrUniqueName = baseUniqueName;
        if (utrParts.size() > 1) {
            utrUniqueName += ":" + part;
        }

        logger.debug(String.format("Creating %s feature '%s' at %d-%d",
            utrType, utrUniqueName, utrPartLocation.getFmin(), utrPartLocation.getFmax()));
        UTR utr = transcript.createUTR(utrClass, utrUniqueName, utrPartLocation.getFmin(), utrPartLocation.getFmax());
        utrs.add(utr);
        session.persist(utr);
        ++ part;
    }
    return utrs;
}

/**
 * Guess which of the transcripts seen so far a UTR belongs to, for use when
 * no transcript has exactly the UTR's unique name.
 * <p>
 * Due to the various forms of old-fashioned transcript names (:mRNA,
 * .\d:mRNA etc) it is not always straightforward figuring out what the
 * transcript should be for a UTR. We work out the gene name by stripping a
 * trailing <code>.N:mRNA</code> or <code>:mRNA</code> from the UTR's name, and
 * then look for known transcripts called <code>gene.N</code> or
 * <code>gene.N:mRNA</code>. This does not work for alternatively spliced genes
 * where several transcripts match up to the gene name; that case is reported
 * as an error.
 * <p>
 * NOTE(review): candidate transcripts are matched with a single-digit number
 * only, as before, although a multi-digit suffix is stripped from the UTR's
 * name. Confirm whether multi-digit transcript numbers should match too.
 *
 * @param uniqueName the unique name of the UTR feature
 * @param utrType the feature type of the UTR, used in error messages
 * @return the only transcript whose name fits the gene name
 * @throws DataError if no transcript, or more than one transcript, fits
 */
private Transcript guessTranscriptForUTR(String uniqueName, String utrType) throws DataError {
    String possibleGeneName;
    Matcher withNumberAndMrna = UTR_TRANSCRIPT_NAME_WITH_NUMBER_AND_MRNA.matcher(uniqueName);
    Matcher withOnlyMrna = UTR_TRANSCRIPT_NAME_WITH_ONLY_MRNA.matcher(uniqueName);
    if (withNumberAndMrna.matches()) {
        possibleGeneName = withNumberAndMrna.group(1);
    } else if (withOnlyMrna.matches()) {
        possibleGeneName = withOnlyMrna.group(1);
    } else {
        possibleGeneName = uniqueName;
    }
    logger.debug(String.format("The possible gene name is %s", possibleGeneName));

    // The gene name is quoted so that it is matched literally, and the dot is
    // escaped so that it cannot match an arbitrary character. The pattern is
    // compiled once rather than for every known transcript.
    Pattern candidatePattern = Pattern.compile(Pattern.quote(possibleGeneName) + "\\.\\d(?::mRNA)?");
    List<String> possibleTranscriptNames = new ArrayList<String>();
    for (String transcriptName: transcriptsByUniqueName.keySet()) {
        if (candidatePattern.matcher(transcriptName).matches()) {
            possibleTranscriptNames.add(transcriptName);
        }
    }

    if (possibleTranscriptNames.isEmpty()) {
        throw new DataError(String.format("Could not find a transcript '%s' for %s", uniqueName, utrType));
    }
    if (possibleTranscriptNames.size() > 1) {
        // Alternative splicing: we cannot tell which transcript is meant.
        throw new DataError(String.format("Multiple transcripts possible for this %s '%s': %s",
            utrType, uniqueName, possibleTranscriptNames));
    }

    String assumedTranscriptName = possibleTranscriptNames.get(0);
    logger.warn(String.format("Assuming %s is the transcript for this UTR for %s", assumedTranscriptName, uniqueName));
    return transcriptsByUniqueName.get(assumedTranscriptName);
}
/* Setters and Spring stuff */
/**
 * Store the supplied OrganismDao in this object's <code>organismDao</code> field.
 * Presumably called by Spring when the bean is configured.
 *
 * @param organismDao the OrganismDao to store
 */
public void setOrganismDao(OrganismDao organismDao) {
this.organismDao = organismDao;
}
/**
 * Set the ObjectManager. This is expected to be called by Spring.
 * We will inject the GeneralDao object into the ObjectManager ourselves from
 * {@link #afterPropertiesSet}, so this ObjectManager need not have the GeneralDao
 * injected yet. This avoids circularity.
 *
 * @param objectManager the ObjectManager to store; its DAOs are set later,
 *                      in {@link #afterPropertiesSet}
 */
public void setObjectManager(ObjectManager objectManager) {
this.objectManager = objectManager;
}
/**
 * Store the supplied SessionFactory in this object's <code>sessionFactory</code> field.
 * Presumably called by Spring when the bean is configured.
 *
 * @param sessionFactory the SessionFactory to store
 */
public void setSessionFactory(SessionFactory sessionFactory) {
this.sessionFactory = sessionFactory;
}
/**
 * Store the supplied GeneralDao in this object's <code>generalDao</code> field.
 * It is passed on to the ObjectManager in {@link #afterPropertiesSet}.
 *
 * @param generalDao the GeneralDao to store
 */
public void setGeneralDao(GeneralDao generalDao) {
this.generalDao = generalDao;
}
/**
 * Store the supplied CvDao in this object's <code>cvDao</code> field.
 * It is passed on to the ObjectManager in {@link #afterPropertiesSet}.
 *
 * @param cvDao the CvDao to store
 */
public void setCvDao(CvDao cvDao) {
this.cvDao = cvDao;
}
/**
 * Store the supplied PubDao in this object's <code>pubDao</code> field.
 * It is passed on to the ObjectManager in {@link #afterPropertiesSet}.
 *
 * @param pubDao the PubDao to store
 */
public void setPubDao(PubDao pubDao) {
this.pubDao = pubDao;
}
/**
 * Store the supplied FeatureUtils in this object's <code>featureUtils</code> field.
 * Presumably called by Spring when the bean is configured.
 *
 * @param featureUtils the FeatureUtils to store
 */
public void setFeatureUtils(FeatureUtils featureUtils) {
this.featureUtils = featureUtils;
}
/**
 * Complete the wiring once all the setters have been called: give the
 * ObjectManager to the synonym manager, then pass the GeneralDao, CvDao and
 * PubDao that were injected into this object on to the ObjectManager.
 * NOTE(review): presumably the Spring post-configuration callback; confirm
 * against the class declaration.
 */
public void afterPropertiesSet() {
synonymManager.setObjectManager(objectManager);
/*
* We cannot set the DAOs of the objectManager
* directly in Load.xml, because that creates a circular
* reference that (understandably) causes Spring to
* throw a tantrum. Thus we inject them into
* here, and pass them to the ObjectManager after Spring
* configuration.
*/
objectManager.setGeneralDao(generalDao);
objectManager.setCvDao(cvDao);
objectManager.setPubDao(pubDao);
}
}