package org.gmod.schema.feature;
import org.genedb.db.analyzers.AllNamesAnalyzer;
import org.genedb.db.analyzers.AlphaNumericAnalyzer;
import org.gmod.schema.cfg.FeatureType;
import org.gmod.schema.mapped.CvTerm;
import org.gmod.schema.mapped.DbXRef;
import org.gmod.schema.mapped.Feature;
import org.gmod.schema.mapped.FeatureCvTerm;
import org.gmod.schema.mapped.FeatureLoc;
import org.gmod.schema.mapped.FeatureProp;
import org.gmod.schema.mapped.FeatureRelationship;
import org.gmod.schema.mapped.Organism;
import org.gmod.schema.utils.PeptideProperties;
import org.gmod.schema.utils.StrandedLocation;
import org.apache.log4j.Logger;
import org.biojava.bio.BioException;
import org.biojava.bio.proteomics.IsoelectricPointCalc;
import org.biojava.bio.proteomics.MassCalc;
import org.biojava.bio.seq.ProteinTools;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.bio.symbol.SimpleSymbolList;
import org.biojava.bio.symbol.SymbolList;
import org.biojava.bio.symbol.SymbolPropertyTable;
import org.hibernate.search.annotations.Analyzer;
import org.hibernate.search.annotations.Field;
import org.hibernate.search.annotations.Index;
import org.hibernate.search.annotations.Indexed;
import org.hibernate.search.annotations.Store;
import org.springframework.util.StringUtils;
import com.google.common.collect.Lists;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import javax.persistence.Entity;
import javax.persistence.Transient;
@SuppressWarnings("serial")
@Entity
@FeatureType(cv="sequence", term="polypeptide")
@Indexed
public class Polypeptide extends Region {
/** Log4j logger for this class. */
private static Logger logger = Logger.getLogger(Polypeptide.class);
/**
 * Cache for the transcript located by {@link #getTranscript()}.
 * Marked {@code @Transient}, so it is not a persistent property.
 */
@Transient
private Transcript transcript;
/**
 * Package-private no-argument constructor.
 * NOTE(review): presumably present for the persistence framework, which
 * instantiates entities reflectively -- confirm.
 */
Polypeptide() {
// empty
}
/**
 * Create a polypeptide, passing every argument unchanged to the
 * superclass ({@code Region}) constructor.
 *
 * @param organism the organism argument forwarded to the superclass
 * @param uniqueName the unique name argument forwarded to the superclass
 * @param analysis the analysis flag forwarded to the superclass
 * @param obsolete the obsolete flag forwarded to the superclass
 * @param dateAccessioned the timestamp forwarded to the superclass
 */
public Polypeptide(Organism organism, String uniqueName, boolean analysis,
boolean obsolete, Timestamp dateAccessioned) {
super(organism, uniqueName, analysis, obsolete, dateAccessioned);
}
/**
 * Convenience constructor: delegates to the five-argument constructor with
 * <code>analysis</code> and <code>obsolete</code> both <code>false</code>
 * and the current system time as the accession timestamp.
 *
 * @param organism the organism argument forwarded to the full constructor
 * @param uniqueName the unique name argument forwarded to the full constructor
 */
public Polypeptide(Organism organism, String uniqueName) {
this(organism, uniqueName, false, false, new Timestamp(System.currentTimeMillis()));
}
/**
 * Get the transcript associated with this polypeptide.
 * <p>
 * The first call scans the relationships returned by
 * <code>getFeatureRelationshipsForSubjectId()</code> and remembers the first
 * object feature that is a <code>Transcript</code>; later calls return the
 * remembered value. If nothing is found an error is logged (on every call,
 * since nothing is cached in that case).
 *
 * @return the associated transcript, or <code>null</code> if there is none
 */
@Transient
public Transcript getTranscript() {
    if (transcript == null) {
        for (FeatureRelationship relationship : getFeatureRelationshipsForSubjectId()) {
            Feature candidate = relationship.getObjectFeature();
            if (candidate instanceof Transcript) {
                transcript = (Transcript) candidate;
                break;
            }
        }
        if (transcript == null) {
            logger.error(String.format("The polypeptide '%s' has no associated transcript", getUniqueName()));
        }
    }
    return transcript;
}
/**
 * Get the gene of the transcript associated with this polypeptide.
 *
 * @return the result of <code>getGene()</code> on the associated transcript,
 *         or <code>null</code> if this polypeptide has no transcript
 */
@Transient
public AbstractGene getGene() {
    Transcript owningTranscript = getTranscript();
    return (owningTranscript == null) ? null : owningTranscript.getGene();
}
/**
 * Get the unique name of this polypeptide's gene. Indexed as the
 * un-tokenized, stored field <code>gene</code>.
 *
 * @return the gene's unique name, or <code>null</code> if there is no gene
 */
@Transient
@Field(name = "gene", index = Index.UN_TOKENIZED, store = Store.YES)
public String getGeneUniqueName() {
    AbstractGene owningGene = getGene();
    return (owningGene == null) ? null : owningGene.getUniqueName();
}
/**
 * Get the number of non-obsolete transcripts of this polypeptide's gene.
 * Indexed as the un-tokenized, stored field
 * <code>alternateTranscriptNumber</code>.
 *
 * @return the size of the gene's non-obsolete transcript collection,
 *         or <code>0</code> if there is no gene
 */
@Transient
@Field(name = "alternateTranscriptNumber", index = Index.UN_TOKENIZED, store = Store.YES)
public int getAlternateTranscriptNumber() {
    AbstractGene owningGene = getGene();
    if (owningGene == null) {
        return 0;
    }
    return owningGene.getNonObsoleteTranscripts().size();
}
/**
 * Get the value of <code>alternateTranscripts()</code> for this
 * polypeptide's gene. Indexed as the un-tokenized, stored field
 * <code>alternateTranscripts</code>.
 *
 * @return whatever the gene's <code>alternateTranscripts()</code> returns,
 *         or <code>null</code> if there is no gene
 */
@Transient
@Field(name = "alternateTranscripts", index = Index.UN_TOKENIZED, store = Store.YES)
public String getAlternateTranscripts() {
    AbstractGene owningGene = getGene();
    return (owningGene == null) ? null : owningGene.alternateTranscripts();
}
/**
 * Get the names of the products of this polypeptide, i.e. the names of the
 * terms attached via FeatureCvTerm whose CV is called
 * <code>genedb_products</code>.
 *
 * @return a newly-created list of product names; empty (never
 *         <code>null</code>) if there are none
 */
@Transient
public List<String> getProducts() {
    List<String> productNames = new ArrayList<String>();
    for (FeatureCvTerm annotation : this.getFeatureCvTerms()) {
        if (!annotation.getType().getCv().getName().equals("genedb_products")) {
            continue;
        }
        productNames.add(annotation.getType().getName());
    }
    return productNames;
}
/**
 * Add a product to this polypeptide by calling <code>addCvTerm</code>
 * with the CV name <code>genedb_products</code>.
 *
 * @param product the product name, used as the term name
 */
public void addProduct(String product) {
addCvTerm("genedb_products", product);
}
/**
 * Get the ID number of the colour associated with this polypeptide,
 * read from the <code>genedb_misc</code>/<code>colour</code> property.
 * It is often unassigned, in which case <code>null</code> is returned.
 *
 * @return the colour ID, or <code>null</code> if the property is missing
 *         or empty
 * @throws NumberFormatException if the property value is non-empty but is
 *         not a parsable integer
 */
@Transient
public Integer getColourId() {
    /* The colour property may be absent altogether, or present with a
       null value; both are treated as "no colour" (as is the empty string). */
    String rawColour = getProperty("genedb_misc", "colour");
    if (rawColour != null && !rawColour.equals("")) {
        return Integer.valueOf(rawColour);
    }
    return null;
}
/**
 * Get all the polypeptide regions of the specified type. The candidates are
 * the features of the FeatureLocs returned by
 * <code>getFeatureLocsForSrcFeatureId()</code>.
 *
 * @param <T> the type of region. Must be a subclass of <code>PolypeptideRegion</code>
 * @param type a class object representing the region type. For example, <code>PolypeptideDomain.class</code>
 * @return a newly-created sorted set (a <code>TreeSet</code> using the
 *         regions' natural ordering) of the regions of the requested type
 */
@Transient
public <T extends PolypeptideRegion> SortedSet<T> getRegions(Class<T> type) {
    SortedSet<T> matching = new TreeSet<T>();
    for (FeatureLoc loc : this.getFeatureLocsForSrcFeatureId()) {
        Feature located = loc.getFeature();
        if (!type.isAssignableFrom(located.getClass())) {
            continue;
        }
        matching.add(type.cast(located));
    }
    return matching;
}
/**
 * Returns the featurelocs of amino acid features located on the polypeptide.
 *
 * @param <T> the type of feature, must be a subclass of <code>AminoAcid</code>
 * @param type a class object representing the feature type; a featureloc is
 *        included when its feature's class is assignable to this type
 * @return a newly-created list of the matching featurelocs, in the iteration
 *         order of <code>getFeatureLocsForSrcFeatureId()</code>
 */
@Transient
public <T extends AminoAcid> List<FeatureLoc> getAminoAcidFeatureLocs(Class<T> type) {
    List<FeatureLoc> matchingLocs = new ArrayList<FeatureLoc>();
    for (FeatureLoc loc : this.getFeatureLocsForSrcFeatureId()) {
        Feature located = loc.getFeature();
        if (!type.isAssignableFrom(located.getClass())) {
            continue;
        }
        matchingLocs.add(loc);
    }
    return matchingLocs;
}
/**
 * Get the (predicted) domains of this polypeptide, i.e. the regions of type
 * <code>PolypeptideDomain</code> returned by {@link #getRegions(Class)}.
 * <p>
 * This is a derived value, so it is marked <code>@Transient</code> like the
 * other computed getters of this class.
 *
 * @return a sorted set of domains
 */
@Transient
public SortedSet<PolypeptideDomain> getDomains() {
    return getRegions(PolypeptideDomain.class);
}
/**
 * Get the (predicted) MembraneStructure of this protein.
 *
 * @return the single MembraneStructure region of this polypeptide, or
 *         <code>null</code> if there is none
 * @throws IllegalStateException if more than one MembraneStructure region
 *         is found
 */
@Transient
public MembraneStructure getMembraneStructure() {
    Set<MembraneStructure> found = getRegions(MembraneStructure.class);
    if (found.isEmpty()) {
        return null;
    }
    if (found.size() <= 1) {
        return found.iterator().next();
    }
    throw new IllegalStateException(String.format("Found more than one MembraneStructure for polypeptide '%s'",
            getUniqueName()));
}
/**
 * Calculate the predicted properties of this polypeptide: amino-acid count,
 * isoelectric point, mass and charge.
 * <p>
 * A trailing termination symbol (*) is dropped before the count,
 * isoelectric point and mass are computed. A failure to compute the
 * isoelectric point or the mass is logged and leaves that property unset.
 *
 * @return a <code>PeptideProperties</code> object containing the predicted
 *         properties of this polypeptide; <code>null</code> if there are no
 *         residues; an object with no properties set if the residues are
 *         empty, consist only of a termination symbol, or cannot be
 *         tokenized as a protein sequence
 */
public PeptideProperties calculateStats() {
    final String sequence = this.getResidues();
    if (sequence == null) {
        logger.warn("No residues for '" + this.getUniqueName() + "'");
        return null;
    }

    final PeptideProperties properties = new PeptideProperties();
    SymbolList symbols = null;
    try {
        SymbolTokenization tokenization = ProteinTools.getTAlphabet().getTokenization("token");
        symbols = new SimpleSymbolList(tokenization, sequence);
        final int symbolCount = symbols.length();
        if (symbolCount == 0) {
            logger.error(String.format("Polypeptide feature '%s' has zero-length residues", this.getUniqueName()));
            return properties;
        }
        try {
            // SymbolList positions are 1-based, so position symbolCount is the last symbol.
            // A trailing termination symbol (*) must not take part in the calculations.
            if (symbols.symbolAt(symbolCount) == ProteinTools.ter()) {
                if (symbolCount == 1) {
                    logger.error(String.format("Polypeptide feature '%s' only has termination symbol", this.getUniqueName()));
                    return properties;
                }
                symbols = symbols.subList(1, symbolCount - 1);
            }
        } catch (IndexOutOfBoundsException outOfRange) {
            throw new RuntimeException(outOfRange);
        }
    } catch (BioException tokenizationFailure) {
        logger.error("Can't translate into a protein sequence", tokenizationFailure);
        return properties;
    }

    properties.setAminoAcids(symbols.length());

    try {
        properties.setIsoelectricPoint(new IsoelectricPointCalc().getPI(symbols, false, false));
    } catch (Exception piFailure) {
        logger.error(String.format("Error computing protein isoelectric point for '%s'", symbols), piFailure);
    }

    // calculateMass signals failure by returning -1
    final double massInDaltons = calculateMass(symbols);
    if (massInDaltons != -1) {
        properties.setMass(massInDaltons);
    }

    // The charge is computed from the raw residue string (terminator included)
    properties.setCharge(calculateCharge(sequence));
    return properties;
}
/**
 * Calculate the mass of a residue sequence using <code>MassCalc.getMass</code>
 * with the <code>AVG_MASS</code> property table.
 *
 * @param residuesSymbolList the residues whose mass is wanted
 * @return the computed mass, or <code>-1.0</code> if the calculation threw
 *         an exception (which is logged)
 */
private double calculateMass(SymbolList residuesSymbolList) {
    double result = -1.0;
    try {
        result = MassCalc.getMass(residuesSymbolList, SymbolPropertyTable.AVG_MASS, true);
    } catch (Exception failure) {
        logger.error(String.format("Error computing protein mass in '%s' because '%s'", getUniqueName(), failure.getMessage()));
    }
    return result;
}
/**
 * Calculate the charge of a polypeptide by summing a fixed contribution per
 * residue: B and Z -0.5; D and E -1.0; H +0.5; K, R and O +1.0. Every other
 * character contributes nothing.
 *
 * @param residues a string representing the polypeptide residues, using the single-character code
 * @return the charge of this polypeptide (units are not specified here)
 */
private double calculateCharge(String residues) {
    double total = 0.0;
    final int length = residues.length();
    for (int position = 0; position < length; position++) {
        final char code = residues.charAt(position);
        if (code == 'B' || code == 'Z') {
            total += -0.5;
        } else if (code == 'D' || code == 'E') {
            total += -1.0;
        } else if (code == 'H') {
            total += 0.5;
        } else if (code == 'K' || code == 'R') {
            total += 1.0;
        } else if (code == 'O') {
            /*
             * EMBOSS seems to think that 'O' (presumably Pyrrolysine)
             * also contributes +1 to the charge. According to Wikipedia,
             * this obscure amino acid is found only in methanogenic archaea,
             * so it is unlikely to matter in practice, but it does no harm.
             */
            total += 1.0;
        }
    }
    return total;
}
/**
 * Create a new polypeptide (not analysis, not obsolete) and attach it to
 * <code>parent</code> via <code>parent.addLocatedChild</code>.
 *
 * @param parent the feature the new polypeptide is added to as a located child
 * @param location the location passed to <code>addLocatedChild</code>
 * @param systematicId the unique name of the new polypeptide
 * @param organism the organism of the new polypeptide
 * @param now the accession timestamp of the new polypeptide
 * @return the newly-created polypeptide
 */
public static Polypeptide make(Feature parent, StrandedLocation location,
        String systematicId, Organism organism, Timestamp now) {
    final Polypeptide created = new Polypeptide(organism, systematicId, false, false, now);
    parent.addLocatedChild(created, location);
    return created;
}
/**
 * Whether this polypeptide has the <code>genedb_misc</code>/<code>GPI_anchored</code>
 * property. Indexed as the un-tokenized, unstored field <code>gpiAnchored</code>.
 *
 * @return the result of <code>hasProperty("genedb_misc", "GPI_anchored")</code>
 */
@Transient
@Field(name="gpiAnchored", index=Index.UN_TOKENIZED, store=Store.NO)
public boolean isGPIAnchored() {
    boolean anchored = hasProperty("genedb_misc", "GPI_anchored");
    return anchored;
}
/**
 * Add an orthologue link from the specified polypeptide to this one, as a
 * feature relationship of type <code>sequence</code>/<code>orthologous_to</code>.
 *
 * @param source the source polypeptide
 * @return the newly-created FeatureRelationship object
 */
public FeatureRelationship addOrthologue(Polypeptide source) {
    FeatureRelationship orthologueLink = this.addFeatureRelationship(source, "sequence", "orthologous_to");
    return orthologueLink;
}
/**
 * Add a paralogue link from the specified polypeptide to this one, as a
 * feature relationship of type <code>sequence</code>/<code>paralogous_to</code>.
 *
 * @param source the source polypeptide
 * @return the newly-created FeatureRelationship object
 */
public FeatureRelationship addParalogue(Polypeptide source) {
    FeatureRelationship paralogueLink = this.addFeatureRelationship(source, "sequence", "paralogous_to");
    return paralogueLink;
}
/**
 * Whether this polypeptide has any of the <code>genedb_misc</code>
 * properties <code>SignalP_prediction</code>,
 * <code>signal_peptide_probability</code> or
 * <code>signal_anchor_probability</code>. Indexed as the un-tokenized,
 * unstored field <code>signalP</code>.
 *
 * @return <code>true</code> if at least one of those properties is present
 */
@Transient
@Field(name="signalP", index=Index.UN_TOKENIZED, store=Store.NO)
public boolean isSignalP() {
    // Checked in this order; evaluation stops at the first property found.
    return hasProperty("genedb_misc", "SignalP_prediction")
            || hasProperty("genedb_misc", "signal_peptide_probability")
            || hasProperty("genedb_misc", "signal_anchor_probability");
}
/**
 * Get the residues of this polypeptide with one trailing star (*) removed,
 * if present. A sequence of length one is returned unchanged, even if it is
 * a lone star. Indexed as an un-tokenized, stored field.
 *
 * @return the residues without a trailing star, or <code>null</code> if
 *         there are no residues
 */
@Transient
@Field(index=Index.UN_TOKENIZED, store=Store.YES)
public String getSequenceResidues() {
    final String sequence = getResidues();
    if (sequence == null) {
        return null;
    }
    final boolean hasTrailingStar = (sequence.length() > 1) && sequence.endsWith("*");
    if (!hasTrailingStar) {
        return sequence;
    }
    // drop the final character, which is the star
    return sequence.substring(0, sequence.length() - 1);
}
/**
 * Whether the <code>genedb_misc</code>/<code>PlasmoAP_score</code> property
 * of this polypeptide is an integer greater than 4. Indexed as the
 * un-tokenized, unstored field <code>apicoplast</code>.
 *
 * @return <code>true</code> if the score parses as an integer above 4;
 *         <code>false</code> if it is lower, missing or not an integer
 */
@Transient
@Field(name="apicoplast", index=Index.UN_TOKENIZED, store=Store.NO)
public boolean isApicoplast() {
    final String rawScore = getProperty("genedb_misc", "PlasmoAP_score");
    try {
        return Integer.parseInt(rawScore) > 4;
    } catch (NumberFormatException notAnInteger) {
        // parseInt throws this for null as well as for non-numeric text
        return false;
    }
}
/**
 * Get the number of <code>TransmembraneRegion</code> regions of this
 * polypeptide, zero-padded to at least five digits (format
 * <code>%05d</code>). Indexed as an un-tokenized, unstored field.
 *
 * @return the zero-padded count as a string
 */
@Transient
@Field(index=Index.UN_TOKENIZED, store=Store.NO)
public String getNumberTMDomains() {
    final int transmembraneCount = this.getRegions(TransmembraneRegion.class).size();
    return String.format("%05d", transmembraneCount);
}
/**
 * Get the value of <code>getSeqLen()</code>, zero-padded to at least six
 * digits (format <code>%06d</code>). Indexed as a tokenized, stored field.
 *
 * @return the zero-padded sequence length as a string
 */
@Transient
@Field(index=Index.TOKENIZED, store=Store.YES)
public String getSequenceLength(){
return String.format("%06d", this.getSeqLen());
}
/**
 * Get the product names of this polypeptide joined by single spaces.
 * Indexed as the tokenized, stored field <code>product</code>, analysed
 * with <code>AllNamesAnalyzer</code>.
 * <p>
 * NOTE(review): an earlier comment here mentioned a counterpart of this
 * method in the ProductiveTranscript class -- confirm whether that
 * duplication still exists.
 *
 * @return the space-separated product names (empty string if there are
 *         none), or <code>null</code> if <code>getProducts()</code>
 *         returns <code>null</code>
 */
@Transient
@Analyzer(impl = AllNamesAnalyzer.class)
@Field(name = "product", index = Index.TOKENIZED, store = Store.YES)
public String getProductsAsSpaceSeparatedString() {
    final List<String> productNames = getProducts();
    return (productNames == null)
            ? null
            : StringUtils.collectionToDelimitedString(productNames, " ");
}
/**
 * Get the product names of this polypeptide joined by single spaces,
 * followed by a hyphen-free copy of every product name that contains a
 * hyphen. Indexed as the tokenized, stored field
 * <code>expandedProduct</code>, analysed with <code>AllNamesAnalyzer</code>.
 * <p>
 * NOTE(review): an earlier comment here mentioned a counterpart of this
 * method in the ProductiveTranscript class -- confirm whether that
 * duplication still exists.
 *
 * @return the space-separated original and hyphen-stripped product names,
 *         or <code>null</code> if <code>getProducts()</code> returns
 *         <code>null</code>
 */
@Transient
@Analyzer(impl = AllNamesAnalyzer.class)
@Field(name = "expandedProduct", index = Index.TOKENIZED, store = Store.YES)
public String getProductsAsSeparatedString() {
    final List<String> productNames = getProducts();
    if (productNames == null) {
        return null;
    }

    // The hyphen-stripped variants are added only to the expandedProduct
    // field. They are collected separately so that the list is not
    // modified while it is being iterated.
    final List<String> hyphenFree = new ArrayList<String>();
    for (String name : productNames) {
        if (name.indexOf("-") >= 0) {
            hyphenFree.add(name.replace("-", ""));
        }
    }
    productNames.addAll(hyphenFree);

    return StringUtils.collectionToDelimitedString(productNames, " ");
}
/**
 * Get the product names of this polypeptide joined by single spaces.
 * Indexed as the tokenized, stored field <code>productAlphanumeric</code>,
 * analysed with <code>AlphaNumericAnalyzer</code>.
 *
 * @return the space-separated product names (empty string if there are
 *         none), or <code>null</code> if <code>getProducts()</code>
 *         returns <code>null</code>
 */
@Transient
@Analyzer(impl = AlphaNumericAnalyzer.class)
@Field(name = "productAlphanumeric", index = Index.TOKENIZED, store = Store.YES)
public String getProductsAlphanumeric(){
    final List<String> productNames = getProducts();
    return (productNames == null)
            ? null
            : StringUtils.collectionToDelimitedString(productNames, " ");
}
/**
 * Get the predicted mass of this polypeptide, rounded to the nearest
 * integer and zero-padded to at least nine digits (format
 * <code>%09d</code>). Indexed as an un-tokenized, unstored field.
 *
 * @return the zero-padded mass, or the empty string if the polypeptide has
 *         no residues, no mass could be computed, or the calculation failed
 */
@Transient
@Field(index=Index.UN_TOKENIZED, store=Store.NO)
public String getMass() {
    try {
        PeptideProperties pp = calculateStats();
        // calculateStats() returns null when there are no residues; test for
        // that explicitly instead of relying on a NullPointerException.
        if (pp == null || !pp.isHasMass()) {
            return "";
        }
        long mass = Math.round(pp.getMassInDaltons());
        return String.format("%09d", mass);
    }
    catch (RuntimeException exp) {
        // Best effort: a failed calculation must not propagate out of this
        // getter, but the failure is logged rather than silently discarded.
        logger.warn(String.format("Could not compute mass for polypeptide '%s'", getUniqueName()), exp);
        return "";
    }
}
/**
 * Get the values of all properties of this polypeptide whose type is the
 * term <code>EC_number</code> in the CV <code>genedb_misc</code>, joined by
 * single spaces. Indexed as a tokenized, unstored field.
 *
 * @return the space-separated values; the empty string if there are none
 */
@Transient
@Field(index=Index.TOKENIZED, store=Store.NO)
public String getEcNums() {
    List<String> ecNumbers = new ArrayList<String>();
    for (FeatureProp property : getFeatureProps()) {
        CvTerm propertyType = property.getType();
        if (!propertyType.getName().equals("EC_number")) {
            continue;
        }
        if (!propertyType.getCv().getName().equals("genedb_misc")) {
            continue;
        }
        ecNumbers.add(property.getValue());
    }
    return StringUtils.collectionToDelimitedString(ecNumbers, " ");
}
/**
 * Get all curation text for this polypeptide, joined by single spaces.
 * This consists of the values of the <code>genedb_misc</code>/<code>curation</code>
 * and <code>feature_property</code>/<code>comment</code> properties,
 * followed by the entries built from the FeatureCvTerms whose CV name
 * starts with <code>CC_genedb_controlledcuration</code>. Indexed as a
 * tokenized, stored field.
 *
 * @return the space-separated curation text; the empty string if there is none
 */
@Transient
@Field(index=Index.TOKENIZED, store=Store.YES)
public String getAllCuration() {
    List<String> curationEntries = new ArrayList<String>();
    for (FeatureProp property : getFeatureProps()) {
        CvTerm propertyType = property.getType();
        String cvName = propertyType.getCv().getName();
        boolean isCuration = cvName.equals("genedb_misc") && propertyType.getName().equals("curation");
        boolean isComment = cvName.equals("feature_property") && propertyType.getName().equals("comment");
        if (isCuration || isComment) {
            curationEntries.add(property.getValue());
        }
    }
    // terms from the CC_genedb_controlledcuration featurecvterms go last
    curationEntries.addAll(populateFromFeatureCvTerms("CC_genedb_controlledcuration"));
    return StringUtils.collectionToDelimitedString(curationEntries, " ");
}
/**
 * Get the entries built from the FeatureCvTerms of this polypeptide whose
 * CV name starts with <code>biological_process</code>,
 * <code>molecular_function</code> or <code>cellular_component</code> (in
 * that order), joined by single spaces. Indexed as a tokenized, unstored field.
 *
 * @return the space-separated entries; the empty string if there are none
 */
@Transient
@Field(index=Index.TOKENIZED, store=Store.NO)
public String getGo() {
    final String[] cvNamePrefixes = {"biological_process", "molecular_function", "cellular_component"};
    List<String> goEntries = new ArrayList<String>();
    for (String cvNamePrefix : cvNamePrefixes) {
        goEntries.addAll(populateFromFeatureCvTerms(cvNamePrefix));
    }
    return StringUtils.collectionToDelimitedString(goEntries, " ");
}
/**
 * Build one "<i>term name</i> <i>accession</i>" string for each
 * FeatureCvTerm returned by
 * <code>getFeatureCvTermsFilteredByCvNameStartsWith(cvNamePrefix)</code>.
 *
 * @param cvNamePrefix the prefix passed to the filter method
 * @return a newly-created collection of the formatted strings
 */
private Collection<String> populateFromFeatureCvTerms(String cvNamePrefix) {
    List<String> formatted = new ArrayList<String>();
    for (FeatureCvTerm annotation : getFeatureCvTermsFilteredByCvNameStartsWith(cvNamePrefix)) {
        CvTerm term = annotation.getCvTerm();
        formatted.add(String.format("%s %s", term.getName(), term.getDbXRef().getAccession()));
    }
    return formatted;
}
/**
 * Get the Pfam search terms for this polypeptide, joined by single spaces.
 * For every domain whose DbXRef belongs to the database named
 * <code>Pfam</code> this contributes, in order: the accession, the
 * description, and (when the accession starts with <code>PF</code>) the
 * accession without that prefix. Indexed as a tokenized, stored field,
 * analysed with <code>AllNamesAnalyzer</code>.
 * <p>
 * Domains without a DbXRef or Db are skipped, and a missing accession or
 * description is left out rather than being indexed as the text "null".
 *
 * @return the space-separated Pfam terms, or <code>null</code> if there are none
 */
@Transient
@Analyzer(impl = AllNamesAnalyzer.class)
@Field(index=Index.TOKENIZED, store=Store.YES)
public String getPfam(){
    List<String> pfams = new ArrayList<String>();
    for (PolypeptideDomain domain : this.getDomains()) {
        DbXRef dbXRef = domain.getDbXRef();
        if (dbXRef == null || dbXRef.getDb() == null
                || !"Pfam".equals(dbXRef.getDb().getName())) {
            continue;
        }
        String accession = dbXRef.getAccession();
        String description = dbXRef.getDescription();
        if (accession != null) {
            pfams.add(accession);
        }
        if (description != null) {
            pfams.add(description);
        }
        if (accession != null && accession.startsWith("PF")) {
            // Only the numerical part is needed (that is what the search
            // uses); strip just the leading "PF", not later occurrences.
            pfams.add(accession.substring("PF".length()));
        }
    }
    if (pfams.isEmpty()) {
        return null;
    }
    return StringUtils.collectionToDelimitedString(pfams, " ");
}
/**
 * Get a collection of the ProteinMatch features that represent similarities
 * between this polypeptide and another (as defined by a /similarity qualifier
 * in a PSU EMBL file).
 * <p>
 * Only FeatureLocs of rank 0 returned by
 * <code>getFeatureLocsForSrcFeatureId()</code> are considered; a rank-0
 * feature that is not a ProteinMatch is reported at debug level and skipped.
 *
 * @return a newly-created collection of the ProteinMatch features that
 *         represent similarities between this polypeptide and another
 */
@Transient
public Collection<ProteinMatch> getSimilarityMatches() {
    List<ProteinMatch> matches = new ArrayList<ProteinMatch>();
    for (FeatureLoc loc : this.getFeatureLocsForSrcFeatureId()) {
        if (loc.getRank() == 0) {
            Feature located = loc.getFeature();
            if (!(located instanceof ProteinMatch)) {
                logger.debug(String.format("getSimilarityMatches: %s is '%s', not ProteinMatch",
                        located, located.getClass()));
                continue;
            }
            matches.add((ProteinMatch) located);
        }
    }
    return matches;
}
/**
 * Get a collection of the ProteinMatch features that represent orthologue clusters
 * to which this polypeptide belongs, i.e. the ProteinMatch features related
 * to it by <code>sequence</code>/<code>orthologous_to</code>.
 *
 * @return a collection of the ProteinMatch features that represent orthologue clusters
 *          to which this polypeptide belongs
 */
@Transient
public Collection<ProteinMatch> getOrthologueClusters() {
    Collection<ProteinMatch> clusters = getRelatedFeatures(ProteinMatch.class, "sequence", "orthologous_to");
    return clusters;
}
/**
 * Get a collection of all the polypeptides to which this polypeptide is
 * directly related by <code>sequence</code>/<code>orthologous_to</code>.
 *
 * @return the Polypeptide features returned by <code>getRelatedFeatures</code>
 *         for that relationship type
 */
@Transient
public Collection<Polypeptide> getDirectOrthologues() {
    Collection<Polypeptide> orthologues = getRelatedFeatures(Polypeptide.class, "sequence", "orthologous_to");
    return orthologues;
}
}