package org.genedb.crawl.modelling; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.log4j.Logger; import org.biojava.bio.BioException; import org.biojava.bio.proteomics.IsoelectricPointCalc; import org.biojava.bio.proteomics.MassCalc; import org.biojava.bio.seq.ProteinTools; import org.biojava.bio.seq.io.SymbolTokenization; import org.biojava.bio.symbol.IllegalSymbolException; import org.biojava.bio.symbol.SimpleSymbolList; import org.biojava.bio.symbol.SymbolList; import org.biojava.bio.symbol.SymbolPropertyTable; import org.genedb.crawl.mappers.FeatureMapper; import org.genedb.crawl.mappers.OrganismsMapper; import org.genedb.crawl.mappers.RegionsMapper; import org.genedb.crawl.model.Coordinates; import org.genedb.crawl.model.Cvterm; import org.genedb.crawl.model.Feature; import org.genedb.crawl.model.LocatedFeature; import org.genedb.crawl.model.Organism; import org.genedb.crawl.model.Property; import org.genedb.util.TranslationException; import org.genedb.util.Translator; import org.springframework.beans.factory.annotation.Autowired; import uk.ac.sanger.artemis.sequence.Bases; /** * A class that does useful things with mappers, centering around features. * * @author gv1 * */ public class FeatureMapperUtil { private static Logger logger = Logger.getLogger(FeatureMapperUtil.class); @Autowired private OrganismsMapper organismsMapper; @Autowired private RegionsMapper regionsMapper; @Autowired public FeatureMapper featureMapper; public static final Set<String> transcriptTypes = new HashSet<String>(Arrays.asList(new String[] {"mRNA", "ncRNA", "snoRNA", "snRNA", "tRNA", "miscRNA", "rRNA"})); public Feature getFeature(String uniqueName, String name, String organism) { Integer organism_id = null; if (organism != null) { Organism o = this.getOrganism(organism); if (o != null) organism_id = o.ID; } Feature resultFeature = featureMapper.getWithSynonym(uniqueName, name, organism_id, null); return resultFeature; } public Organism getOrganism(String organism) { Organism mappedOrganism = null; if (organism.contains(":")) { String[] split = organism.split(":"); if (split.length == 2) { String prefix = split[0]; String orgDescriptor = split[1]; if (prefix.equals("com")) { mappedOrganism = organismsMapper.getByCommonName(orgDescriptor); } else if (prefix.equals("tax")) { mappedOrganism = organismsMapper.getByTaxonID(orgDescriptor); } else if (prefix.equals("org")) { mappedOrganism = organismsMapper.getByID(Integer.parseInt(orgDescriptor)); } } } else { mappedOrganism = organismsMapper.getByCommonName(organism); } return mappedOrganism; } public void summarise(Feature feature) { feature.coordinates = featureMapper.coordinates(feature); // TODO - this might need to be fixed to work with non-LocatedFeature // instances if (feature instanceof LocatedFeature && feature.coordinates != null && feature.coordinates.size() > 0) { LocatedFeature locatedFeature = (LocatedFeature) feature; Coordinates c = locatedFeature.coordinates.get(0); locatedFeature.fmin = c.fmin; locatedFeature.fmax = c.fmax; locatedFeature.region = c.region; locatedFeature.phase = c.phase; locatedFeature.strand = c.strand; } feature.properties = featureMapper.properties(feature); feature.terms = featureMapper.terms(feature); feature.synonyms = featureMapper.synonyms(feature); feature.pubs = featureMapper.pubs(feature); feature.dbxrefs = featureMapper.dbxrefs(feature); feature.domains = featureMapper.domains(feature); for (Feature domain : feature.domains) { summarise(domain); } feature.orthologues = featureMapper.orthologues(feature); } /** * * An isoform is defined (in order of searching) : * - as the parent transcript (for a multi-isoform gene), or * - the first transcript if the requested feature is not a descendant of transcript, or * - the gene if part of a gene model. * * If nothing is found, then assume it is not part of a gene model, in which case just * return the requested feature. * * @param feature * @param ofType * @return */ public Feature getIsoform(Feature feature, List<Cvterm> ofType) { Feature gene = this.getAncestorGene(feature, ofType); if (gene == null) return feature; getDescendants(gene, ofType, false); Feature transcript = getTranscript(feature, gene, true); // if there are no transcripts, must use the gene if (transcript == null) return gene; // if the transcript is alone, use the gene List<Feature> transcripts = getTranscripts(gene); if (transcripts.size() == 1) return gene; return transcript; } public Feature getAncestorGene(Feature currentFeature, List<Cvterm> ofType) { if (currentFeature.type.name.equals("gene") || currentFeature.type.name.equals("pseudogene")) return currentFeature; List<Feature> parents = featureMapper.parents(currentFeature, ofType); for (Feature parent : parents) { // parents are objects Feature root = getAncestorGene(parent, ofType); if (root != null) { return root; } } return null; } public void getDescendants(Feature feature, List<Cvterm> ofType, boolean includeSummaries) { feature.children = featureMapper.children(feature, ofType); if (includeSummaries) summarise(feature); if (feature.children == null) return; for (Feature child : feature.children) { // children are subjects getDescendants(child, ofType, includeSummaries); } } public List<Feature> getTranscripts(Feature gene) { List<Feature> transcripts = new ArrayList<Feature>(); for (Feature child : gene.children) { if ( transcriptTypes.contains(child.type.name)) { transcripts.add(child); } } return transcripts; } public Feature getTranscript(final Feature requested, final Feature hierarchyFeature, final boolean ignoreObsoletes) { Feature firstTranscript = null; logger.info(String.format(" -> %s (%s) ", hierarchyFeature.uniqueName, hierarchyFeature.type.name)); Collections.sort(hierarchyFeature.children, new FeatureUniqueNameSorter()); for (Feature child : hierarchyFeature.children) { if (child.type.name.equals("mRNA")) { if (ignoreObsoletes && child.isObsolete) continue; logger.info(String.format(" --> %s (%s) ", child.uniqueName, child.type.name)); if (requested.uniqueName.equals(hierarchyFeature.uniqueName) || requested.uniqueName.equals(child.uniqueName)) { logger.info("1"); return child; } if (firstTranscript == null) firstTranscript = child; for (Feature grandChild : child.children) { if (ignoreObsoletes && grandChild.isObsolete) continue; logger.info(String.format(" ---> %s (%s) ", grandChild.uniqueName, grandChild.type.name)); String grandChildType = grandChild.type.name; if (requested.type.name.equals(grandChildType) && requested.uniqueName.equals(grandChild.uniqueName)) { if (grandChildType.equals("polypeptide")) { logger.info("2"); return child; } else if (grandChildType.equals("exon")) { logger.info("3"); return child; } } } } } logger.info(" --->! "); return firstTranscript; } public List<Feature> getExons(Feature transcript) { List<Feature> exons = new ArrayList<Feature>(); for (Feature potential_exon : transcript.children) { if (potential_exon.type.name.equals("exon")) { logger.info("exon " + potential_exon.uniqueName); potential_exon.coordinates = featureMapper.coordinates(potential_exon); exons.add(potential_exon); } } return exons; } public String getExonSequence(List<Feature> exons) { StringBuffer sequence = new StringBuffer(); for (Feature exon : exons) { Coordinates coordinates = exon.coordinates.get(0); String dnaString = regionsMapper.sequenceTrimmed(coordinates.region, coordinates.fmin, coordinates.fmax).dna; // if (coordinates.strand < 0) // dnaString = Bases.reverseComplement(dnaString); logger.info(String.format(">%s (%s) %d-%d \n%s", exon.uniqueName, coordinates.region, coordinates.fmin, coordinates.fmax, dnaString)); sequence.append(dnaString); } return sequence.toString(); } class FeatureLocationSorter implements Comparator<Feature> { @Override public int compare(Feature feature1, Feature feature2) { Coordinates coordinates1 = feature1.coordinates.get(0); assert(coordinates1 != null); Coordinates coordinates2 = feature2.coordinates.get(0); assert(coordinates2 != null); if (coordinates1.fmin > coordinates2.fmin) return 1; if (coordinates1.fmin < coordinates2.fmin) return -1; return 0; } } class FeatureUniqueNameSorter implements Comparator<Feature> { @Override public int compare(Feature feature1, Feature feature2) { return feature1.uniqueName.compareTo(feature2.uniqueName); } } public List<Property> getPolypeptideProperties(Feature feature, Feature hierarchyFeature) throws NumberFormatException, BioException, TranslationException { Feature transcript = getTranscript(feature, hierarchyFeature, false); if (transcript == null) throw new RuntimeException("Could not find transcript"); //featureMapper.properties(transcript); List<Feature> exons = getExons(transcript); Collections.sort(exons, new FeatureLocationSorter()); Feature firstExon = exons.get(0); if (firstExon == null) throw new RuntimeException("Could not find exon"); String exonSequence = getExonSequence(exons); logger.info(String.format("transcript : %s, exonSequence : %s", transcript.uniqueName, exonSequence)); Coordinates firstExonCoordinate = firstExon.coordinates.get(0); boolean isFwd = (firstExonCoordinate.strand > 0) ? true : false; if (! isFwd) { logger.info("reversing"); exonSequence = Bases.reverseComplement(exonSequence); logger.info(String.format("reversed transcript : %s, exonSequence : %s", transcript.uniqueName, exonSequence)); } String phase = firstExonCoordinate.phase; if (phase == null) phase = "0"; Organism o = organismsMapper.getByID(feature.organism_id); Property translationTableProp = organismsMapper.getOrganismProp(o, "genedb_misc", "translationTable"); int translationTable = 1; if (translationTableProp != null && translationTableProp.value != null) { translationTable = Integer.parseInt(translationTableProp.value); } return getPolypeptideProperties(exonSequence, Integer.parseInt(phase), translationTable); } public static List<Property> getPolypeptideProperties(String dnaString, int phase, int translationTable) throws BioException, TranslationException { List<Property> properties = new ArrayList<Property>(); // Coordinates coordinates = feature.coordinates.get(0); // int phase = (coordinates.phase != null) ? // Integer.parseInt(coordinates.phase) : 0; // String dnaString = regionsMapper.sequenceTrimmed(coordinates.region, // coordinates.fmin, coordinates.fmax).dna; String residuesString = translate(translationTable, dnaString, phase, false); logger.info(residuesString); // strip stop codons (to avoid hard failures) residuesString = residuesString.replaceAll("[*#+]",""); SymbolTokenization proteinTokenization = ProteinTools.getTAlphabet().getTokenization("token"); SymbolList residuesSymbolList = null; residuesSymbolList = new SimpleSymbolList(proteinTokenization, residuesString); logger.info(residuesSymbolList); if (residuesSymbolList.length() == 0) { throw new RuntimeException(String.format("Polypeptide feature has zero-length residues")); // return properties; } // if the sequence ends with a termination symbol (*), we need to // remove it if (residuesSymbolList.symbolAt(residuesSymbolList.length()) == ProteinTools.ter()) { if (residuesSymbolList.length() == 1) { throw new RuntimeException(String.format("Polypeptide feature only has termination symbol")); // return properties; } residuesSymbolList = residuesSymbolList.subList(1, residuesSymbolList.length() - 1); } Property aminoProp = new Property(); aminoProp.name = "Amino Acids"; aminoProp.value = String.valueOf(residuesSymbolList.length()); properties.add(aminoProp); DecimalFormat df = new DecimalFormat("#.0"); double isoElectricPoint = new IsoelectricPointCalc().getPI(residuesSymbolList, false, false); Property isoProp = new Property(); isoProp.name = "Isoelectric Point"; isoProp.value = "pH " + df.format(isoElectricPoint); properties.add(isoProp); double mass2 = calculateMass(residuesSymbolList); Property massProp = new Property(); massProp.name = "Mass"; massProp.value = df.format(mass2 / 1000) + " kDa"; properties.add(massProp); double charge = calculateCharge(residuesString); Property chargeProp = new Property(); chargeProp.name = "Charge"; chargeProp.value = df.format(charge); properties.add(chargeProp); return properties; } public static String translate(int translationTableId, String dnaSequence, int phase, boolean stopCodonTranslatedAsSelenocysteine) throws TranslationException { return Translator.getTranslator(translationTableId).translate(dnaSequence, phase, stopCodonTranslatedAsSelenocysteine); } public static double calculateMass(SymbolList residuesSymbolList) throws IllegalSymbolException { double massInDaltons = MassCalc.getMass(residuesSymbolList, SymbolPropertyTable.AVG_MASS, true); return massInDaltons; } /** * Calculate the charge of a polypeptide. * * @param residues * a string representing the polypeptide residues, using the * single-character code * @return the charge of this polypeptide (in what units?) */ public static double calculateCharge(String residues) { double charge = 0.0; for (char aminoAcid : residues.toCharArray()) { switch (aminoAcid) { case 'B': case 'Z': charge += -0.5; break; case 'D': case 'E': charge += -1.0; break; case 'H': charge += 0.5; break; case 'K': case 'R': charge += 1.0; break; /* * EMBOSS seems to think that 'O' (presumably Pyrrolysine) also * contributes +1 to the charge. According to Wikipedia, this * obscure amino acid is found only in methanogenic archaea, so * it's unlikely to trouble us soon. Still, it can't hurt: */ case 'O': charge += 1.0; break; } } return charge; } }