FeatureMapperUtil.java example

Explorer

crawl2-master
- src
  - main
    - java
      - org
        genedb
        crawl
        CrawlErrorType.java
        CrawlException.java
        Tester.java
        annotations
        ListType.java
        ResourceDescription.java
        bam
        BAMConverter.java
        BioDataFileStore.java
        BioDataFileStoreInitializer.java
        Sam.java
        business
        FileUtil.java
        TabixGenerator.java
        TabixReader.java
        TabixReaderInfo.java
        client
        CrawlClient.java
        controller
        BaseController.java
        FeatureController.java
        FeaturesController.java
        OrganismsController.java
        RegionsController.java
        SamController.java
        TermsController.java
        TestingController.java
        VariantController.java
        editor
        DatePropertyEditor.java
        ListSplittingPropertyEditor.java
        dao
        FeatureDAO.java
        FeaturesDAO.java
        OrganismsDAO.java
        RegionsDAO.java
        SamDAO.java
        TermsDAO.java
        VariantDAO.java
        backend
        BaseDAO.java
        FeatureDAO.java
        FeaturesDAO.java
        OrganismsDAO.java
        RegionsDAO.java
        SamDAO.java
        TermsDAO.java
        VariantDAO.java
        proxy
        DAOFactory.java
        DAOInvocationHandler.java
        Proxies.java
        ProxyFeatureMapper.java
        elasticsearch
        Connection.java
        LocalConnection.java
        TransportConnection.java
        index
        IndexBuilder.java
        NonDatabaseDataSourceIndexBuilder.java
        cv
        CvIndexBuilder.java
        das
        DASFileBuilder.java
        DASIndexBuilder.java
        DasFetcher.java
        gff
        FeatureBeanFactory.java
        GFFAnnotatationExtractor.java
        GFFFeature.java
        GFFFileFilter.java
        GFFIndexBuilder.java
        GFFSequenceExtractor.java
        json
        OrganismIndexBuilder.java
        ReferenceIndexBuilder.java
        sql
        CvtermUtil.java
        IncrementalSQLIndexBuilder.java
        SQLIndexer.java
        mappers
        ElasticSearchBaseMapper.java
        ElasticSearchFeatureCvtermMapper.java
        ElasticSearchFeatureMapper.java
        ElasticSearchFeaturesMapper.java
        ElasticSearchOrganismsMapper.java
        ElasticSearchRegionsMapper.java
        ElasticSearchTermsMapper.java
        plugin
        RegexPlugin.java
        RegexQueryParser.java
        hazelcast
        HazelcastMonitor.java
        HazelcastService.java
        json
        JsonDateDeserializer.java
        JsonDateSerializer.java
        JsonIzer.java
        mappers
        AuditMapper.java
        FeatureCvtermMapper.java
        FeatureMapper.java
        FeaturesMapper.java
        MapperUtil.java
        OrganismsMapper.java
        RegionsMapper.java
        TermsMapper.java
        model
        Alignment.java
        AlignmentSequenceAlias.java
        Alignments.java
        Analysis.java
        AnalysisFeature.java
        Argument.java
        BioDataFile.java
        BlastPair.java
        Change.java
        Coordinates.java
        CrawlError.java
        Cv.java
        Cvterm.java
        CvtermProp.java
        CvtermRelationship.java
        Db.java
        Dbxref.java
        Exon.java
        Feature.java
        FeatureCollection.java
        FeatureGenes.java
        FeatureGenesList.java
        FeatureRelationship.java
        Gene.java
        HierarchicalFeature.java
        HierarchicalFeatureList.java
        HierarchyGeneFetchResult.java
        HierarchyRelation.java
        LocatedFeature.java
        LocationBoundaries.java
        MappedCoverage.java
        MappedQuery.java
        MappedQueryRecordElementList.java
        MappedSAMHeader.java
        MappedSAMRecord.java
        MappedSAMRecords.java
        MappedSAMSequence.java
        MappedSAMSequenceList.java
        MappedVCFRecord.java
        MappedVariantBase.java
        Organism.java
        OrganismList.java
        Orthologue.java
        Property.java
        Pub.java
        Reference.java
        RegionsInOrganism.java
        Resource.java
        Sequence.java
        Service.java
        Statistic.java
        Synonym.java
        Transcript.java
        Variant.java
        XMLResponseWrapper.java
        adapter
        AlignmentBlockAdapter.java
        modelling
        FeatureMapperUtil.java
        LocatedFeatureUtil.java
        RegionFeatureBuilder.java
        servlet
        CrawlServletContextListener.java
        view
        BaseView.java
        CrawlMappingExceptionResolver.java
        CrawlViewResolver.java
        JacksonView.java
        XMLView.java
        util
        TranslationException.java
        Translator.java
      - uk
        ac
        sanger
        artemis
        components
        variant
        BCFReaderAdapter.java
        FTPSeekableStream.java
        IntEnumPatternResolver.java
        MyVCFRecord.java
        TabixReaderAdapter.java
        VCFRecordAdapter.java
        VariantFilterOption.java
        VariantFilterOptions.java
        VariantReaderAdapter.java
  - test
    - java
      - org
        genedb
        crawl
        ClientTest.java
        CvTest.java
        DatabaseTest.java
        ElasticTest.java
        FTPTest.java
        GFFFeatureTest.java
        VariantFilterOptionsTest.java
        elasticsearch
        index
        ReferenceTest.java
      - uk
        ac
        sanger
        artemis
        components
        variant
        VariantTest.java

package org.genedb.crawl.modelling;

import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;
import org.biojava.bio.BioException;
import org.biojava.bio.proteomics.IsoelectricPointCalc;
import org.biojava.bio.proteomics.MassCalc;
import org.biojava.bio.seq.ProteinTools;
import org.biojava.bio.seq.io.SymbolTokenization;
import org.biojava.bio.symbol.IllegalSymbolException;
import org.biojava.bio.symbol.SimpleSymbolList;
import org.biojava.bio.symbol.SymbolList;
import org.biojava.bio.symbol.SymbolPropertyTable;
import org.genedb.crawl.mappers.FeatureMapper;
import org.genedb.crawl.mappers.OrganismsMapper;
import org.genedb.crawl.mappers.RegionsMapper;
import org.genedb.crawl.model.Coordinates;
import org.genedb.crawl.model.Cvterm;
import org.genedb.crawl.model.Feature;
import org.genedb.crawl.model.LocatedFeature;
import org.genedb.crawl.model.Organism;
import org.genedb.crawl.model.Property;
import org.genedb.util.TranslationException;
import org.genedb.util.Translator;
import org.springframework.beans.factory.annotation.Autowired;

import uk.ac.sanger.artemis.sequence.Bases;

/**
 * A class that does useful things with mappers, centering around features.
 *
 * @author gv1
 *
 */
public class FeatureMapperUtil {

    private static Logger   logger = Logger.getLogger(FeatureMapperUtil.class);

    @Autowired
    private OrganismsMapper organismsMapper;

    @Autowired
    private RegionsMapper   regionsMapper;

    @Autowired
    public FeatureMapper    featureMapper;

    public static final Set<String> transcriptTypes = new HashSet<String>(Arrays.asList(new String[] {"mRNA", "ncRNA", "snoRNA", "snRNA", "tRNA", "miscRNA", "rRNA"}));

    public Feature getFeature(String uniqueName, String name, String organism) {
        Integer organism_id = null;

        if (organism != null) {
            Organism o = this.getOrganism(organism);
            if (o != null)
                organism_id = o.ID;
        }

        Feature resultFeature = featureMapper.getWithSynonym(uniqueName, name, organism_id, null);

        return resultFeature;
    }

    public Organism getOrganism(String organism) {
        Organism mappedOrganism = null;

        if (organism.contains(":")) {
            String[] split = organism.split(":");

            if (split.length == 2) {

                String prefix = split[0];
                String orgDescriptor = split[1];

                if (prefix.equals("com")) {
                    mappedOrganism = organismsMapper.getByCommonName(orgDescriptor);
                } else if (prefix.equals("tax")) {
                    mappedOrganism = organismsMapper.getByTaxonID(orgDescriptor);
                } else if (prefix.equals("org")) {
                    mappedOrganism = organismsMapper.getByID(Integer.parseInt(orgDescriptor));
                }

            }

        } else {

            mappedOrganism = organismsMapper.getByCommonName(organism);

        }

        return mappedOrganism;
    }

    public void summarise(Feature feature) {

        feature.coordinates = featureMapper.coordinates(feature);

        // TODO - this might need to be fixed to work with non-LocatedFeature
        // instances
        if (feature instanceof LocatedFeature && feature.coordinates != null && feature.coordinates.size() > 0) {
            LocatedFeature locatedFeature = (LocatedFeature) feature;
            Coordinates c = locatedFeature.coordinates.get(0);
            locatedFeature.fmin = c.fmin;
            locatedFeature.fmax = c.fmax;
            locatedFeature.region = c.region;
            locatedFeature.phase = c.phase;
            locatedFeature.strand = c.strand;

        }

        feature.properties = featureMapper.properties(feature);
        feature.terms = featureMapper.terms(feature);
        feature.synonyms = featureMapper.synonyms(feature);
        feature.pubs = featureMapper.pubs(feature);
        feature.dbxrefs = featureMapper.dbxrefs(feature);
        feature.domains = featureMapper.domains(feature);

        for (Feature domain : feature.domains) {
            summarise(domain);
        }

        feature.orthologues = featureMapper.orthologues(feature);

    }

    /**
     *
     * An isoform is defined (in order of searching) :
     *   - as the parent transcript (for a multi-isoform gene), or
     *   - the first transcript if the requested feature is not a descendant of transcript, or
     *   - the gene if part of a gene model.
     *
     * If nothing is found, then assume it is not part of a gene model, in which case just
     * return the requested feature.
     *
     * @param feature
     * @param ofType
     * @return
     */
    public Feature getIsoform(Feature feature, List<Cvterm> ofType) {

        Feature gene = this.getAncestorGene(feature, ofType);

        if (gene == null)
            return feature;

        getDescendants(gene, ofType, false);
        Feature transcript = getTranscript(feature, gene, true);

        // if there are no transcripts, must use the gene
        if (transcript == null)
            return gene;

        // if the transcript is alone, use the gene
        List<Feature> transcripts = getTranscripts(gene);
        if (transcripts.size() == 1)
            return gene;

        return transcript;
    }

    public Feature getAncestorGene(Feature currentFeature, List<Cvterm> ofType) {

        if (currentFeature.type.name.equals("gene") || currentFeature.type.name.equals("pseudogene"))
            return currentFeature;

        List<Feature> parents = featureMapper.parents(currentFeature, ofType);

        for (Feature parent : parents) {
            // parents are objects
            Feature root = getAncestorGene(parent, ofType);

            if (root != null) {
                return root;
            }

        }

        return null;
    }

    public void getDescendants(Feature feature, List<Cvterm> ofType, boolean includeSummaries) {

        feature.children = featureMapper.children(feature, ofType);
        if (includeSummaries)
            summarise(feature);

        if (feature.children == null)
            return;

        for (Feature child : feature.children) {
            // children are subjects
            getDescendants(child, ofType, includeSummaries);
        }

    }

    public List<Feature> getTranscripts(Feature gene) {
        List<Feature> transcripts = new ArrayList<Feature>();
        for (Feature child : gene.children) {
            if ( transcriptTypes.contains(child.type.name)) {
                transcripts.add(child);
            }
        }
        return transcripts;
    }

    public Feature getTranscript(final Feature requested, final Feature hierarchyFeature, final boolean ignoreObsoletes) {

        Feature firstTranscript = null;

        logger.info(String.format(" -> %s (%s) ", hierarchyFeature.uniqueName, hierarchyFeature.type.name));

        Collections.sort(hierarchyFeature.children, new FeatureUniqueNameSorter());

        for (Feature child : hierarchyFeature.children) {
            if (child.type.name.equals("mRNA")) {

                if (ignoreObsoletes && child.isObsolete)
                    continue;

                logger.info(String.format(" --> %s (%s) ", child.uniqueName, child.type.name));

                if (requested.uniqueName.equals(hierarchyFeature.uniqueName) || requested.uniqueName.equals(child.uniqueName)) {
                    logger.info("1");
                    return child;
                }

                if (firstTranscript == null)
                    firstTranscript = child;

                for (Feature grandChild : child.children) {

                    if (ignoreObsoletes && grandChild.isObsolete)
                        continue;

                    logger.info(String.format(" ---> %s (%s) ", grandChild.uniqueName, grandChild.type.name));
                    String grandChildType = grandChild.type.name;
                    if (requested.type.name.equals(grandChildType) && requested.uniqueName.equals(grandChild.uniqueName)) {
                        if (grandChildType.equals("polypeptide")) {
                            logger.info("2");
                            return child;
                        } else if (grandChildType.equals("exon")) {
                            logger.info("3");
                            return child;
                        }
                    }
                }
            }
        }

        logger.info(" --->! ");

        return firstTranscript;
    }

    public List<Feature> getExons(Feature transcript) {
        List<Feature> exons = new ArrayList<Feature>();
        for (Feature potential_exon : transcript.children) {
            if (potential_exon.type.name.equals("exon")) {
                logger.info("exon " + potential_exon.uniqueName);
                potential_exon.coordinates = featureMapper.coordinates(potential_exon);
                exons.add(potential_exon);
            }
        }
        return exons;
    }

    public String getExonSequence(List<Feature> exons) {
        StringBuffer sequence = new StringBuffer();
        for (Feature exon : exons) {
            Coordinates coordinates = exon.coordinates.get(0);
            String dnaString = regionsMapper.sequenceTrimmed(coordinates.region, coordinates.fmin, coordinates.fmax).dna;

            // if (coordinates.strand < 0)
            //                 dnaString = Bases.reverseComplement(dnaString);

            logger.info(String.format(">%s (%s) %d-%d \n%s", exon.uniqueName, coordinates.region, coordinates.fmin, coordinates.fmax, dnaString));

            sequence.append(dnaString);
        }
        return sequence.toString();
    }

    class FeatureLocationSorter implements Comparator<Feature> {

        @Override
        public int compare(Feature feature1, Feature feature2) {

            Coordinates coordinates1 = feature1.coordinates.get(0);
            assert(coordinates1 != null);

            Coordinates coordinates2 = feature2.coordinates.get(0);
            assert(coordinates2 != null);

            if (coordinates1.fmin > coordinates2.fmin)
                return 1;
            if (coordinates1.fmin < coordinates2.fmin)
                return -1;
            return 0;
        }

    }

    class FeatureUniqueNameSorter implements Comparator<Feature> {
        @Override
        public int compare(Feature feature1, Feature feature2) {
            return feature1.uniqueName.compareTo(feature2.uniqueName);
        }
    }

    public List<Property> getPolypeptideProperties(Feature feature, Feature hierarchyFeature) throws NumberFormatException, BioException, TranslationException {
        Feature transcript = getTranscript(feature, hierarchyFeature, false);
        if (transcript == null)
            throw new RuntimeException("Could not find transcript");

        //featureMapper.properties(transcript);


        List<Feature> exons = getExons(transcript);

        Collections.sort(exons, new FeatureLocationSorter());

        Feature firstExon = exons.get(0);
        if (firstExon == null)
            throw new RuntimeException("Could not find exon");

        String exonSequence = getExonSequence(exons);
        logger.info(String.format("transcript : %s, exonSequence : %s", transcript.uniqueName, exonSequence));

        Coordinates firstExonCoordinate = firstExon.coordinates.get(0);

        boolean isFwd = (firstExonCoordinate.strand > 0) ? true : false;
        if (! isFwd) {
            logger.info("reversing");
            exonSequence = Bases.reverseComplement(exonSequence);
            logger.info(String.format("reversed transcript : %s, exonSequence : %s", transcript.uniqueName, exonSequence));
        }

        String phase = firstExonCoordinate.phase;
        if (phase == null)
            phase = "0";

        Organism o = organismsMapper.getByID(feature.organism_id);

        Property translationTableProp = organismsMapper.getOrganismProp(o, "genedb_misc", "translationTable");

        int translationTable = 1;
        if (translationTableProp != null && translationTableProp.value != null) {
            translationTable = Integer.parseInt(translationTableProp.value);
        }

        return getPolypeptideProperties(exonSequence, Integer.parseInt(phase), translationTable);
    }

    public static List<Property> getPolypeptideProperties(String dnaString, int phase, int translationTable) throws BioException, TranslationException {

        List<Property> properties = new ArrayList<Property>();

        // Coordinates coordinates = feature.coordinates.get(0);
        // int phase = (coordinates.phase != null) ?
        // Integer.parseInt(coordinates.phase) : 0;

        // String dnaString = regionsMapper.sequenceTrimmed(coordinates.region,
        // coordinates.fmin, coordinates.fmax).dna;

        String residuesString = translate(translationTable, dnaString, phase, false);

        logger.info(residuesString);

        // strip stop codons (to avoid hard failures)
        residuesString = residuesString.replaceAll("[*#+]","");

        SymbolTokenization proteinTokenization = ProteinTools.getTAlphabet().getTokenization("token");
        SymbolList residuesSymbolList = null;
        residuesSymbolList = new SimpleSymbolList(proteinTokenization, residuesString);

        logger.info(residuesSymbolList);

        if (residuesSymbolList.length() == 0) {
            throw new RuntimeException(String.format("Polypeptide feature has zero-length residues"));
            // return properties;
        }

        // if the sequence ends with a termination symbol (*), we need to
        // remove it
        if (residuesSymbolList.symbolAt(residuesSymbolList.length()) == ProteinTools.ter()) {
            if (residuesSymbolList.length() == 1) {
                throw new RuntimeException(String.format("Polypeptide feature only has termination symbol"));
                // return properties;
            }
            residuesSymbolList = residuesSymbolList.subList(1, residuesSymbolList.length() - 1);
        }


        Property aminoProp = new Property();
        aminoProp.name = "Amino Acids";
        aminoProp.value = String.valueOf(residuesSymbolList.length());
        properties.add(aminoProp);

        DecimalFormat df = new DecimalFormat("#.0");
        double isoElectricPoint = new IsoelectricPointCalc().getPI(residuesSymbolList, false, false);

        Property isoProp = new Property();
        isoProp.name = "Isoelectric Point";
        isoProp.value = "pH " + df.format(isoElectricPoint);
        properties.add(isoProp);

        double mass2 = calculateMass(residuesSymbolList);

        Property massProp = new Property();
        massProp.name = "Mass";
        massProp.value = df.format(mass2 / 1000) + " kDa";
        properties.add(massProp);

        double charge = calculateCharge(residuesString);

        Property chargeProp = new Property();
        chargeProp.name = "Charge";
        chargeProp.value = df.format(charge);
        properties.add(chargeProp);

        return properties;

    }

    public static String translate(int translationTableId, String dnaSequence, int phase, boolean stopCodonTranslatedAsSelenocysteine) throws TranslationException {
        return Translator.getTranslator(translationTableId).translate(dnaSequence, phase, stopCodonTranslatedAsSelenocysteine);
    }

    public static double calculateMass(SymbolList residuesSymbolList) throws IllegalSymbolException {

        double massInDaltons = MassCalc.getMass(residuesSymbolList, SymbolPropertyTable.AVG_MASS, true);
        return massInDaltons;

    }

    /**
     * Calculate the charge of a polypeptide.
     *
     * @param residues
     *            a string representing the polypeptide residues, using the
     *            single-character code
     * @return the charge of this polypeptide (in what units?)
     */
    public static double calculateCharge(String residues) {
        double charge = 0.0;
        for (char aminoAcid : residues.toCharArray()) {
            switch (aminoAcid) {
                case 'B':
                case 'Z':
                    charge += -0.5;
                    break;
                case 'D':
                case 'E':
                    charge += -1.0;
                    break;
                case 'H':
                    charge += 0.5;
                    break;
                case 'K':
                case 'R':
                    charge += 1.0;
                    break;
                /*
                 * EMBOSS seems to think that 'O' (presumably Pyrrolysine) also
                 * contributes +1 to the charge. According to Wikipedia, this
                 * obscure amino acid is found only in methanogenic archaea, so
                 * it's unlikely to trouble us soon. Still, it can't hurt:
                 */
                case 'O':
                    charge += 1.0;
                    break;
            }
        }
        return charge;
    }

}