CarbbankRecord.java example

Explorer

eurocarbdb-master
- application
- core-api
  - src
    - org
      - eurocarbdb
        dataaccess
        BasicEurocarbObject.java
        Contributed.java
        EntityDoesntExistException.java
        EntityManager.java
        Eurocarb.java
        EurocarbObject.java
        HibernateEntityManager.java
        HibernateXMLEntityManager.java
        core
        Author.java
        BiologicalContext.java
        BiologicalContextAssociation.java
        BiologicalContextContributor.java
        BiologicalContextToGlycoconjugate.java
        Composition.java
        CompositionId.java
        Contributor.java
        Disease.java
        DiseaseContext.java
        DiseaseRelations.java
        DiseaseSynonym.java
        Evidence.java
        EvidenceContext.java
        Experiment.java
        ExperimentContext.java
        ExperimentStep.java
        ExternalDatabaseReference.java
        GlycanSequence.java
        GlycanSequenceContext.java
        GlycanSequenceEvidence.java
        GlycanSequenceReference.java
        GlycanSequenceRelations.java
        Glycoconjugate.java
        Glycoprotein.java
        Journal.java
        JournalReference.java
        Perturbation.java
        PerturbationContext.java
        PerturbationRelations.java
        Reference.java
        ReferencedEvidence.java
        Taxonomy.java
        TaxonomyProteomeSkRanked.java
        TaxonomyRelations.java
        TaxonomySubtype.java
        TaxonomySynonym.java
        Technique.java
        TissueTaxonomy.java
        TissueTaxonomyRelations.java
        TissueTaxonomySynonym.java
        ref
        BcsdbReference.java
        CarbbankReference.java
        CfgReference.java
        GenericReference.java
        GlyaffinityReference.java
        GlycobaseDublinReference.java
        GlycobaseLilleReference.java
        GlycosciencesDeReference.java
        KeggReference.java
        seq
        GlycanMonosaccharide.java
        GlycanResidue.java
        GlycanSubstituent.java
        SubstructureQuery.java
        SubstructureQueryCriterion.java
        SubstructureQueryGenerator.java
        SubstructureQueryGeneratorImpl1.java
        SubstructureQueryGeneratorImpl2.java
        SubstructureQueryResult.java
        exception
        DataAccessException.java
        DataException.java
        EurocarbException.java
        InvalidAssociationException.java
        InvalidPropertyException.java
        UnauthorisedAccessException.java
        hibernate
        GenericEnumUserType.java
        HibernateUtils.java
        SugarSequenceUserType.java
        hplc
        Autogu.java
        Column.java
        Content.java
        ContributorIndustry.java
        Detector.java
        Digest.java
        DigestProfile.java
        DigestSingle.java
        DisRefine.java
        Disappeared.java
        Enzyme.java
        Glycan.java
        GlycanDisease.java
        GlycanPerturbation.java
        GlycanSequenceTemp.java
        GlycanSourceLink.java
        GlycanTaxonomy.java
        GlycanTissue.java
        HplcPeaksAnnotated.java
        HplcPeaksIntegrated.java
        Instrument.java
        IntegrationMethod.java
        Mandetector.java
        MethodRun.java
        Multipleglycoct.java
        MultistructuresGlycoct.java
        ParentProfile.java
        PictorialRepresentation.java
        Profile.java
        ProfileData.java
        Ref.java
        RefLink.java
        RefLinkId.java
        RefTaxLink.java
        StructuresGlycoct.java
        indexes
        Index.java
        IndexByContributedDate.java
        IndexByContributorName.java
        IndexByMostEvidence.java
        IndexByResidueCount.java
        Indexable.java
        ms
        Acquisition.java
        AcquisitionToPersubstitution.java
        Analyser.java
        AnalyserParameter.java
        Annotation.java
        DataProcessing.java
        Device.java
        DeviceSettings.java
        EsiParameter.java
        Fragmentation.java
        FragmentationParameter.java
        FragmentationType.java
        Ion.java
        IonComposition.java
        IontrapParameter.java
        Laser.java
        LaserParameter.java
        MaldiMatrix.java
        MaldiParameter.java
        Manufacturer.java
        MassDetector.java
        MassDetectorParameter.java
        MethodOfCombination.java
        MsMsRelationship.java
        PeakAnnotated.java
        PeakAnnotatedToIon.java
        PeakAnnotatedToSmallMolecule.java
        PeakAnnotation.java
        PeakAnnotationComparator.java
        PeakLabeled.java
        PeakLabeledComparator.java
        PeakList.java
        PeakListToDataProcessing.java
        PeakProcessing.java
        Persubstitution.java
        ReducingEnd.java
        Scan.java
        ScanImage.java
        ScanToDataProcessing.java
        SmallMolecule.java
        SmallMoleculeComposition.java
        Software.java
        SoftwareType.java
        Source.java
        SourceParameter.java
        SumAverageRelationship.java
        TandemScanMethod.java
        TofParameter.java
        nmr
        NmrEvidence.java
        sugar
        Anomer.java
        Attachable.java
        Basetype.java
        Basetypes.java
        BasicLinkage.java
        BasicMolecule.java
        CarbohydrateChemistry.java
        CommonBasetype.java
        CommonSubstituent.java
        Composition.java
        CustomBasetype.java
        Element.java
        GlycosidicLinkage.java
        Ion.java
        Linkage.java
        LinkageType.java
        Massclass.java
        Modification.java
        ModificationType.java
        Molecule.java
        Monosaccharide.java
        PositionNotOccupiedException.java
        PositionOccupiedException.java
        PotentiallyIndefinite.java
        Residue.java
        ResidueFormat.java
        RingConformation.java
        SequenceFormat.java
        SequenceFormatException.java
        SimpleSubstituent.java
        StereoConfig.java
        Substituent.java
        Substituents.java
        Sugar.java
        SugarAnnotation.java
        SugarChemistryException.java
        SugarException.java
        SugarRepeat.java
        SugarRepeatAnnotation.java
        SugarSequence.java
        SugarVisitor.java
        Superclass.java
        impl
        ComplexMonosaccharide.java
        GenericResidue.java
        LinkageDisjunction.java
        ResidueDisjunction.java
        SimpleMonosaccharide.java
        seq
        CarbbankSequenceFormat.java
        GlycoctSequenceFormat.java
        GlycoctXmlSequenceFormat.java
        IupacSequenceFormat.java
        MultiSequenceFormat.java
        SequenceIterator.java
        grammar
        AstTranslatorVisitor.java
        GlycoctLexer.java
        GlycoctParser.java
        GlycoctParserAdaptor.java
        GlycoctParserTokenTypes.java
        GlycoctTest.java
        IupacLexer.java
        IupacParser.java
        IupacParserAdaptor.java
        IupacParserTokenTypes.java
        IupacTest.java
        LinkageToken.java
        MonosacResidueToken.java
        ParserAdaptor.java
        RepeatResidueToken.java
        ResidueToken.java
        SequenceTestHarness.java
        util
        BitSet.java
        CompositionVisitor.java
        FTP_Client.java
        JavaUtils.java
        Logger.java
        ProgressWatchable.java
        StringUtils.java
        Version.java
        Visitor.java
        XmlSerialiser.java
        carbbank
        CarbbankManager.java
        CarbbankParser.java
        CarbbankRecord.java
        CarbbankTaxonomy.java
        graph
        BreadthFirstGraphVisitor.java
        DepthFirstGraphVisitor.java
        Edge.java
        Graph.java
        GraphIterator.java
        GraphVisitor.java
        Graphable.java
        Graphs.java
        Path.java
        Tree.java
        Vertex.java
        mesh
        MeshReference.java
        ncbi
        NcbiTaxonomy.java
    - test

/*
*   EuroCarbDB, a framework for carbohydrate bioinformatics
*
*   Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as
*   indicated by the @author tags or express copyright attribution
*   statements applied by the authors.  
*
*   This copyrighted material is made available to anyone wishing to use, modify,
*   copy, or redistribute it subject to the terms and conditions of the GNU
*   Lesser General Public License, as published by the Free Software Foundation.
*   A copy of this license accompanies this distribution in the file LICENSE.txt.
*
*   This program is distributed in the hope that it will be useful,
*   but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*   or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
*   for more details.
*
*   Last commit: $Rev: 1932 $ by $Author: glycoslave $ on $Date:: 2010-08-05 #$  
*/

package org.eurocarbdb.util.carbbank;

import java.util.Map;
import java.util.List;
import java.util.Collections;

import org.apache.log4j.Logger;

import org.eurocarbdb.dataaccess.core.*;
import org.eurocarbdb.sugar.SugarSequence;
import org.eurocarbdb.sugar.SequenceFormat;
import org.eurocarbdb.sugar.seq.GlycoctSequenceFormat;

import static org.eurocarbdb.util.StringUtils.*;
import static org.eurocarbdb.util.carbbank.CarbbankParser.*;

import static org.eurocarbdb.dataaccess.Eurocarb.getEntityManager;


/**
*<p>
*   Represents an individual Carbbank structural record.
*</p>
*<p>
*   A typical Carbbank record looks like this:
*</p>
*<pre>   
    ; start of record
    ; db=ccsd29
    ; Record #=14
    CC: CCSD:43759
    AU: Albersheim P; Darvill A; Augur C; Cheong JJ; Eberhard S; Hahn MG;
    Marfa V; Mohnen D; O'Neill MA; Spiro MD; York WS
    TI: Oligosaccharins: Oligosaccharide regulatory molecules
    CT: Acc Chem Res (1992) 25: 77-83
    BS: (GS) Hansenula holstii
    BS: (GS) Pichia holstii
    SC: 14
    TN: XXFG
    MT: xyloglucan
    SB: Westra B
    DA: 06-12-1995
    FC: 66bf2505
    SI: CBank:17702
    ----------------
    structure:
    
     a-L-Fucp-(1-2)-b-D-Galp-(1-2)-a-D-Xylp-(1-6)+
                                                 |
                          a-D-Xylp-(1-6)+   b-D-Glcp-(1-4)-D-Glc
                                        |        |
                                   b-D-Glcp-(1-4)+
                                        |
           a-D-Xylp-(1-6)-b-D-Glcp-(1-4)+
    ================end of record
*</pre>
*<p>
*   This class provides an interface by which to extract data from
*   individual records and return this information in terms of Eurocarb
*   data objects.  
*</p>
*
*   @author mjh [glycoslave@gmail.com]
*/
public class CarbbankRecord 
{
    /** Logging handle. */
    static final Logger log = Logger.getLogger( CarbbankRecord.class );
    
    /** Entire raw carbbank record */
    private final String rawEntry;
    
    /** Hash of carbbank keys => values. */
    private final Map<String,String> data;
    
    /** Line number on which this record started */
    final int firstLine;
    
    /** Line number on which this record ends */
    final int lastLine;
    
    /** Carbbank-given id, parsed from the "CC" field of the record. */
    private final int id;
    
    /** 
    *   The glycan sequence encoded within this Carbbank record; 
    *   lazily populated by {@link #getGlycanSequence}. 
    */
    private GlycanSequence glycanSequence;
    
    /** 
    *   The literature reference given for this Carbbank record; 
    *   lazily populated by {@link #getJournalReference}. 
    */
    private JournalReference journalReference;

    /** 
    *   The entry reference created to present the link between 
    *   eurocarb & this carbbank record; lazily populated by 
    *   {@link #getEntryReference}.  
    */
    private Reference entryReference;
    
    /** 
    *   List of contexts for this carbbank record; lazily populated
    *   by {@link #getContexts}. 
    */
    private List<BiologicalContext> contexts; 
    
    
    /** 
    *   Creates a new carbbank record from the passed data Map. 
    *   The data map is expected to be keyed by Carbbank 2-letter
    *   descriptor, and is stored by reference (not copied).
    *   Argument sanity is checked with <tt>assert</tt> only, so
    *   these checks are inactive unless assertions are enabled.
    *
    *   @param entry        The entire raw text of this carbbank record
    *   @param data         Hash of carbbank 2-letter keys to values;
    *                       must contain the key "CC"
    *   @param from_line    First line of this carbbank record 
    *   @param to_line      Last line of this record
    */
    CarbbankRecord( String entry, Map<String,String> data, int from_line, int to_line )
    {
        assert data != null;
        assert data.size() > 0;
        
        this.data = data;  
        this.rawEntry = entry;
        
        assert from_line > 0;
        assert from_line < to_line;
        this.firstLine = from_line;
        this.lastLine = to_line;
        
        //  determine carbbank id
        assert data.containsKey("CC");
        String idstring = data.get("CC");
        this.id = parseCarbbankId( idstring );  
    }
    
    
    /** 
    *   Gets the Carbbank-supplied id.
    */
    public int getCarbbankId()
    {
        return id;
    }
    
    
    /** 
    *   Returns the (potentially multi-line) Carbbank sequence for 
    *   this Carbbank record, that is, the value stored under the 
    *   "sequence" key of the record data.
    */
    public String getCarbbankSequence()
    {
        assert data != null;
        assert data.containsKey("sequence");
        return data.get("sequence");
    }
    
    
    /** 
    *   Returns the list of {@link BiologicalContext biological contexts}
    *   corresponding to the list of biological sources found within 
    *   this carbbank record. This list will be empty if the record 
    *   has no "BS" (biological source) field. The list is computed
    *   on first call and remembered thereafter.
    */
    public List<BiologicalContext> getContexts()
    {
        if ( contexts == null ) __lookup_contexts();
        return contexts;
    }
    
    
    /*/* 
    *   Returns any/all evidence given in this entry. 
    *   
    *   <strong>Not yet implemented; returns null</strong> 
    * /
    public List<Evidence> getEvidence()
    {
        return null;
    }
    */
    
    /**
    *   Returns the sequence contained within this carbbank record
    *   in {@link GlycoctSequenceFormat GlycoCT format}, or null if
    *   the carbbank sequence could not be translated.
    */
    public SugarSequence getGlycoctSequence()
    throws Exception
    {
        String carbbank_seq = this.getCarbbankSequence();
        
        assert carbbank_seq != null;
        assert carbbank_seq.length() > 0;
        
        String seq = CarbbankParser.translateCarbbankSequence( carbbank_seq ); 
        
        //  a null return value means an unparseable sequence
        if ( seq == null )
        {
            log.debug("carbbank sequence was deemed uparseable, returning null");
            return null;
        }
        
        return new SugarSequence( seq, SequenceFormat.Glycoct );
    }
    
    
    /**
    *   Returns a {@link Reference} object representing this Carbbank
    *   entry itself, specifically, its carbbank id. The reference is
    *   created and stored via the entity manager on first call, and 
    *   the same object is returned on subsequent calls.
    */
    public Reference getEntryReference()
    {
        if ( entryReference != null )
            return entryReference;
        
        Reference r = new Reference();
        
        r.setReferenceType( Reference.Type.DatabaseEntry.toString() );
        r.setExternalReferenceId( "" + this.getCarbbankId() );
        r.setExternalReferenceName( "Carbbank" );
        
        //  mjh: temporary addition for DB debugging cause lots of info!
        r.setReferenceComments( join( data, ": ", "<br/>" ) );

        // store the object in the session to avoid conflicts
        if ( r.getReferenceId() <= 0 )
            getEntityManager().store( r );

        //  only remember the reference once it has been fully set up
        this.entryReference = r;        
        return entryReference;
    }
    
    
    /** 
    *   Returns a Eurocarb {@link JournalReference} object encapsulating
    *   the literature reference given for this Carbbank entry. Returns 
    *   null if any exception is encountered during parse; in that case
    *   nothing is remembered, so a later call will retry from scratch.
    *
    *   Carbbank fields used:<br/>
    *<ul>   
    *   <li>CT - citation, the journal/vol/page reference</li>
    *   <li>TI - title of the paper</li>
    *   <li>AU - author list, semicolon delimited</li>
    *</ul>
    *<p>
    *   Citation is of typical citation form -- 
    *   <tt>[journal abbrev] ([year]) [volume]: [from_page]-[to_page]</tt>, eg: 
    *<pre>
    *       Acc Chem Res (1992) 25: 77-83
    *</pre>
    *   Citation is mandatory.
    *</p>
    *<p>
    *   Author list is a semicolon-delimited list of authors, eg:<br/>
    *<pre>
    *       Albersheim P; Darvill A; Augur C; Cheong JJ; Eberhard S; Hahn MG;
    *       Marfa V; Mohnen D; O'Neill MA; Spiro MD; York WS
    *</pre>
    *</p>
    *<p>
    *   Title & author list are taken as given. The submitting author 
    *   (SB field) is currently not used.
    *</p>
    *<p>
    *   Results are cached in {@link CarbbankParser#referenceCache}, keyed 
    *   by citation + author list, so records that share a citation 
    *   share the same {@link JournalReference} object.
    *</p>
    */
    public JournalReference getJournalReference()
    {
        if ( journalReference != null ) 
            return journalReference;
        
        assert data != null;
        assert data.size() > 0;
        
        String citation   = data.get("CT");
        String title      = data.get("TI");
        String authorlist = data.get("AU");
        
        assert citation != null;
        String idstring = citation + authorlist;
        
        //  check if this journalReference exists in the cache
        JournalReference cached = CarbbankParser.referenceCache.get( idstring );
        if ( cached != null )
        {
            log.debug("journalReference is cached");
            journalReference = cached;
            return journalReference;
        }
            
        if ( log.isDebugEnabled() )
            log.debug( "parsing citation string '" + citation + "'" );
        
        try
        {
            //  split "<journal> (<year>) <vol>: <p1>-<p2>" at the '(' that 
            //  precedes the year; the lookahead keeps the year digits 
            String[] citation_pieces = citation.split("\\s*\\((?=\\d)");
            assert citation_pieces.length == 2;
            
            //  remainder is year, volume, first page, last page, 
            //  separated by runs of non-word characters
            String[] pieces = citation_pieces[1].split("\\W+"); 
            
            String journal_name = citation_pieces[0];
            
            int    journal_year = pieces.length > 0
                                ? Integer.parseInt( pieces[0] )
                                : -1;
                                
            String journal_vol_ = ( pieces.length > 1 && pieces[1].length() > 0 )
                                ? pieces[1]
                                : null;
            
            String journal_p1   = pieces.length > 2
                                ? pieces[2]
                                : null;
            
            String journal_p2   = pieces.length > 3
                                ? pieces[3] 
                                : null;
                     
            if ( log.isDebugEnabled() )
            {
                log.debug( "parsed JournalReference => name: '" 
                         + journal_name 
                         + "'; year: " 
                         + journal_year 
                         + "'; vol: '" 
                         + journal_vol_
                         + "'; page: " 
                         + journal_p1 
                         + "-" 
                         + journal_p2 
                         );
            }
            
            assert title != null;
            assert journal_year > 1900;
            
            //  missing/unparseable values come back as -1
            int journal_vol = __int( journal_vol_ );
            int page1 = __int( journal_p1 );
            int page2 = __int( journal_p2 );
            
            //  NOTE: the reference is built in a local variable and only 
            //  assigned to the field once everything has succeeded; 
            //  otherwise a failure part-way through would leave a 
            //  half-initialised reference that subsequent calls would 
            //  return as if it were valid.
            JournalReference jref = JournalReference.lookupByCitation( 
                                        journal_name, journal_year, journal_vol, page1 ); 
            
            if ( jref == null ) 
            {
                jref = new JournalReference();
            
                jref.setTitle( title );
                jref.setAuthors( authorlist );
                jref.setFirstPage( page1 );
                jref.setLastPage( page2 );
                jref.setJournalVolume( journal_vol );
                jref.setPublicationYear( journal_year );
                
                Journal journal = null;
                if ( CarbbankParser.journalCache.containsKey( journal_name ) )
                {
                    journal = CarbbankParser.journalCache.get( journal_name );   
                }
                else 
                {
                    journal = Journal.createOrLookup( journal_name );
                    
                    if ( journal == null )
                    {
                        journal = new Journal();
                        journal.setJournalTitle( journal_name );
                    }
            
                    // store the object in the session to avoid conflicts
                    if ( journal.getJournalId() <= 0 )
                        getEntityManager().store( journal );            
                }
                jref.setJournal( journal );
        
                // store the object in the session to avoid conflicts
                if ( jref.getJournalReferenceId() <= 0 )
                    getEntityManager().store( jref );            

                CarbbankParser.journalCache.put( journal_name, journal );
            }
            
            //  cache it 
            CarbbankParser.referenceCache.put( idstring, jref );
            
            journalReference = jref;
            return journalReference;  
        }
        catch ( Exception e )
        {
            log.warn( "Caught exception parsing Carbbank journal "
                    + "reference, returning null", e );
            return null;
        }
    }
    
    
    /**
    *   Returns the {@link GlycanSequence glycan} encoded by 
    *   this carbbank entry, or null if no GlycoCT sequence could
    *   be obtained for it. NOTE: carbbank structures are not 
    *   necessarily unique, in particular, identical structures 
    *   that are attached to different aglyca are regarded as 
    *   different structures by carbbank. Currently, carbbank
    *   aglyca are discarded by the current 
    *   {@link org.eurocarbdb.sugar.seq.CarbbankSequenceFormat 
    *   carbbank sequence format parser}. Accordingly, multiple
    *   carbbank records may return the same {@link GlycanSequence}.
    *<p>
    *   The sequence is looked up first in the parser's sequence 
    *   cache, then by exact sequence in the data store, and is only 
    *   created anew if neither lookup succeeds. If an exception is 
    *   thrown, nothing is remembered by this record, so a later
    *   call will retry from scratch.
    *</p>
    *
    *   @see org.eurocarbdb.sugar.seq.CarbbankSequenceFormat
    *   @see CarbbankParser#sequenceCache
    */
    public GlycanSequence getGlycanSequence() throws Exception
    {
        if ( glycanSequence != null )
            return glycanSequence;
        
        SugarSequence sseq = this.getGlycoctSequence();
        if ( sseq == null )
        {
            if ( log.isDebugEnabled() )
                log.debug( "couldn't obtain a glycoct sequence for carbbank id " 
                         + this.getCarbbankId() );
            return null;
        }
        
        //  look in cache first
        GlycanSequence gs = CarbbankParser.sequenceCache.get( sseq.toString() );
        if ( gs != null )
        {
            log.debug("returning cached glycan sequence");
            glycanSequence = gs;
            return glycanSequence;
        }
        
        gs = GlycanSequence.lookupByExactSequence( sseq );
        if ( gs == null )
        {
            //  otherwise create a new sequence
            log.debug( "carbbank sequence does not exist in the "
                     + "data store, creating new sequence" );
            gs = new GlycanSequence( sseq );
        }

        // store the object in the session to avoid conflicts
        if ( gs.getGlycanSequenceId() <= 0 )
            getEntityManager().store( gs );
            
        //  cache it before returning
        CarbbankParser.sequenceCache.put( sseq.toString(), gs );
        
        //  only remember the sequence once it has been stored & cached,
        //  so that a failed store is not silently masked on the next call
        glycanSequence = gs;
        return glycanSequence;
    }
    
    
    /** 
    *   Returns all raw data for the current record keyed by 
    *   Carbbank 2-letter field (String keys to String values). 
    *   Map is returned by reference; therefore changes made to 
    *   the Map will be kept (!). 
    */
    public Map getRawData() {  return data;  }
    
    
    /** Returns the entire raw Carbbank entry as String. */
    public String getRawEntry() {  return rawEntry;  }
    
    /**
    *   Returns the raw value of the given field, <em>sans</em> parsing.
    *   Returns null if field doesn't exist.
    */
    public String getRawField( String field ) {  return data.get( field );  }
    
    
    /** 
    *   Returns a short description of this record, comprising its 
    *   carbbank id and the first & last line numbers it spans. 
    */
    @Override
    public String toString()
    {
        return "carbbank record " 
             + id 
             + ": lines " 
             + firstLine 
             + "-" 
             + lastLine
             ;   
    }
    
    //~~~~~ PRIVATE METHODS ~~~~~
    
    /** 
    *   Extracts biological source information from this carbbank
    *   record and looks up biological context information for
    *   any/all biological sources. Sets {@link #contexts} to an 
    *   empty list if this record has no "BS" field.
    */
    private void __lookup_contexts()
    {
        assert contexts == null;
        if ( data.containsKey("BS") )
        {
            String sourcelist = data.get("BS");
            this.contexts = parseBiologicalSource( sourcelist );            
        }
        else
        {
            this.contexts = Collections.emptyList();   
        }
    }

    
    /** 
    *   Converts a string to an int, stripping non-digit characters 
    *   as required (note that this also discards any sign when 
    *   stripping is needed). 
    *
    *   @param s  the string to convert, may be null
    *   @return   the parsed value, or -1 if the string is null, empty, 
    *             contains no digits, or holds a number too large for 
    *             an int
    */
    private static int __int( final String s )
    {
        if ( s == null || s.length() == 0 ) 
            return -1;
        
        try {  return Integer.parseInt( s );  }
        catch ( NumberFormatException if_string_has_letters )
        {
            String number_only = s.replaceAll("\\D", "");
            
            //  if nothing was stripped then the string was already all 
            //  digits and failed to parse anyway (ie: out of int range);
            //  retrying would recurse forever on the same input
            if ( number_only.length() == s.length() )
                return -1;
            
            return __int( number_only );
        }
    }

    
} // end class