CarbbankParser.java example

Explorer

eurocarbdb-master
- application
- core-api
  - src
    - org
      - eurocarbdb
        dataaccess
        BasicEurocarbObject.java
        Contributed.java
        EntityDoesntExistException.java
        EntityManager.java
        Eurocarb.java
        EurocarbObject.java
        HibernateEntityManager.java
        HibernateXMLEntityManager.java
        core
        Author.java
        BiologicalContext.java
        BiologicalContextAssociation.java
        BiologicalContextContributor.java
        BiologicalContextToGlycoconjugate.java
        Composition.java
        CompositionId.java
        Contributor.java
        Disease.java
        DiseaseContext.java
        DiseaseRelations.java
        DiseaseSynonym.java
        Evidence.java
        EvidenceContext.java
        Experiment.java
        ExperimentContext.java
        ExperimentStep.java
        ExternalDatabaseReference.java
        GlycanSequence.java
        GlycanSequenceContext.java
        GlycanSequenceEvidence.java
        GlycanSequenceReference.java
        GlycanSequenceRelations.java
        Glycoconjugate.java
        Glycoprotein.java
        Journal.java
        JournalReference.java
        Perturbation.java
        PerturbationContext.java
        PerturbationRelations.java
        Reference.java
        ReferencedEvidence.java
        Taxonomy.java
        TaxonomyProteomeSkRanked.java
        TaxonomyRelations.java
        TaxonomySubtype.java
        TaxonomySynonym.java
        Technique.java
        TissueTaxonomy.java
        TissueTaxonomyRelations.java
        TissueTaxonomySynonym.java
        ref
        BcsdbReference.java
        CarbbankReference.java
        CfgReference.java
        GenericReference.java
        GlyaffinityReference.java
        GlycobaseDublinReference.java
        GlycobaseLilleReference.java
        GlycosciencesDeReference.java
        KeggReference.java
        seq
        GlycanMonosaccharide.java
        GlycanResidue.java
        GlycanSubstituent.java
        SubstructureQuery.java
        SubstructureQueryCriterion.java
        SubstructureQueryGenerator.java
        SubstructureQueryGeneratorImpl1.java
        SubstructureQueryGeneratorImpl2.java
        SubstructureQueryResult.java
        exception
        DataAccessException.java
        DataException.java
        EurocarbException.java
        InvalidAssociationException.java
        InvalidPropertyException.java
        UnauthorisedAccessException.java
        hibernate
        GenericEnumUserType.java
        HibernateUtils.java
        SugarSequenceUserType.java
        hplc
        Autogu.java
        Column.java
        Content.java
        ContributorIndustry.java
        Detector.java
        Digest.java
        DigestProfile.java
        DigestSingle.java
        DisRefine.java
        Disappeared.java
        Enzyme.java
        Glycan.java
        GlycanDisease.java
        GlycanPerturbation.java
        GlycanSequenceTemp.java
        GlycanSourceLink.java
        GlycanTaxonomy.java
        GlycanTissue.java
        HplcPeaksAnnotated.java
        HplcPeaksIntegrated.java
        Instrument.java
        IntegrationMethod.java
        Mandetector.java
        MethodRun.java
        Multipleglycoct.java
        MultistructuresGlycoct.java
        ParentProfile.java
        PictorialRepresentation.java
        Profile.java
        ProfileData.java
        Ref.java
        RefLink.java
        RefLinkId.java
        RefTaxLink.java
        StructuresGlycoct.java
        indexes
        Index.java
        IndexByContributedDate.java
        IndexByContributorName.java
        IndexByMostEvidence.java
        IndexByResidueCount.java
        Indexable.java
        ms
        Acquisition.java
        AcquisitionToPersubstitution.java
        Analyser.java
        AnalyserParameter.java
        Annotation.java
        DataProcessing.java
        Device.java
        DeviceSettings.java
        EsiParameter.java
        Fragmentation.java
        FragmentationParameter.java
        FragmentationType.java
        Ion.java
        IonComposition.java
        IontrapParameter.java
        Laser.java
        LaserParameter.java
        MaldiMatrix.java
        MaldiParameter.java
        Manufacturer.java
        MassDetector.java
        MassDetectorParameter.java
        MethodOfCombination.java
        MsMsRelationship.java
        PeakAnnotated.java
        PeakAnnotatedToIon.java
        PeakAnnotatedToSmallMolecule.java
        PeakAnnotation.java
        PeakAnnotationComparator.java
        PeakLabeled.java
        PeakLabeledComparator.java
        PeakList.java
        PeakListToDataProcessing.java
        PeakProcessing.java
        Persubstitution.java
        ReducingEnd.java
        Scan.java
        ScanImage.java
        ScanToDataProcessing.java
        SmallMolecule.java
        SmallMoleculeComposition.java
        Software.java
        SoftwareType.java
        Source.java
        SourceParameter.java
        SumAverageRelationship.java
        TandemScanMethod.java
        TofParameter.java
        nmr
        NmrEvidence.java
        sugar
        Anomer.java
        Attachable.java
        Basetype.java
        Basetypes.java
        BasicLinkage.java
        BasicMolecule.java
        CarbohydrateChemistry.java
        CommonBasetype.java
        CommonSubstituent.java
        Composition.java
        CustomBasetype.java
        Element.java
        GlycosidicLinkage.java
        Ion.java
        Linkage.java
        LinkageType.java
        Massclass.java
        Modification.java
        ModificationType.java
        Molecule.java
        Monosaccharide.java
        PositionNotOccupiedException.java
        PositionOccupiedException.java
        PotentiallyIndefinite.java
        Residue.java
        ResidueFormat.java
        RingConformation.java
        SequenceFormat.java
        SequenceFormatException.java
        SimpleSubstituent.java
        StereoConfig.java
        Substituent.java
        Substituents.java
        Sugar.java
        SugarAnnotation.java
        SugarChemistryException.java
        SugarException.java
        SugarRepeat.java
        SugarRepeatAnnotation.java
        SugarSequence.java
        SugarVisitor.java
        Superclass.java
        impl
        ComplexMonosaccharide.java
        GenericResidue.java
        LinkageDisjunction.java
        ResidueDisjunction.java
        SimpleMonosaccharide.java
        seq
        CarbbankSequenceFormat.java
        GlycoctSequenceFormat.java
        GlycoctXmlSequenceFormat.java
        IupacSequenceFormat.java
        MultiSequenceFormat.java
        SequenceIterator.java
        grammar
        AstTranslatorVisitor.java
        GlycoctLexer.java
        GlycoctParser.java
        GlycoctParserAdaptor.java
        GlycoctParserTokenTypes.java
        GlycoctTest.java
        IupacLexer.java
        IupacParser.java
        IupacParserAdaptor.java
        IupacParserTokenTypes.java
        IupacTest.java
        LinkageToken.java
        MonosacResidueToken.java
        ParserAdaptor.java
        RepeatResidueToken.java
        ResidueToken.java
        SequenceTestHarness.java
        util
        BitSet.java
        CompositionVisitor.java
        FTP_Client.java
        JavaUtils.java
        Logger.java
        ProgressWatchable.java
        StringUtils.java
        Version.java
        Visitor.java
        XmlSerialiser.java
        carbbank
        CarbbankManager.java
        CarbbankParser.java
        CarbbankRecord.java
        CarbbankTaxonomy.java
        graph
        BreadthFirstGraphVisitor.java
        DepthFirstGraphVisitor.java
        Edge.java
        Graph.java
        GraphIterator.java
        GraphVisitor.java
        Graphable.java
        Graphs.java
        Path.java
        Tree.java
        Vertex.java
        mesh
        MeshReference.java
        ncbi
        NcbiTaxonomy.java
    - test

/*
*   EuroCarbDB, a framework for carbohydrate bioinformatics
*
*   Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as
*   indicated by the @author tags or express copyright attribution
*   statements applied by the authors.  
*
*   This copyrighted material is made available to anyone wishing to use, modify,
*   copy, or redistribute it subject to the terms and conditions of the GNU
*   Lesser General Public License, as published by the Free Software Foundation.
*   A copy of this license accompanies this distribution in the file LICENSE.txt.
*
*   This program is distributed in the hope that it will be useful,
*   but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*   or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
*   for more details.
*
*   Last commit: $Rev: 1932 $ by $Author: glycoslave $ on $Date:: 2010-08-05 #$  
*/

package org.eurocarbdb.util.carbbank;

import java.util.Map;
import java.util.List;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.Collections;

import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.IOException;

import org.apache.log4j.Logger;

import org.eurocarbdb.resourcesdb.Config;
import org.eurocarbdb.resourcesdb.io.MonosaccharideConverter;
import org.eurocarbdb.resourcesdb.GlycanNamescheme;
import org.eurocarbdb.MolecularFramework.sugar.Sugar;
import org.eurocarbdb.MolecularFramework.io.SugarImporterException;
import org.eurocarbdb.MolecularFramework.io.GlycoCT.SugarExporterGlycoCT;
import org.eurocarbdb.MolecularFramework.io.GlycoCT.SugarExporterGlycoCTCondensed;
import org.eurocarbdb.MolecularFramework.io.GlycoCT.SugarImporterGlycoCT;
import org.eurocarbdb.MolecularFramework.io.namespace.GlycoVisitorToGlycoCT;
import org.eurocarbdb.MolecularFramework.io.namespace.GlycoVisitorToGlycoCTextendMSDB;
import org.eurocarbdb.MolecularFramework.io.carbbank.SugarImporterCarbbank;
import org.eurocarbdb.MolecularFramework.util.visitor.GlycoVisitorException;
import org.eurocarbdb.MolecularFramework.util.validation.GlycoVisitorSugarGraph;
import org.eurocarbdb.MolecularFramework.util.validation.SugarGraphInformation;

import org.eurocarbdb.util.ProgressWatchable;

import org.eurocarbdb.dataaccess.core.Contributor;
import org.eurocarbdb.dataaccess.core.Taxonomy;
import org.eurocarbdb.dataaccess.core.TissueTaxonomy;
import org.eurocarbdb.dataaccess.core.BiologicalContext;
import org.eurocarbdb.dataaccess.core.DiseaseContext;
import org.eurocarbdb.dataaccess.core.Disease;
import org.eurocarbdb.dataaccess.core.Journal;
import org.eurocarbdb.dataaccess.core.JournalReference;
import org.eurocarbdb.dataaccess.core.GlycanSequence;

import static org.eurocarbdb.util.StringUtils.join;


/**
*<p>
*   Parses <a href="http://biol.lancs.ac.uk/gig/pages/gag/carbbank.htm">Carbbank</a>
*   records from an {@link InputStream}. Individual records
*   are accessed as {@link CarbbankRecord}s.
*</p>
*<p>
*   To use:
*</p>
*<pre>
        CarbbankParser cb = new CarbbankParser( open_stream );
        CarbbankRecord record;
        
        //  a null record means end-of-file
        while ( (record = cb.parse()) != null )
        {
            ...   
        }
*</pre>
*   
*   @see CarbbankRecord
*   @author mjh [glycoslave@gmail.com]
*/
public class CarbbankParser implements ProgressWatchable
{
    //~~~~~~~~~~~~~~~~~~~~~ STATIC FIELDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~
    
    /** Logging handle. */
    static final Logger log = Logger.getLogger( CarbbankParser.class.getName() );
     
    /** Verbosity limitation of debugging information. Setting to 0
    *   allows a basic amount of debugging info through the logging 
    *   system; values higher than 0 provide more and more info. 
    *   NOTE(review): not referenced by any of the methods reviewed 
    *   alongside this declaration -- confirm it is still used. */
    private static final int DEBUG_LEVEL = 2;
    
    /** String used to delimit multiple values for Carbbank fields. 
    *   {@link #parse} joins repeated occurrences of the same 2-letter 
    *   key with it, and {@link #parseBiologicalSource} splits on it. */
    private static final String _DELIM_ = ";;";
    
    //  All caches below are static, so they are shared by every parser 
    //  instance in the JVM. They are plain HashMaps with no synchronisation.
    
    /** Cache of glycoct condensed sequence to Eurocarb glycan sequence object */
    static Map<String,GlycanSequence> 
        sequenceCache = new HashMap<String,GlycanSequence>();
    
    /** Cache of carbbank taxonomy string to Eurocarb taxonomy object.
    *   Filled lazily: preloaded from {@link CarbbankTaxonomy} on first 
    *   lookup, then extended with every successful database lookup. */
    static Map<String,Taxonomy> 
        taxonomyCache = new HashMap<String,Taxonomy>();
    
    /** Cache of carbbank tissue string to Eurocarb tissue_taxonomy */
    static Map<String,TissueTaxonomy> 
        tissueCache = new HashMap<String,TissueTaxonomy>();

    /** Cache of carbbank disease string to Eurocarb disease */
    static Map<String,Disease> 
        diseaseCache = new HashMap<String,Disease>();
        
    /** Cache of carbbank reference string to Eurocarb reference */
    static Map<String,JournalReference> 
        referenceCache = new HashMap<String,JournalReference>();

    /** Cache of carbbank journal name to Eurocarb journal */
    static Map<String,Journal> 
        journalCache = new HashMap<String,Journal>();
        
        
    //  Lookup statistics, one triple per cached entity type. Plain static 
    //  ints, so counts are cumulative across all parser instances.
    
    /** Number of times we hit the taxonomy cache */    
    static int taxonomyCacheHits = 0;
    /** Number of times we hit the database (slow) for a taxonomy */    
    static int taxonomyDatabaseHits = 0;
    /** Total number of taxonomy lookups. Should be == to cache hits + DB hits. */    
    static int taxonomyTotalLookups = 0;
    
    /** Number of times we hit the tissue cache */    
    static int tissueCacheHits = 0;
    /** Number of times we hit the database (slow) for a tissue */    
    static int tissueDatabaseHits = 0;
    /** Total number of tissue lookups. Should be == to cache hits + DB hits. */    
    static int tissueTotalLookups = 0;

    /** Number of times we hit the disease cache */    
    static int diseaseCacheHits = 0;
    /** Number of times we hit the database (slow) for a disease */    
    static int diseaseDatabaseHits = 0;
    /** Total number of disease lookups. Should be == to cache hits + DB hits. */    
    static int diseaseTotalLookups = 0;
    
    
    //  Static initialiser. It currently does nothing: its only content, a 
    //  JVM shutdown hook that printed parsing and taxonomy-cache statistics, 
    //  is commented out below.
    //  NOTE(review): the disabled hook reads recordsParsed and 
    //  sequencesUnparseable, which are declared as instance fields of this 
    //  class, so it would not compile from this static context if it were 
    //  simply uncommented.
    static  
    {         
        /*
        //  report stats on shutdown
        Runtime.getRuntime().addShutdownHook(
            new Thread() 
            {
                public void run() 
                {
                    System.out.println();
                    System.out.println("=== Summary ===");
                    
                    if ( recordsParsed > 0 )
                    {
                        System.out.println("total sequences parsed = " + recordsParsed );
                        System.out.println("unparseable sequences  = " + sequencesUnparseable );
                    }
                    
                    if ( taxonomyTotalLookups > 0 )
                    {
                        System.out.println("taxonomyCacheHits=" + taxonomyCacheHits);
                        System.out.println("taxonomyDatabaseHits=" + taxonomyDatabaseHits);
                        System.out.println("taxonomyTotalLookups=" + taxonomyTotalLookups);
                    }
                }
            }
        );
        */
    }

    
    //~~~~~~~~~~~~~~~~~~~~~ OBJECT FIELDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~
    
    /** Input stream from which to parse records. Null until 
    *   {@link #setInputStream} has been called. */
    private BufferedReader in;
    
    /** If {@link #in} is a FileInputStream, then this is the size 
    *   of the file, in bytes.  */
    private long inputStreamSize = 0; 
    
    /** The number of bytes that have been read from {@link #in}. 
    *   Note: {@link #parse} adds the character length of each line it 
    *   reads, so line terminators are not counted and the value is a 
    *   character count rather than an exact byte count. */
    long bytesRead = 0;
    
    /** When parsing started; ie: the first time the {@link #parse} 
    *   method was called. Zero means parsing has not started. */
    long parsingStartTime = 0;
    
    /** Number of lines parsed, cumulative. */
    int lineCount = 0; 
    
    /** Number of records parsed, cumulative. */
    int recordsParsed = 0;
    
    /** NOTE(review): presumably the number of records whose sequence 
    *   could not be translated; nothing in the methods reviewed alongside 
    *   this declaration updates it -- confirm against callers. */
    int sequencesUnparseable = 0;
    
    //~~~~~~~~~~~~~~~~~~~~~ OBJECT METHODS ~~~~~~~~~~~~~~~~~~~~~~~~~~
  
    /**
    *   Convenience method that simply returns all carbbank records 
    *   that are parseable from the given {@link InputStream}.
    *
    *   Using this method consumes a significant amount of system memory,
    *   be warned.
    *
    *   mjh: commenting out cause memory usage is over the top.
    * /
    public static parseAll( InputStream instream, List<CarbbankRecord> records )
    {
        if ( records == null ) 
            throw new IllegalArgumentException(
                "expected a list for argument 'records', but got null");
            
        CarbbankParser parser = new CarbbankParser();
        parser.setInputStream( instream );
        int count_added = 0;
        
        while ( true )
        {
            CarbbankRecord r = parser.parse();
            if ( r == null ) 
                break;    
            
            records.add( r );
            count_added++;
        }
        
        log.info("Parsed " + count_added + " carbbank record(s)");
        return;
    }
    */
        

    /** 
    *   Sets the input stream from which carbbank entries are read, and
    *   resets the progress state that {@link #getPercentComplete} and 
    *   {@link #getMillisecsElapsed} report on.
    *<p>
    *   If the stream is a {@link FileInputStream}, its size is read from 
    *   the underlying channel so that progress can be reported; for any 
    *   other kind of stream the size is recorded as 0 (unknown).
    *</p>
    *
    *   @param instream  the stream to read records from; must not be null.
    */
    public void setInputStream( InputStream instream )
    {
        assert instream != null;
        this.in = new BufferedReader( new InputStreamReader( instream )); 
        
        //  forget the size of any previously set stream; otherwise a 
        //  non-file stream would report progress against the old file's size
        this.inputStreamSize = 0;
        
        //  if it's a File, we can read its size, which can be used to
        //  provide progress info.
        if ( instream instanceof FileInputStream )
        {
            try 
            {  
                this.inputStreamSize = ((FileInputStream) instream).getChannel().size();  
            }
            catch ( java.io.IOException e ) 
            {  
                //  best effort only: progress reporting stays disabled
                log.warn("While reading file size", e );  
            }
        }
        
        //  reset progress counters
        this.bytesRead = 0;
        this.parsingStartTime = 0;
    }
    
    
    /** Pre-compiled test for a 'key: value' line; used by {@link #parse}. */
    private static final java.util.regex.Pattern KEY_VALUE_LINE
        = java.util.regex.Pattern.compile( "^[A-Z][A-Z]: .*" );
    
    
    /**
    *   Parses &amp; returns 1 carbbank entry from the stream set by 
    *   {@link #setInputStream}. 
    *<p>
    *   Lines are consumed until a line starting with "===" (end of 
    *   record) or end of stream. Within a record:
    *</p>
    *<ul>
    *   <li>lines of 2 characters or fewer, lines starting with "---" 
    *       and lines starting with ";" are skipped;</li>
    *   <li>lines starting with a space are collected as the (possibly 
    *       multi-line) sequence, stored under the key "sequence" when 
    *       the "===" terminator is reached;</li>
    *   <li>lines of form 'XX: data' are stored under their 2-letter key; 
    *       repeated keys are joined with the multi-value delimiter;</li>
    *   <li>any other line is appended to the value of the most recent 
    *       key. Such a line appearing before any key is logged and 
    *       ignored.</li>
    *</ul>
    *
    *   @return the parsed record, or null if the end of the stream was 
    *   reached without reading any field.
    *   @throws IOException  if reading from the stream fails.
    *   @throws IllegalStateException  if no input stream has been set.
    */
    public CarbbankRecord parse() throws IOException
    {
        if ( in == null )
            throw new IllegalStateException(
                "no input stream set: call setInputStream() before parse()" );
        
        if ( parsingStartTime == 0 )
            parsingStartTime = System.currentTimeMillis();
        
        Map<String,String> map = new HashMap<String,String>();
        StringBuilder sequence = new StringBuilder();
        StringBuilder entry = new StringBuilder();
        
        int first_line = lineCount + 1;
        String last_key = null;
        
        while ( true )
        {
            String line = in.readLine();
            if ( line == null ) 
                break;
            
            //  readLine() strips the line terminator, so this is a slight
            //  under-count; it is only used for progress reporting
            bytesRead += line.length();
            entry.append( line ).append('\n');
            lineCount++;
            
            if ( line.length() <= 2 ) 
                continue;
            
            if ( line.startsWith("---") ) 
                // ignore it - it signals the start of a structure sequence
                continue;
            
            if ( line.startsWith(";") ) 
                // ignore it - comment line
                continue;
                
            if ( line.startsWith("===") ) 
            {
                //  signals the end of a record   
                assert sequence.length() > 0;
                map.put("sequence", sequence.toString() );
                break;
            }
            
            if ( line.startsWith(" ") )
            {
                //  signals that this is part of a (potentially multi-line)
                //  carbbank sequence                
                sequence.append( line ).append('\n');
                continue;
            }
            
            if ( KEY_VALUE_LINE.matcher( line ).matches() )
            {
                //  ie: it's a 'key: value' line, the format of which 
                //  is '[A-Z][A-Z]: <data>'. data may be spread over
                //  multiple lines.
                String key = line.substring( 0, 2 );
                String val = line.substring( 3 ).trim();
                
                if ( map.containsKey(key) )
                    map.put( key, map.get(key) + _DELIM_ + val );
                else 
                    map.put( key, val );
                
                last_key = key;
                continue;
            }
            
            //  otherwise it can only be a continuation of a previous
            //  'key: value' line, and so append it to the last key
            //  we observed.
            if ( last_key == null )
            {
                //  no key seen yet in this record: appending would create 
                //  a bogus null-key entry holding "null" + line
                log.warn( "ignoring continuation line with no preceding "
                        + "'key: value' line at line " 
                        + lineCount 
                        + ". line was:\n" 
                        + line 
                        );
                continue;
            }
            
            map.put( last_key, map.get(last_key) + line );
        }
           
        //  returning null tells the client of this parser that 
        //  there are no more sequences to parse. 
        if ( map.size() == 0 ) 
            return null;
        
        this.recordsParsed++;
        
        if ( log.isDebugEnabled() )
            log.debug( "parsed carbbank entry " 
                     + recordsParsed 
                     + ", lines "
                     + first_line
                     + "-"
                     + lineCount
                     );
        
        return new CarbbankRecord( entry.toString(), map, first_line, lineCount );
    }
    
    /** 
    *   The contributor to which data parsed by this class is attributed, 
    *   obtained once from {@link CarbbankManager#getCarbbankContributor} 
    *   when this class is initialised.
    *   NOTE(review): public, static and non-final, so any code can 
    *   reassign it -- confirm that is intended.
    */
    public static Contributor carbBankContributor=CarbbankManager.getCarbbankContributor();
        
    /** 
    *   Turns a carbbank biological source ('BS') field into a list of 
    *   {@link BiologicalContext}s, one per source in the field. Multiple 
    *   sources are expected to be separated by the multi-value delimiter.
    *<p>
    *   For each source a new context is created and attributed to 
    *   {@link #carbBankContributor}. Its taxonomy is then looked up from 
    *   the first of the 'GS', 'CN' or 'domain' fields that is present, its 
    *   tissue from the 'OT' field, and its disease from the 'disease' field.
    *</p>
    *
    *   @param bs_field  the raw 'BS' field text; must not be null.
    *   @return one context per source, or an empty list if the field 
    *   contains no sources.
    */
    public static List<BiologicalContext> parseBiologicalSource( String bs_field )
    {
        String trimmed_field = bs_field.trim();
        if ( trimmed_field.length() == 0 ) 
            return Collections.emptyList();
        
        String[] source_texts = trimmed_field.split( _DELIM_ );
        List<BiologicalContext> contexts 
            = new ArrayList<BiologicalContext>( source_texts.length );
        
        for ( int i = 0; i < source_texts.length; i++ )
        {
            String source_text = source_texts[i];
            
            log.debug("creating new biological context...");

            //  record where this context came from
            String provenance = "This biological context was parsed from "
                              + "Carbbank biological source string '" 
                              + source_text
                              + "'";
            BiologicalContext context = new BiologicalContext();
            context.addContributor( carbBankContributor, provenance );
             
            if ( log.isDebugEnabled() )
                log.debug("looking up source text '" + source_text + "'");
            
            Map<String,String> fields = new HashMap<String,String>();
            __convert_source_to_map( source_text, fields );
            
            //  taxonomy: prefer 'GS', then 'CN', then 'domain'; null if none
            String taxonomy_name 
                = fields.containsKey("GS")     ? fields.get("GS")
                : fields.containsKey("CN")     ? fields.get("CN")
                : fields.containsKey("domain") ? fields.get("domain")
                : null;
            __lookup_taxonomy( taxonomy_name, context );
            
            //  tissue taxonomy: 'OT' field
            __lookup_tissue( fields.get("OT"), context );
            
            //  disease(s): 'disease' field
            __lookup_disease( fields.get("disease"), context );
            
            contexts.add( context );
        }
        
        if ( contexts.isEmpty() ) 
        {
            log.debug("NO biological contexts given in record");
            return Collections.emptyList();
        }
            
        return contexts;
    }
   
    
    /** 
    *   Parses a carbbank id from a carbbank 'CC' field. 
    *   Carbbank IDs are usually of form 'CCSD:NNNN' where N is a 
    *   a positive integer. This method returns only the numeric 
    *   portion. 
    *<p>
    *   Leading and trailing whitespace is ignored. If the text after 
    *   the 'CCSD:' prefix is not a plain integer, every non-digit 
    *   character is stripped and the remaining digits are parsed instead.
    *</p>
    *
    *   @param cc_field  the raw 'CC' field text; may be null.
    *   @return the numeric id, or -1 if the passed id string is null, 
    *   does not start with 'CCSD:', or contains no parseable number.
    */
    public static int parseCarbbankId( String cc_field )
    {
        final String prefix = "CCSD:";
        
        if ( cc_field == null )
            return -1;
        
        String field = cc_field.trim();
        
        //  non-conforming input: report it, rather than blindly chopping 
        //  off the first 5 characters (or throwing on shorter strings)
        if ( ! field.startsWith( prefix ) )
            return -1;
        
        String idstring = field.substring( prefix.length() );
        
        try {  return Integer.parseInt( idstring );  }
        catch ( NumberFormatException string_has_non_numerics_in_it )
        {
            //  second chance: keep only the digits
            try {  return Integer.parseInt( idstring.replaceAll("\\D", "") );  }
            catch ( NumberFormatException id_is_invalid )
            {
                return -1;   
            }
        }
    }
    

    /**
    *   Runs a {@link GlycoVisitorSugarGraph} over the given sugar and 
    *   returns the sugar held by the first sugar graph it reports.
    *   NOTE(review): going by the method name, this is presumably how 
    *   non-sugar (aglycon) parts are dropped -- confirm against the 
    *   MolecularFramework documentation.
    *
    *   @param s  the sugar to process.
    *   @return the sugar of the first reported sugar graph.
    *   @throws Exception  if the visitor reports no sugar graphs, or 
    *   if the visitor itself fails.
    */
    public static Sugar removeAglyca(Sugar s) throws Exception
    {
        GlycoVisitorSugarGraph graph_visitor = new GlycoVisitorSugarGraph();    
        graph_visitor.start(s);

        List<SugarGraphInformation> graphs = graph_visitor.getSugarGraphs();
        
        boolean nothing_found = ( graphs == null ) || graphs.isEmpty();
        if ( nothing_found )
            throw new Exception("SugarGraphInformation null or zero size");
        
        //  exactly one graph is expected; only the first is used
        assert graphs.size() == 1;
        
        SugarGraphInformation first_graph = graphs.get(0);
        return first_graph.getSugar();
    }
    

    /**
    *   Convenience method to translate a carbbank sequence into a 
    *   GlycoCT-condensed sequence string. 
    *<p>
    *   The sequence is parsed with a {@link SugarImporterCarbbank}, 
    *   normalised with a {@link GlycoVisitorToGlycoCTextendMSDB} using 
    *   the CARBBANK name scheme (non-strict, fusion enabled), passed 
    *   through {@link #removeAglyca}, and exported with a 
    *   {@link SugarExporterGlycoCTCondensed}.
    *</p>
    *
    *   @param carbbank_sequence  the sequence text in carbbank format.
    *   @return the exported GlycoCT-condensed sequence.
    *   @throws Exception  if any step of the translation fails; the 
    *   exception is logged at WARN level and then rethrown unchanged.
    */
    public static String translateCarbbankSequence( String carbbank_sequence )
    throws Exception
    {
        try
        {
            long started_at = System.currentTimeMillis();
            
            SugarImporterCarbbank importer = new SugarImporterCarbbank();
            
            MonosaccharideConverter converter 
                = new MonosaccharideConverter( new Config() );
            GlycoVisitorToGlycoCTextendMSDB normaliser 
                = new GlycoVisitorToGlycoCTextendMSDB( converter );
                
            normaliser.setNameScheme(GlycanNamescheme.CARBBANK);    
            normaliser.setUseStrict( false );
            normaliser.setUseFusion(true);
                
            //  carbbank text -> sugar -> normalised sugar
            Sugar parsed = importer.parse( carbbank_sequence );
            normaliser.start( parsed );
            Sugar normalised = normaliser.getNormalizedSugar();
            
            //  fall back to the normalised sugar if nothing comes back
            Sugar stripped = removeAglyca( normalised );
            Sugar to_export = ( stripped != null ) ? stripped : normalised;
        
            //  export as glycoct-condensed
            SugarExporterGlycoCTCondensed exporter = new SugarExporterGlycoCTCondensed();
            exporter.start( to_export );
            String glycoct_sequence = exporter.getHashCode();

            if ( log.isDebugEnabled() )
            {
                long elapsed = System.currentTimeMillis() - started_at;
                log.debug( "translation of carbbank sequence to glycoct took " 
                         + elapsed 
                         + "msec"
                         );
            }
            
            return glycoct_sequence;
        }
        catch ( Exception translation_failure )
        {
            log.warn( translation_failure );  
            throw translation_failure;
        }
    }
    
    
    /** 
    *   If parsing has started, and the InputStream being read from 
    *   was a file, then this returns the percent of the file that 
    *   has been read so far, otherwise returns zero. 
    *<p>
    *   The value is rounded to the nearest whole percent and limited 
    *   to the range 0-100. It is approximate, because the amount read 
    *   is tracked per line of text rather than per byte.
    *</p>
    *
    *   @return the percentage read, from 0 to 100.
    *   @see #setInputStream
    */
    public int getPercentComplete()
    {
        if ( inputStreamSize <= 0 )
            return 0;
        
        //  scale to a percentage in floating point: the plain long 
        //  division bytesRead / inputStreamSize can only yield 0 or 1
        long percent = Math.round( (100.0 * bytesRead) / inputStreamSize );
        
        return (int) Math.max( 0L, Math.min( 100L, percent ) );
    }

    
    /**
    *   Reports how long parsing has been running: the number of 
    *   millisecs between the first call to {@link #parse} on the 
    *   current {@link InputStream} (see {@link #setInputStream}) and now.
    *
    *   @return elapsed millisecs, or zero if parsing has not started.
    */
    public int getMillisecsElapsed()
    {
        if ( parsingStartTime != 0 ) 
        {
            long current_time = System.currentTimeMillis();
            assert current_time >= parsingStartTime;
            
            long elapsed = current_time - parsingStartTime;
            return (int) elapsed;
        }
        
        //  parse() has not been called yet
        return 0;
    }
    
    
    //~~~~~~~~~~~~~~~~~~~~~ PRIVATE METHODS ~~~~~~~~~~~~~~~~~~~~~~~~~
    
    /**
    *   Seeds the taxonomy cache from the {@link CarbbankTaxonomy} 
    *   constants. Every constant with a non-negative NCBI id is looked 
    *   up by that id and cached under its Carbbank name; ids that 
    *   cannot be resolved are cached as the Unknown taxonomy. Constants 
    *   with a negative NCBI id are skipped.
    */
    private static final void __preload_taxonomy_cache()
    {
        log.info("Preloading taxonomy cache...");
        assert taxonomyCache.size() == 0;
        
        for ( CarbbankTaxonomy known : CarbbankTaxonomy.values() )
        {
            if ( known.ncbiId >= 0 ) 
            {
                Taxonomy resolved = Taxonomy.lookupNcbiId( known.ncbiId );
                
                taxonomyCache.put( 
                    known.carbbankName, 
                    ( resolved != null ) ? resolved : Taxonomy.UnknownTaxonomy() 
                );
            }
        }
        
        log.info("Preloaded " + taxonomyCache.size() + " taxonomies");
    }
    
    
    /** 
    *   Parses a Carbbank biological source ('BS' field) string  
    *   into a Map of field tag to value. Carbbank biological
    *   source strings are of form:
    *<pre>
    *   (GS) Phytophthora megasperma, (OT) cell wall, (*) f.sp. glycinea
    *</pre>
    *   The Map filled by this method for this string would be:
    *<pre>
    *   GS => Phytophthora megasperma
    *   OT => cell wall
    *   \* => f.sp. glycinea
    *</pre>
    *   The string is split on parentheses, so tags and values alternate
    *   in the split result; a tag with no following value is dropped 
    *   and a warning is logged. 
    *
    *   @param source  a single biological source string, expected to 
    *   start with '('.
    *   @param map  an empty map, which receives the tag/value pairs.
    */
    private static final void 
    __convert_source_to_map( String source, Map<String,String> map )
    {
        assert map != null;
        assert map.size() == 0;

        //  split on ', (XX)', where XX is a field tag
        String[] bits = source.split( ",?\\s*[\\(\\)]\\s*" );
        
        //  the number of bits after splitting should be ODD,
        //  because the source text should start with a '(',
        //  which means the first element after splitting should be an empty string
        assert source.startsWith("(");
        if ( (bits.length & 1) == 0 )
            log.warn( "Uneven number of fields to values in string '"
                    + source 
                    + "'\nbits parsed were: "
                    + join(", ", bits ) 
                    ); 
        
        //  bits[0] is whatever preceded the first parenthesis, so tags sit 
        //  at the odd indexes and their values at the following even ones. 
        //  Step over whole pairs: advancing one element at a time would 
        //  also store every value as a key mapping to the next tag.
        for ( int i = 1; i + 1 < bits.length; i += 2 )
        {
            String tag = bits[i];
            if ( tag.length() == 0 ) 
                continue;
            
            map.put( tag, bits[i + 1] );
        }        
    }
    
    
    /** 
    *   Look up the given taxonomy name and stick the matching 
    *   {@link Taxonomy} in the given BiologicalContext. 
    *<p>
    *   The static taxonomy cache is consulted first (and preloaded if it
    *   is currently empty); on a cache miss the name is looked up by 
    *   exact name/synonym, then by the looser name/synonym search. When
    *   several matches come back only the first one is used, and it is
    *   added to the cache under the given name.
    *</p>
    *<p>
    *   A null or empty name, or a name without any match, sets the 
    *   context to {@link Taxonomy#UnknownTaxonomy()} and appends an 
    *   explanatory comment for the CarbBank contributor. An exception 
    *   thrown by the lookup is logged and treated as "no match".
    *</p>
    *
    *   @param taxonomy_name  
    *   the taxonomy term to resolve; may be null or empty
    *   @param bc             
    *   the biological context that receives the taxonomy
    */
    private static final void 
    __lookup_taxonomy( String taxonomy_name, BiologicalContext bc )
    {
        if ( taxonomyCache.size() == 0 )
            __preload_taxonomy_cache();
        
        //  an empty term is just as unparseable as a missing one; this 
        //  mirrors the guard used by the tissue and disease lookups, and 
        //  stops an empty string from ever being sent to the search.
        if ( taxonomy_name == null || taxonomy_name.length() == 0 )
        {
            log.debug("no parseable taxonomy term, setting to Unknown taxonomy");
            bc.setTaxonomy( Taxonomy.UnknownTaxonomy() );   
            bc.getBiologicalContextContributor(carbBankContributor.getContributorId()).appendComment("(No parseable taxonomy term)");
            return;   
        }
            
        //  now full-text search for said taxonomic term
        //  - try looking in our static cache first to save a DB lookup
        Taxonomy tax = taxonomyCache.get( taxonomy_name );
        if ( tax != null )
        {
            if ( log.isDebugEnabled() )
                log.debug("taxonomy '" + taxonomy_name + "' is cached");
            
            //  it's in the cache, so no lookup is needed
            bc.setTaxonomy( tax );   
            
            taxonomyCacheHits++;
        }
        else
        {
            //  otherwise look it up; taxonomy_name is known to be 
            //  non-empty here because of the guard above.
            List<Taxonomy> results = null;
            
            try
            {
                //  prefer an exact match, fall back to the looser search
                results = Taxonomy.lookupExactNameOrSynonym( taxonomy_name );
                
                if ( results == null || results.size() == 0 )
                    results = Taxonomy.lookupNameOrSynonym( taxonomy_name );
            }
            catch ( Exception e )
            {
                //  best effort: a failed lookup is handled as "not found"
                log.warn( "Caught exception while hitting the DB", e );
                results = null;
            }
                
            if ( results != null && results.size() > 0 )
            {
                //  1 or more taxonomy matches.
                //
                //  !!! NOTE !!! 
                //  for now, we will simply take the first match,
                //  and discard the others, which upon future revision
                //  might need to be reviewed 
                //  !!! NOTE !!!
                
                if ( log.isDebugEnabled() )
                {
                    log.debug( "found " 
                             + results.size() 
                             + " result(s) for taxonomy '"
                             + taxonomy_name 
                             + "': "
                             + join(", ", results )
                             );
                }
                
                tax = results.get(0);
                bc.setTaxonomy( tax );
                taxonomyCache.put( taxonomy_name, tax );
            }
            else
            {
                //  taxonomy not found 
                bc.setTaxonomy( Taxonomy.UnknownTaxonomy() );   
                bc.getBiologicalContextContributor(carbBankContributor.getContributorId()).appendComment("Taxonomy term '" 
                                + taxonomy_name
                                + "' was not found"
                                );
            }

            taxonomyDatabaseHits++;
        }
        
        taxonomyTotalLookups++;
        return;
    }
    

    /** 
    *   Resolves a tissue name to a {@link TissueTaxonomy} and sets it on 
    *   the given BiologicalContext.
    *<p>
    *   The static tissue cache is tried first; on a miss the name is 
    *   looked up by name/synonym and the first match (if any) is used 
    *   and cached. A null/empty name, or a name without any match, sets 
    *   the context to {@link TissueTaxonomy#UnknownTissue()} and appends 
    *   a comment for the CarbBank contributor. An exception thrown by 
    *   the lookup is logged and handled like "no match".
    *</p>
    */
    private static final void 
    __lookup_tissue( String tissue_name, BiologicalContext bc )
    {
        //  --- case 1: nothing usable to look up ---
        final boolean no_term = ( tissue_name == null || tissue_name.length() == 0 );
        if ( no_term )
        {
            log.debug("no parseable tissue term, setting to Unknown tissue");
            bc.setTissueTaxonomy( TissueTaxonomy.UnknownTissue() );   
            bc.getBiologicalContextContributor(carbBankContributor.getContributorId()).appendComment( "(No parseable tissue term)" );
            return;   
        }
        
        //  --- case 2: term already resolved earlier, reuse it ---
        final TissueTaxonomy cached = tissueCache.get( tissue_name );
        if ( cached != null )
        {
            if ( log.isDebugEnabled() )
                log.debug("tissue '" + tissue_name + "' is cached");
            
            bc.setTissueTaxonomy( cached );   
            
            tissueCacheHits++;
            tissueTotalLookups++;
            return;
        }
        
        //  --- case 3: cache miss, ask the data store ---
        List<TissueTaxonomy> matches;
        try
        {
            matches = TissueTaxonomy.lookupNameOrSynonym( tissue_name );
        }
        catch ( Exception e )
        {
            //  a failed lookup is handled the same as an empty result
            log.warn( "Caught exception while hitting the DB", e );
            matches = null;
        }
        
        if ( matches == null || matches.size() == 0 )
        {
            //  no match at all: fall back to the Unknown tissue and 
            //  leave a note saying which term could not be resolved
            final String note = "Tissue term '" 
                              + tissue_name
                              + "' was not found";
            
            bc.setTissueTaxonomy( TissueTaxonomy.UnknownTissue() );   
            bc.getBiologicalContextContributor(carbBankContributor.getContributorId()).appendComment( note );
        }
        else
        {
            //  !!! NOTE !!! 
            //  only the first match is used, any further matches are 
            //  dropped; this may need to be reviewed in a future revision
            //  !!! NOTE !!!
            if ( log.isDebugEnabled() )
            {
                log.debug( "found " 
                         + matches.size() 
                         + " result(s) for tissue '"
                         + tissue_name 
                         + "': "
                         + join(", ", matches )
                         );
            }
            
            final TissueTaxonomy first = matches.get(0);
            bc.setTissueTaxonomy( first );
            tissueCache.put( tissue_name, first );
        }
        
        tissueDatabaseHits++;
        tissueTotalLookups++;
    }

    
    /** 
    *   Resolves a disease name to a {@link Disease} and associates it 
    *   with the given BiologicalContext.
    *<p>
    *   The term 'cancer' is replaced by 'neoplasm' before searching. The 
    *   static disease cache is tried first; on a miss the name is looked 
    *   up by name/synonym and the first match (if any) is used and 
    *   cached. A null/empty name, or a name without any match, adds no 
    *   disease and only appends a comment for the CarbBank contributor.
    *   An exception thrown by the lookup is logged and handled like 
    *   "no match".
    *</p>
    */
    private static final void 
    __lookup_disease( String disease_name, BiologicalContext bc )
    {
        String term = disease_name;
        
        //  --- case 1: nothing usable to look up ---
        if ( term == null || term.length() == 0 )
        {
            log.debug("no parseable disease term, setting to NO disease associations");
            bc.getBiologicalContextContributor(carbBankContributor.getContributorId()).appendComment( "(No disease terms found)" );
            return;   
        }
        
        //  Hack: Carbbank often says 'cancer' where MeSH lists 'neoplasm';
        //  searching for 'cancer' keeps yielding 'precancerous condition',
        //  which is wrong, so the term is swapped before any lookup.
        if ( "cancer".equals( term ) )
            term = "neoplasm";
        
        //  --- case 2: term already resolved earlier, reuse it ---
        final Disease cached = diseaseCache.get( term );
        if ( cached != null )
        {
            if ( log.isDebugEnabled() )
                log.debug("disease '" + term + "' is cached");
            
            //  NOTE(review): this path attaches the disease through 
            //  addDiseaseAssociation(), whereas the lookup path below adds 
            //  a new DiseaseContext directly -- confirm both are equivalent.
            bc.addDiseaseAssociation( cached );   
            
            diseaseCacheHits++;
            diseaseTotalLookups++;
            return;
        }
        
        //  --- case 3: cache miss, ask the data store ---
        List<Disease> matches;
        try
        {
            matches = Disease.lookupNameOrSynonym( term );
        }
        catch ( Exception e )
        {
            //  a failed lookup is handled the same as an empty result
            log.warn( "Caught exception while hitting the DB", e );
            matches = null;
        }
        
        if ( matches == null || matches.size() == 0 )
        {
            //  no match at all: leave a note naming the unresolved term
            final String note = "Disease term '" 
                              + term
                              + "' was not found";
            
            bc.getBiologicalContextContributor(carbBankContributor.getContributorId()).appendComment( note );
        }
        else
        {
            //  !!! NOTE !!! 
            //  only the first match is used, any further matches are 
            //  dropped; this may need to be reviewed in a future revision
            //  !!! NOTE !!!
            if ( log.isDebugEnabled() )
            {
                log.debug( "found " 
                         + matches.size() 
                         + " result(s) for disease '"
                         + term 
                         + "': "
                         + join(", ", matches )
                         );
            }
            
            final Disease first = matches.get(0);
            bc.getDiseaseContexts().add( new DiseaseContext( bc, first ) );
            diseaseCache.put( term, first );
        }
        
        diseaseDatabaseHits++;
        diseaseTotalLookups++;
    }

    
    /** 
    *   Simple command-line driver for testing. 
    *<p>
    *   Every argument is treated as the name of a file to parse. For 
    *   each record parsed, its carbbank and glycoct sequences are 
    *   printed to stderr; a record whose sequences cannot be printed is
    *   reported and left out of the final count. Each input stream is 
    *   closed once its file has been processed, including when parsing
    *   fails with an exception.
    *</p>
    *
    *   @param args  names of the files to parse
    *   @throws java.io.FileNotFoundException  if a named file cannot be opened
    *   @throws IOException  if reading or closing an input file fails
    */
    public static void main( String[] args )
    throws java.io.FileNotFoundException, IOException
    {
        if ( args.length == 0 ) 
        {
            System.err.println("no argument!");
            return;
        }
        
        CarbbankParser parser = new CarbbankParser();
        
        //  number of records whose sequences were printed successfully
        int c = 0;
        
        for ( String filename : args )
        {
            FileInputStream instream = new FileInputStream( filename );
            try
            {
                parser.setInputStream( instream );
                
                while ( true )
                {
                    //  a null record marks the end of the current input
                    CarbbankRecord r = parser.parse();
                    if ( r == null ) 
                        break;    
                    
                    try
                    {
                        System.err.println( "carbbank:" );
                        System.err.println( r.getCarbbankSequence() );
                        System.err.println( "glycoct:" );
                        System.err.println( r.getGlycoctSequence() );
                    }
                    catch ( Exception e )
                    {
                        //  report the problem and move on to the next 
                        //  record without counting this one
                        System.err.println( e );
                        continue;                                        
                    }
                    
                    c++;
                }
            }
            finally
            {
                //  always release the file handle, even if parsing threw
                instream.close();
            }
        }
        
        System.err.println("parsed " + c + " records");
        
    }
    
    
} // end class