/*
* EuroCarbDB, a framework for carbohydrate bioinformatics
*
* Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as
* indicated by the @author tags or express copyright attribution
* statements applied by the authors.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
* A copy of this license accompanies this distribution in the file LICENSE.txt.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* Last commit: $Rev: 1932 $ by $Author: glycoslave $ on $Date:: 2010-08-05 #$
*/
package org.eurocarbdb.util.carbbank;
import java.util.Map;
import java.util.List;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.Collections;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.log4j.Logger;
import org.eurocarbdb.resourcesdb.Config;
import org.eurocarbdb.resourcesdb.io.MonosaccharideConverter;
import org.eurocarbdb.resourcesdb.GlycanNamescheme;
import org.eurocarbdb.MolecularFramework.sugar.Sugar;
import org.eurocarbdb.MolecularFramework.io.SugarImporterException;
import org.eurocarbdb.MolecularFramework.io.GlycoCT.SugarExporterGlycoCT;
import org.eurocarbdb.MolecularFramework.io.GlycoCT.SugarExporterGlycoCTCondensed;
import org.eurocarbdb.MolecularFramework.io.GlycoCT.SugarImporterGlycoCT;
import org.eurocarbdb.MolecularFramework.io.namespace.GlycoVisitorToGlycoCT;
import org.eurocarbdb.MolecularFramework.io.namespace.GlycoVisitorToGlycoCTextendMSDB;
import org.eurocarbdb.MolecularFramework.io.carbbank.SugarImporterCarbbank;
import org.eurocarbdb.MolecularFramework.util.visitor.GlycoVisitorException;
import org.eurocarbdb.MolecularFramework.util.validation.GlycoVisitorSugarGraph;
import org.eurocarbdb.MolecularFramework.util.validation.SugarGraphInformation;
import org.eurocarbdb.util.ProgressWatchable;
import org.eurocarbdb.dataaccess.core.Contributor;
import org.eurocarbdb.dataaccess.core.Taxonomy;
import org.eurocarbdb.dataaccess.core.TissueTaxonomy;
import org.eurocarbdb.dataaccess.core.BiologicalContext;
import org.eurocarbdb.dataaccess.core.DiseaseContext;
import org.eurocarbdb.dataaccess.core.Disease;
import org.eurocarbdb.dataaccess.core.Journal;
import org.eurocarbdb.dataaccess.core.JournalReference;
import org.eurocarbdb.dataaccess.core.GlycanSequence;
import static org.eurocarbdb.util.StringUtils.join;
/**
*<p>
* Parses <a href="http://biol.lancs.ac.uk/gig/pages/gag/carbbank.htm">Carbbank</a>
* records from an {@link InputStream}. Individual records
* are accessed as {@link CarbbankRecord}s.
*</p>
*<p>
* To use:
*<pre>
CarbbankParser cb = new CarbbankParser();
cb.setInputStream( open_stream );
CarbbankRecord record;
// a null record means end-of-file
while ( (record = cb.parse()) != null )
{
...
}
*</pre>
*</p>
*
* @see CarbbankRecord
* @author mjh [glycoslave@gmail.com]
*/
public class CarbbankParser implements ProgressWatchable
{
//~~~~~~~~~~~~~~~~~~~~~ STATIC FIELDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~
/** Logging handle, shared by all static and instance methods of this class. */
static final Logger log = Logger.getLogger( CarbbankParser.class.getName() );
/** Verbosity limitation of debugging information. Setting to 0
* allows a basic amount of debugging info through the logging
* system; values higher than 0 provide more and more info.
* NOTE(review): this constant is not referenced anywhere else in this
* class -- confirm whether it is still needed. */
private static final int DEBUG_LEVEL = 2;
/** String used to delimit multiple values for Carbbank fields.
* {@link #parse} inserts it between repeated occurrences of the same
* field, and {@link #parseBiologicalSource} passes it to
* {@code String.split}, where it is interpreted as a regular expression. */
private static final String _DELIM_ = ";;";
// All caches below are static (shared by every parser instance) and are
// plain, unsynchronised HashMaps.
/** Cache of glycoct condensed sequence to Eurocarb glycan sequence object.
* Not read or written within this class; presumably used by other classes
* in this package -- verify. */
static Map<String,GlycanSequence>
sequenceCache = new HashMap<String,GlycanSequence>();
/** Cache of carbbank taxonomy string to Eurocarb taxonomy object.
* Filled lazily by the taxonomy preload and lookup methods of this class. */
static Map<String,Taxonomy>
taxonomyCache = new HashMap<String,Taxonomy>();
/** Cache of carbbank tissue string to Eurocarb tissue_taxonomy.
* Only successful database lookups are stored. */
static Map<String,TissueTaxonomy>
tissueCache = new HashMap<String,TissueTaxonomy>();
/** Cache of carbbank disease string to Eurocarb disease.
* Only successful database lookups are stored. */
static Map<String,Disease>
diseaseCache = new HashMap<String,Disease>();
/** Cache of carbbank reference string to Eurocarb reference.
* Not read or written within this class; presumably used by other classes
* in this package -- verify. */
static Map<String,JournalReference>
referenceCache = new HashMap<String,JournalReference>();
/** Cache of carbbank journal name to Eurocarb journal.
* Not read or written within this class; presumably used by other classes
* in this package -- verify. */
static Map<String,Journal>
journalCache = new HashMap<String,Journal>();
/** Number of taxonomy lookups answered from {@link #taxonomyCache}. */
static int taxonomyCacheHits = 0;
/** Number of taxonomy lookups that went to the database (slow),
* whether or not a match was found. */
static int taxonomyDatabaseHits = 0;
/** Total number of taxonomy lookups. Should be == to cache hits + DB hits. */
static int taxonomyTotalLookups = 0;
/** Number of tissue lookups answered from {@link #tissueCache}. */
static int tissueCacheHits = 0;
/** Number of tissue lookups that went to the database (slow),
* whether or not a match was found. */
static int tissueDatabaseHits = 0;
/** Total number of tissue lookups. Should be == to cache hits + DB hits. */
static int tissueTotalLookups = 0;
/** Number of disease lookups answered from {@link #diseaseCache}. */
static int diseaseCacheHits = 0;
/** Number of disease lookups that went to the database (slow),
* whether or not a match was found. */
static int diseaseDatabaseHits = 0;
/** Total number of disease lookups. Should be == to cache hits + DB hits. */
static int diseaseTotalLookups = 0;
// This static initialiser currently does nothing: its only content is the
// disabled shutdown hook below, which would print lookup statistics at JVM exit.
static
{
/*
// report stats on shutdown
// NOTE(review): as written this would not compile if re-enabled, because
// recordsParsed and sequencesUnparseable are instance fields and cannot be
// read from a static context.
Runtime.getRuntime().addShutdownHook(
new Thread()
{
public void run()
{
System.out.println();
System.out.println("=== Summary ===");
if ( recordsParsed > 0 )
{
System.out.println("total sequences parsed = " + recordsParsed );
System.out.println("unparseable sequences = " + sequencesUnparseable );
}
if ( taxonomyTotalLookups > 0 )
{
System.out.println("taxonomyCacheHits=" + taxonomyCacheHits);
System.out.println("taxonomyDatabaseHits=" + taxonomyDatabaseHits);
System.out.println("taxonomyTotalLookups=" + taxonomyTotalLookups);
}
}
}
);
*/
}
//~~~~~~~~~~~~~~~~~~~~~ OBJECT FIELDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~
/** Input stream from which to parse records. Remains null until
* {@link #setInputStream} has been called. */
private BufferedReader in;
/** If {@link #in} is a FileInputStream, then this is the size
* of the file, in bytes; otherwise it stays 0, which
* {@link #getPercentComplete} treats as "size unknown". */
private long inputStreamSize = 0;
/** The number of bytes that have been read from {@link #in}.
* {@link #parse} derives this from the character length of each line it
* reads, so it is an approximation of the byte count rather than an
* exact figure. */
long bytesRead = 0;
/** When parsing started; ie: the first time the {@link #parse}
* method was called. Milliseconds as returned by
* {@code System.currentTimeMillis()}; 0 means parsing has not started. */
long parsingStartTime = 0;
/** Number of lines parsed, cumulative. Not reset by {@link #setInputStream}. */
int lineCount = 0;
/** Number of records parsed, cumulative. Not reset by {@link #setInputStream}. */
int recordsParsed = 0;
/** Number of sequences that could not be parsed. Never modified within
* this class; presumably updated by other classes in this package -- verify. */
int sequencesUnparseable = 0;
//~~~~~~~~~~~~~~~~~~~~~ OBJECT METHODS ~~~~~~~~~~~~~~~~~~~~~~~~~~
/**
* Convenience method that simply returns all carbbank records
* that are parseable from the given {@link InputStream}.
*
* Using this method consumes a significant amount of system memory,
* be warned.
*
* mjh: commenting out cause memory usage is over the top.
* /
public static parseAll( InputStream instream, List<CarbbankRecord> records )
{
if ( records == null )
throw new IllegalArgumentException(
"expected a list for argument 'records', but got null");
CarbbankParser parser = new CarbbankParser();
parser.setInputStream( instream );
int count_added = 0;
while ( true )
{
CarbbankRecord r = parser.parse();
if ( r == null )
break;
records.add( r );
count_added++;
}
log.info("Parsed " + count_added + " carbbank record(s)");
return;
}
*/
/**
 * Sets the input stream from which carbbank entries are read and resets
 * the per-stream progress state (known stream size, bytes read and
 * parsing start time). The cumulative line and record counters are
 * left untouched.
 *
 * @param instream the stream to read carbbank records from; if it is a
 *        {@link FileInputStream} its size is recorded so that
 *        {@link #getPercentComplete} can report progress.
 * @throws IllegalArgumentException if instream is null.
 */
public void setInputStream( InputStream instream )
{
    if ( instream == null )
        throw new IllegalArgumentException(
            "expected an input stream for argument 'instream', but got null");

    this.in = new BufferedReader( new InputStreamReader( instream ));

    // reset progress state. The size must be cleared as well, otherwise a
    // non-file stream set after a file would inherit the old file's size
    // and report bogus progress.
    this.inputStreamSize = 0;
    this.bytesRead = 0;
    this.parsingStartTime = 0;

    // if it's a File, we can read its size, which can be used to
    // provide progress info.
    if ( instream instanceof FileInputStream )
    {
        try
        {
            this.inputStreamSize = ((FileInputStream) instream).getChannel().size();
        }
        catch ( IOException e )
        {
            // progress reporting is best-effort only; parsing still works
            log.warn("While reading file size", e );
        }
    }
}
/**
 * Parses and returns 1 carbbank entry. If a complete carbbank
 * entry cannot be parsed from the stream set by {@link #setInputStream}
 * (eg: end of file/stream has been reached), this method returns null.
 *<p>
 * Lines are interpreted as follows: lines of 2 characters or fewer,
 * lines starting with "---" and lines starting with ";" are skipped;
 * a line starting with "===" ends the record; a line starting with a
 * space is part of the structure sequence; a line of form
 * 'XX: value' starts (or adds another value to) field XX; anything
 * else is appended to the most recently seen field.
 *</p>
 *
 * @return the next record, or null if no field could be read before
 *         the end of the stream.
 * @throws IOException if reading from the underlying stream fails.
 * @throws IllegalStateException if no input stream has been set.
 */
public CarbbankRecord parse() throws IOException
{
    if ( in == null )
        throw new IllegalStateException(
            "no input stream set - call setInputStream() before parse()");

    if ( parsingStartTime == 0 )
        parsingStartTime = System.currentTimeMillis();

    Map<String,String> map = new HashMap<String,String>();
    StringBuilder sequence = new StringBuilder();
    StringBuilder entry = new StringBuilder();
    int first_line = lineCount + 1;
    String last_key = null;

    while ( true )
    {
        String line = in.readLine();
        if ( line == null )
            break;

        // readLine() strips the line terminator, so count one extra
        // character for it; this keeps the progress estimate from
        // falling short by one per line. Still only an approximation
        // of the real byte count.
        bytesRead += line.length() + 1;

        // keep the raw text of the record, verbatim
        entry.append( line );
        entry.append('\n');
        lineCount++;

        if ( line.length() <= 2 )
            continue;

        if ( line.startsWith("---") )
            // ignore it - it signals the start of a structure sequence
            continue;

        if ( line.startsWith(";") )
            // ignore it - comment line
            continue;

        if ( line.startsWith("===") )
        {
            // signals the end of a record
            assert sequence.length() > 0;
            map.put("sequence", sequence.toString() );
            break;
        }

        if ( line.startsWith(" ") )
        {
            // signals that this is part of a (potentially multi-line)
            // carbbank sequence
            sequence.append( line );
            sequence.append('\n');
            continue;
        }

        if ( line.matches("^[A-Z][A-Z]: .*") )
        {
            // ie: it's a 'key: value' line, the format of which
            // is '[A-Z][A-Z]: <data>'. data may be spread over
            // multiple lines.
            String key = line.substring( 0, 2 );
            String val = line.substring( 3 ).trim();

            // repeated fields are collapsed into one delimited value
            if ( map.containsKey(key) )
                map.put( key, map.get(key) + _DELIM_ + val );
            else
                map.put( key, val );

            last_key = key;
            continue;
        }

        // otherwise it can only be a continuation of a previous
        // 'key: value' line, and so append it to the last key
        // we observed.
        if ( last_key == null )
        {
            // no field seen yet in this record: there is nothing to
            // continue. Previously (with assertions disabled) this
            // stored the text under a null key, prefixed with "null".
            log.warn( "ignoring line "
                + lineCount
                + ", which continues no preceding 'key: value' line. line was:\n"
                + line
            );
            continue;
        }
        map.put( last_key, map.get(last_key) + line );
    }

    // returning null tells the client of this parser that
    // there are no more sequences to parse.
    if ( map.size() == 0 )
        return null;

    this.recordsParsed++;

    if ( log.isDebugEnabled() )
        log.debug( "parsed carbbank entry "
            + recordsParsed
            + ", lines "
            + first_line
            + "-"
            + lineCount
        );

    return new CarbbankRecord( entry.toString(), map, first_line, lineCount );
}
/** Contributor to which every biological context created by this class is
* attributed. Obtained once, when this class is initialised, from
* {@code CarbbankManager.getCarbbankContributor()}. */
public static Contributor carbBankContributor=CarbbankManager.getCarbbankContributor();
/**
 * Parses a biological source ('BS' field in carbbank) and extracts
 * and looks up biological context information for each/all sources
 * found. Returns an empty list if no sources could be found.
 *<p>
 * Multiple sources are expected to be separated by the internal
 * multi-value delimiter. One {@link BiologicalContext} is created per
 * source, and its taxonomy, tissue and disease are looked up from the
 * 'GS'/'CN'/'domain', 'OT' and 'disease' sub-fields respectively.
 *</p>
 *
 * @param bs_field the raw 'BS' field value; may be null or blank, in
 *        which case an empty list is returned.
 * @return the list of contexts, one per source; never null.
 */
public static List<BiologicalContext> parseBiologicalSource( String bs_field )
{
    // a record without a 'BS' field has no sources at all
    if ( bs_field == null )
        return Collections.emptyList();

    String _bs_field = bs_field.trim();
    if ( _bs_field.length() == 0 )
        return Collections.emptyList();

    String[] sources = _bs_field.split( _DELIM_ );
    List<BiologicalContext> bc_list
        = new ArrayList<BiologicalContext>( sources.length );

    for ( String source : sources )
    {
        log.debug("creating new biological context...");
        BiologicalContext bc = new BiologicalContext();
        bc.addContributor(carbBankContributor , "This biological context was parsed from "
            + "Carbbank biological source string '"
            + source
            + "'");

        if ( log.isDebugEnabled() )
            log.debug("looking up source text '" + source + "'");

        Map<String,String> map = new HashMap<String,String>();
        __convert_source_to_map( source, map );

        /*~~~ Taxonomy - 'domain'/'CN'/'GS' fields ~~~*/
        // identify a taxonomic term with which to search, preferring
        // 'GS' over 'CN' over 'domain'
        String taxonomy_name = null;
        if ( map.containsKey("GS") )
            taxonomy_name = map.get("GS");
        else if ( map.containsKey("CN") )
            taxonomy_name = map.get("CN");
        else if ( map.containsKey("domain") )
            taxonomy_name = map.get("domain");

        // ...then look it up
        __lookup_taxonomy( taxonomy_name, bc );

        /*~~~ Tissue Taxonomy - 'OT' field ~~~*/
        String tissue_name = map.get("OT");
        __lookup_tissue( tissue_name, bc );

        /*~~~ Disease(s) - 'disease' field ~~~*/
        String disease_name = map.get("disease");
        __lookup_disease( disease_name, bc );

        bc_list.add( bc );
    }

    // split() drops trailing empty strings, so a field consisting only
    // of delimiters produces no sources at all
    if ( bc_list.size() == 0 )
    {
        log.debug("NO biological contexts given in record");
        bc_list = Collections.emptyList();
    }

    return bc_list;
}
/**
 * Parses a carbbank id from a carbbank 'CC' field.
 * Carbbank IDs are usually of form 'CCSD:NNNN' where N is a
 * a positive integer. This method returns only the numeric
 * portion. Returns -1 if the passed id string does not conform
 * with the general Carbbank syntax.
 *<p>
 * If the text after the 'CCSD:' prefix is not a plain integer, all
 * non-digit characters are stripped and the remaining digits are
 * parsed instead.
 *</p>
 *
 * @param cc_field the raw 'CC' field value; may be null.
 * @return the numeric id, or -1 if cc_field is null, does not start
 *         with 'CCSD:', or contains no parseable number.
 */
public static int parseCarbbankId( String cc_field )
{
    final String prefix = "CCSD:";

    // validate explicitly rather than with assert, so that the documented
    // -1 is returned whether or not assertions are enabled
    if ( cc_field == null || ! cc_field.startsWith( prefix ) )
        return -1;

    String idstring = cc_field.substring( prefix.length() );

    try { return Integer.parseInt( idstring ); }
    catch ( NumberFormatException string_has_non_numerics_in_it )
    {
        // second chance: keep only the digits. An empty or overflowing
        // digit string still fails and yields -1.
        try { return Integer.parseInt( idstring.replaceAll("\\D", "") ); }
        catch ( NumberFormatException id_is_invalid )
        {
            return -1;
        }
    }
}
/**
 * Runs a {@link GlycoVisitorSugarGraph} over the given sugar and returns
 * the sugar of the first sugar graph it reports. Judging by the method
 * name this is meant to strip aglycon parts; that is determined by the
 * visitor, not by this method -- verify against the visitor's contract.
 *
 * @param s the sugar to process.
 * @return the sugar held by the first reported sugar graph.
 * @throws Exception if the visitor reports no sugar graphs at all, or
 *         if the visitor itself fails.
 */
public static Sugar removeAglyca(Sugar s) throws Exception
{
    GlycoVisitorSugarGraph graphVisitor = new GlycoVisitorSugarGraph();
    graphVisitor.start( s );

    List<SugarGraphInformation> graphs = graphVisitor.getSugarGraphs();
    boolean haveGraphs = ( graphs != null && ! graphs.isEmpty() );
    if ( ! haveGraphs )
    {
        throw new Exception("SugarGraphInformation null or zero size");
    }

    // exactly one graph is expected; only the first is ever used
    assert graphs.size() == 1;
    SugarGraphInformation first = graphs.get( 0 );
    return first.getSugar();
}
/**
 * Convenience method to translate a carbbank sequence into a GlycoCT
 * condensed sequence string.
 *<p>
 * The sequence is parsed with {@link SugarImporterCarbbank}, normalised
 * with a {@link GlycoVisitorToGlycoCTextendMSDB} (CARBBANK name scheme,
 * non-strict, fusion enabled), passed through {@link #removeAglyca} and
 * finally exported with {@link SugarExporterGlycoCTCondensed}.
 *</p>
 *<p>
 * Note that this method does <em>not</em> return null for bad input:
 * any exception raised along the way is logged at WARN level and then
 * rethrown to the caller.
 *</p>
 *
 * @param carbbank_sequence the sequence in carbbank notation.
 * @return the exported GlycoCT condensed string.
 * @throws Exception whatever the importer, visitor or exporter threw.
 */
public static String translateCarbbankSequence( String carbbank_sequence )
throws Exception
{
    try
    {
        final long began = System.currentTimeMillis();

        SugarImporterCarbbank importer = new SugarImporterCarbbank();

        // the plain GlycoVisitorToGlycoCT was used here previously; the
        // extended MSDB variant is the one in use now
        MonosaccharideConverter converter = new MonosaccharideConverter( new Config() );
        GlycoVisitorToGlycoCTextendMSDB normaliser
            = new GlycoVisitorToGlycoCTextendMSDB( converter );
        normaliser.setNameScheme( GlycanNamescheme.CARBBANK );
        normaliser.setUseStrict( false );
        normaliser.setUseFusion( true );

        Sugar parsed = importer.parse( carbbank_sequence );
        normaliser.start( parsed );
        Sugar normalised = normaliser.getNormalizedSugar();

        // fall back to the normalised sugar if nothing came back
        Sugar stripped = removeAglyca( normalised );
        Sugar exportable = ( stripped != null ) ? stripped : normalised;

        // glycoct-condensed export; SugarExporterGlycoCT with getXMLCode()
        // would be the glycoct-XML equivalent
        SugarExporterGlycoCTCondensed exporter = new SugarExporterGlycoCTCondensed();
        exporter.start( exportable );
        String glycoct = exporter.getHashCode();

        if ( log.isDebugEnabled() )
        {
            long took = System.currentTimeMillis() - began;
            log.debug( "translation of carbbank sequence to glycoct took "
                + took
                + "msec"
            );
        }

        return glycoct;
    }
    catch ( Exception failure )
    {
        // log, then let the caller decide what to do about it
        log.warn( failure );
        throw failure;
    }
}
/**
 * If parsing has started, and the InputStream being read from
 * was a file, then this returns the percent of the file that
 * has been read so far, otherwise returns zero.
 *<p>
 * The value is rounded to the nearest whole percent and clamped to the
 * range 0-100, because the read counter is only an estimate of the
 * number of bytes consumed.
 *</p>
 *
 * @return a value between 0 and 100 inclusive; 0 if the stream size
 *         is unknown.
 * @see #setInputStream
 */
public int getPercentComplete()
{
    if ( inputStreamSize <= 0 )
        return 0;

    // do the division in floating point: the original long/long division
    // truncated to 0 until the very end, so the result was only ever 0 or 1
    long percent = Math.round( (100.0 * bytesRead) / inputStreamSize );

    if ( percent < 0 )
        return 0;
    if ( percent > 100 )
        return 100;

    return (int) percent;
}
/**
 * Reports how long parsing has been running: the number of millisecs
 * between the first call to {@link #parse} on the current
 * {@link InputStream} (see {@link #setInputStream}) and now.
 *
 * @return elapsed milliseconds, or 0 if {@link #parse} has not been
 *         called yet on the current stream.
 */
public int getMillisecsElapsed()
{
    final boolean started = ( parsingStartTime != 0 );
    if ( ! started )
    {
        return 0;
    }

    final long current = System.currentTimeMillis();
    assert current >= parsingStartTime;

    final long elapsed = current - parsingStartTime;
    return (int) elapsed;
}
//~~~~~~~~~~~~~~~~~~~~~ PRIVATE METHODS ~~~~~~~~~~~~~~~~~~~~~~~~~
/**
 * Seeds the taxonomy cache from the {@link CarbbankTaxonomy} enumeration,
 * ie: from Carbbank terms for which an NCBI id is already known.
 * Entries with a negative NCBI id are skipped. Entries whose NCBI id
 * cannot be resolved are cached as the Unknown taxonomy.
 */
private static final void __preload_taxonomy_cache()
{
    log.info("Preloading taxonomy cache...");
    assert taxonomyCache.size() == 0;

    for ( CarbbankTaxonomy known : CarbbankTaxonomy.values() )
    {
        if ( known.ncbiId >= 0 )
        {
            Taxonomy resolved = Taxonomy.lookupNcbiId( known.ncbiId );
            taxonomyCache.put(
                known.carbbankName,
                ( resolved != null ) ? resolved : Taxonomy.UnknownTaxonomy()
            );
        }
    }

    log.info("Preloaded " + taxonomyCache.size() + " taxonomies");
}
/**
 * Parses a Carbbank biological source ('BS' field) string
 * into a Map of field code to value. Carbbank biological
 * source strings are of form:<br/>
 *<pre>
 * (GS) Phytophthora megasperma, (OT) cell wall, (*) f.sp. glycinea
 *</pre>
 * The Map filled by this method for this string would be:
 *<pre>
 * GS => Phytophthora megasperma
 * OT => cell wall
 * \* => f.sp. glycinea
 *</pre>
 * Any text preceding the first parenthesis is ignored, as is a
 * trailing field code that has no value.
 *
 * @param source one biological source string.
 * @param map    an empty map, which receives the parsed fields.
 */
private static final void
__convert_source_to_map( String source, Map<String,String> map )
{
    assert map != null;
    assert map.size() == 0;

    // split on parentheses (with an optional leading comma and any
    // surrounding whitespace), which yields
    // [ <text before first paren>, code, value, code, value, ... ]
    String[] bits = source.split( ",?\\s*[\\(\\)]\\s*" );

    // the number of bits after splitting should be ODD,
    // because the source text should start with a '(',
    // which means the first element after splitting should be an empty string
    assert source.startsWith("(");
    if ( (bits.length & 1) == 0 )
        log.warn( "Uneven number of fields to values in string '"
            + source
            + "'\nbits parsed were: "
            + join(", ", bits )
        );

    // load the (code, value) pairs into the map. bits[0] is never a field
    // code, so start at 1 and advance a whole pair at a time; stepping by
    // one, as this loop used to, also stored every value as a key mapped
    // to the following field code.
    for ( int i = 1; i + 1 < bits.length; i += 2 )
    {
        // skip pairs with an empty field code, eg: from '()'
        if ( bits[i].length() == 0 ) continue;

        map.put( bits[i], bits[i+1] );
    }
}
/**
 * Look up the given taxonomy name in the database, and stick it in
 * the given BiologicalContext.
 *<p>
 * The static taxonomy cache is consulted first (and preloaded if it is
 * still empty). On a cache miss the database is searched, first for an
 * exact name/synonym and then for a looser name/synonym match; the first
 * result is used and cached. If nothing is found, or no name was given,
 * the context is set to the Unknown taxonomy and a comment explaining
 * why is appended to the Carbbank contributor's entry.
 *</p>
 *
 * @param taxonomy_name the taxonomic term to search for; null or empty
 *        means that no term could be parsed.
 * @param bc the biological context to update.
 */
private static final void
__lookup_taxonomy( String taxonomy_name, BiologicalContext bc )
{
    if ( taxonomyCache.size() == 0 )
        __preload_taxonomy_cache();

    // treat an empty term like a missing one, as the tissue and disease
    // lookups do; searching the database for "" is meaningless
    if ( taxonomy_name == null || taxonomy_name.length() == 0 )
    {
        log.debug("no parseable taxonomy term, setting to Unknown taxonomy");
        bc.setTaxonomy( Taxonomy.UnknownTaxonomy() );
        bc.getBiologicalContextContributor(carbBankContributor.getContributorId()).appendComment("(No parseable taxonomy term)");
        return;
    }

    // now full-text search for said taxonomic term
    // - try looking in our static cache first to save a DB lookup
    Taxonomy tax = taxonomyCache.get( taxonomy_name );
    if ( tax != null )
    {
        if ( log.isDebugEnabled() )
            log.debug("taxonomy '" + taxonomy_name + "' is cached");

        // it's in the cache, w00t!
        bc.setTaxonomy( tax );
        taxonomyCacheHits++;
    }
    else
    {
        // otherwise look it up...
        List<Taxonomy> results = null;
        try
        {
            results = Taxonomy.lookupExactNameOrSynonym( taxonomy_name );
            if ( results == null || results.size() == 0 )
                results = Taxonomy.lookupNameOrSynonym( taxonomy_name );
        }
        catch ( Exception e )
        {
            // a failed lookup is handled like "not found" below
            log.warn( "Caught exception while hitting the DB", e );
            results = null;
        }

        if ( results != null && results.size() > 0 )
        {
            // 1 or more taxonomy matches.
            //
            // !!! NOTE !!!
            // for now, we will simply take the first match,
            // and discard the others, which upon future revision
            // might need to be reviewed
            // !!! NOTE !!!
            if ( log.isDebugEnabled() )
            {
                log.debug( "found "
                    + results.size()
                    + " result(s) for taxonomy '"
                    + taxonomy_name
                    + "': "
                    + join(", ", results )
                );
            }

            tax = results.get(0);
            bc.setTaxonomy( tax );
            taxonomyCache.put( taxonomy_name, tax );
        }
        else
        {
            // taxonomy not found
            bc.setTaxonomy( Taxonomy.UnknownTaxonomy() );
            bc.getBiologicalContextContributor(carbBankContributor.getContributorId()).appendComment("Taxonomy term '"
                + taxonomy_name
                + "' was not found"
            );
        }

        taxonomyDatabaseHits++;
    }

    taxonomyTotalLookups++;
    return;
}
/**
 * Resolves a Carbbank tissue term to a {@link TissueTaxonomy} and sets
 * it on the given BiologicalContext.
 *<p>
 * Resolution order: the static tissue cache, then a name/synonym search
 * of the database (first match wins and is cached). A missing term, or
 * one that cannot be found, results in the Unknown tissue being set and
 * an explanatory comment being appended to the Carbbank contributor's
 * entry on the context.
 *</p>
 *
 * @param tissue_name the tissue term; null or empty means none was parsed.
 * @param bc the biological context to update.
 */
private static final void
__lookup_tissue( String tissue_name, BiologicalContext bc )
{
    final boolean no_term = ( tissue_name == null || tissue_name.length() == 0 );
    if ( no_term )
    {
        log.debug("no parseable tissue term, setting to Unknown tissue");
        bc.setTissueTaxonomy( TissueTaxonomy.UnknownTissue() );
        bc.getBiologicalContextContributor(carbBankContributor.getContributorId()).appendComment( "(No parseable tissue term)" );
        return;
    }

    // cheap path: have we resolved this term before?
    TissueTaxonomy cached = tissueCache.get( tissue_name );
    if ( cached != null )
    {
        if ( log.isDebugEnabled() )
            log.debug("tissue '" + tissue_name + "' is cached");

        bc.setTissueTaxonomy( cached );
        tissueCacheHits++;
        tissueTotalLookups++;
        return;
    }

    // slow path: ask the database. A failing lookup is treated
    // the same as an unsuccessful one.
    List<TissueTaxonomy> matches;
    try
    {
        matches = TissueTaxonomy.lookupNameOrSynonym( tissue_name );
    }
    catch ( Exception e )
    {
        log.warn( "Caught exception while hitting the DB", e );
        matches = null;
    }

    final boolean found = ( matches != null && ! matches.isEmpty() );
    if ( ! found )
    {
        bc.setTissueTaxonomy( TissueTaxonomy.UnknownTissue() );
        bc.getBiologicalContextContributor(carbBankContributor.getContributorId()).appendComment( "Tissue term '"
            + tissue_name
            + "' was not found"
        );
    }
    else
    {
        if ( log.isDebugEnabled() )
        {
            log.debug( "found "
                + matches.size()
                + " result(s) for tissue '"
                + tissue_name
                + "': "
                + join(", ", matches )
            );
        }

        // NOTE: only the first match is used, any others are discarded;
        // this may need to be reviewed in future.
        TissueTaxonomy first = matches.get( 0 );
        bc.setTissueTaxonomy( first );
        tissueCache.put( tissue_name, first );
    }

    tissueDatabaseHits++;
    tissueTotalLookups++;
}
/**
 * Resolves a Carbbank disease term to a {@link Disease} and associates
 * it with the given BiologicalContext.
 *<p>
 * Resolution order: the static disease cache, then a name/synonym search
 * of the database (first match wins and is cached). A missing term, or
 * one that cannot be found, leaves the context without a disease and
 * appends an explanatory comment to the Carbbank contributor's entry.
 * The term 'cancer' is searched for as 'neoplasm'.
 *</p>
 *
 * @param disease_name the disease term; null or empty means none was parsed.
 * @param bc the biological context to update.
 */
private static final void
__lookup_disease( String disease_name, BiologicalContext bc )
{
    final boolean no_term = ( disease_name == null || disease_name.length() == 0 );
    if ( no_term )
    {
        log.debug("no parseable disease term, setting to NO disease associations");
        bc.getBiologicalContextContributor(carbBankContributor.getContributorId()).appendComment( "(No disease terms found)" );
        return;
    }

    // Carbbank often says 'cancer', which MeSH files under 'neoplasm';
    // searching for 'cancer' keeps matching 'precancerous condition',
    // which is wrong, hence this substitution.
    final String term = "cancer".equals( disease_name ) ? "neoplasm" : disease_name;

    // cheap path: have we resolved this term before?
    Disease cached = diseaseCache.get( term );
    if ( cached != null )
    {
        if ( log.isDebugEnabled() )
            log.debug("disease '" + term + "' is cached");

        bc.addDiseaseAssociation( cached );
        diseaseCacheHits++;
        diseaseTotalLookups++;
        return;
    }

    // slow path: ask the database. A failing lookup is treated
    // the same as an unsuccessful one.
    List<Disease> matches;
    try
    {
        matches = Disease.lookupNameOrSynonym( term );
    }
    catch ( Exception e )
    {
        log.warn( "Caught exception while hitting the DB", e );
        matches = null;
    }

    final boolean found = ( matches != null && ! matches.isEmpty() );
    if ( ! found )
    {
        bc.getBiologicalContextContributor(carbBankContributor.getContributorId()).appendComment( "Disease term '"
            + term
            + "' was not found"
        );
    }
    else
    {
        if ( log.isDebugEnabled() )
        {
            log.debug( "found "
                + matches.size()
                + " result(s) for disease '"
                + term
                + "': "
                + join(", ", matches )
            );
        }

        // NOTE: only the first match is used, any others are discarded;
        // this may need to be reviewed in future.
        Disease first = matches.get( 0 );

        // NOTE(review): the cached path above uses addDiseaseAssociation()
        // while this path adds a DiseaseContext directly -- confirm that
        // the two are equivalent.
        bc.getDiseaseContexts().add( new DiseaseContext( bc, first ) );
        diseaseCache.put( term, first );
    }

    diseaseDatabaseHits++;
    diseaseTotalLookups++;
}
/**
 * Simple command-line driver for testing. Each argument is taken as the
 * name of a carbbank file; every record in it is parsed and its carbbank
 * and GlycoCT sequences are printed to standard error, followed by a
 * final count of the records that were printed successfully.
 *
 * @param args names of the carbbank files to parse.
 * @throws java.io.FileNotFoundException if a named file cannot be opened.
 * @throws IOException if reading (or closing) a file fails.
 */
public static void main( String[] args )
throws java.io.FileNotFoundException, IOException
{
    if ( args.length == 0 )
    {
        System.err.println("no argument!");
        return;
    }

    CarbbankParser parser = new CarbbankParser();
    int c = 0;

    for ( String filename : args )
    {
        FileInputStream instream = new FileInputStream( filename );
        try
        {
            parser.setInputStream( instream );

            while ( true )
            {
                CarbbankRecord r = parser.parse();
                if ( r == null )
                    break;

                try
                {
                    System.err.println( "carbbank:" );
                    System.err.println( r.getCarbbankSequence() );
                    System.err.println( "glycoct:" );
                    System.err.println( r.getGlycoctSequence() );
                }
                catch ( Exception e )
                {
                    // report and move on; failed records are not counted
                    System.err.println( e );
                    continue;
                }

                c++;
            }
        }
        finally
        {
            // always release the file handle, even if parsing failed
            instream.close();
        }
    }

    System.err.println("parsed " + c + " records");
}
} // end class