/* * EuroCarbDB, a framework for carbohydrate bioinformatics * * Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as * indicated by the @author tags or express copyright attribution * statements applied by the authors. * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * A copy of this license accompanies this distribution in the file LICENSE.txt. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * Last commit: $Rev: 1932 $ by $Author: glycoslave $ on $Date:: 2010-08-05 #$ */ package org.eurocarbdb.util.carbbank; import java.util.Map; import java.util.List; import java.util.Collections; import org.apache.log4j.Logger; import org.eurocarbdb.dataaccess.core.*; import org.eurocarbdb.sugar.SugarSequence; import org.eurocarbdb.sugar.SequenceFormat; import org.eurocarbdb.sugar.seq.GlycoctSequenceFormat; import static org.eurocarbdb.util.StringUtils.*; import static org.eurocarbdb.util.carbbank.CarbbankParser.*; import static org.eurocarbdb.dataaccess.Eurocarb.getEntityManager; /** *<p> * Represents an individual Carbbank structural record. *</p> *<p> * A typical Carbbank record looks like this: *</p> *<pre> ; start of record ; db=ccsd29 ; Record #=14 CC: CCSD:43759 AU: Albersheim P; Darvill A; Augur C; Cheong JJ; Eberhard S; Hahn MG; Marfa V; Mohnen D; O'Neill MA; Spiro MD; York WS TI: Oligosaccharins: Oligosaccharide regulatory molecules CT: Acc Chem Res (1992) 25: 77-83 BS: (GS) Hansenula holstii BS: (GS) Pichia holstii SC: 14 TN: XXFG MT: xyloglucan SB: Westra B DA: 06-12-1995 FC: 66bf2505 SI: CBank:17702 ---------------- structure: a-L-Fucp-(1-2)-b-D-Galp-(1-2)-a-D-Xylp-(1-6)+ | a-D-Xylp-(1-6)+ b-D-Glcp-(1-4)-D-Glc | | b-D-Glcp-(1-4)+ | a-D-Xylp-(1-6)-b-D-Glcp-(1-4)+ ================end of record *</pre> *<p> * This class provides an interface by which to extract data from * individual records and return this information in terms of Eurocarb * data objects. *</p> * * @author mjh [glycoslave@gmail.com] */ public class CarbbankRecord { /** Logging handle. */ static final Logger log = Logger.getLogger( CarbbankRecord.class ); /** Entire raw carbbank record */ private String rawEntry; /** Hash of carbbank keys => values. */ private Map<String,String> data; /** Line number on which this record started */ final int firstLine; /** Line number on which this record ends */ final int lastLine; /** Carbbank-given id. */ private int id; /** The glycan sequence encoded within this Carbbank record. */ private GlycanSequence glycanSequence; /** The literature reference given for this Carbbank record. */ private JournalReference journalReference; /** The entry reference created to present the link between * eurocarb & this carbbank record */ private Reference entryReference; /** List of contexts for this carbbank record */ private List<BiologicalContext> contexts; /** * Creates a new carbbank record from the passed data Map. * The data map is expected to be keyed by Carbbank 2-letter * descriptor. * * @param data Hash of carbbank 2-letter keys to values * @param from_line First line of this carbbank record * @param to_line Last line of this record */ CarbbankRecord( String entry, Map<String,String> data, int from_line, int to_line ) { assert data != null; assert data.size() > 0; this.data = data; this.rawEntry = entry; assert from_line > 0; assert from_line < to_line; this.firstLine = from_line; this.lastLine = to_line; // determine carbbank id assert data.containsKey("CC"); String idstring = data.get("CC"); this.id = parseCarbbankId( idstring ); } /** * Gets the Carbbank-supplied id. */ public int getCarbbankId() { return id; } /** * Returns the (potentially multi-line) Carbbank sequence for * this Carbbank record. */ public String getCarbbankSequence() { assert data != null; assert data.containsKey("sequence"); return data.get("sequence"); } /** * Returns the list of {@link BiologicalContext biological contexts} * corresponding to the list of biological sources found within * this carbbank record. This list will be empty if no biological * contexts were found. */ public List<BiologicalContext> getContexts() { if ( contexts == null ) __lookup_contexts(); return contexts; } /*/* * Returns any/all evidence given in this entry. * * <strong>Not yet implemented; returns null</strong> * / public List<Evidence> getEvidence() { return null; } */ /** * Returns the sequence contained within this carbbank record * in {@link GlycoctSequenceFormat GlycoCT format}. */ public SugarSequence getGlycoctSequence() throws Exception { String carbbank_seq = this.getCarbbankSequence(); assert carbbank_seq != null; assert carbbank_seq.length() > 0; String seq = CarbbankParser.translateCarbbankSequence( carbbank_seq ); // a null return value means an unparseable sequence if ( seq == null ) { log.debug("carbbank sequence was deemed uparseable, returning null"); return null; } return new SugarSequence( seq, SequenceFormat.Glycoct ); } /** * Returns a {@link Reference} object representing this Carbbank * entry itself, specifically, its carbbank id. */ public Reference getEntryReference() { if ( entryReference != null ) return entryReference; Reference r = new Reference(); r.setReferenceType( Reference.Type.DatabaseEntry.toString() ); r.setExternalReferenceId( "" + this.getCarbbankId() ); r.setExternalReferenceName( "Carbbank" ); // mjh: temporary addition for DB debugging cause lots of info! r.setReferenceComments( join( data, ": ", "<br/>" ) ); // store the object in the session to avoid conflicts if( r.getReferenceId()<=0 ) getEntityManager().store(r); this.entryReference = r; return entryReference; } /* getJournalReference *//******************************************** * * Returns a Eurocarb {@link JournalReference} object encapsulating * the literature reference given for this Carbbank entry. Returns * null if any exception is encountered during parse. * * Carbbank fields used:<br/> *<ul> * <li>CT - citation, the journal/vol/page reference</li> * <li>TI - title of the paper</li> * <li>AU - author list, semicolon delimited</li> * <li>SB - submitting author</li> *</ul> *<p> * Citation is of typical citation form -- * <tt>[journal abbrev] ([year]) [volume]: [from_page]-[to_page]</tt>, eg: *<pre> * Acc Chem Res (1992) 25: 77-83' *</pre> * Citation is mandatory. *</p> *<p> * Author list is a semicolon-delimited list of authors, eg:<br/> *<pre> * Albersheim P; Darvill A; Augur C; Cheong JJ; Eberhard S; Hahn MG; * Marfa V; Mohnen D; O'Neill MA; Spiro MD; York WS *</pre> *</p> *<p> * Title & submitting author are taken as given. *</p> */ public JournalReference getJournalReference() { if ( journalReference != null ) return journalReference; assert data != null; assert data.size() > 0; String citation = data.get("CT"); String title = data.get("TI"); String authorlist = data.get("AU"); String submitter = data.get("SB"); assert citation != null; String idstring = citation + authorlist; // check if this journalReference exists in the cache if ( (journalReference = CarbbankParser.referenceCache.get(idstring)) != null ) { log.debug("journalReference is cached"); return journalReference; } if ( log.isDebugEnabled() ) log.debug( "parsing citation string '" + citation + "'" ); try { String[] citation_pieces = citation.split("\\s*\\((?=\\d)"); assert citation_pieces.length == 2; String[] pieces = citation_pieces[1].split("\\W+"); String journal_name = citation_pieces[0]; //System.out.println( "pieces are: " + join(", ", pieces ) ); int journal_year = pieces.length > 0 ? Integer.parseInt( pieces[0] ) : -1; String journal_vol_ = ( pieces.length > 1 && pieces[1].length() > 0 ) ? pieces[1] : null; String journal_p1 = pieces.length > 2 ? pieces[2] : null; String journal_p2 = pieces.length > 3 ? pieces[3] : null; if ( log.isDebugEnabled() ) { log.debug( "parsed JournalReference => name: '" + journal_name + "'; year: " + journal_year + "'; vol: '" + journal_vol_ + "'; page: " + journal_p1 + "-" + journal_p2 ); } assert title != null; assert journal_year > 1900; int journal_vol = __int( journal_vol_ ); int page1 = __int( journal_p1 ); int page2 = __int( journal_p2 ); journalReference = JournalReference.lookupByCitation( journal_name, journal_year, journal_vol, page1 ); if ( journalReference == null ) { journalReference = new JournalReference(); journalReference.setTitle( title ); journalReference.setAuthors( authorlist ); journalReference.setFirstPage( page1 ); journalReference.setLastPage( page2 ); journalReference.setJournalVolume( journal_vol ); journalReference.setPublicationYear( journal_year ); Journal journal = null; if ( journalCache.containsKey( journal_name ) ) { journal = journalCache.get( journal_name ); } else { journal = Journal.createOrLookup( journal_name ); if ( journal == null ) { journal = new Journal(); journal.setJournalTitle( journal_name ); } // store the object in the session to avoid conflicts if( journal.getJournalId()<=0 ) getEntityManager().store( journal ); } journalReference.setJournal( journal ); // store the object in the session to avoid conflicts if( journalReference.getJournalReferenceId()<=0 ) getEntityManager().store( journalReference ); CarbbankParser.journalCache.put( journal_name, journal ); } // cache it CarbbankParser.referenceCache.put( idstring, journalReference ); return journalReference; } catch ( Exception e ) { log.warn( "Caught exception parsing Carbbank journal " + "reference, returning null", e ); return null; } } /** * Returns the {@link GlycanSequence glycan} encoded by * this carbbank entry. NOTE: carbbank structures are not * necessarily unique, in particular, identical structures * that are attached to different aglyca are regarded as * different structures by carbbank. Currently, carbbank * aglyca are discarded by the current {@link CarbbankSequenceFormat * carbbank sequence format parser}. Accordingly, multiple * carbbank records may return the same {@link GlycanSequence}. * * @see CarbbankSequenceFormat * @see CarbbankParser#sequenceCache */ public GlycanSequence getGlycanSequence() throws Exception { if ( glycanSequence != null ) return glycanSequence; SugarSequence sseq = this.getGlycoctSequence(); if ( sseq == null ) { if ( log.isDebugEnabled() ) log.debug( "couldn't obtain a glycoct sequence for carbbank id " + this.getCarbbankId() ); return null; } // look in cache first if ( (glycanSequence = CarbbankParser.sequenceCache.get(sseq.toString())) != null ) { // if it exists we concatenate this carbbank id to the // existing id(s) log.debug("returning cached glycan sequence"); return glycanSequence; } else { glycanSequence = GlycanSequence.lookupByExactSequence( sseq ); if ( glycanSequence == null ) { // otherwise create a new sequence log.debug( "carbbank sequence does not exist in the " + "data store, creating new sequence" ); glycanSequence = new GlycanSequence(sseq); } // store the object in the session to avoid conflicts if( glycanSequence.getGlycanSequenceId()<=0 ) getEntityManager().store(glycanSequence); // cache it before returning CarbbankParser.sequenceCache.put( sseq.toString(), glycanSequence ); return glycanSequence; } } /** * Returns all raw data for the current record keyed by * Carbbank 2-letter field. Map is returned by reference; * therefore changes made to the Map will be kept (!). */ public Map getRawData() { return data; } /** Returns the entire raw Carbbank entry as String. */ public String getRawEntry() { return rawEntry; } /** * Returns the raw value of the given field, <em>sans</em> parsing. * Returns null if field doesn't exist. */ public String getRawField( String field ) { return data.get( field ); } public String toString() { return "carbbank record " + id + ": lines " + firstLine + "-" + lastLine ; } //~~~~~ PRIVATE METHODS ~~~~~ /** * Extracts biological source information from this carbbank * record and looks up biological context information for * any/all biological sources. */ private final void __lookup_contexts() { assert contexts == null; if ( data.containsKey("BS") ) { String sourcelist = data.get("BS"); this.contexts = parseBiologicalSource( sourcelist ); } else { this.contexts = Collections.emptyList(); } } /** Converts a string to an int, stripping letters as required. */ private static final int __int( final String s ) { if ( s == null || s.length() == 0 ) return -1; try { return new Integer( s ).intValue(); } catch ( NumberFormatException if_string_has_letters ) { String number_only = s.replaceAll("\\D", ""); return __int( number_only ); } } } // end class