/* * EuroCarbDB, a framework for carbohydrate bioinformatics * * Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as * indicated by the @author tags or express copyright attribution * statements applied by the authors. * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * A copy of this license accompanies this distribution in the file LICENSE.txt. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * Last commit: $Rev: 1932 $ by $Author: glycoslave $ on $Date:: 2010-08-05 #$ */ package org.eurocarbdb.sugar.seq; // stdlib imports import java.util.Set; import java.util.List; import java.util.HashSet; import java.util.LinkedList; import java.util.Comparator; import java.util.Collections; import java.util.ListIterator; import java.io.StringReader; // 3rd party imports import org.apache.log4j.Logger; import com.google.common.collect.ListMultimap; import com.google.common.collect.ArrayListMultimap; // eurocarb imports import org.eurocarbdb.util.Visitor; import org.eurocarbdb.util.graph.Graph; import org.eurocarbdb.util.graph.Vertex; import org.eurocarbdb.util.graph.Edge; import org.eurocarbdb.util.graph.Path; // import org.eurocarbdb.util.graph.DepthFirstGraphVisitor; import org.eurocarbdb.sugar.Sugar; import org.eurocarbdb.sugar.SugarRepeat; import org.eurocarbdb.sugar.Residue; import org.eurocarbdb.sugar.Linkage; import org.eurocarbdb.sugar.Anomer; import org.eurocarbdb.sugar.Basetype; import org.eurocarbdb.sugar.Substituent; import org.eurocarbdb.sugar.Substituents; import org.eurocarbdb.sugar.Monosaccharide; import org.eurocarbdb.sugar.SugarSequence; import org.eurocarbdb.sugar.SequenceFormat; import org.eurocarbdb.sugar.RingConformation; import org.eurocarbdb.sugar.GlycosidicLinkage; import org.eurocarbdb.sugar.SequenceFormatException; import org.eurocarbdb.sugar.impl.SimpleMonosaccharide; import org.eurocarbdb.sugar.seq.grammar.IupacLexer; import org.eurocarbdb.sugar.seq.grammar.IupacParser; import org.eurocarbdb.sugar.seq.grammar.ParserAdaptor; import org.eurocarbdb.sugar.seq.grammar.IupacParserAdaptor; // static imports import static java.util.Collections.sort; import static org.eurocarbdb.util.StringUtils.join; import static org.eurocarbdb.util.graph.Graphs.getPaths; /* class IupacSequenceFormat *//*********************************** * <style> tt { color: darkgreen; } </style> <p> Implementation of a parser/generator for the Eurocarb carbohydrate sequence format. This format is largely based on the <a href="http://www.chem.qmul.ac.uk/iupac/2carb/index.html">1996 IUPAC recommendations for carbohydrate nomenclature</a>. As is the norm for carbohydrate sequences, structures are read from right-to-left; that is, the "root" monosaccharide is always the rightmost monosaccharide. </p> <p> Samples of this format are as follows: <ul> <li>The human 'A' blood group antigen: <br/> <tt>GalNAc(a1-3)[Fuc(a1-2)]Gal</tt> </li> <li>The human 'B' blood group antigen: <br/> <tt>Gal(a1-3)[Fuc(a1-2)]Gal</tt> </li> <li>The human 'O' blood group antigen: <br/> <tt>Fuc(a1-2)Gal</tt> </li> <li> The N-glycan Man3GlcNAc2 core: <br/> <tt>Man(a1-6)[Man(1-3)]Man(b1-4)GlcNAc(b1-4)GlcNAc</tt> </li> <li>An example tri-antennary, tri-sialylated complex N-glycan: <br/> <tt>NeuAc(a2-6)Gal(b1-4)GlcNAc(b1-4)[NeuAc(a2-3)Gal(b1-4)GlcNAc(b1-2)]Man(a1-6)[NeuAc(a2-3)Gal(b1-4)GlcNAc(b1-2)Man(1-3)]Man(b1-4)GlcNAc(b1-4)GlcNAc</tt> </li> </ul> </p> <h2>Grammar</h2> <h3>Residues</h3> <p> Monosaccharide/residues must be between 3 and 6 characters long, and may consist of any alphanumeric or underscore ([A-Za-z0-9_]), upper or lower case, except for the first letter, which must be alphabetic and upper-case. Examples include Man, Glc, GlcNAc, NeuAc, Neu2Ac. </p> <p> This nomenclature is consistent with the overwhelming majority of common naturally occuring monosaccharide names covered by IUPAC. Names currently cannot contain hyphens ('-'), although these may be added to accomodate the reduced forms of sugars, which are commonly abbreviated to <tt>-ol</tt>, eg: <tt>GlcNAc-ol</tt>. </p> <h3>Linkages</h3> <p> Linkages generally take the form '<tt>([anomer][reducing-terminus]-[non-reducing-terminus])</tt>', eg: <tt>Gal(b1-4)Glc</tt> refers to an beta 1->4 linkage from the 1 (reducing) position of the Gal (Galactose), to the 4 (a non-reducing) position on the Glc (Glucose) to form the common disaccharide Lactose. Parentheses '()' are required around linkages and the internal delimiter '<tt>-</tt>' is required between reducing and non-reducing terminii. </p> <p> There are several other linkage descriptions that deviate from this, for example covalently bound inorganic phosphate ('<tt>P</tt>') and sulfate ('<tt>S</tt>'). These substituents omit the anomer and reducing terminus syntax, as per the following examples: <tt>P(-4)Gal(b1-4)Glc</tt>. <strong>(P & S linkage syntax is NOT yet supported by this parser)</strong> </p> <h3>Branches</h3> <p> Branches are indicated in the text sequence by the delimiters '<tt>[]</tt>', which surround the text from the opening (rightmost) linkage in the branch to the last (leftmost) residue in the branch. </p> <p> Created 06-Oct-2005. </p> * * @author mjh * @see IupacParser * @see IupacLexer * @see IupacParserAdaptor * @see SugarSequence * @see Sugar * ********************************************************************/ public class IupacSequenceFormat implements SequenceFormat { //~~~~~~~~~~~~~~~~~~~~~~ STATIC FIELDS ~~~~~~~~~~~~~~~~~~~~~~~~~~ /** Logging handle. */ static final Logger log = Logger.getLogger( IupacSequenceFormat.class ); //~~~~~~~~~~~~~~~~~~~~~~~~~~ FIELDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //~~~~~~~~~~~~~~~~~~~~~~~ CONSTRUCTORS ~~~~~~~~~~~~~~~~~~~~~~~~~~ // no constructors... //~~~~~~~~~~~~~~~~~~~~~~~~~ METHODS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /** Returns "Iupac" */ public String getName() { return "Iupac"; } /** {@inheritDoc} @see SequenceFormat#getMonosaccharide(String) */ public Monosaccharide getMonosaccharide( String seq ) throws SequenceFormatException { return SimpleMonosaccharide.forName( seq ); } public Substituent getSubstituent( String seq ) throws SequenceFormatException { return Substituents.getSubstituent( seq ); } /** {@inheritDoc} @see SequenceFormat#getSugar(String) */ public Sugar getSugar( String seq ) throws SequenceFormatException { IupacLexer lexer = new IupacLexer( new StringReader( seq ) ); IupacParser parser = new IupacParser( lexer ); ParserAdaptor.performParse( parser, seq ); return parser.getSugar(); } /** {@inheritDoc} @see SequenceFormat#getSequence(Sugar) */ public String getSequence( Sugar s ) { // throw new UnsupportedOperationException("NOT YET IMPLEMENTED"); // Graph<Linkage,Residue> graph = s.getGraph(); Generator g = new Generator( s ); String seq = g.generateSequence(); return seq; } /** {@inheritDoc} @see SequenceFormat#getSequence(Monosaccharide) */ public String getSequence( Monosaccharide m ) { /* Basetype b = m.getBasetype(); Anomer a = m.getAnomer(); RingConformation rc = m.getRingConformation(); throw new UnsupportedOperationException(); */ return m.getName(); } /** {@inheritDoc} @see SequenceFormat#getSequence(Residue) */ public String getSequence( Residue r ) { return r.getName(); } /** {@inheritDoc} @see SequenceFormat#getSequence(Substituent) */ public String getSequence( Substituent s ) { return s.getName(); } public static class Generator extends Visitor { private LinkedList<Object> seqList; private StringBuilder sequence; private final Graph<Linkage,Residue> graph; private final Sugar sugar; private boolean hasRepeat = false; static Comparator<Path<Linkage,Residue>> pathComparator = new Comparator<Path<Linkage,Residue>>() { int i1, i2, i3; public final int compare( Path<Linkage,Residue> p1 , Path<Linkage,Residue> p2 ) { int i = ((Integer) p2.countVertexes()).compareTo( p1.countVertexes() ); if ( i != 0 ) return i; int size = p1.countEdges(); for ( i = 1; i <= size; i++ ) { i1 = p1.getEdge(i).getValue().getParentTerminus(); i2 = p2.getEdge(i).getValue().getParentTerminus(); i3 = ((Integer) i2).compareTo( i1 ); // log.info("comparing:\n p1: " + p1 + "\n p2: " + p2 + "\n i=" + i + ", i1=" + i1 + ", i2=" + i2 + ", i3=" + i3 + ", size=" + size); if ( i3 != 0 ) return i3; } return 0; } public final boolean equals( Object x ) { return false; } }; public Generator( Sugar s ) { this.sugar = s; this.graph = s.getGraph(); this.seqList = new LinkedList<Object>(); } public String generateSequence() { // reset sequence list seqList.clear(); // get the list of all paths through the graph from // root to each leaf List<Path<Linkage,Residue>> paths = getPaths( graph ); // order these paths by length and linkage sort( paths, pathComparator ); if ( log.isDebugEnabled() ) { log.debug( "sorted paths (root -> leaf):\n " + join("\n ", paths ) ); } // highest sorted path becomes the 'main' branch, // the rest become branches // paths.get( 0 ).values( seqList ); paths.get( 0 ).elements( seqList ); // record which graph elements have been incorporated Set<Object> seen = new HashSet<Object>( graph.size() * 3 ); seen.addAll( paths.get( 0 ).elements() ); // for each branch: for ( int i = 1; i < paths.size(); i++ ) { Path<Linkage,Residue> path = paths.get(i); ListIterator<Object> iter = seqList.listIterator(); // for each element in the branch, root -> leaf for ( int j = 0; j < path.size(); j++ ) { if ( seen.contains( path.get(j) ) ) { iter.next(); continue; } // we've reached the point at which the current branch // diverges from the main brain, so insert it. iter.add( BRANCH_END ); for ( int k = j; k < path.size(); k++ ) { // iter.add( path.getValue( k ) ); iter.add( path.get( k ) ); seen.add( path.get( k ) ); } iter.add( BRANCH_START ); } } // log.debug( "sequence is: " + seqList ); Collections.reverse( seqList ); this.sequence = new StringBuilder( seqList.size() * 5 ); for ( Object x : seqList ) { visit( x ); } return sequence.toString(); } static final String BRANCH_START = "["; static final String BRANCH_END = "]"; static final String LINKAGE_START = "("; static final String LINKAGE_END = ")"; static final String LINKAGE_SEP = "-"; public void accept( Monosaccharide x ) { sequence.append( new IupacSequenceFormat().getSequence( x ) ); } public void accept( Substituent x ) { sequence.append( new IupacSequenceFormat().getSequence( x ) ); } public void accept( String x ) { sequence.append( x ); } /* public void accept( GlycosidicLinkage x ) { sequence.append( LINKAGE_START ); sequence.append( x.getAnomer().toChar() ); sequence.append( x.getChildTerminus() ); sequence.append( LINKAGE_SEP ); sequence.append( x.getParentTerminus() ); sequence.append( LINKAGE_END ); } public void accept( SugarRepeat s ) { assert s.getGraph() == this.graph; hasRepeat = true; visit( graph ); } public void accept( Sugar s ) { assert s.getGraph() == this.graph; visit( graph ); } public void accept( Graph<Linkage,Residue> g ) { assert g == this.graph; unvisited = new HashSet<Vertex<Linkage,Residue>>( g.size(), 1.0 ); } */ public void accept( Vertex<Linkage,Residue> v ) { visit( v.getValue() ); } public void accept( Edge<Linkage,Residue> e ) { sequence.append( LINKAGE_START ); GlycosidicLinkage x = (GlycosidicLinkage) e.getValue(); Residue parent = e.getParent().getValue(); Residue child = e.getChild().getValue(); if ( ! (child instanceof Substituent) ) { Anomer a = ((Monosaccharide) child).getAnomer(); // if ( x.getChildAnomer() != Anomer.NONE ) if ( a != Anomer.None ) sequence.append( a.toChar() ); sequence.append( x.getChildTerminus() > 0 ? x.getChildTerminus() : '?' ); } sequence.append( LINKAGE_SEP ); if ( ! (parent instanceof Substituent) ) { sequence.append( x.getParentTerminus() > 0 ? x.getChildTerminus() : '?' ); } sequence.append( LINKAGE_END ); } } // end class Generator } // end class IupacSequenceFormat