/*
* EuroCarbDB, a framework for carbohydrate bioinformatics
*
* Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as
* indicated by the @author tags or express copyright attribution
* statements applied by the authors.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
* A copy of this license accompanies this distribution in the file LICENSE.txt.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* Last commit: $Rev: 1237 $ by $Author: glycoslave $ on $Date:: 2009-06-21 #$
*/
package org.eurocarbdb.sugar.seq.grammar;
// stdlib imports
import java.util.List;
import java.util.Stack;
import java.util.ArrayList;
// 3rd party imports - antlr
import antlr.LLkParser;
import antlr.Token;
import antlr.TokenBuffer;
import antlr.TokenStream;
import antlr.SemanticException;
import antlr.RecognitionException;
import antlr.TokenStreamException;
import antlr.TokenStreamRecognitionException;
import antlr.ParserSharedInputState;
// 3rd party imports - log4j logging
import org.apache.log4j.Logger;
// eurocarb imports - sugar stuff
import org.eurocarbdb.sugar.Sugar;
import org.eurocarbdb.sugar.Anomer;
import org.eurocarbdb.sugar.Residue;
import org.eurocarbdb.sugar.Linkage;
import org.eurocarbdb.sugar.Substituent;
import org.eurocarbdb.sugar.Substituents;
import org.eurocarbdb.sugar.Monosaccharide;
import org.eurocarbdb.sugar.SequenceFormat;
import org.eurocarbdb.sugar.SequenceFormatException;
// eurocarb imports - graphs
import org.eurocarbdb.util.graph.Graph;
import org.eurocarbdb.util.graph.Vertex;
import org.eurocarbdb.util.graph.Edge;
import org.eurocarbdb.util.graph.GraphIterator;
import org.eurocarbdb.util.graph.DepthFirstGraphVisitor;
// eurocarb imports - string manipulation
import static org.eurocarbdb.util.StringUtils.join;
import static org.eurocarbdb.util.StringUtils.repeat;
/* class ParserAdaptor *//*****************************************
*<p>
* This class is an adaptor/helper class for building an Abstract
* Syntax Tree (AST) for a carbohydrate sequence using ANTLR.
* This class is intended to be used as a base class for generated
* sugar parser classes. For ANTLR grammars, this is accomplished by
* declaring your grammar to inherit from this class, eg:
*</p>
<pre>
class GlycoctParser extends Parser("org.eurocarbdb.sugar.seq.grammar.ParserAdaptor");
rest of ANTLR grammar here...
</pre>
*<p>
* Usage of this class from within an ANTLR grammar is very straightforward
* -- just call the {@link #addResidue} and {@link #addLinkage} methods repeatedly (from
* grammatical actions), and assign a root residue with the {@link #setRootResidue}
* method. {@link Residue} and {@link Linkage} tokens are produced via the
* <tt>create*Token</tt> methods. The AST produced is of type
* {@code Graph<LinkageToken,ResidueToken>}, which can be obtained from
* the {@link #getGraph} method, and the transformed sugar object from
* the {@link #getSugar} method. {@link SequenceFormatException}s are emitted
* immediately upon recognition of syntactic errors.
*</p>
*<p>
* Finally, the static method {@link #performParse} serves as a nice, easy
* driver for running a built parser:
*</p>
<pre>
String sequence = ...;
MyLexer lexer = new MyLexer( new StringReader( sequence ) );
MyParser parser = new MyParser( lexer );
ParserAdaptor.performParse( parser, sequence );
Sugar s = parser.getSugar();
</pre>
*
* @see ResidueToken
* @see LinkageToken
* @author mjh
* @version $Rev: 1237 $
*/
public abstract class ParserAdaptor extends LLkParser
{
//~~~~~~~~~~~~~~~~~~~~~ STATIC FIELDS ~~~~~~~~~~~~~~~~~~~~~~~//
/** Logging instance. */
static final Logger log = Logger.getLogger( ParserAdaptor.class );
/**
* Whether DEBUG-level logging was enabled on {@link #log} at the time this
* class was initialised. The value is captured once in a static final field,
* so later changes to the logger's level are not reflected here.
*/
static final boolean debugging = log.isDebugEnabled();
/**
* Whether TRACE-level logging was enabled on {@link #log} at the time this
* class was initialised. Captured once, like {@link #debugging}; it gates
* all of the trace output produced by this class.
*/
static final boolean tracing = log.isTraceEnabled();
//~~~~~~~~~~~~~~~~~~~~~~~~~~ FIELDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~//
/**
* Our sugar graph (Abstract Syntax Tree). {@link ResidueToken}s are added
* as vertices by {@link #addResidue} and {@link LinkageToken}s as edge
* values by {@link #addLinkage}.
*/
protected Graph<LinkageToken,ResidueToken> graph
= new Graph<LinkageToken,ResidueToken>();
/**
* The sugar sequence currently being parsed. Remains null until
* {@link #setSequence} is called.
*/
protected String sequence;
//~~~~~~~~~~~~~~~~~~~~~~ CONSTRUCTORS ~~~~~~~~~~~~~~~~~~~~~~~//
/*
* Constructors are not inherited in Java, so each one below simply
* forwards its arguments, unchanged, to the superclass constructor
* that has the same parameter list.
*/

/** Forwards to the superclass constructor taking {@code (int)}. */
public ParserAdaptor( int k )
{
    super( k );
}

/** Forwards to the superclass constructor taking {@code (ParserSharedInputState, int)}. */
public ParserAdaptor( ParserSharedInputState state, int k )
{
    super( state, k );
}

/** Forwards to the superclass constructor taking {@code (TokenBuffer, int)}. */
public ParserAdaptor( TokenBuffer buffer, int k )
{
    super( buffer, k );
}

/** Forwards to the superclass constructor taking {@code (TokenStream, int)}. */
public ParserAdaptor( TokenStream stream, int k )
{
    super( stream, k );
}
//~~~~~~~~~~~~~~~~~~~~~~ STATIC METHODS ~~~~~~~~~~~~~~~~~~~~~~~//
/**
* Drives a parse: registers the sequence string on the given parser
* instance and then invokes its {@link #sugar} start rule.
*<p>
* ANTLR exceptions raised by the start rule are converted into
* {@link SequenceFormatException}s. Where ANTLR reports a column, it is
* passed on (minus one) together with the sequence so that the resulting
* exception can carry positional context.
*</p>
*
* @param parse
* the parser instance to run
* @param sequence
* the sequence string to parse; must be neither null nor empty
* @throws SequenceFormatException
* if the sequence is null or zero-length, or in response to
* sequence syntax errors reported by ANTLR.
*/
public static void performParse( ParserAdaptor parse, String sequence )
throws SequenceFormatException
{
    boolean nothing_to_parse = ( sequence == null ) || ( sequence.length() == 0 );
    if ( nothing_to_parse )
    {
        throw new SequenceFormatException(
            "sequence cannot be null or zero-length" );
    }

    parse.setSequence( sequence );

    try
    {
        parse.sugar();
    }
    catch ( RecognitionException syntax_error )
    {
        // parser-level syntax error: the column is carried by the exception itself
        int error_index = syntax_error.column - 1;
        throw new SequenceFormatException(
            sequence, error_index, syntax_error.getMessage() );
    }
    catch ( TokenStreamRecognitionException lexer_error )
    {
        // lexer-level syntax error: the column lives on the wrapped
        // RecognitionException
        int error_index = lexer_error.recog.column - 1;
        throw new SequenceFormatException(
            sequence, error_index, lexer_error.getMessage() );
    }
    catch ( TokenStreamException stream_error )
    {
        // any other token stream failure: no position is used here,
        // so log it and report the bare message
        log.warn("Caught " + stream_error.getClass() + " while parsing sequence: "
            + stream_error.getMessage() );

        throw new SequenceFormatException( stream_error.getMessage() );
    }
}
//~~~~~~~~~~~~~~~~~~~~~~~~~ METHODS ~~~~~~~~~~~~~~~~~~~~~~~~~~~//
/* sugar *//***************************************************
*
* Start rule for parsing a sugar sequence in any format.
* This method is meant to be overridden by a grammar subclass;
* this method only serves to identify the start rule for parsing
* any given grammar. It is the rule invoked by {@link #performParse}.
*
* @throws RecognitionException
* declared for implementing grammars; {@link #performParse} converts
* it into a {@link SequenceFormatException}
* @throws TokenStreamException
* declared for implementing grammars; {@link #performParse} converts
* it into a {@link SequenceFormatException}
*/
public abstract void sugar()
throws RecognitionException, TokenStreamException
;
/* addResidue *//**********************************************
*
* Registers a residue as a new vertex of the sugar graph under
* construction. The vertex is added without any edges; connect it
* to the rest of the graph afterwards with {@link #addLinkage}.
*
* @param r
* the ResidueToken to add; must not be null
* @throws SequenceFormatException
* if the given token is null
*
* @see #createResidueToken
*/
public void addResidue( ResidueToken r ) throws SequenceFormatException
{
    _check_notnull( r );

    if ( tracing )
    {
        log.trace( "adding to graph: residue '" + r + "'" );
        traceParse( r, "A residue" );
    }

    // only checked when the JVM runs with assertions enabled
    assert ! graph.contains( r ): "Graph already contains residue " + r;

    graph.addVertex( r );
}
/* addLinkage *//**********************************************
*
* Connects two residue vertices of the sugar graph with an edge whose
* value is the given LinkageToken. When trace logging is on, also logs
* the sequence followed by a line of ASCII art that brackets the two
* residue columns and underlines the linkage columns.
*
* @param parent
* The residue on the <b>reducing</b> terminal side of the linkage.
* @param link
* The linkage.
* @param child
* The residue on the <b>non-reducing</b> terminal side of the linkage.
* @throws SequenceFormatException
* If any of the three arguments is null.
*/
public void addLinkage( ResidueToken parent, LinkageToken link, ResidueToken child )
throws SequenceFormatException
{
    _check_notnull( child );
    _check_notnull( parent );
    _check_notnull( link );

    if ( tracing )
    {
        log.trace(
            "adding to graph: linkage="
            + link
            + " between parent="
            + parent
            + ", position="
            + link.getLinkage().getParentTerminus()
            + " and child="
            + child
            + ", position="
            + link.getLinkage().getChildTerminus()
        );

        traceParse( link, "A linkage" );

        // columns of the two residues, ordered so that first_col <= last_col
        int child_col = child.getColumn();
        int parent_col = parent.getColumn();
        int first_col = Math.min( child_col, parent_col );
        int last_col = Math.max( child_col, parent_col );

        // width of the "___" run drawn between the '\' and '/' markers
        int span_width = Math.max( 0, last_col - first_col - 2 );

        // number of '^' characters to draw under the linkage;
        // a linkage of width 1 or less gets no underline at all
        int link_start = link.getLeftColumn();
        int link_end = link.getRightColumn();
        int caret_count = link_end - link_start + 1;
        if ( caret_count <= 1 )
            caret_count = 0;

        // blank gap between the '/' marker and the first '^'
        int gap_width = Math.max( 0, link_start - last_col - 1 );

        String bracket = repeat(" ", first_col)
            + '\\'
            + repeat("_", span_width)
            + '/';
        String underline = repeat(" ", gap_width )
            + repeat("^", caret_count );

        log.trace( sequence );
        log.trace( bracket + underline + " linkage value = " + link );
    }

    graph.addEdge(
        graph.getVertex( parent ),
        graph.getVertex( child ),
        link
    );
}
/* checkRepeatBounds *//***************************************
*
* Sanity checks the bounds of an internal sugar repeat sequence.
*<p>
* Two forms are accepted: a dual-bounded range ("XX-YY", both tokens
* given), in which the first bound must be strictly less than the
* second; and a single bound ("XX", upper token null), which must be
* greater than 1.
*</p>
*
* @param lowertok
* token holding the lower (or only) repeat bound
* @param uppertok
* token holding the upper repeat bound, or null for a
* singly-bounded repeat
* @throws SequenceFormatException
* if a bound is not a valid integer (including values too large
* for an int), or if the bounds fail the checks described above
*/
protected void checkRepeatBounds( Token lowertok, Token uppertok )
throws SequenceFormatException
{
    int lower = _parse_repeat_bound( lowertok );

    if ( uppertok == null )
    {
        /* it's a single bounded repeat "XX", not a range */
        if ( tracing )
            log.trace( "checking singly-bounded repeat = " + lowertok.getText() );

        if ( lower <= 1 )
        {
            throw new SequenceFormatException(
                getSequence(),
                lowertok.getColumn() - 1,
                "Single repeat bound must be greater than 1"
            );
        }
        return;
    }

    /* it's a dual-bounded repeat range "XX-YY" */
    if ( tracing )
    {
        log.trace(
            "checking dual-bounded repeat range: "
            + lowertok.getText()
            + "-"
            + uppertok.getText()
        );
    }

    int upper = _parse_repeat_bound( uppertok );

    if ( lower >= upper )
    {
        throw new SequenceFormatException(
            getSequence(),
            lowertok.getColumn() - 1,
            uppertok.getColumn() - 1,
            "First repeat bound in a repeat range must be"
            + " less than the second repeat bound"
        );
    }
}

/**
* Parses the text of a repeat-bound token as an int, reporting
* unparseable text as a positioned syntax error rather than letting
* an unchecked NumberFormatException escape from the parser.
*/
private int _parse_repeat_bound( Token bound )
throws SequenceFormatException
{
    String text = bound.getText();
    try
    {
        return Integer.parseInt( text );
    }
    catch ( NumberFormatException not_an_int )
    {
        // thrown for null/non-numeric text and for digit runs that overflow int
        throw new SequenceFormatException(
            getSequence(),
            bound.getColumn() - 1,
            "Repeat bound '" + text + "' is not a valid integer"
        );
    }
}
/* createLinkageToken *//**************************************
*
* Factory method for {@link LinkageToken}s. The default implementation
* builds a plain LinkageToken from the current sequence string and the
* three given tokens; subclasses may override it to return linkage
* tokens specific to their sequence format.
*
* @param anomer Token encapsulating anomeric configuration.
* @param parent Token from reducing terminal side.
* @param child Token from non-reducing terminal side.
* @return a new LinkageToken
*/
public LinkageToken createLinkageToken( Token anomer, Token parent, Token child )
{
    LinkageToken linkage_token = new LinkageToken( sequence, anomer, parent, child );
    return linkage_token;
}
/* createResidueToken *//**************************************
*
* Factory method for a ResidueToken representing a {@link Residue}
* that is neither a {@link Monosaccharide} nor a {@link Substituent}.
* Subclasses are expected to override this method if their sequence
* format needs such tokens.
*<p>
* Since {@link Residue}s are generally expected to be either
* {@link Monosaccharide}s or {@link Substituent}s, this default
* implementation always throws {@link UnsupportedOperationException}.
*</p>
*
* @param raw_sequence_token
* A token taken directly from the raw sequence.
* @throws SequenceFormatException
* Declared for overriding implementations; never thrown by this one.
* @throws UnsupportedOperationException
* Always, in this default implementation.
*/
protected ResidueToken createResidueToken( Token raw_sequence_token )
throws SequenceFormatException
{
    String not_implemented =
        "The default implementation of this method throws this exception"
        + " -- feel free to implement it.";

    throw new UnsupportedOperationException( not_implemented );
}
/* createMonosaccharideToken *//*******************************
*
* Factory method for a ResidueToken representing a {@link Monosaccharide}.
* The token text is looked up as a monosaccharide name in the
* {@link SequenceFormat} returned by {@link #getSequenceFormat}.
* Subclasses may override this method to return residue tokens
* specific to their sequence format.
*
* @param raw_sequence_token
* A token taken directly from the raw sequence.
* @return
* A {@link ResidueToken}, initialised from the token text
* @throws SequenceFormatException
* If the lookup of the token text returns no Monosaccharide.
*/
protected ResidueToken createMonosaccharideToken( Token raw_sequence_token )
throws SequenceFormatException
{
    String name = raw_sequence_token.getText();
    Monosaccharide monosac = getSequenceFormat().getMonosaccharide( name );

    if ( monosac != null )
        return new ResidueToken( this, raw_sequence_token, monosac );

    // unknown name: report the span of sequence occupied by the token text
    int first_index = raw_sequence_token.getColumn() - 1;
    int last_index = raw_sequence_token.getColumn() + name.length() - 2;

    throw new SequenceFormatException(
        getSequence(),
        first_index,
        last_index,
        "Unrecognised monosaccharide name: " + name
    );
}
/* createSubstituentToken *//**********************************
*
* Factory method for a ResidueToken representing a {@link Substituent}.
* The token text is looked up as a substituent name in the
* {@link SequenceFormat} returned by {@link #getSequenceFormat}.
* Subclasses may override this method to return residue tokens
* specific to their sequence format.
*<p>
* If the lookup returns nothing, this implementation does not fail:
* it logs a warning and falls back to a generic substituent created by
* {@link Substituents#createUnknownSubstituent}.
*</p>
*
* @param raw_sequence_token
* A token taken directly from the raw sequence.
* @return
* A {@link ResidueToken} wrapping the known or generic substituent
* @throws SequenceFormatException
* Declared for overriding implementations; this implementation
* does not itself throw it for unknown names.
*/
protected ResidueToken createSubstituentToken( Token raw_sequence_token )
throws SequenceFormatException
{
    String name = raw_sequence_token.getText();
    Substituent known = getSequenceFormat().getSubstituent( name );

    if ( known != null )
        return new ResidueToken( this, raw_sequence_token, known );

    // An unknown substituent name is deliberately tolerated (a syntax
    // error used to be raised here): warn and use a generic substituent.
    log.warn(
        "Substituent with name '"
        + name
        + "' is unknown, returning a generic substituent residue"
    );

    Substituent generic = Substituents.createUnknownSubstituent( name );
    return new ResidueToken( this, raw_sequence_token, generic );
}
/**
* Returns the {@link SequenceFormat} that this parser implements.
* The returned format is what {@link #createMonosaccharideToken} and
* {@link #createSubstituentToken} use to look up residue names.
*
* @return the sequence format implemented by the concrete parser
*/
public abstract SequenceFormat getSequenceFormat()
;
/* getGraph *//************************************************
*
* Accessor for the graph that this parser populates with residue
* and linkage tokens while parsing a sequence string. The internal
* instance itself is returned, not a copy.
*
* @return the internal sugar graph (AST)
*/
public Graph<LinkageToken,ResidueToken> getGraph()
{
    return this.graph;
}
/* getSugar *//************************************************
*
* Builds and returns the sugar object for the parsed sequence: a new
* Sugar is obtained from {@link #createSugar} and then populated from
* the internal graph via {@link #translateAstToSugar}. Calling this only
* makes sense once parsing has finished, see the sugar() method.
*
* @return the Sugar translated from the parsed AST
* @throws SequenceFormatException
* declared by this method; not raised directly in its body
*
* @see #sugar()
* @see #createSugar()
*/
public Sugar getSugar() throws SequenceFormatException
{
    if ( tracing )
    {
        log.trace("parsed sugar AST:\n" + graph.toString() );
    }

    final Sugar translated = createSugar();

    if ( debugging )
    {
        log.debug("Translating sugar AST to sugar object");
    }

    translateAstToSugar( graph, translated );

    return translated;
}
/* createSugar *//*********************************************
*
* Factory constructor for a new, empty {@link Sugar} object. The
* current vertex count of the internal graph is passed to the
* Sugar constructor. Intended to be overridden by subclasses
* where necessary.
*
* @return a newly constructed Sugar
*/
protected Sugar createSugar()
{
    int residue_count = graph.countVertices();
    return new Sugar( residue_count );
}
/**
* Populates the given empty {@link Sugar} from the abstract syntax tree
* of parsed {@link ResidueToken}s and {@link LinkageToken}s, by walking
* the tree with an AstTranslatorVisitor constructed around that Sugar.
*
* @param ast the graph of parsed tokens to translate
* @param sugar the Sugar handed to the visitor to receive the translation
*/
protected void translateAstToSugar( Graph<LinkageToken,ResidueToken> ast, Sugar sugar )
{
    // the visitor adds to the sugar as it walks the AST
    new AstTranslatorVisitor( sugar ).visit( ast );
}
/* getSequence *//*********************************************
*
* Accessor for the sequence string currently being parsed.
*
* @return the sequence last given to {@link #setSequence}, or null
* if none has been set
*/
public String getSequence()
{
    return this.sequence;
}
/* setRootResidue *//******************************************
*
* Marks the given residue as the "root" of the Sugar being parsed.
* This is an explicit call because different sequence formats
* encounter the root residue at different times during parsing.
* The root is set to whatever vertex the graph returns for the
* given token.
*
* @param r
* the ResidueToken to use as root; must not be null
* @throws SequenceFormatException
* if the given token is null
*/
public void setRootResidue( ResidueToken r ) throws SequenceFormatException
{
    _check_notnull( r );

    if ( tracing )
    {
        log.trace( "setting root residue '" + r + "'" );
        traceParse( r, "Root residue" );
    }

    graph.setRootVertex( graph.getVertex(r) );
}
/* setSequence *//*********************************************
*
* Records the sequence string being parsed. The stored string is used
* for positional error reporting and trace output.
*
* @param seq the sequence string about to be parsed
*/
public void setSequence( String seq )
{
    this.sequence = seq;
}
/* traceIn *//*************************************************
*
* Overridden from the ANTLR Parser class as a hook for adding
* extra context information to the trace messages produced by ANTLR.
* Calling of this method is controlled by the 'trace' and
* 'traceparser' debugging settings in ANTLR; that is, this
* method is implicitly called by ANTLR only if the ANTLR
* trace/traceparser setting is true. See the Ant "build-grammar"
* build task in the build.xml file for this package.
*<p>
* At present this method only delegates to the superclass; the extra
* output is disabled.
*</p>
*
* @param rule_name
* the rule name passed on to the superclass implementation
* @throws TokenStreamException
* as declared by the superclass method
*/
public void traceIn( String rule_name ) throws TokenStreamException
{
    super.traceIn( rule_name );

    // Disabled extra context: when debug logging was enabled this used to
    // call traceParse( LT(1) ) and then print a blank line to System.err.
}
/* only for hard-core debugging...
public void traceOut( String rule_name ) throws TokenStreamException
{
super.traceOut( rule_name );
traceParse( LT(1) );
}
*/
/**
* Factory method for creating {@link SequenceFormatException}s that
* cover the span of the current sequence occupied by the given token:
* from the token's column (minus one) through to the last character
* of its text.
*
* @see SequenceFormatException
* @param t the token at which the syntax error was detected
* @param message the error message to show
* @return a new, unthrown SequenceFormatException
*/
protected SequenceFormatException createSyntaxException( Token t, String message )
{
    int first_index = t.getColumn() - 1;
    int last_index = t.getColumn() + t.getText().length() - 2;

    return new SequenceFormatException(
        getSequence(), first_index, last_index, message );
}
/**
* Factory method for creating {@link SequenceFormatException}s that
* point at a single index of the current sequence.
*
* @see SequenceFormatException
* @param seq_index
* the index of the syntax error in the sequence string returned
* by {@link #getSequence}
* @param message the error message to show
* @return a new, unthrown SequenceFormatException
*/
protected SequenceFormatException createSyntaxException( int seq_index, String message )
{
    String current_sequence = getSequence();
    return new SequenceFormatException( current_sequence, seq_index, message );
}
/* traceParse *//**********************************************
*
* Writes a two-line debugging message to STDERR: the sequence being
* parsed, then a line of '^' characters positioned under the text of
* the passed Token, followed by the description and column number.
* Returns silently if tracing is off, if the Token or its text is null,
* or if the Token's column is less than 1.
*
* @param t
* The token to be traced
* @param description
* A text string describing what this token is meant to represent
*/
public void traceParse( Token t, String description )
{
    // nothing to do unless tracing, and nothing to show without token text
    if ( ! tracing || t == null || t.getText() == null )
        return;

    int col = t.getColumn();
    if ( col < 1 )
        return;

    System.err.println( sequence );

    // columns are 1-based, hence (col - 1) leading spaces
    String pointer_line =
        repeat(" ", col - 1 )
        + repeat("^", t.getText().length() )
        + " "
        + description
        + " (col "
        + col
        + ")";

    System.err.println( pointer_line );
}
/**
* Convenience form of {@link #traceParse(Token,String)} that uses the
* token name reported by getTokenName for the token's type as the
* description. Does nothing when tracing is off or the token is null.
*
* @param t
* The token to be traced
*/
public void traceParse( Token t )
{
    if ( tracing && t != null )
    {
        String token_name = this.getTokenName( t.getType() );
        traceParse( t, token_name );
    }
}
//~~~~~~~~~~~~~~~~~~~~ PRIVATE METHODS ~~~~~~~~~~~~~~~~~~~~~~~~
/**
* Guard used by the graph-building methods: rejects a null residue token.
*
* @throws SequenceFormatException if the given token is null
*/
private static final void _check_notnull( ResidueToken rt )
throws SequenceFormatException
{
    if ( rt != null )
        return;

    throw new SequenceFormatException(
        "Expected Residue, but got null");
}
/**
* Guard used by the graph-building methods: rejects a null linkage token.
*
* @throws SequenceFormatException if the given token is null
*/
private static final void _check_notnull( LinkageToken lt )
throws SequenceFormatException
{
    if ( lt != null )
        return;

    throw new SequenceFormatException(
        "Expected Linkage, but got null");
}
} // end class ParserAdapter