/*
* EuroCarbDB, a framework for carbohydrate bioinformatics
*
* Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as
* indicated by the @author tags or express copyright attribution
* statements applied by the authors.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
* A copy of this license accompanies this distribution in the file LICENSE.txt.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* Last commit: $Rev: 1210 $ by $Author: glycoslave $ on $Date:: 2009-06-12 #$
*/
package org.eurocarbdb.application.glycanbuilder;
import java.util.*;
import java.util.regex.*;
/**
Read and write glycan structures in the GlycoWorkbench internal
format.
@author Alessio Ceroni (a.ceroni@imperial.ac.uk)
*/
public class GWSParser implements GlycanParser {
private static Pattern residue_pattern;
private static Pattern link_pattern;
static {
String link_old_pattern_str = "-([1-9N\\?])";
String link_pattern_str = "--(?:((?:[1-9N\\?]/)*[1-9N\\?]=[1-9N\\?]),)*((?:[1-9N\\?]/)*[1-9N\\?])";
link_pattern = Pattern.compile("(?:" + link_old_pattern_str + ")|(?:" + link_pattern_str + ")");
String start_repeat_str = "\\[";
String end_repeat_str = "\\](?:\\_([0-9]+))?+(?:\\^([0-9]+))?+";
String residue_str = "([abo\\?][1-9N\\?])?+([DL]-)?+([a-zA-z0-9_#=\\.]+)(?:,([\\?opf]))?+";
String cleaved_str = "/([a-zA-z0-9_#]+)";
String place_str = "@(-?[0-9]+s?)";
residue_pattern = Pattern.compile("(?:" + start_repeat_str + ")|(?:" + end_repeat_str + ")|" +
"(?:" + residue_str + "(?:" + cleaved_str + ")?+)" +
"(?:" + place_str + ")?+" );
}
/**
Default Constructor
*/
public GWSParser() {
}
public void setTolerateUnknown(boolean f) {
}
public String writeGlycan(Glycan structure) {
return toString(structure,false,true);
}
/**
Create a unique representation of a glycan structure using the
lexical ordering between the children of each residue.
*/
public String writeGlycanOrdered(Glycan structure) {
return toString(structure,true,true);
}
public Glycan readGlycan(String str, MassOptions default_mass_options) throws Exception {
return fromString(str,default_mass_options);
}
/**
Static method for creating string representation of glycan
structures.
@param structure the structure to be converted
*/
static public String toString(Glycan structure) {
return toString(structure,false,true);
}
/**
Static method for creating string representation of glycan
structures.
@param structure the structure to be converted
@param ordered <code>true</code> if the representation must use
the lexical ordering between children
*/
static public String toString(Glycan structure, boolean ordered) {
return toString(structure,ordered,true);
}
/**
Static method for creating string representation of glycan
structures.
@param structure the structure to be converted
@param ordered <code>true</code> if the representation must use
the lexical ordering between children
@param add_massopt <code>true</code> if the representation must
contain the mass options
*/
static public String toString(Glycan structure, boolean ordered, boolean add_massopt) {
if( structure==null )
return "";
StringBuilder ss = new StringBuilder();
if( structure.getRoot()!=null ) {
ss.append(writeSubtree(structure.getRoot(),ordered));
if( structure.getBracket()!=null )
ss.append(writeSubtree(structure.getBracket(),ordered));
if( add_massopt ) {
ss.append("$");
ss.append(structure.getMassOptions().toString());
}
}
return ss.toString();
}
/**
Static method for creating glycan structures from their string representation
@param default_mass_options the mass options to use for the new
structure if they are not specified in the string
representation
@throws Exception if the string cannot be parsed
*/
static public Glycan fromString(String str, MassOptions default_mass_options) throws Exception {
str = TextUtils.trim(str);
// read mass options
MassOptions mass_opt = default_mass_options.clone();
int ind1 = str.indexOf('$');
if( ind1!=-1 ) {
mass_opt = MassOptions.fromString(str.substring(ind1+1));
str = str.substring(0,ind1);
}
// read structure
Glycan ret = null;
int ind2 = str.indexOf('}');
if( ind2==-1 )
ret = new Glycan(readSubtree(str,true),false,mass_opt);
else {
// read structure with bracket
ret = new Glycan(readSubtree(str.substring(0,ind2),true),
readSubtree(str.substring(ind2),true),
false,mass_opt);
}
return ret;
}
static protected String writeResidueType(Residue r) {
String str = "";
if( r.isBracket() )
str += '}';
else if( r.isStartRepetition() ) {
str += '[';
}
else if( r.isEndRepetition() ) {
str += ']';
if( r.getType().getMinRepetitions()>=0 )
str += "_" + r.getType().getMinRepetitions();
if( r.getType().getMaxRepetitions()>=0 )
str += "^" + r.getType().getMaxRepetitions();
}
else if( r.isCleavage() ) {
Residue cleaved_residue = r.getCleavedResidue();
str += writeResidueType(cleaved_residue) + "/" + r.getTypeName();
}
else {
if( r.hasAnomericState() || r.hasAnomericCarbon() )
str += r.getAnomericState() + "" + r.getAnomericCarbon();
if( r.hasChirality() )
str += r.getChirality() + "-";
str += r.getTypeName();
if( r.hasRingSize() )
str += "," + r.getRingSize();
}
return str;
}
static protected String writeSubtree(Residue r, boolean ordered ) {
//------------
// write type
String str = writeResidueType(r);
// write placement
if( r.getCleavedResidue()!=null ) {
Residue cleaved_residue = r.getCleavedResidue();
if( cleaved_residue.hasPreferredPlacement() )
str += "@" + placementToString(cleaved_residue.getPreferredPlacement());
}
else {
if( r.hasPreferredPlacement() )
str += "@" + placementToString(r.getPreferredPlacement());
}
//-----------------
// write children
Vector<String> str_children = new Vector<String>();
for( Linkage l : r.getChildrenLinkages() )
str_children.add(writeSubtree(l,ordered));
if( ordered )
Collections.sort(str_children);
// add parenthesis
for( int i=0; i<r.getChildrenLinkages().size()-1; i++ )
str += "(";
// write children
for( Iterator<String> i=str_children.iterator(); i.hasNext(); ) {
str += i.next();
// close parenthesis
if( i.hasNext() )
str += ")";
}
return str;
}
static protected String writeSubtree(Linkage l, boolean ordered) {
return ("--" + toStringLinkage(l) + writeSubtree(l.getChildResidue(),ordered));
}
static protected String toStringLinkage(Linkage link) {
StringBuilder sb = new StringBuilder();
for( Iterator<Bond> i=link.getBonds().iterator(); i.hasNext(); ) {
Bond b = i.next();
if( sb.length()>0 )
sb.append(',');
// write parent positions
char[] p_poss = b.getParentPositions();
for( int l=0; l<p_poss.length; l++ ) {
if( l>0 )
sb.append('/');
sb.append(p_poss[l]);
}
// write child position for non-glycosidic bonds
if( i.hasNext() ) {
sb.append('=');
sb.append(b.getChildPosition());
}
}
return sb.toString();
}
static protected Residue readSubtree(String str, boolean accept_empty) throws Exception {
if( str.length()==0 ) {
if( accept_empty )
return null;
throw new Exception("Empty node");
}
Residue ret = null;
if( str.charAt(0)=='}' ) {
ret = ResidueDictionary.createBracket();
str = str.substring(1);
}
else {
//------------------
// create residue
Matcher m = residue_pattern.matcher(str);
if( !m.lookingAt() )
throw new Exception("Invalid format for string: " + str );
if( str.charAt(0)=='[' )
ret = ResidueDictionary.createStartRepetition();
else if( str.charAt(0)==']' )
ret = ResidueDictionary.createEndRepetition(m.group(1),m.group(2));
else {
// get stereochemistry
char ret_anom_state = '?';
char ret_anom_carbon = '?';
char ret_chirality = '?';
if( m.group(3)!=null ) {
ret_anom_state = m.group(3).charAt(0);
ret_anom_carbon = m.group(3).charAt(1);
}
if( m.group(4)!=null )
ret_chirality = m.group(4).charAt(0);
// get type name
String typename = m.group(5);
// get ring size
char ret_ring_size = '?';
if( m.group(6)!=null )
ret_ring_size = m.group(6).charAt(0);
// create residue
ret = ResidueDictionary.newResidue(typename);
ret.setAnomericState(ret_anom_state);
ret.setAnomericCarbon(ret_anom_carbon);
ret.setChirality(ret_chirality);
ret.setRingSize(ret_ring_size);
// create cleavage
String cleavage_typename = m.group(7);
if( cleavage_typename!=null ) {
Residue cleavage = null;
if( cleavage_typename.indexOf('_')!=-1 )
cleavage = CrossRingFragmentDictionary.newFragment(cleavage_typename,ret);
else
cleavage = ResidueDictionary.newResidue(cleavage_typename);
cleavage.setCleavedResidue(ret);
ret = cleavage;
}
}
// get placement
if( m.group(8)!=null ) {
ResiduePlacement pref_place = placementFromString(m.group(8));
if( ret.getCleavedResidue()!=null )
ret.getCleavedResidue().setPreferredPlacement(pref_place);
else
ret.setPreferredPlacement(pref_place);
}
str = str.substring(m.end());
}
//-----------------
// parse children
// skip open parentheses
int nopars = 0;
for( ; nopars<str.length() && str.charAt(nopars)=='('; nopars++ );
str = str.substring(nopars);
// add children
while(str.length()>0) {
Linkage child_link = null;
if( nopars>0 ) {
// find subtree enclosed in parenthesis
int ind = TextUtils.findClosedParenthesis(str);
if( ind==-1 ) throw new Exception("Invalid string format: " + str);
child_link = readSubtreeLinkage(str.substring(0,ind));
str = str.substring(ind+1);
nopars--;
}
else {
// add last child
child_link = readSubtreeLinkage(str);
str = "";
}
// add child
child_link.setParentResidue(ret);
ret.getChildrenLinkages().add(child_link);
}
return ret;
}
static protected Linkage readSubtreeLinkage(String str) throws Exception {
Matcher m = link_pattern.matcher(str);
if( !m.lookingAt() )
throw new Exception("invalid format for linkage: " + str);
if( m.group(1)!=null ) {
// old style
// parse child
Residue child = readSubtree(str.substring(m.end()),false);
// create linkage
return new Linkage(null,child,m.group(1).charAt(0));
}
// parse bonds
Vector<Bond> ret_bonds = new Vector<Bond>();
for( int i=2; i<=m.groupCount(); i++ ) {
String str_bond = m.group(i);
if( i<m.groupCount() ) {
// parse non glyco bonds
if( str_bond!=null && str_bond.length()>0 ) {
String[] fields = str_bond.split("=");
char[] p_poss = parsePositions(fields[0]);
char c_pos = fields[1].charAt(0);
ret_bonds.add(new Bond(p_poss,c_pos));
}
}
else {
// parse glyco bond
char[] p_poss = parsePositions(str_bond);
ret_bonds.add(new Bond(p_poss,'?')); // anomeric carbon position is added later
}
}
// parse child
Residue child = readSubtree(str.substring(m.end()),false);
// create linkage
Linkage ret = new Linkage(null,child);
ret.setBonds(ret_bonds);
ret.getChildResidue().setParentLinkage(ret);
return ret;
}
static private char[] parsePositions(String str) throws Exception {
int c = 0;
char[] ret = new char[(str.length()+1)/2];
for( int i=0; i<str.length(); i+=2 ) {
if( i>0 && str.charAt(i-1)!='/' )
throw new Exception("Invalid positions string: " + str);
ret[c++] = str.charAt(i);
}
return ret;
}
static private String placementToString(ResiduePlacement rp) {
if( rp==null )
return "";
String str = "" + (rp.getPositions()[0].getIntAngle()+360);
if( rp.isSticky() )
str += 's';
return str;
}
static private ResiduePlacement placementFromString(String str) throws Exception {
if( str.length()==0 )
return null;
boolean _sticky = false;
if( str.charAt(str.length()-1)=='s' ) {
_sticky = true;
str = str.substring(0,str.length()-1);
}
return new ResiduePlacement(new ResAngle(Integer.parseInt(str)),false,_sticky);
}
}