/*
* EuroCarbDB, a framework for carbohydrate bioinformatics
*
* Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as
* indicated by the @author tags or express copyright attribution
* statements applied by the authors.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
* A copy of this license accompanies this distribution in the file LICENSE.txt.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* Last commit: $Rev: 1210 $ by $Author: glycoslave $ on $Date:: 2009-06-12 #$
*/
package org.eurocarbdb.application.glycanbuilder;
import java.util.*;
import java.util.regex.*;
/**
 * Reads and writes glycan structures as strings in GlycoMinds linear format.
 *
 * <p>As implemented here, one residue token is written as
 * <code>CODE stereo [substitutions] anomer positions</code> where
 * <ul>
 * <li><code>CODE</code> is one of the upper-case keys of the type table,</li>
 * <li><code>stereo</code> is an optional <code>'</code> (other chirality),
 *     <code>^</code> (other ring size) or <code>~</code> (both),</li>
 * <li><code>[substitutions]</code> is an optional comma separated list of
 *     position/code pairs, e.g. <code>[3S,6S]</code>,</li>
 * <li><code>anomer</code> is an optional <code>a</code>, <code>b</code>,
 *     <code>o</code> or <code>?</code>,</li>
 * <li><code>positions</code> is an optional list of parent positions separated
 *     by <code>/</code>.</li>
 * </ul>
 * The token of a residue follows the text of its children; the first child is
 * written bare and every further child is enclosed in parentheses.
 *
 * @author Alessio Ceroni (a.ceroni@imperial.ac.uk)
 */
public class GlycoMindsParser implements GlycanParser {

    /**
     * Output description of a residue type: the GlycoMinds code together with
     * the chirality and ring size that the bare code stands for.
     * Both characters are '?' for entries created with the one-argument constructor.
     */
    private static final class SU {
        public final String code;
        public final char chirality;
        public final char ring_size;

        public SU(String _code) {
            this(_code, '?', '?');
        }

        public SU(String _code, char _chirality, char _ring_size) {
            code = _code;
            chirality = _chirality;
            ring_size = _ring_size;
        }
    }

    /** Matches the residue token that terminates a string (anchored at the end of input). */
    private static final Pattern gmind_pattern = Pattern.compile("([A-Z]+)([\\'\\^\\~]?)((?:\\[[\\?1-9a-zA-Z\\,]+\\])?)([abo\\?]?)([1-9\\?]?(?:/[1-9\\?])*)\\z");

    /** Matches a whole comma separated substitution list (content of the square brackets). */
    private static final Pattern gmind_sub_pattern = Pattern.compile("([\\?1-9])([a-zA-Z]+)(?:\\,([\\?1-9])([a-zA-Z]+))*");

    /** Matches one position/code pair of a substitution list. */
    private static final Pattern gmind_single_sub_pattern = Pattern.compile("([\\?1-9])([a-zA-Z]+)");

    /** GlycoMinds code to residue description used when reading. */
    private static final Map<String,String> gmind_types = new HashMap<String,String>();

    /** Residue type name to GlycoMinds code (and default stereochemistry) used when writing. */
    private static final Map<String,SU> gmind_codes = new HashMap<String,SU>();

    static {
        gmind_types.put("G", "?1D-Glc,p");
        gmind_types.put("A", "?1D-Gal,p");
        gmind_types.put("GN","?1D-GlcNAc,p");
        gmind_types.put("AN","?1D-GalNAc,p");
        gmind_types.put("M", "?1D-Man,p");
        gmind_types.put("N", "?2D-Neu,p");
        gmind_types.put("NN","?2D-NeuAc,p");
        gmind_types.put("NJ","?2D-NeuGc,p");
        gmind_types.put("K", "?2D-KDN,p");
        gmind_types.put("W", "?2D-KDO,p");
        gmind_types.put("L", "?1D-GalA,p");
        gmind_types.put("I", "?1D-IdoA,p");
        gmind_types.put("H", "?1L-Rha,p");
        gmind_types.put("F", "?1L-Fuc,p");
        gmind_types.put("X", "?1D-Xyl,p");
        gmind_types.put("B", "?1D-Rib,p");
        gmind_types.put("R", "?1L-Ara,f");
        gmind_types.put("U", "?1D-GlcA,p");
        gmind_types.put("O", "?1D-All,p");
        // NOTE(review): this table used to register "P" twice, first as
        // "?1D-Api,p" and then as phosphate "P". The second put replaced the
        // first, so "P" has always been read as phosphate and Api cannot be
        // read. Only the effective entry is kept; Api needs a code of its own.
        gmind_types.put("E", "?2D-Fru,f");
        gmind_types.put("T", "Ac");
        gmind_types.put("S", "S");
        gmind_types.put("P", "P");
        gmind_types.put("ME", "Me");

        gmind_codes.put("Glc",new SU("G", 'D', 'p'));
        gmind_codes.put("Gal",new SU("A", 'D', 'p'));
        gmind_codes.put("GlcNAc",new SU("GN",'D','p'));
        gmind_codes.put("GalNAc",new SU("AN",'D','p'));
        gmind_codes.put("Man",new SU("M",'D','p'));
        gmind_codes.put("Neu",new SU("N",'D','p'));
        gmind_codes.put("NeuAc",new SU("NN",'D','p'));
        gmind_codes.put("NeuGc",new SU("NJ",'D','p'));
        gmind_codes.put("KDN",new SU("K",'D','p'));
        gmind_codes.put("KDO",new SU("W",'D','p'));
        gmind_codes.put("GalA",new SU("L",'D','p'));
        gmind_codes.put("IdoA",new SU("I",'D','p'));
        gmind_codes.put("Rha",new SU("H",'L','p'));
        gmind_codes.put("Fuc",new SU("F",'L','p'));
        gmind_codes.put("Xyl",new SU("X",'D','p'));
        gmind_codes.put("Rib",new SU("B",'D','p'));
        gmind_codes.put("Ara",new SU("R",'L','f'));
        // Code "U" belongs to GlcA (see the read table). It was registered
        // under the key "Glc", which replaced the "G" entry above.
        gmind_codes.put("GlcA",new SU("U",'D','p'));
        gmind_codes.put("All",new SU("O",'D','p'));
        // NOTE(review): Api and phosphate are both written as "P"; on reading,
        // "P" is always interpreted as phosphate.
        gmind_codes.put("Api",new SU("P",'D','p'));
        gmind_codes.put("Fru",new SU("E",'D','f'));
        gmind_codes.put("Ac",new SU("T"));
        gmind_codes.put("S",new SU("S"));
        gmind_codes.put("P",new SU("P"));
        gmind_codes.put("Me",new SU("ME"));
    }

    /**
     * Does nothing: this parser keeps no such setting and ignores the flag.
     *
     * @param f ignored
     */
    public void setTolerateUnknown(boolean f) {
    }

    /**
     * Writes a structure as a GlycoMinds string.
     *
     * <p>A non-saccharide root is skipped in favour of its first child. When the
     * structure has a bracket residue, the core is written first (with "1%"
     * put in front of every leaf) followed by each child of the bracket,
     * separated by commas.
     *
     * @param structure the structure to write
     * @return the encoded structure, or the empty string for a fragment
     */
    public String writeGlycan(Glycan structure) {
        if( structure.isFragment() )
            return "";

        // skip the reducing end modification
        Residue root = structure.getRoot();
        if( root!=null && !root.isSaccharide() )
            root = root.firstChild();

        // no antennae: the whole structure is a single subtree
        if( structure.getBracket()==null )
            return writeSubtree(root,false);

        StringBuilder sb = new StringBuilder();
        sb.append(writeSubtree(root,true));
        for( Linkage l : structure.getBracket().getChildrenLinkages() ) {
            sb.append(',');
            sb.append(writeSubtree(l.getChildResidue(),false));
        }
        return sb.toString();
    }

    /**
     * Reads a structure from a GlycoMinds string.
     *
     * <p>Everything from the first ';' (or else ':' or else '#') is discarded,
     * percentage annotations such as "50%" or "(50%)" are removed, and the
     * remainder is divided into a core and antennae. Antennae may either
     * precede the core separated by '|' or follow it separated by ','.
     * Commas inside a square-bracketed substitution list do not separate antennae.
     *
     * @param str the string to parse
     * @param default_mass_options mass options handed to the new structure
     * @return the parsed structure
     * @throws Exception if the string contains "//", "*" or "{", or if some
     *         part of it cannot be parsed
     */
    public Glycan readGlycan(String str, MassOptions default_mass_options) throws Exception {
        str = TextUtils.trim(str);

        if( str.indexOf("//")!=-1 )
            throw new Exception("Unsupported structures with uncertain residues");
        if( str.indexOf("*")!=-1 )
            throw new Exception("Unsupported structures with unknown residues");
        if( str.indexOf("{")!=-1 )
            throw new Exception("Unsupported structures with repeating units");

        // remove aglyca
        int index = str.indexOf(";");
        if( index==-1 ) {
            index = str.indexOf(":");
            if( index==-1 )
                index = str.indexOf("#");
        }
        if( index!=-1 )
            str = str.substring(0,index);

        // remove variable specifications; digit 0 is included so that values
        // such as "10%" are stripped as well
        str = str.replaceAll("(\\([0-9]+\\%\\))|([0-9]+\\%)","");

        // the core is the last '|' separated token, the others are antennae
        String[] tokens1 = str.split("\\|");
        if( tokens1.length==0 )
            throw new Exception("Unrecognized format: " + str);

        // antennae may also follow the core, separated by commas
        List<String> tokens2 = splitOutsideBrackets(tokens1[tokens1.length-1]);
        if( tokens2.isEmpty() )
            throw new Exception("Unrecognized format: " + str);
        String str_core = tokens2.get(0);

        // parse the core
        Glycan structure = new Glycan(readSubtree(str_core),true,default_mass_options);

        // parse antennae
        for( int i=tokens1.length-2; i>=0; i-- ) {
            if( tokens1[i].length()==0 )
                throw new Exception("Unrecognized format: " + str);
            // the last character of the token is dropped
            // (presumably a delimiter; TODO confirm against the format definition)
            String str_antenna = tokens1[i].substring(0,tokens1[i].length()-1);
            Residue antenna = readSubtree(str_antenna);
            structure.addAntenna(antenna,antenna.getParentLinkage().getBonds());
        }
        for( int i=1; i<tokens2.size(); i++ ) {
            Residue antenna = readSubtree(tokens2.get(i));
            structure.addAntenna(antenna,antenna.getParentLinkage().getBonds());
        }

        return structure;
    }

    /**
     * Splits a string at every comma that is not inside square brackets.
     * Like String.split, trailing empty fields are not returned, so the
     * result is empty when the input holds nothing but commas.
     */
    private static List<String> splitOutsideBrackets(String str) {
        List<String> fields = new ArrayList<String>();
        int depth = 0;
        int start = 0;
        for( int i=0; i<str.length(); i++ ) {
            char c = str.charAt(i);
            if( c=='[' )
                depth++;
            else if( c==']' ) {
                if( depth>0 )
                    depth--;
            }
            else if( c==',' && depth==0 ) {
                fields.add(str.substring(start,i));
                start = i+1;
            }
        }
        fields.add(str.substring(start));

        while( !fields.isEmpty() && fields.get(fields.size()-1).length()==0 )
            fields.remove(fields.size()-1);
        return fields;
    }

    /**
     * Parses the residue token at the end of the string and, recursively, the
     * children written in front of it.
     *
     * @param str text of a subtree
     * @return the residue described by the last token, with its children attached
     * @throws Exception if no residue token terminates the string
     */
    private static Residue readSubtree(String str) throws Exception {
        Matcher m = gmind_pattern.matcher(str);
        if( !m.find() )
            throw new Exception("Unrecognized format: " + str);

        // parse residue
        Residue ret = createFromGlycoMinds(m.group(1),m.group(2),m.group(3),m.group(4),m.group(5));

        // the pattern is anchored at the end, what precedes the match are the children
        str = str.substring(0,m.start());

        // parse children from right to left: parenthesised branches first,
        // whatever is left over is the unparenthesised first child
        List<Linkage> children = new ArrayList<Linkage>();
        while( str.length()>0 ) {
            Residue child = null;
            int par_ind = TextUtils.findEnclosedInvert(str,str.length()-1,'(',')');
            if( par_ind!=-1 ) {
                child = readSubtree(str.substring(par_ind+1,str.length()-1));
                str = str.substring(0,par_ind);
            }
            else {
                child = readSubtree(str);
                str = "";
            }
            children.add(child.getParentLinkage());
        }

        // put children in glycomics order: the child read last goes first
        if( children.size()>0 )
            children.add(0,children.remove(children.size()-1));

        fixBisectingGlcNAc(ret,children);

        // add children
        for( Linkage l : children )
            ret.addChild(l.getChildResidue(),l.getBonds());

        return ret;
    }

    /**
     * When a Man carries exactly two Man and one GlcNAc, moves the GlcNAc
     * to the middle of the list by swapping it with the element at index 1.
     * Any other combination is left untouched.
     */
    private static void fixBisectingGlcNAc(Residue parent, List<Linkage> children ) {
        if( !parent.getTypeName().equals("Man") || children.size()!=3 )
            return;

        int glcnac_pos = -1;
        int no_glcnac = 0;
        int no_man = 0;
        for( int i=0; i<children.size(); i++ ) {
            String child_type = children.get(i).getChildResidue().getTypeName();
            if( child_type.equals("Man") )
                no_man++;
            else if( child_type.equals("GlcNAc") ) {
                no_glcnac++;
                glcnac_pos = i;
            }
            else
                return;
        }
        if( no_glcnac!=1 || no_man!=2 )
            return;

        if( glcnac_pos!=1 ) {
            Linkage help = children.get(1);
            children.set(1,children.get(glcnac_pos));
            children.set(glcnac_pos,help);
        }
    }

    /**
     * Builds a residue from the five groups of a matched residue token.
     *
     * @param type GlycoMinds code of the residue
     * @param mod_stereo "'", "^", "~" or empty
     * @param subs substitution list including the square brackets, or empty
     * @param anom anomeric state character, or empty
     * @param link parent positions separated by '/', or empty
     * @return the residue, with a parent linkage whose parent residue is null
     * @throws Exception if the residue code, a substitution code or the
     *         substitution list is not recognized
     */
    private static Residue createFromGlycoMinds(String type, String mod_stereo, String subs, String anom, String link) throws Exception {
        // get residue type
        String res_type = gmind_types.get(type);
        if( res_type==null )
            throw new Exception("Unrecognized gmind type: " + type);
        Residue ret = GWSParser.readSubtree(res_type,false);

        // stereochemistry modifications: each marker flips the table default
        if( mod_stereo!=null && mod_stereo.length()>0 ) {
            boolean flip_chirality = mod_stereo.equals("'") || mod_stereo.equals("~");
            boolean flip_ring_size = mod_stereo.equals("^") || mod_stereo.equals("~");
            if( flip_chirality )
                ret.setChirality((ret.getChirality()=='D') ?'L' :'D');
            if( flip_ring_size )
                ret.setRingSize((ret.getRingSize()=='p') ?'f' :'p');
        }

        // anomericity
        if( anom!=null && anom.length()>0 )
            ret.setAnomericState(anom.charAt(0));

        // substitutions
        if( subs!=null && subs.length()>1 ) {
            subs = subs.substring(1,subs.length()-1);

            Matcher list = gmind_sub_pattern.matcher(subs);
            if( !list.lookingAt() )
                throw new Exception("Unrecognized format for substitution: " + subs);

            // A repeated capturing group only keeps its last repetition, so the
            // recognized prefix is scanned entry by entry instead of reading
            // the groups of the list pattern.
            Matcher entry = gmind_single_sub_pattern.matcher(subs.substring(0,list.end()));
            while( entry.find() ) {
                String sub = entry.group(2);
                String sub_type = gmind_types.get(sub);
                if( sub_type==null )
                    throw new Exception("Unrecognized gmind type: " + sub);
                Residue ret_sub = ResidueDictionary.newResidue(sub_type);
                ret.addChild(ret_sub,entry.group(1).charAt(0));
            }
        }

        // linkage position
        Linkage par_link = new Linkage(null,ret);
        if( link!=null && link.length()>0 )
            par_link.setLinkagePositions(parsePositions(link));
        ret.setParentLinkage(par_link);

        return ret;
    }

    /**
     * Converts a '/' separated list of positions into an array of characters.
     * An empty field (as in "/3", which the residue pattern accepts) is
     * returned as '?'.
     */
    static private char[] parsePositions(String str) {
        String[] fields = str.split("/");
        char[] ret = new char[fields.length];
        for( int i=0; i<fields.length; i++ )
            ret[i] = (fields[i].length()>0) ?fields[i].charAt(0) :'?';
        return ret;
    }

    /**
     * Writes a residue and everything below it.
     *
     * @param r root of the subtree, may be null
     * @param add_uncertain_leaf if true, "1%" is written in front of every
     *        residue without saccharide children
     * @return the empty string for null, "*" for a non-saccharide or a type
     *         without a GlycoMinds code, otherwise the encoded subtree
     */
    private String writeSubtree(Residue r, boolean add_uncertain_leaf) {
        if( r==null )
            return "";
        if( !r.isSaccharide() )
            return "*";

        SU su = gmind_codes.get(r.getTypeName());
        if( su==null ) {
            // unsupported type
            return "*";
        }

        // separate saccharide children from modifications
        List<Linkage> modifications = new ArrayList<Linkage>();
        List<Linkage> children = new ArrayList<Linkage>();
        for( Linkage l : r.getChildrenLinkages() ) {
            if( l.getChildResidue().isSaccharide() )
                children.add(l);
            else
                modifications.add(l);
        }
        Collections.sort(modifications,new Linkage.LinkageComparator());

        StringBuilder sb = new StringBuilder();

        // children come first: the first one bare, the others in parentheses
        // from the last down to the second
        if( children.isEmpty() ) {
            if( add_uncertain_leaf )
                sb.append("1%");
        }
        else {
            sb.append(writeSubtree(children.get(0).getChildResidue(),add_uncertain_leaf));
            for( int i=children.size()-1; i>=1; i-- ) {
                sb.append('(');
                sb.append(writeSubtree(children.get(i).getChildResidue(),add_uncertain_leaf));
                sb.append(')');
            }
        }

        // type code and deviation from its default stereochemistry
        sb.append(su.code);
        boolean other_chirality = su.chirality!=r.getChirality();
        boolean other_ring_size = su.ring_size!=r.getRingSize();
        if( other_chirality && other_ring_size )
            sb.append('~');
        else if( other_chirality )
            sb.append('\'');
        else if( other_ring_size )
            sb.append('^');

        // modifications, as a comma separated list inside the brackets
        if( !modifications.isEmpty() ) {
            sb.append('[');
            boolean first = true;
            for( Linkage l : modifications ) {
                if( !first )
                    sb.append(',');
                first = false;

                SU mod = gmind_codes.get(l.getChildResidue().getTypeName());
                if( mod==null )
                    sb.append('*');
                else {
                    sb.append(l.getParentPositionsSingle());
                    sb.append(mod.code);
                }
            }
            sb.append(']');
        }

        // anomeric state and linkage info
        sb.append(r.getAnomericState());
        if( r.getParentLinkage()!=null ) {
            char ppos = r.getParentLinkage().getParentPositionsSingle();
            if( ppos!='?' )
                sb.append(ppos);
        }

        return sb.toString();
    }
}