/*
 *   EuroCarbDB, a framework for carbohydrate bioinformatics
 *
 *   Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as
 *   indicated by the @author tags or express copyright attribution
 *   statements applied by the authors.
 *
 *   This copyrighted material is made available to anyone wishing to use, modify,
 *   copy, or redistribute it subject to the terms and conditions of the GNU
 *   Lesser General Public License, as published by the Free Software Foundation.
 *   A copy of this license accompanies this distribution in the file LICENSE.txt.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *   or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 *   for more details.
 *
 *   Last commit: $Rev: 1210 $ by $Author: glycoslave $ on $Date:: 2009-06-12 #$
 */

package org.eurocarbdb.application.glycanbuilder;

import org.eurocarbdb.resourcesdb.*;
import org.eurocarbdb.resourcesdb.io.*;
import org.eurocarbdb.resourcesdb.glycoconjugate_derived.*;
import org.eurocarbdb.MolecularFramework.sugar.*;
import org.eurocarbdb.MolecularFramework.io.GlycoCT.*;
import org.eurocarbdb.MolecularFramework.io.namespace.*;
import org.eurocarbdb.MolecularFramework.sugar.NonMonosaccharides.HistoricalEntity;

import java.util.*;
import java.util.regex.*;

/**
   Read and write glycan structure in the GlycoCT XML format using the
   MolecularFramework and ResourceDB libraries.

   @author Alessio Ceroni (a.ceroni@imperial.ac.uk)
*/
public class GlycoCTParser implements GlycanParser {

    /** Splits a trailing "position + letters" suffix (e.g. a substitution) off a residue name. */
    private static final Pattern iupac_sub_pattern = Pattern.compile("^(.+)(\\-?[1-9][a-zA-Z]+)$");

    private SugarImporterGlycoCT importer = null;
    private SugarExporterGlycoCT exporter = null;
    private MonosaccharideConverter converter = null;
    private GlycoVisitorFromGlycoCT visitor_import = null;
    private GlycoVisitorToGlycoCT visitor_export = null;

    private boolean tolerate_unknown_residues = false;

    /**
       Create a parser and initialize the MolecularFramework and ResourceDB
       objects. Any exception raised during initialization is reported
       through {@link LogUtils#report} and not rethrown, in which case some
       of the helper objects may be left <code>null</code>.
       @param tolerate <code>true</code> if residues of unknown type
       should be accepted when reading a structure
    */
    public GlycoCTParser(boolean tolerate) {
        tolerate_unknown_residues = tolerate;
        try {
            importer = new SugarImporterGlycoCT();
            exporter = new SugarExporterGlycoCT();
            converter = new MonosaccharideConverter(new Config());

            // both visitors use the GWB name scheme
            // (GlycanNamescheme.GLYCOSCIENCES was the commented-out alternative)
            visitor_import = new GlycoVisitorFromGlycoCT(converter);
            visitor_import.setNameScheme(GlycanNamescheme.GWB);

            visitor_export = new GlycoVisitorToGlycoCT(converter);
            visitor_export.setNameScheme(GlycanNamescheme.GWB);
            visitor_export.setUseFusion(true);
        }
        catch( Exception e ) {
            LogUtils.report(e);
        }
    }

    /**
       Set whether residues of unknown type are accepted when reading
       a structure.
    */
    public void setTolerateUnknown(boolean f) {
        tolerate_unknown_residues = f;
    }

    /**
       Return the GlycoCT representation of a glycan structure.
       Delegates to {@link #toGlycoCT}.
    */
    public String writeGlycan(Glycan structure) {
        return toGlycoCT(structure);
    }

    /**
       Create a glycan structure from its GlycoCT representation.
       Delegates to {@link #fromGlycoCT}.
       @throws Exception if the string cannot be parsed
    */
    public Glycan readGlycan(String buffer, MassOptions default_mass_options) throws Exception {
        return fromGlycoCT(buffer,default_mass_options);
    }

    /**
       Get the MolecularFramework object used to parse the GlycoCT
       string.
    */
    public SugarImporterGlycoCT getImporter() {
        return importer;
    }

    /**
       Get the MolecularFramework object used to produce the GlycoCT
       string.
    */
    public SugarExporterGlycoCT getExporter() {
        return exporter;
    }

    /**
       Get the ResourceDB object used to translate between the
       monosaccharide namespaces.
    */
    public MonosaccharideConverter getConverter() {
        return converter;
    }

    /**
       Get the MolecularFramework object used to translate between a
       MolecularFramework and a GlycoWorkbench glycan structure.
    */
    public GlycoVisitorFromGlycoCT getVisitorImport() {
        return visitor_import;
    }

    /**
       Get the MolecularFramework object used to translate between a
       GlycoWorkbench and a MolecularFramework glycan structure.
    */
    public GlycoVisitorToGlycoCT getVisitorExport() {
        return visitor_export;
    }

    /**
       Return a GlycoCT representation of a glycan structure.
       Equivalent to a call to {@link #writeGlycan}.
       @return the XML code produced by the exporter, or an empty
       string if the conversion fails (the failure is reported through
       {@link LogUtils#report})
    */
    public String toGlycoCT(Glycan structure) {
        try {
            exporter.start(toSugar(structure));
            return exporter.getXMLCode();
        }
        catch( Exception e ) {
            LogUtils.report(e);
            return "";
        }
    }

    /**
       Return a representation of a glycan structure as a
       MolecularFramework object.
       @return an empty sugar if <code>structure</code> is
       <code>null</code>, the normalized sugar otherwise
       @throws Exception if the conversion cannot be made
    */
    public Sugar toSugar(Glycan structure) throws Exception {
        Sugar sugar = new Sugar();
        if( structure==null )
            return sugar;

        if( structure.isFragment() )
            throw new Exception("fragments not supported for the moment");

        // create the sugar
        Residue root = structure.getRoot();
        if( root!=null ) {
            if( root.isSaccharide() )
                toSugar(root,sugar,false,null,null);
            else {
                // a non-saccharide root is skipped; only a "redEnd" root
                // marks its first child as an alditol
                boolean alditol = root.getTypeName().equals("redEnd");
                toSugar(root.firstChild(),sugar,alditol,null,null);
            }
        }

        // add antennae
        if( structure.getBracket()!=null ) {
            // snapshot of the nodes of the core structure: every antenna
            // gets all of them as possible parents
            ArrayList<GlycoNode> parents = new ArrayList<GlycoNode>(sugar.getNodes());
            for( Linkage link : structure.getBracket().getChildrenLinkages() ) {
                Residue antenna = link.getChildResidue();

                // create subtree
                UnderdeterminedSubTree nm_antenna = new UnderdeterminedSubTree();
                toSugar(antenna,nm_antenna,false,null,null);

                // set connection
                nm_antenna.setConnection(toSugar(link));

                // add subtree
                sugar.addUndeterminedSubTree(nm_antenna);
                for( GlycoNode n : parents )
                    sugar.addUndeterminedSubTreeParent(nm_antenna,n);
            }
        }

        // normalize the sugar
        visitor_export.start(sugar);
        return visitor_export.getNormalizedSugar();
    }

    /**
       Recursively add the subtree rooted at <code>current</code> to a
       MolecularFramework graph.
       @param current the residue to translate
       @param nm_graph the graph receiving nodes and edges
       @param alditol <code>true</code> if <code>current</code> should
       be named as an alditol
       @param stop_at the residue at which the recursion stops (not
       translated); may be <code>null</code>
       @param map if not <code>null</code>, receives the association
       between each visited residue and the node created for it
       @return the node created for <code>current</code>, or
       <code>null</code> if nothing was created
    */
    private GlycoNode toSugar(Residue current, GlycoGraph nm_graph, boolean alditol, Residue stop_at, HashMap<Residue,GlycoNode> map) throws Exception {
        if( nm_graph==null )
            return null;
        // also covers the case where both current and stop_at are null
        if( current==stop_at )
            return null;

        // create new node
        Residue parent = null;
        GlycoNode nm_current = null;
        if( current.isStartRepetition() ) {
            // the whole repeat block becomes one node; the children
            // following the block hang from the end repetition marker
            nm_current = toSugarUnitRepeat(current);
            parent = current.findEndRepetition();
        }
        else {
            // translate current residue
            UnvalidatedGlycoNode toadd = new UnvalidatedGlycoNode();
            toadd.setName(getIupacName(current,alditol));
            nm_current = toadd;
            parent = current;
        }

        // add node
        nm_graph.addNode(nm_current);
        if( map!=null )
            map.put(current,nm_current);

        // translate children
        for( Linkage link : parent.getChildrenLinkages() ) {
            GlycoNode nm_child = toSugar(link.getChildResidue(),nm_graph,false,stop_at,map);
            if( nm_child!=null )
                nm_graph.addEdge(nm_current,nm_child,toSugar(link));
        }

        return nm_current;
    }

    /**
       Translate a repeat block, identified by its start repetition
       marker, into a MolecularFramework repeat unit.
       @throws Exception if the block has no end repetition marker or
       the conversion cannot be made
    */
    private SugarUnitRepeat toSugarUnitRepeat(Residue start) throws Exception {
        SugarUnitRepeat unit = new SugarUnitRepeat();

        Residue root = start.getChildAt(0);
        Residue end = start.findEndRepetition();
        if( end==null )
            throw new Exception("Repeat unit without end repetition");

        // add nodes, stopping at the end repetition marker
        HashMap<Residue,GlycoNode> map = new HashMap<Residue,GlycoNode>();
        toSugar(root,unit,false,end,map);

        // set min and max
        unit.setMinRepeatCount(end.getMinRepetitions());
        unit.setMaxRepeatCount(end.getMaxRepetitions());

        // set linkage: from the residue preceding the end marker to the
        // first residue of the block
        unit.setRepeatLinkage(toSugar(root.getParentLinkage()),map.get(end.getParent()),map.get(root));

        return unit;
    }

    /**
       Create a glycan structure from its GlycoCT representation.
       Equivalent to a call to {@link #readGlycan}.
       @param default_mass_opt the mass options to use for the new
       structure if they are not specified in the string
       representation
       @throws Exception if the string cannot be parsed
    */
    public Glycan fromGlycoCT(String str, MassOptions default_mass_opt) throws Exception {
        return fromSugar(importer.parse(str),default_mass_opt);
    }

    /**
       Create a glycan structure from its representation as a
       MolecularFramework object.
       @param default_mass_opt the mass options to use for the new
       structure if they are not specified in the string
       representation
       @throws Exception if the sugar cannot be converted
    */
    public Glycan fromSugar(Sugar sugar, MassOptions default_mass_opt) throws Exception {
        return fromSugar(sugar,converter,visitor_import,default_mass_opt,tolerate_unknown_residues);
    }

    /**
       Convert a MolecularFramework sugar into a glycan structure.
       @return <code>null</code> if <code>sugar</code> is
       <code>null</code>, an empty glycan if the sugar has no root
       @throws Exception if the sugar (or one of its antennae) has more
       than one root or contains unrecognized residues
    */
    static private Glycan fromSugar(Sugar sugar, MonosaccharideConversion converter, GlycoVisitorFromGlycoCT visitor, MassOptions default_mass_opt, boolean tolerate_unknown_residues) throws Exception {
        if( sugar==null )
            return null;

        // remove protein
        if( sugar.getRootNodes().size()==1 ) {
            GlycoNode first = sugar.getRootNodes().iterator().next();
            if( first instanceof HistoricalEntity )
                sugar.removeNode(first);
        }

        // "Denormalize" the sugar
        visitor.start( sugar );
        sugar = visitor.getNormalizedSugar();

        // create the sugar
        if( sugar.getRootNodes().size()>1 )
            throw new Exception("Multiple roots are not currently supported");
        if( sugar.getRootNodes().size()==0 )
            return new Glycan(null,false,default_mass_opt);

        // parse from the root
        GlycoNode gn_root = sugar.getRootNodes().iterator().next();
        Residue root = fromSugar(gn_root,converter,tolerate_unknown_residues,null);
        if( root!=null && !root.isReducingEnd() ) {
            // put an explicit reducing-end marker in front of the root
            String end_type = root.isAlditol() ? "redEnd" : "freeEnd";
            Residue redend = ResidueDictionary.newResidue(end_type);
            redend.addChild(root);
            root = redend;
        }

        Glycan ret = new Glycan(root,false,default_mass_opt);

        // parse antennae
        for( UnderdeterminedSubTree antenna : sugar.getUndeterminedSubTrees() ) {
            if( antenna.getRootNodes().size()>1 )
                throw new Exception("Multiple roots in antenna are not currently supported");
            if( antenna.getRootNodes().size()==0 )
                continue;

            GlycoNode antenna_root = antenna.getRootNodes().iterator().next();
            Residue toadd = fromSugar(antenna_root,converter,tolerate_unknown_residues,null);
            Vector<Bond> bonds = fromSugar(antenna.getConnection());
            ret.addAntenna(toadd,bonds);
        }

        return ret;
    }

    /**
       Recursively convert a MolecularFramework node and its children
       into residues.
       @param map if not <code>null</code>, receives the association
       between each visited node and the residue created for it
       @return the residue created for <code>nm_current</code>, or
       <code>null</code> if <code>nm_current</code> is <code>null</code>
       @throws Exception if a node has an unsupported class or an
       unrecognized residue type
    */
    static private Residue fromSugar(GlycoNode nm_current, MonosaccharideConversion converter, boolean tolerate_unknown_residues, HashMap<GlycoNode,Residue> map) throws Exception {
        if( nm_current==null )
            return null;

        Residue ret = null;
        Residue parent = null;
        if( nm_current instanceof SugarUnitRepeat ) {
            ret = fromSugarUnitRepeat((SugarUnitRepeat)nm_current,converter,tolerate_unknown_residues);
            // children following the repeat block hang from its end marker
            parent = ret.findEndRepetition();
            if( map!=null )
                map.put(nm_current,ret);
        }
        else {
            if( !(nm_current instanceof UnvalidatedGlycoNode) )
                throw new Exception("Unsupported node type: " + nm_current.getClass().getName());

            // transform the node into a residue; the first attempt
            // tolerates unknown names only when no converter is available
            String iupac_name = ((UnvalidatedGlycoNode)nm_current).getName();
            Residue current = fromIupacName(iupac_name,converter,converter==null);
            if( current==null ) {
                // try to remove the substitutions
                Vector<String> subs = new Vector<String>();
                iupac_name = removeSubstitutions(subs,iupac_name);

                // parse again
                current = fromIupacName(iupac_name,converter,tolerate_unknown_residues);
                if( current==null )
                    throw new Exception("Unrecognized residue type: " + iupac_name + " " + tolerate_unknown_residues);

                // add substitutions
                addModifications(current,subs,converter,tolerate_unknown_residues);
            }

            ret = parent = current;
            if( map!=null )
                map.put(nm_current,current);
        }

        // parse the children
        for( GlycoEdge edge : nm_current.getChildEdges() ) {
            Residue child = fromSugar(edge.getChild(),converter,tolerate_unknown_residues,map);
            Vector<Bond> bonds = fromSugar(edge);
            parent.addChild(child,bonds);
            child.setAnomericCarbon(bonds.lastElement().getChildPosition());
        }

        return ret;
    }

    /**
       Convert a MolecularFramework repeat unit into a chain of residues
       delimited by start and end repetition markers.
       @return the start repetition marker
       @throws Exception if the unit does not have exactly one root or
       the parent of its repeat linkage is not part of the unit
    */
    static private Residue fromSugarUnitRepeat(SugarUnitRepeat unit, MonosaccharideConversion converter, boolean tolerate_unknown_residues) throws Exception {
        if( unit.getRootNodes().size()>1 )
            throw new Exception("Multiple roots are not currently supported in repeat units");
        if( unit.getRootNodes().size()==0 )
            throw new Exception("Empty repeat units are not supported");

        // convert nodes
        GlycoNode gn_root = unit.getRootNodes().iterator().next();
        HashMap<GlycoNode,Residue> map = new HashMap<GlycoNode,Residue>();
        Residue root = fromSugar(gn_root,converter,tolerate_unknown_residues,map);

        // add start repetition
        Residue start = ResidueDictionary.createStartRepetition();
        start.addChild(root,fromSugar(unit.getRepeatLinkage()));

        // add end repetition
        Residue end = ResidueDictionary.createEndRepetition();
        end.setMinRepetitions(""+unit.getMinRepeatCount());
        end.setMaxRepetitions(""+unit.getMaxRepeatCount());

        // connect end repetition to the last residue of the repeating unit
        Residue last = map.get(unit.getRepeatLinkage().getParent());
        if( last==null )
            throw new Exception("Parent of the repeat linkage not found in the repeat unit");
        last.addChild(end);

        return start;
    }

    /**
       Strip from the end of a residue name every suffix matched by
       {@link #iupac_sub_pattern}.
       @param subs receives the removed suffixes, last suffix first
       @return the name without the removed suffixes
    */
    static private String removeSubstitutions(Vector<String> subs, String iupac_name) throws Exception {
        while( iupac_name.length()>0 ) {
            Matcher m = iupac_sub_pattern.matcher(iupac_name);
            if( !m.matches() )
                break;

            iupac_name = m.group(1);
            subs.add(m.group(2));
        }
        return iupac_name;
    }

    /**
       Attach to a residue the substitutions previously removed from
       its name. The first character of each entry is used as the
       linkage position, the rest as the type name.
       @throws Exception if a type name is not recognized
    */
    static private void addModifications(Residue current, Vector<String> children_iupac_names, MonosaccharideConversion converter, boolean tolerate_unknown_residues) throws Exception {
        for( String s : children_iupac_names ) {
            char pos = s.charAt(0);
            String type = s.substring(1);

            Residue child = fromIupacName(type,converter,tolerate_unknown_residues);
            if( child==null )
                throw new Exception("Unrecognized residue type: " + type);

            current.addChild(child,pos);
        }
    }

    /**
       Get the IUPAC representation of the type of a residue.
       @param current the residue
       @param alditol <code>true</code> if the residue represent an
       alditol
       @throws Exception if the type of the residue has no IUPAC name
    */
    public String getIupacName(Residue current, boolean alditol) throws Exception {
        if( !current.getType().hasIupacName() )
            throw new Exception("Unsupported IUPAC name for type " + current.getTypeName() );
        return getIupacName(current.getType(),
                            alditol,
                            current.getAnomericCarbon(),
                            current.getAnomericState(),
                            current.getChirality(),
                            current.getRingSize());
    }

    /**
       Build the name of a residue type with the given stereochemical
       attributes. In the type's name template the character
       <code>'$'</code> is the placeholder for the ring size.
    */
    static private String getIupacName(ResidueType type, boolean alditol, char anomeric_carbon, char anomeric_state, char chirality, char ring_size) {
        String iupac_name = type.getIupacName();
        if( !type.isSaccharide() )
            return iupac_name;

        // add chirality
        if( type.hasChirality() )
            iupac_name = chirality + "-" + iupac_name;

        // alditols have neither anomeric state nor ring size
        if( alditol )
            return TextUtils.delete(iupac_name,'$') + "-ol";

        // add stereochemistry
        iupac_name = anomeric_state + "-" + iupac_name;

        // open ring
        if( ring_size=='o' ) {
            String prefix = (anomeric_carbon=='2') ? "keto-" : "aldehydo-";
            return prefix + TextUtils.delete(iupac_name,'$');
        }

        // add ring size
        if( ring_size=='?' )
            return TextUtils.delete(iupac_name,'$');
        return iupac_name.replace('$',ring_size);
    }

    /**
       Create a residue from its name.
       @return the residue, or <code>null</code> if the name is not
       recognized and unknown residues are not tolerated
       @throws Exception if no converter is available and unknown
       residues are not tolerated
    */
    static private Residue fromIupacName(String iupac_name, MonosaccharideConversion converter, boolean tolerate_unknown_residues) throws Exception {
        if( converter==null ) {
            if( tolerate_unknown_residues )
                return new Residue(ResidueType.createSaccharide(iupac_name));
            throw new Exception("Cannot convert iupac name to residue");
        }

        EcdbMonosaccharide type = null;
        try {
            MonosaccharideExchangeObject data = converter.convertResidue(iupac_name,GlycanNamescheme.GWB,GlycanNamescheme.GLYCOCT);
            type = data.getBasetype();
        }
        catch( Exception ignored ) {
            // the name could not be converted: treat it as unknown
            if( tolerate_unknown_residues )
                return new Residue(ResidueType.createSaccharide(iupac_name));
            return null;
        }

        if( type==null ) {
            // substituent
            for( ResidueType t : ResidueDictionary.allResidues() ) {
                if( getIupacName(t,false,'?','?','?','?').compareToIgnoreCase(iupac_name)==0 )
                    return new Residue(t);
            }

            if( tolerate_unknown_residues )
                return new Residue(ResidueType.createSubstituent(iupac_name));
            return null;
        }

        // saccharide

        // get anomeric state
        char anomeric_state = type.getAnomer().getSymbol().charAt(0);
        if( anomeric_state=='x' )
            anomeric_state = '?';

        // get ring size
        char ring_size = '?';
        int ring_span = type.getRingEnd() - type.getRingStart();
        if( ring_span==4 )
            ring_size = 'p';
        else if( ring_span==3 )
            ring_size = 'f';
        else if( type.getRingStart()==EcdbMonosaccharide.OPEN_CHAIN ) {
            ring_size = 'o';
            anomeric_state = '?';
        }

        // get chirality
        char chirality = '?';
        if( type.getBaseTypeCount()>0 ) {
            chirality = Character.toUpperCase(type.getBaseType(0).getName().charAt(0));
            if( chirality=='X' )
                chirality = '?';
        }

        // find type: each known type is tried first as a regular
        // saccharide, then as an alditol
        for( ResidueType t : ResidueDictionary.allResidues() ) {
            if( getIupacName(t,false,t.getAnomericCarbon(),anomeric_state,chirality,ring_size).compareToIgnoreCase(iupac_name)==0 )
                return newSaccharideResidue(t,anomeric_state,chirality,ring_size,false);
            if( getIupacName(t,true,t.getAnomericCarbon(),anomeric_state,chirality,ring_size).compareToIgnoreCase(iupac_name)==0 )
                return newSaccharideResidue(t,anomeric_state,chirality,ring_size,true);
        }

        if( tolerate_unknown_residues ) {
            // NOTE(review): unknown saccharides are flagged as alditols here;
            // kept as in the original code - confirm that this is intended
            return newSaccharideResidue(ResidueType.createSaccharide(iupac_name),anomeric_state,chirality,ring_size,true);
        }

        return null;
    }

    /** Create a residue of the given type and set its stereochemical attributes. */
    static private Residue newSaccharideResidue(ResidueType type, char anomeric_state, char chirality, char ring_size, boolean alditol) throws Exception {
        Residue ret = new Residue(type);
        ret.setAnomericState(anomeric_state);
        ret.setChirality(chirality);
        ret.setRingSize(ring_size);
        ret.setAlditol(alditol);
        return ret;
    }

    /**
       Translate a GlycoWorkbench linkage into a MolecularFramework
       edge, creating one glycosidic linkage for each bond.
    */
    private GlycoEdge toSugar(Linkage link) throws Exception {
        GlycoEdge nm_edge = new GlycoEdge();
        for( Bond b : link.getBonds() ) {
            org.eurocarbdb.MolecularFramework.sugar.Linkage nm_link = new org.eurocarbdb.MolecularFramework.sugar.Linkage();

            for( char p_pos : b.getParentPositions() )
                nm_link.addParentLinkage(toIntPosition(p_pos));
            nm_link.addChildLinkage(toIntPosition(b.getChildPosition()));

            nm_edge.addGlycosidicLinkage(nm_link);
        }
        return nm_edge;
    }

    /**
       Translate a MolecularFramework edge into a list of bonds, one for
       each glycosidic linkage.
       @return {@link Bond#single} if the edge has no glycosidic
       linkages
    */
    private static Vector<Bond> fromSugar(GlycoEdge nm_edge) {
        if( nm_edge.getGlycosidicLinkages().size()==0 )
            return Bond.single();

        Vector<Bond> ret = new Vector<Bond>();
        for( org.eurocarbdb.MolecularFramework.sugar.Linkage nm_link : nm_edge.getGlycosidicLinkages() ) {
            // get parent linkages
            ArrayList<Integer> nm_ppos = nm_link.getParentLinkages();
            char[] p_poss = new char[nm_ppos.size()];
            for( int i=0; i<p_poss.length; i++ )
                p_poss[i] = fromIntPosition(nm_ppos.get(i));

            // get child linkage: unknown unless exactly one is given
            char c_pos = '?';
            if( nm_link.getChildLinkages().size()==1 )
                c_pos = fromIntPosition(nm_link.getChildLinkages().get(0));

            // create bond
            ret.add(new Bond(p_poss,c_pos));
        }
        return ret;
    }

    /**
       Translate a MolecularFramework linkage position into a
       character: <code>'?'</code> for the unknown position, the digit
       character otherwise (meaningful for single digit positions
       only).
    */
    static private char fromIntPosition(int pos) {
        if( pos==org.eurocarbdb.MolecularFramework.sugar.Linkage.UNKNOWN_POSITION )
            return '?';
        return (char)(pos + '0');
    }

    /**
       Translate a linkage position character into a MolecularFramework
       position: <code>'N'</code> is mapped to 2, <code>'?'</code> to
       the unknown position, any other character to its offset from
       <code>'0'</code>.
    */
    static private int toIntPosition(char pos) {
        if( pos=='N' )
            return 2;
        if( pos=='?' )
            return org.eurocarbdb.MolecularFramework.sugar.Linkage.UNKNOWN_POSITION;
        return (int)(pos - '0');
    }
}