/* * EuroCarbDB, a framework for carbohydrate bioinformatics * * Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as * indicated by the @author tags or express copyright attribution * statements applied by the authors. * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * A copy of this license accompanies this distribution in the file LICENSE.txt. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * Last commit: $Rev: 1210 $ by $Author: glycoslave $ on $Date:: 2009-06-12 #$ */ /** @author Alessio Ceroni (a.ceroni@imperial.ac.uk) */ package org.eurocarbdb.application.glycoworkbench.plugin.grammar; import org.eurocarbdb.application.glycoworkbench.plugin.*; import org.eurocarbdb.application.glycoworkbench.*; import org.eurocarbdb.application.glycanbuilder.*; import java.util.*; import java.io.*; import org.w3c.dom.*; public class Grammar implements StructureGenerator, StructureScorer { public static final int SCORE_DIST = 0; public static final int SCORE_PDIST = 1; public static final int SCORE_PROB = 2; public static final int SCORE_PROB_DIST = 3; // constants private static GrammarTree[] cores = new GrammarTree[8]; private static GrammarTree[] cores_tagged = new GrammarTree[8]; static { try { // big one first cores[0] = GrammarTree.fromString("(+(GlcNAc(Fuc)(GlcNAc(Man(Man(Man)(Man))(GlcNAc)(Man)))))"); cores[1] = GrammarTree.fromString("(+(GlcNAc(Fuc)(GlcNAc(Man(Man(Man)(Man))(Man)))))"); cores[2] = GrammarTree.fromString("(+(GlcNAc(Fuc)(GlcNAc(Man(Man)(GlcNAc)(Man)))))"); cores[3] = GrammarTree.fromString("(+(GlcNAc(Fuc)(GlcNAc(Man(Man)(Man)))))"); cores[4] = GrammarTree.fromString("(+(GlcNAc(GlcNAc(Man(Man(Man)(Man))(GlcNAc)(Man)))))"); cores[5] = GrammarTree.fromString("(+(GlcNAc(GlcNAc(Man(Man(Man)(Man))(Man)))))"); cores[6] = GrammarTree.fromString("(+(GlcNAc(GlcNAc(Man(Man)(GlcNAc)(Man)))))"); cores[7] = GrammarTree.fromString("(+(GlcNAc(GlcNAc(Man(Man)(Man)))))"); for( int i=0; i<cores.length; i++ ) { cores_tagged[i] = cores[i].clone(); tagCore(cores_tagged[i]); } } catch(Exception e) { } }; // members private TreeMap<Rule,Double> rules = new TreeMap<Rule,Double>(); private TreeMap<GrammarTree,RuleProfile> seeds = new TreeMap<GrammarTree,RuleProfile>(); private GrammarOptions options = new GrammarOptions(); // generation private MassOptions mass_opt = null; private LinkedList<GrammarTree> pool = new LinkedList<GrammarTree>(); private TreeSet<String> visited = new TreeSet<String>(); private GrammarTree last_structure = null; public Grammar() { } public Grammar(String filename, boolean on_file_system) throws Exception { load(filename,on_file_system); } public Collection<Rule> getRules() { return rules.keySet(); } public String getScorerType() { return "Grammar"; } // i/o public void load(String filename, boolean on_file_system) throws Exception { load(FileUtils.open(this.getClass(),filename,!on_file_system)); } public void load(InputStream is) throws Exception { Document d = XMLUtils.read(is); this.fromXML(XMLUtils.assertChild(d,"Grammar"),false); } public void save(String filename) throws Exception { save(new FileOutputStream(filename)); } public void save(OutputStream os) throws Exception { Document d = XMLUtils.newDocument(); d.appendChild(toXML(d)); XMLUtils.write(os,d); } // learning public static Collection<Rule> extractRules(Glycan structure, GrammarOptions opt) throws Exception { Vector<Rule> ret = new Vector<Rule>(); extractRules(ret,structure,opt); return ret; } public static Collection<Rule> extractRules(GrammarTree structure, GrammarOptions opt) { Vector<Rule> ret = new Vector<Rule>(); extractRules(ret,structure,opt); return ret; } public static void extractRules(Collection<Rule> buffer, Glycan structure, GrammarOptions opt) throws Exception { // parse the tree GrammarTree gt = GrammarTree.fromGlycan(structure, opt.ADD_LINKAGE_INFO); // tag the core if( opt.TAG_CORES ) tagCore(gt); // extract the rules extractRules(buffer,gt,opt); } private static void extractRules(Collection<Rule> buffer, GrammarTree gt, GrammarOptions opt) { if( buffer==null ) return; // add new rule Rule toadd = Rule.createRule(gt,opt); if( toadd!=null ) buffer.add(toadd); // recurse for( GrammarTree gtc : gt.getChildren() ) extractRules(buffer,gtc,opt); return; } public static Grammar createGrammar(Collection<Glycan> structures, GrammarOptions opt) { if( structures==null || opt==null ) return new Grammar(); // set options Grammar ret = new Grammar(); ret.options = opt; try { // extract all rules RuleProfile all_rules = new RuleProfile(); for( Glycan s : structures ) { // parse the tree GrammarTree gt = GrammarTree.fromGlycan(s, ret.options.ADD_LINKAGE_INFO); if( !ret.seeds.containsKey(gt) ) { // tag the core if( opt.TAG_CORES ) tagCore(gt); // extract the rules Collection<Rule> extracted = extractRules(gt,ret.options); all_rules.addAll(extracted); // save the seed extracted = extractRules(gt,ret.options); ret.seeds.put(gt,new RuleProfile(extracted)); } } // normalize HashMap<String,Double> normalizations = new HashMap<String,Double>(); for( Map.Entry<Rule,Double> e : all_rules.getEntries() ) { String nkey = e.getKey().leftSide(); Double old_val = normalizations.get(nkey); if( old_val==null ) normalizations.put(nkey,e.getValue()); else normalizations.put(nkey,old_val + e.getValue()); } for( Map.Entry<Rule,Double> e : all_rules.getEntries() ) { String nkey = e.getKey().leftSide(); ret.rules.put(e.getKey(),e.getValue()/normalizations.get(nkey)); } } catch(Exception ex){ LogUtils.report(ex); } return ret; } private static void tagCore(GrammarTree gt) { for( int i=0; i<cores.length; i++ ) { HashSet<GrammarTree> matchings = new HashSet<GrammarTree>(); if( subtreeMatches(gt,cores[i],matchings) ) { GrammarTree.tag(gt,""+i,matchings); return; } } } public static int whichCore(GrammarTree gt) { for( int i=0; i<cores.length; i++ ) { HashSet<GrammarTree> matchings = new HashSet<GrammarTree>(); if( subtreeMatches(gt,cores[i],matchings) ) return i; } return -1; } private static boolean subtreeMatches(GrammarTree structure, GrammarTree core, Collection<GrammarTree> matchings) { if( !structure.getLabel().equals(core.getLabel()) ) return false; if( core.getNoChildren()==0 ) { matchings.add(structure); return true; } // try all possible combinations of children, to prevent ordering for( Collection<GrammarTree> combination : getAllCombinations(structure.getChildren(),core.getNoChildren()) ) { // match all the subtrees Vector<GrammarTree> children_matchings = new Vector<GrammarTree>(); Iterator<GrammarTree> i=combination.iterator(), l=core.getChildren().iterator(); boolean matches = true; while(i.hasNext()) { if( !subtreeMatches(i.next(),l.next(),children_matchings) ) matches = false; } // if all match if( matches ) { matchings.add(structure); matchings.addAll(children_matchings); return true; } } return false; } private static Vector<Vector<GrammarTree>> getAllCombinations(Collection<GrammarTree> src, int size) { Vector<Vector<GrammarTree>> ret = new Vector<Vector<GrammarTree>>(); getAllCombinations(ret,src,new Union<GrammarTree>(),size); return ret; } private static void getAllCombinations(Vector<Vector<GrammarTree>> buffer, Collection<GrammarTree> src, Union<GrammarTree> combination, int size) { if( (src.size()-combination.size())<size ) return; if( size==0 && combination.size()>0 ) buffer.add(combination); for( GrammarTree o : src ) { if( !combination.contains(o) ) getAllCombinations(buffer,src,combination.and(o),size-1); } } // generation public boolean accept(Glycan structure) throws Exception { if( structure==null || structure.isEmpty() ) return true; // parse the tree GrammarTree gt = GrammarTree.fromGlycan(structure, options.ADD_LINKAGE_INFO); if( seeds.containsKey(gt) ) return true; // tag the core if( options.TAG_CORES ) tagCore(gt); // extract the rules Collection<Rule> extracted = extractRules(gt,options); // check if the structure components are all recognized for( Rule r : extracted ) { if( !rules.containsKey(r) ) return false; } return true; } public void start(MassOptions _mass_opt) { mass_opt = _mass_opt; if( seeds.size()==0 || !options.USE_SEEDS ) { for( int i=0; i<cores_tagged.length; i++ ) pool.add(cores_tagged[i]); //generateStructures(pool,null); } else pool.addAll(seeds.keySet()); last_structure = null; visited.clear(); } public FragmentEntry next(boolean backtrack) { if( last_structure!=null && !backtrack) generateStructures(pool,last_structure); while( pool.size()>0 ) { if( seeds.size()==0 || !options.USE_SEEDS ) { // remove the back of the stack (depth first) last_structure = pool.getLast(); pool.removeLast(); } else { // remove the top of the stack (breadth first) last_structure = pool.getFirst(); pool.removeFirst(); } // check for duplicates String str_structure = last_structure.toString(true); if( !visited.contains(str_structure) ) { visited.add(str_structure); try { Glycan s = last_structure.toGlycan(mass_opt); if( options.USE_SEEDS && seeds.containsKey(last_structure) ) return new FragmentEntry(s,"seed"); return new FragmentEntry(s,"generated"); } catch(Exception e) { LogUtils.report(e); return null; } } } return null; } public double computeScore(Glycan structure) { return computeScore(structure,SCORE_DIST); } public double computeMinDist(RuleProfile rp) { double min_dist = Double.MAX_VALUE; for( Map.Entry<GrammarTree,RuleProfile> e : seeds.entrySet() ) { double dist = rp.intersection(e.getValue()).absSum(); min_dist = Math.min(min_dist,dist); } return min_dist; } public double computeMinPDist(RuleProfile rp) { double min_pdist = Double.MAX_VALUE; for( Map.Entry<GrammarTree,RuleProfile> e : seeds.entrySet() ) { double dist = -computeLogProb(rp.intersection(e.getValue())); min_pdist = Math.min(min_pdist,dist); } return min_pdist; } public double computeLogProb(RuleProfile rp) { double ret = 0.; for( Map.Entry<Rule,Double> e : rp.getEntries() ) ret += Math.log(rules.get(e.getKey())) * Math.abs(e.getValue()); return ret; } public double computeScore(Glycan structure, int type) { try { RuleProfile rp = new RuleProfile(extractRules(structure,options)); if( type==SCORE_DIST ) return -computeMinDist(rp); if( type==SCORE_PDIST ) return -computeMinPDist(rp); if( type==SCORE_PROB ) return computeLogProb(rp); if( type==SCORE_PROB_DIST ) return computeMinDist(rp)*computeLogProb(rp); throw new Exception("Invalid score type"); } catch(Exception e) { LogUtils.report(e); return 0.; } } public StructureDictionary generateDictionary(String dict_name, String s_type, String source, double max_mass) { StructureDictionary dict = new StructureDictionary(dict_name); dict.setScorer(this); // generate all structures boolean backtrack = false; FragmentEntry fe = null; start(MassOptions.empty()); while( (fe = next(backtrack))!=null ) { dict.add(new StructureType(dict_name,s_type,source,fe.structure)); backtrack = (fe.mass>=max_mass); } return dict; } public StructureDictionary generateDictionary(String dict_name, String s_type, String source, double max_mass, int max_res, int score_type) { StructureDictionary dict = new StructureDictionary(dict_name); dict.setScorer(this); // generate all structures, order by mass TreeMap<Double,ArrayList<FragmentEntry>> generated = new TreeMap<Double,ArrayList<FragmentEntry>>(); System.out.println("generating structures"); boolean backtrack = false; FragmentEntry gen = null; start(MassOptions.empty()); while( (gen = next(backtrack))!=null ) { if( gen.mass<=max_mass ) { // compute score gen.score = computeScore(gen.fragment,score_type); // decreasing order gen.fragment = null; // free memory // add to mass group ArrayList<FragmentEntry> mass_group = generated.get(gen.mass); if( mass_group==null ) { mass_group = new ArrayList<FragmentEntry>(); generated.put(gen.mass,mass_group); } mass_group.add(gen); } backtrack = (gen.mass>=max_mass); } System.out.println("adding to dict"); // add structures to dict order by rank for( ArrayList<FragmentEntry> mass_group : generated.values() ) { // order mass group by rank TreeMap<Double,ArrayList<FragmentEntry>> ranked = new TreeMap<Double,ArrayList<FragmentEntry>>(); for( FragmentEntry fe : mass_group ) { ArrayList<FragmentEntry> rank_group = ranked.get(-fe.score); if( rank_group == null ) { rank_group = new ArrayList<FragmentEntry>(); ranked.put(-fe.score,rank_group); } rank_group.add(fe); } // add to dictionary int rank = 0; for( ArrayList<FragmentEntry> rank_group: ranked.values() ) { if( rank>max_res ) break; for( FragmentEntry fe : rank_group ) dict.add(new StructureType(dict_name,s_type,source,fe.structure)); rank += rank_group.size(); } } return dict; } public TreeMap<Glycan,Double> generateStructures(double mass, int score_type) { boolean backtrack = false; FragmentEntry fe = null; TreeMap<Glycan,Double> ret = new TreeMap<Glycan,Double>(); start(MassOptions.empty()); while( (fe = next(backtrack))!=null ) { if( Math.abs(fe.mass-mass)<0.1 ) { double score = computeScore(fe.fragment,score_type); ret.put(fe.fragment,score); } backtrack = (fe.mass>=mass); } return ret; } public Collection<GrammarTree> generateStructures(GrammarTree seed) { Vector<GrammarTree> ret = new Vector<GrammarTree>(); generateStructures(ret,seed); return ret; } public void generateStructures(Collection<GrammarTree> buffer, GrammarTree seed) { if( seed==null ) seed = new GrammarTree(GrammarTree.END); for( Rule r : rules.keySet() ) generateStructures(buffer,seed,r,options); } public static void generateStructures(Collection<GrammarTree> buffer, GrammarTree seed, Rule r, GrammarOptions opt) { if( r==null || buffer==null ) return; if( seed==null ) seed = new GrammarTree(GrammarTree.END); GrammarTree toadd = r.generateStructure(seed,opt); if( toadd!=null ) buffer.add(toadd); for( GrammarTree c : seed.getChildren() ) generateStructures(buffer,c,r,opt); } // serialization public StructureScorer fromXML(Node g_node) throws Exception { Grammar ret = new Grammar(); ret.fromXML(g_node,false); return ret; } public void fromXML(Node g_node, boolean merge) throws Exception { if( !merge ) rules.clear(); // read options Node c_node = XMLUtils.assertChild(g_node,"Configuration"); Configuration conf = new Configuration(); conf.fromXML(c_node); options.retrieve(conf); // read rules Vector<Node> r_nodes = XMLUtils.findAllChildren(g_node, "Rule"); for( Node r_node : r_nodes) { Rule r = Rule.fromXML(r_node); String p = XMLUtils.getAttribute(r_node,"probability"); if( p!=null ) rules.put(r,Double.valueOf(p)); else rules.put(r,1.); } // read seeds Vector<Node> s_nodes = XMLUtils.findAllChildren(g_node, "Seed"); for( Node s_node : s_nodes) { GrammarTree gt = GrammarTree.fromString(XMLUtils.getAttribute(s_node,"structure")); RuleProfile rp = new RuleProfile(extractRules(gt,options)); seeds.put(gt,rp); } } public Element toXML(Document document) { if( document==null ) return null; // create root node Element g_node = document.createElement("Grammar"); if( g_node==null ) return null; // add options Configuration conf = new Configuration(); options.store(conf); g_node.appendChild(conf.toXML(document)); // add rules for( Map.Entry<Rule,Double> e : rules.entrySet() ) { Element r_node = e.getKey().toXML(document); if( r_node!=null ) { r_node.setAttribute("probability", e.getValue().toString()); g_node.appendChild(r_node); } } // add seeds for( GrammarTree s : seeds.keySet() ) { Element s_node = document.createElement("Seed"); s_node.setAttribute("structure", s.toString(false)); if( s_node!=null ) g_node.appendChild(s_node); } return g_node; } }