/** Copyright (c) 2012 Delcyon, Inc. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ package com.delcyon.capo.parsers; import java.util.HashMap; import java.util.Map.Entry; import java.util.Set; import java.util.Vector; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import com.delcyon.capo.parsers.GrammarParser.SymbolType; import com.delcyon.capo.parsers.Tokenizer.CharacterType; /** * @author jeremiah * */ public class ParseTree { /** * Used when multiple but different expression matches are found for a rule. * @author jeremiah * */ public enum ParseOrderPreference { /** * choose the farthest left matching expression */ LEFT, /** * choose the farthest right matching expression */ RIGHT, /** * choose the expression with the longest match, starting from the left. */ MAX_LENGTH } public enum TermType { /** * indicates this term is a RULE */ RULE, /** * indicates this term is a SYMBOL, which means it has no defined meaning. */ SYMBOL, /** * indicates that this term is used to separate terms, * generally EOL is the only one that should really show up this way, * most others will be taken care of by the tokenizer. */ DELIMITER, /** * indicates that this term is a literal, and should be used for demarcation of a token list. */ LITERAL } //contains an ordered list of the parseRules private Vector<ParseRule> parseRuleVector = new Vector<ParseRule>(); //used for quick lookup of rules by name private HashMap<String, ParseRule> parseRuleHashMap = new HashMap<String, ParseRule>(); //a hash table keyed by SymbolType.toString() and an array of symbols or setting for that type. private HashMap<SymbolType, String[]> symbolHashMap = new HashMap<SymbolType, String[]>(); //this is a reverse lookup against the symbol hashmap, that's keyed by the symbol, and returns it's symbol type. private HashMap<String, SymbolType> symbolTypeHashMap = new HashMap<String, SymbolType>(); //used to lookup if a term is a literal private HashMap<String, String> literalHashMap = new HashMap<String, String>(); private ParseOrderPreference parseOrderPreference = ParseOrderPreference.LEFT; private boolean allowPartialMatch = false; private boolean includeLiterals = false; private boolean useLiteralsAsTokens = false; private String namespaceURI = null; private String prefix = null; private boolean isEOLSignificant = false; /** * determines of EOL will be returned as a separate token, or treated as whitespace. * This will automatically be set if EOL is used as a term in a RULE added to this parse tree. * @param isEOLSignificant */ public void setEOLSignificant(boolean isEOLSignificant) { this.isEOLSignificant = isEOLSignificant; } public boolean isEOLSignificant() { return isEOLSignificant; } /** * Setting this to true will cause any length 1 literals in a rule to be marked as separate tokens when reading the input. * This should always be turned off if parsing a notation, and probably a grammar. * @param useLiteralsAsTokens */ public void setUseLiteralsAsTokens(boolean useLiteralsAsTokens) { this.useLiteralsAsTokens = useLiteralsAsTokens; } public boolean isUseLiteralsAsTokens() { return useLiteralsAsTokens; } /** * This will set the namespace to be used for any created elements resulting from the parse. * @param prefix prefix of the namespace to use. * @param namespaceURI namespaceURI to use. */ public void setNamespace(String prefix, String namespaceURI) { this.namespaceURI = namespaceURI; this.prefix = prefix; } public String getNamespaceURI() { return namespaceURI; } public String getPrefix() { return prefix; } public boolean isIncludeLiterals() { return includeLiterals; } /** * The parse can either consume any literals it encounters, or include them in the result XML as <LITERAL VALUE=""/> elements. * @param includeLiterals */ public void setIncludeLiterals(boolean includeLiterals) { this.includeLiterals = includeLiterals; } /** * Controls whether or not we return the best match we've found, even if all of the data has not been matched. * This is handy if you want to partially parse the first part of a file, but don't care about the remainder. * * defaults to false */ public void setAllowPartialMatch(boolean allowPartialMatch) { this.allowPartialMatch = allowPartialMatch; } public boolean isAllowPartialMatch() { return allowPartialMatch; } /** * This adds a parse rule to the tree. The order that these rules are added is the order in which they will be processed. * @param parseRule */ public void addRule(ParseRule parseRule) { parseRuleVector.add(parseRule); parseRuleHashMap.put(parseRule.getName(), parseRule); //find any literals in the expression, and mark them as a literal String[][] expressions = parseRule.getExpressions(); for (String[] expresssion : expressions) { for (String term : expresssion) { if(term.equals("EOL")) { isEOLSignificant = true; } String[] patterns = symbolHashMap.get(SymbolType.LITERAL); if (patterns != null) { for (String literalPattern : patterns) { if(term.matches(literalPattern)) { literalHashMap.put(term.replaceAll(literalPattern, "$1"), term); } } } } } parseRule.setParseTree(this); } /** * Given a tokenizer object, apply an parse rules, and return the resulting XML Document * @param tokenizer * @return * @throws Exception */ public Document parse(Tokenizer tokenizer) throws Exception { DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance(); documentBuilderFactory.setNamespaceAware(true); DocumentBuilder documentBuilder = documentBuilderFactory.newDocumentBuilder(); Document parseDocument = documentBuilder.newDocument(); parse(tokenizer, parseDocument); return parseDocument; } /** * Given a tokenizer, and an XML Element or Document, this will append the result of the parse rules to that node. * @param tokenizer * @param node * @throws Exception */ public void parse(Tokenizer tokenizer, Node node) throws Exception { //walk the list of literals, and find any that have a length of 1. then make sure that, that char is treated as a separate token, and not part of a word. Set<Entry<String, String>> entries = literalHashMap.entrySet(); for (Entry<String, String> entry : entries) { if(entry.getKey().length() == 1) { if(useLiteralsAsTokens == true) { tokenizer.setCharType(entry.getKey().charAt(0), CharacterType.TOKEN); } } } tokenizer.setEOLSignificant(isEOLSignificant); ParseTape parseTape = new ParseTape(tokenizer); Element parseNode = createElement(node, parseRuleVector.firstElement().getName()); //appendChild(parseNode); if(parseRuleVector.firstElement().parse(parseNode,parseTape)) { if(allowPartialMatch == true || parseTape.hasMore() == false) { node.appendChild(parseNode); } } } /** * This returns whether or not a string in the name of a parse Rule. * @param term * @return */ public boolean isRule(String term) { return parseRuleHashMap.containsKey(term); } public void setSymbolHashMap(HashMap<SymbolType, String[]> symbolHashMap) { this.symbolHashMap = symbolHashMap; Set<Entry<SymbolType, String[]>> symbolEntrySet = symbolHashMap.entrySet(); for (Entry<SymbolType, String[]> entry : symbolEntrySet) { String[] symbols = entry.getValue(); for (String symbol : symbols) { symbolTypeHashMap.put(symbol, entry.getKey()); } } } /** * This returns a rule for a given name. * @param term * @return */ public ParseRule getRule(String term) { return parseRuleHashMap.get(term); } /** * this returns the TermType for a given string. * @param term * @return */ public TermType getTermType(String term) { if (symbolTypeHashMap.containsKey(term) ) { if(symbolTypeHashMap.get(term) == SymbolType.LITERAL) { return TermType.LITERAL; } else { return TermType.DELIMITER; } } else if (parseRuleHashMap.containsKey(term)) { return TermType.RULE; } else { String[] patterns = symbolHashMap.get(SymbolType.LITERAL); if (patterns != null) { for (String literalPattern : patterns) { if(term.matches(literalPattern)) { return TermType.LITERAL; } } } return TermType.SYMBOL; } } /** * Given a term that still has it's literal indicators around it, will find a matching pattern and use it to remove them. * For example 'value' will result in value. * @param term * @return */ public String getLiteralValue(String term) { String[] patterns = symbolHashMap.get(SymbolType.LITERAL); if (patterns != null) { for (String literalPattern : patterns) { if(term.matches(literalPattern)) { return term.replaceAll(literalPattern, "$1"); } } } return term; } /** * Check to see if a token value is registered as a literal * @param value * @return */ public boolean isLiteral(String value) { return literalHashMap.containsKey(value); } /** * Allows you to set the way the parse tree will choose when finding multiple rules that match a particular token list. * @param parseOrderPreference */ public void setParseOrderPreference(ParseOrderPreference parseOrderPreference) { this.parseOrderPreference = parseOrderPreference; } public ParseOrderPreference getParseOrderPreference() { return parseOrderPreference; } /** * Returns the symbol type of this literal, or LITERAL if there is no match. * @param value * @return */ public SymbolType getLiteralType(String value) { if(symbolTypeHashMap.containsKey(SymbolType.valueOf(value))) { return symbolTypeHashMap.get(value); } else { return SymbolType.LITERAL; } } /** * centralized method for creating an element where we take set namespace and prefix into account. * @param someNode * @param name * @return */ public Element createElement(Node someNode, String name) { Document ownerDocument = null; if(someNode instanceof Document) { ownerDocument = (Document) someNode; } else { ownerDocument = someNode.getOwnerDocument(); } if(namespaceURI != null && prefix != null) { return ownerDocument.createElementNS(namespaceURI,prefix+":"+name); } else { return ownerDocument.createElement(name); } } }