/** Copyright (c) 2012 Delcyon, Inc. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ package com.delcyon.capo.parsers; import java.util.Arrays; import java.util.Collections; import java.util.Vector; import java.util.logging.Level; import java.util.logging.Logger; import org.w3c.dom.Attr; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import com.delcyon.capo.parsers.ParseTree.ParseOrderPreference; import com.delcyon.capo.parsers.ParseTree.TermType; import com.delcyon.capo.parsers.Tokenizer.TokenType; /** * @author jeremiah * */ public class ParseRule { private String name; private String[][] expressions; private ParseTree parseTree; private String[][] localExpressions; /** * * @param name * @param expressions each expression must be a list of terms w/o any alterations. ie each expression is a separate choice */ public ParseRule(String name, String[]... expressions) { this.name = name; this.expressions = expressions; } public void setParseTree(ParseTree parseTree) { this.parseTree = parseTree; } public String getName() { return name; } private void printPathMessage(Node element, String message) { if(Logger.getGlobal().isLoggable(Level.FINER)) { StringBuilder stringBuilder = new StringBuilder(); while(element != null) { stringBuilder.insert(0, element.getNodeName()+"/"); element = (Node) element.getParentNode(); } System.out.println(stringBuilder+":"+message); } } public boolean parse(Element originalParseNode, ParseTape parseTape) throws Exception { //System.out.println("\n\n"); printPathMessage(originalParseNode, "STARTING:"+this); Vector<MatchItem> matchItemVector = new Vector<MatchItem>(); boolean foundExpressionMatch = false; int initialTapePosition = parseTape.getPosition(); Element peerParseNode = (Element) originalParseNode.cloneNode(true); //there is only one instance of each rule, treat this as setup, to see if we need to reverse this array for RIGHT tree optimizations if(localExpressions == null) { localExpressions = expressions; if(parseTree.getParseOrderPreference() == ParseOrderPreference.RIGHT && expressions.length > 1) { Collections.reverse(Arrays.asList(localExpressions)); } } //EXPRESSIONS LOOP expressions: for (int currentExpression = 0; currentExpression < localExpressions.length; currentExpression++) { if(foundExpressionMatch == true) { matchItemVector.add(new MatchItem(peerParseNode,parseTape.getPosition())); //quick optimization, if we're not doing a max length, and we have a match, we can bail out here and don't need to go further. if(parseTree.getParseOrderPreference() != ParseOrderPreference.MAX_LENGTH) { foundExpressionMatch = false; //this isn't true, but keeps us from adding an additional match outside this loop. break; } } //check to see if we need to try something else, if we're out of tape, and we have a match, then we don't if(parseTape.hasMore() == false && foundExpressionMatch) { break; } peerParseNode = (Element) originalParseNode.cloneNode(true); foundExpressionMatch = true; //backup. set list pointer to parse entry position parseTape.setPosition(initialTapePosition); String[] expression = localExpressions[currentExpression]; printPathMessage(peerParseNode, "starting parse with "+Arrays.toString(expression)); for (int currentTerm = 0; currentTerm < expression.length; currentTerm++) { String term = expression[currentTerm]; //====================================REGEX CODE============================================ String parsedRegex = null; String parsedReplacement = null; String splitRegex = null; //check to see if we're dealing with a regex if(term.startsWith("~") && term.matches("~.*/.+(/.*)?")) { String originalTerm = term; int firstSlash = originalTerm.indexOf('/'); int lastSlash = originalTerm.lastIndexOf('/'); boolean defaultedLength = false; if(lastSlash == firstSlash) { defaultedLength = true; lastSlash = originalTerm.length(); } term = originalTerm.substring(1,firstSlash); //check for some kind of delimiter char in the term, because we might be dealing with a split regex where we make two temrs from one regex if(term.matches("[a-zA-Z0-9]+[^a-zA-Z0-9]+[a-zA-Z0-9]{1}.*")) { //pull out the delimiter, well use this to split the terms and the result later splitRegex = term.replaceAll("[a-zA-Z0-9]+([^a-zA-Z0-9]+)[a-zA-Z0-9]{1}.*","$1"); } parsedRegex = originalTerm.substring(firstSlash+1,lastSlash); if ( firstSlash != lastSlash) { if(defaultedLength == false) { lastSlash++; } parsedReplacement = originalTerm.substring(lastSlash); if(parsedReplacement.isEmpty()) { parsedReplacement = null; } } } //=====================================QUANTIFICATION CODE============================================== boolean useQuantification = false; boolean inQuantificationLoop = false; int quantifier = 0; int minimumQuantity = 0; int maximumQuantity = Integer.MAX_VALUE; if(term.endsWith("+")) { useQuantification = true; inQuantificationLoop = true; term = term.substring(0, term.length()-1); minimumQuantity = 1; } else if(term.endsWith("?")) { useQuantification = true; inQuantificationLoop = true; term = term.substring(0, term.length()-1); maximumQuantity = 1; } else if(term.endsWith("*")) { useQuantification = true; inQuantificationLoop = true; term = term.substring(0, term.length()-1); } else if(term.matches(".+\\{\\d+\\}")) { String originalTerm = term; useQuantification = true; inQuantificationLoop = true; term = originalTerm.replaceFirst("(.+)\\{\\d+\\}", "$1"); maximumQuantity = Integer.parseInt(originalTerm.replaceFirst(".+\\{(\\d+)\\}", "$1")); minimumQuantity = maximumQuantity; } else if(term.matches(".+\\{\\d*,\\d+\\}")) { String originalTerm = term; useQuantification = true; inQuantificationLoop = true; term = originalTerm.replaceFirst("(.+)\\{\\d*,\\d+\\}", "$1"); maximumQuantity = Integer.parseInt(originalTerm.replaceFirst(".+\\{\\d*,(\\d+)\\}", "$1")); String minString = originalTerm.replaceFirst(".+\\{(\\d*),\\d+\\}", "$1"); if(minString.isEmpty() == false) { minimumQuantity = Integer.parseInt(minString); } } //=====================START MAIN DO WHILE LOOP==================================================== do { if(parseTape.next() == null && currentTerm < expression.length-1) { parseTape.pushBack(); foundExpressionMatch = false; if(inQuantificationLoop == false) { continue expressions; } } ParseToken token = parseTape.getCurrent(); //figure out what to do with the current term TermType termType = parseTree.getTermType(term); if(term.isEmpty() && parsedRegex != null) { termType = TermType.LITERAL; } printPathMessage(peerParseNode, "Checking "+term+"["+termType+"] against "+token); //==========================MATCHING TERM TO TOKEN CODE======================================================= //do the full term type checking switch (termType) { case RULE: //drill down into new rule parseTape.pushBack(); Element parseNode = parseTree.createElement(originalParseNode,parseTree.getRule(term).getName()); peerParseNode.appendChild(parseNode); if (parseTree.getRule(term).parse(parseNode, parseTape) == false) { peerParseNode.removeChild(parseNode); foundExpressionMatch = false; printPathMessage(peerParseNode, "FAILURE "+term+"["+termType+"] against "+token); if(inQuantificationLoop == false) { continue expressions; } } break; case LITERAL: //Literals must be tokens if(token.getTokenType() != TokenType.TOKEN) { parseTape.pushBack(); foundExpressionMatch = false; printPathMessage(peerParseNode, "FAILURE "+parsedRegex+"["+termType+"] against "+token); if(inQuantificationLoop == false) { continue expressions; } break; } else if(parsedRegex != null && token.getValue().matches(parsedRegex)) { if(parseTree.isIncludeLiterals() == true) { Element cElement = parseTree.createElement(originalParseNode,"LITERAL"); cElement.setAttribute(parseTree.getLiteralType(token.getValue()).toString(), token.getValue()); peerParseNode.appendChild(cElement); } } else if(parseTree.getLiteralValue(term).equals(token.getValue())) { if(parseTree.isIncludeLiterals() == true) { Element cElement = parseTree.createElement(originalParseNode,"LITERAL"); cElement.setAttribute(parseTree.getLiteralType(token.getValue()).toString(), token.getValue()); peerParseNode.appendChild(cElement); } } else { parseTape.pushBack(); foundExpressionMatch = false; printPathMessage(peerParseNode, "FAILURE "+term+"["+termType+"] against "+token); if(inQuantificationLoop == false) { continue expressions; } } break; case SYMBOL: //Symbols must be tokens if(token.getTokenType() != TokenType.TOKEN) { parseTape.pushBack(); foundExpressionMatch = false; printPathMessage(peerParseNode, "FAILURE "+parsedRegex+"["+termType+"] against "+token); if(inQuantificationLoop == false) { continue expressions; } break; } String value = token.getValue(); TermType valueTermType = parseTree.getTermType(value); //named rules can't actually be referred to the INPUT, so change this to a symbol if(valueTermType == TermType.RULE) { valueTermType = TermType.SYMBOL; } //do a simple regex check against the value before we start to get fancier later. //esp after we've tried to run a replacement regex against the value. if(parsedRegex != null && value.matches(parsedRegex) == false) { parseTape.pushBack(); foundExpressionMatch = false; printPathMessage(peerParseNode, "FAILURE "+parsedRegex+"["+termType+"] against "+token); if(inQuantificationLoop == false) { continue expressions; } break; } //see if we need to do some sort of regex replacement if(parsedRegex != null && parsedReplacement != null) { value = value.replaceAll(parsedRegex, parsedReplacement); //if we've modified this, don't let it turn into some other TermType. it IS a symbol. valueTermType = TermType.SYMBOL; } //this is the basic literal check for symbols. but converted regex's are never marked as literals else if(valueTermType == TermType.SYMBOL && parseTree.isLiteral(value)) { valueTermType = TermType.LITERAL; } //check to see if this is an escaped Literal, if so, it's a symbol. //This can happen where the input includes a token wrapped in literal indicators, that was ALSO //wrapped in literal indicators in the grammar. Such as '|' in a new notation and the Default Grammar. if(valueTermType == TermType.LITERAL && parseTree.getLiteralValue(value).length() != value.length()) { valueTermType = TermType.SYMBOL; } //check to see if it's a misplaced DELIMITER. DELIMITER's will never also be tokens if(valueTermType == TermType.DELIMITER && token.getTokenType() == TokenType.TOKEN) { valueTermType = TermType.SYMBOL; } //delimiters and literals should never be treated as symbols if(valueTermType == TermType.DELIMITER || valueTermType == TermType.LITERAL) { //System.err.println(token+"<=="+valueTermType); parseTape.pushBack(); foundExpressionMatch = false; printPathMessage(peerParseNode, "FAILURE "+term+"["+termType+"] against "+token+"["+valueTermType+"]"); if(inQuantificationLoop == false) { continue expressions; } } //overlap with RULE names should be ignored as something we're parsing can't refer to a rule name //overlap with Literals should be ignored as a literal can be a SYMBOL_NAME else { //check to see if this is a compound regex if(splitRegex != null) { String[] terms = term.split(splitRegex); String[] values = value.split(splitRegex); for(int currentSplitTerm = 0; currentSplitTerm < terms.length && currentSplitTerm < values.length; currentSplitTerm++) { //if we already have this attribute, then just append the values together if(peerParseNode.hasAttribute(terms[currentSplitTerm])) { values[currentSplitTerm] = peerParseNode.getAttribute(terms[currentSplitTerm]) +" "+ values[currentSplitTerm]; } peerParseNode.setAttribute(terms[currentSplitTerm], values[currentSplitTerm]); } } else { //if we already have this attribute, then just append the values together if(peerParseNode.hasAttribute(term)) { value = peerParseNode.getAttribute(term) +" "+ value; } peerParseNode.setAttribute(term, value); } } break; case DELIMITER: //DELIMITER must NOT be tokens if(token.getTokenType() != TokenType.TOKEN) { //comsume it, and do nothing } else { //System.err.println(token+"<=="+parseTree.getTermType(token.getValue())); parseTape.pushBack(); foundExpressionMatch = false; printPathMessage(peerParseNode, "FAILURE "+term+"["+termType+"] against "+token); if(inQuantificationLoop == false) { continue expressions; } } break; default: System.err.println("unknown term:"+term); foundExpressionMatch = false; break; } //=============================MORE QUANTIFICATION PROCESSING FOR CHECKING OF LOOP CONTINUATION================ if(useQuantification == true && inQuantificationLoop == true) { if(foundExpressionMatch) { quantifier++; if(quantifier > maximumQuantity) { printPathMessage(peerParseNode, "QUANTITY FAILURE "+term+"["+termType+"] against "+token); foundExpressionMatch = false; continue expressions; } } else { inQuantificationLoop = false; if(quantifier >= minimumQuantity && quantifier <= maximumQuantity) { foundExpressionMatch = true; } else { printPathMessage(peerParseNode, "QUANTITY FAILURE "+term+"["+termType+"] against "+token); foundExpressionMatch = false; //redundant continue expressions; } } } }//============================END DO WHILE LOOP============================================== while(useQuantification == true && inQuantificationLoop == true); } } if(foundExpressionMatch == true) { matchItemVector.add(new MatchItem(peerParseNode,parseTape.getPosition())); } if(matchItemVector.size() > 0) { MatchItem matchItem = null; switch(parseTree.getParseOrderPreference()) { case LEFT: case RIGHT: //we reverse the expressions above, so on left or right, our first match should always be the correct one to return. matchItem = matchItemVector.firstElement(); break; case MAX_LENGTH: int matchItemPos = -1; for (int index = 0 ; index < matchItemVector.size(); index++) { if(matchItemVector.get(index).endTapePosition > matchItemPos) { matchItem = matchItemVector.get(index); matchItemPos = matchItem.endTapePosition; } } break; } parseTape.setPosition(matchItem.endTapePosition); NodeList childrenNodeList = matchItem.parseNode.getChildNodes(); for(int index = 0; index < childrenNodeList.getLength();) { originalParseNode.appendChild(childrenNodeList.item(index)); } NamedNodeMap namedNodeMap = matchItem.parseNode.getAttributes(); while(namedNodeMap.getLength() > 0) { originalParseNode.setAttributeNode((Attr) namedNodeMap.removeNamedItem(namedNodeMap.item(0).getNodeName())); } //XPath.dumpNode(peerParseNode, System.out); printPathMessage(peerParseNode, "finished parse with TRUE"); return true; } printPathMessage(peerParseNode, "finished parse with FAILURE"); return false; } public String[][] getExpressions() { return expressions; } @Override public String toString() { StringBuilder stringBuilder = new StringBuilder(); stringBuilder.append("["); for (String[] expression : expressions) { stringBuilder.append(Arrays.toString(expression)); } stringBuilder.append("]"); return getName()+""+stringBuilder; } private class MatchItem { Element parseNode = null; int endTapePosition = -1; public MatchItem(Element parseNode, int position) { this.parseNode = parseNode; this.endTapePosition = position; } } }