/*
 * Copyright 2012 JBoss Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.artificer.common.query.xpath;

import java.text.ParseException;

import org.artificer.common.i18n.Messages;

/**
 * Splits an XPath-style query string into the token stream consumed by the parser.
 */
public class XPathTokenizer {

    /** Characters that are dropped without producing a token. */
    private static final String WHITESPACE = " \t\n\r";

    /**
     * Characters that are emitted as a one-character {@code symbol} token. The
     * opening parenthesis is absent on purpose: it is handled separately because
     * it may open a comment.
     */
    private static final String SYMBOLS = ".-){}*,;+%?$!<>|=:[]^/\\#@";

    /**
     * Creates a tokenizer. The tokenizer keeps no state between calls.
     */
    public XPathTokenizer() {
    }

    /**
     * Converts the input text into tokens for the parser.
     * <p>
     * Each character read from the input is dispatched as follows:
     * <ul>
     * <li>space, tab, newline and carriage return are skipped;</li>
     * <li>a single or double quote starts a {@code quotedString} token;</li>
     * <li>{@code (} followed by {@code :} starts a comment, which produces no
     * token; any other {@code (} is a {@code symbol} token;</li>
     * <li>{@code .} or {@code -} for which the stream reports a numeric start
     * next begins a {@code numeric} token;</li>
     * <li>any other listed punctuation character (including a lone {@code .}
     * or {@code -}) is a one-character {@code symbol} token;</li>
     * <li>everything else becomes a {@code name}, {@code numeric} or
     * {@code other} token depending on its first character.</li>
     * </ul>
     *
     * @param input the text to tokenize
     * @return the result of {@code TokenStream.build()} once all input is consumed
     * @throws ParseException if a quoted string is not closed before the input ends
     */
    public TokenStream tokenize(String input) throws ParseException {
        CharacterStream stream = new CharacterStream(input);
        TokenStream tokens = new TokenStream();

        while (stream.hasNext()) {
            final char current = stream.next();

            if (WHITESPACE.indexOf(current) >= 0) {
                // Whitespace separates tokens but is never a token itself.
                continue;
            }

            if (current == '\'' || current == '\"') {
                readQuotedString(current, stream, tokens);
            } else if (current == '(') {
                if (stream.isNext(':')) {
                    skipComment(stream);
                } else {
                    addSymbol(stream, tokens);
                }
            } else if ((current == '.' || current == '-') && stream.isNextNumericStart()) {
                // A sign or decimal point directly followed by a numeric start is a number.
                readNumber(stream, tokens);
            } else if (SYMBOLS.indexOf(current) >= 0) {
                // Also reached by '.' and '-' when no number follows them.
                addSymbol(stream, tokens);
            } else {
                readWord(current, stream, tokens);
            }
        }

        return tokens.build();
    }

    /**
     * Emits the character most recently read from the stream as a {@code symbol} token.
     *
     * @param stream the character source
     * @param tokens the token sink
     * @throws ParseException declared for parity with {@link #tokenize(String)}
     */
    private static void addSymbol(CharacterStream stream, TokenStream tokens) throws ParseException {
        final int at = stream.index();
        tokens.addToken(stream.get(at, at + 1), TokenType.symbol);
    }

    /**
     * Emits a {@code numeric} token that starts at the current stream index and
     * extends over every following character the stream reports as numeric.
     *
     * @param stream the character source
     * @param tokens the token sink
     * @throws ParseException declared for parity with {@link #tokenize(String)}
     */
    private static void readNumber(CharacterStream stream, TokenStream tokens) throws ParseException {
        final int from = stream.index();
        consumeNumeric(stream);
        tokens.addToken(stream.get(from, stream.index() + 1), TokenType.numeric);
    }

    /**
     * Emits a {@code quotedString} token that starts at the current stream index
     * (the opening quote) and ends with the matching closing quote. A quote
     * character immediately followed by the same quote character is treated as
     * an escaped quote and does not end the string.
     *
     * @param quote the quote character that opened the string
     * @param stream the character source
     * @param tokens the token sink
     * @throws ParseException if the input ends before the closing quote is found
     */
    private static void readQuotedString(char quote, CharacterStream stream, TokenStream tokens)
            throws ParseException {
        final int from = stream.index();
        boolean closed = false;

        while (!closed && stream.hasNext()) {
            final char ch = stream.next();
            if (ch != quote) {
                continue;
            }
            if (stream.isNext(quote)) {
                // Doubled quote: swallow the second one, it is an escape.
                stream.next();
            } else {
                closed = true;
            }
        }

        if (!closed) {
            throw new ParseException(Messages.i18n.format("XPATH_TOK_MISSING_QUOTE"), stream.index());
        }

        // index() + 1 is one past the last character read (the closing quote).
        tokens.addToken(stream.get(from, stream.index() + 1), TokenType.quotedString);
    }

    /**
     * Skips a {@code (: ... :)} comment without emitting a token. Characters are
     * consumed until the stream reports that {@code :} and {@code )} come next,
     * then up to two more characters (the terminator) are consumed. If the input
     * ends first, the rest of the input is consumed silently.
     *
     * @param stream the character source
     * @throws ParseException declared for parity with {@link #tokenize(String)}
     */
    private static void skipComment(CharacterStream stream) throws ParseException {
        while (stream.hasNext() && !stream.areNext(':', ')')) {
            stream.next();
        }
        // Consume the ':' and then the ')', each only if input remains.
        for (int i = 0; i < 2; i++) {
            if (stream.hasNext()) {
                stream.next();
            }
        }
    }

    /**
     * Emits a token for a run of characters that starts with a character that
     * is neither whitespace, a quote, a parenthesis nor a listed symbol.
     * <ul>
     * <li>NCName start: {@code name} token, extended over NCName characters;</li>
     * <li>digit: {@code numeric} token, extended over numeric characters;</li>
     * <li>anything else: {@code other} token, extended over NCName characters.</li>
     * </ul>
     *
     * @param first the character that was just read from the stream
     * @param stream the character source
     * @param tokens the token sink
     * @throws ParseException declared for parity with {@link #tokenize(String)}
     */
    private static void readWord(char first, CharacterStream stream, TokenStream tokens)
            throws ParseException {
        final int from = stream.index();
        final TokenType type;
        final boolean numericRun;

        if (isValidNcNameStart(first)) {
            type = TokenType.name;
            numericRun = false;
        } else if (isValidNumericStart(first)) {
            type = TokenType.numeric;
            numericRun = true;
        } else {
            type = TokenType.other;
            numericRun = false;
        }

        if (numericRun) {
            consumeNumeric(stream);
        } else {
            consumeNcName(stream);
        }

        // index() + 1 is one past the last character that was included.
        tokens.addToken(stream.get(from, stream.index() + 1), type);
    }

    /**
     * Advances the stream for as long as it reports a numeric character next.
     *
     * @param stream the character source
     * @throws ParseException declared for parity with {@link #tokenize(String)}
     */
    private static void consumeNumeric(CharacterStream stream) throws ParseException {
        while (stream.isNextNumeric()) {
            stream.next();
        }
    }

    /**
     * Advances the stream for as long as it reports a valid XML NCName character next.
     *
     * @param stream the character source
     * @throws ParseException declared for parity with {@link #tokenize(String)}
     */
    private static void consumeNcName(CharacterStream stream) throws ParseException {
        while (stream.isNextValidXmlNcNameCharacter()) {
            stream.next();
        }
    }

    /**
     * Tells whether a character may begin an NCName, i.e. it is an underscore
     * or a letter according to {@link Character#isLetter(char)}.
     *
     * @param c the character to test
     * @return {@code true} if {@code c} can start a name token
     */
    private static boolean isValidNcNameStart(char c) {
        return c == '_' || Character.isLetter(c);
    }

    /**
     * Tells whether a character may begin a numeric token, i.e. it is a digit
     * according to {@link Character#isDigit(char)}.
     *
     * @param c the character to test
     * @return {@code true} if {@code c} can start a numeric token
     */
    private static boolean isValidNumericStart(char c) {
        return Character.isDigit(c);
    }
}