package client.net.sf.saxon.ce.expr;
import client.net.sf.saxon.ce.trans.XPathException;
import client.net.sf.saxon.ce.value.Whitespace;
/**
* Tokenizer for expressions and inputs.
*
* This code was originally derived from James Clark's xt, but has been almost entirely rewritten.
*/
@SuppressWarnings({"StringEquality"})
public final class Tokenizer {
/**
* The lexical state of the tokenizer: DEFAULT_STATE or BARE_NAME_STATE.
* next() consults it: in BARE_NAME_STATE a name token is returned as it stands, without being
* combined with the token that follows it.
* NOTE(review): no code in this class assigns the field after its initializer, so as written
* it always holds DEFAULT_STATE - confirm whether a setter was removed intentionally.
*/
private int state = DEFAULT_STATE;
// we may need to make this a stack at some time
/**
* Initial default state of the Tokenizer
*/
public static final int DEFAULT_STATE = 0;
/**
* State in which a name is NOT to be merged with what comes next, for example "("
*/
public static final int BARE_NAME_STATE = 1;
/**
* The number identifying the most recently read token (one of the constants in Token).
* Holds Token.EOF until the first token has been read.
*/
public int currentToken = Token.EOF;
/**
* The string value of the most recently read token. next() stores an empty string here,
* never null, for tokens that carry no value.
*/
public String currentTokenValue = null;
/**
* The position in the input expression where the current token starts
*/
public int currentTokenStartOffset = 0;
/**
* The number of the next token to be returned, i.e. the one-token look-ahead
* maintained by lookAhead()
*/
private int nextToken = Token.EOF;
/**
* The string value of the next token to be returned; null if that token carries no value
*/
private String nextTokenValue = null;
/**
* The position in the expression of the start of the next token
*/
private int nextTokenStartOffset = 0;
/**
* The string being parsed; assigned by tokenize()
*/
public String input;
/**
* The current position within the input string: the index of the next character
* that lookAhead() will read
*/
public int inputOffset = 0;
/**
* The length of the input string, or the explicit end point passed to tokenize();
* characters at or beyond this index are not read
*/
private int inputLength;
/**
* The token number of the token that preceded the current token.
* Note that lookAhead() also overwrites this field with the look-ahead token, which is why
* next() takes a private copy before calling lookAhead().
*/
private int precedingToken = Token.UNKNOWN;
/**
* Create a tokenizer. No input is attached until tokenize() is called.
*/
public Tokenizer() {}
// Note: this class does not define an accessor for the tokenizer state; the state
// field is only read internally, by next().
//
// Lexical analyser for expressions, queries, and XSLT patterns
//
/**
 * Prepare a string for tokenization.
 * The actual tokens are obtained by calls on next(); on return from this method the first
 * token is already available in currentToken / currentTokenValue.
 *
 * The tokenizer may be reused: the current-token state left behind by an earlier
 * tokenization is discarded, so that the first token of the new input is classified
 * exactly as it would be by a freshly constructed Tokenizer.
 *
 * @param input the string to be tokenized; must not be null
 * @param start start point within the string
 * @param end end point within the string (last character not read):
 * -1 means end of string. A value beyond the end of the string is treated
 * as the end of the string.
 * @throws XPathException if a lexical error occurs, e.g. unmatched
 * string quotes
 */
public void tokenize(String input, int start, int end) throws XPathException {
    // Discard the state of any previous tokenization. next() copies currentToken into
    // precedingToken and uses it to tell "*" and operator names apart, so a stale value
    // here would leak into the classification of the first token of the new input.
    currentToken = Token.EOF;
    precedingToken = Token.UNKNOWN;
    nextToken = Token.EOF;
    nextTokenValue = null;
    nextTokenStartOffset = 0;

    this.input = input;
    inputOffset = start;

    // Never let the scan limit exceed the real length of the string: lookAhead() trusts
    // inputLength when indexing into the input.
    final int available = input.length();
    if (end == -1 || end > available) {
        inputLength = available;
    } else {
        inputLength = end;
    }

    // The tokenizer actually reads one token ahead. The raw lexical analysis performed by
    // the lookAhead() method does not (in general) distinguish names used as QNames from names
    // used for operators, axes, and functions. The next() routine further refines names into the
    // correct category, by looking at the following token. In addition, it combines compound tokens
    // such as "instance of" and "cast as".
    lookAhead();
    next();
}
//diagnostic version of next(): change real version to realnext()
//
//public void next() throws XPathException {
// realnext();
// System.err.println("Token: " + currentToken + "[" + tokens[currentToken] + "]");
//}
/**
 * Advance to the next token. On return, currentToken holds the token number,
 * currentTokenValue its string value (never null) and currentTokenStartOffset the
 * position at which it starts.
 *
 * The raw token delivered by lookAhead() is refined here: names that are really
 * operators, function calls, axes, prefixes or keywords are reclassified using the
 * token before and the token after, and two-word keywords are merged into one token.
 *
 * @throws XPathException if a lexical error is detected
 */
public void next() throws XPathException {
    // Shift the look-ahead token into the "current" slots
    precedingToken = currentToken;
    currentToken = nextToken;
    currentTokenValue = (nextTokenValue == null ? "" : nextTokenValue);
    currentTokenStartOffset = nextTokenStartOffset;

    // First refinement: uses only the token that came before
    if (currentToken == Token.NAME) {
        final int operatorCode = getBinaryOp(currentTokenValue);
        if (operatorCode != Token.UNKNOWN && !followsOperator(precedingToken)) {
            currentToken = operatorCode;
        }
    } else if (currentToken == Token.STAR && !followsOperator(precedingToken)) {
        currentToken = Token.MULT;
    }

    if (currentToken == Token.RCURLY) {
        // End of an AVT: do not read any further
        return;
    }

    // lookAhead() overwrites precedingToken, so keep our own copy of it
    final int tokenBeforeCurrent = precedingToken;
    lookAhead();

    // Second refinement applies only to names that may be merged with what follows
    if (currentToken != Token.NAME
            || state == BARE_NAME_STATE
            || tokenBeforeCurrent == Token.DOLLAR) {
        return;
    }

    final int following = nextToken;
    if (following == Token.LPAR) {
        final int asOperator = getBinaryOp(currentTokenValue);
        // the test on followsOperator() is to cater for an operator being used as a function name,
        // e.g. is(): see XQTS test K-FunctionProlog-66
        if (asOperator != Token.UNKNOWN && !followsOperator(tokenBeforeCurrent)) {
            currentToken = asOperator;
        } else {
            currentToken = getFunctionType(currentTokenValue);
            lookAhead(); // swallow the "("
        }
    } else if (following == Token.COLONCOLON) {
        lookAhead();
        currentToken = Token.AXIS;
    } else if (following == Token.COLONSTAR) {
        lookAhead();
        currentToken = Token.PREFIX;
    } else if (following == Token.DOLLAR) {
        if ("for".equals(currentTokenValue)) {
            currentToken = Token.FOR;
        } else if ("some".equals(currentTokenValue)) {
            currentToken = Token.SOME;
        } else if ("every".equals(currentTokenValue)) {
            currentToken = Token.EVERY;
        }
    } else if (following == Token.NAME) {
        // Two adjacent names may form a single compound keyword
        String composite = currentTokenValue + ' ' + nextTokenValue;
        Integer compoundCode = Token.doubleKeywords.get(composite);
        if (compoundCode != null) {
            currentToken = compoundCode.intValue();
            currentTokenValue = composite;
            lookAhead();
        }
    }
    // any other following token: no action needed
}
/**
 * Look ahead by one token. This method does the real tokenization work.
 * The method is normally called internally, but the XQuery parser also
 * calls it to resume normal tokenization after dealing with pseudo-XML
 * syntax.
 *
 * On return, nextToken holds the token number, nextTokenValue its string value (null
 * for tokens that carry none) and nextTokenStartOffset the index of its first
 * character; for a string literal that is the index of the opening quote.
 * Whitespace and (possibly nested) comments in front of the token are skipped.
 * Characters at or beyond the end point given to tokenize() are never treated as
 * part of a token.
 *
 * @throws XPathException if a lexical error occurs
 */
public void lookAhead() throws XPathException {
    precedingToken = nextToken;
    nextTokenValue = null;
    nextTokenStartOffset = inputOffset;
    for (;;) {
        if (inputOffset >= inputLength) {
            nextToken = Token.EOF;
            return;
        }
        char c = input.charAt(inputOffset++);
        switch (c) {
            case '/':
                if (inputOffset < inputLength
                        && input.charAt(inputOffset) == '/') {
                    inputOffset++;
                    nextToken = Token.SLSL;
                    return;
                }
                nextToken = Token.SLASH;
                return;
            case ':':
                if (inputOffset < inputLength) {
                    if (input.charAt(inputOffset) == ':') {
                        inputOffset++;
                        nextToken = Token.COLONCOLON;
                        return;
                    }
                }
                throw new XPathException("Unexpected colon at start of token");
            case '@':
                nextToken = Token.AT;
                return;
            case '?':
                nextToken = Token.QMARK;
                return;
            case '[':
                nextToken = Token.LSQB;
                return;
            case ']':
                nextToken = Token.RSQB;
                return;
            case '}':
                nextToken = Token.RCURLY;
                return;
            case '(':
                if (inputOffset < inputLength && input.charAt(inputOffset) == ':') {
                    // XPath comment syntax is (: .... :)
                    // Comments may be nested, and may now be empty
                    inputOffset++;
                    int nestingDepth = 1;
                    // stop one short of the end: each test looks at a pair of characters
                    while (nestingDepth > 0 && inputOffset < (inputLength-1)) {
                        if (input.charAt(inputOffset) == ':' &&
                                input.charAt(inputOffset+1) == ')') {
                            nestingDepth--;
                            inputOffset++;
                        } else if (input.charAt(inputOffset) == '(' &&
                                input.charAt(inputOffset+1) == ':') {
                            nestingDepth++;
                            inputOffset++;
                        }
                        inputOffset++;
                    }
                    if (nestingDepth > 0) {
                        throw new XPathException("Unclosed XPath comment");
                    }
                    // The comment is skipped like whitespace: restart the token just after
                    // it and go round the loop again. Looping (rather than calling
                    // lookAhead() recursively) keeps the stack depth constant however many
                    // comments follow one another.
                    nextTokenStartOffset = inputOffset;
                    break;
                }
                nextToken = Token.LPAR;
                return;
            case ')':
                nextToken = Token.RPAR;
                return;
            case '+':
                nextToken = Token.PLUS;
                return;
            case '-':
                nextToken = Token.MINUS; // not detected if part of a name
                return;
            case '=':
                nextToken = Token.EQUALS;
                return;
            case '!':
                if (inputOffset < inputLength
                        && input.charAt(inputOffset) == '=') {
                    inputOffset++;
                    nextToken = Token.NE;
                    return;
                }
                throw new XPathException("'!' without '='");
            case '*':
                // disambiguation of MULT and STAR is now done later
                if (inputOffset < inputLength
                        && input.charAt(inputOffset) == ':') {
                    inputOffset++;
                    nextToken = Token.SUFFIX;
                    // we leave the parser to get the following name as a separate
                    // token, but first check there's no intervening white space or comments
                    if (inputOffset < inputLength) {
                        char ahead = input.charAt(inputOffset);
                        if (" \r\t\n(".indexOf(ahead) >= 0) {
                            throw new XPathException("Whitespace and comments are not allowed after '*:'");
                        }
                    }
                    return;
                }
                nextToken = Token.STAR;
                return;
            case ',':
                nextToken = Token.COMMA;
                return;
            case '$':
                nextToken = Token.DOLLAR;
                return;
            case '|':
                nextToken = Token.UNION;
                return;
            case '#':
                nextToken = Token.HASH;
                return;
            case '<':
                if (inputOffset < inputLength
                        && input.charAt(inputOffset) == '=') {
                    inputOffset++;
                    nextToken = Token.LE;
                    return;
                }
                if (inputOffset < inputLength
                        && input.charAt(inputOffset) == '<') {
                    inputOffset++;
                    nextToken = Token.PRECEDES;
                    return;
                }
                nextToken = Token.LT;
                return;
            case '>':
                if (inputOffset < inputLength
                        && input.charAt(inputOffset) == '=') {
                    inputOffset++;
                    nextToken = Token.GE;
                    return;
                }
                if (inputOffset < inputLength
                        && input.charAt(inputOffset) == '>') {
                    inputOffset++;
                    nextToken = Token.FOLLOWS;
                    return;
                }
                nextToken = Token.GT;
                return;
            case '.':
                if (inputOffset < inputLength
                        && input.charAt(inputOffset) == '.') {
                    inputOffset++;
                    nextToken = Token.DOTDOT;
                    return;
                }
                if (inputOffset == inputLength
                        || input.charAt(inputOffset) < '0'
                        || input.charAt(inputOffset) > '9') {
                    nextToken = Token.DOT;
                    return;
                }
                // otherwise drop through: we have a number starting with a decimal point
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                // The logic here can return some tokens that are not legitimate numbers,
                // for example "23e" or "1.0e+". However, this will only happen if the XPath
                // expression as a whole is syntactically incorrect.
                // These errors will be caught by the numeric constructor.
                boolean allowE = true;      // no exponent marker seen yet
                boolean allowSign = false;  // true only immediately after the exponent marker
                boolean allowDot = true;    // no decimal point seen yet
                boolean endOfNum = false;
                numloop:
                while (!endOfNum) {
                    switch (c) {
                        case '0': case '1': case '2': case '3': case '4':
                        case '5': case '6': case '7': case '8': case '9':
                            allowSign = false;
                            break;
                        case '.':
                            if (allowDot) {
                                allowDot = false;
                                allowSign = false;
                            } else {
                                // second dot: it belongs to the next token
                                inputOffset--;
                                break numloop;
                            }
                            break;
                        case 'E': case 'e':
                            if (allowE) {
                                allowSign = true;
                                allowE = false;
                            } else {
                                inputOffset--;
                                break numloop;
                            }
                            break;
                        case '+': case '-':
                            if (allowSign) {
                                allowSign = false;
                            } else {
                                inputOffset--;
                                break numloop;
                            }
                            break;
                        default:
                            if (('a' <= c && c <= 'z') || c>127) {
                                // this prevents the famous "10div 3"
                                throw new XPathException("Separator needed after numeric literal");
                            }
                            // push back the character that terminated the number
                            inputOffset--;
                            break numloop;
                    }
                    if (inputOffset >= inputLength) break;
                    c = input.charAt(inputOffset++);
                }
                nextTokenValue = input.substring(nextTokenStartOffset, inputOffset);
                nextToken = Token.NUMBER;
                return;
            case '"':
            case '\'':
                // A string literal runs to the matching delimiter; a doubled delimiter
                // inside it stands for one literal delimiter character. The literal is
                // collected segment by segment. segmentStart is the index of the quote
                // that opens the current segment; nextTokenStartOffset is deliberately
                // left alone so that it keeps pointing at the opening quote.
                int segmentStart = nextTokenStartOffset;
                StringBuilder literal = new StringBuilder();
                while (true) {
                    inputOffset = input.indexOf(c, inputOffset);
                    // a delimiter found at or beyond the end point does not count
                    if (inputOffset < 0 || inputOffset >= inputLength) {
                        inputOffset = segmentStart + 1;
                        throw new XPathException("Unmatched quote in expression");
                    }
                    literal.append(input.substring(segmentStart + 1, inputOffset++));
                    if (inputOffset < inputLength && input.charAt(inputOffset) == c) {
                        // Doubled delimiters
                        literal.append(c);
                        segmentStart = inputOffset;
                        inputOffset++;
                    } else {
                        break;
                    }
                }
                nextTokenValue = literal.toString();
                nextToken = Token.STRING_LITERAL;
                return;
            case '\n':
            case ' ':
            case '\t':
            case '\r':
                // skip whitespace: the token starts after it
                nextTokenStartOffset = inputOffset;
                break;
            default:
                if (c < 0x80 && !Character.isLetter(c)) {
                    throw new XPathException("Invalid character '" + c + "' in expression");
                }
                /* fall through */
            case '_':
                // A name. The first character has been accepted; consume the rest.
                loop:
                for (;inputOffset < inputLength; inputOffset++) {
                    c = input.charAt(inputOffset);
                    switch (c) {
                        case ':':
                            if (inputOffset+1 < inputLength) {
                                char nc = input.charAt(inputOffset+1);
                                if (nc == ':') {
                                    // name followed by "::" is an axis name
                                    nextTokenValue = input.substring(nextTokenStartOffset, inputOffset);
                                    nextToken = Token.AXIS;
                                    inputOffset+=2;
                                    return;
                                } else if (nc == '*') {
                                    // name followed by ":*" is a prefix wildcard
                                    nextTokenValue = input.substring(nextTokenStartOffset, inputOffset);
                                    nextToken = Token.PREFIX;
                                    inputOffset+=2;
                                    return;
                                } else if (nc == '=') {
                                    // as in "let $x:=2": the name ends before the colon,
                                    // which is left unread
                                    nextTokenValue = input.substring(nextTokenStartOffset, inputOffset);
                                    nextToken = Token.NAME;
                                    return;
                                }
                            }
                            // any other colon is kept as part of the name
                            break;
                        case '.':
                        case '-':
                        case '_':
                            break;
                        default:
                            if (c < 0x80 && !Character.isLetterOrDigit(c))
                                break loop;
                            break;
                    }
                }
                nextTokenValue = input.substring(nextTokenStartOffset, inputOffset);
                nextToken = Token.NAME;
                return;
        }
    }
}
/**
 * Map an operator keyword (such as "and", "div" or "intersect") to its token number.
 *
 * @param s the candidate keyword; it is compared by value, so it need not be interned
 * @return the token number of the operator, or Token.UNKNOWN if the string is not a
 * known operator keyword
 */
private static int getBinaryOp(String s) {
    // Dispatch on the length first so that only a handful of comparisons are ever made
    final int length = s.length();
    if (length == 2) {
        if ("or".equals(s)) { return Token.OR; }
        if ("is".equals(s)) { return Token.IS; }
        if ("to".equals(s)) { return Token.TO; }
        if ("in".equals(s)) { return Token.IN; }
        if ("eq".equals(s)) { return Token.FEQ; }
        if ("ne".equals(s)) { return Token.FNE; }
        if ("gt".equals(s)) { return Token.FGT; }
        if ("ge".equals(s)) { return Token.FGE; }
        if ("lt".equals(s)) { return Token.FLT; }
        if ("le".equals(s)) { return Token.FLE; }
        if ("as".equals(s)) { return Token.AS; }
    } else if (length == 3) {
        if ("and".equals(s)) { return Token.AND; }
        if ("div".equals(s)) { return Token.DIV; }
        if ("mod".equals(s)) { return Token.MOD; }
    } else if (length == 4) {
        if ("idiv".equals(s)) { return Token.IDIV; }
        if ("then".equals(s)) { return Token.THEN; }
        if ("else".equals(s)) { return Token.ELSE; }
    } else if (length == 5) {
        if ("union".equals(s)) { return Token.UNION; }
    } else if (length == 6) {
        if ("except".equals(s)) { return Token.EXCEPT; }
        if ("return".equals(s)) { return Token.RETURN; }
    } else if (length == 9) {
        if ("intersect".equals(s)) { return Token.INTERSECT; }
        if ("satisfies".equals(s)) { return Token.SATISFIES; }
    }
    return Token.UNKNOWN;
}
/**
 * Classify a name that is immediately followed by "(": it is either the keyword "if",
 * the name of a node-kind / item-type test, or an ordinary function name.
 *
 * @param s the name; it is compared by value, so it need not be interned
 * @return Token.IF, Token.NODEKIND or Token.FUNCTION
 */
private static int getFunctionType(String s) {
    if (s.equals("if")) {
        return Token.IF;
    }
    final boolean isKindTest =
            s.equals("node")
            || s.equals("item")
            || s.equals("text")
            || s.equals("element")
            || s.equals("comment")
            || s.equals("attribute")
            || s.equals("document-node")
            || s.equals("empty-sequence")
            || s.equals("namespace-node")
            || s.equals("schema-element")
            || s.equals("schema-attribute")
            || s.equals("processing-instruction");
    return isKindTest ? Token.NODEKIND : Token.FUNCTION;
}
/**
 * Decide whether a token counts as an operator for the purpose of classifying the
 * token that comes after it. The test is purely numeric: every token number up to and
 * including Token.LAST_OPERATOR is treated as an operator.
 *
 * @param precedingToken the token to be tested
 * @return true if the token number does not exceed Token.LAST_OPERATOR
 */
private boolean followsOperator(int precedingToken) {
    final boolean isOperator = !(precedingToken > Token.LAST_OPERATOR);
    return isOperator;
}
/**
 * Get the most recently read text (for use in an error message).
 *
 * This method is called while an error is being reported, so it never throws:
 * positions outside the input are pulled back into range, and an empty string is
 * returned if no input has been supplied yet.
 *
 * @param offset the offset of the offending token, if known, or -1 to use the current offset
 * @return when offset is -1, the text leading up to the current reading position (at
 * most 30 characters, prefixed with "..." once the position is 34 or more); otherwise up
 * to 30 characters of text starting at the given offset (prefixed with "..." if that
 * offset is not the start of the input)
 */
public String recentText(int offset) {
    if (input == null) {
        // tokenize() has not been called: there is nothing to show
        return "";
    }
    // The furthest index that may safely be passed to substring()
    final int limit = Math.max(0, Math.min(inputLength, input.length()));

    if (offset == -1) {
        // if no offset was supplied, we want the text immediately before the current reading position
        if (inputOffset > inputLength) {
            inputOffset = inputLength;
        }
        final int position = Math.max(0, Math.min(inputOffset, limit));
        if (position < 34) {
            return input.substring(0, position);
        }
        return Whitespace.collapseWhitespace(
                "..." + input.substring(position - 30, position)).toString();
    }

    // if a specific offset was supplied, we want the text *starting* at that offset
    final int from = Math.max(0, Math.min(offset, limit));
    final int to = Math.min(from + 30, limit);
    return Whitespace.collapseWhitespace(
            (from > 0 ? "..." : "") +
            input.substring(from, to)).toString();
}
}
// This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// This Source Code Form is “Incompatible With Secondary Licenses”, as defined by the Mozilla Public License, v. 2.0.