/*
* Copyright 2013 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.xd.dirt.stream.dsl;
import java.util.ArrayList;
import java.util.List;
import org.springframework.util.Assert;
/**
* Lex some input data into a stream of tokens that can then then be parsed.
*
* @author Andy Clement
*/
class Tokenizer {
private String expressionString; // The string to be tokenized
private char[] toProcess; // The expressionString as a char array
private int max; // Length of input data
private int pos; // Current lexing position in the input data
private List<Token> tokens = new ArrayList<Token>(); // Output stream of tokens
public Tokenizer(String inputdata) {
this.expressionString = inputdata;
this.toProcess = (inputdata + "\0").toCharArray();
this.max = toProcess.length;
this.pos = 0;
process();
}
private void process() {
boolean justProcessedEquals = false;
while (pos < max) {
char ch = toProcess[pos];
if (justProcessedEquals) {
if (!isWhitespace(ch) && ch != 0) {
// following an '=' we commence a variant of regular tokenization,
// here we consume everything up to the next special char.
// This allows SpEL expressions to be used without quoting in many
// situations.
lexArgValueIdentifier();
}
justProcessedEquals = false;
continue;
}
if (isAlphabetic(ch) || isDigit(ch) || ch == '_') {
lexIdentifier();
}
else {
switch (ch) {
case '-':
if (!isTwoCharToken(TokenKind.DOUBLE_MINUS)) {
throw new StreamDefinitionException(
expressionString, pos,
XDDSLMessages.MISSING_CHARACTER, "-");
}
pushPairToken(TokenKind.DOUBLE_MINUS);
break;
case '=':
justProcessedEquals = true;
pushCharToken(TokenKind.EQUALS);
break;
case '&':
pushCharToken(TokenKind.AND);
break;
case '|':
pushCharToken(TokenKind.PIPE);
break;
case ' ':
case '\t':
case '\r':
// drift over white space
pos++;
break;
case '\n':
pushCharToken(TokenKind.NEWLINE);
break;
case '.':
pushCharToken(TokenKind.DOT);
break;
case '>':
pushCharToken(TokenKind.GT);
break;
case ':':
pushCharToken(TokenKind.COLON);
break;
case ';':
pushCharToken(TokenKind.SEMICOLON);
break;
case '\'':
lexQuotedStringLiteral();
break;
case '"':
lexDoubleQuotedStringLiteral();
break;
case '@':
pushCharToken(TokenKind.REFERENCE);
break;
case 0:
// hit sentinel at end of char data
pos++; // will take us to the end
break;
case '\\':
throw new StreamDefinitionException(
expressionString, pos, XDDSLMessages.UNEXPECTED_ESCAPE_CHAR);
default:
throw new StreamDefinitionException(
expressionString, pos, XDDSLMessages.UNEXPECTED_DATA,
Character.valueOf(ch).toString());
}
}
}
}
public List<Token> getTokens() {
return tokens;
}
/**
* Lex a string literal which uses single quotes as delimiters. To include a single quote within the literal, use a
* pair ''
*/
private void lexQuotedStringLiteral() {
int start = pos;
boolean terminated = false;
while (!terminated) {
pos++;
char ch = toProcess[pos];
if (ch == '\'') {
// may not be the end if the char after is also a '
if (toProcess[pos + 1] == '\'') {
pos++; // skip over that too, and continue
}
else {
terminated = true;
}
}
if (ch == 0) {
throw new StreamDefinitionException(
expressionString, start, XDDSLMessages.NON_TERMINATING_QUOTED_STRING);
}
}
pos++;
tokens.add(new Token(TokenKind.LITERAL_STRING,
subarray(start, pos), start, pos));
}
/**
* Lex a string literal which uses double quotes as delimiters. To include a single quote within the literal, use a
* pair ""
*/
private void lexDoubleQuotedStringLiteral() {
int start = pos;
boolean terminated = false;
while (!terminated) {
pos++;
char ch = toProcess[pos];
if (ch == '"') {
// may not be the end if the char after is also a "
if (toProcess[pos + 1] == '"') {
pos++; // skip over that too, and continue
}
else {
terminated = true;
}
}
if (ch == 0) {
throw new StreamDefinitionException(
expressionString, start, XDDSLMessages.NON_TERMINATING_DOUBLE_QUOTED_STRING);
}
}
pos++;
tokens.add(new Token(TokenKind.LITERAL_STRING,
subarray(start, pos), start, pos));
}
private void lexIdentifier() {
int start = pos;
do {
pos++;
}
while (isIdentifier(toProcess[pos]));
char[] subarray = subarray(start, pos);
tokens.add(new Token(TokenKind.IDENTIFIER, subarray, start, pos));
}
/**
* For the variant tokenizer (used following an '=' to parse an argument value) we only terminate that identifier if
* encountering a small set of characters. If the argument has included a ' to put something in quotes, we remember
* that and don't allow ' ' (space) and '\t' (tab) to terminate the value.
*/
private boolean isArgValueIdentifierTerminator(char ch, boolean quoteOpen) {
return (ch == '|' && !quoteOpen) || (ch == ';' && !quoteOpen) || ch == '\0' || (ch == ' ' && !quoteOpen)
|| (ch == '\t' && !quoteOpen) || (ch == '>' && !quoteOpen)
|| ch == '\r' || ch == '\n';
}
/**
* To prevent the need to quote all argument values, this identifier lexing function is used just after an '=' when
* we are about to digest an arg value. It is much more relaxed about what it will include in the identifier.
*/
private void lexArgValueIdentifier() {
// Much of the complexity in here relates to supporting cases like these:
// 'hi'+payload
// 'hi'+'world'
// In these situations it looks like a quoted string and that perhaps the entire
// argument value is being quoted, but in fact half way through it is discovered that the
// entire value is not quoted, only the first part of the argument value is a string literal.
int start = pos;
boolean quoteOpen = false;
int quoteClosedCount = 0; // Enables identification of this pattern: 'hello'+'world'
Character quoteInUse = null; // If set, indicates this is being treated as a quoted string
if (isQuote(toProcess[pos])) {
quoteOpen = true;
quoteInUse = toProcess[pos++];
}
do {
char ch = toProcess[pos];
if ((quoteInUse != null && ch == quoteInUse) || (quoteInUse == null && isQuote(ch))) {
if (quoteInUse != null && quoteInUse == '\'' && ch == '\'' && toProcess[pos + 1] == '\'') {
pos++; // skip over that too, and continue
}
else {
quoteOpen = !quoteOpen;
if (!quoteOpen) {
quoteClosedCount++;
}
}
}
pos++;
}
while (!isArgValueIdentifierTerminator(toProcess[pos], quoteOpen));
char[] subarray = null;
if (quoteClosedCount < 2 && sameQuotes(start, pos - 1)) {
tokens.add(new Token(TokenKind.LITERAL_STRING,
subarray(start, pos), start, pos));
}
else {
subarray = subarray(start, pos);
tokens.add(new Token(TokenKind.IDENTIFIER, subarray, start, pos));
}
}
private boolean sameQuotes(int pos1, int pos2) {
if (toProcess[pos1] == '\'') {
return toProcess[pos2] == '\'';
}
else if (toProcess[pos1] == '"') {
return toProcess[pos2] == '"';
}
return false;
}
private char[] subarray(int start, int end) {
char[] result = new char[end - start];
System.arraycopy(toProcess, start, result, 0, end - start);
return result;
}
/**
* Check if this might be a two character token.
*/
private boolean isTwoCharToken(TokenKind kind) {
Assert.isTrue(kind.tokenChars.length == 2);
Assert.isTrue(toProcess[pos] == kind.tokenChars[0]);
return toProcess[pos + 1] == kind.tokenChars[1];
}
/**
* Push a token of just one character in length.
*/
private void pushCharToken(TokenKind kind) {
tokens.add(new Token(kind, pos, pos + 1));
pos++;
}
/**
* Push a token of two characters in length.
*/
private void pushPairToken(TokenKind kind) {
tokens.add(new Token(kind, pos, pos + 2));
pos += 2;
}
// ID: ('a'..'z'|'A'..'Z'|'_'|'$') ('a'..'z'|'A'..'Z'|'_'|'$'|'0'..'9'|DOT_ESCAPED|'-')*;
private boolean isIdentifier(char ch) {
return isAlphabetic(ch) || isDigit(ch) || ch == '_' || ch == '$' || ch == '-';
}
private boolean isQuote(char ch) {
return ch == '\'' || ch == '"';
}
private boolean isWhitespace(char ch) {
return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
}
private boolean isDigit(char ch) {
if (ch > 255) {
return false;
}
return (flags[ch] & IS_DIGIT) != 0;
}
private boolean isAlphabetic(char ch) {
if (ch > 255) {
return false;
}
return (flags[ch] & IS_ALPHA) != 0;
}
private static final byte flags[] = new byte[256];
private static final byte IS_DIGIT = 0x01;
private static final byte IS_HEXDIGIT = 0x02;
private static final byte IS_ALPHA = 0x04;
static {
for (int ch = '0'; ch <= '9'; ch++) {
flags[ch] |= IS_DIGIT | IS_HEXDIGIT;
}
for (int ch = 'A'; ch <= 'F'; ch++) {
flags[ch] |= IS_HEXDIGIT;
}
for (int ch = 'a'; ch <= 'f'; ch++) {
flags[ch] |= IS_HEXDIGIT;
}
for (int ch = 'A'; ch <= 'Z'; ch++) {
flags[ch] |= IS_ALPHA;
}
for (int ch = 'a'; ch <= 'z'; ch++) {
flags[ch] |= IS_ALPHA;
}
}
@Override
public String toString() {
StringBuilder s = new StringBuilder();
s.append(this.expressionString).append("\n");
for (int i = 0; i < this.pos; i++) {
s.append(" ");
}
s.append("^\n");
s.append(tokens).append("\n");
return s.toString();
}
}