/*
* ModeShape (http://www.modeshape.org)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.modeshape.sequencer.ddl;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.modeshape.common.CommonI18n;
import org.modeshape.common.text.ParsingException;
import org.modeshape.common.text.Position;
import org.modeshape.common.text.TokenStream;
/**
* A TokenStream implementation designed around requirements for tokenizing and parsing DDL statements.
* <p>
* Because of the complexity of DDL, it was necessary to extend {@link TokenStream} in order to override the basic tokenizer to
* tokenize the in-line comments prefixed with "--". In addition, because there is not a default ddl command (or statement)
* terminator, an override method was added to {@link TokenStream} to allow re-tokenizing the initial tokens to re-type the
* tokens, remove tokens, or any other operation to simplify parsing.
* </p>
* <p>
* In this case, both reserved words (or key words) and statement start phrases can be registered before the
* {@link TokenStream}'s start() method is called. Any resulting tokens that match the registered string values will be
* re-typed to identify them as key words (DdlTokenizer.KEYWORD) or statement start phrases (DdlTokenizer.STATEMENT_KEY).
* </p>
*/
public class DdlTokenStream extends TokenStream {
/** Statement start phrases registered so far; each entry is the ordered array of words that make up one phrase. */
protected List<String[]> registeredStatementStartPhrases = new ArrayList<String[]>();
/** Registered key words; {@link #isKeyWord(String)} looks words up in this set after upper-casing them. */
protected Set<String> registeredKeyWords = new HashSet<String>();
/** Position recorded by the latest {@link #mark()} call; set to {@code null} when marking with no tokens remaining. */
private Position currentMarkedPosition = Position.EMPTY_CONTENT_POSITION;
/**
 * Re-types the freshly tokenized tokens so that registered key words and statement start phrases can be recognized while
 * parsing.
 * <p>
 * Every token whose value is a registered key word is re-typed as {@link DdlTokenizer#KEYWORD}. If that key word is also the
 * first word of a registered statement start phrase that matches the tokens at that position, it is additionally re-typed as
 * {@link DdlTokenizer#STATEMENT_KEY}, and the remaining tokens of the phrase are copied through unchanged. Copying them
 * without inspection matters for input such as "ALTER ROLLBACK": ROLLBACK is a statement start phrase in its own right and
 * must not be treated as one when it is merely part of a longer phrase.
 * </p>
 *
 * @param tokens the tokens produced by the tokenizer
 * @return a new list containing the same tokens, with key words and statement starts re-typed
 * @see org.modeshape.common.text.TokenStream#initializeTokens(java.util.List)
 */
@Override
protected List<Token> initializeTokens( List<Token> tokens ) {
    Token[] tokensArray = tokens.toArray(new Token[tokens.size()]);
    List<Token> reTypedTokens = new ArrayList<Token>(tokens.size());

    for (int i = 0; i < tokensArray.length; i++) {
        Token token = tokensArray[i];
        if (!isKeyWord(token.value())) {
            reTypedTokens.add(token);
            continue;
        }

        Token retypedToken = token.withType(DdlTokenizer.KEYWORD);
        int phraseLength = findStatementStartLength(tokensArray, i);
        if (phraseLength < 0) {
            // A key word that does not begin any registered statement start phrase
            reTypedTokens.add(retypedToken);
            continue;
        }

        reTypedTokens.add(retypedToken.withType(DdlTokenizer.STATEMENT_KEY));
        // Copy the remaining tokens of the phrase as-is and advance past them so they are not examined again
        for (int k = 1; k < phraseLength; k++) {
            i++;
            reTypedTokens.add(tokensArray[i]);
        }
    }
    return reTypedTokens;
}

/**
 * Finds the first registered statement start phrase (in registration order) that matches the tokens beginning at the given
 * index. Phrases that are longer than the number of remaining tokens cannot match and are skipped, which prevents reading
 * past the end of the token array.
 *
 * @param tokensArray all tokens
 * @param start index of the token at which the phrase would have to begin
 * @return the number of words in the matching phrase, or -1 if no registered phrase matches
 */
private int findStatementStartLength( Token[] tokensArray,
                                      int start ) {
    for (String[] phrase : registeredStatementStartPhrases) {
        if (start + phrase.length > tokensArray.length) {
            continue;
        }
        boolean matches = true;
        for (int j = 0; matches && j < phrase.length; j++) {
            matches = phrase[j].equalsIgnoreCase(tokensArray[start + j].value()) || phrase[j].equals(ANY_VALUE);
        }
        if (matches) {
            return phrase.length;
        }
    }
    return -1;
}
/**
 * Creates a DDL token stream; all three arguments are handed unchanged to the {@link TokenStream} constructor.
 *
 * @param content the text to be tokenized
 * @param tokenizer the tokenizer used to split the content into tokens
 * @param caseSensitive the case-sensitivity flag passed to the superclass
 */
public DdlTokenStream( String content, Tokenizer tokenizer, boolean caseSensitive ) {
    super(content, tokenizer, caseSensitive);
}
/**
 * Registers one phrase that marks the beginning of a DDL statement.
 * <p>
 * Example phrases: {"CREATE", "TABLE"} or {"CREATE", "OR", "REPLACE", "VIEW"}
 * </p>
 * See {@link DdlConstants} for the default SQL 92 representations.
 *
 * @param phrase the words of the phrase, in order
 */
public void registerStatementStartPhrase( String[] phrase ) {
    this.registeredStatementStartPhrases.add(phrase);
}
/**
 * Registers several statement start phrases at once, in array order.
 *
 * @param phrases the phrases to register; each element is the ordered words of one phrase
 */
public void registerStatementStartPhrase( String[][] phrases ) {
    registeredStatementStartPhrases.addAll(Arrays.asList(phrases));
}
/**
 * Adds one key word to the set of registered key words. The word is stored exactly as supplied.
 *
 * @param keyWord the key word to register
 */
public void registerKeyWord( String keyWord ) {
    this.registeredKeyWords.add(keyWord);
}
/**
 * Adds every key word in the supplied {@link List} to the set of registered key words.
 *
 * @param keyWords the key words to register
 */
public void registerKeyWords( List<String> keyWords ) {
    this.registeredKeyWords.addAll(keyWords);
}
/**
 * Adds every key word in the supplied array to the set of registered key words.
 *
 * @param keyWords the key words to register
 */
public void registerKeyWords( String[] keyWords ) {
    for (String keyWord : keyWords) {
        registeredKeyWords.add(keyWord);
    }
}
/**
 * Determines whether the supplied word is a registered key word. The word is upper-cased before the lookup, so the check
 * is case-insensitive with respect to key words that were registered in upper case.
 * <p>
 * The upper-casing uses {@link Locale#ROOT} rather than the JVM's default locale. With a default locale such as Turkish, a
 * lower-case 'i' would otherwise be converted to a dotted capital I and words like "index" would never match "INDEX".
 * </p>
 *
 * @param word the word to check
 * @return true if the upper-cased word is contained in the registered key words
 */
protected boolean isKeyWord( String word ) {
    return registeredKeyWords.contains(word.toUpperCase(Locale.ROOT));
}
/**
 * Checks whether the next token in the stream has the {@link DdlTokenizer#KEYWORD} type.
 *
 * @return true if the next token matches the KEYWORD token type
 */
public boolean isNextKeyWord() {
    final int keyWordType = DdlTokenizer.KEYWORD;
    return matches(keyWordType);
}
/**
 * Determines whether the upcoming tokens match one of the registered statement start phrases. Phrases are tried in
 * registration order and the first one that matches wins.
 *
 * @return the number of words in the first matching registered statement start phrase, or zero if the next token is not a
 *         key word or no phrase matches
 */
public int computeNextStatementStartKeywordCount() {
    if (!isNextKeyWord()) {
        return 0;
    }
    for (String[] phrase : registeredStatementStartPhrases) {
        if (matches(phrase)) {
            return phrase.length;
        }
    }
    return 0;
}
/**
 * Records the position (line and column number) of the next token as the marked position. When the stream has no further
 * tokens, the marked position is set to {@code null}.
 */
public void mark() {
    currentMarkedPosition = hasNext() ? nextPosition() : null;
}
/**
 * Returns the content that lies between the previously marked position and the position of the next token, and moves the
 * mark to that new position by calling {@link #mark()}.
 * <p>
 * Note: if the preceding {@link #mark()} call found no remaining tokens, the stored mark is {@code null} and this method
 * fails with a {@link NullPointerException}.
 * </p>
 *
 * @return the string content between the previous mark and the new mark
 */
public String getMarkedContent() {
    final Position previousMark = currentMarkedPosition;
    // Take a copy of the old mark before mark() replaces it
    final Position from = new Position(previousMark.getIndexInContent(), previousMark.getLine(), previousMark.getColumn());
    mark();
    final Position to = currentMarkedPosition;
    return getContentBetween(from, to);
}
/**
 * Factory method for a {@link DdlTokenizer}: a tokenizer that skips whitespace and produces tokens for individual symbols,
 * the period ('.'), single-quoted strings, double-quoted strings, whitespace-delimited words, and optionally comments.
 * <p>
 * The resulting tokenizer is offered purely as a convenience and may not be suitable for every situation.
 * </p>
 *
 * @param includeComments true if comments should be kept as tokens in the token stream, or false if they should be
 *        dropped
 * @return a new tokenizer; never null
 */
public static DdlTokenizer ddlTokenizer( boolean includeComments ) {
    final DdlTokenizer tokenizer = new DdlTokenizer(includeComments);
    return tokenizer;
}
public static class DdlTokenizer implements Tokenizer {
public static final String PARSER_ID = "PARSER_ID";
/**
* The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that represent an unquoted string
* containing a character sequence made up of non-whitespace and non-symbol characters.
*/
public static final int WORD = 1;
/**
* The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of an individual
* "symbol" character. The set of characters includes: <code>-(){}*,;+%?$[]!<>|=:</code>
*/
public static final int SYMBOL = 2;
/**
* The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of an individual '.'
* character.
*/
public static final int DECIMAL = 4;
/**
* The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of all the characters
* within single-quotes. Single quote characters are included if they are preceded (escaped) by a '\' character.
*/
public static final int SINGLE_QUOTED_STRING = 8;
/**
* The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of all the characters
* within double-quotes. Double quote characters are included if they are preceded (escaped) by a '\' character.
*/
public static final int DOUBLE_QUOTED_STRING = 16;
/**
* The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that consist of all the characters
* between "/*" and "*/", between "//" and the next line terminator (e.g., '\n', '\r' or "\r\n"), or between "--" and
* the next line terminator (e.g., '\n', '\r' or "\r\n").
*/
public static final int COMMENT = 32;
private final boolean useComments;
/**
* The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that represent key words or
* reserved words for a given DDL dialect.
* <p>
* Examples would be: "CREATE", "TABLE", "ALTER", "SCHEMA", "DROP", etc...
* </p>
* see {@link DdlConstants} for the default SQL 92 representations.
*/
public static final int KEYWORD = 64;
/**
* The {@link org.modeshape.common.text.TokenStream.Token#type() token type} for tokens that represent the start of a DDL
* statement.
* <p>
* Examples would be: {"CREATE", "TABLE"} {"CREATE", "OR", "REPLACE", "VIEW"}
* </p>
* see {@link DdlConstants} for the default SQL 92 representations.
*/
public static final int STATEMENT_KEY = 128;
public DdlTokenizer( boolean useComments ) {
this.useComments = useComments;
}
/**
* @return useComments
*/
public boolean includeComments() {
return useComments;
}
/**
* {@inheritDoc}
*
* @see org.modeshape.common.text.TokenStream.Tokenizer#tokenize(TokenStream.CharacterStream, TokenStream.Tokens)
*/
@Override
public void tokenize( CharacterStream input,
Tokens tokens ) throws ParsingException {
int startIndex;
int endIndex;
while (input.hasNext()) {
char c = input.next();
switch (c) {
case ' ':
case '\t':
case '\n':
case '\r':
// Just skip these whitespace characters ...
break;
// ==============================================================================================
// DDL Comments token = "--"
// ==============================================================================================
case '-': {
startIndex = input.index();
Position startPosition = input.position(startIndex);
if (input.isNext('-')) {
// -- END OF LINE comment ...
boolean foundLineTerminator = false;
while (input.hasNext()) {
c = input.next();
if (c == '\n' || c == '\r') {
foundLineTerminator = true;
break;
}
}
endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
if (!foundLineTerminator) ++endIndex; // must point beyond last char
if (c == '\r' && input.isNext('\n')) input.next();
// Check for PARSER_ID
if (useComments) {
tokens.addToken(startPosition, startIndex, endIndex, COMMENT);
}
} else {
// just a regular dash ...
tokens.addToken(startPosition, startIndex, startIndex + 1, SYMBOL);
}
break;
}
// ==============================================================================================
case '(':
case ')':
case '{':
case '}':
case '*':
case ',':
case ';':
case '+':
case '%':
case '?':
case '[':
case ']':
case '!':
case '<':
case '>':
case '|':
case '=':
case ':':
tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, SYMBOL);
break;
case '.':
tokens.addToken(input.position(input.index()), input.index(), input.index() + 1, DECIMAL);
break;
case '\"':
startIndex = input.index();
Position startingPosition = input.position(startIndex);
boolean foundClosingQuote = false;
while (input.hasNext()) {
c = input.next();
if ((c == '\\' || c == '"') && input.isNext('"')) {
c = input.next(); // consume the ' character since it is escaped
} else if (c == '"') {
foundClosingQuote = true;
break;
}
}
if (!foundClosingQuote) {
String msg = CommonI18n.noMatchingDoubleQuoteFound.text(startingPosition.getLine(),
startingPosition.getColumn());
throw new ParsingException(startingPosition, msg);
}
endIndex = input.index() + 1; // beyond last character read
tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING);
break;
case '\u2019': // '’':
case '\'':
char quoteChar = c;
startIndex = input.index();
startingPosition = input.position(startIndex);
foundClosingQuote = false;
while (input.hasNext()) {
c = input.next();
if ((c == '\\' || c == quoteChar) && input.isNext(quoteChar)) {
c = input.next(); // consume the ' character since it is escaped
} else if (c == quoteChar) {
foundClosingQuote = true;
break;
}
}
if (!foundClosingQuote) {
String msg = CommonI18n.noMatchingSingleQuoteFound.text(startingPosition.getLine(),
startingPosition.getColumn());
throw new ParsingException(startingPosition, msg);
}
endIndex = input.index() + 1; // beyond last character read
tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING);
break;
case '/':
startIndex = input.index();
startingPosition = input.position(startIndex);
if (input.isNext('/')) {
// End-of-line comment ...
boolean foundLineTerminator = false;
while (input.hasNext()) {
c = input.next();
if (c == '\n' || c == '\r') {
foundLineTerminator = true;
break;
}
}
endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
if (!foundLineTerminator) ++endIndex; // must point beyond last char
if (c == '\r' && input.isNext('\n')) input.next();
if (useComments) {
tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
}
} else if (input.isNext('*')) {
// Multi-line comment ...
while (input.hasNext() && !input.isNext('*', '/')) {
c = input.next();
}
if (input.hasNext()) input.next(); // consume the '*'
if (input.hasNext()) input.next(); // consume the '/'
endIndex = input.index() + 1; // the token will include the '/' and '*' characters
if (useComments) {
tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
}
} else {
// just a regular slash ...
tokens.addToken(startingPosition, startIndex, startIndex + 1, SYMBOL);
}
break;
default:
startIndex = input.index();
Position startPosition = input.position(startIndex);
// Read until another whitespace/symbol/decimal/slash is found
while (input.hasNext() && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?[]!<>|=:"))) {
c = input.next();
}
endIndex = input.index() + 1; // beyond last character that was included
tokens.addToken(startPosition, startIndex, endIndex, WORD);
}
}
}
}
}