/*
* Copyright (C) 2007 SQL Explorer Development Team
* http://sourceforge.net/projects/eclipsesql
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package net.sourceforge.sqlexplorer.parsers;
import net.sourceforge.sqlexplorer.util.BackedCharSequence;
/**
* Tokenizer is a utility class for reading and tokenising a string; note that this
* is not restricted to SQL - it is also used for tokenising structured comments.
* @author John Spackman
*/
public class Tokenizer {
/*
* Types of token which can be returned
*/
public enum TokenType {
WORD, NUMBER, QUOTED, PUNCTUATION, EOL_COMMENT, ML_COMMENT
};
/*
* Tokens
*/
public static class Token extends BackedCharSequence {
private TokenType tokenType;
private int lineNo;
private int charNo;
public Token(StringBuffer buffer, TokenType tokenType, int start, int end, int lineNo, int charNo) {
super(buffer, start, end);
if (tokenType == null)
throw new IllegalArgumentException();
this.tokenType = tokenType;
this.lineNo = lineNo;
this.charNo = charNo;
}
/**
* @return the tokenType
*/
public TokenType getTokenType() {
return tokenType;
}
/**
* @return the lineNo
*/
public int getLineNo() {
return lineNo;
}
/**
* @return the charNo
*/
public int getCharNo() {
return charNo;
}
/**
* Returns the value without quotes, if applicable
* @return
*/
public CharSequence getUnquotedValue() {
if (tokenType == TokenType.QUOTED)
return new BackedCharSequence(buffer, start + 1, end - 1);
return this;
}
}
// The SQL being parsed
private StringBuffer sql;
// Where in sql to look for the next token
private int nextToken;
// Initial line number
private int initialLineNo;
// Current line number
private int lineNo;
// Current position within the line
private int charNo;
/**
* Constructor; the tokenizer will work on sql. If sql is a StringBuffer
* it will use it as is, otherwise it will duplicate it into its own
* StringBuffer
* @param sql
*/
public Tokenizer(CharSequence sql) {
super();
if (sql instanceof StringBuffer)
this.sql = (StringBuffer)sql;
else
this.sql = new StringBuffer(sql);
lineNo = initialLineNo = 1;
charNo = 0;
}
/**
* Resets the tokenizer to the start of the buffer
*/
public void reset() {
nextToken = 0;
lineNo = initialLineNo;
charNo = 0;
}
/**
* Gets the remaining, untokenized part of the string
* @return
*/
public BackedCharSequence getRemainder() {
return new BackedCharSequence(sql, nextToken, sql.length());
}
/**
* Generates a token which consists of everything from the current
* position up to and including the end of line character
* @return
*/
public Token skipToEOL() {
int start = nextToken;
for (; nextToken < sql.length(); nextToken++) {
char c = sql.charAt(nextToken);
if (c == '\n') {
lineNo++;
charNo = 1;
break;
}
}
// Return the token
return new Token(sql, TokenType.WORD, start, nextToken, lineNo, charNo);
}
/**
* Scans looking for the next token, returning null if there are no more.
* Whitespace is not a token, but comments are.
* @return
*/
public Token nextToken() throws ParserException {
TokenType tokenType = null;
char currentQuote = 0;
int startCharNo = charNo + 1;
int startLineNo = lineNo;
int start = nextToken;
char c = 0;
char nextC = 0;
for (; nextToken < sql.length(); nextToken++) {
c = sql.charAt(nextToken);
if (c == '\n') {
lineNo++;
charNo = 0;
// No token type? then we're skipping whitespace so adjust
if (tokenType == null) {
startCharNo = charNo + 1;
startLineNo = lineNo;
}
}
charNo++;
TokenType nextType = null;
// Quotes
if (c == '\'' || c == '\"') {
// Unless it's in a comment
if (tokenType == TokenType.EOL_COMMENT || tokenType == TokenType.ML_COMMENT)
continue;
// Not in between quotes but we are already on a token? Then stop
if (tokenType != TokenType.QUOTED && tokenType != null)
break;
// First token? Then start eating characters
if (tokenType == null) {
tokenType = TokenType.QUOTED;
currentQuote = c;
continue;
// Else if we're currently eating a quoted string
} else if (tokenType == TokenType.QUOTED) {
// It's got to be the same as the one that started it
if (c != currentQuote)
continue;
// If the next char is the same quote, then it's an escapement
// EG: 'that''s mine'
if (nextToken < sql.length() -1 && sql.charAt(nextToken + 1) == currentQuote) {
// Skip
nextToken++;
charNo++;
continue;
}
// End of quote; move to the character following and stop
nextToken++;
charNo++;
currentQuote = 0;
break;
}
}
// In a quote? Carry on eating
if (tokenType == TokenType.QUOTED)
continue;
// Check for comments up to the end of the line
if (tokenType == TokenType.EOL_COMMENT) {
// EOL? Then we're done
if (c == '\n')
break;
continue;
}
// If there's at least 2 characters left to check
if (nextToken < sql.length() - 1) {
nextC = sql.charAt(nextToken + 1);
// If we're in a multi-line comment, check for the end of the comment
if (tokenType == TokenType.ML_COMMENT) {
// If we're at the end, skip to the character after the comment and stop
if (c == '*' && nextC == '/') {
nextToken += 2;
charNo += 2;
break;
}
// Still eating the comment
continue;
}
// Single line comment
if ((c == '-' && nextC == '-') || (c == '/' && nextC == '/'))
nextType = TokenType.EOL_COMMENT;
// Multi-line comment
else if (c == '/' && nextC == '*')
nextType = TokenType.ML_COMMENT;
// Found something?
if (nextType != null) {
// If we're yet in a token, then start the token and start eating
if (tokenType == null) {
tokenType = nextType;
continue;
}
// Else we're in a token already so exit to mark the end of that token
break;
}
} else
nextC = 0;
// Continuing a word
if (tokenType == TokenType.WORD) {
if (!isIdentifier(c))
break;
continue;
}
// Starting a word
if (tokenType == null && isFirstIdentifier(c)) {
tokenType = TokenType.WORD;
continue;
}
// A Number
if (Character.isDigit(c)) {
// If we're not in a token yet, OR: if we'd started punctuation but it's just a dot, then convert to a decimal (missing zero prefix)
if (tokenType == null || (tokenType == TokenType.PUNCTUATION && sql.substring(start, nextToken).equals("."))) {
tokenType = TokenType.NUMBER;
continue;
// Already on a number? Then carry on eating
} else if (tokenType == TokenType.NUMBER)
continue;
break;
} else if (c == '.' && tokenType == TokenType.NUMBER) {
continue;
}
// If it's an identifier but we're not on a word, then stop
if (isIdentifier(c) && tokenType != TokenType.WORD)
break;
// Whitespace
if (Character.isWhitespace(c)) {
// If we're in a token, then stop
if (tokenType != null)
break;
// Skip the whitespace
start++;
continue;
}
// If we're already on punctuation, stop there (punctuation is always only one character)
if (tokenType == TokenType.PUNCTUATION)
break;
// Anything else is considered punctuation
if (tokenType != null && tokenType != TokenType.PUNCTUATION)
break;
if (tokenType == null)
tokenType = TokenType.PUNCTUATION;
}
// Check for unterminated strings
if (currentQuote != 0)
throw new ParserException("Unterminated string literal", startLineNo, startCharNo);
if (tokenType == TokenType.ML_COMMENT && (c != '*' | nextC != '/'))
throw new ParserException("Unterminated multi-line comment", startLineNo, startCharNo);
// Nothing found?
if (tokenType == null) {
if (nextToken < sql.length())
throw new RuntimeException("Internal error: could not find a token but buffer is not exhausted");
return null;
// If we found a token but the last character found was a CR, then we have to reduce
// the line count because the CR will be the first one read next time around.
} else if (c == '\n') {
lineNo--;
}
// Return the token
return new Token(sql, tokenType, start, nextToken, startLineNo, startCharNo);
}
/**
* Returns true if c is suitable as the first character of an identifier; it
* must be a character or an underscore
* @param c
* @return true if its suitable as an identifier
*/
private boolean isFirstIdentifier(char c) {
return Character.isLetter(c) || c == '_';
}
/**
* Returns true if c is suitable a subsequent character of an identifier; it
* must be a character or an underscore or digits
* @param c
* @return
*/
private boolean isIdentifier(char c) {
return Character.isDigit(c) || isFirstIdentifier(c);
}
/**
* Sets the inital line number - the line number that the first line of
* text is percieved to be on
* @param initialLineNo
*/
public void setInitialLineNo(int initialLineNo) {
this.initialLineNo = initialLineNo;
this.lineNo = initialLineNo;
}
}