Tokenizer.java example

Explorer
tdq-studio-se-master
/*
 * Copyright (C) 2007 SQL Explorer Development Team
 * http://sourceforge.net/projects/eclipsesql
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package net.sourceforge.sqlexplorer.parsers;

import net.sourceforge.sqlexplorer.util.BackedCharSequence;

/**
 * Tokenizer is a utility class for reading and tokenising a string; note that this
 * is not restricted to SQL - it is also used for tokenising structured comments.
 *
 * <p>The tokenizer keeps a cursor into the text together with the line number and
 * column of that cursor, so that every {@link Token} (and every
 * {@link ParserException}) can report where it starts.  Columns are 1-based and
 * restart after every <code>'\n'</code>.</p>
 *
 * @author John Spackman
 */
public class Tokenizer {

	/**
	 * Types of token which can be returned
	 */
	public enum TokenType {
		WORD, NUMBER, QUOTED, PUNCTUATION, EOL_COMMENT, ML_COMMENT
	}

	/**
	 * A token: a slice of the tokenizer's buffer plus its type and the
	 * line/column at which it starts.
	 */
	public static class Token extends BackedCharSequence {
		private final TokenType tokenType;
		private final int lineNo;
		private final int charNo;

		/**
		 * @param buffer the buffer the token is a slice of
		 * @param tokenType the kind of token; must not be null
		 * @param start index of the first character of the token
		 * @param end index one past the last character of the token
		 * @param lineNo line number on which the token starts
		 * @param charNo column at which the token starts
		 * @throws IllegalArgumentException if tokenType is null
		 */
		public Token(StringBuffer buffer, TokenType tokenType, int start, int end, int lineNo, int charNo) {
			super(buffer, start, end);
			if (tokenType == null) {
				throw new IllegalArgumentException("tokenType must not be null");
			}
			this.tokenType = tokenType;
			this.lineNo = lineNo;
			this.charNo = charNo;
		}

		/**
		 * @return the tokenType
		 */
		public TokenType getTokenType() {
			return tokenType;
		}

		/**
		 * @return the lineNo
		 */
		public int getLineNo() {
			return lineNo;
		}

		/**
		 * @return the charNo
		 */
		public int getCharNo() {
			return charNo;
		}

		/**
		 * Returns the value without quotes, if applicable.  Only the opening and
		 * closing quote characters are dropped; doubled (escaped) quotes inside
		 * the string are left exactly as they appear in the buffer.
		 * @return for QUOTED tokens, the text between the delimiting quotes;
		 * 		for every other token type, this token itself
		 */
		public CharSequence getUnquotedValue() {
			if (tokenType == TokenType.QUOTED) {
				return new BackedCharSequence(buffer, start + 1, end - 1);
			}
			return this;
		}
	}

	// The SQL being parsed
	private StringBuffer sql;

	// Where in sql to look for the next token
	private int nextToken;

	// Initial line number
	private int initialLineNo;

	// Line number of the character at nextToken
	private int lineNo;

	// Number of characters already consumed on the current line; the character
	//	at nextToken is therefore at column charNo + 1
	private int charNo;

	/**
	 * Constructor; the tokenizer will work on sql.  If sql is a StringBuffer
	 * it will use it as is, otherwise it will duplicate it into its own
	 * StringBuffer
	 * @param sql the text to tokenize
	 */
	public Tokenizer(CharSequence sql) {
		if (sql instanceof StringBuffer) {
			this.sql = (StringBuffer) sql;
		} else {
			this.sql = new StringBuffer(sql);
		}
		nextToken = 0;
		lineNo = initialLineNo = 1;
		charNo = 0;
	}

	/**
	 * Resets the tokenizer to the start of the buffer
	 */
	public void reset() {
		nextToken = 0;
		lineNo = initialLineNo;
		charNo = 0;
	}

	/**
	 * Gets the remaining, untokenized part of the string
	 * @return a view of the buffer from the current position to its end
	 */
	public BackedCharSequence getRemainder() {
		return new BackedCharSequence(sql, nextToken, sql.length());
	}

	/**
	 * Generates a token which consists of everything from the current
	 * position up to, but not including, the next end of line character (or
	 * the end of the buffer if there is none).  The end of line character is
	 * left in the buffer; it is skipped as whitespace by the following call
	 * to {@link #nextToken()}, which is also where it gets counted as a line
	 * break.
	 * @return a WORD token covering the rest of the line, positioned at the
	 * 		line/column where the skip started; it may be empty
	 */
	public Token skipToEOL() {
		int start = nextToken;
		int startLineNo = lineNo;
		int startCharNo = charNo + 1;

		int length = sql.length();
		while (nextToken < length && sql.charAt(nextToken) != '\n') {
			nextToken++;
		}

		// No newline was consumed, so only the column moves
		advancePosition(start, nextToken);

		// Return the token
		return new Token(sql, TokenType.WORD, start, nextToken, startLineNo, startCharNo);
	}

	/**
	 * Scans looking for the next token, returning null if there are no more.
	 * Whitespace is not a token, but comments are.
	 * @return the next token, or null if only whitespace (or nothing) remains
	 * @throws ParserException if a quoted string or a multi-line comment is
	 * 		still open when the end of the buffer is reached; the exception
	 * 		carries the line/column at which the offending token starts
	 */
	public Token nextToken() throws ParserException {
		final int length = sql.length();
		final int scanStart = nextToken;

		TokenType tokenType = null;
		char currentQuote = 0;
		boolean commentClosed = false;
		int start = nextToken;

		for (; nextToken < length; nextToken++) {
			char c = sql.charAt(nextToken);

			// Look-ahead character; 0 when c is the last character in the buffer.
			//	It is only ever compared against quote/comment characters, so the
			//	0 sentinel can never produce a false match.
			char nextC = (nextToken < length - 1) ? sql.charAt(nextToken + 1) : 0;

			// Quotes
			if (c == '\'' || c == '\"') {
				// Unless it's in a comment
				if (tokenType == TokenType.EOL_COMMENT || tokenType == TokenType.ML_COMMENT) {
					continue;
				}

				// Not in between quotes but we are already on a token?  Then stop
				if (tokenType != null && tokenType != TokenType.QUOTED) {
					break;
				}

				// First token?  Then start eating characters
				if (tokenType == null) {
					tokenType = TokenType.QUOTED;
					currentQuote = c;
					continue;
				}

				// We're currently eating a quoted string; only the quote that
				//	started it can finish it
				if (c != currentQuote) {
					continue;
				}

				// If the next char is the same quote, then it's an escapement
				//	EG:		'that''s mine'
				if (nextC == currentQuote) {
					nextToken++;
					continue;
				}

				// End of quote; move to the character following and stop
				nextToken++;
				currentQuote = 0;
				break;
			}

			// In a quote? Carry on eating
			if (tokenType == TokenType.QUOTED) {
				continue;
			}

			// Comments up to the end of the line stop at (but do not include) the EOL
			if (tokenType == TokenType.EOL_COMMENT) {
				if (c == '\n') {
					break;
				}
				continue;
			}

			// Multi-line comments run until the closing "*/", which is part of the token
			if (tokenType == TokenType.ML_COMMENT) {
				if (c == '*' && nextC == '/') {
					nextToken += 2;
					commentClosed = true;
					break;
				}
				continue;
			}

			// Start of a comment?
			TokenType nextType = null;
			if ((c == '-' && nextC == '-') || (c == '/' && nextC == '/')) {
				nextType = TokenType.EOL_COMMENT;
			} else if (c == '/' && nextC == '*') {
				nextType = TokenType.ML_COMMENT;
			}
			if (nextType != null) {
				// We're in a token already so exit to mark the end of that token
				if (tokenType != null) {
					break;
				}

				// Start the comment, consuming both characters of the opener so
				//	that the '*' of "/*" cannot double as the '*' of "*/" (i.e.
				//	"/*/" is an unterminated comment, not an empty one)
				tokenType = nextType;
				nextToken++;
				continue;
			}

			// Continuing a word
			if (tokenType == TokenType.WORD) {
				if (!isIdentifier(c)) {
					break;
				}
				continue;
			}

			// Starting a word
			if (tokenType == null && isFirstIdentifier(c)) {
				tokenType = TokenType.WORD;
				continue;
			}

			// A Number
			if (Character.isDigit(c)) {
				// If we're not in a token yet, OR: if we'd started punctuation but it's
				//	just a dot, then convert to a decimal (missing zero prefix)
				boolean afterLoneDot = tokenType == TokenType.PUNCTUATION
						&& nextToken - start == 1 && sql.charAt(start) == '.';
				if (tokenType == null || afterLoneDot) {
					tokenType = TokenType.NUMBER;
					continue;
				}

				// Already on a number?  Then carry on eating
				if (tokenType == TokenType.NUMBER) {
					continue;
				}

				break;
			} else if (c == '.' && tokenType == TokenType.NUMBER) {
				continue;
			}

			// If it's an identifier but we're not on a word, then stop
			if (isIdentifier(c) && tokenType != TokenType.WORD) {
				break;
			}

			// Whitespace
			if (Character.isWhitespace(c)) {
				// If we're in a token, then stop
				if (tokenType != null) {
					break;
				}

				// Skip the whitespace
				start++;
				continue;
			}

			// Anything else is punctuation, which is always only one character: it
			//	ends whatever token we're on, or starts a new one
			if (tokenType != null) {
				break;
			}
			tokenType = TokenType.PUNCTUATION;
		}

		// Bring the line/column up to date: first over the skipped whitespace, which
		//	gives the position at which the token starts, then over the token itself.
		//	Only characters which were really consumed are counted, so a character
		//	which merely terminated the token is not counted twice.
		advancePosition(scanStart, start);
		int startLineNo = lineNo;
		int startCharNo = charNo + 1;
		advancePosition(start, nextToken);

		// Check for unterminated strings and comments
		if (currentQuote != 0) {
			throw new ParserException("Unterminated string literal", startLineNo, startCharNo);
		}
		if (tokenType == TokenType.ML_COMMENT && !commentClosed) {
			throw new ParserException("Unterminated multi-line comment", startLineNo, startCharNo);
		}

		// Nothing found?
		if (tokenType == null) {
			if (nextToken < length) {
				throw new RuntimeException("Internal error: could not find a token but buffer is not exhausted");
			}
			return null;
		}

		// Return the token
		return new Token(sql, tokenType, start, nextToken, startLineNo, startCharNo);
	}

	/**
	 * Updates lineNo and charNo to account for the characters in sql between
	 * from (inclusive) and to (exclusive) having been consumed.
	 * @param from index of the first consumed character
	 * @param to index one past the last consumed character
	 */
	private void advancePosition(int from, int to) {
		for (int i = from; i < to; i++) {
			if (sql.charAt(i) == '\n') {
				lineNo++;
				charNo = 0;
			} else {
				charNo++;
			}
		}
	}

	/**
	 * Returns true if c is suitable as the first character of an identifier; it
	 * must be a letter or an underscore
	 * @param c the character to test
	 * @return true if it's suitable as the first character of an identifier
	 */
	private static boolean isFirstIdentifier(char c) {
		return Character.isLetter(c) || c == '_';
	}

	/**
	 * Returns true if c is suitable as a subsequent character of an identifier;
	 * it must be a letter, an underscore or a digit
	 * @param c the character to test
	 * @return true if it's suitable as a subsequent character of an identifier
	 */
	private static boolean isIdentifier(char c) {
		return Character.isDigit(c) || isFirstIdentifier(c);
	}

	/**
	 * Sets the initial line number - the line number that the first line of
	 * text is perceived to be on.  Note that this also sets the current line
	 * number to the same value.
	 * @param initialLineNo the line number to report for the first line
	 */
	public void setInitialLineNo(int initialLineNo) {
		this.initialLineNo = initialLineNo;
		this.lineNo = initialLineNo;
	}

}