package com.delcyon.capo.parsers;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
/**
 * A small, configurable tokenizer that reads bytes from an {@link InputStream}
 * and splits them into tokens according to a per-character type table.
 * <p>
 * Every byte value (0-255) carries a bit mask built from {@link CharacterType}
 * values. {@link #nextToken()} consumes the stream one byte at a time and uses
 * that table to decide where tokens start and end; the text of the token is
 * then available from {@link #getValue()} and its type from
 * {@link #getTokenType()}.
 * <p>
 * Instances hold mutable parsing state and contain no synchronization.
 *
 * @author jeremiah
 */
public class Tokenizer
{
    /**
     * These values are used to tokenize the token stream as it is read.
     * Each constant (except {@link #TOKEN}) owns one bit of a byte-sized mask,
     * so several types can be combined on a single character.
     */
    public enum CharacterType
    {
        //There is a maximum of 8 possibilities, or the bitmask math won't work
        /** used to indicate a char that will force delimitation, but will still be included as the next token; its mask is 0, i.e. "no bits set" **/
        TOKEN(0),
        /** sets a white space char **/
        WHITESPACE(1),
        /** used to indicate a plain text char, i.e. part of a token, not a complete token unto itself. **/
        ALPHA(1 << 1),
        /** used to indicate a quote char **/
        QUOTE(1 << 2),
        /** used to indicate a comment char **/
        COMMENT(1 << 3),
        /** used to indicate the start of an escape sequence for non ALPHA chars. **/
        ESCAPE(1 << 4),
        /** used to indicate an EOL char **/
        EOL(1 << 5);

        /**
         * bit mask of this character type
         */
        public byte mask = 0;

        private CharacterType(int mask)
        {
            this.mask = (byte) mask;
        }
    }

    /**
     * The kinds of result {@link #nextToken()} can report. The negative
     * {@link #value} of each constant never collides with a byte read from the
     * stream (0-255), which lets a single int carry either a character or one
     * of these markers.
     */
    public enum TokenType
    {
        NOTHING(-5),
        TOKEN(-4),
        OTHER(-3),
        EOL(-2),
        EOF(-1);

        public int value;

        private TokenType(int intValue)
        {
            this.value = intValue;
        }
    }

    /**
     * Mutable holder pairing the type of the most recent token with the raw
     * int (character or negative marker) that produced it.
     */
    private final class InternalTokenType
    {
        public TokenType tokenType = null;
        public int tokenValue = TokenType.NOTHING.value;

        public InternalTokenType(TokenType tokenType)
        {
            this.tokenType = tokenType;
            this.tokenValue = tokenType.value;
        }

        /**
         * Stores both the type and the raw value, returning the type.
         */
        public TokenType setTokenType(TokenType tokenType, int tokenValue)
        {
            this.tokenType = tokenType;
            this.tokenValue = tokenValue;
            return tokenType;
        }

        /**
         * Stores the raw value and derives the type from it: a negative value
         * maps to the {@link TokenType} with the same number (the type is left
         * unchanged when none matches), a character flagged as EOL maps to
         * {@link TokenType#EOL}, anything else to {@link TokenType#OTHER}.
         */
        public TokenType setTokenValue(int tokenValue)
        {
            this.tokenValue = tokenValue;
            if (tokenValue < 0)
            {
                for (TokenType candidate : TokenType.values())
                {
                    if (candidate.value == tokenValue)
                    {
                        this.tokenType = candidate;
                        return candidate;
                    }
                }
            }
            else if (tokenValue < 256 && (characterTypes[tokenValue] & CharacterType.EOL.mask) != 0)
            {
                tokenType = TokenType.EOL;
            }
            else
            {
                tokenType = TokenType.OTHER;
            }
            return tokenType;
        }
    }

    /** character type bit masks, indexed by byte value */
    private final byte[] characterTypes = new byte[256];
    private BufferedInputStream reader = null;
    private String value = null;
    int currentChar = -4;
    /** the quote char that opened the current quoted section, or a negative value when not quoting */
    int currentQuoteChar = -1;
    private boolean pushedBack = false;
    long currentPosition = -1L;
    private InternalTokenType internalTokenTypeHolder = new InternalTokenType(TokenType.NOTHING);
    private boolean isEOLSignificant = false;

    /**
     * default constructor. This will set a number of basic char types. resetSyntax() should be called if you want a blank slate.
     * No input stream is attached; call {@link #setInputStream(InputStream)} before {@link #nextToken()}.
     */
    public Tokenizer()
    {
        setCharRangeType('a', 'z', CharacterType.ALPHA);
        setCharRangeType('A', 'Z', CharacterType.ALPHA);
        setCharRangeType(128 + 32, 255, CharacterType.ALPHA);
        setCharRangeType(0, ' ', CharacterType.WHITESPACE);
        setCharType('/', CharacterType.COMMENT);
        setCharType('"', CharacterType.QUOTE);
        setCharType('\'', CharacterType.QUOTE);
        setCharType('\n', CharacterType.EOL);
        setCharType('\r', CharacterType.EOL);
    }

    /**
     * Creates a tokenizer with the default character types reading from the given stream.
     * @param inputStream the stream to tokenize
     */
    public Tokenizer(InputStream inputStream)
    {
        this();
        // a one byte buffer is used so that no more than needed is pulled from the underlying stream
        reader = new BufferedInputStream(inputStream, 1);
    }

    /**
     * will set the input stream to be read, and reset all per-stream parsing state
     * (current value, push back, quote state, position and last token type).
     * The character type table is left untouched.
     * @param inputStream
     */
    public void setInputStream(InputStream inputStream)
    {
        this.reader = new BufferedInputStream(inputStream, 1);
        //reset everything
        value = null;
        pushedBack = false;
        currentChar = -4;
        currentQuoteChar = -1; //an unterminated quote in the previous stream must not leak into this one
        currentPosition = -1L;
        internalTokenTypeHolder = new InternalTokenType(TokenType.NOTHING);
    }

    /**
     * Sets every char to the TOKEN control type
     */
    public void resetSyntax()
    {
        Arrays.fill(characterTypes, CharacterType.TOKEN.mask);
    }

    /**
     * Set a particular character to a characterType. Types accumulate: the new
     * type's bit is OR-ed into the existing mask, except for
     * {@link CharacterType#TOKEN}, which clears every bit.
     * @param character byte value (0-255) to configure
     * @param characterType
     */
    public void setCharType(int character, CharacterType characterType)
    {
        applyCharType(character, characterType);
    }

    /**
     * Sets a range of characters to a characterType 'A','Z',ALPHA for example.
     * Bounds outside 0-255 are clamped to that range.
     * @param lowChar first character of the range (inclusive)
     * @param highChar last character of the range (inclusive)
     * @param characterType
     */
    public void setCharRangeType(int lowChar, int highChar, CharacterType characterType)
    {
        int first = Math.max(lowChar, 0);
        int last = Math.min(highChar, characterTypes.length - 1);
        for (int character = first; character <= last; character++)
        {
            applyCharType(character, characterType);
        }
    }

    /** Applies one character type to one table slot: a zero mask replaces, any other mask is OR-ed in. */
    private void applyCharType(int character, CharacterType characterType)
    {
        if (characterType.mask == 0)
        {
            characterTypes[character] = characterType.mask;
        }
        else
        {
            characterTypes[character] |= characterType.mask;
        }
    }

    /**
     * determines if EOL will be returned as a separate token
     * @param isEOLSignificant
     */
    public void setEOLSignificant(boolean isEOLSignificant)
    {
        this.isEOLSignificant = isEOLSignificant;
    }

    /**
     * @return the type of token that nextToken just read
     */
    public TokenType getTokenType()
    {
        return internalTokenTypeHolder.tokenType;
    }

    /**
     * @return the value that nextToken just read, or null when the last result carried no text (EOL, EOF)
     */
    public String getValue()
    {
        return value;
    }

    /**
     * @return false once an EOF has been returned by nextToken, true otherwise.
     *         The EOF itself counts as the final token, so this stays true until it has been read.
     * @throws IOException declared for callers; this implementation does not touch the stream
     */
    public boolean hasMore() throws IOException
    {
        return internalTokenTypeHolder.tokenType != TokenType.EOF;
    }

    /**
     * This is the main method of this class. Each call to it will return the next token type from the stream, and make available getValue() and getTokenType() for use.
     * @return the type of the token just read
     * @throws IllegalStateException if no input stream has been set
     * @throws Exception on a read failure or a malformed escape sequence
     */
    public TokenType nextToken() throws Exception
    {
        value = null;

        if (pushedBack) //in theory, push back will only occur when we read a significant control char, or token
        {
            pushedBack = false;
            //if we're not a control char, then turn us into a token with a value.
            if (internalTokenTypeHolder.setTokenValue(currentChar) == TokenType.OTHER)
            {
                value = String.valueOf((char) currentChar);
                return internalTokenTypeHolder.setTokenType(TokenType.TOKEN, currentChar);
            }
            return internalTokenTypeHolder.tokenType;
        }

        if (reader == null)
        {
            throw new IllegalStateException("No input stream has been set");
        }

        StringBuilder tokenBuffer = new StringBuilder();
        while (true)
        {
            currentChar = reader.read();
            currentPosition++;

            //check for EOF
            if (currentChar < 0)
            {
                if (tokenBuffer.length() == 0) //if we don't have data, just return
                {
                    return internalTokenTypeHolder.setTokenValue(currentChar);
                }
                //if we do, return it, and push back for the next call
                pushedBack = true;
                break;
            }

            int charType = currentChar < 256 ? characterTypes[currentChar] : CharacterType.ALPHA.mask;

            //check for EOL
            if ((charType & CharacterType.EOL.mask) != 0)
            {
                //check to see if this is the second EOL in a row, and whether or not it's the same char as last time.
                //because eol chars can be paired up like on windows, if it's two different eol chars in a row, consume them, making them appear as a single EOL char.
                if (internalTokenTypeHolder.tokenType == TokenType.EOL && internalTokenTypeHolder.tokenValue != currentChar)
                {
                    continue;
                }
                if (isEOLSignificant) //check to see if we care about EOL chars
                {
                    //if we do, and we have no data, just return it.
                    if (tokenBuffer.length() == 0)
                    {
                        return internalTokenTypeHolder.setTokenValue(currentChar);
                    }
                    //otherwise return our data, and then let the system know, that on the next read, we have to fake it.
                    pushedBack = true;
                    break;
                }
                //we just need to eat EOL's like any other whitespace
                if (tokenBuffer.length() == 0) //if we have no data, just keep reading
                {
                    continue;
                }
                break; //if we do have data, return it.
            }

            //check for an escape symbol
            //There are two kinds of escapes, one that turns a TOKEN into an ALPHA
            //and one that actually un-escapes some sort of escape code like \n
            if ((charType & CharacterType.ESCAPE.mask) != 0)
            {
                int nextChar = reader.read();
                currentPosition++;
                if (nextChar < 0)
                {
                    //escape char was the last thing in the stream: there is nothing to escape, so treat this as EOF
                    currentChar = nextChar;
                    if (tokenBuffer.length() == 0)
                    {
                        return internalTokenTypeHolder.setTokenValue(currentChar);
                    }
                    pushedBack = true;
                    break;
                }
                if (nextChar != currentChar) //if they are the same, then we're just escaping our escape char
                {
                    switch (nextChar)
                    {
                        case 'r':
                            currentChar = '\r';
                            break; /* switch */
                        case 'n':
                            currentChar = '\n';
                            break; /* switch */
                        case 'f':
                            currentChar = '\f';
                            break; /* switch */
                        case 'b':
                            currentChar = '\b';
                            break; /* switch */
                        case 't':
                            currentChar = '\t';
                            break; /* switch */
                        case 'a':
                            currentChar = '\007';
                            break; /* switch */
                        case 'e':
                            currentChar = '\033';
                            break; /* switch */
                        case 'c': //handle control chars
                            nextChar = reader.read();
                            currentPosition++;
                            if (nextChar < 0 || nextChar > 0x7f)
                            {
                                throw new Exception("Expected ASCII after \\c");
                            }
                            appendDecoded(tokenBuffer, Character.toChars(nextChar ^ 64));
                            continue; //while loop
                        case '8'://start octal code
                        case '9':
                            throw new Exception("Illegal octal digit");
                        case '0':
                        case '1':
                        case '2':
                        case '3':
                        case '4':
                        case '5':
                        case '6':
                        case '7':
                            appendDecoded(tokenBuffer, getOctalCodeFromReader(nextChar));
                            continue; //while loop
                        case 'x': //start hex code
                            appendDecoded(tokenBuffer, getHexCodeFromReader());
                            continue; //while loop
                        case 'u': //start unicode small
                            appendDecoded(tokenBuffer, getUnicodeFromReader(4));
                            continue; //while loop
                        case 'U'://start unicode big
                            appendDecoded(tokenBuffer, getUnicodeFromReader(8));
                            continue; //while loop
                        default://we're just escaping a token here
                            currentChar = nextChar;
                    }
                }
                //whatever we ended up with (a decoded control char, an escaped token, or the
                //escape char itself when doubled) is now plain text
                charType = CharacterType.ALPHA.mask;
            }

            //if we're a comment, then read until EOL or EOF
            if ((charType & CharacterType.COMMENT.mask) != 0)
            {
                do
                {
                    currentChar = reader.read();
                    currentPosition++;
                }
                while (currentChar != TokenType.EOF.value && (characterTypes[currentChar] & CharacterType.EOL.mask) == 0);

                if (tokenBuffer.length() != 0)
                {
                    //we have data to hand out; only push the terminating EOL/EOF back if EOL is significant
                    if (isEOLSignificant)
                    {
                        pushedBack = true;
                    }
                    break;
                }
                if (internalTokenTypeHolder.tokenType == TokenType.EOL)
                {
                    //do nothing, since the last token was already an EOL, we just want to absorb this one as this line was nothing but a comment.
                    continue;
                }
                if (isEOLSignificant)
                {
                    //report the EOL (or EOF) that terminated the comment
                    internalTokenTypeHolder.setTokenValue(currentChar);
                    break;
                }
                //EOL is not significant and the line held nothing but a comment: skip it entirely
                if (currentChar < 0)
                {
                    return internalTokenTypeHolder.setTokenValue(currentChar);
                }
                continue;
            }

            //check for a quote symbol
            if ((charType & CharacterType.QUOTE.mask) != 0)
            {
                if (tokenBuffer.length() == 0 && currentQuoteChar < 0) //start quoting
                {
                    currentQuoteChar = currentChar; //keep track of our current quote type
                    continue;
                }
                if (currentChar == currentQuoteChar) //we're done quoting
                {
                    currentQuoteChar = TokenType.NOTHING.value; //reset our current quote type
                    if (tokenBuffer.length() == 0)
                    {
                        //special case, where we've referred to the empty string, NOT null.
                        value = "";
                        internalTokenTypeHolder.setTokenType(TokenType.TOKEN, currentChar);
                    }
                    break;
                }
            }

            if (currentQuoteChar > 0) //if we're quoting, then ignore whitespace
            {
                charType = CharacterType.ALPHA.mask;
            }

            if ((charType & CharacterType.WHITESPACE.mask) != 0)
            {
                if (tokenBuffer.length() == 0)
                {
                    continue;
                }
                break;
            }

            if (charType == 0)
            {
                if (tokenBuffer.length() == 0) //if we don't have anything on the buffer, then just go ahead and dump this
                {
                    tokenBuffer.append((char) currentChar);
                    internalTokenTypeHolder.setTokenType(TokenType.TOKEN, currentChar);
                }
                else //already got something on the buffer, so hand that out first and replay this char on the next call
                {
                    pushedBack = true;
                }
                break;
            }

            if ((charType & CharacterType.ALPHA.mask) != 0)
            {
                tokenBuffer.append((char) currentChar);
                internalTokenTypeHolder.setTokenType(TokenType.TOKEN, currentChar);
            }
        }

        if (tokenBuffer.length() != 0)
        {
            value = tokenBuffer.toString();
        }
        return internalTokenTypeHolder.tokenType;
    }

    /**
     * Appends the chars produced by a decoded escape sequence and records that
     * the token being built is a real TOKEN, so a token made up solely of such
     * escapes is not reported with a stale type.
     */
    private void appendDecoded(StringBuilder tokenBuffer, char[] decoded)
    {
        tokenBuffer.append(decoded);
        internalTokenTypeHolder.setTokenType(TokenType.TOKEN, decoded[decoded.length - 1]);
    }

    /** @return true if ch is one of 0-9, A-F or a-f */
    private static boolean isHexDigit(int ch)
    {
        return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
    }

    /**
     * Converts already validated digits into the chars of the code point they denote.
     * @throws Exception carrying errorMessage if there are no digits or the value is not a valid code point
     */
    private static char[] decodeCodePoint(CharSequence digits, int radix, String errorMessage) throws Exception
    {
        if (digits.length() == 0)
        {
            throw new Exception(errorMessage);
        }
        //parse as long: eight hex digits do not necessarily fit in an int
        long codePoint = Long.parseLong(digits.toString(), radix);
        if (codePoint > Character.MAX_CODE_POINT)
        {
            throw new Exception(errorMessage);
        }
        return Character.toChars((int) codePoint);
    }

    /**
     * this will read up to 8 hex digits in braces, or up to 2 without braces, and make a char array from the resulting hexcode.
     * Reading stops at the first char that cannot be part of the escape; that char is left in the stream.
     * @return the chars of the decoded code point
     * @throws Exception if no hex digit is found, or the value is not a valid code point
     */
    private char[] getHexCodeFromReader() throws Exception
    {
        int firstChar = reader.read();
        currentPosition++;
        boolean braced = firstChar == '{';
        int maxDigits = braced ? 8 : 2;
        StringBuilder digits = new StringBuilder(maxDigits);
        if (!braced)
        {
            if (!isHexDigit(firstChar))
            {
                throw new Exception("Expected hex digit or '{' after \\x");
            }
            digits.append((char) firstChar);
        }
        //the braced form always reads one char past its digits, to consume the closing brace
        while (braced || digits.length() < maxDigits)
        {
            reader.mark(1);
            int ch = reader.read();
            currentPosition++;
            if (digits.length() < maxDigits && isHexDigit(ch))
            {
                digits.append((char) ch);
            }
            else if (braced && ch == '}')
            {
                break;
            }
            else //not part of the escape, so put that one char back
            {
                reader.reset();
                currentPosition--;
                break;
            }
        }
        return decodeCodePoint(digits, 16, "Invalid hex value for \\x escape");
    }

    /**
     * read until we get a non octal char or we read 2 additional values.
     * @param firstChar the octal digit that was already consumed by the caller
     * @return the chars of the decoded code point
     * @throws Exception on a read failure
     */
    private char[] getOctalCodeFromReader(int firstChar) throws Exception
    {
        StringBuilder digits = new StringBuilder(3);
        digits.append((char) firstChar);
        while (digits.length() < 3)
        {
            reader.mark(1);
            int ch = reader.read();
            currentPosition++;
            if (ch >= '0' && ch <= '7')
            {
                digits.append((char) ch);
            }
            else //not a valid octal, so put that one char back
            {
                reader.reset();
                currentPosition--;
                break;
            }
        }
        return decodeCodePoint(digits, 8, "Invalid octal escape");
    }

    /**
     * Reads exactly length hex digits and decodes them as a code point.
     * @param length the number of hex digits the unicode declaration must have.
     * @return char[] representing the unicode value.
     * @throws Exception if the stream ends early, a char is not an ASCII hex digit, or the value is not a valid code point
     */
    private char[] getUnicodeFromReader(int length) throws Exception
    {
        StringBuilder digits = new StringBuilder(length);
        for (int index = 0; index < length; index++)
        {
            int ch = reader.read();
            if (ch < 0) //check length
            {
                throw new Exception("value to short of unicode escape");
            }
            currentPosition++;
            if (ch > 127)
            {
                throw new Exception("Illegal non-ASCII hex digit in \\u escape");
            }
            if (!isHexDigit(ch))
            {
                throw new Exception("Invalid hex value for \\u escape");
            }
            digits.append((char) ch);
        }
        return decodeCodePoint(digits, 16, "Invalid hex value for \\u escape");
    }
}