/*
* Copyright (C) 2008,2014 Steve Ratcliffe
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*
* Author: Steve Ratcliffe
* Create date: May 10, 2008
*/
package uk.me.parabola.mkgmap.scan;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.LinkedList;
/**
 * Read a file in terms of word and symbol tokens.
 *
 * Characters are pulled from the reader on demand and grouped into
 * EOL, SPACE, TEXT and SYMBOL tokens, followed by EOF tokens once the
 * input is exhausted. Tokens are held in a queue so that callers can
 * peek at them and push them back.
 *
 * @author Steve Ratcliffe
 */
public class TokenScanner {
    // Marks the pushback slot as empty. It must be a value that can never
    // be returned by Reader.read() (which yields 0..0xffff or -1), otherwise
    // a pushed back character with that value would be silently lost.
    private static final int NO_PUSHBACK = Integer.MIN_VALUE;

    // Reading state
    private final Reader reader;
    // Single character of look-ahead that was read but not used.
    private int pushback = NO_PUSHBACK;
    private boolean isEOF;

    private final String fileName;
    // Incremented when the first token of each line is consumed.
    private int linenumber;

    private final LinkedList<Token> tokens = new LinkedList<>();

    // True when the next token to be consumed is the first on its line.
    private boolean bol = true;

    // Extra word characters.
    private String extraWordChars = "";
    private String commentChar = "#";

    /**
     * Create a scanner over the given reader.
     *
     * @param filename The name of the file being read, as reported by
     * {@link #getFileName()}.
     * @param reader The source of characters. It is wrapped in a
     * BufferedReader unless it already is one.
     */
    public TokenScanner(String filename, Reader reader) {
        if (reader instanceof BufferedReader) {
            this.reader = reader;
        } else {
            this.reader = new BufferedReader(reader);
        }
        fileName = filename;
    }

    /**
     * Peek and return the first token. It is not consumed.
     */
    public Token peekToken() {
        ensureTok();
        return tokens.peek();
    }

    /**
     * Get and remove the next token. May return space or newline. This is the
     * only place that a token is removed from the tokens queue.
     *
     * The line number is advanced when the first token of a line is removed.
     */
    public Token nextRawToken() {
        ensureTok();

        if (bol) {
            bol = false;
            linenumber++;
        }

        Token token = tokens.removeFirst();
        if (token.getType() == TokType.EOL) {
            bol = true;
        }

        return token;
    }

    /**
     * Get the next token that is not a space or newline.
     * @return The first valid text or symbol token.
     */
    public Token nextToken() {
        skipSpace();
        return nextRawToken();
    }

    /**
     * Push a token back to the beginning of the token queue.
     * @param tok The token to add to the beginning of the queue.
     */
    public void pushToken(Token tok) {
        tokens.push(tok);
    }

    /**
     * Get the value of the next non-space token and consume the token. You'd
     * probably only call this after having peeked the type earlier.
     * Any initial space is skipped.
     */
    public String nextValue() {
        skipSpace();
        return nextRawToken().getValue();
    }

    /**
     * Test whether the next token is the end of file token.
     *
     * @return True if the token at the head of the queue has type EOF.
     */
    public boolean isEndOfFile() {
        // After ensureTok() the queue always holds at least one token, as
        // fillTok() unconditionally adds one.
        ensureTok();
        return tokens.peek().getType() == TokType.EOF;
    }

    /**
     * Skip any white space. After calling this the next token
     * will be end of file or something other than SPACE or EOL.
     *
     * A token equal to the comment string causes the rest of that line to
     * be skipped too, unless comments have been disabled by setting an
     * empty comment string.
     */
    public void skipSpace() {
        while (!isEndOfFile()) {
            Token head = tokens.peek();

            // An empty comment string means that comment skipping is off.
            if (!commentChar.isEmpty() && head.isValue(commentChar)) {
                skipLine();
                continue;
            }

            if (!head.isWhiteSpace()) {
                break;
            }

            nextRawToken();
        }
    }

    /**
     * Skip everything up to a new line token. The new line
     * token will be consumed, so the next token will be the first
     * on a new line (or at EOF).
     */
    public void skipLine() {
        while (!isEndOfFile()) {
            Token t = nextRawToken();
            if (t.getType() == TokType.EOL) {
                break;
            }
        }
    }

    /** Make sure that there is at least one token in the queue. */
    private void ensureTok() {
        if (tokens.isEmpty()) {
            fillTok();
        }
    }

    /** Read one token from the input and append it to the queue. */
    private void fillTok() {
        Token t = readTok();
        tokens.add(t);
    }

    /**
     * Read a token from the input stream. There are only a few
     * kinds of token that are recognised on input. Other token
     * types are recognised or constructed later on.
     *
     * @return A token, never null. Once end of file or an I/O error
     * occurs the routine will always return EOF.
     * @throws SyntaxException If the replacement character U+FFFD is read,
     * see {@link #readChar()}.
     */
    private Token readTok() {
        if (isEOF) {
            return new Token(TokType.EOF);
        }

        int c = readChar();

        if (c == -1) {
            isEOF = true;
            return new Token(TokType.EOF);
        }

        StringBuilder val = new StringBuilder();
        val.append((char) c);

        TokType tt;
        if (c == '\r') {
            // Treat CR LF as a single end of line; a lone CR is also
            // an end of line and the following character is kept.
            c = readChar();
            if (c != '\n') {
                pushback = c;
            }
            tt = TokType.EOL;
        } else if (c == '\n') {
            tt = TokType.EOL;
        } else if (isSpace(c)) {
            // Both line terminator characters count as white space, so
            // they have to be excluded explicitly; otherwise a CR that
            // follows trailing space would be absorbed into the SPACE
            // token and the end of line would be missed.
            while (isSpace(c = readChar()) && c != '\n' && c != '\r') {
                val.append((char) c);
            }

            pushback = c;
            tt = TokType.SPACE;
        } else if (isWordChar(c)) {
            while (isWordChar(c = readChar())) {
                val.append((char) c);
            }
            pushback = c;
            tt = TokType.TEXT;
        } else {
            // A symbol. The value has already been set. Some symbols
            // combine from multiple characters.
            if (c == '!' || c == '<' || c == '>') {
                // Allow !=, <= and >= as single symbols
                c = readChar();
                if (c == '=') {
                    val.append('=');
                } else {
                    pushback = c;
                }
            } else if (c == '&' || c == '|') {
                // Allow && and || as single symbols
                int c2 = readChar();
                if (c2 == c) {
                    val.append((char) c2);
                } else {
                    pushback = c2;
                }
            }

            tt = TokType.SYMBOL;
        }

        Token t = new Token(tt);
        t.setValue(val.toString());
        return t;
    }

    /**
     * Read a single character. A pushed back character is returned in
     * preference to reading from the underlying reader.
     *
     * @return The next character, or -1 if at EOF. If an I/O error occurs
     * it is treated as end of file: the isEOF field is set and -1 is returned.
     * @throws SyntaxException If the character U+FFFD is read, which
     * indicates that the input could not be decoded.
     */
    private int readChar() {
        if (pushback != NO_PUSHBACK) {
            int saved = pushback;
            pushback = NO_PUSHBACK;
            return saved;
        }

        int c;
        try {
            c = reader.read();
            if (c == 0xfffd) {
                throw new SyntaxException(this, "Bad character in input, file probably not in utf-8");
            }
        } catch (IOException e) {
            // Deliberately best effort: a read failure ends the input
            // rather than being propagated to the caller.
            isEOF = true;
            c = -1;
        }

        return c;
    }

    /** White space as defined by Character.isWhitespace, plus the byte order mark. */
    private boolean isSpace(int nextch) {
        return Character.isWhitespace(nextch) || nextch == '\uFEFF';
    }

    /** Letters, digits, underscore and any configured extra word characters. */
    private boolean isWordChar(int ch) {
        return Character.isLetterOrDigit(ch)
                || ch == '_'
                || extraWordChars.indexOf(ch) >= 0;
    }

    /**
     * Read the tokens up until the end of the line and combine them
     * into one string.
     *
     * @return A single string, not including the newline terminator. Never
     * returns null, returns an empty string if there is nothing there. The
     * end of line is consumed.
     */
    public String readLine() {
        String res = readUntil(TokType.EOL, null);
        nextRawToken();  // use up new line
        return res;
    }

    /**
     * Read tokens until one of the given type and value is found and return the result as a single string.
     * The searched token is not consumed from the input.
     *
     * @param type The token type to search for.
     * @param value The string value of the token to search for. If null then
     * only the type is compared.
     * @return A single string of all the tokens preceding the searched token,
     * with leading and trailing white space trimmed.
     */
    public String readUntil(TokType type, String value) {
        StringBuilder sb = new StringBuilder();
        while (!isEndOfFile()) {
            Token t = peekToken();
            if (t.getType() == type && (value == null || value.equals(t.getValue()))) {
                break;
            }
            sb.append(nextRawToken().getValue());
        }
        return sb.toString().trim();
    }

    /**
     * Convenience routine to get an integer. Skips space and reads a
     * token. This token is converted to an integer if possible.
     * @return An integer as read from the next non space token.
     * @throws NumberFormatException When the next symbol isn't
     * a valid integer, or there is no value to convert (for example at
     * the end of the file).
     */
    public int nextInt() throws NumberFormatException {
        skipSpace();
        Token t = nextRawToken();

        // The EOF token is created without a value, so check the value
        // rather than just the token.
        String value = (t == null) ? null : t.getValue();
        if (value == null) {
            throw new NumberFormatException("no number");
        }

        return Integer.parseInt(value);
    }

    /**
     * As {@link #nextWordWithInfo()} but just the string is returned.
     * @return The next word as a string. A quoted entity is regarded as a
     * word for the purposes of this scanner.
     */
    public String nextWord() {
        WordInfo info = nextWordWithInfo();
        return info.getText();
    }

    /**
     * Read a string that can be quoted. If it is quoted, then everything
     * until the closing quotes is part of the string. Both single
     * and double quotes can be used. If the closing quote is missing then
     * everything up to the end of the file is taken.
     *
     * If there are no quotes then it behaves like nextToken apart from
     * skipping space.
     *
     * Initial and final space is skipped.
     *
     * The word string is returned along with a flag to indicate whether it
     * was quoted or not.
     */
    public WordInfo nextWordWithInfo() {
        skipSpace();
        Token tok = peekToken();

        // A value of zero means that the word is not quoted.
        char quotec = 0;
        if (tok.getType() == TokType.SYMBOL) {
            String s = tok.getValue();
            if ("'".equals(s) || "\"".equals(s)) {
                quotec = s.charAt(0);
                nextRawToken();
            }
        }

        StringBuilder sb = new StringBuilder();
        if (quotec == 0) {
            // Unquoted: the word is just the next token, if there is one.
            if (!isEndOfFile()) {
                sb.append(nextRawToken().getValue());
            }
        } else {
            // Quoted: collect raw tokens, including space and new lines,
            // until the matching quote which is consumed but not included.
            String closer = String.valueOf(quotec);
            while (!isEndOfFile()) {
                tok = nextRawToken();
                if (tok.isValue(closer)) {
                    break;
                }
                sb.append(tok.getValue());
            }
        }

        skipSpace();
        return new WordInfo(sb.toString(), quotec != 0);
    }

    /**
     * Check the value of the next token without consuming it.
     * Any white space before the token is skipped and so is consumed.
     *
     * @param val String value to compare against.
     * @return True if the next token has the same value as the argument.
     */
    public boolean checkToken(String val) {
        skipSpace();
        Token tok = peekToken();
        if (val == null || tok.getValue() == null) {
            return false;
        }
        return val.equals(tok.getValue());
    }

    /**
     * Validate the next word is the given value. Space is skipped before
     * checking, the checked value is consumed. Use when you want to
     * ensure that a required syntax element is present.
     *
     * The input will either be positioned after the required word or an
     * exception will have been thrown.
     *
     * @param val The string value to look for.
     * @throws SyntaxException If the required string is not found.
     */
    public void validateNext(String val) {
        skipSpace();
        Token tok = nextToken();
        if (val == null || !val.equals(tok.getValue())) {
            throw new SyntaxException(this, "Expecting " + val + ", instead saw " + tok.getValue());
        }
    }

    public int getLinenumber() {
        return linenumber;
    }

    public String getFileName() {
        return fileName;
    }

    /**
     * Extra word characters are characters that should be considered as part of a word in addition
     * to alphanumerics and underscore.
     * @param extraWordChars A string containing all the characters to be considered part of a word.
     * A null value is treated as an empty string.
     */
    public void setExtraWordChars(String extraWordChars) {
        if (extraWordChars == null) {
            this.extraWordChars = "";
        } else {
            this.extraWordChars = extraWordChars;
        }
    }

    /**
     * The skip space routine, will skip all characters after a '#' until the end of the
     * line as part of its skip white space functionality.
     *
     * This is a mis-feature if your comment character is not '#' or that character is
     * sometimes important. Therefore you can turn this off by passing in an empty string here.
     *
     * @param commentChar The token value that starts a comment. A null value
     * is treated as an empty string.
     */
    public void setCommentChar(String commentChar) {
        if (commentChar == null) {
            this.commentChar = "";
        } else {
            this.commentChar = commentChar;
        }
    }
}