/* * Copyright (C) 2008,2014 Steve Ratcliffe * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * * Author: Steve Ratcliffe * Create date: May 10, 2008 */ package uk.me.parabola.mkgmap.scan; import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; import java.util.LinkedList; /** * Read a file in terms of word and symbol tokens. * * @author Steve Ratcliffe */ public class TokenScanner { private static final int NO_PUSHBACK = 0; // Reading state private final Reader reader; private int pushback = NO_PUSHBACK; private boolean isEOF; private final String fileName; private int linenumber; private final LinkedList<Token> tokens = new LinkedList<>(); private boolean bol = true; // Extra word characters. private String extraWordChars = ""; private String commentChar = "#"; public TokenScanner(String filename, Reader reader) { if (reader instanceof BufferedReader) this.reader = reader; else this.reader = new BufferedReader(reader); fileName = filename; } /** * Peek and return the first token. It is not consumed. */ public Token peekToken() { ensureTok(); return tokens.peek(); } /** * Get and remove the next token. May return space or newline. This is the * only place that a token is removed from the tokens queue. */ public Token nextRawToken() { ensureTok(); if (bol) { bol = false; linenumber++; } Token token = tokens.removeFirst(); if (token.getType() == TokType.EOL) bol = true; return token; } /** * Get the next token tht is not a space or newline. * @return The first valid text or symbol token. */ public Token nextToken() { skipSpace(); return nextRawToken(); } /** * Push a token back to the beginning of the token queue. * @param tok The token to add to the beginning of the queue. */ public void pushToken(Token tok) { tokens.push(tok); } /** * Get the value of the next non-space token and consume the token. You'd * probably only call this after having peeked the type earlier. * Any initial space is skipped. */ public String nextValue() { skipSpace(); return nextRawToken().getValue(); } public boolean isEndOfFile() { ensureTok(); if (tokens.isEmpty()) { return isEOF; } else { return tokens.peek().getType() == TokType.EOF; } } /** * Skip any white space. After calling this the next token * will be end of file or something other than SPACE or EOL. */ public void skipSpace() { while (!isEndOfFile()) { if (tokens.peek().isValue(commentChar)) { skipLine(); continue; } if (!tokens.peek().isWhiteSpace()) break; nextRawToken(); } } /** * Skip everything up to a new line token. The new line * token will be consumed, so the next token will the the first * on a new line (or at EOF). */ public void skipLine() { while (!isEndOfFile()) { Token t = nextRawToken(); if (t.getType() == TokType.EOL) break; } } private void ensureTok() { if (tokens.isEmpty()) fillTok(); } private void fillTok() { Token t = readTok(); tokens.add(t); } /** * Read a token from the input stream. There are only a few * kinds of token that are recognised on input. Other token * types are recognised or constructed later on. * @return A token. Never returns null or throws an exception. * Once end of file or an error occurs the routine will always return * EOF. */ private Token readTok() { if (isEOF) return new Token(TokType.EOF); int c = readChar(); if (c == -1) { isEOF = true; return new Token(TokType.EOF); } StringBuilder val = new StringBuilder(); val.append((char) c); TokType tt; if (c == '\r') { c = readChar(); if (c != '\n') pushback = c; tt = TokType.EOL; } else if (c == '\n') { tt = TokType.EOL; } else if (isSpace(c)) { while (isSpace(c = readChar()) && c != '\n') val.append((char) c); pushback = c; tt = TokType.SPACE; } else if (isWordChar(c)) { while (isWordChar(c = readChar())) val.append((char) c); pushback = c; tt = TokType.TEXT; } else { // A symbol. The value has already been set. Some symbols // combine from multiple characters. if (c == '!' || c == '<' || c == '>') { c = readChar(); if (c == '=') val.append('='); else pushback = c; } else if (c == '&' || c == '|') { // Allow && and || as single symbols int c2 = readChar(); if (c2 == c) val.append((char) c2); else pushback = c2; } tt = TokType.SYMBOL; } Token t = new Token(tt); t.setValue(val.toString()); return t; } /** * Read a single character. * @return The next character, or -1 if at EOF. The isEOF field will also be set to true at end of file. */ private int readChar() { int c; if (pushback != NO_PUSHBACK) { c = pushback; pushback = NO_PUSHBACK; return c; } try { c = reader.read(); if (c == 0xfffd) throw new SyntaxException(this, "Bad character in input, file probably not in utf-8"); } catch (IOException e) { isEOF = true; c = -1; } return c; } private boolean isSpace(int nextch) { return Character.isWhitespace(nextch) || nextch == '\uFEFF'; } private boolean isWordChar(int ch) { return Character.isLetterOrDigit(ch) || ch == '_' || extraWordChars.indexOf(ch) >= 0; } /** * Read the tokens up until the end of the line and combine then * into one string. * * @return A single string, not including the newline terminator. Never * returns null, returns an empty string if there is nothing there. The * end of line is consumed. */ public String readLine() { String res = readUntil(TokType.EOL, null); nextRawToken(); // use up new line return res; } /** * Read tokens until one of the given type and value is found and return the result as a single string. * The searched token is not consumed from the input. * * @param type The token type to search for. * @param value The string value of the token to search for. * @return A single string of all the tokens preceding the searched token. */ public String readUntil(TokType type, String value) { StringBuilder sb = new StringBuilder(); while (!isEndOfFile()) { Token t = peekToken(); if (t.getType() == type && (value == null || value.equals(t.getValue()))) break; sb.append(nextRawToken().getValue()); } return sb.toString().trim(); } /** * Convenience routine to get an integer. Skips space and reads a * token. This token is converted to an integer if possible. * @return An integer as read from the next non space token. * @throws NumberFormatException When the next symbol isn't * a valid integer. */ public int nextInt() throws NumberFormatException { skipSpace(); Token t = nextRawToken(); if (t == null) throw new NumberFormatException("no number"); return Integer.parseInt(t.getValue()); } /** * As {@link #nextWordWithInfo()} but just the string is returned. * @return The next word as a string. A quoted entity is regarded as a * word for the purposes of this scanner. */ public String nextWord() { WordInfo info = nextWordWithInfo(); return info.getText(); } /** * Read a string that can be quoted. If it is quoted, then everything * until the closing quotes is part of the string. Both single * and double quotes can be used. * * If there are no quotes then it behaves like nextToken apart from * skipping space. * * Initial and final space is skipped. * * The word string is returned along with a flag to indicate whether it * was quoted or not. */ public WordInfo nextWordWithInfo() { skipSpace(); Token tok = peekToken(); char quotec = 0; if (tok.getType() == TokType.SYMBOL) { String s = tok.getValue(); if ("'".equals(s) || "\"".equals(s)) { quotec = s.charAt(0); nextRawToken(); } } StringBuilder sb = new StringBuilder(); while (!isEndOfFile()) { tok = nextRawToken(); if (quotec == 0) { sb.append(tok.getValue()); break; } else { if (tok.isValue(String.valueOf(quotec))) break; sb.append(tok.getValue()); } } skipSpace(); return new WordInfo(sb.toString(), quotec != 0); } /** * Check the value of the next token without consuming it. * * @param val String value to compare against. * @return True if the next token has the same value as the argument. */ public boolean checkToken(String val) { skipSpace(); Token tok = peekToken(); if (val == null || tok.getValue() == null) return false; return val.equals(tok.getValue()); } /** * Validate the next word is the given value. Space is skipped before * checking, the checked value is consumed. Use when you want to * ensure that a required syntax element is present. * * The input will either be positioned after the required word or an * exception will have been thrown. * * @param val The string value to look for. * @throws SyntaxException If the required string is not found. */ public void validateNext(String val) { skipSpace(); Token tok = nextToken(); if (val == null || !val.equals(tok.getValue())) throw new SyntaxException(this, "Expecting " + val + ", instead saw " + tok.getValue()); } public int getLinenumber() { return linenumber; } public String getFileName() { return fileName; } /** * Extra word characters are characters that should be considered as part of a word in addition * to alphanumerics and underscore. * @param extraWordChars A string containing all the characters to be considered part of a word. */ public void setExtraWordChars(String extraWordChars) { this.extraWordChars = extraWordChars; } /** * The skip space routine, will skip all characters after a '#' until the end of the * line as part of its skip white space functionality. * * This is a mis-feature if your comment character is not '#' or that character is * sometimes important. Therefore you can turn this off by passing in an empty string here. */ public void setCommentChar(String commentChar) { if (commentChar == null) this.commentChar = ""; else this.commentChar = commentChar; } }