/*
* tuProlog - Copyright (C) 2001-2002 aliCE team at deis.unibo.it
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package alice.tuprolog;
import java.io.IOException;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.util.Arrays;
import java.util.LinkedList;
/**
* BNF for tuProlog
*
* part 1: Lexer
* digit ::= 0 .. 9
* lc_letter ::= a .. z
* uc_letter ::= A .. Z | _
* symbol ::= \ | $ | & | ? | ^ | @ | # | . | , | : | ; | = | < | > | + | - | * | / | ~
* letter ::= digit | lc_letter | uc_letter
* integer ::= { digit }+
* float ::= { digit }+ . { digit }+ [ E|e [ +|- ] { digit }+ ]
* // TODO Update BNF for quotes?
* atom ::= lc_letter { letter }* | !
* variable ::= uc_letter { letter }*
*
* from the super class, the super.nextToken() returns and updates the following relevant fields:
* - if the next token is a collection of wordChars,
* the type returned is TT_WORD and the value is put into the field sval.
* - if the next token is an ordinary char,
* the type returned is the same as the unicode int value of the ordinary character
* - other characters should be handled as ordinary characters.
*/
class Tokenizer extends StreamTokenizer {

    // Token type constants. The low byte (TYPEMASK) holds the token type,
    // the high byte (ATTRMASK) holds attribute flags that are OR-ed onto the type.
    static final int TYPEMASK = 0x00FF;
    static final int ATTRMASK = 0xFF00;
    static final int LPAR = 0x0001;
    static final int RPAR = 0x0002;
    static final int LBRA = 0x0003;
    static final int RBRA = 0x0004;
    static final int BAR = 0x0005;
    static final int INTEGER = 0x0006;
    static final int FLOAT = 0x0007;
    static final int ATOM = 0x0008;
    static final int VARIABLE = 0x0009;
    static final int SQ_SEQUENCE = 0x000A;
    static final int DQ_SEQUENCE = 0x000B;
    static final int END = 0x000D;
    static final int LBRA2 = 0x000E;
    static final int RBRA2 = 0x000F;
    static final int FUNCTOR = 0x0100;
    static final int OPERATOR = 0x0200;
    static final int EOF = 0x1000;

    static final char[] GRAPHIC_CHARS = {'\\', '$', '&', '?', '^', '@', '#', '.', ',', ':', ';', '=', '<', '>', '+', '-', '*', '/', '~'};

    static {
        Arrays.sort(Tokenizer.GRAPHIC_CHARS); // must be done to ensure correct behavior of Arrays.binarySearch
    }

    // used to enable pushback from the parser. Not in any way connected with pushBack2 and super.pushBack().
    private final LinkedList<Token> tokenList = new LinkedList<Token>();

    // used in the double lookahead check that . following ints is a fraction marker or end marker
    // (super.pushBack() only works on one level)
    private PushBack pushBack2 = null;

    /**
     * Creates a tokenizer reading from the given text.
     *
     * @param text the source text to tokenize
     */
    public Tokenizer(String text) {
        this(new StringReader(text));
    }

    /**
     * Creating a tokenizer for the source stream.
     *
     * @param text the reader supplying the source characters
     */
    public Tokenizer(Reader text) {
        super(text);
        // Prepare the tokenizer for Prolog-style tokenizing rules:
        // start from a table where every character is "ordinary" ...
        resetSyntax();
        // ... then mark letters, underscore and digits as word characters
        wordChars('a', 'z');
        wordChars('A', 'Z');
        wordChars('_', '_');
        wordChars('0', '9'); // need to parse numbers as special words
        ordinaryChar('!');
        // symbols
        ordinaryChar('\\');
        ordinaryChar('$');
        ordinaryChar('&');
        ordinaryChar('^');
        ordinaryChar('@');
        ordinaryChar('#');
        ordinaryChar(',');
        ordinaryChar('.');
        ordinaryChar(':');
        ordinaryChar(';');
        ordinaryChar('=');
        ordinaryChar('<');
        ordinaryChar('>');
        ordinaryChar('+');
        ordinaryChar('-');
        ordinaryChar('*');
        ordinaryChar('/');
        ordinaryChar('~');
        // quotes
        ordinaryChar('\''); // must be parsed individually to handle \\ in quotes and character code constants
        ordinaryChar('\"'); // same as above?
        // comments
        ordinaryChar('%');
        // it is not possible to enable StreamTokenizer#slashStarComments and % as a StreamTokenizer#commentChar
        // and it is also not possible to use StreamTokenizer#whitespaceChars for ' '
    }

    /**
     * Reads next available token: a token previously handed back through
     * {@link #unreadToken(Token)} if there is one, otherwise a fresh token from the stream.
     *
     * @return the next token
     * @throws InvalidTermException if the input cannot be tokenized
     * @throws IOException if reading the underlying stream fails
     */
    Token readToken() throws InvalidTermException, IOException {
        return tokenList.isEmpty() ? readNextToken() : tokenList.removeFirst();
    }

    /**
     * Puts back token to be read again. The most recently unread token is returned first.
     *
     * @param token the token to hand back
     */
    void unreadToken(Token token) {
        tokenList.addFirst(token);
    }

    /**
     * Reads the next token directly from the stream, ignoring tokens stored by
     * {@link #unreadToken(Token)}. Whitespace, {@code %} line comments and
     * slash-star block comments are skipped first.
     *
     * @return the next token; a token of type {@link #EOF} at end of input
     * @throws InvalidTermException on an unterminated block comment or quote, a malformed
     *         number, a line break inside a quote, or an unrecognized character
     * @throws IOException if reading the underlying stream fails
     */
    Token readNextToken() throws IOException, InvalidTermException {
        int typea;
        String svala;
        if (pushBack2 != null) {
            typea = pushBack2.typea;
            svala = pushBack2.svala;
            pushBack2 = null;
        } else {
            typea = super.nextToken();
            svala = sval;
        }

        // Skip layout (whitespace and comments). Done as a loop rather than by recursion
        // so that a long run of consecutive comments does not deepen the call stack.
        boolean commentSkipped;
        do {
            commentSkipped = false;
            // skips whitespace
            // could be simplified if lookahead for blank space in functors wasn't necessary
            // and if '.' in numbers could be written with blank space
            while (isWhite(typea)) {
                typea = super.nextToken();
                svala = sval;
            }
            if (typea == '%') {
                // skips single line comments; the terminating \r, \n or EOF becomes the current
                // token and is handled by the next pass of this loop (or by the EOF case below)
                do {
                    typea = super.nextToken();
                } while (typea != '\r' && typea != '\n' && typea != TT_EOF);
                svala = sval;
                commentSkipped = true;
            } else if (typea == '/') {
                int typeb = super.nextToken();
                if (typeb == '*') {
                    skipBlockComment();
                    typea = super.nextToken();
                    svala = sval;
                    commentSkipped = true;
                } else {
                    pushBack(); // a plain '/', handled as a symbol below
                }
            }
        } while (commentSkipped);

        // syntactic characters
        switch (typea) {
            case TT_EOF: return new Token("", Tokenizer.EOF);
            case '(': return new Token("(", Tokenizer.LPAR);
            case ')': return new Token(")", Tokenizer.RPAR);
            case '{': return new Token("{", Tokenizer.LBRA2);
            case '}': return new Token("}", Tokenizer.RBRA2);
            case '[': return new Token("[", Tokenizer.LBRA);
            case ']': return new Token("]", Tokenizer.RBRA);
            case '|': return new Token("|", Tokenizer.BAR);
            case '!': return new Token("!", Tokenizer.ATOM);
            case ',': return new Token(",", Tokenizer.OPERATOR);
            default: break;
        }

        if (typea == '.') { // check that '.' as end token is followed by a layout character, see ISO Standard 6.4.8 endnote
            int typeb = super.nextToken();
            // Always push the lookahead back: a '%' must stay in the stream so that the comment
            // it starts is skipped by the next call instead of being tokenized as program text.
            pushBack();
            if (isWhite(typeb) || typeb == '%' || typeb == TT_EOF) {
                return new Token(".", Tokenizer.END);
            }
        }

        // variable, atom or number
        if (typea == TT_WORD) {
            char firstChar = svala.charAt(0);
            // variable
            if (Character.isUpperCase(firstChar) || '_' == firstChar) {
                return new Token(svala, Tokenizer.VARIABLE);
            }
            // all words starting with a digit 0 to 9 must be a number
            if (firstChar >= '0' && firstChar <= '9') {
                return readNumber(svala);
            }
            // otherwise, it must be an atom (or wrong)
            int typeb = super.nextToken(); // lookahead 1 to identify what type of atom
            pushBack(); // this does not skip whitespaces, only readNext does so.
            if (typeb == '(') {
                return new Token(svala, Tokenizer.ATOM | Tokenizer.FUNCTOR);
            }
            if (isWhite(typeb)) {
                return new Token(svala, Tokenizer.ATOM | Tokenizer.OPERATOR);
            }
            return new Token(svala, Tokenizer.ATOM);
        }

        // quotes
        if (typea == '\'' || typea == '\"' || typea == '`') {
            return readQuoted(typea);
        }

        // symbols
        if (isGraphic(typea)) {
            return readSymbols(typea);
        }

        throw new InvalidTermException("Unknown Unicode character: " + typea + " (" + svala + ")");
    }

    /**
     * Consumes the body of a block comment whose opening slash-star has already been read,
     * up to and including the closing star-slash.
     *
     * @throws InvalidTermException if the input ends before the comment is closed
     * @throws IOException if reading the underlying stream fails
     */
    private void skipBlockComment() throws IOException, InvalidTermException {
        // Start with a neutral "previous" token so that the '*' of the opening marker
        // cannot double as the '*' of the closing marker (i.e. "/*/" does not close).
        int previous = ' ';
        int current = super.nextToken();
        while (previous != '*' || current != '/') {
            if (current == TT_EOF) {
                // nextToken() keeps returning TT_EOF, so without this check the loop never ends
                throw new InvalidTermException("Unterminated block comment: end of input reached before closing */");
            }
            previous = current;
            current = super.nextToken();
        }
    }

    /**
     * Reads the body of a quoted item whose opening quote character has already been read.
     * A doubled quote character stands for one literal quote character, and a backslash
     * directly followed by a line break continues the quote on the next line.
     *
     * @param quoteChar the opening quote character: single quote, double quote or back quote
     * @return the token for the quote body
     * @throws InvalidTermException if the quote contains an unescaped line break or the input
     *         ends before the closing quote
     * @throws IOException if reading the underlying stream fails
     */
    private Token readQuoted(int quoteChar) throws IOException, InvalidTermException {
        StringBuilder quote = new StringBuilder();
        while (true) { // run through entire quote and add body to quote buffer
            int typea = super.nextToken();
            String svala = sval;
            if (typea == TT_EOF) {
                // nextToken() keeps returning TT_EOF, so without this check the loop never ends
                throw new InvalidTermException("Unterminated quote: end of input reached before closing " + (char) quoteChar);
            }
            // continuation escape sequence
            if (typea == '\\') {
                int typeb = super.nextToken();
                if (typeb == '\n') { // continuation escape sequence marker \\n
                    continue;
                }
                if (typeb == '\r') {
                    int typec = super.nextToken();
                    if (typec == '\n') {
                        continue; // continuation escape sequence marker \\r\n
                    }
                    pushBack();
                    continue; // continuation escape sequence marker \\r
                }
                pushBack(); // pushback typeb; the backslash itself is appended below
            }
            // double '' or "" or ``
            if (typea == quoteChar) {
                int typeb = super.nextToken();
                if (typeb == quoteChar) { // escaped '' or "" or ``
                    quote.append((char) quoteChar);
                    continue;
                }
                pushBack();
                break; // otherwise, break on single quote
            }
            if (typea == '\n' || typea == '\r') {
                throw new InvalidTermException("line break in quote not allowed (unless they are escaped \\ first)");
            }
            if (svala != null) {
                quote.append(svala);
            } else {
                quote.append((char) typea);
            }
        }

        String quoteBody = quote.toString();
        // double quotes give DQ_SEQUENCE; single quotes and back quotes both give SQ_SEQUENCE
        int type = (quoteChar == '\"') ? DQ_SEQUENCE : SQ_SEQUENCE;
        if (type == SQ_SEQUENCE) {
            if (Parser.isAtom(quoteBody)) {
                type = ATOM;
            }
            int typeb = super.nextToken(); // lookahead 1 to identify what type of quote
            pushBack(); // nextToken() does not skip whitespaces, only readNext does so.
            if (typeb == '(') {
                return new Token(quoteBody, type | FUNCTOR);
            }
        }
        return new Token(quoteBody, type);
    }

    /**
     * Accumulates a run of consecutive graphic characters into one operator token.
     * The superclass returns each symbol character as its own token, so they are joined here.
     * A leading + or - is not merged with a following number.
     *
     * @param first the graphic character that has already been read
     * @return an {@link #OPERATOR} token holding the whole run
     * @throws IOException if reading the underlying stream fails
     */
    private Token readSymbols(int first) throws IOException {
        StringBuilder symbols = new StringBuilder();
        int type = first;
        while (isGraphic(type)) {
            symbols.append((char) type);
            type = super.nextToken();
        }
        pushBack(); // the first non-graphic token belongs to the next readNextToken() call
        return new Token(symbols.toString(), Tokenizer.OPERATOR);
    }

    /**
     * Turns a word starting with a digit into a number token: 1. integer, 2. float.
     * May read up to several tokens ahead to recognize fractions and signed exponents.
     * The java.lang number classes are fully qualified as in the original code
     * (presumably to avoid clashes with same-named classes of this package - TODO confirm).
     *
     * @param svala the word that starts with a digit
     * @return an {@link #INTEGER} or {@link #FLOAT} token
     * @throws InvalidTermException if the word and what follows is not a legal number
     * @throws IOException if reading the underlying stream fails
     */
    private Token readNumber(String svala) throws IOException, InvalidTermException {
        try { // the various parse checks will throw exceptions when parts of numbers are written illegally
            // 1.a. complex integers
            if (svala.startsWith("0")) {
                if (svala.indexOf('b') == 1) {
                    return new Token(String.valueOf(java.lang.Long.parseLong(svala.substring(2), 2)), Tokenizer.INTEGER); // try binary
                }
                if (svala.indexOf('o') == 1) {
                    return new Token(String.valueOf(java.lang.Long.parseLong(svala.substring(2), 8)), Tokenizer.INTEGER); // try octal
                }
                if (svala.indexOf('x') == 1) {
                    return new Token(String.valueOf(java.lang.Long.parseLong(svala.substring(2), 16)), Tokenizer.INTEGER); // try hex
                }
            }

            // lookahead 1
            int typeb = super.nextToken();
            String svalb = sval;

            // 1.b ordinary integers
            if (typeb != '.' && typeb != '\'') { // i.e. not float or character constant
                pushBack(); // lookahead 0
                return new Token(String.valueOf(java.lang.Long.parseLong(svala)), Tokenizer.INTEGER);
            }

            // 1.c character code constant
            if (typeb == '\'' && "0".equals(svala)) {
                int typec = super.nextToken(); // lookahead 2
                String svalc = sval;
                int intVal = isCharacterCodeConstantToken(typec, svalc);
                if (intVal != -1) {
                    return new Token(String.valueOf(intVal), Tokenizer.INTEGER);
                }
                // this is an invalid character code constant int
                throw new InvalidTermException("Character code constant starting with 0'<X> at line: " + super.lineno() + " cannot be recognized.");
            }

            // 2.a check that the value of the word prior to period is a valid long
            java.lang.Long.parseLong(svala); // throws an exception if not

            // 2.b first int is followed by a period
            if (typeb != '.') {
                throw new InvalidTermException("A number starting with 0-9 cannot be rcognized as an int and does not have a fraction '.' at line: " + super.lineno() );
            }

            // lookahead 2
            int typec = super.nextToken();
            String svalc = sval;

            // 2.c check that the next token after '.' is a possible fraction
            if (typec != TT_WORD) { // if its not, the period is an End period
                pushBack(); // pushback 1 the token after period
                pushBack2 = new PushBack(typeb, svalb); // pushback 2 the period token
                return new Token(svala, INTEGER); // return what must be an int
            }

            // 2.d checking for exponent
            int exponent = svalc.indexOf("E");
            if (exponent == -1) {
                exponent = svalc.indexOf("e");
            }
            // the float must have a valid exponent; when the E/e is the last character of the
            // word, the exponent must be a signed exponent that continues in the next tokens
            if (exponent >= 1 && exponent == svalc.length() - 1) {
                int typeb2 = super.nextToken();
                if (typeb2 == '+' || typeb2 == '-') {
                    int typec2 = super.nextToken();
                    String svalc2 = sval;
                    if (typec2 == TT_WORD) {
                        // verify the remaining parts of the float and return
                        java.lang.Long.parseLong(svalc.substring(0, exponent));
                        java.lang.Integer.parseInt(svalc2);
                        return new Token(svala + "." + svalc + (char) typeb2 + svalc2, Tokenizer.FLOAT);
                    }
                }
                // otherwise fall through: parseDouble below rejects the dangling exponent marker
            }

            // 2.e verify lastly that ordinary floats and unsigned exponent floats are Java legal and return them
            java.lang.Double.parseDouble(svala + "." + svalc);
            return new Token(svala + "." + svalc, Tokenizer.FLOAT);
        } catch (NumberFormatException e) {
            // TODO return more info on what was wrong with the number given
            throw new InvalidTermException("A term starting with 0-9 cannot be parsed as a number at line: "+ lineno());
        }
    }

    /**
     * Determines the character code of the token following the {@code 0'} prefix of a
     * character code constant.
     *
     * @param typec the type of the token following the quote
     * @param svalc the string value of that token, or null when it is not a word
     * @return the intValue of the next character token, -1 if invalid
     * TODO needs a lookahead if typec is \
     */
    private static int isCharacterCodeConstantToken(int typec, String svalc) {
        if (svalc != null) {
            if (svalc.length() == 1) {
                return svalc.charAt(0);
            }
            if (svalc.length() > 1) {
                // TODO the following characters are not implemented:
                // * 1 meta escape sequence (* 6.4.2.1 *) todo
                // * 1 control escape sequence (* 6.4.2.1 *)
                // * 1 octal escape sequence (* 6.4.2.1 *)
                // * 1 hexadecimal escape sequence (* 6.4.2.1 *)
                return -1;
            }
        }
        if (typec == ' ' || isGraphic(typec)) { // space char (* 6.5.4 *) or graphic char (* 6.5.1 *)
            // TODO solo char (* 6.5.3 *)
            return typec;
        }
        return -1;
    }

    /**
     * Tells whether a token type is one of the characters listed in {@link #GRAPHIC_CHARS}.
     * Relies on the array having been sorted in the static initializer.
     */
    private static boolean isGraphic(int type) {
        return Arrays.binarySearch(Tokenizer.GRAPHIC_CHARS, (char) type) >= 0;
    }

    /**
     * Tells whether a token type is a whitespace character: space, carriage return,
     * line feed, tab or form feed.
     */
    private static boolean isWhite(int type) {
        return type == ' ' || type == '\r' || type == '\n' || type == '\t' || type == '\f';
    }

    /**
     * Used to implement lookahead for two tokens, super.pushBack() only handles one pushBack.
     */
    private static class PushBack {
        final int typea;
        final String svala;

        public PushBack(int i, String s) {
            typea = i;
            svala = s;
        }
    }
}