Tokenizer.java example

Explorer
tuprolog-5-master
- src
  - alice
    - tuprolog
- test
/*
 * tuProlog - Copyright (C) 2001-2002  aliCE team at deis.unibo.it
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package alice.tuprolog;

import java.io.IOException;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.util.Arrays;
import java.util.LinkedList;

/**
 * BNF for tuProlog
 *
 * part 1: Lexer
 *      digit ::= 0 .. 9
 *      lc_letter ::= a .. z
 *      uc_letter ::= A .. Z | _
 *      symbol ::= \ | $ | & | ? | ^ | @ | # | . | , | : | ; | = | < | > | + | - | * | / | ~
 *
 *      letter ::= digit | lc_letter | uc_letter
 *      integer ::= { digit }+
 *      float ::= { digit }+ . { digit }+ [ E|e [ +|- ] { digit }+ ]
 *                                                                           // TODO Update BNF for quotes?
 *      atom ::= lc_letter { letter }* | !
 *      variable ::= uc_letter { letter }*
 *
 * from the super class, the super.nextToken() returns and updates the following relevant fields:
 * - if the next token is a collection of wordChars,
 * the type returned is TT_WORD and the value is put into the field sval.
 * - if the next token is an ordinary char,
 * the type returned is the same as the unicode int value of the ordinary character
 * - other characters should be handled as ordinary characters.
 */
class Tokenizer extends StreamTokenizer {

    // Masks over a token type value. NOTE(review): presumably used by Token to split a
    // type into its base code and its flag bits -- confirm against Token.
    static final int TYPEMASK = 0x00FF;
    static final int ATTRMASK = 0xFF00;

    // Base token type codes.
    static final int LPAR = 0x0001;
    static final int RPAR = 0x0002;
    static final int LBRA = 0x0003;
    static final int RBRA = 0x0004;
    static final int BAR = 0x0005;
    static final int INTEGER = 0x0006;
    static final int FLOAT = 0x0007;
    static final int ATOM = 0x0008;
    static final int VARIABLE = 0x0009;
    static final int SQ_SEQUENCE = 0x000A;
    static final int DQ_SEQUENCE = 0x000B;
    static final int END = 0x000D;
    static final int LBRA2 = 0x000E;
    static final int RBRA2 = 0x000F;

    // Flag bits that readNextToken() ORs onto ATOM / quoted token types.
    static final int FUNCTOR = 0x0100;
    static final int OPERATOR = 0x0200;
    static final int EOF = 0x1000;

    static final char[] GRAPHIC_CHARS = {'\\', '$', '&', '?', '^', '@', '#', '.', ',', ':', ';', '=', '<', '>', '+', '-', '*', '/', '~'};
    static {
        Arrays.sort(Tokenizer.GRAPHIC_CHARS);  // must be done to ensure correct behavior of Arrays.binarySearch
    }

    // used to enable pushback from the parser. Not in any way connected with pushBack2 and super.pushBack().
    private final LinkedList<Token> tokenList = new LinkedList<Token>();

    // used in the double lookahead check that . following ints is a fraction marker or end marker
    // (super.pushBack() only works on one level)
    private PushBack pushBack2 = null;

    /**
     * Creates a tokenizer reading from the given text.
     */
    public Tokenizer(String text) {
        this(new StringReader(text));
    }

    /**
     * Creating a tokenizer for the source stream.
     */
    public Tokenizer(Reader text) {
        super(text);

        // Prepare the tokenizer for Prolog-style tokenizing rules
        resetSyntax();

        // letters
        wordChars('a', 'z');
        wordChars('A', 'Z');
        wordChars('_', '_');
        wordChars('0', '9'); // need to parse numbers as special words

        ordinaryChar('!');

        // symbols
        ordinaryChar('\\');
        ordinaryChar('$');
        ordinaryChar('&');
        ordinaryChar('^');
        ordinaryChar('@');
        ordinaryChar('#');
        ordinaryChar(',');
        ordinaryChar('.');
        ordinaryChar(':');
        ordinaryChar(';');
        ordinaryChar('=');
        ordinaryChar('<');
        ordinaryChar('>');
        ordinaryChar('+');
        ordinaryChar('-');
        ordinaryChar('*');
        ordinaryChar('/');
        ordinaryChar('~');

        // quotes
        ordinaryChar('\''); // must be parsed individually to handle \\ in quotes and character code constants
        ordinaryChar('\"'); // same as above?

        // comments
        ordinaryChar('%');
        // it is not possible to enable StreamTokenizer#slashStarComments and % as a StreamTokenizer#commentChar
        // and it is also not possible to use StreamTokenizer#whitespaceChars for ' '
    }

    /**
     * Reads next available token: a token previously handed back through
     * {@link #unreadToken(Token)} if there is one, otherwise a fresh token from the stream.
     */
    Token readToken() throws InvalidTermException, IOException {
        return !tokenList.isEmpty() ? tokenList.removeFirst() : readNextToken();
    }

    /**
     * Puts back token to be read again. The most recently unread token is returned first.
     */
    void unreadToken(Token token) {
        tokenList.addFirst(token);
    }

    /**
     * Reads the next token directly from the underlying stream, skipping whitespace,
     * {@code %} line comments and bracketed comments.
     *
     * @return the next token; a token of type {@link #EOF} at end of input
     * @throws IOException if the underlying reader fails
     * @throws InvalidTermException if the input cannot be tokenized (unknown character,
     *         malformed number, line break inside a quote, or a quote / bracketed comment
     *         that is still open at end of input)
     */
    Token readNextToken() throws IOException, InvalidTermException {
        int typea;
        String svala;
        if (pushBack2 != null) {
            typea = pushBack2.typea;
            svala = pushBack2.svala;
            pushBack2 = null;
        } else {
            typea = super.nextToken();
            svala = sval;
        }

        // skips whitespace
        // could be simplified if lookahead for blank space in functors wasn't necessary
        // and if '.' in numbers could be written with blank space
        while (Tokenizer.isWhite(typea)) {
            typea = super.nextToken();
            svala = sval;
        }

        // skips single line comments
        // could be simplified if % was not a legal character in quotes
        if (typea == '%') {
            do {
                typea = super.nextToken();
            } while (typea != '\r' && typea != '\n' && typea != TT_EOF);
            pushBack();  // pushes back \r, \n or EOF so that the recursive call handles it
            return readNextToken();
        }

        // skips bracketed comments
        if (typea == '/') {
            int typeb = super.nextToken();
            if (typeb == '*') {
                skipBlockComment();
                return readNextToken();
            } else {
                pushBack();
            }
        }

        // syntactic characters
        if (typea == TT_EOF) return new Token("", Tokenizer.EOF);
        if (typea == '(') return new Token("(", Tokenizer.LPAR);
        if (typea == ')') return new Token(")", Tokenizer.RPAR);
        if (typea == '{') return new Token("{", Tokenizer.LBRA2);
        if (typea == '}') return new Token("}", Tokenizer.RBRA2);
        if (typea == '[') return new Token("[", Tokenizer.LBRA);
        if (typea == ']') return new Token("]", Tokenizer.RBRA);
        if (typea == '|') return new Token("|", Tokenizer.BAR);

        if (typea == '!') return new Token("!", Tokenizer.ATOM);
        if (typea == ',') return new Token(",", Tokenizer.OPERATOR);

        if (typea == '.') { // check that '.' as end token is followed by a layout character, see ISO Standard 6.4.8 endnote
            int typeb = super.nextToken();
            if (Tokenizer.isWhite(typeb) || typeb == '%' || typeb == StreamTokenizer.TT_EOF)
                return new Token(".", Tokenizer.END);
            else
                pushBack(); // not an end marker: '.' is treated as a graphic symbol below
        }

        // variable, atom or number
        if (typea == TT_WORD) {
            char firstChar = svala.charAt(0);
            // variable
            if (Character.isUpperCase(firstChar) || '_' == firstChar)
                return new Token(svala, Tokenizer.VARIABLE);

            // all words starting with a digit 0 to 9 must be a number
            if (firstChar >= '0' && firstChar <= '9')
                return readNumber(svala);

            // otherwise, it must be an atom (or wrong)
            int typeb = super.nextToken();             // lookahead 1 to identify what type of atom
            pushBack();                                // this does not skip whitespaces, only readNext does so.
            if (typeb == '(')
                return new Token(svala, Tokenizer.ATOM | Tokenizer.FUNCTOR);
            if (Tokenizer.isWhite(typeb))
                return new Token(svala, Tokenizer.ATOM | Tokenizer.OPERATOR);
            return new Token(svala, Tokenizer.ATOM);
        }

        // quotes
        if (typea == '\'' || typea == '\"' || typea == '`')
            return readQuoted(typea);

        // symbols
        if (Arrays.binarySearch(Tokenizer.GRAPHIC_CHARS, (char) typea) >= 0)
            return readSymbols(typea);

        throw new InvalidTermException("Unknown Unicode character: " + typea + "  (" + svala + ")");
    }

    /**
     * Consumes the rest of a bracketed comment whose opening slash-star has already been read.
     *
     * @throws InvalidTermException if the input ends before the comment is closed
     */
    private void skipBlockComment() throws IOException, InvalidTermException {
        // Start from a neutral value: the '*' of the opening marker must not be reused
        // as the first character of the closing marker.
        int previous = 0;
        int current = super.nextToken();
        while (previous != '*' || current != '/') {
            if (current == TT_EOF) // nextToken() keeps returning TT_EOF, so stop here instead of looping forever
                throw new InvalidTermException("Unterminated /* comment at line: " + lineno());
            previous = current;
            current = super.nextToken();
        }
    }

    /**
     * Reads the body of a quoted item whose opening quote character has already been read.
     *
     * @param quoteChar the opening quote character: {@code '}, {@code "} or {@code `}
     * @return the token for the quoted text
     * @throws InvalidTermException on an unescaped line break inside the quote, or if the
     *         input ends before the closing quote
     */
    private Token readQuoted(int quoteChar) throws IOException, InvalidTermException {
        StringBuilder quote = new StringBuilder();
        while (true) { // run through entire quote and add body to quote buffer
            int typea = super.nextToken();
            String svala = sval;

            if (typea == TT_EOF) // without this check the loop would never terminate on an unclosed quote
                throw new InvalidTermException("Unterminated quote at line: " + lineno());

            // continuation escape sequence
            if (typea == '\\') {
                int typeb = super.nextToken();
                if (typeb == '\n') // continuation escape sequence marker \\n
                    continue;
                if (typeb == '\r') {
                    int typec = super.nextToken();
                    if (typec == '\n')
                        continue; // continuation escape sequence marker \\r\n
                    pushBack();
                    continue; // continuation escape sequence marker \\r
                }
                pushBack(); // pushback typeb; the backslash itself is appended below
            }
            // double '' or "" or ``
            if (typea == quoteChar) {
                int typeb = super.nextToken();
                if (typeb == quoteChar) { // escaped '' or "" or ``
                    quote.append((char) quoteChar);
                    continue;
                } else {
                    pushBack();
                    break; // otherwise, break on single quote
                }
            }
            if (typea == '\n' || typea == '\r')
                throw new InvalidTermException("line break in quote not allowed (unless they are escaped \\ first)");

            if (svala != null)
                quote.append(svala);
            else
                quote.append((char) typea);
        }

        String quoteBody = quote.toString();

        // only double quotes yield DQ_SEQUENCE; single quotes and back quotes are both SQ_SEQUENCE
        int qType = quoteChar == '\"' ? DQ_SEQUENCE : SQ_SEQUENCE;
        if (qType == SQ_SEQUENCE) {
            if (Parser.isAtom(quoteBody))
                qType = ATOM;
            int typeb = super.nextToken(); // lookahead 1 to identify what type of quote
            pushBack();                    // nextToken() does not skip whitespaces, only readNext does so.
            if (typeb == '(')
                return new Token(quoteBody, qType | FUNCTOR);
        }
        return new Token(quoteBody, qType);
    }

    /**
     * Accumulates a run of consecutive graphic characters into one operator token.
     * A single leading + or - is not merged with a following number here.
     *
     * @param first the graphic character that has already been read
     */
    private Token readSymbols(int first) throws IOException {
        // the symbols are parsed individually by super.nextToken(), so accumulate them
        StringBuilder symbols = new StringBuilder();
        int typeb = first;
        while (Arrays.binarySearch(Tokenizer.GRAPHIC_CHARS, (char) typeb) >= 0) {
            symbols.append((char) typeb);
            typeb = super.nextToken();
        }
        pushBack(); // the first non-graphic token belongs to the next read
        return new Token(symbols.toString(), Tokenizer.OPERATOR);
    }

    /**
     * Builds an INTEGER or FLOAT token from a word that starts with a digit, looking
     * ahead in the stream for a fraction, an exponent or a character code constant.
     *
     * @param svala the word that starts with a digit
     * @throws InvalidTermException if the word and its lookahead do not form a valid number
     */
    private Token readNumber(String svala) throws IOException, InvalidTermException {
        try { // the various parse checks will throw exceptions when parts of numbers are written illegally

            // 1.a. complex integers
            if (svala.startsWith("0")) {
                if (svala.indexOf('b') == 1)
                    return new Token("" + java.lang.Long.parseLong(svala.substring(2), 2), Tokenizer.INTEGER); // try binary
                if (svala.indexOf('o') == 1)
                    return new Token("" + java.lang.Long.parseLong(svala.substring(2), 8), Tokenizer.INTEGER); // try octal
                if (svala.indexOf('x') == 1)
                    return new Token("" + java.lang.Long.parseLong(svala.substring(2), 16), Tokenizer.INTEGER); // try hex
            }

            // lookahead 1
            int typeb = super.nextToken();
            String svalb = sval;

            // 1.b ordinary integers
            if (typeb != '.' && typeb != '\'') { // i.e. not float or character constant
                pushBack(); // lookahead 0
                return new Token("" + java.lang.Long.parseLong(svala), Tokenizer.INTEGER);
            }

            // 1.c character code constant
            if (typeb == '\'' && "0".equals(svala)) {
                int typec = super.nextToken(); // lookahead 2
                String svalc = sval;
                int intVal;
                if ((intVal = isCharacterCodeConstantToken(typec, svalc)) != -1)
                    return new Token("" + intVal, Tokenizer.INTEGER);

                // this is an invalid character code constant int
                throw new InvalidTermException("Character code constant starting with 0'<X> at line: " + super.lineno() + " cannot be recognized.");
            }

            // 2.a check that the value of the word prior to period is a valid long
            java.lang.Long.parseLong(svala); // throws an exception if not

            // 2.b first int is followed by a period
            if (typeb != '.')
                throw new InvalidTermException("A number starting with 0-9 cannot be rcognized as an int and does not have a fraction '.' at line: " + super.lineno() );

            // lookahead 2
            int typec = super.nextToken();
            String svalc = sval;

            // 2.c check that the next token after '.' is a possible fraction
            if (typec != TT_WORD) { // if its not, the period is an End period
                pushBack(); // pushback 1 the token after period
                pushBack2 = new PushBack(typeb, svalb); // pushback 2 the period token
                return new Token(svala, INTEGER); // return what must be an int
            }

            // 2.d checking for exponent
            int exponent = svalc.indexOf("E");
            if (exponent == -1)
                exponent = svalc.indexOf("e");

            if (exponent >= 1) {                                  // the float must have a valid exponent
                if (exponent == svalc.length() - 1) {             // the exponent must be signed exponent
                    int typeb2 = super.nextToken();
                    if (typeb2 == '+' || typeb2 == '-') {
                        int typec2 = super.nextToken();
                        String svalc2 = sval;
                        if (typec2 == TT_WORD) {
                            // verify the remaining parts of the float and return
                            java.lang.Long.parseLong(svalc.substring(0, exponent));
                            java.lang.Integer.parseInt(svalc2);
                            return new Token(svala + "." + svalc + (char) typeb2 + svalc2, Tokenizer.FLOAT);
                        }
                    }
                }
            }
            // 2.e verify lastly that ordinary floats and unsigned exponent floats are Java legal and return them
            java.lang.Double.parseDouble(svala + "." + svalc);
            return new Token(svala + "." + svalc, Tokenizer.FLOAT);

        } catch (NumberFormatException e) {
            // TODO return more info on what was wrong with the number given
            throw new InvalidTermException("A term starting with 0-9 cannot be parsed as a number at line: "+ lineno());
        }
    }

    /**
     * Determines the character code denoted by the token that follows {@code 0'}.
     *
     * @param typec type of the token following the quote
     * @param svalc string value of that token, {@code null} for ordinary characters
     * @return the intValue of the next character token, -1 if invalid
     * TODO needs a lookahead if typec is \
     */
    private static int isCharacterCodeConstantToken(int typec, String svalc) {
        if (svalc != null) {
            if (svalc.length() == 1)
                return svalc.charAt(0);
            if (svalc.length() > 1) {
                // TODO the following characters are not implemented:
                //   * 1 meta escape sequence (* 6.4.2.1 *)
                //   * 1 control escape sequence (* 6.4.2.1 *)
                //   * 1 octal escape sequence (* 6.4.2.1 *)
                //   * 1 hexadecimal escape sequence (* 6.4.2.1 *)
                return -1;
            }
        }
        if (typec == ' ' ||                       // space char (* 6.5.4 *)
            Arrays.binarySearch(GRAPHIC_CHARS, (char) typec) >= 0)  // graphic char (* 6.5.1 *)
            // TODO solo char (* 6.5.3 *)
            return typec;

        return -1;
    }

    /**
     * Tells whether a token type is one of the layout characters skipped between tokens.
     */
    private static boolean isWhite(int type) {
        return type == ' ' || type == '\r' || type == '\n' || type == '\t' || type == '\f';
    }

    /**
     * Used to implement lookahead for two tokens, super.pushBack() only handles one pushBack.
     */
    private static class PushBack {
        final int typea;
        final String svala;

        public PushBack(int i, String s) {
            typea = i;
            svala = s;
        }
    }

}