TokenizerText.java example

Explorer
jena-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.riot.tokens;

import static org.apache.jena.atlas.lib.Chars.* ;
import static org.apache.jena.riot.system.RiotChars.* ;

import java.util.NoSuchElementException ;

import org.apache.jena.atlas.AtlasException ;
import org.apache.jena.atlas.io.IO ;
import org.apache.jena.atlas.io.PeekReader ;
import org.apache.jena.atlas.lib.Chars ;
import org.apache.jena.riot.RiotParseException ;
import org.apache.jena.riot.system.RiotChars ;
import org.apache.jena.sparql.ARQInternalErrorException ;

/** Tokenizer for all sorts of things RDF-ish */

public final class TokenizerText implements Tokenizer
{
    // TODO Remove CNTL and make SYMBOLS
    // Drop through to final general symbol/keyword reader, including <=, != 
    // Care with <=
    // STRING, not STRING1/2, LONG_STRING1/2
    // Policy driven for CURIES?
    
    // Various allow/deny options (via checker?)
    
    // RDF mode:
    //   Prefixes - yes and no.
    //   IRIs
    //   BNodes
    
    // Space for CURIEs, stricter Turtle QNames, sane Turtle (i.e. leading digits in local part).
    /** Character that introduces a control token; defined as the same character as {@code CH_STAR}. */
    public static final int CTRL_CHAR = CH_STAR ;
    
    /** When true, the {@code check*} hooks are called on tokens as they are built. Off by default. */
    public static boolean Checking = false ;

    // One-token lookahead: filled in by hasNext(), handed out and cleared by next().
    private Token token = null ; 
    // Scratch buffer reused by the token-reading methods, which reset it with setLength(0) before use.
    private final StringBuilder stringBuilder = new StringBuilder(200) ;
    // Character source for the tokenizer; closed by close().
    private final PeekReader reader ;
    private final boolean lineMode ;        // Whether whitespace includes or excludes NL (in its various forms).  
    // Set by hasNext() once the input is exhausted; hasNext() then returns false without reading again.
    private boolean finished = false ;
    // Optional checker, read and written through getChecker()/setChecker().
    private TokenChecker checker = null ;

    /**
     * Creates a tokenizer reading from {@code reader} with line mode off.
     *
     * @param reader the character source
     */
    /*package*/ TokenizerText(PeekReader reader) {
        this(reader, false) ;
    }
    
    /**
     * Creates a tokenizer reading from {@code reader}.
     *
     * @param reader   the character source
     * @param lineMode when true, {@code skip()} only skips horizontal whitespace
     *                 (so newline characters are left for the token parser)
     */
    /* package */TokenizerText(PeekReader reader, boolean lineMode) {
        this.lineMode = lineMode ;
        this.reader = reader ;
    }
    
    /**
     * Reports whether another token is available, reading one ahead if necessary.
     * The token read is held in {@code token} until {@link #next()} takes it.
     *
     * @return true if a token is buffered or could be parsed; false once input is exhausted
     * @throws RiotParseException if the reader raises an {@code AtlasException}
     */
    @Override
    public final boolean hasNext() {
        if ( finished )
            return false ;
        if ( token != null )
            return true ;

        try {
            skip() ;
            // token is known to be null here, so at end of input it simply stays null.
            if ( !reader.eof() )
                token = parseToken() ;
            if ( token == null ) {
                // close() ;
                finished = true ;
                return false ;
            }
            return true ;
        } catch (AtlasException ex) {
            // Turn low-level stream problems into a parse error at the current position.
            Throwable cause = ex.getCause() ;
            String message ;
            if ( cause == null )
                message = "Bad input stream" ;
            else if ( cause.getClass() == java.nio.charset.MalformedInputException.class )
                message = "Bad character encoding" ;
            else
                message = "Bad input stream [" + cause + "]" ;
            throw new RiotParseException(message, reader.getLineNum(), reader.getColNum()) ;
        }
    }
    
    
    /**
     * Reports whether the token stream is exhausted.
     * This may read ahead one token, exactly as {@link #hasNext()} does.
     *
     * @return true when no further token is available, false otherwise
     */
    @Override
    public final boolean eof() {
        // End of input is the opposite of "another token is available".
        return !hasNext() ;
    }

    /**
     * Returns the next token and clears the lookahead slot.
     *
     * @return the next token
     * @throws NoSuchElementException if {@link #hasNext()} reports no more tokens
     */
    @Override
    public final Token next() {
        if ( hasNext() ) {
            Token result = token ;
            token = null ;
            return result ;
        }
        throw new NoSuchElementException() ;
    }

    /**
     * Returns the next token without consuming it.
     *
     * @return the buffered lookahead token, or null if {@link #hasNext()} reports no more tokens
     */
    @Override
    public final Token peek() {
        return hasNext() ? token : null ;
    }
    
    /**
     * Removal is not supported by this tokenizer.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException() ;
    }

    /** @return the checker currently set on this tokenizer, or null if none has been set */
    public TokenChecker getChecker() {
        return checker ;
    }
    /** @param checker the checker to store on this tokenizer (may be null) */
    public void setChecker(TokenChecker checker) {
        this.checker = checker ;
    }

    /** Closes the underlying reader via {@code IO.close}. */
    @Override
    public void close() {
        IO.close(reader) ;
    }

    // ---- Machinery
    
    /**
     * Advances the reader past whitespace and '#' comments.
     * In line mode only horizontal whitespace is skipped; otherwise anything
     * {@code isWhitespace} accepts is skipped. A comment runs from '#' up to,
     * but not including, the next newline character or end of input.
     */
    private void skip() {
        while ( !reader.eof() ) {
            int c = reader.peekChar() ;
            if ( c == CH_HASH ) {
                reader.readChar() ;
                // Discard the rest of the comment; stop on the newline (or EOF) without consuming it.
                c = reader.peekChar() ;
                while ( c != EOF && !isNewlineChar(c) ) {
                    reader.readChar() ;
                    c = reader.peekChar() ;
                }
            }

            // c is now either the char after plain whitespace handling or the comment terminator.
            boolean skippable = lineMode ? isHorizontalWhitespace(c) : isWhitespace(c) ;
            if ( !skippable )
                return ;
            reader.readChar() ;
        }
    }

    /**
     * Parses one token starting at the reader's current position.
     * <p>
     * A fresh {@code Token} stamped with the current line and column is stored in
     * the {@code token} field and filled in according to the first character peeked:
     * IRI ('<'), string literal (quote characters, optionally followed by a language
     * tag or "^^" datatype), blank node ('_'), control token, directive ('@'),
     * variable ('?'), single-character symbols, numbers, newline runs, and finally
     * a keyword or prefixed name.
     * <p>
     * For a "^^" datatype this method calls itself to read the datatype token; that
     * call overwrites the {@code token} field, which is why the literal is stashed
     * in {@code mainToken} first and the field is reassigned afterwards.
     *
     * @return the token that was parsed
     */
    private Token parseToken() {
        token = new Token(getLine(), getColumn()) ;

        int ch = reader.peekChar() ;

        // ---- IRI
        if ( ch == CH_LT ) {
            reader.readChar() ;
            token.setImage(readIRI()) ;
            token.setType(TokenType.IRI) ;
            if ( Checking )
                checkURI(token.getImage()) ;
            return token ;
        }

        // ---- Literal
        if ( ch == CH_QUOTE1 || ch == CH_QUOTE2 ) {
            reader.readChar() ;
            int ch2 = reader.peekChar() ;
            if ( ch2 == ch ) {
                reader.readChar() ; // Read potential second quote.
                int ch3 = reader.peekChar() ;
                if ( ch3 == ch ) {
                    // Three quotes in a row: long string.
                    reader.readChar() ;
                    token.setImage(readLongString(ch, false)) ;
                    TokenType tt = (ch == CH_QUOTE1) ? TokenType.LONG_STRING1 : TokenType.LONG_STRING2 ;
                    token.setType(tt) ;
                } else {
                    // Two quotes then a non-quote.
                    // Must be '' or ""
                    // No need to pushback characters as we know the lexical
                    // form is the empty string.
                    // if ( ch2 != EOF ) reader.pushbackChar(ch2) ;
                    // if ( ch1 != EOF ) reader.pushbackChar(ch1) ; // Must be
                    // '' or ""
                    token.setImage("") ;
                    token.setType((ch == CH_QUOTE1) ? TokenType.STRING1 : TokenType.STRING2) ;
                }
            } else {
                // Single quote character.
                token.setImage(readString(ch, ch)) ;
                // Single quoted string.
                token.setType((ch == CH_QUOTE1) ? TokenType.STRING1 : TokenType.STRING2) ;
            }

            // White space after lexical part of a literal.
            skip() ;
            
            // Literal. Is it @ or ^^
            if ( reader.peekChar() == CH_AT ) {
                reader.readChar() ;
                // White space is not legal here.
                // The Turtle spec terminal is "LANGTAG" which includes the '@'.
                // The plain string becomes sub-token 1 of a new LITERAL_LANG token; image2 holds the tag.
                Token mainToken = new Token(token) ;
                mainToken.setType(TokenType.LITERAL_LANG) ;
                mainToken.setSubToken1(token) ;
                mainToken.setImage2(langTag()) ;
                token = mainToken ;
                if ( Checking )
                    checkLiteralLang(token.getImage(), token.getImage2()) ;
            } else if ( reader.peekChar() == '^' ) {
                expect("^^") ;
                // White space is legal after a ^^. 
                // It's not a good idea, but it is legal.
//                // Check no whitespace.
//                int nextCh = reader.peekChar() ;
//                if ( isWhitespace(nextCh) )
//                    exception("No whitespace after ^^ in literal with datatype") ;
                skip() ;
                
                // Stash current token.
                // (The recursive parseToken() call below replaces the 'token' field.)
                Token mainToken = new Token(token) ;
                mainToken.setSubToken1(token) ;
                mainToken.setImage(token.getImage()) ;

                Token subToken = parseToken() ;
                if ( !subToken.isIRI() )
                    exception("Datatype URI required after ^^ - URI or prefixed name expected") ;

                mainToken.setSubToken2(subToken) ;
                mainToken.setType(TokenType.LITERAL_DT) ;

                token = mainToken ;
                if ( Checking )
                    checkLiteralDT(token.getImage(), subToken) ;
            } else {
                // Was a simple string.
                if ( Checking )
                    checkString(token.getImage()) ;
            }
            return token ;
        }

        if ( ch == CH_UNDERSCORE ) {
            // Blank node :label must be at least one char
            expect("_:") ;
            token.setImage(readBlankNodeLabel()) ;
            token.setType(TokenType.BNODE) ;
            if ( Checking ) checkBlankNode(token.getImage()) ;
            return token ;
        }

        // TODO remove and make a symbol/keyword.
        // Control
        if ( ch == CTRL_CHAR ) {
            reader.readChar() ;
            token.setType(TokenType.CNTRL) ;
            // The character after the control marker is the control code; whitespace is recorded as -1.
            ch = reader.readChar() ;
            if ( ch == EOF )
                exception("EOF found after " + CTRL_CHAR) ;
            if ( RiotChars.isWhitespace(ch) )
                token.cntrlCode = -1 ;
            else
                token.cntrlCode = (char)ch ;
            if ( Checking )
                checkControl(token.cntrlCode) ;
            return token ;
        }

        // A directive (not part of a literal as lang tag)
        if ( ch == CH_AT ) {
            reader.readChar() ;
            token.setType(TokenType.DIRECTIVE) ;
            token.setImage(readWord(false)) ;
            // NOTE(review): cntrlCode is not assigned on this path; confirm this is the intended argument.
            if ( Checking )
                checkDirective(token.cntrlCode) ;
            return token ;
        }

        // Variable
        if ( ch == CH_QMARK ) {
            reader.readChar() ;
            token.setType(TokenType.VAR) ;
            // Character set?
            token.setImage(readVarName()) ;
            if ( Checking )
                checkVariable(token.getImage()) ;
            return token ;
        }
        
        // Symbol?
        // Single-character symbols carry a type only; no image is set for them.
        switch(ch)
        { 
            // DOT can start a decimal.  Check for digit.
            case CH_DOT:
                reader.readChar() ;
                ch = reader.peekChar() ;
                if ( range(ch, '0', '9') ) {
                    // Not a DOT after all.
                    reader.pushbackChar(CH_DOT) ;
                    readNumber() ;
                    if ( Checking )
                        checkNumber(token.getImage(), token.getImage2()) ;
                    return token ;
                }
                token.setType(TokenType.DOT) ;
                return token ;
            
            case CH_SEMICOLON:  reader.readChar() ; token.setType(TokenType.SEMICOLON) ; /*token.setImage(CH_SEMICOLON) ;*/ return token ;
            case CH_COMMA:      reader.readChar() ; token.setType(TokenType.COMMA) ;     /*token.setImage(CH_COMMA) ;*/ return token ;
            case CH_LBRACE:     reader.readChar() ; token.setType(TokenType.LBRACE) ;    /*token.setImage(CH_LBRACE) ;*/ return token ;
            case CH_RBRACE:     reader.readChar() ; token.setType(TokenType.RBRACE) ;    /*token.setImage(CH_RBRACE) ;*/ return token ;
            case CH_LPAREN:     reader.readChar() ; token.setType(TokenType.LPAREN) ;    /*token.setImage(CH_LPAREN) ;*/ return token ;
            case CH_RPAREN:     reader.readChar() ; token.setType(TokenType.RPAREN) ;    /*token.setImage(CH_RPAREN) ;*/ return token ;
            case CH_LBRACKET:   reader.readChar() ; token.setType(TokenType.LBRACKET) ;  /*token.setImage(CH_LBRACKET) ;*/ return token ;
            case CH_RBRACKET:   reader.readChar() ; token.setType(TokenType.RBRACKET) ;  /*token.setImage(CH_RBRACKET) ;*/ return token ;
            case CH_EQUALS:     reader.readChar() ; token.setType(TokenType.EQUALS) ;    /*token.setImage(CH_EQUALS) ;*/ return token ;

            // Specials (if blank node processing off)
            // As written, '_' , '<' and '*' (the control character) are all handled by the
            // tests above, so those three cases are not reached from here.
            //case CH_COLON:      reader.readChar() ; token.setType(TokenType.COLON) ; return token ;
            case CH_UNDERSCORE: reader.readChar() ; token.setType(TokenType.UNDERSCORE) ; /*token.setImage(CH_UNDERSCORE) ;*/ return token ;
            case CH_LT:         reader.readChar() ; token.setType(TokenType.LT) ; /*token.setImage(CH_LT) ;*/ return token ;
            case CH_GT:         reader.readChar() ; token.setType(TokenType.GT) ; /*token.setImage(CH_GT) ;*/ return token ;
            case CH_STAR:       reader.readChar() ; token.setType(TokenType.STAR) ; /*token.setImage(CH_STAR) ;*/ return token ;
            
            // Multi character symbols
            // Two character tokens && || GE >= , LE <=
            // Single character symbols for * /
            // +/- may start numbers.

//            case CH_PLUS:
//            case CH_MINUS:
//            case CH_STAR:
//            case CH_SLASH:
//            case CH_RSLASH:
                
        }
        
        // ---- Numbers.
        // But a plain "+" and "-" are symbols.
        
        /*
        [16]    integer         ::=     ('-' | '+') ? [0-9]+
        [17]    double          ::=     ('-' | '+') ? ( [0-9]+ '.' [0-9]* exponent | '.' ([0-9])+ exponent | ([0-9])+ exponent )
                                        0.e0, .0e0, 0e0
        [18]    decimal         ::=     ('-' | '+')? ( [0-9]+ '.' [0-9]* | '.' ([0-9])+ | ([0-9])+ )
                                        0.0 .0 0.
        [19]    exponent        ::=     [eE] ('-' | '+')? [0-9]+
        []      hex             ::=     0x0123456789ABCDEFG
        
        */
        
        // TODO readNumberNoSign
        
        // Assigned below but not read anywhere in this method.
        int signCh = 0 ;

        if ( ch == CH_PLUS || ch == CH_MINUS ) {
            reader.readChar() ;
            int ch2 = reader.peekChar() ;

            if ( !range(ch2, '0', '9') ) {
                // ch was end of symbol.
                // reader.readChar() ;
                if ( ch == CH_PLUS )
                    token.setType(TokenType.PLUS) ;
                else
                    token.setType(TokenType.MINUS) ;
                return token ;
            }

            // Already got a + or - ...
            // readNumberNoSign
            // Because next, old code processes signs.
            // The sign is pushed back so readNumber() sees it again.
            reader.pushbackChar(ch) ;
            signCh = ch ;
            // Drop to next "if"
        }

        if ( ch == CH_PLUS || ch == CH_MINUS || range(ch, '0', '9') ) {
            // readNumberNoSign
            readNumber() ;
            if ( Checking )
                checkNumber(token.getImage(), token.getImage2()) ;
            return token ;
        }

        if ( isNewlineChar(ch) ) {
            //** - If collecting token image.
            //** stringBuilder.setLength(0) ;
            // Any number of NL and CR become one "NL" token.
            do {
                // The value is only needed by the commented-out image collection.
                int ch2 = reader.readChar() ;
                //** stringBuilder.append((char)ch2) ;
            } while (isNewlineChar(reader.peekChar())) ;
            token.setType(TokenType.NL) ;
            //** token.setImage(stringBuilder.toString()) ;
            return token ;
        }
        
        // Plain words and prefixes.
        //   Can't start with a number due to numeric test above.
        //   Can't start with a '_' due to blank node test above.
        // If we see a :, the first time it means a prefixed name else it's a token break.
        
        readPrefixedNameOrKeyword(token) ;
        
        // readPrefixedNameOrKeyword() also calls checkKeyword when Checking is on.
        if ( Checking ) checkKeyword(token.getImage()) ;
        return token ;
    }

    
    /**
     * Switch used by {@code readIRI()}: when true, backslash escapes in an IRI are read
     * with {@code readLiteralEscape()} instead of {@code readUnicodeEscape()}, and the
     * warning for the characters <code>{ } " | ^ `</code> is suppressed.
     */
    private static final boolean VeryVeryLaxIRI = false ;
    
    // [8]  IRIREF  ::= '<' ([^#x00-#x20<>"{}|^`\] | UCHAR)* '>'
    /**
     * Reads the body of an IRI reference up to and including the closing '>'.
     * The caller has already consumed the opening '<'.
     * <p>
     * End of input, newline, carriage return, '<' and tab are reported through
     * {@code exception(...)}. A space, the characters <code>{ } " | ^ `</code> and
     * control characters up to 0x19 are reported through {@code warning(...)} and,
     * if that returns, kept in the result. Each offending character is reported once.
     *
     * @return the IRI text without the enclosing angle brackets
     */
    private String readIRI() {
        stringBuilder.setLength(0) ;
        for (;;) {
            int ch = reader.readChar() ;
            switch(ch) {
                // The exception(...) cases have no break: they rely on exception(...)
                // not returning normally.
                case EOF:
                    exception("Broken IRI (End of file)") ;
                case NL:
                    exception("Broken IRI (newline): %s", stringBuilder.toString()) ;
                case CR:
                    exception("Broken IRI (CR): %s", stringBuilder.toString()) ;
                case CH_GT:
                    // Done!
                    return stringBuilder.toString() ;
                case CH_RSLASH:
                    if ( VeryVeryLaxIRI )
                        // Includes unicode escapes and also \n etc 
                        ch = readLiteralEscape() ;
                    else
                        // NORMAL
                        ch = readUnicodeEscape() ;
                    // Don't check legality of ch (strict syntax at this point).
                    // That does not mean it is a good idea to bypass checking.
                    // Bad characters will lead to trouble elsewhere.
                    break ;
                case CH_LT:
                    // Probably a corrupt file so not a warning.
                    exception("Bad character in IRI (bad character: '<'): <%s<...>", stringBuilder.toString()) ;
                case TAB:
                    exception("Bad character in IRI (Tab character): <%s[tab]...>", stringBuilder.toString()) ;
                case SPC:
                    warning("Bad character in IRI (space): <%s[space]...>", stringBuilder.toString()) ;
                    // Stop here so the space is not reported a second time by the next case.
                    break ;
                case '{': case '}': case '"': case '|': case '^': case '`' :
                    if ( ! VeryVeryLaxIRI )
                        warning("Illegal character in IRI (codepoint 0x%02X, '%c'): <%s[%c]...>", ch, (char)ch, stringBuilder.toString(), (char)ch) ;
                    break ;
                default:
                    // NOTE(review): the IRIREF grammar excludes #x00-#x20, but 0x1A-0x1F pass this test silently.
                    if ( ch <= 0x19 )
                        warning("Illegal character in IRI (control char 0x%02X): %s", ch, stringBuilder.toString()) ;
            }
            insertCodepoint(stringBuilder, ch) ;
        }
    }
    
    // Read a unicode escape : does not allow \\ bypass
    /**
     * Reads the remainder of a unicode escape; the caller has consumed the backslash.
     * Only 'u' and 'U' are accepted as the escape letter.
     *
     * @return the value produced by {@code readUnicode4Escape()} or {@code readUnicode8Escape()};
     *         0 only if {@code exception(...)} were to return
     */
    private final int readUnicodeEscape() {
        final int marker = reader.readChar() ;
        if ( marker == EOF )
            exception("Broken escape sequence") ;

        if ( marker == 'u' )
            return readUnicode4Escape() ;
        if ( marker == 'U' )
            return readUnicode8Escape() ;

        exception("Illegal unicode escape sequence value: \\%c (0x%02X)", marker, marker) ;
        return 0 ;
    }

    /**
     * Reads either a keyword or a prefixed name into {@code token}.
     * The leading segment becomes the image and the type starts as KEYWORD; if a ':'
     * follows, the type becomes PREFIXED_NAME and the local part is stored as image2.
     * If the reader position did not move at all, {@code exception(...)} is called.
     *
     * @param token the token to fill in
     */
    private void readPrefixedNameOrKeyword(Token token) {
        final long startPosn = reader.getPosition() ;

        // Prefix part or keyword.
        token.setImage(readPrefixPart()) ;
        token.setType(TokenType.KEYWORD) ;

        final int nextCh = reader.peekChar() ;
        if ( nextCh == CH_COLON ) {
            reader.readChar() ;
            token.setType(TokenType.PREFIXED_NAME) ;
            // Local part.
            token.setImage2(readLocalPart()) ;
            if ( Checking )
                checkPrefixedName(token.getImage(), token.getImage2()) ;
        }

        // No characters consumed: neither a keyword nor a prefixed name was found.
        if ( reader.getPosition() == startPosn )
            exception("Failed to find a prefix name or keyword: %c(%d;0x%04X)", nextCh, nextCh, nextCh) ;

        if ( Checking )
            checkKeyword(token.getImage()) ;
    }
    
    /*
    The token rules from SPARQL and Turtle.
    PNAME_NS       ::=  PN_PREFIX? ':'
    PNAME_LN       ::=  PNAME_NS PN_LOCAL
    
    PN_CHARS_BASE  ::=  [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
    PN_CHARS_U  ::=  PN_CHARS_BASE | '_'
    PN_CHARS  ::=  PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]
    
    PN_PREFIX  ::=  PN_CHARS_BASE ((PN_CHARS|'.')* PN_CHARS)?
    PN_LOCAL  ::=  (PN_CHARS_U | ':' | [0-9] | PLX ) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX) )?
    PLX  ::=  PERCENT | PN_LOCAL_ESC
    PERCENT  ::=  '%' HEX HEX
    HEX  ::=  [0-9] | [A-F] | [a-f]
    PN_LOCAL_ESC  ::=  '\' ( '_' | '~' | '.' | '-' | '!' | '$' | '&' | "'" | '(' | ')' | '*' | '+' | ',' | ';' | '=' | '/' | '?' | '#' | '@' | '%' )
    */
    
    /** Reads a prefix segment (or keyword) by calling {@code readSegment(false)}. */
    private String readPrefixPart() {
        return readSegment(false) ;
    }
    
    /** Reads the local part of a prefixed name by calling {@code readSegment(true)}. */
    private String readLocalPart() {
        return readSegment(true) ;
    }

    /**
     * Reads one segment of a prefixed name (or a keyword) into the shared buffer.
     * <pre>
     * Prefix: PN_CHARS_BASE                       ((PN_CHARS|'.')* PN_CHARS)?
     * Local: ( PN_CHARS_U | ':' | [0-9] | PLX )   ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX) )?
     * </pre>
     * A '.' is never appended immediately: it is held until the following character
     * shows that the segment continues. A dot still held when the segment ends is
     * pushed back to the reader so it is not part of the result.
     *
     * @param isLocalPart true to apply the local-part rules (':' and PLX allowed),
     *                    false for the prefix rules
     * @return the segment text; "" at end of input or if the first character is not acceptable
     */
    private String readSegment(boolean isLocalPart) {
        stringBuilder.setLength(0) ;

        // -- First character
        final int first = reader.peekChar() ;
        if ( first == EOF )
            return "" ;

        if ( !isLocalPart ) {
            if ( !RiotChars.isPNCharsBase(first) )
                return "" ;
            reader.readChar() ;
            stringBuilder.append((char)first) ;
        } else if ( first == CH_PERCENT || first == CH_RSLASH ) {
            // PLX: percent-encoding or character escape.
            reader.readChar() ;
            processPLX(first) ;
        } else if ( first == CH_COLON || RiotChars.isPNChars_U_N(first) ) {
            reader.readChar() ;
            stringBuilder.append((char)first) ;
        } else
            return "" ;

        // -- Remaining characters
        boolean dotHeld = false ;

        while ( true ) {
            final int ch = reader.peekChar() ;

            if ( isLocalPart && (ch == CH_PERCENT || ch == CH_RSLASH) ) {
                reader.readChar() ;
                if ( dotHeld ) {
                    stringBuilder.append((char)CH_DOT) ;
                    dotHeld = false ;
                }
                processPLX(ch) ;
                continue ;
            }

            final boolean acceptable = (isLocalPart && ch == CH_COLON) || isPNChars(ch) || ch == CH_DOT ;
            if ( !acceptable )
                break ;

            reader.readChar() ;
            // A dot from the previous round is now known to be interior: emit it.
            if ( dotHeld ) {
                stringBuilder.append((char)CH_DOT) ;
                dotHeld = false ;
            }

            if ( ch == CH_DOT )
                // Hold the dot until the next round.
                dotHeld = true ;
            else
                stringBuilder.append((char)ch) ;
        }

        // A trailing dot does not belong to the segment: unread it.
        if ( dotHeld )
            reader.pushbackChar(CH_DOT) ;
        return stringBuilder.toString() ;
    }

    // Process PLX (percent or character escape for a prefixed name)
    /**
     * Appends one PLX item to the shared buffer. The caller has already consumed
     * the introducing character and passes it in.
     * <ul>
     * <li>'%': appends the '%' followed by the next two characters, each of which must be a hex digit.</li>
     * <li>'\': appends the character returned by {@code readCharEscape()}.</li>
     * </ul>
     *
     * @param ch the introducing character, '%' or '\'
     * @throws ARQInternalErrorException if {@code ch} is neither of those
     */
    private void processPLX(int ch)
    {
        if ( ch == CH_PERCENT ) {
            stringBuilder.append((char)ch) ;
            // Exactly two hex digits follow the percent sign.
            for ( int i = 0 ; i < 2 ; i++ ) {
                int hex = reader.peekChar() ;
                if ( ! isHexChar(hex) )
                    exception("Not a hex charcater: '%c'",hex) ;
                stringBuilder.append((char)hex) ;
                reader.readChar() ;
            }
            return ;
        }

        if ( ch == CH_RSLASH ) {
            int escaped = readCharEscape() ;
            stringBuilder.append((char)escaped) ;
            return ;
        }

        throw new ARQInternalErrorException("Not a '\\' or a '%' character") ;
    }
    
    // Get characters between two markers.
    // strEscapes may be processed
    /**
     * Reads a single-line string up to and including its closing delimiter.
     * The opening delimiter has already been consumed by the caller. Backslash
     * escapes are decoded with {@code readLiteralEscape()}. End of input or a
     * newline before the delimiter is reported via {@code exception(...)} using
     * the line and column where the string started.
     *
     * @param startCh the opening delimiter (not used by this method)
     * @param endCh   the closing delimiter
     * @return the string contents without delimiters
     */
    private String readString(int startCh, int endCh) {
        final long startLine = getLine() ;
        final long startCol = getColumn() ;
        stringBuilder.setLength(0) ;

        while ( true ) {
            int c = reader.readChar() ;
            if ( c == EOF )
                exception("Broken token: " + stringBuilder.toString(), startLine, startCol) ;
            else if ( c == NL )
                exception("Broken token (newline): " + stringBuilder.toString(), startLine, startCol) ;

            if ( c == endCh )
                return stringBuilder.toString() ;

            int codepoint = (c == CH_RSLASH) ? readLiteralEscape() : c ;
            insertCodepoint(stringBuilder, codepoint) ;
        }
    }

    /**
     * Reads a long string; the opening delimiter has been consumed by the caller.
     * Reading stops when a quote character is seen and {@code threeQuotes(quoteChar)}
     * then returns true. Backslash escapes are decoded with {@code readLiteralEscape()}.
     *
     * @param quoteChar the quote character in use
     * @param endNL     when true, end of input ends the string normally instead of being an error
     * @return the string contents
     */
    private String readLongString(int quoteChar, boolean endNL) {
        stringBuilder.setLength(0) ;
        while ( true ) {
            int c = reader.readChar() ;

            if ( c == EOF ) {
                if ( endNL )
                    return stringBuilder.toString() ;
                exception("Broken long string") ;
            }

            if ( c == quoteChar && threeQuotes(quoteChar) )
                return stringBuilder.toString() ;

            if ( c == CH_RSLASH )
                c = readLiteralEscape() ;
            insertCodepoint(stringBuilder, c) ;
        }
    }

    /**
     * Reads a word with no leading sign allowed.
     *
     * @param leadingDigitAllowed whether the word may begin with a digit
     * @return the word read, possibly ""
     */
    private String readWord(boolean leadingDigitAllowed) {
        return readWordSub(leadingDigitAllowed, false) ;
    }

    // A 'word' is used in several places:
    //   keyword
    //   prefix part of prefix name 
    //   local part of prefix name (allows digits)
    
    /** Characters accepted in a word in addition to alphanumerics. */
    private static char[] extraCharsWord = { '_', '.', '-' } ;
    
    /**
     * Reads a word made of alphanumerics plus {@code extraCharsWord}; trailing dots
     * are not kept in the result.
     *
     * @param leadingDigitAllowed whether the word may begin with a digit
     * @param leadingSignAllowed  whether the word may begin with '+' or '-'
     * @return the word read, possibly ""
     */
    private String readWordSub(boolean leadingDigitAllowed, boolean leadingSignAllowed) {
        final boolean allowFinalDot = false ;
        return readCharsAnd(leadingDigitAllowed, leadingSignAllowed, extraCharsWord, allowFinalDot) ;
    }

    /** Characters accepted in a variable name in addition to alphanumerics. */
    private static char[] extraCharsVar = { '_', '.', '-', '?', '@', '+' } ;

    /**
     * Reads a variable name: alphanumerics plus {@code extraCharsVar}, with a leading
     * digit, a leading sign and a trailing dot all permitted.
     *
     * @return the name read, possibly ""
     */
    private String readVarName() {
        final boolean leadingDigitAllowed = true ;
        final boolean leadingSignAllowed = true ;
        final boolean allowFinalDot = true ;
        return readCharsAnd(leadingDigitAllowed, leadingSignAllowed, extraCharsVar, allowFinalDot) ;
    }
    
    // See also readBlankNodeLabel
    
    /**
     * Reads a run of alphanumeric characters plus any characters in {@code extraChars}.
     *
     * @param leadingDigitAllowed if false and the next character is a digit, nothing is read
     * @param leadingSignAllowed  if false and the next character is '+' or '-', nothing is read
     * @param extraChars          additional characters to accept
     * @param allowFinalDot       if false, trailing '.' characters are removed from the result
     *                            and pushed back to the reader
     * @return the characters read, possibly ""
     */
    private String readCharsAnd(boolean leadingDigitAllowed, boolean leadingSignAllowed, char[] extraChars,
                                boolean allowFinalDot) {
        stringBuilder.setLength(0) ;

        if ( !leadingDigitAllowed && Character.isDigit(reader.peekChar()) )
            return "" ;

        // Used for local part of prefix names =>
        if ( !leadingSignAllowed ) {
            int first = reader.peekChar() ;
            if ( first == '-' || first == '+' )
                return "" ;
        }

        while ( true ) {
            int ch = reader.peekChar() ;
            boolean accepted = isAlphaNumeric(ch) || Chars.charInArray(ch, extraChars) ;
            if ( !accepted )
                // Inappropriate character.
                break ;
            reader.readChar() ;
            stringBuilder.append((char)ch) ;
        }

        if ( !allowFinalDot ) {
            // BAD : assumes pushbackChar is infinite.
            // Strip every trailing dot, handing each one back to the reader.
            int len = stringBuilder.length() ;
            while ( len > 0 && stringBuilder.charAt(len - 1) == CH_DOT ) {
                reader.pushbackChar(CH_DOT) ;
                len-- ;
                stringBuilder.setLength(len) ;
            }
        }
        return stringBuilder.toString() ;
    }

    // BLANK_NODE_LABEL    ::=     '_:' (PN_CHARS_U | [0-9]) ((PN_CHARS | '.')* PN_CHARS)?

    /**
     * Reads a blank node label; the caller has already consumed the "_:" prefix.
     * The first character must satisfy {@code isPNChars_U_N}; later characters may be
     * PN_CHARS or '.'. A '.' is held back until the next character proves the label
     * continues, and a dot still held at the end is pushed back to the reader.
     *
     * @return the label text
     */
    private String readBlankNodeLabel() {
        stringBuilder.setLength(0) ;

        // First character.
        final int first = reader.peekChar() ;
        if ( first == EOF )
            exception("Blank node label missing (EOF found)") ;
        if ( isWhitespace(first) )
            exception("Blank node label missing") ;
        // Not strict: accepts whatever isPNChars_U_N accepts.
        if ( !RiotChars.isPNChars_U_N(first) )
            exception("Blank node label does not start with alphabetic or _ :" + (char)first) ;
        reader.readChar() ;
        stringBuilder.append((char)first) ;

        // Remainder. DOT can't be last so it is delayed by one round.
        boolean dotHeld = false ;

        while ( true ) {
            final int ch = reader.peekChar() ;
            if ( ch == EOF )
                break ;
            if ( !RiotChars.isPNChars(ch) && ch != CH_DOT )
                break ;
            reader.readChar() ;

            if ( dotHeld ) {
                stringBuilder.append((char)CH_DOT) ;
                dotHeld = false ;
            }

            if ( ch == CH_DOT )
                // Hold the dot until the next round.
                dotHeld = true ;
            else
                stringBuilder.append((char)ch) ;
        }

        // A trailing dot is not part of the label: unread it.
        if ( dotHeld )
            reader.pushbackChar(CH_DOT) ;

        return stringBuilder.toString() ;
    }

    /*
     * [146]  INTEGER  ::=  [0-9]+
     * [147]  DECIMAL  ::=  [0-9]* '.' [0-9]+
     * [148]  DOUBLE  ::=  [0-9]+ '.' [0-9]* EXPONENT | '.' ([0-9])+ EXPONENT | ([0-9])+ EXPONENT
     * []     hex             ::=     0x0123456789ABCDEFG
     */
    /**
     * Reads a numeric token and stores its image and type in the {@code token} field.
     * <p>
     * Order of work: an optional sign or a leading '0' (which may introduce a
     * "0x"/"0X" hex number), digits, an optional '.' with more digits, then an
     * optional exponent. The resulting type is HEX, DOUBLE (exponent present),
     * DECIMAL (dot present and not last) or INTEGER. If the text read ends in a
     * '.', that dot is pushed back to the reader and the number is an INTEGER.
     */
    private void readNumber() {
        // On entry, definitely a number.
        // Beware of '.' as a (non) decimal.
        stringBuilder.setLength(0) ;

        int leadingDigits = 0 ; // Digits before a dot.
        int ch = reader.peekChar() ;
        if ( ch == '-' || ch == '+' ) {
            readPossibleSign(stringBuilder) ;
        } else if ( ch == '0' ) {
            leadingDigits++ ;
            reader.readChar() ;
            stringBuilder.append((char)ch) ;
            ch = reader.peekChar() ;
            if ( ch == 'x' || ch == 'X' ) {
                // Hex number: finish it here.
                reader.readChar() ;
                stringBuilder.append((char)ch) ;
                readHex(reader, stringBuilder) ;
                token.setImage(stringBuilder.toString()) ;
                token.setType(TokenType.HEX) ;
                return ;
            }
        }

        leadingDigits += readDigits(stringBuilder) ;

        boolean sawDot = false ;
        if ( reader.peekChar() == CH_DOT ) {
            reader.readChar() ;
            stringBuilder.append(CH_DOT) ;
            sawDot = true ; // Includes things that will be doubles.
            readDigits(stringBuilder) ;
        }

        if ( leadingDigits == 0 && !sawDot )
            // Possible a tokenizer error - should not have entered readNumber
            // in the first place.
            exception("Unrecognized as number") ;

        TokenType numberType ;
        if ( exponent(stringBuilder) ) {
            numberType = TokenType.DOUBLE ;
        } else if ( sawDot ) {
            // Final part - "decimal" 123. is an integer 123 and a DOT.
            int lastIdx = stringBuilder.length() - 1 ;
            if ( stringBuilder.charAt(lastIdx) == CH_DOT ) {
                stringBuilder.setLength(lastIdx) ;
                reader.pushbackChar(CH_DOT) ;
                numberType = TokenType.INTEGER ;
            } else {
                numberType = TokenType.DECIMAL ;
            }
        } else {
            numberType = TokenType.INTEGER ;
        }

        token.setImage(stringBuilder.toString()) ;
        token.setType(numberType) ;
    }

    /**
     * Consumes hex characters from {@code reader}, appending each to {@code sb}.
     * Called just after the "0x" prefix, which is already in {@code sb}.
     * Raises a parse error if no hex character follows.
     */
    private static void readHex(PeekReader reader, StringBuilder sb) {
        int digits = 0 ;
        for ( int ch = reader.peekChar() ; isHexChar(ch) ; ch = reader.peekChar() ) {
            reader.readChar() ;
            sb.append((char)ch) ;
            digits++ ;
        }
        if ( digits == 0 )
            exception(reader, "No hex characters after " + sb.toString()) ;
    }

    /**
     * Consumes a run of decimal digits ('0'..'9') from the reader and appends
     * them to {@code buffer}.
     *
     * @return the number of digits consumed (possibly zero)
     */
    private int readDigits(StringBuilder buffer) {
        int consumed = 0 ;
        int ch = reader.peekChar() ;
        while ( range(ch, '0', '9') ) {
            reader.readChar() ;
            buffer.append((char)ch) ;
            consumed++ ;
            ch = reader.peekChar() ;
        }
        return consumed ;
    }

    /**
     * If the next character is '+' or '-', consumes it and appends it to
     * {@code sb}; otherwise leaves the reader and {@code sb} untouched.
     */
    private void readPossibleSign(StringBuilder sb) {
        int sign = reader.peekChar() ;
        if ( sign != '-' && sign != '+' )
            return ;
        reader.readChar() ;
        sb.append((char)sign) ;
    }

    // Assume have read the first quote char.
    // On return:
    //   If false, have moved over no more characters (due to pushbacks) 
    //   If true, at end of 3 quotes
    /**
     * Tests whether the quote character {@code ch}, whose first occurrence
     * has already been read, is followed by two more of the same.
     *
     * @return true if two further quotes were consumed; false if not, in
     *         which case nothing more has been consumed overall
     */
    private boolean threeQuotes(int ch) {
        int second = reader.peekChar() ;
        if ( second != ch )
            return false ;

        // Tentatively take the second quote to look at what follows it.
        reader.readChar() ;
        if ( reader.peekChar() == ch ) {
            // Third quote present: consume it as well.
            reader.readChar() ;
            return true ;
        }

        // Only two quotes: give the second one back.
        reader.pushbackChar(second) ;
        return false ;
    }

    /**
     * Reads an optional exponent part ('e' or 'E', optional sign, digits)
     * and appends it to {@code sb}.
     * Raises a parse error if the exponent marker is not followed by at
     * least one digit.
     *
     * @return true if an exponent was read, false if the next character is
     *         not an exponent marker (nothing is consumed in that case)
     */
    private boolean exponent(StringBuilder sb) {
        int marker = reader.peekChar() ;
        boolean isMarker = ( marker == 'e' || marker == 'E' ) ;
        if ( !isMarker )
            return false ;

        reader.readChar() ;
        sb.append((char)marker) ;
        readPossibleSign(sb) ;
        if ( readDigits(sb) == 0 )
            exception("Malformed double: " + sb) ;
        return true ;
    }

    /**
     * Reads a language tag: a non-empty run accepted by {@code a2z},
     * followed by zero or more '-' separated non-empty runs accepted by
     * {@code a2zN}. Raises a parse error if any part is empty.
     *
     * @return the interned language tag string
     */
    private String langTag() {
        stringBuilder.setLength(0) ;
        a2z(stringBuilder) ;
        if ( stringBuilder.length() == 0 )
            exception("Bad language tag") ;

        while ( reader.peekChar() == '-' ) {
            reader.readChar() ;
            stringBuilder.append('-') ;
            // Remember the length so an empty subtag can be detected.
            int mark = stringBuilder.length() ;
            a2zN(stringBuilder) ;
            if ( stringBuilder.length() == mark )
                exception("Bad language tag") ;
        }
        return stringBuilder.toString().intern() ;
    }
    
    // ASCII-only e.g. in lang tags.
    /**
     * Consumes characters from the reader for as long as {@code isA2Z}
     * accepts them, appending each to {@code sb2}.
     *
     * @param sb2 the buffer that receives the consumed characters
     */
    private void a2z(StringBuilder sb2) {
        for (;;) {
            int ch = reader.peekChar() ;
            if ( !isA2Z(ch) )
                return ;
            reader.readChar() ;
            // Write to the buffer supplied by the caller; previously the
            // parameter was ignored and the stringBuilder field was used.
            sb2.append((char)ch) ;
        }
    }

    /**
     * Consumes characters from the reader for as long as {@code isA2ZN}
     * accepts them, appending each to {@code sb2}.
     *
     * @param sb2 the buffer that receives the consumed characters
     */
    private void a2zN(StringBuilder sb2) {
        for (;;) {
            int ch = reader.peekChar() ;
            if ( !isA2ZN(ch) )
                return ;
            reader.readChar() ;
            // Write to the buffer supplied by the caller; previously the
            // parameter was ignored and the stringBuilder field was used.
            sb2.append((char)ch) ;
        }
    }

    /**
     * Appends the code point {@code ch} to {@code buffer}.
     * A value that fits in one UTF-16 unit is appended as a single char;
     * otherwise it is appended as a surrogate pair, after checking that it
     * is a supplementary or defined code point (a parse error is raised if
     * it is neither).
     */
    private void insertCodepoint(StringBuilder buffer, int ch) {
        if ( Character.charCount(ch) == 1 ) {
            buffer.append((char)ch) ;
            return ;
        }

        // Convert to UTF-16. Note that the rest of any system this is used
        // in must also respect codepoints and surrogate pairs.
        boolean acceptable = Character.isSupplementaryCodePoint(ch) || Character.isDefined(ch) ;
        if ( !acceptable )
            exception("Illegal codepoint: 0x%04X", ch) ;
        buffer.append(Character.toChars(ch)) ;
    }

    @Override
    /** Returns the column number currently reported by the underlying reader. */
    public long getColumn() {
        final long column = reader.getColNum() ;
        return column ;
    }

    @Override
    /** Returns the line number currently reported by the underlying reader. */
    public long getLine() {
        final long line = reader.getLineNum() ;
        return line ;
    }

    // ---- Routines to check tokens

    /** Passes the blank node label to the checker, if one is set; otherwise does nothing. */
    private void checkBlankNode(String blankNodeLabel) {
        if ( checker == null )
            return ;
        checker.checkBlankNode(blankNodeLabel) ;
    }

    /** Passes the lexical form and language tag to the checker, if one is set; otherwise does nothing. */
    private void checkLiteralLang(String lexicalForm, String langTag) {
        if ( checker == null )
            return ;
        checker.checkLiteralLang(lexicalForm, langTag) ;
    }

    /** Passes the lexical form and datatype token to the checker, if one is set; otherwise does nothing. */
    private void checkLiteralDT(String lexicalForm, Token datatype) {
        if ( checker == null )
            return ;
        checker.checkLiteralDT(lexicalForm, datatype) ;
    }

    /** Passes the string to the checker, if one is set; otherwise does nothing. */
    private void checkString(String string) {
        if ( checker == null )
            return ;
        checker.checkString(string) ;
    }

    /** Passes the URI string to the checker, if one is set; otherwise does nothing. */
    private void checkURI(String uriStr) {
        if ( checker == null )
            return ;
        checker.checkURI(uriStr) ;
    }

    /** Passes the number image and datatype to the checker, if one is set; otherwise does nothing. */
    private void checkNumber(String image, String datatype) {
        if ( checker == null )
            return ;
        checker.checkNumber(image, datatype) ;
    }

    /** Passes the variable token image to the checker, if one is set; otherwise does nothing. */
    private void checkVariable(String tokenImage) {
        if ( checker == null )
            return ;
        checker.checkVariable(tokenImage) ;
    }

    /** Passes the directive code to the checker, if one is set; otherwise does nothing. */
    private void checkDirective(int cntrlCode) {
        if ( checker == null )
            return ;
        checker.checkDirective(cntrlCode) ;
    }

    /** Passes the keyword token image to the checker, if one is set; otherwise does nothing. */
    private void checkKeyword(String tokenImage) {
        if ( checker == null )
            return ;
        checker.checkKeyword(tokenImage) ;
    }

    /** Passes both token images of a prefixed name to the checker, if one is set; otherwise does nothing. */
    private void checkPrefixedName(String tokenImage, String tokenImage2) {
        if ( checker == null )
            return ;
        checker.checkPrefixedName(tokenImage, tokenImage2) ;
    }

    /** Passes the control code to the checker, if one is set; otherwise does nothing. */
    private void checkControl(int code) {
        if ( checker == null )
            return ;
        checker.checkControl(code) ;
    }

    // ---- Escape sequences

    /**
     * Reads the character(s) following a backslash inside a literal and
     * returns the character the escape stands for. Handles the single
     * character escapes n, r, t, f, b, the quote characters and backslash,
     * and the \\u / \\U hex forms. Raises a parse error at end of input or
     * for any other escape character.
     */
    private final int readLiteralEscape() {
        final int esc = reader.readChar() ;
        if ( esc == EOF )
            exception("Escape sequence not completed") ;

        int value ;
        switch (esc) {
            case 'n':
                value = NL ;
                break ;
            case 'r':
                value = CR ;
                break ;
            case 't':
                value = '\t' ;
                break ;
            case 'f':
                value = '\f' ;
                break ;
            case 'b':
                value = BSPACE ;
                break ;
            case '"':
            case '\'':
            case '\\':
                // These escapes stand for themselves.
                value = esc ;
                break ;
            case 'u':
                value = readUnicode4Escape() ;
                break ;
            case 'U':
                value = readUnicode8Escape() ;
                break ;
            default:
                exception("Illegal escape sequence value: %c (0x%02X)", esc, esc);
                value = 0 ;
                break ;
        }
        return value ;
    }

    /**
     * Reads the character following a backslash in a local name and returns
     * it if it is one of the escapable characters; raises a parse error at
     * end of input or for any other character.
     */
    private final int readCharEscape() {
        // PN_LOCAL_ESC ::= '\' ( '_' | '~' | '.' | '-' | '!' | '$' | '&' | "'"
        // | '(' | ')' | '*' | '+' | ',' | ';' | '=' | '/' | '?' | '#' | '@' |
        // '%' )
        final String escapable = "_~.-!$&'()*+,;=/?#@%" ;

        int escaped = reader.readChar() ;
        if ( escaped == EOF )
            exception("Escape sequence not completed") ;

        if ( escapable.indexOf(escaped) >= 0 )
            return escaped ;

        exception("illegal character escape value: \\%c", escaped);
        return 0 ;
    }
    
    /** Reads the four hex digits of a \\u escape and returns their value. */
    private final int readUnicode4Escape() {
        final int value = readHexSequence(4) ;
        return value ;
    }
    
    /**
     * Reads the eight hex digits of a \\U escape and returns their value.
     * Raises a parse error if the value is not in the range
     * 0 .. {@link Character#MAX_CODE_POINT}.
     */
    private final int readUnicode8Escape() {
        int ch8 = readHexSequence(8) ;
        // Eight hex digits fill all 32 bits of the int, so inputs such as
        // \UFFFFFFFF arrive here as negative numbers; reject those as well
        // as values above the Unicode range.
        if ( ch8 < 0 || ch8 > Character.MAX_CODE_POINT )
            exception("Illegal code point in \\U sequence value: 0x%08X", ch8) ;
        return ch8 ;
    }

    /**
     * Reads {@code N} hex digits via {@code readHexChar} and combines them,
     * most significant digit first, into one int.
     *
     * @return the accumulated value, or -1 if {@code readHexChar} returned a
     *         negative value
     */
    private final int readHexSequence(int N) {
        int value = 0 ;
        int remaining = N ;
        while ( remaining-- > 0 ) {
            int digit = readHexChar() ;
            if ( digit < 0 )
                return -1 ;
            value = (value << 4) + digit ;
        }
        return value ;
    }

    /**
     * Reads one character and returns its value as a hex digit, as computed
     * by {@code valHexChar}. Raises a parse error at end of input or if the
     * character is not a hex digit.
     */
    private final int readHexChar() {
        int ch = reader.readChar() ;
        if ( ch == EOF )
            exception("Not a hexadecimal character (end of file)") ;

        int x = valHexChar(ch) ;
        if ( x != -1 )
            return x ;
        // Pass the offending character as a format argument: the message is
        // fed to String.format, so concatenating an input '%' into it would
        // raise a format exception instead of the intended parse error.
        exception("Not a hexadecimal character: %c", (char)ch) ;
        return -1 ;
    }

    /**
     * Consumes the characters of {@code str} from the reader, one by one.
     * Raises a parse error if the input ends early or a character differs;
     * the mismatching character is not consumed.
     *
     * @return true when the whole string has been matched and consumed
     */
    private boolean expect(String str) {
        final int length = str.length() ;
        int pos = 0 ;
        while ( pos < length ) {
            if ( reader.eof() ) {
                exception("End of input during expected string: " + str) ;
                return false ;
            }
            if ( reader.peekChar() != str.charAt(pos) ) {
                exception("expected \"" + str + "\"") ;
                return false ;
            }
            reader.readChar() ;
            pos++ ;
        }
        return true ;
    }

    /**
     * Reports a warning. Warnings are currently handled exactly like errors:
     * the call is passed straight on to {@code exception}.
     */
    private void warning(String message, Object... args) {
        this.exception(message, args) ;
    }
    
    /**
     * Raises a parse error located at the current line and column of this
     * tokenizer's reader. {@code message} is a format string applied to
     * {@code args}.
     */
    private void exception(String message, Object... args) {
        final long line = reader.getLineNum() ;
        final long col = reader.getColNum() ;
        exception$(message, line, col, args) ;
    }

    /**
     * Raises a parse error located at the current line and column of the
     * given reader. {@code message} is a format string applied to
     * {@code args}.
     */
    private static void exception(PeekReader reader, String message, Object... args) {
        final long line = reader.getLineNum() ;
        final long col = reader.getColNum() ;
        exception$(message, line, col, args) ;
    }

    /**
     * Throws a {@code RiotParseException} for the given position.
     * {@code message} is formatted with {@code args} using
     * {@link String#format(String, Object...)}. If the message is not a valid
     * format string for those arguments (for example because input text
     * containing '%' was concatenated into it), the raw message is used so
     * the caller still gets a parse error, not a formatting exception.
     */
    private static void exception$(String message, long line, long col, Object... args) {
        String text ;
        try {
            text = String.format(message, args) ;
        } catch (java.util.IllegalFormatException ex) {
            // Formatting failed: report the unformatted message instead.
            text = message ;
        }
        throw new RiotParseException(text, line, col) ;
    }
}