/* * [The "BSD license"] * Copyright (c) 2011 Terence Parr * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package org.stringtemplate.v4.compiler; import java.util.ArrayList; import java.util.List; import nebula.simpletemplate.Misc; import nebula.simpletemplate.SParser; import nebula.simpletemplate.STGroup; import org.antlr.runtime.CharStream; import org.antlr.runtime.CommonToken; import org.antlr.runtime.MismatchedTokenException; import org.antlr.runtime.NoViableAltException; import org.antlr.runtime.RecognitionException; import org.antlr.runtime.Token; import org.antlr.runtime.TokenSource; import org.stringtemplate.v4.misc.ErrorManager; /** * This class represents the tokenizer for templates. It operates in two modes: * inside and outside of expressions. It implements the {@link TokenSource} * interface so it can be used with ANTLR parsers. Outside of expressions, we * can return these token types: {@link #TEXT}, {@link #INDENT}, {@link #LDELIM} * (start of expression), {@link #RCURLY} (end of subtemplate), and * {@link #NEWLINE}. Inside of an expression, this lexer returns all of the * tokens needed by {@link STParser}. From the parser's point of view, it can * treat a template as a simple stream of elements. * <p/> * This class defines the token types and communicates these values to * {@code STParser.g} via {@code STLexer.tokens} file (which must remain * consistent). */ public class STLexer implements TokenSource { public static final char EOF = (char) -1; // EOF char public static final int EOF_TYPE = CharStream.EOF; // EOF token type /** * We build {@code STToken} tokens instead of relying on {@link CommonToken} * so we can override {@link #toString()}. It just converts token types to * token names like 23 to {@code "LDELIM"}. */ @SuppressWarnings("serial") public static class STToken extends CommonToken { public STToken(CharStream input, int type, int start, int stop) { super(input, type, DEFAULT_CHANNEL, start, stop); } public STToken(int type, String text) { super(type, text); } @Override public String toString() { String channelStr = ""; if (channel > 0) { channelStr = ",channel=" + channel; } String txt = getText(); if (txt != null) txt = Misc.replaceEscapes(txt); else txt = "<no text>"; String tokenName = null; if (type == EOF_TYPE) tokenName = "EOF"; else tokenName = SParser.tokenNames[type]; return "[@" + getTokenIndex() + "," + start + ":" + stop + "='" + txt + "',<" + tokenName + ">" + channelStr + "," + line + ":" + getCharPositionInLine() + "]"; } } public static final Token SKIP = new STToken(-1, "<skip>"); // must follow STLexer.tokens file that STParser.g loads public static final int RBRACK = 17; public static final int LBRACK = 16; public static final int ELSE = 5; public static final int ELLIPSIS = 11; public static final int LCURLY = 20; public static final int BANG = 10; public static final int EQUALS = 12; public static final int TEXT = 22; public static final int ID = 25; public static final int SEMI = 9; public static final int LPAREN = 14; public static final int IF = 4; public static final int ELSEIF = 6; public static final int COLON = 13; public static final int RPAREN = 15; public static final int COMMA = 18; public static final int RCURLY = 21; public static final int ENDIF = 7; public static final int RDELIM = 24; public static final int SUPER = 8; public static final int DOT = 19; public static final int LDELIM = 23; public static final int STRING = 26; public static final int PIPE = 28; public static final int OR = 29; public static final int AND = 30; public static final int INDENT = 31; public static final int NEWLINE = 32; public static final int AT = 33; public static final int REGION_END = 34; public static final int TRUE = 35; public static final int FALSE = 36; public static final int COMMENT = 37; /** The char which delimits the start of an expression. */ char delimiterStartChar = '<'; /** The char which delimits the end of an expression. */ char delimiterStopChar = '>'; char delimiterLeadingChar = '<'; /** * This keeps track of the current mode of the lexer. Are we inside or * outside an ST expression? */ boolean scanningInsideExpr = false; /** * To be able to properly track the inside/outside mode, we need to track * how deeply nested we are in some templates. Otherwise, we know whether a * <code>'}'</code> and the outermost subtemplate to send this back to * outside mode. */ public int subtemplateDepth = 0; // start out *not* in a {...} subtemplate ErrorManager errMgr; /** template embedded in a group file? this is the template */ Token templateToken; CharStream input; /** current character */ char c; /** * When we started token, track initial coordinates so we can properly build * token objects. */ int startCharIndex; int startLine; int startCharPositionInLine; /** * Our lexer routines might have to emit more than a single token. We buffer * everything through this list. */ List<Token> tokens = new ArrayList<Token>(); public STLexer(CharStream input) { this(STGroup.DEFAULT_ERR_MGR, input, null, '<', '>'); } public STLexer(ErrorManager errMgr, CharStream input, Token templateToken) { this(errMgr, input, templateToken, '<', '>'); } public STLexer(ErrorManager errMgr, CharStream input, Token templateToken, char delimiterStartChar, char delimiterStopChar) { this.errMgr = errMgr; this.input = input; c = (char) input.LA(1); // prime lookahead this.templateToken = templateToken; this.delimiterStartChar = delimiterStartChar; this.delimiterStopChar = delimiterStopChar; switch (delimiterStopChar) { case '>': this.delimiterLeadingChar = '<'; break; case '}': this.delimiterLeadingChar = '{'; break; case ']': this.delimiterLeadingChar = '['; break; case ')': this.delimiterLeadingChar = '('; break; default: this.delimiterLeadingChar = (char) -1; break; } } @Override public Token nextToken() { Token t; if (tokens.size() > 0) { t = tokens.remove(0); } else t = _nextToken(); // System.out.println(t); return t; } /** * Consume if {@code x} is next character on the input stream. * * @throws NoViableAltException */ public void match(char x) { if (c != x) { NoViableAltException e = new NoViableAltException("", 0, 0, input); errMgr.lexerError(input.getSourceName(), "expecting '" + x + "', found '" + str(c) + "'", templateToken, e); } consume(); } protected void consume() { input.consume(); c = (char) input.LA(1); } public void emit(Token token) { tokens.add(token); } public Token _nextToken() { // System.out.println("nextToken: c="+(char)c+"@"+input.index()); while (true) { // lets us avoid recursion when skipping stuff startCharIndex = input.index(); startLine = input.getLine(); startCharPositionInLine = input.getCharPositionInLine(); if (c == EOF) return newToken(EOF_TYPE); Token t; if (scanningInsideExpr) t = inside(); else t = outside(); if (t != SKIP) return t; } } protected Token outside() { if (input.getCharPositionInLine() == 0 && (c == ' ' || c == '\t')) { while (c == ' ' || c == '\t') consume(); // scarf indent if (c != EOF) return newToken(INDENT); return newToken(TEXT); } if (c == delimiterStartChar) { consume(); if (c == delimiterLeadingChar) consume(); if (c == '!') return COMMENT(); if (c == '\\') return ESCAPE(); // <\\> <\uFFFF> <\n> etc... scanningInsideExpr = true; return newToken(LDELIM); } if (c == '\r') { consume(); consume(); return newToken(NEWLINE); } // \r\n -> \n if (c == '\n') { consume(); return newToken(NEWLINE); } if (c == '}' && subtemplateDepth > 0) { scanningInsideExpr = true; subtemplateDepth--; consume(); return newTokenFromPreviousChar(RCURLY); } return mTEXT(); } protected Token inside() { while (true) { switch (c) { case ' ': case '\t': case '\n': case '\r': consume(); return SKIP; case '.': consume(); if (input.LA(1) == '.' && input.LA(2) == '.') { consume(); match('.'); return newToken(ELLIPSIS); } return newToken(DOT); case ',': consume(); return newToken(COMMA); case ':': consume(); return newToken(COLON); case ';': consume(); return newToken(SEMI); case '(': consume(); return newToken(LPAREN); case ')': consume(); return newToken(RPAREN); case '[': consume(); return newToken(LBRACK); case ']': consume(); return newToken(RBRACK); case '=': consume(); return newToken(EQUALS); case '!': consume(); return newToken(BANG); case '@': consume(); if (c == 'e' && input.LA(2) == 'n' && input.LA(3) == 'd') { consume(); consume(); consume(); return newToken(REGION_END); } return newToken(AT); case '"': return mSTRING(); case '&': consume(); match('&'); return newToken(AND); // && case '|': consume(); match('|'); return newToken(OR); // || case '{': return subTemplate(); default: if (c == delimiterStopChar) { consume(); scanningInsideExpr = false; return newToken(RDELIM); } if (isIDStartLetter(c)) { Token id = mID(); String name = id.getText(); if (name.equals("if")) return newToken(IF); else if (name.equals("endif")) return newToken(ENDIF); else if (name.equals("else")) return newToken(ELSE); else if (name.equals("elseif")) return newToken(ELSEIF); else if (name.equals("super")) return newToken(SUPER); else if (name.equals("true")) return newToken(TRUE); else if (name.equals("false")) return newToken(FALSE); return id; } RecognitionException re = new NoViableAltException("", 0, 0, input); re.line = startLine; re.charPositionInLine = startCharPositionInLine; errMgr.lexerError(input.getSourceName(), "invalid character '" + str(c) + "'", templateToken, re); if (c == EOF) { return newToken(EOF_TYPE); } consume(); } } } Token subTemplate() { // look for "{ args ID (',' ID)* '|' ..." subtemplateDepth++; int m = input.mark(); int curlyStartChar = startCharIndex; int curlyLine = startLine; int curlyPos = startCharPositionInLine; List<Token> argTokens = new ArrayList<Token>(); consume(); Token curly = newTokenFromPreviousChar(LCURLY); WS(); argTokens.add(mID()); WS(); while (c == ',') { consume(); argTokens.add(newTokenFromPreviousChar(COMMA)); WS(); argTokens.add(mID()); WS(); } WS(); if (c == '|') { consume(); argTokens.add(newTokenFromPreviousChar(PIPE)); if (isWS(c)) consume(); // ignore a single whitespace after | // System.out.println("matched args: "+argTokens); for (Token t : argTokens) emit(t); input.release(m); scanningInsideExpr = false; startCharIndex = curlyStartChar; // reset state startLine = curlyLine; startCharPositionInLine = curlyPos; return curly; } input.rewind(m); startCharIndex = curlyStartChar; // reset state startLine = curlyLine; startCharPositionInLine = curlyPos; consume(); scanningInsideExpr = false; return curly; } Token ESCAPE() { startCharIndex = input.index(); startCharPositionInLine = input.getCharPositionInLine(); consume(); // kill \\ if (c == 'u') return UNICODE(); String text = null; switch (c) { case '\\': LINEBREAK(); return SKIP; case 'n': text = "\n"; break; case 't': text = "\t"; break; case ' ': text = " "; break; default: NoViableAltException e = new NoViableAltException("", 0, 0, input); errMgr.lexerError(input.getSourceName(), "invalid escaped char: '" + str(c) + "'", templateToken, e); consume(); match(delimiterStopChar); return SKIP; } consume(); Token t = newToken(TEXT, text, input.getCharPositionInLine() - 2); match(delimiterStopChar); return t; } Token UNICODE() { consume(); char[] chars = new char[4]; if (!isUnicodeLetter(c)) { NoViableAltException e = new NoViableAltException("", 0, 0, input); errMgr.lexerError(input.getSourceName(), "invalid unicode char: '" + str(c) + "'", templateToken, e); } chars[0] = c; consume(); if (!isUnicodeLetter(c)) { NoViableAltException e = new NoViableAltException("", 0, 0, input); errMgr.lexerError(input.getSourceName(), "invalid unicode char: '" + str(c) + "'", templateToken, e); } chars[1] = c; consume(); if (!isUnicodeLetter(c)) { NoViableAltException e = new NoViableAltException("", 0, 0, input); errMgr.lexerError(input.getSourceName(), "invalid unicode char: '" + str(c) + "'", templateToken, e); } chars[2] = c; consume(); if (!isUnicodeLetter(c)) { NoViableAltException e = new NoViableAltException("", 0, 0, input); errMgr.lexerError(input.getSourceName(), "invalid unicode char: '" + str(c) + "'", templateToken, e); } chars[3] = c; // ESCAPE kills > char uc = (char) Integer.parseInt(new String(chars), 16); Token t = newToken(TEXT, String.valueOf(uc), input.getCharPositionInLine() - 6); consume(); match(delimiterStopChar); return t; } Token mTEXT() { boolean modifiedText = false; StringBuilder buf = new StringBuilder(); while (c != EOF && c != delimiterStartChar) { if (c == '\r' || c == '\n') break; if (c == '}' && subtemplateDepth > 0) break; if (c == '\\') { if (input.LA(2) == '\\') { // convert \\ to \ consume(); consume(); buf.append('\\'); modifiedText = true; continue; } if (input.LA(2) == delimiterStartChar || input.LA(2) == '}') { modifiedText = true; consume(); // toss out \ char buf.append(c); consume(); } else { buf.append(c); consume(); } continue; } buf.append(c); consume(); } if (modifiedText) return newToken(TEXT, buf.toString()); else return newToken(TEXT); } /** * <pre> * ID : ('a'..'z'|'A'..'Z'|'_'|'/') * ('a'..'z'|'A'..'Z'|'0'..'9'|'_'|'/')* * ; * </pre> */ Token mID() { // called from subTemplate; so keep resetting position during // speculation startCharIndex = input.index(); startLine = input.getLine(); startCharPositionInLine = input.getCharPositionInLine(); consume(); while (isIDLetter(c)) { consume(); } return newToken(ID); } /** * <pre> * STRING : '"' * ( '\\' '"' * | '\\' ~'"' * | ~('\\'|'"') * )* * '"' * ; * </pre> */ Token mSTRING() { // {setText(getText().substring(1, getText().length()-1));} boolean sawEscape = false; StringBuilder buf = new StringBuilder(); buf.append(c); consume(); while (c != '"') { if (c == '\\') { sawEscape = true; consume(); switch (c) { case 'n': buf.append('\n'); break; case 'r': buf.append('\r'); break; case 't': buf.append('\t'); break; default: buf.append(c); break; } consume(); continue; } buf.append(c); consume(); if (c == EOF) { RecognitionException re = new MismatchedTokenException((int) '"', input); re.line = input.getLine(); re.charPositionInLine = input.getCharPositionInLine(); errMgr.lexerError(input.getSourceName(), "EOF in string", templateToken, re); break; } } buf.append(c); consume(); if (sawEscape) return newToken(STRING, buf.toString()); else return newToken(STRING); } void WS() { while (c == ' ' || c == '\t' || c == '\n' || c == '\r') consume(); } Token COMMENT() { match('!'); while (!(c == '!' && input.LA(2) == delimiterStopChar)) { if (c == EOF) { RecognitionException re = new MismatchedTokenException((int) '!', input); re.line = input.getLine(); re.charPositionInLine = input.getCharPositionInLine(); errMgr.lexerError(input.getSourceName(), "Nonterminated comment starting at " + startLine + ":" + startCharPositionInLine + ": '!" + delimiterStopChar + "' missing", templateToken, re); break; } consume(); } consume(); consume(); // grab !> return newToken(COMMENT); } void LINEBREAK() { match('\\'); // only kill 2nd \ as ESCAPE() kills first one match(delimiterStopChar); while (c == ' ' || c == '\t') consume(); // scarf WS after <\\> if (c == EOF) { RecognitionException re = new RecognitionException(input); re.line = input.getLine(); re.charPositionInLine = input.getCharPositionInLine(); errMgr.lexerError(input.getSourceName(), "Missing newline after newline escape <\\\\>", templateToken, re); return; } if (c == '\r') consume(); match('\n'); while (c == ' ' || c == '\t') consume(); // scarf any indent } public static boolean isIDStartLetter(char c) { return isIDLetter(c); } public static boolean isIDLetter(char c) { return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9' || c == '_' || c == '/'; } public static boolean isWS(char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; } public static boolean isUnicodeLetter(char c) { return c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F' || c >= '0' && c <= '9'; } public Token newToken(int ttype) { STToken t = new STToken(input, ttype, startCharIndex, input.index() - 1); t.setLine(startLine); t.setCharPositionInLine(startCharPositionInLine); return t; } public Token newTokenFromPreviousChar(int ttype) { STToken t = new STToken(input, ttype, input.index() - 1, input.index() - 1); t.setLine(input.getLine()); t.setCharPositionInLine(input.getCharPositionInLine() - 1); return t; } public Token newToken(int ttype, String text, int pos) { STToken t = new STToken(ttype, text); t.setStartIndex(startCharIndex); t.setStopIndex(input.index() - 1); t.setLine(input.getLine()); t.setCharPositionInLine(pos); return t; } public Token newToken(int ttype, String text) { STToken t = new STToken(ttype, text); t.setStartIndex(startCharIndex); t.setStopIndex(input.index() - 1); t.setLine(startLine); t.setCharPositionInLine(startCharPositionInLine); return t; } // public String getErrorHeader() { // return startLine+":"+startCharPositionInLine; // } // @Override public String getSourceName() { return "no idea"; } public static String str(int c) { if (c == EOF) return "<EOF>"; return String.valueOf((char) c); } }