/*
* [The "BSD license"]
* Copyright (c) 2011 Terence Parr
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.stringtemplate.v4.compiler;
import java.util.ArrayList;
import java.util.List;
import nebula.simpletemplate.Misc;
import nebula.simpletemplate.SParser;
import nebula.simpletemplate.STGroup;
import org.antlr.runtime.CharStream;
import org.antlr.runtime.CommonToken;
import org.antlr.runtime.MismatchedTokenException;
import org.antlr.runtime.NoViableAltException;
import org.antlr.runtime.RecognitionException;
import org.antlr.runtime.Token;
import org.antlr.runtime.TokenSource;
import org.stringtemplate.v4.misc.ErrorManager;
/**
* This class represents the tokenizer for templates. It operates in two modes:
* inside and outside of expressions. It implements the {@link TokenSource}
* interface so it can be used with ANTLR parsers. Outside of expressions, we
* can return these token types: {@link #TEXT}, {@link #INDENT}, {@link #LDELIM}
* (start of expression), {@link #RCURLY} (end of subtemplate), and
* {@link #NEWLINE}. Inside of an expression, this lexer returns all of the
* tokens needed by {@link STParser}. From the parser's point of view, it can
* treat a template as a simple stream of elements.
* <p/>
* This class defines the token types and communicates these values to
* {@code STParser.g} via {@code STLexer.tokens} file (which must remain
* consistent).
*/
public class STLexer implements TokenSource {
/**
 * End-of-input sentinel for the one-character lookahead {@link #c}: the
 * value {@code -1} narrowed to {@code char} (0xFFFF). The lookahead is always
 * obtained as {@code (char) input.LA(1)}; presumably the stream answers
 * {@link CharStream#EOF} once it is exhausted, which this cast maps onto the
 * sentinel.
 */
public static final char EOF = (char) -1; // EOF char
/** Token type given to the end-of-file token; an alias for {@link CharStream#EOF}. */
public static final int EOF_TYPE = CharStream.EOF; // EOF token type
/**
 * Token class produced by this lexer. It behaves like {@link CommonToken}
 * except for {@link #toString()}, which shows the symbolic token name (for
 * example {@code "LDELIM"}) instead of the numeric token type.
 */
@SuppressWarnings("serial")
public static class STToken extends CommonToken {
    /** Creates a token spanning {@code start..stop} of {@code input} on the default channel. */
    public STToken(CharStream input, int type, int start, int stop) {
        super(input, type, DEFAULT_CHANNEL, start, stop);
    }

    /** Creates a token that carries its own text instead of a stream range. */
    public STToken(int type, String text) {
        super(type, text);
    }

    /**
     * Renders the token as
     * {@code [@index,start:stop='text',<NAME>,channel=n,line:pos]}; the channel
     * part only appears for channels above 0, escapes in the text are made
     * visible via {@code Misc.replaceEscapes}, and a missing text is shown as
     * {@code <no text>}.
     */
    @Override
    public String toString() {
        String rawText = getText();
        String shownText = (rawText == null) ? "<no text>" : Misc.replaceEscapes(rawText);
        String shownName = (type == EOF_TYPE) ? "EOF" : SParser.tokenNames[type];

        StringBuilder out = new StringBuilder();
        out.append("[@").append(getTokenIndex());
        out.append(",").append(start).append(":").append(stop);
        out.append("='").append(shownText).append("',<").append(shownName).append(">");
        if (channel > 0) {
            out.append(",channel=").append(channel);
        }
        out.append(",").append(line).append(":").append(getCharPositionInLine()).append("]");
        return out.toString();
    }
}
/**
 * Sentinel returned by scanning routines that consumed input without
 * producing a real token. {@link #_nextToken()} compares against this exact
 * instance and keeps scanning whenever it receives it.
 */
public static final Token SKIP = new STToken(-1, "<skip>");
// Token type numbers, declared in no particular order. They
// must follow STLexer.tokens file that STParser.g loads
// NOTE(review): no constant is declared for the value 27 in this list;
// presumably it is assigned by the grammar itself - confirm against the
// tokens file before reusing the number.
public static final int RBRACK = 17; // ']'
public static final int LBRACK = 16; // '['
public static final int ELSE = 5; // keyword "else"
public static final int ELLIPSIS = 11; // "..."
public static final int LCURLY = 20; // '{' opening a subtemplate
public static final int BANG = 10; // '!'
public static final int EQUALS = 12; // '='
public static final int TEXT = 22; // literal text outside expressions
public static final int ID = 25; // identifier
public static final int SEMI = 9; // ';'
public static final int LPAREN = 14; // '('
public static final int IF = 4; // keyword "if"
public static final int ELSEIF = 6; // keyword "elseif"
public static final int COLON = 13; // ':'
public static final int RPAREN = 15; // ')'
public static final int COMMA = 18; // ','
public static final int RCURLY = 21; // '}' closing a subtemplate
public static final int ENDIF = 7; // keyword "endif"
public static final int RDELIM = 24; // expression stop delimiter
public static final int SUPER = 8; // keyword "super"
public static final int DOT = 19; // '.'
public static final int LDELIM = 23; // expression start delimiter
public static final int STRING = 26; // double-quoted string literal
public static final int PIPE = 28; // '|' ending a subtemplate argument list
public static final int OR = 29; // "||"
public static final int AND = 30; // "&&"
public static final int INDENT = 31; // leading spaces/tabs at column 0
public static final int NEWLINE = 32; // line break outside expressions
public static final int AT = 33; // '@'
public static final int REGION_END = 34; // "@end"
public static final int TRUE = 35; // keyword "true"
public static final int FALSE = 36; // keyword "false"
public static final int COMMENT = 37; // comment between "<!" and "!>" (with the configured delimiters)
/** The char which delimits the start of an expression. */
char delimiterStartChar = '<';
/** The char which delimits the end of an expression. */
char delimiterStopChar = '>';
/**
 * Optional second opening character. The five-argument constructor derives
 * it from {@link #delimiterStopChar}: the opening bracket that pairs with a
 * closing angle bracket, curly brace, square bracket or parenthesis, and
 * {@code (char) -1} for any other stop character. {@link #outside()} swallows
 * one occurrence of it directly after {@link #delimiterStartChar}.
 */
char delimiterLeadingChar = '<';
/**
 * This keeps track of the current mode of the lexer. Are we inside or
 * outside an ST expression?
 */
boolean scanningInsideExpr = false;
/**
 * To be able to properly track the inside/outside mode, we need to track
 * how deeply nested we are in subtemplates. {@link #subTemplate()}
 * increments this counter when it sees <code>'{'</code>; {@link #outside()}
 * decrements it when it meets <code>'}'</code> while the counter is positive,
 * emits {@link #RCURLY} and switches back to inside-expression scanning.
 * While the counter is 0 a <code>'}'</code> is just ordinary text.
 */
public int subtemplateDepth = 0; // start out *not* in a {...} subtemplate
/** Receives every error this lexer reports (via {@code lexerError}). */
ErrorManager errMgr;
/** template embedded in a group file? this is the template */
Token templateToken;
/** The character stream being tokenized. */
CharStream input;
/** current character */
char c;
/**
 * When we started token, track initial coordinates so we can properly build
 * token objects.
 */
int startCharIndex;
int startLine;
int startCharPositionInLine;
/**
 * Our lexer routines might have to emit more than a single token. We buffer
 * everything through this list; {@link #nextToken()} hands out buffered
 * tokens first and only scans further input once the list is empty.
 */
List<Token> tokens = new ArrayList<Token>();
/**
 * Creates a lexer over {@code input} that reports to
 * {@code STGroup.DEFAULT_ERR_MGR}, has no enclosing template token and uses
 * the default angle-bracket delimiters.
 *
 * @param input the characters to tokenize
 */
public STLexer(CharStream input) {
    // The three-argument constructor supplies the default delimiters.
    this(STGroup.DEFAULT_ERR_MGR, input, null);
}
/**
 * Creates a lexer that uses the default angle-bracket expression delimiters.
 *
 * @param errMgr        receiver of lexer error reports
 * @param input         the characters to tokenize
 * @param templateToken token passed along with every error report; the
 *                      single-argument constructor passes {@code null} here
 */
public STLexer(ErrorManager errMgr, CharStream input, Token templateToken) {
this(errMgr, input, templateToken, '<', '>');
}
public STLexer(ErrorManager errMgr, CharStream input, Token templateToken, char delimiterStartChar, char delimiterStopChar) {
this.errMgr = errMgr;
this.input = input;
c = (char) input.LA(1); // prime lookahead
this.templateToken = templateToken;
this.delimiterStartChar = delimiterStartChar;
this.delimiterStopChar = delimiterStopChar;
switch (delimiterStopChar) {
case '>':
this.delimiterLeadingChar = '<';
break;
case '}':
this.delimiterLeadingChar = '{';
break;
case ']':
this.delimiterLeadingChar = '[';
break;
case ')':
this.delimiterLeadingChar = '(';
break;
default:
this.delimiterLeadingChar = (char) -1;
break;
}
}
/**
 * Returns the next token: the oldest buffered token if any are pending,
 * otherwise the result of scanning more input with {@link #_nextToken()}.
 */
@Override
public Token nextToken() {
    if (tokens.isEmpty()) {
        return _nextToken();
    }
    return tokens.remove(0);
}
/**
 * Consumes the current character, first reporting a lexer error to the
 * error manager when that character is not {@code x}.
 * <p>
 * Nothing is thrown from here: the {@link NoViableAltException} is only
 * created to be handed to the error manager, and the character is consumed
 * whether or not it matched.
 *
 * @param x the character expected at the current position
 */
public void match(char x) {
    boolean asExpected = (c == x);
    if (!asExpected) {
        String message = "expecting '" + x + "', found '" + str(c) + "'";
        NoViableAltException mismatch = new NoViableAltException("", 0, 0, input);
        errMgr.lexerError(input.getSourceName(), message, templateToken, mismatch);
    }
    consume();
}
/**
 * Advances the input by one character and refreshes the lookahead
 * {@link #c} from the new position.
 */
protected void consume() {
input.consume();
// narrowing to char is what turns an end-of-input lookahead into the EOF sentinel
c = (char) input.LA(1);
}
/**
 * Appends {@code token} to the pending-token buffer; {@link #nextToken()}
 * hands buffered tokens out in insertion order before scanning further.
 *
 * @param token the token to queue
 */
public void emit(Token token) {
tokens.add(token);
}
/**
 * Scans the input for the next real token. The start coordinates are
 * recorded before each attempt; at end of input an {@link #EOF_TYPE} token
 * is returned, otherwise scanning is delegated to {@link #inside()} or
 * {@link #outside()} according to the current mode. Attempts that yield
 * {@link #SKIP} are repeated.
 */
public Token _nextToken() {
    Token result;
    do { // looping lets us avoid recursion when skipping stuff
        startCharIndex = input.index();
        startLine = input.getLine();
        startCharPositionInLine = input.getCharPositionInLine();
        if (c == EOF) {
            return newToken(EOF_TYPE);
        }
        result = scanningInsideExpr ? inside() : outside();
    } while (result == SKIP);
    return result;
}
/**
 * Scans one token while the lexer is outside of an expression.
 * <ul>
 * <li>Spaces/tabs starting at column 0 become {@link #INDENT}, or
 * {@link #TEXT} when they run up to end of input.</li>
 * <li>The start delimiter (optionally followed by one leading char) opens a
 * comment, an escape, or an expression; in the last case {@link #LDELIM} is
 * returned and the lexer switches to inside mode.</li>
 * <li>A line break becomes {@link #NEWLINE}. A carriage return absorbs a
 * directly following line feed, so {@code \r\n} is one token; a lone
 * carriage return is a token of its own and the character after it is left
 * for the next token.</li>
 * <li>A closing curly while inside a subtemplate yields {@link #RCURLY},
 * decrements {@link #subtemplateDepth} and switches to inside mode.</li>
 * <li>Everything else is handled by {@link #mTEXT()}.</li>
 * </ul>
 *
 * @return the scanned token, possibly {@link #SKIP}
 */
protected Token outside() {
    if (input.getCharPositionInLine() == 0 && (c == ' ' || c == '\t')) {
        while (c == ' ' || c == '\t') {
            consume(); // scarf indent
        }
        if (c != EOF) {
            return newToken(INDENT);
        }
        return newToken(TEXT);
    }
    if (c == delimiterStartChar) {
        consume();
        if (c == delimiterLeadingChar) {
            consume();
        }
        if (c == '!') {
            return COMMENT();
        }
        if (c == '\\') {
            return ESCAPE(); // escapes such as <\\> or <\n>
        }
        scanningInsideExpr = true;
        return newToken(LDELIM);
    }
    if (c == '\r') {
        consume();
        // Only swallow a line feed here. Consuming a second character
        // unconditionally would glue whatever follows a bare carriage
        // return into the NEWLINE token and lose it from the text.
        if (c == '\n') {
            consume(); // \r\n -> \n
        }
        return newToken(NEWLINE);
    }
    if (c == '\n') {
        consume();
        return newToken(NEWLINE);
    }
    if (c == '}' && subtemplateDepth > 0) {
        scanningInsideExpr = true;
        subtemplateDepth--;
        consume();
        return newTokenFromPreviousChar(RCURLY);
    }
    return mTEXT();
}
/**
 * Scans one token while the lexer is inside an expression.
 * <p>
 * Whitespace yields {@link #SKIP}. Punctuation, strings and subtemplates are
 * dispatched on the current character. Any other character is, in this
 * order, the stop delimiter (returns {@link #RDELIM} and leaves inside
 * mode), the start of an identifier/keyword, or an invalid character. An
 * invalid character is reported and skipped, after which scanning resumes
 * within this method; at end of input an {@link #EOF_TYPE} token is returned
 * instead.
 *
 * @return the scanned token, possibly {@link #SKIP}
 */
protected Token inside() {
    for (;;) {
        switch (c) {
            case ' ':
            case '\t':
            case '\n':
            case '\r':
                consume();
                return SKIP;
            case '.':
                consume();
                if (input.LA(1) == '.' && input.LA(2) == '.') {
                    consume();
                    match('.');
                    return newToken(ELLIPSIS);
                }
                return newToken(DOT);
            case ',':
                return singleCharToken(COMMA);
            case ':':
                return singleCharToken(COLON);
            case ';':
                return singleCharToken(SEMI);
            case '(':
                return singleCharToken(LPAREN);
            case ')':
                return singleCharToken(RPAREN);
            case '[':
                return singleCharToken(LBRACK);
            case ']':
                return singleCharToken(RBRACK);
            case '=':
                return singleCharToken(EQUALS);
            case '!':
                return singleCharToken(BANG);
            case '@': {
                consume();
                boolean regionEnd = c == 'e' && input.LA(2) == 'n' && input.LA(3) == 'd';
                if (!regionEnd) {
                    return newToken(AT);
                }
                consume();
                consume();
                consume();
                return newToken(REGION_END);
            }
            case '"':
                return mSTRING();
            case '&':
                consume();
                match('&');
                return newToken(AND); // &&
            case '|':
                consume();
                match('|');
                return newToken(OR); // ||
            case '{':
                return subTemplate();
            default:
                break; // handled below; every other case has returned
        }

        if (c == delimiterStopChar) {
            consume();
            scanningInsideExpr = false;
            return newToken(RDELIM);
        }
        if (isIDStartLetter(c)) {
            return identifierOrKeyword();
        }

        RecognitionException invalid = new NoViableAltException("", 0, 0, input);
        invalid.line = startLine;
        invalid.charPositionInLine = startCharPositionInLine;
        errMgr.lexerError(input.getSourceName(), "invalid character '" + str(c) + "'", templateToken, invalid);
        if (c == EOF) {
            return newToken(EOF_TYPE);
        }
        consume(); // drop the offending character and try again
    }
}

/** Consumes the current character and returns a token of type {@code ttype} for the text scanned so far. */
private Token singleCharToken(int ttype) {
    consume();
    return newToken(ttype);
}

/**
 * Scans an identifier with {@link #mID()} and returns the matching keyword
 * token if its text is a reserved word, otherwise the identifier itself.
 */
private Token identifierOrKeyword() {
    Token id = mID();
    String word = id.getText();
    String[] reserved = { "if", "endif", "else", "elseif", "super", "true", "false" };
    int[] reservedTypes = { IF, ENDIF, ELSE, ELSEIF, SUPER, TRUE, FALSE };
    for (int i = 0; i < reserved.length; i++) {
        if (word.equals(reserved[i])) {
            return newToken(reservedTypes[i]);
        }
    }
    return id;
}
/**
 * Handles an opening curly seen inside an expression. The input is scanned
 * speculatively for an argument list of the form
 * "{ ID (',' ID)* '|'". If the pipe is found, the argument tokens are queued
 * with {@link #emit(Token)}; otherwise the stream is rewound to just after
 * the curly. Either way the {@link #LCURLY} token is returned,
 * {@link #subtemplateDepth} has been incremented, the token start
 * coordinates are restored and the lexer is left in outside mode.
 */
Token subTemplate() {
    subtemplateDepth++;
    final int marker = input.mark();
    // mID() overwrites the start coordinates while we speculate, so keep a copy
    final int savedIndex = startCharIndex;
    final int savedLine = startLine;
    final int savedPos = startCharPositionInLine;

    List<Token> pending = new ArrayList<Token>();
    consume();
    Token curly = newTokenFromPreviousChar(LCURLY);
    WS();
    pending.add(mID());
    WS();
    while (c == ',') {
        consume();
        pending.add(newTokenFromPreviousChar(COMMA));
        WS();
        pending.add(mID());
        WS();
    }
    WS();

    if (c == '|') {
        // speculation succeeded: publish the argument tokens
        consume();
        pending.add(newTokenFromPreviousChar(PIPE));
        if (isWS(c)) {
            consume(); // ignore a single whitespace after |
        }
        for (Token arg : pending) {
            emit(arg);
        }
        input.release(marker);
    } else {
        // no argument list: go back to the curly and step over it again
        input.rewind(marker);
        consume();
    }

    startCharIndex = savedIndex; // reset state
    startLine = savedLine;
    startCharPositionInLine = savedPos;
    scanningInsideExpr = false;
    return curly;
}
/**
 * Scans an escape sequence; called with the current character on the
 * backslash that follows the start delimiter. A {@code u} is delegated to
 * {@link #UNICODE()}, a second backslash to {@link #LINEBREAK()} (yielding
 * {@link #SKIP}), and {@code n}, {@code t} or a space become a one-character
 * {@link #TEXT} token. Any other character is reported as an error and
 * skipped together with the expected stop delimiter.
 */
Token ESCAPE() {
    startCharIndex = input.index();
    startCharPositionInLine = input.getCharPositionInLine();
    consume(); // kill \\
    if (c == 'u') {
        return UNICODE();
    }
    if (c == '\\') {
        LINEBREAK();
        return SKIP;
    }

    String text;
    if (c == 'n') {
        text = "\n";
    } else if (c == 't') {
        text = "\t";
    } else if (c == ' ') {
        text = " ";
    } else {
        NoViableAltException e = new NoViableAltException("", 0, 0, input);
        errMgr.lexerError(input.getSourceName(), "invalid escaped char: '" + str(c) + "'", templateToken, e);
        consume();
        match(delimiterStopChar);
        return SKIP;
    }

    consume();
    // build the token before the stop delimiter is consumed
    Token t = newToken(TEXT, text, input.getCharPositionInLine() - 2);
    match(delimiterStopChar);
    return t;
}
/**
 * Scans the rest of a unicode escape; called by {@link #ESCAPE()} with the
 * current character on the {@code u}. Exactly four characters are read as
 * hex digits, followed by the stop delimiter.
 * <p>
 * Every character that is not a hex digit is reported to the error manager.
 * If any was invalid, the escape is dropped and {@link #SKIP} is returned.
 * The same amount of input is consumed as for a valid escape, and
 * {@code Integer.parseInt} is never called on the malformed text, so no
 * {@link NumberFormatException} escapes for an error that has already been
 * reported.
 *
 * @return a {@link #TEXT} token holding the decoded character, or
 *         {@link #SKIP} for a malformed escape
 */
Token UNICODE() {
    consume(); // step over the 'u'
    char[] chars = new char[4];
    boolean allHex = true;
    for (int i = 0; i < chars.length; i++) {
        if (i > 0) {
            consume(); // advance to digit i; the last digit is consumed below
        }
        if (!isUnicodeLetter(c)) {
            allHex = false;
            NoViableAltException e = new NoViableAltException("", 0, 0, input);
            errMgr.lexerError(input.getSourceName(), "invalid unicode char: '" + str(c) + "'", templateToken, e);
        }
        chars[i] = c;
    }

    Token t = SKIP;
    if (allHex) {
        // Built while still positioned on the 4th digit, hence the -6 column offset.
        char uc = (char) Integer.parseInt(new String(chars), 16);
        t = newToken(TEXT, String.valueOf(uc), input.getCharPositionInLine() - 6);
    }
    consume(); // 4th digit
    match(delimiterStopChar);
    return t;
}
/**
 * Scans literal text outside of expressions. Scanning stops at end of
 * input, at the start delimiter, at a line break, or at a closing curly
 * while inside a subtemplate. A doubled backslash is reduced to a single
 * one, and a backslash in front of the start delimiter or a closing curly is
 * dropped so that the following character is taken literally. The token
 * only carries explicit text when such a rewrite happened; otherwise its
 * text is the input range itself.
 */
Token mTEXT() {
    StringBuilder text = new StringBuilder();
    boolean rewritten = false;
    while (c != EOF && c != delimiterStartChar) {
        if (c == '\r' || c == '\n') {
            break;
        }
        if (c == '}' && subtemplateDepth > 0) {
            break;
        }
        if (c == '\\') {
            int ahead = input.LA(2);
            if (ahead == '\\') { // convert \\ to \
                consume();
                consume();
                text.append('\\');
                rewritten = true;
                continue;
            }
            if (ahead == delimiterStartChar || ahead == '}') {
                rewritten = true;
                consume(); // toss out \ char; the escaped char is appended below
            }
        }
        text.append(c);
        consume();
    }
    return rewritten ? newToken(TEXT, text.toString()) : newToken(TEXT);
}
/**
 * Scans an identifier and returns it as an {@link #ID} token.
 * <p>
 * The current character is consumed without being checked, so callers
 * decide whether it may start an identifier; after that every character
 * accepted by {@link #isIDLetter(char)} is consumed. The token start
 * coordinates are reset on entry because {@link #subTemplate()} calls this
 * repeatedly while speculating.
 */
Token mID() {
    startCharIndex = input.index();
    startLine = input.getLine();
    startCharPositionInLine = input.getCharPositionInLine();
    do {
        consume();
    } while (isIDLetter(c));
    return newToken(ID);
}
/**
 * Scans a double-quoted string literal, quotes included.
 *
 * <pre>
 * STRING : '"'
 *          ( '\\' '"'
 *          | '\\' ~'"'
 *          | ~('\\'|'"')
 *          )*
 *          '"'
 *        ;
 * </pre>
 *
 * A backslash followed by {@code n}, {@code r} or {@code t} is translated to
 * the corresponding control character; before any other character the
 * backslash is simply dropped. Reaching end of input is reported as
 * "EOF in string". The token only carries explicit text when an escape was
 * seen; otherwise its text is the input range itself.
 */
Token mSTRING() {
    StringBuilder text = new StringBuilder();
    boolean escaped = false;
    text.append(c); // opening quote
    consume();
    while (c != '"') {
        if (c == '\\') {
            escaped = true;
            consume();
            char translated;
            if (c == 'n') {
                translated = '\n';
            } else if (c == 'r') {
                translated = '\r';
            } else if (c == 't') {
                translated = '\t';
            } else {
                translated = c;
            }
            text.append(translated);
            consume();
            continue;
        }
        text.append(c);
        consume();
        if (c == EOF) {
            RecognitionException re = new MismatchedTokenException((int) '"', input);
            re.line = input.getLine();
            re.charPositionInLine = input.getCharPositionInLine();
            errMgr.lexerError(input.getSourceName(), "EOF in string", templateToken, re);
            break;
        }
    }
    text.append(c); // closing quote (or whatever ended the loop)
    consume();
    return escaped ? newToken(STRING, text.toString()) : newToken(STRING);
}
/** Skips any run of spaces, tabs, line feeds and carriage returns at the current position. */
void WS() {
    while (isWS(c)) {
        consume();
    }
}
/**
 * Scans a comment; called with the current character on the {@code !} that
 * follows the start delimiter. Input is consumed up to and including the
 * closing {@code !} plus stop delimiter. Hitting end of input first is
 * reported as a nonterminated comment.
 *
 * @return a {@link #COMMENT} token
 */
Token COMMENT() {
    match('!');
    for (;;) {
        if (c == '!' && input.LA(2) == delimiterStopChar) {
            break; // found the terminator
        }
        if (c == EOF) {
            RecognitionException re = new MismatchedTokenException((int) '!', input);
            re.line = input.getLine();
            re.charPositionInLine = input.getCharPositionInLine();
            errMgr.lexerError(input.getSourceName(), "Nonterminated comment starting at " + startLine + ":" + startCharPositionInLine + ": '!"
                    + delimiterStopChar + "' missing", templateToken, re);
            break;
        }
        consume();
    }
    consume();
    consume(); // grab !>
    return newToken(COMMENT);
}
/**
 * Scans the remainder of a newline escape; called by {@link #ESCAPE()} with
 * the current character on the second backslash. After the stop delimiter,
 * trailing blanks, the line break itself (optionally preceded by a carriage
 * return) and the indentation of the next line are all swallowed. Reaching
 * end of input before the line break is reported as an error.
 */
void LINEBREAK() {
    match('\\'); // only kill 2nd \ as ESCAPE() kills first one
    match(delimiterStopChar);
    skipSpacesAndTabs(); // scarf WS after <\\>
    if (c != EOF) {
        if (c == '\r') {
            consume();
        }
        match('\n');
        skipSpacesAndTabs(); // scarf any indent
        return;
    }
    RecognitionException re = new RecognitionException(input);
    re.line = input.getLine();
    re.charPositionInLine = input.getCharPositionInLine();
    errMgr.lexerError(input.getSourceName(), "Missing newline after newline escape <\\\\>", templateToken, re);
}

/** Consumes spaces and tabs (but no line breaks) at the current position. */
private void skipSpacesAndTabs() {
    while (c == ' ' || c == '\t') {
        consume();
    }
}
/**
 * Tells whether {@code c} may begin an identifier. This is the very same
 * test as {@link #isIDLetter(char)}, so digits and {@code '/'} are accepted
 * as first characters too.
 */
public static boolean isIDStartLetter(char c) {
return isIDLetter(c);
}
/**
 * Tells whether {@code c} may appear in an identifier: an ASCII letter, an
 * ASCII digit, an underscore or a forward slash.
 */
public static boolean isIDLetter(char c) {
    if (c >= 'a' && c <= 'z') {
        return true;
    }
    if (c >= 'A' && c <= 'Z') {
        return true;
    }
    if (c >= '0' && c <= '9') {
        return true;
    }
    return c == '_' || c == '/';
}
/** Tells whether {@code c} is a space, tab, line feed or carriage return. */
public static boolean isWS(char c) {
    switch (c) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            return true;
        default:
            return false;
    }
}
/**
 * Tells whether {@code c} is an ASCII hexadecimal digit ({@code 0-9},
 * {@code a-f} or {@code A-F}), i.e. a character allowed in the four-digit
 * part of a unicode escape. Despite the name, no other letters qualify.
 */
public static boolean isUnicodeLetter(char c) {
    if (c >= '0' && c <= '9') {
        return true;
    }
    if (c >= 'a' && c <= 'f') {
        return true;
    }
    return c >= 'A' && c <= 'F';
}
/**
 * Builds a token of type {@code ttype} that spans the input from the
 * recorded start index up to the character before the current position and
 * carries the recorded start line and column.
 */
public Token newToken(int ttype) {
    int lastIndex = input.index() - 1;
    STToken tok = new STToken(input, ttype, startCharIndex, lastIndex);
    tok.setCharPositionInLine(startCharPositionInLine);
    tok.setLine(startLine);
    return tok;
}
/**
 * Builds a one-character token of type {@code ttype} for the character just
 * before the current position, using the stream's current line and the
 * column before the current one.
 */
public Token newTokenFromPreviousChar(int ttype) {
    int previous = input.index() - 1;
    STToken tok = new STToken(input, ttype, previous, previous);
    tok.setCharPositionInLine(input.getCharPositionInLine() - 1);
    tok.setLine(input.getLine());
    return tok;
}
/**
 * Builds a token of type {@code ttype} with explicit {@code text}. Its
 * index range runs from the recorded start index to the character before
 * the current position; the line is the stream's current line and the
 * column is the caller-supplied {@code pos}.
 */
public Token newToken(int ttype, String text, int pos) {
    STToken tok = new STToken(ttype, text);
    tok.setCharPositionInLine(pos);
    tok.setLine(input.getLine());
    tok.setStopIndex(input.index() - 1);
    tok.setStartIndex(startCharIndex);
    return tok;
}
/**
 * Builds a token of type {@code ttype} with explicit {@code text}. Its
 * index range runs from the recorded start index to the character before
 * the current position, and it carries the recorded start line and column.
 */
public Token newToken(int ttype, String text) {
    STToken tok = new STToken(ttype, text);
    tok.setCharPositionInLine(startCharPositionInLine);
    tok.setLine(startLine);
    tok.setStopIndex(input.index() - 1);
    tok.setStartIndex(startCharIndex);
    return tok;
}
// public String getErrorHeader() {
// return startLine+":"+startCharPositionInLine;
// }
//
/**
 * Returns a fixed placeholder; this lexer does not report the name of the
 * stream it reads through this method.
 * NOTE(review): error reports in this class use
 * {@code input.getSourceName()} directly instead - confirm whether callers
 * of this method would be better served by the same value.
 */
@Override
public String getSourceName() {
return "no idea";
}
/**
 * Renders a character code for use in error messages.
 * <p>
 * End of input is shown as {@code <EOF>}. Both spellings of it are
 * recognised: the {@code char} sentinel {@link #EOF} (0xFFFF), which is what
 * this lexer's lookahead holds, and the raw {@link CharStream#EOF} value
 * that a stream lookahead yields before it is narrowed to {@code char}.
 *
 * @param c a character code, or one of the two end-of-input values
 * @return {@code "<EOF>"} for end of input, otherwise the one-character string
 */
public static String str(int c) {
    if (c == EOF || c == CharStream.EOF) {
        return "<EOF>";
    }
    return String.valueOf((char) c);
}
}