/*******************************************************************************
* Copyright (c) 2008, 2016 Nokia Corporation.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Ed Swartz (Nokia) - initial API and implementation
*******************************************************************************/
package org.eclipse.cdt.autotools.ui.editors.parser;
import org.eclipse.cdt.autotools.ui.editors.AutoconfEditorMessages;
import org.eclipse.core.resources.IMarker;
import org.eclipse.jface.text.BadLocationException;
import org.eclipse.jface.text.IDocument;
/**
* This tokenizer traverses autotools-style text (m4 or configure.ac) to support the
* autoconf parser. It tracks the current context (m4 macro call or shell commands)
* to detect appropriate tokens, and tracks the m4 current quote style as well.
* <p>
* In m4 mode, its primary purpose is to find word boundaries, detect comments and quoted
* strings, and to find the macro punctuation tokens. It will not interpret anything
* (e.g. '$1' inside a macro) -- this is up to the parser.
* <p>
* In shell script mode, its primary purpose is to identify enough
* tokens to get a general picture of the structure of source for use by the autoconf
* parser. This isn't intended to be used for full shell script parsing. In fact,
* aside from the known tokens and identifiers, only individual characters will be returned.
* <p>
* Both modes know about "words" or identifiers and use the same syntax to detect these.
* It's expected that the parser will detect a word as a macro or possible macro and
* switch the mode of the tokenizer to fit. The parser should invoke "setM4Context(...)"
* (and "unreadToken" if necessary) to switch modes.
* @author eswartz
*
*/
public class AutoconfTokenizer {
private static final String UNTERMINATED_STRING = "UnterminatedString"; //$NON-NLS-1$
public static final String UNMATCHED_RIGHT_QUOTE = "UnmatchedRightQuote"; //$NON-NLS-1$
public static final String UNMATCHED_LEFT_QUOTE = "UnmatchedLeftQuote"; //$NON-NLS-1$
public static final String UNMATCHED_CLOSE_COMMENT = "UnmatchedCloseComment"; //$NON-NLS-1$
private IDocument document;
private int offset;
private String m4OpenQuote;
private String m4CloseQuote;
private String m4OpenComment;
private String m4CloseComment;
private char[] chars;
private int startOffset;
private boolean isM4Context;
private Token eofToken;
private IAutoconfErrorHandler errorHandler;
/** Create a tokenizer for a document. */
public AutoconfTokenizer(IDocument document, IAutoconfErrorHandler errorHandler) {
if (document == null /* || macroDetector == null*/)
throw new IllegalArgumentException();
this.document = document;
this.errorHandler = errorHandler;
this.chars = document.get().toCharArray();
this.offset = 0;
this.eofToken = new Token(ITokenConstants.EOF, "", document, chars.length, 0);
this.m4OpenQuote = "`"; //$NON-NLS-1$
this.m4CloseQuote = "'"; //$NON-NLS-1$
this.m4OpenComment = "#"; //$NON-NLS-1$
this.m4CloseComment = "\n"; //$NON-NLS-1$
}
/**
* Tell whether the tokenizer considers itself to be in an m4 context.
* This determines what kinds of tokens it returns.
* @return
*/
public boolean isM4Context() {
return isM4Context;
}
/**
* Switch the tokenizer into or out of m4 context.
* @return
*/
public void setM4Context(boolean flag) {
isM4Context = flag;
}
/**
* Set the m4 quote delimiters
*/
public void setM4Quote(String open, String close) {
this.m4OpenQuote = open;
this.m4CloseQuote = close;
}
/**
* Set the m4 comment delimiters
*/
public void setM4Comment(String open, String close) {
this.m4OpenComment = open;
this.m4CloseComment = close;
}
/** Push back the given token. This allows the tokenizer to restart from its start position,
* potentially in a different context. */
public void unreadToken(Token token) {
if (token.getLength() > 0 && offset == token.getOffset())
throw new IllegalStateException();
offset = token.getOffset();
}
/** Read the next token. Returns an EOF token at EOF. */
public Token readToken() {
if (offset >= chars.length)
return eofToken;
char ch = chars[offset];
// skip whitespace (but not EOL)
while (isWhitespace(ch)) {
offset++;
if (offset >= chars.length)
return eofToken;
ch = chars[offset];
}
// in shell mode, strip comments up to eol
if (!isM4Context && ch == '#') {
while (offset < chars.length) {
ch = chars[offset];
if (ch == '\n')
break;
offset++;
}
// keep inside doc if we didn't find that EOL
if (offset >= chars.length)
offset--;
}
startOffset = offset;
StringBuilder buffer = new StringBuilder();
// check EOL
if (ch == '\r' || ch == '\n') {
buffer.append(ch);
offset++;
if (ch == '\r' && offset < chars.length && chars[offset] == '\n') {
buffer.append(chars[offset++]);
}
return makeToken(ITokenConstants.EOL, buffer.toString());
}
// TODO: this parser always uses fixed logic for identifier reading, ignoring m4's "changeword"
if (isLeadIdentifierChar(ch)) {
return parseWord(ch);
}
// check comments and quotes
if (isM4Context) {
if (lookAhead(m4OpenComment)) {
boolean found = false;
// keep reading until the close comment (these are NOT nested)
while (offset < chars.length) {
if (lookAhead(m4CloseComment)) {
found = true;
break;
}
offset++;
}
if (!found) {
handleError(startOffset, offset, AutoconfEditorMessages.getFormattedString(UNMATCHED_CLOSE_COMMENT,
m4CloseComment.equals("\n") ? "newline" : m4CloseComment)); //$NON-NLS-1$
}
return makeToken(ITokenConstants.M4_COMMENT);
}
if (lookAhead(m4OpenQuote)) {
return parseQuote();
}
}
// check shell punctuation
if (!isM4Context) {
if (ch == ';' && offset + 1 < chars.length && chars[offset + 1] == ';') {
offset += 2;
return makeToken(ITokenConstants.SH_CASE_CONDITION_END);
}
if (ch == '<' && offset + 1 < chars.length && chars[offset + 1] == '<') {
offset += 2;
if (offset < chars.length && chars[offset] == '-') {
offset++;
return makeToken(ITokenConstants.SH_HERE_DASH);
} else {
return makeToken(ITokenConstants.SH_HERE);
}
}
switch (ch) {
case '$':
offset++;
return makeToken(ITokenConstants.SH_DOLLAR);
case '[':
offset++;
return makeToken(ITokenConstants.SH_LBRACKET);
case ']':
offset++;
return makeToken(ITokenConstants.SH_RBRACKET);
case '{':
offset++;
return makeToken(ITokenConstants.SH_LBRACE);
case '}':
offset++;
return makeToken(ITokenConstants.SH_RBRACE);
case '\'':
return parseString(ITokenConstants.SH_STRING_SINGLE, ch);
case '\"':
return parseString(ITokenConstants.SH_STRING_DOUBLE, ch);
case '`':
return parseString(ITokenConstants.SH_STRING_BACKTICK, ch);
}
}
// check common punctuation
if (ch == ';') {
offset++;
return makeToken(ITokenConstants.SEMI);
}
if (ch == ',') {
offset++;
return makeToken(ITokenConstants.COMMA);
}
if (ch == '(') {
offset++;
return makeToken(ITokenConstants.LPAREN);
}
if (ch == ')') {
offset++;
return makeToken(ITokenConstants.RPAREN);
}
// unknown text
offset++;
return makeToken(ITokenConstants.TEXT);
}
private Token parseWord(char ch) {
StringBuilder buffer = new StringBuilder();
buffer.append(ch);
offset++;
do {
if (offset >= chars.length)
break;
ch = chars[offset];
if (!isIdentifierChar(ch))
break;
buffer.append(ch);
offset++;
} while (true);
String text = buffer.toString();
if (!isM4Context) {
// detect sh tokens
if ("case".equals(text))
return makeToken(ITokenConstants.SH_CASE, text);
if ("in".equals(text))
return makeToken(ITokenConstants.SH_IN, text);
if ("esac".equals(text))
return makeToken(ITokenConstants.SH_ESAC, text);
if ("while".equals(text))
return makeToken(ITokenConstants.SH_WHILE, text);
if ("select".equals(text))
return makeToken(ITokenConstants.SH_SELECT, text);
if ("until".equals(text))
return makeToken(ITokenConstants.SH_UNTIL, text);
if ("for".equals(text))
return makeToken(ITokenConstants.SH_FOR, text);
if ("do".equals(text))
return makeToken(ITokenConstants.SH_DO, text);
if ("done".equals(text))
return makeToken(ITokenConstants.SH_DONE, text);
if ("if".equals(text))
return makeToken(ITokenConstants.SH_IF, text);
if ("then".equals(text))
return makeToken(ITokenConstants.SH_THEN, text);
if ("else".equals(text))
return makeToken(ITokenConstants.SH_ELSE, text);
if ("elif".equals(text))
return makeToken(ITokenConstants.SH_ELIF, text);
if ("fi".equals(text))
return makeToken(ITokenConstants.SH_FI, text);
}
// other identifier-looking word
return makeToken(ITokenConstants.WORD, text);
}
private Token parseQuote() {
// read text, honoring nested quotes, but don't put the outermost quotes in the token
StringBuilder buffer = new StringBuilder();
int quoteLevel = 1;
// keep reading until the close quote
while (offset < chars.length) {
if (lookAhead(m4CloseQuote)) {
quoteLevel--;
if (quoteLevel == 0)
break;
buffer.append(m4CloseQuote);
} else if (lookAhead(m4OpenQuote)) {
buffer.append(m4OpenQuote);
quoteLevel++;
} else {
buffer.append(chars[offset]);
offset++;
}
}
if (quoteLevel > 0) {
handleError(startOffset, offset, AutoconfEditorMessages.getFormattedString(UNMATCHED_LEFT_QUOTE, m4CloseQuote));
} else if (quoteLevel < 0) {
handleError(startOffset, offset, AutoconfEditorMessages.getFormattedString(UNMATCHED_RIGHT_QUOTE, m4OpenQuote));
}
return makeToken(ITokenConstants.M4_STRING, buffer.toString());
}
private Token parseString(int type, char terminal) {
startOffset = offset;
offset++;
StringBuilder buffer = new StringBuilder();
char ch = 0;
while (offset < chars.length) {
ch = chars[offset++];
if (ch == '\\') {
if (offset < chars.length)
buffer.append(chars[offset++]);
else
buffer.append(ch);
} else if (ch == terminal) {
break;
} else {
buffer.append(ch);
}
}
if (ch != terminal) {
handleError(startOffset, offset, AutoconfEditorMessages.getFormattedString(UNTERMINATED_STRING, "" + ch));
}
return makeToken(type, buffer.toString());
}
private void handleError(int start, int end,
String message) {
if (errorHandler != null) {
int lineNumber = 0;
int startColumn = 0;
int endColumn = 0;
try {
lineNumber = document.getLineOfOffset(start);
int lineOffs = document.getLineOffset(lineNumber);
startColumn = start - lineOffs;
endColumn = end - lineOffs;
} catch (BadLocationException e) {
// Don't care if we blow up trying to issue marker
}
errorHandler.handleError(new ParseException(
message,
start, end,
lineNumber,
startColumn, endColumn,
IMarker.SEVERITY_ERROR));
}
}
/**
* Look ahead for the given string. If found, return true, and have
* offset updated. Otherwise, return false with offset unchanged.
* @param keyword
* @param text
* @return
*/
private boolean lookAhead(String keyword) {
int length = keyword.length();
if (offset + length > chars.length) {
return false;
}
for (int idx = 0; idx < length; idx++) {
if (chars[offset + idx] != keyword.charAt(idx))
return false;
}
offset += length;
return true;
}
private boolean isWhitespace(char ch) {
return ch == ' ' || ch == '\t' || ch == '\f';
}
private Token makeToken(int type) {
return new Token(type,
new String(chars, startOffset, offset - startOffset),
document, startOffset, offset - startOffset);
}
private Token makeToken(int type, String text) {
return new Token(type, text, document, startOffset, offset - startOffset);
}
private boolean isIdentifierChar(char ch) {
return isLeadIdentifierChar(ch) || (ch >= '0' && ch <= '9');
}
private boolean isLeadIdentifierChar(char ch) {
return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_';
}
public Token peekToken() {
Token token = readToken();
unreadToken(token);
return token;
}
}