/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
*
* Copyright 1997-2010 Oracle and/or its affiliates. All rights reserved.
*
* Oracle and Java are registered trademarks of Oracle and/or its affiliates.
* Other names may be trademarks of their respective owners.
*
* The contents of this file are subject to the terms of either the GNU
* General Public License Version 2 only ("GPL") or the Common
* Development and Distribution License("CDDL") (collectively, the
* "License"). You may not use this file except in compliance with the
* License. You can obtain a copy of the License at
* http://www.netbeans.org/cddl-gplv2.html
* or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
* specific language governing permissions and limitations under the
* License. When distributing the software, include this License Header
* Notice in each file and include the License file at
* nbbuild/licenses/CDDL-GPL-2-CP. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the GPL Version 2 section of the License file that
* accompanied this code. If applicable, add the following below the
* License Header, with the fields enclosed by brackets [] replaced by
* your own identifying information:
* "Portions Copyrighted [year] [name of copyright owner]"
*
* Contributor(s):
*
* The Original Software is NetBeans. The Initial Developer of the Original
* Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
* Microsystems, Inc. All Rights Reserved.
*
* If you wish your version of this file to be governed by only the CDDL
* or only the GPL Version 2, indicate your decision by adding
* "[Contributor] elects to include this software in this distribution
* under the [CDDL or GPL Version 2] license." If you do not indicate a
* single choice of license, a recipient has the option to distribute
* your version of this file under either the CDDL, the GPL Version 2 or
* to extend the choice of license to its licensees as provided above.
* However, if you add GPL Version 2 code and therefore, elected the GPL
* Version 2 license, then the option applies only if the new code is
* made subject to such option by the copyright holder.
*/
package org.netbeans.modules.ruby.lexer;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import org.jrubyparser.IRubyWarnings;
import org.jrubyparser.SourcePosition;
import org.jrubyparser.lexer.Lexer.LexState;
import org.jrubyparser.lexer.LexerSource;
import org.jrubyparser.lexer.ReaderLexerSource;
import org.jrubyparser.lexer.StrTerm;
import org.jrubyparser.lexer.StringTerm;
import org.jrubyparser.lexer.SyntaxException;
import org.jrubyparser.parser.Tokens;
import org.netbeans.api.lexer.Token;
import org.netbeans.spi.lexer.Lexer;
import org.netbeans.spi.lexer.LexerInput;
import org.netbeans.spi.lexer.LexerRestartInfo;
import org.netbeans.spi.lexer.TokenFactory;
import org.openide.ErrorManager;
/**
* A scanner for Ruby, which directly uses the JRuby lexer and translates
* from JRuby tokens to NetBeans lexer-based tokens
*
* @todo Should I generate a single large token for composite token types
* like strings and regular expressions? Today, I go to a lot of trouble
* to do state saving between the opening quote, middle literal string,
* and ending quote, for strings. (Ditto for regular expressions, single
* quoted strings, other forms of quoted strings, etc.).
* This allows me to have for example the / /'s in regular expressions
* stay black, and have only the embedded portion be green. But is that
* really necessary? If I instead were to eat up a whole String combination
* into a single token and return it as a single token, that would ensure
* that for incremental parsing, I always get called on a token boundary
* where I don't need the extra state saving. All I would need would
* be the lexer state (if different than the default state) which is a small
* integer (which gets compressed to single bytes by the lexer infrastructure.)
*
* @author Tor Norbye
*/
public final class RubyLexer implements Lexer<RubyTokenId> {
/** This is still not working; I wonder if release() is called correctly at all times...*/
private static final boolean REUSE_LEXERS = false;
private static RubyLexer cached;
private final org.jrubyparser.lexer.Lexer lexer;
private LexerSource lexerSource;
private boolean inRegexp;
private LexerInput input;
private TokenFactory<RubyTokenId> tokenFactory;
private boolean substituting;
private boolean inSymbol;
private boolean inEmbedded;
/**
 * Creates the wrapped JRuby lexer. Session-specific setup (input, token
 * factory, saved state) happens in {@code restart(info)}, which is why the
 * {@code info} parameter is not consumed here.
 */
private RubyLexer(LexerRestartInfo<RubyTokenId> info) {
    lexer = new org.jrubyparser.lexer.Lexer();
    // XXX Do something at scan time about illegal characters?
    // Suppress all parser warnings during pure syntax lexing.
    lexer.setWarnings(new NullWarnings());
    // Keep whitespace/comment tokens so the IDE can colorize every character.
    lexer.setPreserveSpaces(true);
}
/**
 * Factory method: returns a lexer initialized for the given restart info,
 * reusing a previously {@link #release() released} instance when
 * {@code REUSE_LEXERS} is enabled.
 *
 * @param info restart info supplying the input, token factory and saved state
 * @return a lexer ready to produce tokens for {@code info}'s input
 */
public static synchronized RubyLexer create(LexerRestartInfo<RubyTokenId> info) {
    RubyLexer rubyLexer = cached;
    if (rubyLexer == null) {
        rubyLexer = new RubyLexer(info);
    } else {
        // Remove the instance from the cache as we hand it out; otherwise a
        // second create() call before release() would return the same lexer
        // to two clients simultaneously.
        cached = null;
    }
    rubyLexer.restart(info);
    return rubyLexer;
}
/**
 * Re-initializes this lexer for a new lexing session: clears all per-session
 * flags, resets the JRuby lexer, wires up the new input, and restores any
 * state saved by a previous {@link #state()} call.
 *
 * @param info restart info supplying the input, token factory and saved state
 */
void restart(LexerRestartInfo<RubyTokenId> info) {
    // Reset all composite-token tracking flags for the new session.
    inRegexp = substituting = inSymbol = inEmbedded = false;
    lexer.reset();
    input = info.input();
    tokenFactory = info.tokenFactory();
    String fileName = "unknown";
    Reader lexerReader = new LexerInputReader(input);
    // InputStream lexerInput = new LexerInputStream(input);
    // We don't need IDE positions during pure syntax lexing; that's only needed during
    // parsing for AST nodes
    //lexerSource = new LexerSource(fileName, lexerReader, 0, false);
    //lexerSource = LexerSource.getSource(fileName, lexerInput, null, null);
    //XXX: jruby-parser
    lexerSource = new ReaderLexerSource(fileName, lexerReader, 0);
    lexer.setSource(lexerSource);
    Object state = info.state();
    if (state instanceof JRubyLexerRestartInfo) {
        // Full saved state: string terms, heredoc context, bit flags, LexState.
        ((JRubyLexerRestartInfo)state).initializeState(this);
    } else if (state instanceof Integer) {
        // Compact saved state: just the LexState ordinal (see state()).
        int stateValue = ((Integer)state).intValue();
        lexer.setState(LexState.fromOrdinal(stateValue));
    }
}
/**
 * Returns this instance to the static cache for reuse by a later
 * {@code create()} call — but only when lexer reuse is enabled.
 */
public void release() {
    if (!REUSE_LEXERS) {
        return;
    }
    // Possibly reset the structures that could cause memory leaks
    synchronized (RubyLexer.class) {
        cached = this;
    }
}
/**
 * Captures the lexer state for incremental restarts. Returns a full
 * {@code JRubyLexerRestartInfo} when auxiliary state (string terms, flags)
 * must survive, a boxed LexState ordinal in the common case, or null when
 * there is nothing to preserve.
 */
public Object state() {
    if (JRubyLexerRestartInfo.needsStateStorage(this)) {
        return new JRubyLexerRestartInfo(this);
    }
    // Only significant lexer states need storing, e.g. outside of normal
    // expressions and with no string processing in progress.
    LexState lexState = lexer.getLexState();
    if (lexState == null) {
        return null;
    }
    if (lexer.getStrTerm() == null) {
        // Integer states are stored very efficiently; Integer.valueOf caches
        // all these values since they are < 128.
        return Integer.valueOf(lexState.getOrdinal());
    }
    // An active string term requires the full state object.
    return new JRubyLexerRestartInfo(this);
}
/**
 * Creates a NetBeans token of the given id and length, using a shared
 * flyweight token when the id has fixed text.
 */
private Token<RubyTokenId> token(RubyTokenId id, int length) {
    String text = id.fixedText();
    if (text == null) {
        return tokenFactory.createToken(id, length);
    }
    return tokenFactory.getFlyweightToken(id, text);
}
/**
 * Produces the next NetBeans token by advancing the underlying JRuby lexer
 * and mapping its token constant onto a {@link RubyTokenId}. Handles
 * unterminated strings and syntax errors by emitting ERROR/IDENTIFIER
 * recovery tokens so lexing can continue past the bad input.
 *
 * @return the next token, or null at end of input
 */
public Token<RubyTokenId> nextToken() {
    int token = 0;
    int tokenLength = 0;
    // Token length is computed as the source-offset delta across advance().
    int oldOffset = lexerSource.getOffset();
    while (tokenLength == 0) {
        try {
            lexer.advance();
            token = lexer.token();
            StrTerm strTerm = lexer.getStrTerm();
            if (strTerm != null) {
                strTerm.splitEmbeddedTokens();
            }
        } catch (StringTerm.UnterminatedStringException use) {
            token = Tokens.yyErrorCode;
            // TODO: Compute the position of the FIRST newline in the
            // output - how can I do that?
            // Update lexer input to make sure it records the right
            // character boundaries for the tokens (since incremental lexing
            // will restart at token boundaries, and we want to make sure
            // it knows in the character stream where those boundaries truly are
            int readAhead = lexerSource.chompReadAhead();
            if (readAhead > 0) {
                input.backup(readAhead);
            }
            input.backup(input.readLengthEOF());
            tokenLength = 0;
            // Read forward and stop at the first newline
            while (true) {
                int ch = input.read();
                if (ch == LexerInput.EOF) {
                    break;
                }
                tokenLength++;
                if (ch == '\n') {
                    break;
                }
            }
            // Keep the JRuby source offset in sync with what we consumed here.
            lexerSource.setOffset(oldOffset+tokenLength);
            if (tokenLength > 0) {
                return token(RubyTokenId.ERROR, tokenLength);
            } else {
                // Buffer ends with an unterminated string constant - nothing to do
                return null;
            }
        } catch (SyntaxException ex) {
            token = Tokens.yyErrorCode; // TODO - generate incomplete tokens?
            tokenLength = lexerSource.getOffset() - oldOffset;
            if (tokenLength == 0) {
                if (input.readLength() > 0) {
                    return token(RubyTokenId.IDENTIFIER, input.readLength()); // XXX?
                } else {
                    return null;
                }
            }
            break;
        } catch (Throwable ex) { // includes SyntaxException
            ErrorManager.getDefault().notify(ex);
            break;
        }
        assert token != 0;
        if (token == -1) { // EOF
            if (input.readLength() > 0) {
                // Leftover characters at EOF: emit them as an identifier
                // rather than dropping them.
                return token(RubyTokenId.IDENTIFIER, input.readLength()); // XXX?
            } else {
                return null;
            }
        }
        int offset = lexerSource.getOffset();
        tokenLength = offset - oldOffset;
    }
    // Update lexer input to make sure it records the right
    // character boundaries for the tokens (since incremental lexing
    // will restart at token boundaries, and we want to make sure
    // it knows in the character stream where those boundaries truly are
    int readAhead = lexerSource.chompReadAhead();
    if (readAhead > 0) {
        input.backup(readAhead);
    }
    // Map to IDE types
    RubyTokenId id = getTokenId(token, oldOffset);
    // Fix #102082
    if (inSymbol) {
        // A type symbol in front of a keyword, literal or constant
        // should be lexed as a symbol
        String category = id.primaryCategory();
        boolean isString = "string".equals(category); // NOI18N
        boolean inEmbedded = id == RubyTokenId.EMBEDDED_RUBY;
        if (!(isString || inEmbedded) || (id == RubyTokenId.STRING_END || id == RubyTokenId.QUOTED_STRING_END)) {
            inSymbol = (token == Tokens.tSYMBEG);
        }
        if (isString || id == RubyTokenId.IDENTIFIER || id == RubyTokenId.CONSTANT ||
            "keyword".equals(category)) { // NOI18N
            id = RubyTokenId.TYPE_SYMBOL;
        }
    } else {
        inSymbol = (token == Tokens.tSYMBEG);
    }
    if (tokenLength <= 0) {
        // XXX this is not right but better than asserting in the lexer!
        // Just assign some default text attributes to unexpected text, one character at a time
        return token(RubyTokenId.IDENTIFIER, 1);
    }
    return token(id, tokenLength);
}
/** @todo Move classification of tokens into TokenTypes into JRuby somehow */
/**
 * Maps a JRuby lexer token constant onto the corresponding NetBeans
 * {@link RubyTokenId}, tracking side state (regexp/embedded/substituting)
 * needed to classify composite tokens like strings and regexps.
 *
 * @param token the JRuby token constant (from {@code Tokens}) or an ASCII char
 * @param offset source offset of the token (currently unused here)
 * @return the NetBeans token id; unknown tokens fall back to IDENTIFIER
 *
 * @todo Move classification of tokens into TokenTypes into JRuby somehow
 */
private RubyTokenId getTokenId(int token, int offset) {
    // If you add any new token types here, remember to update #getRelevantTokenTypes below
    switch (token) {
    case Tokens.tCOMMENT:
        return RubyTokenId.LINE_COMMENT;
    case Tokens.tWHITESPACE:
        return RubyTokenId.WHITESPACE;
    case Tokens.tFLOAT:
        return RubyTokenId.FLOAT_LITERAL;
    case Tokens.tINTEGER:
        return RubyTokenId.INT_LITERAL;
    case Tokens.tQWORDS_BEG:
    case Tokens.tWORDS_BEG:
    case Tokens.tSTRING_BEG:
    case Tokens.tXSTRING_BEG:
        // Remember whether this string interpolates (#{}), so the matching
        // content/end tokens can be classified consistently.
        if (lexer.getStrTerm() != null) {
            substituting = lexer.getStrTerm().isSubstituting();
        } else {
            substituting = false;
        }
        return substituting ? RubyTokenId.QUOTED_STRING_BEGIN : RubyTokenId.STRING_BEGIN;
    case Tokens.tSTRING_DVAR:
    case Tokens.tSTRING_DBEG:
        // Entering an embedded #{...} section; the NEXT content token is the
        // embedded Ruby code.
        inEmbedded = true;
        return inRegexp ? RubyTokenId.REGEXP_LITERAL : RubyTokenId.STRING_LITERAL;
    case Tokens.tSTRING_END:
        return substituting ? RubyTokenId.QUOTED_STRING_END : RubyTokenId.STRING_END;
    case Tokens.tSTRING_CONTENT: // What about tXSTRING??
        if (inEmbedded) {
            inEmbedded = false;
            return RubyTokenId.EMBEDDED_RUBY;
        } else if (inRegexp) {
            return RubyTokenId.REGEXP_LITERAL;
        } else {
            // For heredocs I may not know when I see the opening
            if (lexer.getStrTerm() != null) {
                substituting = lexer.getStrTerm().isSubstituting();
                if (substituting) {
                    return RubyTokenId.QUOTED_STRING_LITERAL;
                } else {
                    return RubyTokenId.STRING_LITERAL;
                }
            } else {
                substituting = false;
                return RubyTokenId.STRING_LITERAL;
            }
        }
    case Tokens.tREGEXP_BEG:
        inRegexp = true;
        return RubyTokenId.REGEXP_BEGIN;
    case Tokens.tREGEXP_END:
        inRegexp = false;
        return RubyTokenId.REGEXP_END;
    case Tokens.tDOCUMENTATION:
        return RubyTokenId.DOCUMENTATION;
    case Tokens.yyErrorCode:
        return RubyTokenId.ERROR;
    case Tokens.tGVAR: // Global variable
    case Tokens.tBACK_REF:
    case Tokens.tNTH_REF:
        return RubyTokenId.GLOBAL_VAR;
    case Tokens.tIVAR: // Instance variable
        return RubyTokenId.INSTANCE_VAR;
    case Tokens.tCVAR: // Class variable
        return RubyTokenId.CLASS_VAR;
    case Tokens.tCONSTANT: // Constant
        return RubyTokenId.CONSTANT;
    case Tokens.tIDENTIFIER:
        return RubyTokenId.IDENTIFIER;
    case Tokens.tSYMBEG:
        return RubyTokenId.TYPE_SYMBOL;
    case '[':
        // I sometimes get ascii '[' instead of LBRACK, for example in this expression:
        // for k, v in sort{|a1, a2| a1[0].id2name <=> a2[0].id2name}
    case Tokens.tLBRACK:
        return RubyTokenId.LBRACKET;
    case ']':
    case Tokens.tRBRACK:
        return RubyTokenId.RBRACKET;
    case Tokens.tLPAREN:
    case Tokens.tLPAREN2: // XXX What is this?
    case Tokens.tLPAREN_ARG: // XXX What is this?
        return RubyTokenId.LPAREN;
    case Tokens.tRPAREN:
        return RubyTokenId.RPAREN;
    case Tokens.tLCURLY: // block (primary)
    case Tokens.tLBRACE: // hash
    case Tokens.tLBRACE_ARG: // block (expr)
        return RubyTokenId.LBRACE;
    case Tokens.tRCURLY:
        return RubyTokenId.RBRACE;
    case Tokens.kDEF:
        return RubyTokenId.DEF;
    case Tokens.kEND:
        return RubyTokenId.END;
    case Tokens.kCLASS:
        return RubyTokenId.CLASS;
    case Tokens.kMODULE:
        return RubyTokenId.MODULE;
    case Tokens.kBEGIN:
        return RubyTokenId.BEGIN;
    case Tokens.kIF:
        return RubyTokenId.IF;
    case Tokens.kUNLESS:
        return RubyTokenId.UNLESS;
    case Tokens.kWHILE:
        return RubyTokenId.WHILE;
    case Tokens.kUNTIL:
        return RubyTokenId.UNTIL;
    case Tokens.kDO_BLOCK:
    case Tokens.kDO_COND:
        return RubyTokenId.ANY_KEYWORD;
    case Tokens.kDO:
        return RubyTokenId.DO;
    case Tokens.kCASE:
        return RubyTokenId.CASE;
    case Tokens.kFOR:
        return RubyTokenId.FOR;
    case Tokens.kELSE:
        return RubyTokenId.ELSE;
    case Tokens.kELSIF:
        return RubyTokenId.ELSIF;
    case Tokens.kENSURE:
        return RubyTokenId.ENSURE;
    case Tokens.kWHEN:
        return RubyTokenId.WHEN;
    case Tokens.kRESCUE:
        return RubyTokenId.RESCUE;
    case Tokens.kSUPER:
        return RubyTokenId.SUPER;
    case Tokens.kSELF:
        return RubyTokenId.SELF;
    case Tokens.tAREF: // If you change this to a specific token, update BracketCompleter code which checks for [] and []=
    case Tokens.tASET:
        // XXX Change some of these into [, or ] ?
        return RubyTokenId.ANY_OPERATOR;
    case Tokens.kRESCUE_MOD:
    case Tokens.kUNDEF:
    case Tokens.kTHEN:
    case Tokens.kBREAK:
    case Tokens.kNEXT:
    case Tokens.kREDO:
    case Tokens.kRETRY:
    case Tokens.kIN:
    case Tokens.kRETURN:
    case Tokens.kYIELD:
    case Tokens.kNIL:
    case Tokens.kTRUE:
    case Tokens.kFALSE:
    case Tokens.kAND:
    case Tokens.kOR:
    case Tokens.kNOT:
    case Tokens.kIF_MOD:
    case Tokens.kUNLESS_MOD:
    case Tokens.kWHILE_MOD:
    case Tokens.kUNTIL_MOD:
    case Tokens.kALIAS:
    case Tokens.kDEFINED:
    case Tokens.klBEGIN: // "BEGIN { }": not matched with END { }
    case Tokens.klEND: // "END { }": not matched with BEGIN { }
    case Tokens.k__LINE__:
    case Tokens.k__FILE__:
        return RubyTokenId.ANY_KEYWORD;
    case '=':
        // Commas are most frequently used in argument lists and array declarations
        // where treating "," as a continuation operator causes the first and
        // subsequent entries to be misaligned - see FormattingTest.testArrayDecl
        // and testHashDecl
        //case ',':
    case Tokens.tPLUS:
    case Tokens.tMINUS:
    case Tokens.tDIVIDE:
        // Doesn't seem to work
        //case '|':
        //case Tokens.tPIPE:
    case Tokens.tLT:
    case Tokens.tGT:
    case Tokens.tPOW:
    case Tokens.tCMP:
        // start, percent, bang and a few others missing
    case Tokens.tANDOP:
    case Tokens.tOROP:
    case Tokens.tEQ:
    case Tokens.tEQQ:
    case Tokens.tNEQ:
    case Tokens.tGEQ:
    case Tokens.tLEQ:
    case Tokens.tMATCH:
    case Tokens.tNMATCH:
    case Tokens.tLSHFT:
    case Tokens.tRSHFT:
    case Tokens.tASSOC:
    case Tokens.tOP_ASGN:
    case '?':
    case ':':
        return RubyTokenId.NONUNARY_OP;
    case Tokens.tDOT:
        return RubyTokenId.DOT;
    case Tokens.tDOT2:
    case Tokens.tDOT3:
        return RubyTokenId.RANGE;
    case Tokens.tCOLON3:
        return RubyTokenId.COLON3;
    default:
        return RubyTokenId.IDENTIFIER;
    }
}
/**
 * Immutable-by-convention snapshot of the full lexer state, used when the
 * compact integer state (LexState ordinal) is not sufficient — i.e. when
 * string terms, heredocs, or the tracking flags are active. Boolean flags are
 * packed into the {@code localState} bit set.
 */
private static class JRubyLexerRestartInfo {
    /** Bit set when we're in regular expressions */
    private static final int IN_REGEXP = 1;
    /** Bit set when we're in symbols */
    private static final int IN_SYMBOL = 2;
    /** Bit set when we're in an embedded ruby context... #{here} */
    private static final int IN_EMBEDDED = 4;
    /** Bit set when we're in a substituting/doublequoted string */
    private static final int IN_SUBSTITUTING = 8;
    /** Bit set when we need to set the spaceSeen flag in RubyYaccLexer */
    private static final int SET_SPACE_SEEN = 16;
    /** Bit set when we need to set commandStart in RubyYaccLexer */
    private static final int SET_COMMAND_START = 32;
    // Active string/heredoc term at snapshot time, if any
    private final StrTerm strTerm;
    // Bit set combining the IN_*/SET_* flags above
    private int localState;
    // The JRuby lexer's LexState at snapshot time
    private final LexState lexState;
    // Opaque mutable sub-state of strTerm, captured separately so it can be
    // restored even after the shared StrTerm instance mutates
    private Object strTermState;
    // Pending heredoc bookkeeping, if any
    private final org.jrubyparser.lexer.Lexer.HeredocContext heredocContext;

    /** Captures the complete restart-relevant state of the given lexer. */
    JRubyLexerRestartInfo(RubyLexer rubyLexer) {
        strTerm = rubyLexer.lexer.getStrTerm();
        if (strTerm != null) {
            strTermState = strTerm.getMutableState();
        }
        heredocContext = rubyLexer.lexer.heredocContext;
        lexState = rubyLexer.lexer.getLexState();
        // Fold each boolean flag into the bit set.
        if (rubyLexer.inRegexp) {
            localState += IN_REGEXP;
        }
        if (rubyLexer.inSymbol) {
            localState += IN_SYMBOL;
        }
        if (rubyLexer.inEmbedded) {
            localState += IN_EMBEDDED;
        }
        if (rubyLexer.substituting) {
            localState += IN_SUBSTITUTING;
        }
        if (rubyLexer.lexer.isSetSpaceSeen()) {
            localState += SET_SPACE_SEEN;
        }
        if (rubyLexer.lexer.isCommandStart()) {
            localState += SET_COMMAND_START;
        }
    }

    /** Return true iff the given lexer needs custom state storage beyond the state integers */
    public static boolean needsStateStorage(RubyLexer rubyLexer) {
        return rubyLexer.inRegexp || rubyLexer.inSymbol || rubyLexer.inEmbedded ||
            rubyLexer.substituting || rubyLexer.lexer.isCommandStart() || rubyLexer.lexer.heredocContext != null||
            rubyLexer.lexer.isSetSpaceSeen();
    }

    // Equality over all captured components; LexState instances are compared
    // by ordinal rather than identity.
    @Override
    public boolean equals(Object obj) {
        if (obj == null) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }
        final JRubyLexerRestartInfo other = (JRubyLexerRestartInfo)obj;
        if ((this.strTerm != other.strTerm) &&
            ((this.strTerm == null) || !this.strTerm.equals(other.strTerm))) {
            return false;
        }
        if (this.localState != other.localState) {
            return false;
        }
        if ((this.lexState != other.lexState) &&
            ((this.lexState == null) ||
            !(this.lexState.getOrdinal() == other.lexState.getOrdinal()))) {
            return false;
        }
        if ((this.strTermState != other.strTermState) &&
            ((this.strTermState == null) || !this.strTermState.equals(other.strTermState))) {
            return false;
        }
        if ((this.heredocContext != other.heredocContext) &&
            ((this.heredocContext == null) || !this.heredocContext.equals(other.heredocContext))) {
            return false;
        }
        return true;
    }

    // NOTE(review): hash omits lexState/heredocContext; legal (equal objects
    // still hash equal) but weakens bucket distribution.
    @Override
    public int hashCode() {
        int hash = 7;
        hash = (43 * hash) + this.localState;
        hash = (43 * hash) + ((this.strTerm != null) ? this.strTerm.hashCode() : 0);
        hash = (43 * hash) + ((this.strTermState != null) ? this.strTermState.hashCode() : 0);
        // UGH - what about heredocTerms?
        //hash = (43 * hash) + ((this.heredocTerms != null) ? this.heredocTerms.getMutableState().hashCode() : 0);
        return hash;
    }

    /** Renders the bit set as a human-readable "|"-separated flag list, or "-". */
    private static String toStateString(int localState) {
        StringBuilder sb = new StringBuilder();
        if ((localState & IN_REGEXP) != 0) {
            sb.append("regexp|");
        }
        if ((localState & IN_SYMBOL) != 0) {
            sb.append("symbol|");
        }
        if ((localState & IN_EMBEDDED) != 0) {
            sb.append("embedded|");
        }
        if ((localState & IN_SUBSTITUTING) != 0) {
            sb.append("substituting|");
        }
        if ((localState & SET_COMMAND_START) != 0) {
            sb.append("commandstart|");
        }
        if ((localState & SET_SPACE_SEEN) != 0) {
            sb.append("spaceseen|");
        }
        String s = sb.toString();
        if (s.endsWith("|")) {
            // Drop the trailing separator appended by the last flag.
            s = s.substring(0, s.length()-1);
        } else if (s.length() == 0) {
            s = "-";
        }
        return s;
    }

    @Override
    public String toString() {
        return "RubyLexerState[" + toStateString(localState) + "," + strTerm + "," + lexState + "," +
            strTermState + "," + heredocContext + "]";
    }

    /** Pushes this snapshot back into the given lexer (inverse of the constructor). */
    void initializeState(RubyLexer rubyLexer) {
        rubyLexer.lexer.setStrTerm(strTerm);
        rubyLexer.lexer.heredocContext = heredocContext;
        if ((strTermState != null) && (strTerm != null)) {
            strTerm.setMutableState(strTermState);
        }
        if ((localState & IN_REGEXP) != 0) {
            rubyLexer.inRegexp = true;
        }
        if ((localState & IN_SYMBOL) != 0) {
            rubyLexer.inSymbol = true;
        }
        if ((localState & IN_EMBEDDED) != 0) {
            rubyLexer.inEmbedded = true;
        }
        if ((localState & IN_SUBSTITUTING) != 0) {
            rubyLexer.substituting = true;
        }
        if ((localState & SET_COMMAND_START) != 0) {
            rubyLexer.lexer.setCommandStart(true);
        }
        if ((localState & SET_SPACE_SEEN) != 0) {
            rubyLexer.lexer.setSpaceSeen(true);
        }
        rubyLexer.lexer.setLexState(lexState);
    }
}
/**
 * Adapts a NetBeans {@code LexerInput} to a {@link Reader} so the JRuby
 * lexer source can consume it.
 */
private static class LexerInputReader extends Reader {
    private final LexerInput input;

    LexerInputReader(LexerInput input) {
        this.input = input;
    }

    /**
     * Reads up to {@code len} characters into {@code buf}.
     *
     * @return the number of characters read, or -1 only when EOF is reached
     *         before any character was read
     */
    @Override
    public int read(char[] buf, int off, int len) throws IOException {
        for (int i = 0; i < len; i++) {
            int c = input.read();
            if (c == LexerInput.EOF) {
                // Bug fix: the original returned -1 unconditionally here,
                // discarding any characters already copied into buf. The
                // Reader contract requires returning the partial count.
                return (i == 0) ? -1 : i;
            }
            buf[i + off] = (char)c;
        }
        return len;
    }

    @Override
    public void close() throws IOException {
        // Nothing to release; the underlying LexerInput is owned elsewhere.
    }
}
/**
 * Adapts a NetBeans {@code LexerInput} to an {@link InputStream}.
 * Currently unused (see the commented-out wiring in {@code restart}),
 * kept as an alternative lexer source.
 */
private static class LexerInputStream extends InputStream {
    private LexerInput input;

    LexerInputStream(LexerInput input) {
        this.input = input;
    }

    @Override
    public int read() throws IOException {
        final int c = input.read();
        // RubyYaccLexer.EOF is private, so translate the lexer's EOF marker
        // into the standard stream EOF value instead.
        return (c == LexerInput.EOF) ? -1 : c;
    }

    @Override
    public void close() throws IOException {
        // No underlying resource to close.
    }
}
/**
* A Warnings implementation which silently ignores everything.
*/
/**
 * An {@code IRubyWarnings} implementation that silently discards every
 * warning — the IDE colorizer has no use for parser diagnostics.
 */
private static class NullWarnings implements IRubyWarnings {
    public boolean isVerbose() {
        // Never verbose: all warn/warning callbacks below are no-ops anyway.
        return false;
    }

    public void warn(ID id, String message, Object... data) {
        // Intentionally ignored.
    }

    public void warning(ID id, String message, Object... data) {
        // Intentionally ignored.
    }

    public void warn(ID id, String fileName, int lineNumber, String message, Object... data) {
        // Intentionally ignored.
    }

    public void warning(ID id, String fileName, int lineNumber, String message, Object... data) {
        // Intentionally ignored.
    }

    public void warn(ID id, SourcePosition position, String message, Object... data) {
        // Intentionally ignored.
    }

    public void warning(ID id, SourcePosition position, String message, Object... data) {
        // Intentionally ignored.
    }
}
}