EnhancedPainlessLexer.java example

Explorer
elasticsearch-master
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.painless.antlr;

import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.Lexer;
import org.antlr.v4.runtime.LexerNoViableAltException;
import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.TokenSource;
import org.antlr.v4.runtime.misc.Interval;
import org.antlr.v4.runtime.misc.Pair;
import org.elasticsearch.painless.Definition;
import org.elasticsearch.painless.Location;

/**
 * Painless-specific extension of the generated {@link PainlessLexer}. On top of the generated lexer it:
 * <ul>
 * <li>Aborts lexing on the first unrecognized input by throwing from {@link #recover(LexerNoViableAltException)}.
 * <li>Remembers the most recently returned token so that later decisions can look behind.
 * <li>Uses that remembered token to decide whether a slash starts a regex or is a division.
 * <li>Synthesizes a {@code SEMICOLON} token in front of an {@code RBRACK} token whenever the token before it does not already
 * terminate or open the block. Doing this in the lexer keeps the ambiguity out of the parser.
 * <li>Extends the error message for what looks like a string with a bad escape sequence with the list of valid escape sequences.
 * </ul>
 */
final class EnhancedPainlessLexer extends PainlessLexer {
    /** Name of the source being lexed; only used to build error locations. */
    private final String sourceName;
    /** Consulted to answer {@link #isSimpleType(String)}. */
    private final Definition definition;

    /** The real token that is waiting to be returned after a synthesized semicolon, or {@code null} if there is none. */
    private Token stashedNext = null;
    /** The token most recently returned by {@link #nextToken()}, or {@code null} if none has been returned yet. */
    private Token previous = null;

    /**
     * @param charStream the characters to tokenize
     * @param sourceName name of the source, reported in error locations
     * @param definition used to look up whether a name is a simple type
     */
    EnhancedPainlessLexer(CharStream charStream, String sourceName, Definition definition) {
        super(charStream);
        this.sourceName = sourceName;
        this.definition = definition;
    }

    /**
     * @return the token most recently returned by {@link #nextToken()} (which may be a synthesized semicolon), or {@code null} if
     *         no token has been returned yet
     */
    public Token getPreviousToken() {
        return previous;
    }

    /**
     * Returns the next token, slipping a synthesized semicolon in front of it when {@link #insertSemicolon(Token, Token)} asks for
     * one. In that case the real token is stashed and handed out by the following call.
     */
    @Override
    public Token nextToken() {
        final Token pending = stashedNext;
        if (pending != null) {
            // The previous call returned a synthesized semicolon; now release the real token that triggered it.
            stashedNext = null;
            previous = pending;
            return pending;
        }

        final Token lexed = super.nextToken();
        if (insertSemicolon(previous, lexed) == false) {
            previous = lexed;
            return lexed;
        }

        // Hold the real token back and return a semicolon that borrows its position.
        stashedNext = lexed;
        final Pair<TokenSource, CharStream> origin = new Pair<TokenSource, CharStream>(this, _input);
        final Token semicolon = _factory.create(origin, PainlessLexer.SEMICOLON, ";", Lexer.DEFAULT_TOKEN_CHANNEL,
                lexed.getStartIndex(), lexed.getStopIndex(), lexed.getLine(), lexed.getCharPositionInLine());
        previous = semicolon;
        return semicolon;
    }

    /**
     * Never actually recovers: always throws an error describing the unexpected characters, located at the start of the current
     * token. The {@link LexerNoViableAltException} is kept as the cause of the wrapped {@link IllegalArgumentException}.
     */
    @Override
    public void recover(final LexerNoViableAltException lnvae) {
        final CharStream input = lnvae.getInputStream();
        final Interval offending = Interval.of(lnvae.getStartIndex(), input.index());
        final String text = input.getText(offending);

        final Location location = new Location(sourceName, _tokenStartCharIndex);
        final StringBuilder message = new StringBuilder("unexpected character [").append(getErrorDisplay(text)).append("].");

        final char quote = text.charAt(0);
        final boolean startsWithQuote = quote == '\'' || quote == '"';
        final int secondToLast = text.length() - 2;
        if (startsWithQuote && secondToLast > 0 && text.charAt(secondToLast) == '\\') {
            /* Heuristic: text that opens with a quote and has a backslash as its second to last character was probably meant to be
             * a string with an unsupported escape sequence, so tell the user which escape sequences are supported. */
            message.append(" The only valid escape sequences in strings starting with [").append(quote);
            message.append("] are [\\\\] and [\\").append(quote).append("].");
        }
        throw location.createError(new IllegalArgumentException(message.toString(), lnvae));
    }

    /** Delegates to {@link Definition#isSimpleType(String)}. */
    @Override
    protected boolean isSimpleType(String name) {
        return definition.isSimpleType(name);
    }

    /**
     * Decides whether a slash should be lexed as the start of a regex, based on the previously returned token.
     *
     * @return {@code true} if there is no previous token or its type is not one of the types listed below, {@code false} otherwise
     */
    @Override
    protected boolean slashIsRegex() {
        final Token last = previous;
        if (last == null) {
            return true;
        }
        final boolean regex;
        switch (last.getType()) {
        case PainlessLexer.RBRACE:
        case PainlessLexer.RP:
        case PainlessLexer.OCTAL:
        case PainlessLexer.HEX:
        case PainlessLexer.INTEGER:
        case PainlessLexer.DECIMAL:
        case PainlessLexer.ID:
        case PainlessLexer.DOTINTEGER:
        case PainlessLexer.DOTID:
            // After any of these token types the slash is not treated as a regex.
            regex = false;
            break;
        default:
            regex = true;
            break;
        }
        return regex;
    }

    /**
     * Decides whether a semicolon should be synthesized between two consecutive tokens.
     *
     * @param previous the token returned before {@code next}, or {@code null} if there was none
     * @param next the token that was just lexed
     * @return {@code true} only if {@code next} is an {@code RBRACK} and {@code previous} exists and is none of {@code RBRACK},
     *         {@code SEMICOLON} or {@code LBRACK}
     */
    private static boolean insertSemicolon(Token previous, Token next) {
        if (previous == null) {
            return false;
        }
        if (next.getType() != PainlessLexer.RBRACK) {
            return false;
        }
        final int before = previous.getType();
        final boolean closesBlock = before == PainlessLexer.RBRACK;          // };} would be weird!
        final boolean alreadyTerminated = before == PainlessLexer.SEMICOLON; // no need for a second semicolon
        final boolean opensBlock = before == PainlessLexer.LBRACK;           // empty blocks don't need a semicolon
        return (closesBlock || alreadyTerminated || opensBlock) == false;
    }
}