/**
 * Copyright 2002-2017 Evgeny Gryaznov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.textmapper.lapg.eval;

import java.io.IOException;
import java.io.Reader;
import java.text.MessageFormat;

import org.textmapper.lapg.api.Grammar;
import org.textmapper.lapg.api.LexerData;
import org.textmapper.lapg.api.LexerRule;

/**
 * Table-driven lexer that tokenizes a {@link CharSequence} using the tables
 * of a {@link LexerData} and the lexer rules of a {@link Grammar}.
 *
 * <p>Input is read as Unicode code points: a high surrogate that is directly
 * followed by a low surrogate is combined into one character. Offsets are
 * counted in UTF-16 units of the input sequence.
 */
public class GenericLexer {

    /**
     * A token produced by {@link #next()}.
     *
     * <p>{@link #next()} fills in {@code symbol}, {@code line}, {@code offset}
     * and {@code endoffset}, and sets {@code value} to {@code null};
     * {@code state} is not assigned by this class.
     */
    public static class Span {
        public Object value;
        public int symbol;
        public int state;
        public int line;
        public int offset;
        public int endoffset;
    }

    /** Predefined symbol indices. */
    public interface Tokens {
        int Unavailable_ = -1;
        int eoi = 0;
    }

    /** Receives lexical problems found while scanning. */
    public interface ErrorReporter {
        void error(String message, int line, int offset, int endoffset);
    }

    /** Not referenced inside this class. */
    public static final int TOKEN_SIZE = 2048;

    /** Never read or written inside this class; the lexer scans {@link #input} directly. */
    private Reader stream;
    final private ErrorReporter reporter;

    /** The text being scanned. */
    private CharSequence input;
    /** Index in {@link #input} where the current token starts. */
    private int tokenOffset;
    /** Index in {@link #input} of the first unit after the current character. */
    private int l;
    /** Index in {@link #input} where the current character starts. */
    private int charOffset;
    /** Current character as a code point, or -1 once the input is exhausted. */
    private int chr;

    /** Lexer state; used as an index into {@link #tmStateMap} when a token is started. */
    private int state;

    private int tokenLine;
    private int currLine;
    private int currOffset;

    private final Grammar grammar;
    private final int[] tmCharClass;
    private final int[] tmRuleSymbol;
    private final int[] tmGoto;
    private final int[] tmStateMap;
    private final int[] tmBacktracking;
    private final int tmFirstRule;
    private final int tmClassesCount;

    /**
     * Creates a lexer over {@code input} and positions it on the first character.
     *
     * @param input     text to tokenize
     * @param reporter  sink for lexical errors
     * @param lexerData source of the character-class, transition, state-map and backtracking tables
     * @param grammar   source of the lexer rules and of the invalid-token/eoi symbols
     * @throws IOException declared for callers; nothing in this class throws it
     */
    public GenericLexer(CharSequence input, ErrorReporter reporter, LexerData lexerData, Grammar grammar) throws IOException {
        this.reporter = reporter;
        this.grammar = grammar;
        tmRuleSymbol = getRuleSymbols(grammar);
        tmCharClass = lexerData.getChar2no();
        tmGoto = lexerData.getChange();
        tmClassesCount = lexerData.getNchars();
        tmStateMap = lexerData.getGroupset();
        tmBacktracking = lexerData.getBacktracking();
        // Each backtracking entry occupies two slots (see next()).
        tmFirstRule = -1 - tmBacktracking.length / 2;
        reset(input);
    }

    /**
     * Restarts the lexer on a new input: state 0, line 1, offset 0, with the
     * first character already loaded.
     *
     * @param input text to tokenize
     * @throws IOException declared for callers; nothing in this method throws it
     */
    public void reset(CharSequence input) throws IOException {
        this.state = 0;
        tokenLine = currLine = 1;
        currOffset = 0;
        this.input = input;
        tokenOffset = l = 0;
        readChar();
    }

    /**
     * Consumes the current character and loads the next one. Does nothing
     * once the end of input has been reached.
     */
    protected void advance() {
        if (chr == -1) return;
        consumeChar();
    }

    /**
     * Accounts for the current character (offset and line counters) and then
     * loads the following one. The caller must ensure the current character
     * is not the end-of-input marker.
     */
    private void consumeChar() {
        currOffset += l - charOffset;
        if (chr == '\n') {
            currLine++;
        }
        readChar();
    }

    /**
     * Loads the character starting at index {@code l} into {@code chr},
     * combining a surrogate pair into a single code point, and moves
     * {@code l} past it. Sets {@code chr} to -1 when no input is left.
     */
    private void readChar() {
        charOffset = l;
        chr = l < input.length() ? input.charAt(l++) : -1;
        if (chr >= Character.MIN_HIGH_SURROGATE && chr <= Character.MAX_HIGH_SURROGATE && l < input.length()
                && Character.isLowSurrogate(input.charAt(l))) {
            chr = Character.toCodePoint((char) chr, input.charAt(l++));
        }
    }

    public int getState() {
        return state;
    }

    public void setState(int state) {
        this.state = state;
    }

    /** Returns the line on which the most recently started token begins. */
    public int getTokenLine() {
        return tokenLine;
    }

    /** Returns the line of the current (not yet consumed) character. */
    public int getLine() {
        return currLine;
    }

    public void setLine(int currLine) {
        this.currLine = currLine;
    }

    /** Returns the reported offset of the current (not yet consumed) character. */
    public int getOffset() {
        return currOffset;
    }

    public void setOffset(int currOffset) {
        this.currOffset = currOffset;
    }

    /** Returns the input text from the start of the current token up to the current character. */
    public String tokenText() {
        return input.subSequence(tokenOffset, charOffset).toString();
    }

    /** Returns the length, in UTF-16 units, of the text returned by {@link #tokenText()}. */
    public int tokenSize() {
        return charOffset - tokenOffset;
    }

    /**
     * Maps a character to its class: the table entry when the character is
     * within the table, 0 for end of input (-1), and 1 for anything else.
     */
    private int mapCharacter(int chr) {
        if (chr >= 0 && chr < tmCharClass.length) return tmCharClass[chr];
        return chr == -1 ? 0 : 1;
    }

    /**
     * Scans and returns the next token.
     *
     * <p>Tokens whose symbol is -1 are reported as invalid and skipped, and
     * tokens rejected by {@link #createToken(Span, int)} are skipped silently.
     * Hitting the end of input in the middle of a token reports an error and
     * returns a token with symbol 0.
     *
     * @return the next token; its {@code value} is {@code null}
     * @throws IOException if {@link #createToken(Span, int)} throws it
     */
    public Span next() throws IOException {
        Span token = new Span();
        int state;

        tokenloop:
        do {
            token.offset = currOffset;
            tokenLine = token.line = currLine;
            tokenOffset = charOffset;

            // TODO use backupRule
            int backupRule = -1;
            // Non-negative values are DFA states; the loop ends on the first negative value.
            for (state = tmStateMap[this.state]; state >= 0; ) {
                state = tmGoto[state * tmClassesCount + mapCharacter(chr)];
                if (state > tmFirstRule && state < 0) {
                    // Values strictly between tmFirstRule and 0 select a pair in the
                    // backtracking table: {rule to fall back to, state to continue from}.
                    token.endoffset = currOffset;
                    state = (-1 - state) * 2;
                    backupRule = tmBacktracking[state++];
                    state = tmBacktracking[state];
                }
                if (state == tmFirstRule && chr == -1) {
                    token.endoffset = currOffset;
                    token.symbol = 0;
                    token.value = null;
                    reporter.error("Unexpected end of input reached", token.line, token.offset, token.endoffset);
                    token.offset = currOffset;
                    break tokenloop;
                }
                if (state >= tmFirstRule && chr != -1) {
                    consumeChar();
                }
            }
            token.endoffset = currOffset;

            // tmFirstRule - state: 0 is the invalid-token slot, 1 is eoi, 2+ are lexer rules.
            token.symbol = tmRuleSymbol[tmFirstRule - state];
            token.value = null;

            if (token.symbol == -1) {
                // The line number is passed as a String: MessageFormat would otherwise apply
                // locale-specific number formatting (e.g. "1,234") to an Integer argument.
                reporter.error(
                        MessageFormat.format("invalid token at line {0}: `{1}`, skipped",
                                String.valueOf(currLine), tokenText()),
                        token.line, token.offset, token.endoffset);
            }

        } while (token.symbol == -1 || !createToken(token, tmFirstRule - state));
        return token;
    }

    /**
     * Looks ahead without consuming input.
     *
     * @param i 0 for the current character; otherwise the character is read
     *          from input index {@code l - 1 + i}
     * @return the code point found there (surrogate pairs are combined), or -1
     *         when the index is at or past the end of the input
     */
    protected int charAt(int i) {
        if (i == 0) return chr;
        int index = l - 1 + i;
        int res = index < input.length() ? input.charAt(index++) : -1;
        if (res >= Character.MIN_HIGH_SURROGATE && res <= Character.MAX_HIGH_SURROGATE && index < input.length()
                && Character.isLowSurrogate(input.charAt(index))) {
            res = Character.toCodePoint((char) res, input.charAt(index));
        }
        return res;
    }

    /**
     * Decides whether a scanned token is returned from {@link #next()}.
     *
     * @param token     the token that was just scanned
     * @param ruleIndex index into the rule-symbol table; values above 1 refer to
     *                  lexer rule {@code ruleIndex - 2} of the grammar
     * @return {@code false} when the matching lexer rule is of kind
     *         {@link LexerRule#KIND_SPACE}, {@code true} otherwise
     * @throws IOException declared for overriding implementations
     */
    protected boolean createToken(Span token, int ruleIndex) throws IOException {
        int lexemeKind = ruleIndex > 1
                ? grammar.getLexerRules()[ruleIndex - 2].getKind()
                : LexerRule.KIND_NONE;
        return lexemeKind != LexerRule.KIND_SPACE;
    }

    /**
     * Builds the rule-index to symbol-index table: slot 0 holds the grammar's
     * invalid token (or -1 when it has none), slot 1 holds eoi, and slot
     * {@code i + 2} holds the symbol of lexer rule {@code i}.
     */
    private static int[] getRuleSymbols(Grammar grammar) {
        LexerRule[] lexerRules = grammar.getLexerRules();
        int[] result = new int[lexerRules.length + 2];
        result[0] = grammar.getInvalidToken() != null ? grammar.getInvalidToken().getIndex() : -1;
        result[1] = grammar.getEoi().getIndex();
        for (int i = 0; i < lexerRules.length; i++) {
            result[i + 2] = lexerRules[i].getSymbol().getIndex();
        }
        return result;
    }
}